Spaces:

sawadogosalif
/

Sachi-ASR-demo

Running on Zero

App Files Community

sawadogosalif committed 15 days ago

Commit

f289f22

verified ·

1 Parent(s): 74ec7fa

Update app.py

Browse files

always denoise

Files changed (1) hide show

app.py +171 -98

app.py CHANGED Viewed

@@ -1,116 +1,189 @@
 import os
-tmp = os.getcwd()  # just to initialize
 import gradio as gr
 import librosa
 import soundfile as sf
-import tempfile
-import uuid
-from datetime import datetime
 from transformers import pipeline
-from datasets import Dataset, concatenate_datasets, DownloadConfig, Audio, load_dataset, DatasetDict
-from huggingface_hub import login, HfApi
 import spaces
-from preproces import process_audio
-# Hugging Face authentication
-HF_TOKEN = os.getenv('HF_TOKEN')
-login(token=HF_TOKEN)
-default_sr = 16000
 CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
-asr_pipe = pipeline(model="sawadogosalif/SaChi-ASR")
 api = HfApi(token=HF_TOKEN)
-def check_dataset_exists():
     try:
-        api.dataset_info(CURRENT_DATASET)
-        return True
-    except:
-        return False
-if check_dataset_exists():
-    current_dataset = load_dataset(
-        CURRENT_DATASET,
-        split="train",
-        download_config=DownloadConfig(token=HF_TOKEN)
-    )
-else:
-    empty_ds = Dataset.from_dict({"audio": [], "text": [], "language": [], "datetime": []})
-    current_dataset = empty_ds
-    DatasetDict({"train": empty_ds}).push_to_hub(CURRENT_DATASET, token=HF_TOKEN)
-def process_and_transcribe(audio_path, state):
     """
-    1. Load and preprocess audio (denoise & enhance)
-    2. Transcribe with ASR
-    3. Append to HuggingFace dataset
     """
-    global current_dataset
-    if audio_path is None:
-        return "No audio detected.", state
-    # ---- Prétraitement audio ----
-    # Load raw audio
-    wav, sr = librosa.load(audio_path, sr=default_sr)
-    wav = librosa.to_mono(wav)
-    tensor = gr.numpy_to_torch(wav).unsqueeze(0)  # shape (1, T)
-    # Apply denoise & enhance (séquentiel par défaut)
-    res = process_audio(
-        audio=tensor,
-        sr=sr,
-        device="cuda" if torch.cuda.is_available() else "cpu",
-        solver="midpoint",
-        nfe=128,
-        tau=0.01,
-        denoise_before=True,
-        parallel=False
-    )
-    denoised_tensor, _ = res["denoised"]
-    enhanced_tensor, _ = res["enhanced"]
-    # Save enhanced audio to temp file for ASR
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmpf:
-        sf.write(tmpf.name, enhanced_tensor.squeeze(), sr)
-        processed_path = tmpf.name
-    # ---- Transcription ----
-    transcription = asr_pipe(processed_path)["text"]
-    # ---- Enregistrement dans le dataset HF ----
-    new_row = {
-        "audio": [processed_path],
-        "text": [transcription],
-        "language": ["moore"],
-        "datetime": [str(datetime.now())],
-    }
-    incoming = Dataset.from_dict(new_row).cast_column("audio", Audio())
-    if len(current_dataset) > 0:
-        current_dataset = concatenate_datasets([current_dataset, incoming])
-    else:
-        current_dataset = incoming
-    current_dataset.push_to_hub(CURRENT_DATASET, token=HF_TOKEN)
-    return transcription, state
-iface = gr.Interface(
-    fn=process_and_transcribe,
-    inputs=[
-        gr.Audio(source="microphone", type='filepath', label="Record or upload audio"),
-        "state"
-    ],
-    outputs=["text", "state"],
-    layout="horizontal",
-    theme="huggingface",
-    title="🗣️ ASR Moore Live avec Denoise & Enhance",
-    description="Enregistrement en direct, prétraitement automatique et transcription ASR de la langue Moore."
-)
 if __name__ == "__main__":
-    iface.launch(debug=True)

 import os
+import uuid
+import logging
+import tempfile
+from datetime import datetime
 import gradio as gr
 import librosa
 import soundfile as sf
+import torch
+from datasets import Dataset, DatasetDict, concatenate_datasets, Audio, load_dataset, DownloadConfig
 from transformers import pipeline
+from huggingface_hub import HfApi, login
 import spaces
+from resemble_enhance.enhancer.inference import denoise, enhance
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s — %(levelname)s — %(message)s",
+    level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+# Constants
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
+    raise SystemExit
 CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
+SAMPLE_RATE = 16_000
+ASR_MODEL = "sawadogosalif/SaChi-ASR"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Authenticate with Hugging Face
+login(token=HF_TOKEN)
 api = HfApi(token=HF_TOKEN)
+def get_or_create_dataset(dataset_name: str) -> Dataset:
+    """
+    Load the dataset if it exists, otherwise create a new empty one.
+    """
     try:
+        ds = load_dataset(
+            dataset_name,
+            split="train",
+            download_config=DownloadConfig(token=HF_TOKEN)
+        )
+        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
+    except Exception:
+        logger.warning(f"Dataset '{dataset_name}' not found or failed to load. Creating a new one.")
+        ds = Dataset.from_dict({
+            "audio": [],
+            "text": [],
+            "language": [],
+            "datetime": [],
+        })
+        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
+        logger.info(f"Created empty dataset '{dataset_name}'.")
+    return ds
+def save_dataset(dataset: Dataset, dataset_name: str) -> None:
     """
+    Push the updated dataset back to Hugging Face hub.
     """
+    ds_dict = DatasetDict({"train": dataset})
+    ds_dict.push_to_hub(dataset_name, token=HF_TOKEN)
+    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
+def process_audio_file(filepath: str, target_sr: int = SAMPLE_RATE) -> tuple:
+    """
+    Load audio file, convert to mono and target sampling rate.
+    Returns audio array and sampling rate.
+    """
+    try:
+        data, sr = librosa.load(filepath, sr=target_sr, mono=True)
+        return data, sr
+    except Exception as exc:
+        logger.error(f"Failed to process audio file '{filepath}': {exc}")
+        raise
+class Transcriber:
+    def __init__(self, asr_model: str):
+        self.pipeline = pipeline(model=asr_model)
+    def transcribe(self, audio_path: str) -> str:
+        result = self.pipeline(audio_path)
+        return result.get("text", "")
+# Initialize components (module-level: runs once when the file is executed/imported)
+current_dataset = get_or_create_dataset(CURRENT_DATASET)  # in-memory dataset; rebound by transcribe_and_update via `global`
+asr_client = Transcriber(ASR_MODEL)  # single shared ASR wrapper reused for every request
+@spaces.GPU(duration=15)
+def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
+    """
+    Denoise every input, optionally enhance, then transcribe and push to HF dataset.
+    """
+    if not audio_filepath:
+        return "No audio detected. Please record or upload audio.", history
+    try:
+        # Load and preprocess
+        audio_data, sr = process_audio_file(audio_filepath)
+        # Always denoise
+        try:
+            denoised_data, sr = denoise(audio_data, sr, device)
+            logger.info("Audio denoised successfully.")
+        except Exception as e:
+            logger.warning(f"Denoise failed, using raw audio: {e}")
+            denoised_data = audio_data
+        # Optionally enhance
+        if apply_enhance:
+            try:
+                enhanced_data, sr = enhance(denoised_data, sr, device)
+                final_audio = enhanced_data
+                logger.info("Audio enhanced successfully.")
+            except Exception as e:
+                logger.warning(f"Enhancement failed, using denoised audio: {e}")
+                final_audio = denoised_data
+        else:
+            final_audio = denoised_data
+        # Save processed audio to temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
+            sf.write(tmpf.name, final_audio, sr)
+            local_path = tmpf.name
+        # Transcription
+        transcription = asr_client.transcribe(local_path)
+        logger.info(f"Transcription: {transcription}")
+        # Prepare new record
+        new_record = {
+            "audio": [local_path],
+            "text": [transcription],
+            "language": ["moore"],
+            "datetime": [datetime.utcnow().isoformat()]
+        }
+        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())
+        # Update in-memory dataset
+        global current_dataset
+        if len(current_dataset) == 0:
+            current_dataset = new_ds
+        else:
+            current_dataset = concatenate_datasets([current_dataset, new_ds])
+        # Push to hub
+        save_dataset(current_dataset, CURRENT_DATASET)
+        # Update conversation history
+        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
+        return transcription, history
+    except Exception as exc:
+        logger.error(f"Error during transcription pipeline: {exc}")
+        return f"Error: {exc}", history
+def build_interface():
+    with gr.Blocks(theme="huggingface") as demo:
+        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
+        gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")
+        with gr.Row():
+            audio_input = gr.Audio(source="microphone", type="filepath", label="Record or upload audio")
+            state_box = gr.State(value="")
+            enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)
+        output_text = gr.Textbox(label="Transcription")
+        submit_btn = gr.Button("Transcribe and Save")
+        submit_btn.click(fn=transcribe_and_update,
+                         inputs=[audio_input, state_box, enhance_checkbox],
+                         outputs=[output_text, state_box])
+    demo.launch(debug=True)
 if __name__ == "__main__":
+    build_interface()  # builds the Blocks UI; the launch call happens inside build_interface