File size: 5,889 Bytes
535ff2b
f289f22
 
 
 
ffb8b65
535ff2b
41199e7
6d40a3f
535ff2b
f289f22
 
6d40a3f
f289f22
 
3bc425a
 
ffb8b65
 
 
 
 
 
 
535ff2b
f289f22
 
 
 
 
535ff2b
e0fb7f3
f289f22
 
653c292
f289f22
 
ffb8b65
f289f22
 
 
 
 
 
c989fa9
f289f22
 
 
ffb8b65
f289f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535ff2b
f289f22
535ff2b
f289f22
 
 
 
 
3bc425a
f289f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffb8b65
 
f289f22
ffb8b65
f289f22
 
ffb8b65
f289f22
 
 
3bc425a
f289f22
 
ffb8b65
3bc425a
f289f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffb8b65
 
 
 
 
 
 
 
 
 
 
 
 
f289f22
ffb8b65
 
 
 
f289f22
ffb8b65
 
 
f289f22
 
 
ffb8b65
 
 
 
9881c0d
ffb8b65
 
 
 
9881c0d
ffb8b65
 
 
 
 
 
 
 
 
 
 
 
653c292
ffb8b65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import os
import uuid
import logging
import tempfile
from datetime import datetime
import spaces

import gradio as gr
import librosa
import soundfile as sf
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets, Audio, load_dataset, DownloadConfig
from transformers import pipeline
from huggingface_hub import HfApi, login
from resemble_enhance.enhancer.inference import denoise, enhance
import torchaudio


# Configure logging
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(message)s",
    level=logging.INFO
)
logger = logging.getLogger(__name__)

# Constants
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
    # Bare `raise SystemExit` exits with status 0 (success), which hides the
    # misconfiguration from supervisors/CI — exit non-zero instead.
    raise SystemExit(1)

CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
SAMPLE_RATE = 16_000  # Hz; target rate for stored audio
ASR_MODEL = "sawadogosalif/SaChi-ASR"

# Authenticate with Hugging Face (module-level side effect at import time)
login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)


def get_or_create_dataset(dataset_name: str) -> Dataset:
    """
    Load the 'train' split of *dataset_name* from the Hugging Face hub,
    or create and push an empty dataset with the expected schema if it
    does not exist or fails to load.

    Args:
        dataset_name: Hub repo id, e.g. "user/dataset".

    Returns:
        The loaded train split, or a freshly created empty Dataset.
    """
    try:
        ds = load_dataset(
            dataset_name,
            split="train",
            download_config=DownloadConfig(token=HF_TOKEN)
        )
        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
    except Exception as err:
        # Log the actual failure reason instead of swallowing it silently —
        # an auth error and a missing repo need different operator responses.
        logger.warning(f"Dataset '{dataset_name}' not found or failed to load ({err}). Creating a new one.")
        # Schema must match the records appended by transcribe_and_update.
        ds = Dataset.from_dict({
            "audio": [],
            "text": [],
            "language": [],
            "datetime": [],
        })
        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
        logger.info(f"Created empty dataset '{dataset_name}'.")
    return ds


def save_dataset(dataset: Dataset, dataset_name: str) -> None:
    """
    Wrap *dataset* as the 'train' split and push it to the Hugging Face hub.

    Args:
        dataset: The full, updated dataset to publish.
        dataset_name: Hub repo id to push to.
    """
    DatasetDict({"train": dataset}).push_to_hub(dataset_name, token=HF_TOKEN)
    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")





class Transcriber:
    """Thin wrapper around a Hugging Face ASR pipeline."""

    def __init__(self, asr_model: str):
        """Instantiate the ASR pipeline for the given hub model id."""
        self.pipeline = pipeline(model=asr_model)

    def transcribe(self, audio_path: str) -> str:
        """Run ASR on the file at *audio_path*; return its text, or "" if absent."""
        return self.pipeline(audio_path).get("text", "")


# Initialize components.
# Module-level singletons: `current_dataset` is rebound by
# transcribe_and_update via a `global` statement after each new recording.
current_dataset = get_or_create_dataset(CURRENT_DATASET)
asr_client = Transcriber(ASR_MODEL)


@spaces.GPU(duration=15)
def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
    """
    Denoise every input, optionally enhance, then transcribe and push to HF dataset.

    Args:
        audio_filepath: Path to the recorded/uploaded audio file.
        history: Conversation history accumulated so far.
        apply_enhance: When True, run the enhancement pass after denoising.

    Returns:
        (transcription, updated_history) on success; on failure, a status or
        error message paired with the unchanged history.
    """
    if not audio_filepath:
        return "No audio detected. Please record or upload audio.", history

    try:
        # Load audio. (The original line was a garbled chained assignment:
        # `audio_data, sr = dwav, sr = torchaudio.load(...)` — a merge artifact.)
        audio_data, sr = torchaudio.load(audio_filepath)
        # Hoisted out of the denoise try-block so the enhance branch can still
        # see `device` even if denoising fails. Hard-coded "cuda" because this
        # runs under @spaces.GPU.
        device = "cuda"

        # Always denoise; fall back to the raw audio if the model fails.
        try:
            # Downmix multi-channel audio to a mono 1-D waveform.
            audio_data = audio_data.mean(dim=0)
            denoised_data, sr = denoise(audio_data, sr, device)
            logger.info("Audio denoised successfully.")
        except Exception as e:
            logger.warning(f"Denoise failed, using raw audio: {e}")
            denoised_data = audio_data

        # Optionally enhance; fall back to the denoised audio on failure.
        if apply_enhance:
            try:
                enhanced_data, sr = enhance(denoised_data, sr, device)
                final_audio = enhanced_data
                logger.info("Audio enhanced successfully.")
            except Exception as e:
                logger.warning(f"Enhancement failed, using denoised audio: {e}")
                final_audio = denoised_data
        else:
            final_audio = denoised_data

        # soundfile cannot write GPU tensors; move to CPU numpy first.
        if isinstance(final_audio, torch.Tensor):
            final_audio = final_audio.detach().cpu().numpy()

        # Save processed audio to temp file (kept on disk: delete=False, the
        # dataset row references this path until push_to_hub uploads it).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
            sf.write(tmpf.name, final_audio, sr)
            local_path = tmpf.name

        # Transcription
        transcription = asr_client.transcribe(local_path)
        logger.info(f"Transcription: {transcription}")

        # Prepare a single-row dataset matching the schema in get_or_create_dataset.
        new_record = {
            "audio": [local_path],
            "text": [transcription],
            "language": ["moore"],
            # NOTE(review): utcnow() is naive and deprecated; kept to preserve
            # the timestamp format of rows already stored in the dataset.
            "datetime": [datetime.utcnow().isoformat()]
        }
        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())

        # Update in-memory dataset. An empty placeholder dataset has a
        # featureless schema, so replace it outright instead of concatenating.
        global current_dataset
        if len(current_dataset) == 0:
            current_dataset = new_ds
        else:
            current_dataset = concatenate_datasets([current_dataset, new_ds])

        # Push to hub
        save_dataset(current_dataset, CURRENT_DATASET)

        # Update conversation history
        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
        return transcription, history

    except Exception as exc:
        logger.error(f"Error during transcription pipeline: {exc}")
        return f"Error: {exc}", history


def build_interface():
    """Assemble the Gradio Blocks UI and launch it in debug mode."""
    with gr.Blocks() as demo:
        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
        gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")

        with gr.Row():
            recorder = gr.Audio(type="filepath", label="Record or upload audio", sources=["microphone", "upload"])
            # Hidden per-session state holding the running conversation text.
            convo_state = gr.State(value="")
            enhance_toggle = gr.Checkbox(label="Apply Enhancement", value=False)

        transcript_box = gr.Textbox(label="Transcription")
        go_btn = gr.Button("Transcribe and Save")
        go_btn.click(
            fn=transcribe_and_update,
            inputs=[recorder, convo_state, enhance_toggle],
            outputs=[transcript_box, convo_state],
        )

    demo.launch(debug=True)


# Script entry point: build and launch the Gradio demo.
if __name__ == "__main__":
    build_interface()