Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files: always denoise
app.py
CHANGED
@@ -1,116 +1,189 @@
|
|
1 |
import os
|
2 |
-
|
|
|
|
|
|
|
3 |
|
4 |
import gradio as gr
|
5 |
import librosa
|
6 |
import soundfile as sf
|
7 |
-
import
|
8 |
-
import
|
9 |
-
from datetime import datetime
|
10 |
from transformers import pipeline
|
11 |
-
from
|
12 |
-
from huggingface_hub import login, HfApi
|
13 |
import spaces
|
|
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
#
|
18 |
-
HF_TOKEN = os.getenv(
|
19 |
-
|
|
|
|
|
20 |
|
21 |
-
default_sr = 16000
|
22 |
CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
api = HfApi(token=HF_TOKEN)
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
28 |
try:
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
47 |
"""
|
48 |
-
|
49 |
-
2. Transcribe with ASR
|
50 |
-
3. Append to HuggingFace dataset
|
51 |
"""
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
"
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
if __name__ == "__main__":
|
116 |
-
|
|
|
1 |
import os
|
2 |
+
import uuid
|
3 |
+
import logging
|
4 |
+
import tempfile
|
5 |
+
from datetime import datetime
|
6 |
|
7 |
import gradio as gr
|
8 |
import librosa
|
9 |
import soundfile as sf
|
10 |
+
import torch
|
11 |
+
from datasets import Dataset, DatasetDict, concatenate_datasets, Audio, load_dataset, DownloadConfig
|
|
|
12 |
from transformers import pipeline
|
13 |
+
from huggingface_hub import HfApi, login
|
|
|
14 |
import spaces
|
15 |
+
from resemble_enhance.enhancer.inference import denoise, enhance
|
16 |
|
17 |
+
# Logging setup: timestamped records at INFO level, plus this module's logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(levelname)s — %(message)s")
logger = logging.getLogger(__name__)
|
23 |
|
24 |
+
# Constants
# The Hub token is mandatory: it is used for login, dataset loading and pushing.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
    # A bare `raise SystemExit` terminates with exit status 0 (success);
    # use a non-zero status so the missing token is reported as a failure.
    raise SystemExit(1)

CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"  # Hub dataset that receives new records
SAMPLE_RATE = 16_000  # default target sampling rate (Hz) when loading audio
ASR_MODEL = "sawadogosalif/SaChi-ASR"  # model id handed to the transformers pipeline
device = "cuda" if torch.cuda.is_available() else "cpu"

# Authenticate with Hugging Face
login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)
|
38 |
+
|
39 |
+
|
40 |
+
def get_or_create_dataset(dataset_name: str) -> Dataset:
    """
    Load the "train" split of a Hub dataset, or create and push an empty one.

    Args:
        dataset_name: Hub repository id of the dataset.

    Returns:
        The loaded "train" split, or a new empty dataset with the columns
        ``audio``, ``text``, ``language`` and ``datetime`` when loading failed.
    """
    try:
        ds = load_dataset(
            dataset_name,
            split="train",
            download_config=DownloadConfig(token=HF_TOKEN),
        )
    except Exception:
        # Keep the best-effort fallback, but record *why* loading failed
        # (exc_info attaches the traceback) instead of discarding the cause.
        # NOTE(review): every failure lands here, not only "repository missing";
        # pushing an empty dataset may then replace data already on the Hub.
        # Confirm that this is intended.
        logger.warning(
            f"Dataset '{dataset_name}' not found or failed to load. Creating a new one.",
            exc_info=True,
        )
        ds = Dataset.from_dict({
            "audio": [],
            "text": [],
            "language": [],
            "datetime": [],
        })
        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
        logger.info(f"Created empty dataset '{dataset_name}'.")
    else:
        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
    return ds
|
62 |
+
|
63 |
+
|
64 |
+
def save_dataset(dataset: Dataset, dataset_name: str) -> None:
    """
    Upload ``dataset`` to the Hugging Face Hub as the "train" split of ``dataset_name``.

    The push is authenticated with ``HF_TOKEN``; the number of records is
    logged once the upload call returns.
    """
    DatasetDict({"train": dataset}).push_to_hub(dataset_name, token=HF_TOKEN)
    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
|
71 |
+
|
72 |
+
|
73 |
+
def process_audio_file(filepath: str, target_sr: int = SAMPLE_RATE) -> tuple:
    """
    Decode an audio file with librosa as mono audio at ``target_sr``.

    Returns:
        ``(samples, sampling_rate)`` exactly as produced by ``librosa.load``.

    Raises:
        Any exception raised while loading; it is logged and re-raised unchanged.
    """
    try:
        waveform, rate = librosa.load(filepath, mono=True, sr=target_sr)
    except Exception as exc:
        logger.error(f"Failed to process audio file '{filepath}': {exc}")
        raise
    return waveform, rate
|
84 |
+
|
85 |
+
|
86 |
+
class Transcriber:
    """Small wrapper around a transformers pipeline used for speech recognition."""

    def __init__(self, asr_model: str):
        # Build the pipeline once; it is reused for every transcription.
        self.pipeline = pipeline(model=asr_model)

    def transcribe(self, audio_path: str) -> str:
        """Run the pipeline on ``audio_path`` and return its "text" field ("" if absent)."""
        return self.pipeline(audio_path).get("text", "")
|
93 |
+
|
94 |
+
|
95 |
+
# Initialize components
# Module-level objects created once at import time and reused by the request handler.
current_dataset = get_or_create_dataset(CURRENT_DATASET)  # in-memory copy of the Hub dataset
asr_client = Transcriber(ASR_MODEL)  # speech-recognition wrapper
|
98 |
+
|
99 |
+
|
100 |
+
@spaces.GPU(duration=15)
def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
    """
    Denoise every input, optionally enhance, then transcribe and push to HF dataset.

    Args:
        audio_filepath: Path of the recorded or uploaded audio; falsy when none was given.
        history: Running conversation text; returned unchanged when nothing is transcribed.
        apply_enhance: When True, also run the enhancer on the denoised audio.

    Returns:
        A ``(message, history)`` pair: the transcription and the extended history on
        success, otherwise an explanatory message together with the untouched history.
    """
    global current_dataset

    if not audio_filepath:
        return "No audio detected. Please record or upload audio.", history

    try:
        # Load and preprocess
        audio_data, sr = process_audio_file(audio_filepath)

        # librosa yields a NumPy array, whereas resemble_enhance's denoise/enhance
        # work on 1-D torch tensors (see its inference code / demo app). Passing the
        # raw array made both steps raise and silently fall back to unprocessed audio.
        waveform = torch.from_numpy(audio_data)

        # Always denoise
        try:
            waveform, sr = denoise(waveform, sr, device)
            logger.info("Audio denoised successfully.")
        except Exception as e:
            # Best effort: keep going with the raw audio.
            logger.warning(f"Denoise failed, using raw audio: {e}")

        # Optionally enhance
        if apply_enhance:
            try:
                waveform, sr = enhance(waveform, sr, device)
                logger.info("Audio enhanced successfully.")
            except Exception as e:
                # Best effort: keep going with the denoised audio.
                logger.warning(f"Enhancement failed, using denoised audio: {e}")

        # soundfile needs a NumPy array on the host, not a (possibly CUDA) tensor.
        final_audio = waveform.cpu().numpy()

        # Save processed audio to temp file. The file is deliberately kept
        # (delete=False) because the dataset record below refers to it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
            local_path = tmpf.name
        # Write after the handle is closed so the path is not opened twice.
        sf.write(local_path, final_audio, sr)

        # Transcription
        transcription = asr_client.transcribe(local_path)
        logger.info(f"Transcription: {transcription}")

        # Prepare new record
        new_record = {
            "audio": [local_path],
            "text": [transcription],
            "language": ["moore"],
            "datetime": [datetime.utcnow().isoformat()],
        }
        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())

        # Update in-memory dataset; an empty dataset is replaced rather than
        # concatenated with the new record.
        if len(current_dataset) == 0:
            current_dataset = new_ds
        else:
            current_dataset = concatenate_datasets([current_dataset, new_ds])

        # Push to hub
        save_dataset(current_dataset, CURRENT_DATASET)

        # Update conversation history
        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
        return transcription, history

    except Exception as exc:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error during transcription pipeline: {exc}")
        return f"Error: {exc}", history
|
167 |
+
|
168 |
+
|
169 |
+
def build_interface():
    """
    Build the Gradio Blocks UI, wire the submit button to ``transcribe_and_update``
    and launch the app.
    """
    with gr.Blocks(theme="huggingface") as demo:
        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
        gr.Markdown("Speech Recognition interface for Moore language. Records or uploads audio, always denoises, and optionally enhances before ASR.")

        with gr.Row():
            # Gradio 4+ replaced the `source=` keyword with `sources=[...]`.
            # Both inputs are enabled so the widget matches its label.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or upload audio",
            )
            # Carries the conversation history string between clicks.
            state_box = gr.State(value="")
            enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)

        output_text = gr.Textbox(label="Transcription")
        submit_btn = gr.Button("Transcribe and Save")
        submit_btn.click(
            fn=transcribe_and_update,
            inputs=[audio_input, state_box, enhance_checkbox],
            outputs=[output_text, state_box],
        )

    demo.launch(debug=True)
|
186 |
+
|
187 |
|
188 |
if __name__ == "__main__":
    # Build and serve the UI only when this file is executed as a script.
    build_interface()
|