sawadogosalif committed
Commit f289f22 · verified · 1 Parent(s): 74ec7fa

Update app.py

always denoise
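
The gist of the change: denoising now runs on every input, and enhancement becomes opt-in via a checkbox. Below is a minimal sketch of the new control flow, assuming resemble-enhance's published denoise/enhance signatures (each takes a 1-D waveform tensor, a sample rate, and a device, and returns the processed waveform with its sample rate); the preprocess helper is a hypothetical name for illustration:

    import torch
    from resemble_enhance.enhancer.inference import denoise, enhance

    device = "cuda" if torch.cuda.is_available() else "cpu"

    def preprocess(wav: torch.Tensor, sr: int, apply_enhance: bool = False):
        # Denoising is unconditional; enhancement only runs when requested.
        wav, sr = denoise(wav, sr, device)
        if apply_enhance:
            wav, sr = enhance(wav, sr, device)
        return wav, sr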

Files changed (1)
  1. app.py +171 -98
app.py CHANGED
@@ -1,116 +1,189 @@
 import os
-tmp = os.getcwd()  # just to initialize
+import uuid
+import logging
+import tempfile
+from datetime import datetime
 
 import gradio as gr
 import librosa
 import soundfile as sf
-import tempfile
-import uuid
-from datetime import datetime
+import torch
+from datasets import Dataset, DatasetDict, concatenate_datasets, Audio, load_dataset, DownloadConfig
 from transformers import pipeline
-from datasets import Dataset, concatenate_datasets, DownloadConfig, Audio, load_dataset, DatasetDict
-from huggingface_hub import login, HfApi
+from huggingface_hub import HfApi, login
 import spaces
+from resemble_enhance.enhancer.inference import denoise, enhance
 
-from preproces import process_audio
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s — %(levelname)s — %(message)s",
+    level=logging.INFO
+)
+logger = logging.getLogger(__name__)
 
-# Hugging Face authentication
-HF_TOKEN = os.getenv('HF_TOKEN')
-login(token=HF_TOKEN)
+# Constants
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    logger.error("Hugging Face token not found. Please set HF_TOKEN environment variable.")
+    raise SystemExit
 
-default_sr = 16000
 CURRENT_DATASET = "sawadogosalif/Sachi_demo_dataset"
+SAMPLE_RATE = 16_000
+ASR_MODEL = "sawadogosalif/SaChi-ASR"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-asr_pipe = pipeline(model="sawadogosalif/SaChi-ASR")
-
+# Authenticate with Hugging Face
+login(token=HF_TOKEN)
 api = HfApi(token=HF_TOKEN)
-def check_dataset_exists():
+
+
+def get_or_create_dataset(dataset_name: str) -> Dataset:
+    """
+    Load the dataset if it exists, otherwise create a new empty one.
+    """
     try:
-        api.dataset_info(CURRENT_DATASET)
-        return True
-    except:
-        return False
-
-if check_dataset_exists():
-    current_dataset = load_dataset(
-        CURRENT_DATASET,
-        split="train",
-        download_config=DownloadConfig(token=HF_TOKEN)
-    )
-else:
-    empty_ds = Dataset.from_dict({"audio": [], "text": [], "language": [], "datetime": []})
-    current_dataset = empty_ds
-    DatasetDict({"train": empty_ds}).push_to_hub(CURRENT_DATASET, token=HF_TOKEN)
-
-
-def process_and_transcribe(audio_path, state):
+        ds = load_dataset(
+            dataset_name,
+            split="train",
+            download_config=DownloadConfig(token=HF_TOKEN)
+        )
+        logger.info(f"Loaded dataset '{dataset_name}' with {len(ds)} examples.")
+    except Exception:
+        logger.warning(f"Dataset '{dataset_name}' not found or failed to load. Creating a new one.")
+        ds = Dataset.from_dict({
+            "audio": [],
+            "text": [],
+            "language": [],
+            "datetime": [],
+        })
+        DatasetDict({"train": ds}).push_to_hub(dataset_name, token=HF_TOKEN)
+        logger.info(f"Created empty dataset '{dataset_name}'.")
+    return ds
+
+
+def save_dataset(dataset: Dataset, dataset_name: str) -> None:
     """
-    1. Load and preprocess audio (denoise & enhance)
-    2. Transcribe with ASR
-    3. Append to the Hugging Face dataset
+    Push the updated dataset back to the Hugging Face Hub.
     """
-    global current_dataset
-
-    if audio_path is None:
-        return "No audio detected.", state
-
-    # ---- Audio preprocessing ----
-    # Load raw audio
-    wav, sr = librosa.load(audio_path, sr=default_sr)
-    wav = librosa.to_mono(wav)
-    tensor = gr.numpy_to_torch(wav).unsqueeze(0)  # shape (1, T)
-
-    # Apply denoise & enhance (sequential by default)
-    res = process_audio(
-        audio=tensor,
-        sr=sr,
-        device="cuda" if torch.cuda.is_available() else "cpu",
-        solver="midpoint",
-        nfe=128,
-        tau=0.01,
-        denoise_before=True,
-        parallel=False
-    )
-    denoised_tensor, _ = res["denoised"]
-    enhanced_tensor, _ = res["enhanced"]
-
-    # Save enhanced audio to a temp file for ASR
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmpf:
-        sf.write(tmpf.name, enhanced_tensor.squeeze(), sr)
-        processed_path = tmpf.name
-
-    # ---- Transcription ----
-    transcription = asr_pipe(processed_path)["text"]
-
-    # ---- Save to the HF dataset ----
-    new_row = {
-        "audio": [processed_path],
-        "text": [transcription],
-        "language": ["moore"],
-        "datetime": [str(datetime.now())],
-    }
-    incoming = Dataset.from_dict(new_row).cast_column("audio", Audio())
-    if len(current_dataset) > 0:
-        current_dataset = concatenate_datasets([current_dataset, incoming])
-    else:
-        current_dataset = incoming
-    current_dataset.push_to_hub(CURRENT_DATASET, token=HF_TOKEN)
-
-    return transcription, state
-
-
-iface = gr.Interface(
-    fn=process_and_transcribe,
-    inputs=[
-        gr.Audio(source="microphone", type='filepath', label="Record or upload audio"),
-        "state"
-    ],
-    outputs=["text", "state"],
-    layout="horizontal",
-    theme="huggingface",
-    title="🗣️ ASR Moore Live with Denoise & Enhance",
-    description="Live recording, automatic preprocessing, and ASR transcription for the Moore language."
-)
+    ds_dict = DatasetDict({"train": dataset})
+    ds_dict.push_to_hub(dataset_name, token=HF_TOKEN)
+    logger.info(f"Pushed updated dataset to '{dataset_name}' ({len(dataset)} records).")
+
+
+def process_audio_file(filepath: str, target_sr: int = SAMPLE_RATE) -> tuple:
+    """
+    Load an audio file, converting it to mono at the target sampling rate.
+    Returns the audio array and sampling rate.
+    """
+    try:
+        data, sr = librosa.load(filepath, sr=target_sr, mono=True)
+        return data, sr
+    except Exception as exc:
+        logger.error(f"Failed to process audio file '{filepath}': {exc}")
+        raise
+
+
+class Transcriber:
+    def __init__(self, asr_model: str):
+        self.pipeline = pipeline(model=asr_model)
+
+    def transcribe(self, audio_path: str) -> str:
+        result = self.pipeline(audio_path)
+        return result.get("text", "")
+
+
+# Initialize components
+current_dataset = get_or_create_dataset(CURRENT_DATASET)
+asr_client = Transcriber(ASR_MODEL)
+
+
+@spaces.GPU(duration=15)
+def transcribe_and_update(audio_filepath: str, history: str, apply_enhance: bool) -> tuple:
+    """
+    Denoise every input, optionally enhance, then transcribe and push to the HF dataset.
+    """
+    if not audio_filepath:
+        return "No audio detected. Please record or upload audio.", history
+
+    try:
+        # Load and preprocess
+        audio_data, sr = process_audio_file(audio_filepath)
+
+        # Always denoise
+        try:
+            denoised_data, sr = denoise(audio_data, sr, device)
+            logger.info("Audio denoised successfully.")
+        except Exception as e:
+            logger.warning(f"Denoise failed, using raw audio: {e}")
+            denoised_data = audio_data
+
+        # Optionally enhance
+        if apply_enhance:
+            try:
+                enhanced_data, sr = enhance(denoised_data, sr, device)
+                final_audio = enhanced_data
+                logger.info("Audio enhanced successfully.")
+            except Exception as e:
+                logger.warning(f"Enhancement failed, using denoised audio: {e}")
+                final_audio = denoised_data
+        else:
+            final_audio = denoised_data
+
+        # Save processed audio to a temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpf:
+            sf.write(tmpf.name, final_audio, sr)
+            local_path = tmpf.name
+
+        # Transcription
+        transcription = asr_client.transcribe(local_path)
+        logger.info(f"Transcription: {transcription}")
+
+        # Prepare the new record
+        new_record = {
+            "audio": [local_path],
+            "text": [transcription],
+            "language": ["moore"],
+            "datetime": [datetime.utcnow().isoformat()]
+        }
+        new_ds = Dataset.from_dict(new_record).cast_column("audio", Audio())
+
+        # Update the in-memory dataset
+        global current_dataset
+        if len(current_dataset) == 0:
+            current_dataset = new_ds
+        else:
+            current_dataset = concatenate_datasets([current_dataset, new_ds])
+
+        # Push to hub
+        save_dataset(current_dataset, CURRENT_DATASET)
+
+        # Update conversation history
+        history = history + f"\nUser: [audio]\nAssistant: {transcription}"
+        return transcription, history
+
+    except Exception as exc:
+        logger.error(f"Error during transcription pipeline: {exc}")
+        return f"Error: {exc}", history
+
+
+def build_interface():
+    with gr.Blocks(theme="huggingface") as demo:
+        gr.Markdown("# 🗣️ ASR Moore Live 🧠")
+        gr.Markdown("Speech recognition interface for the Moore language: record or upload audio; every input is denoised and optionally enhanced before ASR.")
+
+        with gr.Row():
+            audio_input = gr.Audio(source="microphone", type="filepath", label="Record or upload audio")
+            state_box = gr.State(value="")
+            enhance_checkbox = gr.Checkbox(label="Apply Enhancement", value=False)
+
+        output_text = gr.Textbox(label="Transcription")
+        submit_btn = gr.Button("Transcribe and Save")
+        submit_btn.click(fn=transcribe_and_update,
+                         inputs=[audio_input, state_box, enhance_checkbox],
+                         outputs=[output_text, state_box])
+
+    demo.launch(debug=True)
+
 
 if __name__ == "__main__":
-    iface.launch(debug=True)
+    build_interface()
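
Once the Space has pushed a few records, the collected dataset can be pulled back down for a quick check. A minimal sketch reusing the same datasets calls the app itself relies on (this assumes the HF_TOKEN environment variable grants read access, as in the app):

    import os
    from datasets import DownloadConfig, load_dataset

    ds = load_dataset(
        "sawadogosalif/Sachi_demo_dataset",
        split="train",
        download_config=DownloadConfig(token=os.getenv("HF_TOKEN")),
    )
    print(ds)              # features: audio, text, language, datetime
    print(ds[-1]["text"])  # most recently appended transcription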