Jerich committed on
Commit
bc9a053
·
verified ·
1 Parent(s): 5d63d86

Modified the code to include a voice gender option

Browse files
Files changed (1) hide show
  1. app.py +36 -19
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
3
  import gradio as gr
4
  import numpy as np
5
  import soundfile as sf
 
6
  from transformers import (
7
  AutoModelForSeq2SeqLM,
8
  AutoTokenizer,
@@ -17,7 +18,7 @@ from typing import Optional, Tuple, Dict, List
17
  class TalklasTranslator:
18
  """
19
  Speech-to-Speech translation pipeline for Philippine languages.
20
- Uses MMS/Whisper for STT, NLLB for MT, and MMS for TTS.
21
  """
22
 
23
  LANGUAGE_MAPPING = {
@@ -138,7 +139,6 @@ class TalklasTranslator:
138
  waveform, sample_rate = sf.read(audio_path)
139
 
140
  if sample_rate != 16000:
141
- import librosa
142
  waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
143
 
144
  inputs = self.stt_processor(
@@ -184,8 +184,8 @@ class TalklasTranslator:
184
  print(f"Translation failed: {e}")
185
  raise RuntimeError("Text translation failed")
186
 
187
- def text_to_speech(self, text: str) -> Tuple[int, np.ndarray]:
188
- """Convert text to speech"""
189
  try:
190
  inputs = self.tts_tokenizer(text, return_tensors="pt").to(self.device)
191
 
@@ -193,6 +193,16 @@ class TalklasTranslator:
193
  output = self.tts_model(**inputs)
194
 
195
  speech = output.waveform.cpu().numpy().squeeze()
 
 
 
 
 
 
 
 
 
 
196
  speech = (speech * 32767).astype(np.int16)
197
 
198
  return self.tts_model.config.sampling_rate, speech
@@ -201,12 +211,12 @@ class TalklasTranslator:
201
  print(f"Speech synthesis failed: {e}")
202
  raise RuntimeError("Speech synthesis failed")
203
 
204
- def translate_speech(self, audio_path: str) -> Dict:
205
- """Full speech-to-speech translation"""
206
  try:
207
  source_text = self.speech_to_text(audio_path)
208
  translated_text = self.translate_text(source_text)
209
- sample_rate, audio = self.text_to_speech(translated_text)
210
 
211
  return {
212
  "source_text": source_text,
@@ -222,11 +232,11 @@ class TalklasTranslator:
222
  "performance": f"Error: {str(e)}"
223
  }
224
 
225
- def translate_text_only(self, text: str) -> Dict:
226
- """Text-to-speech translation"""
227
  try:
228
  translated_text = self.translate_text(text)
229
- sample_rate, audio = self.text_to_speech(translated_text)
230
 
231
  return {
232
  "source_text": text,
@@ -251,8 +261,8 @@ class TranslatorSingleton:
251
  cls._instance = TalklasTranslator()
252
  return cls._instance
253
 
254
- def process_audio(audio_path, source_lang, target_lang):
255
- """Process audio through the full translation pipeline"""
256
  # Validate input
257
  if not audio_path:
258
  return None, "No audio provided", "No translation available", "Please provide audio input"
@@ -265,12 +275,12 @@ def process_audio(audio_path, source_lang, target_lang):
265
  status = translator.update_languages(source_code, target_code)
266
 
267
  # Process the audio
268
- results = translator.translate_speech(audio_path)
269
 
270
  return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
271
 
272
- def process_text(text, source_lang, target_lang):
273
- """Process text through the translation pipeline"""
274
  # Validate input
275
  if not text:
276
  return None, "No text provided", "No translation available", "Please provide text input"
@@ -283,14 +293,15 @@ def process_text(text, source_lang, target_lang):
283
  status = translator.update_languages(source_code, target_code)
284
 
285
  # Process the text
286
- results = translator.translate_text_only(text)
287
 
288
  return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
289
 
290
  def create_gradio_interface():
291
- """Create and launch Gradio interface"""
292
  # Define language options
293
  languages = list(TalklasTranslator.LANGUAGE_MAPPING.keys())
 
294
 
295
  # Define the interface
296
  demo = gr.Blocks(title="Talklas - Speech & Text Translation")
@@ -313,6 +324,12 @@ def create_gradio_interface():
313
  label="Target Language"
314
  )
315
 
 
 
 
 
 
 
316
  language_status = gr.Textbox(label="Language Status")
317
  update_btn = gr.Button("Update Languages")
318
 
@@ -372,7 +389,7 @@ def create_gradio_interface():
372
  # Audio translate button click
373
  audio_translate_btn.click(
374
  process_audio,
375
- inputs=[audio_input, source_lang, target_lang],
376
  outputs=[audio_output, source_text, translated_text, performance_info]
377
  ).then(
378
  None,
@@ -393,7 +410,7 @@ def create_gradio_interface():
393
  # Text translate button click
394
  text_translate_btn.click(
395
  process_text,
396
- inputs=[text_input, source_lang, target_lang],
397
  outputs=[text_output, source_text, translated_text, performance_info]
398
  ).then(
399
  None,
 
3
  import gradio as gr
4
  import numpy as np
5
  import soundfile as sf
6
+ import librosa
7
  from transformers import (
8
  AutoModelForSeq2SeqLM,
9
  AutoTokenizer,
 
18
  class TalklasTranslator:
19
  """
20
  Speech-to-Speech translation pipeline for Philippine languages.
21
+ Uses MMS/Whisper for STT, NLLB for MT, and MMS for TTS with pitch-shifting for voice gender.
22
  """
23
 
24
  LANGUAGE_MAPPING = {
 
139
  waveform, sample_rate = sf.read(audio_path)
140
 
141
  if sample_rate != 16000:
 
142
  waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
143
 
144
  inputs = self.stt_processor(
 
184
  print(f"Translation failed: {e}")
185
  raise RuntimeError("Text translation failed")
186
 
187
+ def text_to_speech(self, text: str, voice_gender: str = "neutral") -> Tuple[int, np.ndarray]:
188
+ """Convert text to speech with optional pitch-shifting for voice gender"""
189
  try:
190
  inputs = self.tts_tokenizer(text, return_tensors="pt").to(self.device)
191
 
 
193
  output = self.tts_model(**inputs)
194
 
195
  speech = output.waveform.cpu().numpy().squeeze()
196
+
197
+ # Apply pitch-shifting based on voice_gender
198
+ if voice_gender.lower() == "female":
199
+ # Increase pitch (e.g., +4 semitones for a more traditionally feminine voice)
200
+ speech = librosa.effects.pitch_shift(speech, sr=self.tts_model.config.sampling_rate, n_steps=4)
201
+ elif voice_gender.lower() == "male":
202
+ # Decrease pitch (e.g., -4 semitones for a more traditionally masculine voice)
203
+ speech = librosa.effects.pitch_shift(speech, sr=self.tts_model.config.sampling_rate, n_steps=-4)
204
+
205
+ # Convert to 16-bit PCM
206
  speech = (speech * 32767).astype(np.int16)
207
 
208
  return self.tts_model.config.sampling_rate, speech
 
211
  print(f"Speech synthesis failed: {e}")
212
  raise RuntimeError("Speech synthesis failed")
213
 
214
+ def translate_speech(self, audio_path: str, voice_gender: str = "neutral") -> Dict:
215
+ """Full speech-to-speech translation with voice gender option"""
216
  try:
217
  source_text = self.speech_to_text(audio_path)
218
  translated_text = self.translate_text(source_text)
219
+ sample_rate, audio = self.text_to_speech(translated_text, voice_gender)
220
 
221
  return {
222
  "source_text": source_text,
 
232
  "performance": f"Error: {str(e)}"
233
  }
234
 
235
+ def translate_text_only(self, text: str, voice_gender: str = "neutral") -> Dict:
236
+ """Text-to-speech translation with voice gender option"""
237
  try:
238
  translated_text = self.translate_text(text)
239
+ sample_rate, audio = self.text_to_speech(translated_text, voice_gender)
240
 
241
  return {
242
  "source_text": text,
 
261
  cls._instance = TalklasTranslator()
262
  return cls._instance
263
 
264
+ def process_audio(audio_path, source_lang, target_lang, voice_gender):
265
+ """Process audio through the full translation pipeline with voice gender"""
266
  # Validate input
267
  if not audio_path:
268
  return None, "No audio provided", "No translation available", "Please provide audio input"
 
275
  status = translator.update_languages(source_code, target_code)
276
 
277
  # Process the audio
278
+ results = translator.translate_speech(audio_path, voice_gender)
279
 
280
  return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
281
 
282
+ def process_text(text, source_lang, target_lang, voice_gender):
283
+ """Process text through the translation pipeline with voice gender"""
284
  # Validate input
285
  if not text:
286
  return None, "No text provided", "No translation available", "Please provide text input"
 
293
  status = translator.update_languages(source_code, target_code)
294
 
295
  # Process the text
296
+ results = translator.translate_text_only(text, voice_gender)
297
 
298
  return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
299
 
300
  def create_gradio_interface():
301
+ """Create and launch Gradio interface with voice gender selection"""
302
  # Define language options
303
  languages = list(TalklasTranslator.LANGUAGE_MAPPING.keys())
304
+ voice_genders = ["Neutral", "Male", "Female"]
305
 
306
  # Define the interface
307
  demo = gr.Blocks(title="Talklas - Speech & Text Translation")
 
324
  label="Target Language"
325
  )
326
 
327
+ voice_gender = gr.Dropdown(
328
+ choices=voice_genders,
329
+ value="Neutral",
330
+ label="Voice Gender"
331
+ )
332
+
333
  language_status = gr.Textbox(label="Language Status")
334
  update_btn = gr.Button("Update Languages")
335
 
 
389
  # Audio translate button click
390
  audio_translate_btn.click(
391
  process_audio,
392
+ inputs=[audio_input, source_lang, target_lang, voice_gender],
393
  outputs=[audio_output, source_text, translated_text, performance_info]
394
  ).then(
395
  None,
 
410
  # Text translate button click
411
  text_translate_btn.click(
412
  process_text,
413
+ inputs=[text_input, source_lang, target_lang, voice_gender],
414
  outputs=[text_output, source_text, translated_text, performance_info]
415
  ).then(
416
  None,