Spaces:

Jerich
/

talklas

Paused

App Files Community

Jerich committed on Apr 23

Commit

bc9a053

verified ·

1 Parent(s): 5d63d86

Modified the code to include voice gender option

Browse files

Files changed (1) hide show

app.py +36 -19

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch
 import gradio as gr
 import numpy as np
 import soundfile as sf
 from transformers import (
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
@@ -17,7 +18,7 @@ from typing import Optional, Tuple, Dict, List
 class TalklasTranslator:
     """
     Speech-to-Speech translation pipeline for Philippine languages.
-    Uses MMS/Whisper for STT, NLLB for MT, and MMS for TTS.
     """
     LANGUAGE_MAPPING = {
@@ -138,7 +139,6 @@ class TalklasTranslator:
             waveform, sample_rate = sf.read(audio_path)
             if sample_rate != 16000:
-                import librosa
                 waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
             inputs = self.stt_processor(
@@ -184,8 +184,8 @@ class TalklasTranslator:
             print(f"Translation failed: {e}")
             raise RuntimeError("Text translation failed")
-    def text_to_speech(self, text: str) -> Tuple[int, np.ndarray]:
-        """Convert text to speech"""
         try:
             inputs = self.tts_tokenizer(text, return_tensors="pt").to(self.device)
@@ -193,6 +193,16 @@ class TalklasTranslator:
                 output = self.tts_model(**inputs)
             speech = output.waveform.cpu().numpy().squeeze()
             speech = (speech * 32767).astype(np.int16)
             return self.tts_model.config.sampling_rate, speech
@@ -201,12 +211,12 @@ class TalklasTranslator:
             print(f"Speech synthesis failed: {e}")
             raise RuntimeError("Speech synthesis failed")
-    def translate_speech(self, audio_path: str) -> Dict:
-        """Full speech-to-speech translation"""
         try:
             source_text = self.speech_to_text(audio_path)
             translated_text = self.translate_text(source_text)
-            sample_rate, audio = self.text_to_speech(translated_text)
             return {
                 "source_text": source_text,
@@ -222,11 +232,11 @@ class TalklasTranslator:
                 "performance": f"Error: {str(e)}"
             }
-    def translate_text_only(self, text: str) -> Dict:
-        """Text-to-speech translation"""
         try:
             translated_text = self.translate_text(text)
-            sample_rate, audio = self.text_to_speech(translated_text)
             return {
                 "source_text": text,
@@ -251,8 +261,8 @@ class TranslatorSingleton:
             cls._instance = TalklasTranslator()
         return cls._instance
-def process_audio(audio_path, source_lang, target_lang):
-    """Process audio through the full translation pipeline"""
     # Validate input
     if not audio_path:
         return None, "No audio provided", "No translation available", "Please provide audio input"
@@ -265,12 +275,12 @@ def process_audio(audio_path, source_lang, target_lang):
     status = translator.update_languages(source_code, target_code)
     # Process the audio
-    results = translator.translate_speech(audio_path)
     return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
-def process_text(text, source_lang, target_lang):
-    """Process text through the translation pipeline"""
     # Validate input
     if not text:
         return None, "No text provided", "No translation available", "Please provide text input"
@@ -283,14 +293,15 @@ def process_text(text, source_lang, target_lang):
     status = translator.update_languages(source_code, target_code)
     # Process the text
-    results = translator.translate_text_only(text)
     return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
 def create_gradio_interface():
-    """Create and launch Gradio interface"""
     # Define language options
     languages = list(TalklasTranslator.LANGUAGE_MAPPING.keys())
     # Define the interface
     demo = gr.Blocks(title="Talklas - Speech & Text Translation")
@@ -313,6 +324,12 @@ def create_gradio_interface():
                     label="Target Language"
                 )
                 language_status = gr.Textbox(label="Language Status")
                 update_btn = gr.Button("Update Languages")
@@ -372,7 +389,7 @@ def create_gradio_interface():
         # Audio translate button click
         audio_translate_btn.click(
             process_audio,
-            inputs=[audio_input, source_lang, target_lang],
             outputs=[audio_output, source_text, translated_text, performance_info]
         ).then(
             None,
@@ -393,7 +410,7 @@ def create_gradio_interface():
         # Text translate button click
         text_translate_btn.click(
             process_text,
-            inputs=[text_input, source_lang, target_lang],
             outputs=[text_output, source_text, translated_text, performance_info]
         ).then(
             None,

 import gradio as gr
 import numpy as np
 import soundfile as sf
+import librosa
 from transformers import (
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
 class TalklasTranslator:
     """
     Speech-to-Speech translation pipeline for Philippine languages.
+    Uses MMS/Whisper for STT, NLLB for MT, and MMS for TTS with pitch-shifting for voice gender.
     """
     LANGUAGE_MAPPING = {
             waveform, sample_rate = sf.read(audio_path)
             if sample_rate != 16000:
                 waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
             inputs = self.stt_processor(
             print(f"Translation failed: {e}")
             raise RuntimeError("Text translation failed")
+    def text_to_speech(self, text: str, voice_gender: str = "neutral") -> Tuple[int, np.ndarray]:
+        """Convert text to speech with optional pitch-shifting for voice gender"""
         try:
             inputs = self.tts_tokenizer(text, return_tensors="pt").to(self.device)
                 output = self.tts_model(**inputs)
             speech = output.waveform.cpu().numpy().squeeze()
+            # Apply pitch-shifting based on voice_gender
+            if voice_gender.lower() == "female":
+                # Increase pitch (e.g., +4 semitones for a more traditionally feminine voice)
+                speech = librosa.effects.pitch_shift(speech, sr=self.tts_model.config.sampling_rate, n_steps=4)
+            elif voice_gender.lower() == "male":
+                # Decrease pitch (e.g., -4 semitones for a more traditionally masculine voice)
+                speech = librosa.effects.pitch_shift(speech, sr=self.tts_model.config.sampling_rate, n_steps=-4)
+            # Convert to 16-bit PCM
             speech = (speech * 32767).astype(np.int16)
             return self.tts_model.config.sampling_rate, speech
             print(f"Speech synthesis failed: {e}")
             raise RuntimeError("Speech synthesis failed")
+    def translate_speech(self, audio_path: str, voice_gender: str = "neutral") -> Dict:
+        """Full speech-to-speech translation with voice gender option"""
         try:
             source_text = self.speech_to_text(audio_path)
             translated_text = self.translate_text(source_text)
+            sample_rate, audio = self.text_to_speech(translated_text, voice_gender)
             return {
                 "source_text": source_text,
                 "performance": f"Error: {str(e)}"
             }
+    def translate_text_only(self, text: str, voice_gender: str = "neutral") -> Dict:
+        """Text-to-speech translation with voice gender option"""
         try:
             translated_text = self.translate_text(text)
+            sample_rate, audio = self.text_to_speech(translated_text, voice_gender)
             return {
                 "source_text": text,
             cls._instance = TalklasTranslator()
         return cls._instance
+def process_audio(audio_path, source_lang, target_lang, voice_gender):
+    """Process audio through the full translation pipeline with voice gender"""
     # Validate input
     if not audio_path:
         return None, "No audio provided", "No translation available", "Please provide audio input"
     status = translator.update_languages(source_code, target_code)
     # Process the audio
+    results = translator.translate_speech(audio_path, voice_gender)
     return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
+def process_text(text, source_lang, target_lang, voice_gender):
+    """Process text through the translation pipeline with voice gender"""
     # Validate input
     if not text:
         return None, "No text provided", "No translation available", "Please provide text input"
     status = translator.update_languages(source_code, target_code)
     # Process the text
+    results = translator.translate_text_only(text, voice_gender)
     return results["output_audio"], results["source_text"], results["translated_text"], results["performance"]
 def create_gradio_interface():
+    """Create and launch Gradio interface with voice gender selection"""
     # Define language options
     languages = list(TalklasTranslator.LANGUAGE_MAPPING.keys())
+    voice_genders = ["Neutral", "Male", "Female"]
     # Define the interface
     demo = gr.Blocks(title="Talklas - Speech & Text Translation")
                     label="Target Language"
                 )
+                voice_gender = gr.Dropdown(
+                    choices=voice_genders,
+                    value="Neutral",
+                    label="Voice Gender"
+                )
                 language_status = gr.Textbox(label="Language Status")
                 update_btn = gr.Button("Update Languages")
         # Audio translate button click
         audio_translate_btn.click(
             process_audio,
+            inputs=[audio_input, source_lang, target_lang, voice_gender],
             outputs=[audio_output, source_text, translated_text, performance_info]
         ).then(
             None,
         # Text translate button click
         text_translate_btn.click(
             process_text,
+            inputs=[text_input, source_lang, target_lang, voice_gender],
             outputs=[text_output, source_text, translated_text, performance_info]
         ).then(
             None,