eng-to-hau

Sleeping

Baghdad99 committed on Dec 6, 2023

Commit

17cfe18

•

1 Parent(s): dd785c2

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -14,12 +14,15 @@ tts_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/english_voice_tts")
 tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")
 def translate_speech(speech):
     # Convert stereo to mono if necessary
-    if len(speech.shape) > 1:
-        speech = speech.mean(axis=0)
     # Transcribe the speech to text
-    inputs = asr_processor(speech, return_tensors="pt", padding=True)
     logits = asr_model(inputs.input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = asr_processor.decode(predicted_ids[0])
@@ -34,6 +37,7 @@ def translate_speech(speech):
     return audio
 # Define the Gradio interface
 iface = gr.Interface(fn=translate_speech, inputs=gr.inputs.Audio(source="microphone"), outputs="audio")
 iface.launch()

 tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")
 def translate_speech(speech):
+    # Extract the audio signal and sample rate
+    audio_signal, sample_rate = speech
     # Convert stereo to mono if necessary
+    if len(audio_signal.shape) > 1:
+        audio_signal = audio_signal.mean(axis=0)
     # Transcribe the speech to text
+    inputs = asr_processor(audio_signal, return_tensors="pt", padding=True)
     logits = asr_model(inputs.input_values).logits
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = asr_processor.decode(predicted_ids[0])
     return audio
 # Define the Gradio interface
 iface = gr.Interface(fn=translate_speech, inputs=gr.inputs.Audio(source="microphone"), outputs="audio")
 iface.launch()