Baghdad99 committed on
Commit
fcc244c
1 Parent(s): 7c94b0d

Update app.py

Files changed (1)
  1. app.py +30 -41
app.py CHANGED
@@ -1,58 +1,47 @@
  import gradio as gr
- from transformers import pipeline, VitsModel, AutoTokenizer
- import numpy as np
- import torch
- import scipy
-
- # Load the pipeline for speech recognition and translation
- pipe = pipeline(
-     "automatic-speech-recognition",
-     model="Baghdad99/saad-speech-recognition-hausa-audio-to-text",
-     tokenizer="Baghdad99/saad-speech-recognition-hausa-audio-to-text"
- )
- translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
- model = VitsModel.from_pretrained("facebook/mms-tts-eng")
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

- # Define the function to translate speech
- def translate_speech(audio):
-     # Separate the sample rate and the audio data
-     sample_rate, audio_data = audio

-     # Use the speech recognition pipeline to transcribe the audio
-     output = pipe(audio_data)
-     print(f"Output: {output}")  # Print the output to see what it contains

-     # Check if the output is not empty and contains 'text'
-     if output and 'text' in output[0]:
-         transcription = output[0]["text"]
      else:
-         print("The output is empty or does not contain 'text'")
          return

      # Use the translation pipeline to translate the transcription
-     translated_text = translator(transcription)
-     print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains
-
-     # Use the VITS model to synthesize the translated text into speech
-     inputs = tokenizer(translated_text[0]['translation_text'], return_tensors="pt")
-     with torch.no_grad():
-         output = model.generate(**inputs)
-
-     # Save the synthesized speech to a WAV file
-     scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())

-     print("Translated text:", translated_text[0]['translation_text'])
-     print("Synthesized speech data shape:", output.shape)
-     print("Sampling rate:", model.config.sampling_rate)

-     return 16000, output.numpy()

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
-     inputs=gr.inputs.Audio(source="microphone", type="numpy"),
-     outputs=gr.outputs.Audio(type="numpy"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
  )
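
Note on the removed version: "import scipy" alone does not load the scipy.io.wavfile submodule, and transformers' VitsModel has no generate() method (VITS synthesizes in a single forward pass), so the synthesis step would have failed at runtime. A minimal sketch of the documented MMS-TTS usage; the input sentence and output file name are illustrative:

import torch
import scipy.io.wavfile
from transformers import VitsModel, AutoTokenizer

model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

# VITS synthesis is one forward pass; the audio is on the .waveform attribute
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    waveform = model(**inputs).waveform  # shape: (batch, num_samples)

# Write the first batch item at the model's own sampling rate
scipy.io.wavfile.write(
    "synthesized_speech.wav",
    rate=model.config.sampling_rate,
    data=waveform[0].cpu().numpy(),
)

This also accounts for the hard-coded 16000 in the removed return statement: it matches model.config.sampling_rate for MMS-TTS.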
 
  import gradio as gr
+ import requests
+ from IPython.display import Audio

+ # Define the Hugging Face Inference API URLs and headers
+ ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
+ TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
+ TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"
+ headers = {"Authorization": "Bearer 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"}

+ # Define the function to query the Hugging Face Inference API
+ def query(api_url, payload):
+     response = requests.post(api_url, headers=headers, json=payload)
+     return response.json()

+ # Define the function to translate speech
+ def translate_speech(audio):
+     # Use the ASR pipeline to transcribe the audio
+     with open(audio.name, "rb") as f:
+         data = f.read()
+     response = requests.post(ASR_API_URL, headers=headers, data=data)
+     output = response.json()
+
+     # Check if the output contains 'text'
+     if 'text' in output:
+         transcription = output["text"]
      else:
+         print("The output does not contain 'text'")
          return

      # Use the translation pipeline to translate the transcription
+     translated_text = query(TRANSLATION_API_URL, {"inputs": transcription})

+     # Use the TTS pipeline to synthesize the translated text
+     response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
+     audio_bytes = response.content

+     return audio_bytes

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
+     inputs=gr.inputs.Audio(source="microphone", type="file"),
+     outputs=gr.outputs.Audio(type="auto"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
  )
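
One caveat in the added version: query() returns decoded JSON, and a text2text-generation endpoint typically responds with a list of the form [{"generated_text": "..."}], so translate_speech posts that whole list to the TTS endpoint as "inputs". A minimal sketch of the same two steps with the response unwrapped first, reusing query, headers, requests, and the *_API_URL constants defined above; translate_then_synthesize is a hypothetical helper, and the "generated_text" key is assumed from the usual Inference API output shape rather than confirmed for this model:

def translate_then_synthesize(transcription):
    # Translate, then unwrap the JSON list before handing the text to TTS
    result = query(TRANSLATION_API_URL, {"inputs": transcription})
    if isinstance(result, list) and result and "generated_text" in result[0]:
        translated_text = result[0]["generated_text"]
    else:
        print(f"Unexpected translation response: {result}")
        return None

    # The TTS endpoint answers with raw audio bytes in the response body
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    return response.content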