Baghdad99 committed on
Commit
fcc244c
1 Parent(s): 7c94b0d

Update app.py

Files changed (1)
  1. app.py +30 -41
app.py CHANGED
@@ -1,58 +1,47 @@
  import gradio as gr
- from transformers import pipeline, VitsModel, AutoTokenizer
- import numpy as np
- import torch
- import scipy
-
- # Load the pipeline for speech recognition and translation
- pipe = pipeline(
-     "automatic-speech-recognition",
-     model="Baghdad99/saad-speech-recognition-hausa-audio-to-text",
-     tokenizer="Baghdad99/saad-speech-recognition-hausa-audio-to-text"
- )
- translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
- model = VitsModel.from_pretrained("facebook/mms-tts-eng")
- tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

- # Define the function to translate speech
- def translate_speech(audio):
-     # Separate the sample rate and the audio data
-     sample_rate, audio_data = audio

-     # Use the speech recognition pipeline to transcribe the audio
-     output = pipe(audio_data)
-     print(f"Output: {output}")  # Print the output to see what it contains

-     # Check if the output is not empty and contains 'text'
-     if output and 'text' in output[0]:
-         transcription = output[0]["text"]
      else:
-         print("The output is empty or does not contain 'text'")
          return

      # Use the translation pipeline to translate the transcription
-     translated_text = translator(transcription)
-     print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains
-
-     # Use the VITS model to synthesize the translated text into speech
-     inputs = tokenizer(translated_text[0]['translation_text'], return_tensors="pt")
-     with torch.no_grad():
-         output = model.generate(**inputs)
-
-     # Save the synthesized speech to a WAV file
-     scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=output.float().numpy())

-     print("Translated text:", translated_text[0]['translation_text'])
-     print("Synthesized speech data shape:", output.shape)
-     print("Sampling rate:", model.config.sampling_rate)

-     return 16000, output.numpy()

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
-     inputs=gr.inputs.Audio(source="microphone", type="numpy"),
-     outputs=gr.outputs.Audio(type="numpy"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
  )
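
Note on the removed version: "import scipy" alone does not load the scipy.io.wavfile submodule, and transformers' VitsModel has no generate() method (VITS synthesizes in a single forward pass), so the synthesis step would have failed at runtime. A minimal sketch of the documented MMS-TTS usage; the input sentence and output file name are illustrative:

import torch
import scipy.io.wavfile
from transformers import VitsModel, AutoTokenizer

model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

# VITS synthesis is one forward pass; the audio is on the .waveform attribute
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    waveform = model(**inputs).waveform  # shape: (batch, num_samples)

# Write the first batch item at the model's own sampling rate
scipy.io.wavfile.write(
    "synthesized_speech.wav",
    rate=model.config.sampling_rate,
    data=waveform[0].cpu().numpy(),
)

This also accounts for the hard-coded 16000 in the removed return statement: it matches model.config.sampling_rate for MMS-TTS.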
 
  import gradio as gr
+ import requests
+ from IPython.display import Audio

+ # Define the Hugging Face Inference API URLs and headers
+ ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
+ TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
+ TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"
+ headers = {"Authorization": "Bearer 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"}

+ # Define the function to query the Hugging Face Inference API
+ def query(api_url, payload):
+     response = requests.post(api_url, headers=headers, json=payload)
+     return response.json()

+ # Define the function to translate speech
+ def translate_speech(audio):
+     # Use the ASR pipeline to transcribe the audio
+     with open(audio.name, "rb") as f:
+         data = f.read()
+     response = requests.post(ASR_API_URL, headers=headers, data=data)
+     output = response.json()
+
+     # Check if the output contains 'text'
+     if 'text' in output:
+         transcription = output["text"]
      else:
+         print("The output does not contain 'text'")
          return

      # Use the translation pipeline to translate the transcription
+     translated_text = query(TRANSLATION_API_URL, {"inputs": transcription})

+     # Use the TTS pipeline to synthesize the translated text
+     response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
+     audio_bytes = response.content

+     return audio_bytes

  # Define the Gradio interface
  iface = gr.Interface(
      fn=translate_speech,
+     inputs=gr.inputs.Audio(source="microphone", type="file"),
+     outputs=gr.outputs.Audio(type="auto"),
      title="Hausa to English Translation",
      description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
  )
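
One caveat in the added version: query() returns decoded JSON, and a text2text-generation endpoint typically responds with a list of the form [{"generated_text": "..."}], so translate_speech posts that whole list to the TTS endpoint as "inputs". A minimal sketch of the same two steps with the response unwrapped first, reusing query, headers, requests, and the *_API_URL constants defined above; translate_then_synthesize is a hypothetical helper, and the "generated_text" key is assumed from the usual Inference API output shape rather than confirmed for this model:

def translate_then_synthesize(transcription):
    # Translate, then unwrap the JSON list before handing the text to TTS
    result = query(TRANSLATION_API_URL, {"inputs": transcription})
    if isinstance(result, list) and result and "generated_text" in result[0]:
        translated_text = result[0]["generated_text"]
    else:
        print(f"Unexpected translation response: {result}")
        return None

    # The TTS endpoint answers with raw audio bytes in the response body
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    return response.content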