Update app.py
app.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
 import librosa
 import gradio as gr
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, SeamlessM4Tv2Model, pipeline  # pipeline is still needed for the TTS below
 import numpy as np
 import soundfile as sf
 import tempfile
@@ -9,7 +9,11 @@ import tempfile
 # Load the models and processors
 asr_model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
 asr_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
-
+
+# Load the SeamlessM4T model and processor
+translator_model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
+translator_processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+
 tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")
 
 def translate_speech(audio_file_path):
@@ -29,22 +33,19 @@ def translate_speech(audio_file_path):
     transcription = asr_processor.decode(pred_ids)
     print(f"Transcription: {transcription}") # Print the transcription
 
-    #
-
-    print(f"Translated text: {translated_text}") # Print the translated text
+    # Prepare the input dictionary for the translator
+    text_inputs = translator_processor(text=transcription, src_lang="eng", return_tensors="pt")
 
-    #
-
-    # Decode the tokens into text
-    translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'])
-
-    # Remove special tokens
-    translated_text_str = translated_text_str.replace("<pad>", "").replace("</s>", "").strip()
+    # Use the translator model to translate the transcription into Hausa
+    translated_text = translator_model.generate(**text_inputs, tgt_lang="hau", generate_speech=False)  # "hau" is the ISO 639-3 code for Hausa; generate_speech=False returns text tokens rather than a waveform
 
-
-
-
-
+    # Decode the translated text
+    translated_text_str = translator_processor.decode(translated_text[0].tolist()[0], skip_special_tokens=True)
+
+    # Remove special tokens
+    translated_text_str = translated_text_str.replace("<pad>", "").replace("</s>", "").strip()
+
+    print(f"Translated text string: {translated_text_str}") # Print the translated text string
 
     # Use the text-to-speech pipeline to synthesize the translated text
     synthesised_speech = tts(translated_text_str)
@@ -63,14 +64,3 @@ def translate_speech(audio_file_path):
     synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)
 
     return 16000, synthesised_speech
-
-# Define the Gradio interface
-iface = gr.Interface(
-    fn=translate_speech,
-    inputs=gr.inputs.Audio(type="filepath"),  # Change this line
-    outputs=gr.outputs.Audio(type="numpy"),
-    title="English to Hausa Translation",
-    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis."
-)
-
-iface.launch()
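The diff elides the body of translate_speech before the decode() call. For reference, pred_ids for a CTC wav2vec2 checkpoint is typically produced as below; this is a sketch assuming the hidden lines follow the standard recipe, not a copy of them, and the intermediate variable names are assumptions:

# Sketch of the ASR step that precedes the decode() call shown in the diff.
speech, _ = librosa.load(audio_file_path, sr=16000)  # wav2vec2 expects 16 kHz mono audio
inputs = asr_processor(speech, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = asr_model(inputs.input_values).logits
pred_ids = torch.argmax(logits, dim=-1)[0]  # greedy CTC decoding
transcription = asr_processor.decode(pred_ids)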
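The SeamlessM4Tv2 text-to-text path has two easily missed details: SeamlessM4Tv2Model.generate returns a speech waveform unless generate_speech=False is passed, and its language codes are ISO 639-3, so Hausa is "hau" rather than "ha". The translation step can be sanity-checked in isolation with the pattern from the Transformers documentation:

# Standalone check of the text-to-text path (mirrors the Transformers docs example).
from transformers import AutoProcessor, SeamlessM4Tv2Model

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

text_inputs = processor(text="Hello, how are you?", src_lang="eng", return_tensors="pt")
# generate_speech=False makes generate() return text token ids instead of audio
output_tokens = model.generate(**text_inputs, tgt_lang="hau", generate_speech=False)
translated = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
print(translated)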
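Note that the last hunk drops the gr.Interface block from the end of app.py, leaving nothing that calls translate_speech unless a later commit restores it. If the interface is meant to stay, the deprecated gr.inputs.Audio / gr.outputs.Audio classes from the removed lines map onto the current gr.Audio component; a minimal sketch, assuming no interface is defined elsewhere:

# Sketch: the removed interface, rebuilt with the current Gradio component API.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),   # replaces gr.inputs.Audio(type="filepath")
    outputs=gr.Audio(type="numpy"),     # replaces gr.outputs.Audio(type="numpy")
    title="English to Hausa Translation",
    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()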