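"""Gradio Space: English speech to Hausa speech translation.

Pipeline: wav2vec2 English ASR -> opus-mt English-to-Hausa text translation
-> Hausa text-to-speech. All models run on CPU.
"""
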
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
import numpy as np

# Load the models and processors
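# - wav2vec2-large-xlsr-53-english: English speech recognition (ASR)
# - damilola-finetuned-NLP-opus-mt-en-ha: English-to-Hausa translation
# - hausa_voice_tts: Hausa text-to-speech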
asr_model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
asr_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
translator = pipeline("text2text-generation", model="dammyogt/damilola-finetuned-NLP-opus-mt-en-ha")
tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")

def translate_speech(audio_data_tuple):
    # Gradio delivers microphone input as a (sample_rate, numpy_array) tuple
    sample_rate, audio_data = audio_data_tuple

    # Convert int16 microphone samples to float32 in [-1, 1]; the wav2vec2
    # feature extractor expects a raw floating-point waveform, not a file path
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0

    # Mix down to mono if the browser delivered a stereo stream
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # The ASR model was trained on 16 kHz audio; naively resample by linear
    # interpolation if the microphone rate differs
    if sample_rate != 16000:
        target_length = int(len(audio_data) * 16000 / sample_rate)
        audio_data = np.interp(
            np.linspace(0, len(audio_data), target_length, endpoint=False),
            np.arange(len(audio_data)),
            audio_data,
        ).astype(np.float32)
        sample_rate = 16000

    # Prepare the input dictionary
    input_dict = asr_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)

    # Use the ASR model to get the logits
    logits = asr_model(input_dict.input_values.to("cpu")).logits

    # Get the predicted IDs
    pred_ids = torch.argmax(logits, dim=-1)[0]

    # Decode the predicted IDs to get the transcription
    transcription = asr_processor.decode(pred_ids)
    print(f"Transcription: {transcription}")

    # Use the translation pipeline to translate the transcription;
    # return_tensors=True makes it return the generated token IDs
    translated_text = translator(transcription, return_tensors=True)
    print(f"Translated text: {translated_text}")

    # Decode the token IDs into text, skipping special tokens such as <pad>
    if 'generated_token_ids' in translated_text[0]:
        translated_text_str = translator.tokenizer.decode(
            translated_text[0]['generated_token_ids'], skip_special_tokens=True
        )
        print(f"Translated text string: {translated_text_str}")
    else:
        print("The translated text does not contain 'generated_token_ids'")
        return None

    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text_str)

    # Check that the synthesised speech contains 'audio'
    if 'audio' not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    # Use the sampling rate reported by the TTS pipeline rather than
    # hard-coding 16 kHz
    tts_rate = synthesised_speech.get('sampling_rate', 16000)

    # Flatten the audio and scale it to int16, the format Gradio plays back
    audio_out = synthesised_speech['audio'].flatten()
    audio_out = (audio_out * 32767).astype(np.int16)

    return tts_rate, audio_out

# Define the Gradio interface
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(source="microphone", type="numpy"),  # gr.inputs/gr.outputs are deprecated
    outputs=gr.Audio(type="numpy"),
    title="English to Hausa Translation",
    description="Real-time demo for English to Hausa translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()