import gradio as gr
from transformers import pipeline
from huggingsound import SpeechRecognitionModel
import numpy as np
import soundfile as sf
import tempfile

# Load the model for speech recognition
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Load the English-to-Hausa translation and Hausa text-to-speech pipelines
translator = pipeline("text2text-generation", model="Baghdad99/saad-english-text-to-hausa-text")
tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")

# Define the function to translate speech
def translate_speech(audio_data_tuple):
    print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line

    # Extract the audio data from the tuple
    sample_rate, audio_data = audio_data_tuple

    # Save the audio data to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
        sf.write(temp_audio_file.name, audio_data, sample_rate)

        # Use the speech recognition model to transcribe the audio
        output = model.transcribe([temp_audio_file.name])
        print(f"Output: {output}")  # Print the output to see what it contains

        # huggingsound returns a list with one result dict per input path
        if output and 'transcription' in output[0]:
            transcription = output[0]['transcription']
        else:
            print("The output does not contain 'transcription'")
            return

    # Use the translation pipeline to translate the transcription
    translated_text = translator(transcription, return_tensors="pt")
    print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains

    # Check if the translated text contains 'generated_token_ids'
    if 'generated_token_ids' in translated_text[0]:
        # Decode the tokens into text
        translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'], skip_special_tokens=True)
    else:
        print("The translated text does not contain 'generated_token_ids'")
        return

    # Use the text-to-speech pipeline to synthesize the translated text
    synthesised_speech = tts(translated_text_str)
    print(f"Synthesised speech: {synthesised_speech}")  # Print the synthesised speech to see what it contains

    # Check if the synthesised speech contains 'audio'
    if 'audio' in synthesised_speech:
        synthesised_speech_data = synthesised_speech['audio']
    else:
        print("The synthesised speech does not contain 'audio'")
        return

    # Flatten the audio data and scale it to the int16 range Gradio expects
    synthesised_speech_data = synthesised_speech_data.flatten()
    synthesised_speech_data = (synthesised_speech_data * 32767).astype(np.int16)

    # Return the audio at the sampling rate reported by the TTS pipeline
    return synthesised_speech['sampling_rate'], synthesised_speech_data

# Define the Gradio interface
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(source="microphone", type="numpy"),  # microphone input as a (sample_rate, data) tuple
    outputs=gr.Audio(type="numpy"),
    title="English to Hausa Translation",
    description="Realtime demo for English to Hausa translation using speech recognition, machine translation, and text-to-speech synthesis."
)

iface.launch()
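
# A minimal sketch for exercising translate_speech without the browser UI,
# assuming a mono English recording at the hypothetical path "sample.wav":
#
#     data, rate = sf.read("sample.wav", dtype="int16")
#     result = translate_speech((rate, data))
#     if result is not None:
#         out_rate, out_data = result
#         sf.write("translated_hausa.wav", out_data, out_rate)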