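# English-to-Hausa speech translation demo: Wav2Vec2 speech recognition,
# an opus-mt English-to-Hausa translation model, and a Hausa TTS model,
# chained together behind a Gradio microphone interface.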
import torch
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
import numpy as np
# Load the models and processors
asr_model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
asr_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
translator = pipeline("text2text-generation", model="dammyogt/damilola-finetuned-NLP-opus-mt-en-ha")
tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")
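# Note: all three models are loaded once at startup; the first launch downloads
# the checkpoints from the Hugging Face Hub, which can take a while.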
def translate_speech(audio_data_tuple):
    # Gradio's numpy audio input arrives as a (sample_rate, data) tuple
    sample_rate, audio_data = audio_data_tuple

    # Mix stereo down to mono and convert int16 PCM to float32 in [-1, 1],
    # which is what the Wav2Vec2 feature extractor expects
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_data = audio_data.astype(np.float32) / 32768.0

    # Pass the raw waveform and its sampling rate to the processor (not a file path).
    # Note: this checkpoint was trained on 16 kHz audio; other rates should be resampled first.
    input_dict = asr_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)

    # Run the ASR model to get per-frame logits over the character vocabulary
    with torch.no_grad():
        logits = asr_model(input_dict.input_values).logits

    # Greedy CTC decoding: take the most likely token at each frame, then collapse
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = asr_processor.decode(pred_ids)
    print(f"Transcription: {transcription}")
    # Use the translation pipeline to translate the transcription to Hausa;
    # 'generated_text' is already decoded, so no manual token decoding is needed
    translation = translator(transcription)
    if translation and 'generated_text' in translation[0]:
        translated_text_str = translation[0]['generated_text']
        print(f"Translated text: {translated_text_str}")
    else:
        print("The translation pipeline did not return 'generated_text'")
        return
    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text_str)
    if 'audio' not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return
    synthesised_speech_data = synthesised_speech['audio'].flatten()

    # Scale float audio in [-1, 1] to int16, the format Gradio's numpy audio expects
    audio_int16 = (synthesised_speech_data * 32767).astype(np.int16)

    # Use the sampling rate reported by the TTS pipeline instead of hard-coding it
    return synthesised_speech.get('sampling_rate', 16000), audio_int16
# Define the Gradio interface (the gr.inputs/gr.outputs modules were removed
# in newer Gradio releases, so the components are used directly)
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="English to Hausa Translation",
    description="Real-time demo for English-to-Hausa translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()