eng-to-hau

Sleeping

App Files Files Community

eng-to-hau / app.py

Baghdad99

Update app.py

cd07de6 11 months ago

raw

history blame

3.3 kB

	import gradio as gr
	from transformers import pipeline, AutoTokenizer
	from huggingsound import SpeechRecognitionModel
	import numpy as np
	import soundfile as sf
	import tempfile

	# Checkpoint identifiers for the three stages of the speech-to-speech chain.
	ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
	TRANSLATION_CHECKPOINT = "Baghdad99/saad-english-text-to-hausa-text"
	TTS_CHECKPOINT = "Baghdad99/hausa_voice_tts"

	# Stage 1: speech recognition model used to transcribe the recorded audio.
	model = SpeechRecognitionModel(ASR_CHECKPOINT)

	# Stage 2: text-to-text pipeline that translates the transcript.
	translator = pipeline("text2text-generation", model=TRANSLATION_CHECKPOINT)

	# Stage 3: text-to-speech pipeline that voices the translated text.
	tts = pipeline("text-to-speech", model=TTS_CHECKPOINT)

	# Define the function to translate speech
	def translate_speech(audio_data_tuple):
	    """Turn a spoken recording into synthesised speech in the target language.

	    The recording is written to a temporary WAV file, transcribed with the
	    module-level ``model``, translated with ``translator`` and finally voiced
	    with ``tts``.

	    Args:
	        audio_data_tuple: ``(sample_rate, audio_data)`` pair, or ``None``
	            (presumably what the UI passes when nothing was recorded --
	            confirm against the installed Gradio version).

	    Returns:
	        ``(sample_rate, samples)`` where ``samples`` is a flat ``np.int16``
	        array, or ``None`` when the input is missing or any stage produces
	        nothing usable.
	    """
	    # Guard first: unpacking None below would raise a TypeError.
	    if audio_data_tuple is None:
	        print("No audio was provided")
	        return None

	    print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line

	    # Extract the audio data from the tuple
	    sample_rate, audio_data = audio_data_tuple

	    # The recogniser works on file paths, so persist the recording first and
	    # transcribe while the temporary file still exists.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
	        sf.write(temp_audio_file.name, audio_data, sample_rate)
	        output = model.transcribe([temp_audio_file.name])
	    print(f"Output: {output}")  # Print the output to see what it contains

	    # transcribe() yields one result dict per input path; only one path was given.
	    first_result = output[0] if output else {}
	    transcription = first_result.get("transcription")
	    if not transcription:
	        print("The output does not contain 'transcription'")
	        return None

	    # Let the pipeline decode its own output so special tokens (pad/eos) are
	    # stripped before the text reaches the speech synthesiser.
	    translated_text = translator(transcription)
	    print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains

	    translated_text_str = translated_text[0].get("generated_text") if translated_text else None
	    if not translated_text_str:
	        print("The translated text does not contain 'generated_text'")
	        return None

	    # Use the text-to-speech pipeline to synthesise the translated text
	    synthesised_speech = tts(translated_text_str)
	    print(f"Synthesised speech: {synthesised_speech}")  # Print the synthesised speech to see what it contains

	    if "audio" not in synthesised_speech:
	        print("The synthesised speech does not contain 'audio'")
	        return None

	    # Flatten to mono samples and clip to [-1, 1] so the int16 conversion
	    # cannot wrap around on out-of-range values.
	    waveform = np.clip(np.asarray(synthesised_speech["audio"]).flatten(), -1.0, 1.0)
	    pcm_samples = (waveform * 32767).astype(np.int16)

	    # Prefer the rate reported by the TTS pipeline; 16000 was the previous
	    # hard-coded value and remains the fallback.
	    output_rate = synthesised_speech.get("sampling_rate", 16000)

	    return output_rate, pcm_samples

	# Define the Gradio interface: microphone audio in, synthesised audio out.
	# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio namespaces; they
	# are kept here because the installed Gradio version is not visible from this
	# file -- confirm before migrating to gr.Audio.
	iface = gr.Interface(
	    fn=translate_speech,
	    inputs=gr.inputs.Audio(source="microphone"),
	    outputs=gr.outputs.Audio(type="numpy"),
	    # The app recognises English speech and produces Hausa speech, so the
	    # user-facing text names the languages in that order.
	    title="English to Hausa Translation",
	    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis.",
	)

	iface.launch()