import gradio as gr
import requests
import soundfile as sf
import numpy as np
import tempfile
import io
import os
from pydub import AudioSegment
# Define the Hugging Face Inference API URLs and headers
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"

# Read the API token from the environment rather than hardcoding it; a token
# committed in source is exposed to anyone who can read the repo
headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}
# Define the function to query the Hugging Face Inference API
def query(api_url, payload):
    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()
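
# A more defensive variant (a sketch, not part of the original app): the hosted
# Inference API responds with HTTP 503 and an "estimated_time" field while a
# model is still loading, so a short wait-and-retry loop avoids spurious
# failures on cold starts. The retry count and 5-second fallback here are
# illustrative assumptions.
import time

def query_with_retry(api_url, payload, retries=3):
    response = None
    for _ in range(retries):
        response = requests.post(api_url, headers=headers, json=payload)
        if response.status_code == 503:  # model still loading
            time.sleep(response.json().get("estimated_time", 5))
            continue
        break
    return response.json()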
# Define the function to translate speech
def translate_speech(audio):
    print(f"Type of audio: {type(audio)}, Value of audio: {audio}")  # Debug line

    # audio is a (sample_rate, np.ndarray) tuple; save it as a WAV file
    sample_rate, audio_data = audio
    if isinstance(audio_data, np.ndarray) and len(audio_data.shape) == 1:  # if audio_data is 1D, reshape it to 2D
        audio_data = np.reshape(audio_data, (-1, 1))
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f, audio_data, sample_rate)
        audio_file = f.name
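    # An equivalent in-memory approach (a sketch, not used by the original
    # code): soundfile can also write WAV bytes to a BytesIO buffer, which
    # avoids leaving the delete=False temp file on disk.
    #   buf = io.BytesIO()
    #   sf.write(buf, audio_data, sample_rate, format="WAV")
    #   data = buf.getvalue()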
    # Call the ASR endpoint to transcribe the audio
    with open(audio_file, "rb") as f:
        data = f.read()
    response = requests.post(ASR_API_URL, headers=headers, data=data)
    output = response.json()

    # Check if the output contains 'text'
    if 'text' in output:
        transcription = output["text"]
    else:
        print("The output does not contain 'text'")
        return
    # Call the translation endpoint; the Inference API returns a list of
    # dicts, so extract the text field instead of passing raw JSON on to TTS
    translation_output = query(TRANSLATION_API_URL, {"inputs": transcription})
    if not (isinstance(translation_output, list) and translation_output):
        print(f"Unexpected translation output: {translation_output}")
        return
    translated_text = translation_output[0].get("translation_text") or translation_output[0].get("generated_text")

    # Call the TTS endpoint to synthesize the translated text
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    audio_bytes = response.content
    # Decode the MP3 bytes returned by the TTS endpoint into an audio segment
    audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))

    # Convert the audio segment to a numpy array
    audio_data = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels == 2:
        audio_data = audio_data.reshape((-1, 2))

    # Gradio's numpy audio output expects a (sample_rate, data) tuple
    return audio_segment.frame_rate, audio_data
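
# Hypothetical local smoke test (not part of the original app): call
# translate_speech directly with one second of int16 silence at 16 kHz,
# the shape Gradio's numpy microphone input produces. Requires a valid
# HF_API_TOKEN in the environment to reach the Inference API.
#   translate_speech((16000, np.zeros(16000, dtype=np.int16)))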
# Define the Gradio interface; gr.inputs/gr.outputs were removed in newer
# Gradio releases, so the components are used directly
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa-to-English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()