indic_s2s / app.py
balaramas's picture
Update app.py
88567cb
raw
history blame
5.01 kB
"""
Script to translate given single english audio file to corresponding hindi text
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path>
"""
import gradio as gr
import sys
import os
import subprocess
from pydub import AudioSegment
from huggingface_hub import snapshot_download
def install_fairseq(packages=("fairseq", "sentencepiece", "soundfile", "gTTS")):
    """Install the runtime dependencies of the app with pip.

    Args:
        packages: Names of the pip projects to install, in order. The default
            is the set this app needs. ("gTTS" and "gtts" name the same
            project, so it is listed only once.)

    Returns:
        A human-readable status string: a success message when every install
        command exits cleanly, otherwise a message describing the first
        failing command. Failures are reported through the return value, not
        by raising.
    """
    for package in packages:
        try:
            # Invoke pip through the running interpreter so the packages land
            # in the environment this app is executing in, not in whichever
            # "pip" happens to be first on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as e:
            return f"An error occurred while installing fairseq: {str(e)}"
    return "fairseq successfully installed!"
from gtts import gTTS
def convert_audio_to_16k_wav(audio_input):
    """Return the path of a mono, 16 kHz version of ``audio_input``.

    Args:
        audio_input: Path of the audio file to load with pydub.

    Returns:
        ``audio_input`` itself when the audio is already single-channel at
        16000 Hz; otherwise the name of a newly exported
        ``<stem>_16k.wav`` file written to the current working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    if not (needs_mono or needs_resample):
        # Nothing to convert: return the path we were given. Returning only
        # its basename would point at a file that does not exist unless the
        # input already lives in the working directory.
        # NOTE(review): the container format is not checked here, so a
        # 16 kHz mono non-WAV input is passed through unchanged - confirm
        # that is acceptable for the downstream consumer.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    # Drop only the final extension (whatever it is) before adding the suffix.
    stem, _ext = os.path.splitext(os.path.basename(audio_input))
    filename = f"{stem}_16k.wav"
    sound.export(filename, format="wav")
    return filename
# Per-language settings keyed by the label shown in the UI:
# (fairseq checkpoint path, fairseq data root, gTTS language code).
_LANGUAGE_SETTINGS = {
    "Hindi": ("./models/hi_m.pt", "./lang/hi/", "hi"),
    "Gujrati": ("./models/gj_m.pt", "./lang/gj/", "gu"),
    "Bengali": ("./models/bn_m.pt", "./lang/bn/", "bn"),
    "Nepali": ("./models/ne_m.pt", "./lang/ne/", "ne"),
    "Tamil": ("./models/tm_m.pt", "./lang/tm/", "ta"),
    "Marathi": ("./models/mt_m.pt", "./lang/mt/", "mr"),
}


def run_my_code(input_text, language):
    """Translate an English recording and synthesize the translation as speech.

    Args:
        input_text: Path of the recorded audio file (despite the name, this
            is a file path, not text).
        language: Target language label; one of the keys of
            ``_LANGUAGE_SETTINGS``.

    Returns:
        A ``(translated_text, audio_path)`` tuple, where ``audio_path`` is the
        MP3 file written by gTTS.

    Raises:
        ValueError: If ``language`` is not a supported label (including
            ``None``, e.g. when nothing was selected).
        RuntimeError: If ``fairseq-interactive`` printed no ``D-0`` hypothesis
            line; the message carries the tool's stderr.
    """
    settings = _LANGUAGE_SETTINGS.get(language) if isinstance(language, str) else None
    if settings is None:
        supported = ", ".join(_LANGUAGE_SETTINGS)
        raise ValueError(
            f"Unsupported target language: {language!r} (expected one of: {supported})"
        )
    model_checkpoint, data_root, lang = settings

    hi_wav = convert_audio_to_16k_wav(input_text)
    print(hi_wav)

    # fairseq-interactive reads the audio path from this file. The `with`
    # block guarantees the path is flushed to disk before the subprocess runs.
    # NOTE(review): utf-8 is assumed to match how fairseq decodes --input.
    with open("input.txt", "w", encoding="utf-8") as f:
        f.write(hi_wav)

    print("------Performing translation...")
    try:
        translation_result = subprocess.run(
            [
                "fairseq-interactive", data_root,
                "--config-yaml", "config_st.yaml",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
                "--max-tokens", "50000",
                "--beam", "5",
                "--input", "input.txt",
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # Leave input.txt empty so a stale path is never reused.
        with open("input.txt", "w", encoding="utf-8") as f:
            f.write("")

    output_text = ""
    print("\n\n------Translation results are:")
    for line in translation_result.stdout.split("\n"):
        if line.startswith("D-0"):
            # The translated text is the third tab-separated field.
            fields = line.split("\t")
            if len(fields) > 2:
                output_text = fields[2]
                print(output_text)
                break

    if not output_text:
        # Fail here with the tool's diagnostics instead of letting the
        # text-to-speech step reject an empty string.
        raise RuntimeError(
            "fairseq-interactive produced no translation: "
            + translation_result.stderr.strip()
        )

    output_audio = "output_audio.mp3"
    tts = gTTS(text=output_text, lang=lang)
    tts.save(output_audio)
    return output_text, output_audio
# Install the runtime packages before the UI starts. install_fairseq() reports
# failure through its return value rather than by raising, so print the status
# to make a failed install visible in the log.
print(install_fairseq())

# Define the input and output interfaces for Gradio
# NOTE(review): the gr.inputs / gr.outputs namespaces are believed to be
# deprecated in Gradio 3 and removed in Gradio 4 - confirm the Gradio version
# this app is pinned to before upgrading.
output_textbox = gr.outputs.Textbox(label="Translated Text")

# Create a Gradio interface
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in American English accent)"),
        gr.inputs.Radio(["Hindi", "Gujrati", "Bengali", "Tamil", "Nepali", "Marathi"], label="Language"),
    ],
    outputs=[
        output_textbox,
        gr.outputs.Audio(label="Output speech", type="filepath", live=True),
    ],
    title="English to Indic Language Translator",
)

# Launch the interface
iface.launch()