|
""" |
|
Script to translate a single English audio file to the corresponding Hindi text.
|
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path> |
|
""" |
|
|
|
|
|
|
|
import gradio as gr |
|
import sys |
|
import os |
|
import subprocess |
|
from pydub import AudioSegment |
|
from huggingface_hub import snapshot_download |
|
|
|
def install_fairseq():
    """Install the runtime dependencies of this script with pip.

    Installs ``fairseq``, ``sentencepiece``, ``soundfile`` and ``gTTS`` one
    after another, stopping at the first package whose installation fails.

    Returns:
        str: A status message. A failed pip invocation is reported in the
        returned message instead of being raised.
    """
    # pip compares project names case-insensitively, so "gTTS" and "gtts"
    # name the same distribution and need to be installed only once.
    packages = ("fairseq", "sentencepiece", "soundfile", "gTTS")
    try:
        for package in packages:
            # Run pip through the current interpreter so the packages land in
            # the environment this script runs in, not in whichever "pip"
            # executable happens to come first on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package]
            )
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
|
|
|
|
|
from gtts import gTTS |
|
|
|
def convert_audio_to_16k_wav(audio_input):
    """Return the path of a mono, 16000 Hz WAV version of ``audio_input``.

    Args:
        audio_input: Path of the audio file to load with pydub.

    Returns:
        str: ``audio_input`` itself when the audio is already single-channel
        at 16000 Hz; otherwise the name of a converted ``<stem>_16k.wav``
        file exported to the current working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    if not (needs_mono or needs_resample):
        # Nothing to convert: hand back the original path. Returning only the
        # basename would name a file that need not exist in the working
        # directory.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    # basename/splitext follow the platform's path rules and strip only the
    # trailing extension, unlike str.replace(".wav", ""), which removes every
    # occurrence of ".wav" anywhere in the name.
    stem, _ext = os.path.splitext(os.path.basename(audio_input))
    filename = stem + "_16k.wav"
    sound.export(filename, format="wav")
    return filename
|
|
|
|
|
# Per-language resources, keyed by the language name passed to run_my_code:
# (fairseq checkpoint path, fairseq data directory, gTTS language code).
_LANGUAGE_CONFIG = {
    "Hindi": ("./models/hi_m.pt", "./lang/hi/", "hi"),
    "Gujrati": ("./models/gj_m.pt", "./lang/gj/", "gu"),
    "Bengali": ("./models/bn_m.pt", "./lang/bn/", "bn"),
    "Nepali": ("./models/ne_m.pt", "./lang/ne/", "ne"),
    "Tamil": ("./models/tm_m.pt", "./lang/tm/", "ta"),
    "Marathi": ("./models/mt_m.pt", "./lang/mt/", "mr"),
}

# File handed to fairseq-interactive via --input; it holds the audio path.
_FAIRSEQ_INPUT_FILE = "input.txt"


def _extract_translation(fairseq_stdout):
    """Return the third tab-separated field of the first usable "D-0" line.

    Returns an empty string when no such line is present in the output.
    """
    for line in fairseq_stdout.split("\n"):
        if line.startswith("D-0"):
            fields = line.split("\t")
            if len(fields) > 2:
                return fields[2]
    return ""


def run_my_code(input_text, language):
    """Translate a recorded audio file and synthesise the result as speech.

    The audio is converted to mono 16 kHz WAV, passed to
    ``fairseq-interactive`` with the checkpoint of the requested language,
    and the resulting text is turned into an mp3 with gTTS.

    Args:
        input_text: Path of the input audio file (a file path despite the
            parameter name).
        language: Target language name; must be a key of
            ``_LANGUAGE_CONFIG``.

    Returns:
        tuple: ``(translated_text, output_audio_path)``.

    Raises:
        ValueError: If ``language`` is not a supported language name.
        RuntimeError: If fairseq-interactive produced no translation.
    """
    # Validate first: with an unknown language the original ran fairseq with
    # empty paths and only failed later, inside gTTS, with an opaque error.
    config = _LANGUAGE_CONFIG.get(language) if isinstance(language, str) else None
    if config is None:
        supported = ", ".join(_LANGUAGE_CONFIG)
        raise ValueError(
            f"Unsupported language {language!r}; expected one of: {supported}"
        )
    model_checkpoint, data_root, lang = config

    hi_wav = convert_audio_to_16k_wav(input_text)

    # Close the file before fairseq starts so the path is guaranteed to be
    # flushed to disk when the subprocess reads it.
    with open(_FAIRSEQ_INPUT_FILE, "w") as f:
        f.write(hi_wav)
    print(hi_wav)

    print("------Performing translation...")
    translation_result = subprocess.run(
        [
            "fairseq-interactive", data_root,
            "--config-yaml", "config_st.yaml",
            "--task", "speech_to_text",
            "--path", model_checkpoint,
            "--max-tokens", "50000",
            "--beam", "5",
            "--input", _FAIRSEQ_INPUT_FILE,
        ],
        capture_output=True,
        text=True,
    )

    # Reset the input file now that fairseq has consumed it.
    with open(_FAIRSEQ_INPUT_FILE, "w") as f:
        f.write("")

    print("\n\n------Translation results are:")
    output_text = _extract_translation(translation_result.stdout)
    if not output_text.strip():
        # Surface fairseq's own diagnostics instead of letting the
        # text-to-speech step fail on empty text.
        raise RuntimeError(
            "fairseq-interactive produced no translation "
            f"(exit code {translation_result.returncode}): "
            f"{translation_result.stderr.strip()}"
        )
    print(output_text)

    output_audio = "output_audio.mp3"
    tts = gTTS(text=output_text, lang=lang)
    tts.save(output_audio)

    return output_text, output_audio
|
|
|
# Install the runtime dependencies at start-up and print the returned status
# message, so an installation failure is visible in the log instead of being
# silently discarded.
print(install_fairseq())

# Text output component showing the translated sentence.
output_textbox = gr.outputs.Textbox(label="Translated Text")

# Web UI: a microphone recording plus a target-language choice go in; the
# translated text and the synthesised speech come out. The Radio choices are
# the language names run_my_code receives as its second argument.
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in American English accent)"),
        gr.inputs.Radio(["Hindi", "Gujrati", "Bengali", "Tamil", "Nepali", "Marathi"], label="Language"),
    ],
    outputs=[
        output_textbox,
        # NOTE(review): confirm that the installed gradio version accepts the
        # `live` keyword on gr.outputs.Audio.
        gr.outputs.Audio(label="Output speech", type="filepath", live=True),
    ],
    title="English to Indic Language Translator",
)

iface.launch()