|
""" |
|
Script to translate a given single English audio file to the corresponding Hindi text.
|
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path> |
|
""" |
|
|
|
|
|
|
|
import gradio as gr |
|
import sys |
|
import os |
|
import subprocess |
|
from pydub import AudioSegment |
|
from huggingface_hub import snapshot_download |
|
|
|
def install_fairseq():
    """Install fairseq and the packages it needs at runtime using pip.

    Installs ``fairseq``, ``sentencepiece`` and ``soundfile`` one after the
    other, stopping at the first package whose installation fails.

    Returns:
        str: ``"fairseq successfully installed!"`` when every install
        command succeeds, otherwise a message describing the failed pip
        invocation. Installation failures are reported through the return
        value; they are not raised.
    """
    packages = ("fairseq", "sentencepiece", "soundfile")
    try:
        for package in packages:
            # Run pip through the interpreter executing this script so the
            # packages land in the same environment, instead of relying on
            # whichever ``pip`` executable happens to be first on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package]
            )
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
    return "fairseq successfully installed!"
|
|
|
def convert_audio_to_16k_wav(audio_input):
    """Make sure an audio file is mono / 16 kHz and return the path to use.

    The file is loaded with pydub. When it has more than one channel or a
    frame rate other than 16000 Hz, it is downmixed and/or resampled and the
    result is exported as WAV to ``<name>_16k.wav`` in the current working
    directory, where ``<name>`` is the input's base name without a trailing
    ``.wav``.

    Args:
        audio_input: Path of the audio file to load.

    Returns:
        str: Path of the exported ``*_16k.wav`` file when a conversion was
        performed, otherwise ``audio_input`` unchanged.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    if not (needs_mono or needs_resample):
        # Nothing is written in this case, so hand back the path we were
        # given. Returning only the base name would point at a file that
        # does not exist unless the input already lives in the working
        # directory.
        # NOTE(review): the container format is not checked here; this
        # assumes an unconverted input is already a WAV file - confirm.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    filename = os.path.basename(audio_input)
    # Drop only a trailing ".wav"; str.replace would also remove the text
    # from the middle of the name.
    if filename.endswith(".wav"):
        filename = filename[: -len(".wav")]
    filename += "_16k.wav"

    sound.export(filename, format="wav")
    return filename
|
|
|
|
|
def run_my_code(input_text, language):
    """Translate a recorded audio file into the selected language.

    The audio is first passed through ``convert_audio_to_16k_wav``. The
    resulting path is written to ``input.txt``, ``fairseq-interactive`` is
    run on that file with the checkpoint and data directory configured for
    ``language``, and the hypothesis is parsed from its standard output.

    Args:
        input_text: Path of the audio file to translate (despite the name,
            this value is handed to ``convert_audio_to_16k_wav`` as a path).
        language: Target language label; ``"Hindi"`` and ``"Gujrati"`` are
            the supported values.

    Returns:
        str: The third tab-separated field of the first output line that
        starts with ``"D-0"``, or ``""`` when no such line is produced or
        ``language`` is not supported.
    """
    # language label -> (model checkpoint, fairseq data root)
    # NOTE(review): "Hindi" is paired with ./lang/bn - confirm that this is
    # the intended data directory for the Hindi model.
    language_models = {
        "Hindi": ("./models/hi_m.pt", "./lang/bn"),
        "Gujrati": ("./models/gj_m.pt", "./lang/gj"),
    }

    wav_path = convert_audio_to_16k_wav(input_text)

    if language not in language_models:
        # Without a checkpoint and data root fairseq cannot produce a
        # hypothesis, so skip the subprocess and return the empty result.
        return ""
    model_checkpoint, data_root = language_models[language]

    # NOTE(review): input.txt is a fixed path in the working directory, so
    # overlapping calls would overwrite each other's contents.
    # The file must be closed (and therefore flushed) before the subprocess
    # starts, otherwise fairseq may read an empty file.
    with open("input.txt", "w") as manifest:
        manifest.write(wav_path)

    print("------Performing translation...")

    try:
        translation_result = subprocess.run(
            [
                "fairseq-interactive", data_root,
                "--config-yaml", "config_st.yaml",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
                "--max-tokens", "50000",
                "--beam", "5",
                "--input", "input.txt",
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # Reset input.txt to empty even when the subprocess could not be
        # started; opening in "w" mode truncates the file.
        with open("input.txt", "w"):
            pass

    if translation_result.returncode != 0:
        # Surface the failure instead of silently returning an empty string.
        print("fairseq-interactive exited with code", translation_result.returncode)
        print(translation_result.stderr)

    output_text = ""
    print("\n\n------Translation results are:")
    for line in translation_result.stdout.split("\n"):
        if not line.startswith("D-0"):
            continue
        fields = line.split("\t")
        if len(fields) > 2:
            output_text = fields[2]
            print(output_text)
            break

    return output_text
|
|
|
# install_fairseq() reports failures through its return value, so print the
# status message rather than discarding it.
print(install_fairseq())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Text widget that displays the string returned by run_my_code.
output_textbox = gr.outputs.Textbox(label="Output Text")

# Web UI: a microphone recording (passed to run_my_code as a file path) plus
# a radio button selecting the target language.
iface = gr.Interface(
    title="English to Indic Language Translator",
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            label="Record something (in American English accent)",
        ),
        gr.inputs.Radio(
            ["Hindi", "Gujrati"],
            label="Language",
        ),
    ],
    outputs=output_textbox,
)

# Start serving the interface.
iface.launch()