# NOTE: removed non-code page residue ("Spaces:" / "Runtime error") that was
# accidentally captured at the top of this file.
""" | |
Script to translate given single english audio file to corresponding hindi text | |
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path> | |
""" | |
import os
import shutil
import subprocess
import sys
import wave

import gradio as gr
import yaml
from pydub import AudioSegment
def get_wav_duration(file_path):
    """Return the playing time of a WAV file, in seconds."""
    with wave.open(file_path, 'rb') as wav_file:
        # duration = total frame count / frames per second
        return wav_file.getnframes() / float(wav_file.getframerate())
def install_fairseq():
    """Install fairseq and its runtime dependencies (sentencepiece, soundfile).

    Returns
    -------
    str
        A human-readable status message; errors are reported in the return
        value rather than raised, so the app can still start if pip fails.
    """
    try:
        # Use the running interpreter's own pip ("python -m pip") so the
        # packages land in the same environment this script executes in --
        # a bare "pip" on PATH can belong to a different Python install.
        for package in ("fairseq", "sentencepiece", "soundfile"):
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
def convert_audio_to_16k_wav(audio_input):
    """Convert an audio file to mono-channel 16 kHz WAV for the ST model.

    Parameters
    ----------
    audio_input : str
        Path to the recorded input audio (any format pydub can decode).

    Returns
    -------
    str
        Basename of the audio to use: "<name>_16k.wav" written to the
        current working directory when conversion was needed, otherwise the
        unchanged input basename (nothing is written in that case).
    """
    sound = AudioSegment.from_file(audio_input)
    sample_rate = sound.frame_rate
    num_channels = sound.channels
    filename = audio_input.split("/")[-1]
    print("original file is at:", audio_input)
    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
        if num_channels > 1:
            sound = sound.set_channels(1)
        if sample_rate != 16000:
            sound = sound.set_frame_rate(16000)
        filename = filename.replace(".wav", "") + "_16k.wav"
        # BUG FIX: the export target was a broken placeholder string, so the
        # converted audio was never written to the returned filename.
        sound.export(filename, format="wav")
    return filename
# Per-language resources: (model checkpoint, MuST-C data root, prep-script
# data root).  The tst-COMMON yaml lives at a fixed path under each data root.
_LANG_RESOURCES = {
    "Hindi":   ("./models/hindi_model.pt",   "./MUSTC_ROOT_hindi/en-hi/",   "MUSTC_ROOT_hindi/"),
    "French":  ("./models/french_model.pt",  "./MUSTC_ROOT_french/en-fr/",  "MUSTC_ROOT_french/"),
    "German":  ("./models/german_model.pt",  "./MUSTC_ROOT_german/en-de/",  "MUSTC_ROOT_german/"),
    "Bengali": ("./models/bengali_model.pt", "./MUSTC_ROOT_bengali/en-bn/", "MUSTC_ROOT_bengali/"),
    "Nepali":  ("./models/nepali_model.pt",  "./MUSTC_ROOT_nepali/en-ne/",  "MUSTC_ROOT_nepali/"),
}


def run_my_code(input_text, language):
    """Translate a recorded English audio clip into text in `language`.

    Parameters
    ----------
    input_text : str
        Path to the recorded audio file (as supplied by the Gradio widget).
    language : str
        One of "Hindi", "Bengali", "Nepali", "French", "German".

    Returns
    -------
    str
        The translated sentence, or "" when the generator emitted no
        "D-0..." hypothesis line.
    """
    # TODO better argument handling
    audio = convert_audio_to_16k_wav(input_text)
    hi_wav = audio

    # Table-driven lookup replaces five copy-pasted if-blocks.
    model_checkpoint, data_root, d_r = _LANG_RESOURCES.get(language, ("", "", ""))
    yam = data_root + "data/tst-COMMON/txt/tst-COMMON.yaml" if data_root else ""

    # Rewrite the utterance duration in the yaml so it matches the input audio.
    with open(yam, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    data[0]['duration'] = get_wav_duration(hi_wav)
    with open(yam, 'w') as yaml_file:
        yaml.dump(data, yaml_file)

    # Stage the audio where the prep/generate scripts expect it.  shutil.copy
    # avoids passing the user-controlled path through a shell, unlike the
    # previous os.system(f"cp ...").
    test_wav = data_root + "data/tst-COMMON/wav/test.wav"
    shutil.copy(hi_wav, test_wav)

    print("------Starting data prepration------")
    subprocess.run(
        ["python", "prep_mustc_data_hindi_single.py",
         "--data-root", d_r, "--task", "st",
         "--vocab-type", "unigram", "--vocab-size", "8000"],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    print("------Performing translation------")
    translation_result = subprocess.run(
        ["python", "generate.py", data_root,
         "--config-yaml", "config_st.yaml", "--gen-subset", "tst-COMMON_st",
         "--task", "speech_to_text", "--path", model_checkpoint],
        capture_output=True, text=True)

    # Just for checking: print the duration the yaml now holds for this input.
    with open(yam, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    print(data[0]['duration'], " seconds duration")

    # fairseq's generate output marks hypotheses with "D-<id>"; our single
    # utterance is id 0 and the text is the third tab-separated field.
    output_text = ""
    print("\n\n------Translation results are:\n")
    for line in translation_result.stdout.split("\n"):
        if line.startswith("D-0"):
            output_text = line.split("\t")[2]
            print(output_text)
            break

    # Remove the staged wav (os.remove instead of shelling out to rm).
    os.remove(test_wav)
    return output_text
# Install fairseq (plus sentencepiece/soundfile) up front so the translation
# subprocesses launched by run_my_code can succeed.
install_fairseq()

# Gradio wiring: microphone audio + target-language radio in, text out.
output_textbox = gr.outputs.Textbox(label="The Translated Text is:")

mic_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    label="Record something (in American/British English Accent)...",
)
language_input = gr.inputs.Radio(
    ["Hindi", "Bengali", "Nepali", "French", "German"],
    label="Language To be Translated To",
)

iface = gr.Interface(
    fn=run_my_code,
    inputs=[mic_input, language_input],
    outputs=output_textbox,
    title="English Speech To Multilingual Text Translator")

# Start the web UI.
iface.launch()