"""Translate a single English audio recording into Hindi, French or German text.

The script serves a Gradio web UI: the user records English speech and picks
a target language; the recording is normalised to a 16 kHz mono WAV, staged
into the matching MuST-C style data directory, and decoded by running the
fairseq ``generate.py`` script in a subprocess.

Usage:
    python s2t_en2hi.py
"""
import os
import shutil
import subprocess
import sys
import wave

import gradio as gr
import yaml
from pydub import AudioSegment

# Sample rate (Hz) the audio is resampled to before translation.
TARGET_SAMPLE_RATE = 16000

# Interpreter used for pip and for the helper scripts, so that packages are
# installed into (and scripts run under) the same environment as this script.
_PYTHON = sys.executable or "python"

# UI language name -> (directory/model name stem, language-pair directory).
_LANGUAGES = {
    "Hindi": ("hindi", "en-hi"),
    "French": ("french", "en-fr"),
    "German": ("german", "en-de"),
}


def get_wav_duration(file_path):
    """Return the duration of the WAV file at *file_path* in seconds (float)."""
    with wave.open(file_path, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
    return frames / float(rate)


def install_fairseq():
    """Install fairseq, sentencepiece and soundfile with pip.

    Returns:
        A human-readable status message. Installation stops at the first
        package whose ``pip install`` fails; the failure is reported in the
        returned message rather than raised.
    """
    try:
        for package in ("fairseq", "sentencepiece", "soundfile"):
            # ``-m pip`` targets the running interpreter; a bare ``pip`` on
            # PATH may belong to a different Python installation.
            subprocess.check_call([_PYTHON, "-m", "pip", "install", package])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {e}"


def convert_audio_to_16k_wav(audio_input):
    """Make sure the audio is available as a mono 16 kHz WAV file.

    Args:
        audio_input: Path to an audio file readable by pydub.

    Returns:
        Path to a mono 16 kHz WAV file. When the input already is one, the
        input path is returned unchanged; otherwise a converted copy named
        ``<stem>_16k.wav`` is written to the current working directory and
        its name is returned.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    is_target_format = (
        sound.channels == 1 and sound.frame_rate == TARGET_SAMPLE_RATE
    )
    if is_target_format and audio_input.lower().endswith(".wav"):
        # Nothing to convert. Return the full path, not just the basename,
        # so the file can still be found when it lives outside the cwd.
        return audio_input

    if sound.channels > 1:
        sound = sound.set_channels(1)
    if sound.frame_rate != TARGET_SAMPLE_RATE:
        sound = sound.set_frame_rate(TARGET_SAMPLE_RATE)

    stem = os.path.splitext(os.path.basename(audio_input))[0]
    filename = stem + "_16k.wav"
    sound.export(filename, format="wav")
    return filename


def _language_paths(language):
    """Return (model_checkpoint, data_root, prep_root, yaml_path) for *language*.

    Raises:
        ValueError: If *language* is not one of the supported languages.
    """
    try:
        name, pair = _LANGUAGES[language]
    except (KeyError, TypeError):
        supported = ", ".join(_LANGUAGES)
        raise ValueError(
            f"Unsupported language {language!r}; choose one of: {supported}"
        ) from None
    model_checkpoint = f"./models/{name}_model.pt"
    data_root = f"./MUSTC_ROOT_{name}/{pair}/"
    prep_root = f"MUSTC_ROOT_{name}/"
    yaml_path = f"{data_root}data/tst-COMMON/txt/tst-COMMON.yaml"
    return model_checkpoint, data_root, prep_root, yaml_path


def _extract_translation(generate_stdout):
    """Return the hypothesis text of the first ``D-0`` line, or None.

    The text is taken from the third tab-separated field of the line; lines
    with fewer than three fields are skipped.
    """
    for line in generate_stdout.split("\n"):
        if line.startswith("D-0"):
            fields = line.split("\t")
            if len(fields) > 2:
                return fields[2]
    return None


def run_my_code(input_text, language):
    """Translate the recorded English audio into text in *language*.

    Args:
        input_text: Path to the recorded audio file (despite the name, this
            is a file path supplied by the Gradio audio input).
        language: One of "Hindi", "French" or "German".

    Returns:
        The translated text, or an empty string if the generation output
        contains no usable ``D-0`` line.

    Raises:
        ValueError: If no audio was supplied or *language* is unsupported.
    """
    if not input_text:
        raise ValueError("No audio input was provided.")
    model_checkpoint, data_root, prep_root, yaml_path = _language_paths(language)

    wav_path = convert_audio_to_16k_wav(input_text)

    # The test-set manifest must carry the duration of the current recording.
    with open(yaml_path, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    data[0]['duration'] = get_wav_duration(wav_path)
    with open(yaml_path, 'w') as yaml_file:
        yaml.dump(data, yaml_file)

    # Stage the recording where the data-preparation script expects it.
    # shutil/os are used instead of shell ``cp``/``rm`` so that paths with
    # spaces or shell metacharacters are handled safely.
    staged_wav = f"{data_root}data/tst-COMMON/wav/test.wav"
    shutil.copyfile(wav_path, staged_wav)

    try:
        print("------Starting data prepration------")
        # Best effort, as before: output is discarded and the exit status is
        # not checked.
        subprocess.run(
            [
                _PYTHON, "prep_mustc_data_hindi_single.py",
                "--data-root", prep_root,
                "--task", "st",
                "--vocab-type", "unigram",
                "--vocab-size", "8000",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        print("------Performing translation------")
        translation_result = subprocess.run(
            [
                _PYTHON, "generate.py", data_root,
                "--config-yaml", "config_st.yaml",
                "--gen-subset", "tst-COMMON_st",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
            ],
            capture_output=True,
            text=True,
        )
        if translation_result.returncode != 0:
            # Surface the failure instead of silently returning no text.
            print(translation_result.stderr, file=sys.stderr)

        # Sanity check: report the duration recorded in the manifest.
        with open(yaml_path, 'r') as yaml_file:
            data = yaml.safe_load(yaml_file)
        print(data[0]['duration'], " seconds duration")

        print("\n\n------Translation results are:\n")
        translation = _extract_translation(translation_result.stdout)
        if translation is None:
            return ""
        print(translation)
        return translation
    finally:
        # Always remove the staged copy, even when a step above failed.
        try:
            os.remove(staged_wav)
        except FileNotFoundError:
            pass


# Dependencies are installed on start-up; report the outcome so that a failed
# installation is visible in the logs.
print(install_fairseq())

# Define the output interface for Gradio.
output_textbox = gr.outputs.Textbox(label="The Translated Text is:")

# Create a Gradio interface.
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            label="Record something (in American/British English Accent)...",
        ),
        gr.inputs.Radio(["Hindi", "French", "German"], label="Language"),
    ],
    outputs=output_textbox,
    title="English to Hindi/French Translator",
)

# Launch the interface.
iface.launch()