"""
Script to translate given single english audio file to corresponding hindi text
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path>
"""
import gradio as gr
import sys
import os
import subprocess
from pydub import AudioSegment
import yaml
import wave

def get_wav_duration(file_path):
    with wave.open(file_path, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)
    return duration
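# e.g. a 16 kHz mono WAV with 48000 frames -> 48000 / 16000 = 3.0 seconds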

def install_fairseq():
    try:
        # Run pip to install the runtime dependencies
        subprocess.check_call(["pip", "install", "fairseq"])
        subprocess.check_call(["pip", "install", "sentencepiece"])
        subprocess.check_call(["pip", "install", "soundfile"])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"

def convert_audio_to_16k_wav(audio_input):
    sound = AudioSegment.from_file(audio_input)
    sample_rate = sound.frame_rate
    num_channels = sound.channels
    num_frames = int(sound.frame_count())
    filename = audio_input.split("/")[-1]
    print("original file is at:", audio_input)
    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
        if num_channels > 1:
            sound = sound.set_channels(1)
        if sample_rate != 16000:
            sound = sound.set_frame_rate(16000)
        num_frames = int(sound.frame_count())
        filename = filename.replace(".wav", "") + "_16k.wav"
        sound.export(f"{filename}", format="wav")
    return filename
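# Example (hypothetical path): convert_audio_to_16k_wav("/tmp/gradio/recording.wav")
# writes "recording_16k.wav" (mono, 16 kHz) into the working directory and returns
# that filename; if the input is already 16 kHz mono, the original basename is
# returned unchanged and no new file is written.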

def run_my_code(input_text, language):
    # TODO better argument handling
    audio = convert_audio_to_16k_wav(input_text)
    hi_wav = audio
    data_root = ""
    model_checkpoint = ""
    d_r = ""
    yam = ""
    if language == "Hindi":
        model_checkpoint = "./models/hindi_model.pt"
        data_root = "./MUSTC_ROOT_hindi/en-hi/"
        d_r = "MUSTC_ROOT_hindi/"
        yam = "./MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "French":
        model_checkpoint = "./models/french_model.pt"
        data_root = "./MUSTC_ROOT_french/en-fr/"
        d_r = "MUSTC_ROOT_french/"
        yam = "./MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "German":
        model_checkpoint = "./models/german_model.pt"
        data_root = "./MUSTC_ROOT_german/en-de/"
        d_r = "MUSTC_ROOT_german/"
        yam = "./MUSTC_ROOT_german/en-de/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "Bengali":
        model_checkpoint = "./models/bengali_model.pt"
        data_root = "./MUSTC_ROOT_bengali/en-bn/"
        d_r = "MUSTC_ROOT_bengali/"
        yam = "./MUSTC_ROOT_bengali/en-bn/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "Nepali":
        model_checkpoint = "./models/nepali_model.pt"
        data_root = "./MUSTC_ROOT_nepali/en-ne/"
        d_r = "MUSTC_ROOT_nepali/"
        yam = "./MUSTC_ROOT_nepali/en-ne/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "Gujarati":
        model_checkpoint = "./models/gujrati_model.pt"
        data_root = "./MUSTC_ROOT_gujrati/en-gj/"
        d_r = "MUSTC_ROOT_gujrati/"
        yam = "./MUSTC_ROOT_gujrati/en-gj/data/tst-COMMON/txt/tst-COMMON.yaml"
    elif language == "Tamil":
        model_checkpoint = "./models/tamil_model.pt"
        data_root = "./MUSTC_ROOT_tamil/en-tm/"
        d_r = "MUSTC_ROOT_tamil/"
        yam = "./MUSTC_ROOT_tamil/en-tm/data/tst-COMMON/txt/tst-COMMON.yaml"
    # Update the duration in the yaml file according to the input audio
    with open(yam, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    data[0]['duration'] = get_wav_duration(hi_wav)
    with open(yam, 'w') as yaml_file:
        yaml.dump(data, yaml_file)
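    # tst-COMMON.yaml is assumed to hold a list of segment dicts in the usual
    # MUST-C form, e.g. [{'duration': 3.0, 'offset': 0.0, 'wav': 'test.wav', ...}],
    # so patching data[0]['duration'] updates the single test segment.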
os.system(f"cp {hi_wav} {data_root}data/tst-COMMON/wav/test.wav")
print("------Starting data prepration------")
subprocess.run(["python", "prep_mustc_data_hindi_single.py", "--data-root", d_r, "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
#For testing
#subprocess.run(["python", "prep_mustc_data_hindi_single.py", "--data-root", d_r, "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"])
print("------Performing translation------")
#subprocess.run(["python", "generate.py", data_root, "--config-yaml", "config_st.yaml", "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text", "--path", model_checkpoint])
translation_result = subprocess.run(["python", "generate.py", data_root, "--config-yaml", "config_st.yaml", "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text", "--path", model_checkpoint], capture_output=True, text=True)
translation_result_text = translation_result.stdout
lines = translation_result_text.split("\n")
    # Just for checking the duration from the yaml file of the current input audio
    with open(yam, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    print(data[0]['duration'], " seconds duration")
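    # fairseq's generate.py typically prints one hypothesis per sample as a
    # tab-separated line of the form "D-<id>\t<score>\t<text>", so the loop
    # below takes the third field of the first "D-0" line as the translation.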
output_text=""
print("\n\n------Translation results are:\n")
for i in lines:
if (i.startswith("D-0")):
print(i.split("\t")[2])
output_text=i.split("\t")[2]
break
os.system(f"rm {data_root}data/tst-COMMON/wav/test.wav")
return output_text
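
# Install fairseq, sentencepiece and soundfile once at startup, before the UI is built.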
install_fairseq()
# Define the input and output interfaces for Gradio
#inputs = [
# gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."),
# gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="Hindi", label="From English to Languages X..."),
# ]
#input_textbox = gr.inputs.Textbox(label="test2.wav")
#input=gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)...")
#audio=convert_audio_to_16k_wav(input)
output_textbox = gr.outputs.Textbox(label="The Translated Text is:")
# Create a Gradio interface
iface = gr.Interface(
    fn=run_my_code,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in American/British English Accent)..."),
            gr.inputs.Radio(["Hindi", "Bengali", "Gujarati", "Tamil", "Nepali", "French", "German"], label="Language To be Translated To")],
    outputs=output_textbox,
    title="English Speech To Multilingual Text Translator")
# Launch the interface
iface.launch()