File size: 5,353 Bytes
4f94afb
 
 
 
 
 
 
 
 
 
 
 
8c29416
 
 
 
 
 
 
 
 
 
 
 
 
4f94afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c29416
4f94afb
 
 
8c29416
4f94afb
 
 
 
 
8c29416
4f94afb
 
 
 
8c29416
 
 
 
 
 
832785c
 
 
 
 
ef8b63e
6cc98e6
4f94afb
8c29416
 
 
 
 
 
 
 
 
4f94afb
 
8c29416
4f94afb
 
8c29416
4f94afb
8c29416
4f94afb
 
 
 
8c29416
 
 
 
 
 
4f94afb
8c29416
4f94afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5293768
4f94afb
 
8c29416
4f94afb
 
 
 
832785c
4f94afb
6f9a491
4f94afb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
Gradio app that translates recorded English speech into Hindi, Bengali, French or German text
Usage : python <this_script>.py  (reads no command-line arguments; installs fairseq, then launches the web interface)
"""



import os
import shutil
import subprocess
import sys
import wave

import gradio as gr
import yaml
from pydub import AudioSegment



def get_wav_duration(file_path):
    """Return the length of the WAV file at ``file_path`` in seconds.

    The value is the frame count divided by the frame rate, both as reported
    by the ``wave`` module, and is returned as a float.
    """
    with wave.open(file_path, 'rb') as reader:
        total_frames = reader.getnframes()
        frames_per_second = float(reader.getframerate())
    return total_frames / frames_per_second



def install_fairseq():
    """Install fairseq, sentencepiece and soundfile with pip.

    The packages are installed one after the other; installation stops at
    the first pip command that exits with a non-zero status.

    Returns:
        str: A success message when every install command succeeds,
        otherwise an error message containing the ``CalledProcessError``
        description of the command that failed.
    """
    for package in ("fairseq", "sentencepiece", "soundfile"):
        try:
            # Run pip through the current interpreter so the packages land in
            # the environment this script is running in, and so a missing
            # ``pip`` executable on PATH cannot raise FileNotFoundError.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as e:
            return f"An error occurred while installing fairseq: {str(e)}"
    return "fairseq successfully installed!"

def convert_audio_to_16k_wav(audio_input):
    """Make sure the audio is available as a mono 16 kHz WAV file.

    Args:
        audio_input: Path of an audio file that pydub can open.

    Returns:
        ``audio_input`` itself when the file already has a ``.wav``
        extension, one channel and a 16000 Hz frame rate. Otherwise the name
        of a newly exported ``<stem>_16k.wav`` file, written to the current
        working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    has_wav_extension = audio_input.lower().endswith(".wav")
    if has_wav_extension and sound.channels == 1 and sound.frame_rate == 16000:
        # Return the full path, not just the basename: the caller opens the
        # file again, and it may live outside the working directory.
        return audio_input

    # convert to mono-channel 16k wav
    if sound.channels > 1:
        sound = sound.set_channels(1)
    if sound.frame_rate != 16000:
        sound = sound.set_frame_rate(16000)

    # splitext removes only the final extension, whatever it is, so "a.mp3"
    # becomes "a_16k.wav" and a ".wav" in the middle of a name is kept.
    stem = os.path.splitext(os.path.basename(audio_input))[0]
    filename = f"{stem}_16k.wav"
    sound.export(filename, format="wav")
    return filename


def run_my_code(input_text, language):
    """Translate an English recording into text in the chosen language.

    Steps: convert the recording to mono 16 kHz WAV, write its duration into
    the first entry of the language's ``tst-COMMON.yaml``, stage the audio as
    ``test.wav`` in the data root, run the data-preparation script, run
    ``generate.py`` with the language's checkpoint, and extract the
    translation from its standard output.

    Args:
        input_text: Path of the recorded audio file (despite the name, this
            is a file path, not text).
        language: One of "Hindi", "French", "German" or "Bengali".

    Returns:
        str: The third tab-separated field of the first output line that
        starts with ``D-0``, or an empty string when no such line exists.

    Raises:
        ValueError: If ``language`` is not supported or no audio was given.
    """
    # TODO better argument handling
    # language -> (directory suffix, language-pair folder)
    language_dirs = {
        "Hindi": ("hindi", "en-hi"),
        "French": ("french", "en-fr"),
        "German": ("german", "en-de"),
        "Bengali": ("bengali", "en-bn"),
    }
    if language not in language_dirs:
        # Previously this fell through to open("") and failed obscurely.
        raise ValueError(f"Unsupported target language: {language!r}")
    if not input_text:
        raise ValueError("No audio input was provided")

    name, pair = language_dirs[language]
    model_checkpoint = f"./models/{name}_model.pt"
    d_r = f"MUSTC_ROOT_{name}/"
    data_root = f"./{d_r}{pair}/"
    yam = f"{data_root}data/tst-COMMON/txt/tst-COMMON.yaml"
    staged_wav = f"{data_root}data/tst-COMMON/wav/test.wav"

    hi_wav = convert_audio_to_16k_wav(input_text)
    duration = get_wav_duration(hi_wav)

    # Change the duration in the yaml file according to the audio input.
    with open(yam, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)
    data[0]['duration'] = duration
    with open(yam, 'w') as yaml_file:
        yaml.dump(data, yaml_file)

    # Copy without a shell so paths containing spaces or shell
    # metacharacters are handled safely, and a failed copy is not ignored.
    shutil.copy(hi_wav, staged_wav)

    output_text = ""
    try:
        print("------Starting data prepration------")
        # sys.executable keeps the child scripts on the interpreter that is
        # running this script, which is where pip installed fairseq.
        prep_result = subprocess.run(
            [sys.executable, "prep_mustc_data_hindi_single.py", "--data-root", d_r,
             "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if prep_result.returncode != 0:
            print(f"Data preparation exited with status {prep_result.returncode}")

        print("------Performing translation------")
        translation_result = subprocess.run(
            [sys.executable, "generate.py", data_root, "--config-yaml", "config_st.yaml",
             "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text",
             "--path", model_checkpoint],
            capture_output=True,
            text=True,
        )
        if translation_result.returncode != 0:
            # Surface the failure reason; an empty translation is still returned.
            print(translation_result.stderr)

        print(duration, " seconds duration")

        print("\n\n------Translation results are:\n")
        for line in translation_result.stdout.split("\n"):
            if not line.startswith("D-0"):
                continue
            fields = line.split("\t")
            if len(fields) > 2:
                output_text = fields[2]
                print(output_text)
                break
    finally:
        # Always remove the staged audio, even when a step above fails.
        if os.path.exists(staged_wav):
            os.remove(staged_wav)

    return output_text

# Install fairseq and its companion packages before the interface is built.
# The returned status message is not used.
install_fairseq()

# Output component: a text box that shows the translated sentence.
output_textbox = gr.outputs.Textbox(label="The Translated Text is:")

# Input components: a microphone recording handed to the handler as a file
# path, and a radio selector for the target language.
microphone_input = gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in American/British English Accent)...")
language_selector = gr.inputs.Radio(["Hindi", "Bengali", "French", "German"], label="Language")

# Create a Gradio interface that calls run_my_code(audio_path, language)
iface = gr.Interface(
    fn=run_my_code,
    inputs=[microphone_input, language_selector],
    outputs=output_textbox,
    title="English Speech to Text Translator",
)

# Launch the interface
iface.launch()