File size: 3,982 Bytes
4f94afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Gradio app that translates a recorded English audio clip into Hindi or French text.
Usage : python <this_script>.py  (takes no command-line arguments; launches the Gradio web interface)
"""



import gradio as gr
import sys
import os
import subprocess
from pydub import AudioSegment
from huggingface_hub import snapshot_download

def install_fairseq(packages=("fairseq", "sentencepiece", "soundfile")):
    """Install the packages the translation pipeline needs, using pip.

    Each package is installed with ``<current interpreter> -m pip install``
    rather than a bare ``pip`` command, so the packages land in the
    environment of the interpreter that is running this script instead of
    whichever ``pip`` executable happens to be first on ``PATH``.

    Args:
        packages: Names of the packages to install, in order. Defaults to
            the three packages this script has always installed.

    Returns:
        A status string: the success message when every install command
        exits with status 0, otherwise a message describing the first
        failing command. Installation stops at the first failure.
    """
    try:
        for package in packages:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"

def convert_audio_to_16k_wav(audio_input):
    """Return the path of a mono, 16 kHz version of ``audio_input``.

    The file is loaded with pydub. If it has more than one channel or a
    frame rate other than 16000, it is down-mixed and/or resampled and
    exported as ``<name>_16k.wav`` in the current working directory, where
    ``<name>`` is the input's base name with a trailing ``.wav`` removed.

    Args:
        audio_input: Path to the source audio file.

    Returns:
        The path of the exported WAV file when a conversion was needed,
        otherwise ``audio_input`` unchanged.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    if not (needs_mono or needs_resample):
        # Nothing was written, so return the path we were given. Returning
        # only the base name (as before) pointed at a file that usually does
        # not exist in the working directory.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    # Strip only a trailing ".wav"; str.replace would also remove the
    # substring from the middle of the name.
    name = os.path.basename(audio_input)
    if name.endswith(".wav"):
        name = name[: -len(".wav")]
    filename = name + "_16k.wav"

    sound.export(filename, format="wav")
    return filename


def run_my_code(input_text, language):
    """Translate an English audio recording into text in ``language``.

    The recording is converted with ``convert_audio_to_16k_wav``, copied to
    ``<data_root>data/tst-COMMON/wav/test.wav``, prepared by running
    ``prep_mustc_data_hindi_single.py``, and decoded with
    ``fairseq-generate``. The staged ``test.wav`` is removed afterwards,
    even when one of the steps fails.

    Args:
        input_text: Path to the recorded audio file (a file path, despite
            the parameter name).
        language: Target language, either ``"Hindi"`` or ``"French"``.

    Returns:
        The third tab-separated field of the first ``fairseq-generate``
        output line that starts with ``"D-0"``, or ``""`` when no such line
        is found.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
        OSError: If the audio file cannot be copied into the data directory.
    """
    # Local import keeps this function self-contained; shutil replaces the
    # shell "cp" call so the file path is never interpreted by a shell.
    import shutil

    # language -> (model checkpoint, fairseq data dir, data-prep root)
    language_settings = {
        "Hindi": ("./models/hindi_model.pt", "./MUSTC_ROOT_hindi/en-hi/", "MUSTC_ROOT_hindi/"),
        "French": ("./models/french_model.pt", "./MUSTC_ROOT_french/en-fr/", "MUSTC_ROOT_french/"),
    }
    if language not in language_settings:
        # Fail early instead of running the pipeline with empty paths.
        raise ValueError(
            f"Unsupported language: {language!r}; expected one of {sorted(language_settings)}"
        )
    model_checkpoint, data_root, d_r = language_settings[language]

    hi_wav = convert_audio_to_16k_wav(input_text)
    staged_wav = f"{data_root}data/tst-COMMON/wav/test.wav"
    shutil.copyfile(hi_wav, staged_wav)

    output_text = ""
    try:
        print("------Starting data prepration...")
        # Run the prep script with the current interpreter so it sees the
        # same installed packages as this process.
        subprocess.run(
            [sys.executable, "prep_mustc_data_hindi_single.py", "--data-root", d_r,
             "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        print("------Performing translation...")
        translation_result = subprocess.run(
            ["fairseq-generate", data_root, "--config-yaml", "config_st.yaml",
             "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text",
             "--path", model_checkpoint, "--max-tokens", "50000", "--beam", "5",
             "--scoring", "sacrebleu"],
            capture_output=True,
            text=True,
        )

        print("\n\n------Translation results are:")
        for line in translation_result.stdout.split("\n"):
            if not line.startswith("D-0"):
                continue
            fields = line.split("\t")
            # Skip a "D-0" line that lacks the text field rather than
            # raising IndexError.
            if len(fields) > 2:
                output_text = fields[2]
                print(output_text)
                break
    finally:
        # Best-effort cleanup, matching the old "rm" call which never
        # aborted the request when removal failed.
        try:
            os.remove(staged_wav)
        except OSError as e:
            print(f"Could not remove {staged_wav}: {e}")

    return output_text

# Install the runtime dependencies before the interface starts, and print the
# returned status so that a failed installation shows up in the logs instead
# of being silently discarded.
print(install_fairseq())

# Output component: a single text box that shows the translated text.
output_textbox = gr.outputs.Textbox(label="Output Text")

# Create a Gradio interface: a microphone recording (passed to run_my_code as
# a file path) plus a radio button that selects the target language.
# NOTE(review): the title mentions only Hindi although French is also offered.
iface = gr.Interface(
        fn=run_my_code,
        inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."), gr.inputs.Radio(["Hindi", "French"], label="Language")],
        outputs=output_textbox,
        title="English to Hindi Translator")

# Launch the interface
iface.launch()