balaramas committed on
Commit
dc58133
1 Parent(s): e4a78f5

Upload 28 files

Browse files
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app that translates a recorded English audio clip into Hindi or Gujarati text
3
+ Usage : python app.py
4
+ """
5
+
6
+
7
+
8
+ import gradio as gr
9
+ import sys
10
+ import os
11
+ import subprocess
12
+ from pydub import AudioSegment
13
+ from huggingface_hub import snapshot_download
14
+
15
def install_fairseq():
    """Install the packages the translation pipeline depends on.

    Installs ``fairseq``, ``sentencepiece`` and ``soundfile`` with pip, one
    after the other, stopping at the first failure.

    Returns:
        str: A success message when every package installed, otherwise a
        message describing the pip failure.
    """
    packages = ("fairseq", "sentencepiece", "soundfile")
    try:
        for package in packages:
            # Run pip through the interpreter executing this app so the
            # packages land in the environment the app imports from, and so
            # a missing ``pip`` executable on PATH cannot break the install.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", package]
            )
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
24
+
25
def convert_audio_to_16k_wav(audio_input):
    """Return the path of a mono, 16 kHz WAV version of ``audio_input``.

    The audio is loaded with ``AudioSegment.from_file``. If it has more than
    one channel, a frame rate other than 16000, or a file extension other
    than ``.wav``, a converted copy named ``<stem>_16k.wav`` is exported to
    the current working directory and that file name is returned.

    Args:
        audio_input: Path of the audio file to normalise.

    Returns:
        str: ``audio_input`` itself when it is already a mono 16 kHz ``.wav``
        file, otherwise the file name of the exported copy.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    stem, extension = os.path.splitext(os.path.basename(audio_input))
    needs_mono = sound.channels > 1
    needs_resample = sound.frame_rate != 16000
    needs_wav_container = extension.lower() != ".wav"

    if not (needs_mono or needs_resample or needs_wav_container):
        # Nothing is exported in this case, so hand back the original path;
        # the bare file name would not exist in the working directory.
        return audio_input

    if needs_mono:
        sound = sound.set_channels(1)
    if needs_resample:
        sound = sound.set_frame_rate(16000)

    # Only the real extension is stripped, so a ".wav" occurring elsewhere in
    # the name is kept and non-WAV inputs do not yield "name.mp3_16k.wav".
    filename = f"{stem}_16k.wav"
    sound.export(filename, format="wav")
    return filename
41
+
42
+
43
# Per-language fairseq assets: (model checkpoint, data root). The data root is
# the positional argument given to fairseq-interactive.
# NOTE(review): Hindi is paired with ./lang/hi so the checkpoint and the
# vocabulary directory belong to the same language - confirm against how
# hi_m.pt was trained.
_LANGUAGE_ASSETS = {
    "Hindi": ("./models/hi_m.pt", "./lang/hi"),
    # "Gujrati" is the label the UI sends; the standard spelling is accepted
    # as well.
    "Gujrati": ("./models/gj_m.pt", "./lang/gj"),
    "Gujarati": ("./models/gj_m.pt", "./lang/gj"),
}

# File whose single line names the audio to translate; passed via --input.
_INPUT_LIST_FILE = "input.txt"


def _extract_translation(fairseq_stdout):
    """Return the third tab-separated field of the first ``D-0`` line, or ""."""
    for line in fairseq_stdout.split("\n"):
        if not line.startswith("D-0"):
            continue
        fields = line.split("\t")
        if len(fields) > 2:
            return fields[2]
    return ""


def run_my_code(input_text, language):
    """Translate a recorded audio file into text in the selected language.

    The audio is normalised with ``convert_audio_to_16k_wav``, its path is
    written to ``input.txt``, and ``fairseq-interactive`` is run on that file
    with the checkpoint and data root registered for ``language``. The
    translation is read from the first ``D-0`` line of the command's standard
    output.

    Args:
        input_text: Path of the audio file to translate.
        language: Target language label, e.g. ``"Hindi"`` or ``"Gujrati"``.

    Returns:
        str: The translated text, or an empty string when there is no audio,
        the language is not supported, or no translation line was produced.
    """
    assets = _LANGUAGE_ASSETS.get(language)
    if assets is None:
        # Without a checkpoint and data root fairseq can only fail, so stop
        # before doing any conversion work.
        print(f"Unsupported target language: {language!r}")
        return ""
    if not input_text:
        print("No audio input was provided.")
        return ""
    model_checkpoint, data_root = assets

    wav_path = convert_audio_to_16k_wav(input_text)

    # Close the file before launching fairseq so the path is flushed to disk
    # and is actually visible to the subprocess.
    with open(_INPUT_LIST_FILE, "w") as input_list:
        input_list.write(wav_path)

    print("------Performing translation...")
    try:
        translation_result = subprocess.run(
            [
                "fairseq-interactive", data_root,
                "--config-yaml", "config_st.yaml",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
                "--max-tokens", "50000",
                "--beam", "5",
                "--input", _INPUT_LIST_FILE,
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # Leave the input list empty for the next request, even on failure.
        with open(_INPUT_LIST_FILE, "w"):
            pass

    if translation_result.returncode != 0:
        # Surface the failure reason instead of silently returning nothing.
        print("fairseq-interactive failed:", translation_result.stderr)

    output_text = _extract_translation(translation_result.stdout)
    print("\n\n------Translation results are:")
    if output_text:
        print(output_text)
    return output_text
86
+
87
# Install fairseq and its companion packages before the interface is built.
install_fairseq()

# Text box that shows the value returned by the translation callback.
output_textbox = gr.outputs.Textbox(label="Output Text")

# Build the interface: a microphone recording (handed to the callback as a
# file path) plus a target-language selector feed run_my_code, whose result
# goes to the output text box.
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            label="Record something (in American English accent)",
        ),
        gr.inputs.Radio(["Hindi", "Gujrati"], label="Language"),
    ],
    outputs=output_textbox,
    title="English to Indic Language Translator",
)

# Start serving the interface.
iface.launch()
input.txt.txt ADDED
File without changes
lang/bn/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/bn/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:892dd0398e561af3bd035798ff1682f9a35c2736378e041922a46e111c3d7a72
3
+ size 467219
lang/bn/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/gj/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/gj/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af23c356de872a60a32cfd6eacd7d5313934d7252b1b4ccc011bfc6992c2e904
3
+ size 454913
lang/gj/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/hi/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/hi/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf7b26c17db61dcd76400fbb74c5395d5f13837ed0fd5fa1098930de4f2a8202
3
+ size 449800
lang/hi/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/mt/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/mt/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ba6aa66df08e7b5614deadfe3fc08d5473dc7dcf672d15134ce0e4db6dd99e1
3
+ size 458987
lang/mt/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/ne/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/ne/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b431e41320a5738c0af5368d23c5071a71899c897887f06a22f2efc087dd80
3
+ size 459775
lang/ne/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
lang/tm/config_st.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bpe_tokenizer:
2
+ bpe: sentencepiece
3
+ sentencepiece_model: ./spm_unigram8000_st.model
4
+ input_channels: 1
5
+ input_feat_per_channel: 80
6
+ specaugment:
7
+ freq_mask_F: 27
8
+ freq_mask_N: 1
9
+ time_mask_N: 1
10
+ time_mask_T: 100
11
+ time_mask_p: 1.0
12
+ time_wrap_W: 0
13
+ transforms:
14
+ '*':
15
+ - utterance_cmvn
16
+ _train:
17
+ - utterance_cmvn
18
+ - specaugment
19
+ vocab_filename: spm_unigram8000_st.txt
lang/tm/spm_unigram8000_st.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8111dca119a0b896f1a2f371fbe60682b804cf1e0f99281dd4cf410ea9e8bd29
3
+ size 500276
lang/tm/spm_unigram8000_st.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/bn_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97a2b6d13570a7296bb8530ff4a97306c643dddfa8abff9197df53d20cd8b735
3
+ size 373237256
models/de_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbd8f22a2b8d90dc24ba9d4fc84df3c3b0bcf711366ac93bef27e0fe2deaa6cd
3
+ size 373237192
models/fr_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34199f96a7194ed36bdde18cf9137df39fff82f725e57923627909c369d75433
3
+ size 373237448
models/gj_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a7d7a21002847ec8e16dd1737b35495574e266fdf39aecfa9bb9126d8444a62
3
+ size 373237448
models/hi_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47e8bfef22034ac859da3a2726b142876793113cf18ac18bb6f6eb85415a7893
3
+ size 373227272
models/mt_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b300d014121494e5583ac83df275038b7a5728e25c25caf2d0a566f482f33a6
3
+ size 373237192
models/ne_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ac42610702980cf090b41356e4b525ac1999b147c0564d8a45605b571b3018
3
+ size 373237192
models/tm_m.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:487b409230de732f76fad7bb40581490ff207054b13dcabf8cd52d6ed1334668
3
+ size 373237448