Upload 28 files
Browse files- app.py +108 -0
- input.txt.txt +0 -0
- lang/bn/config_st.yaml +19 -0
- lang/bn/spm_unigram8000_st.model +3 -0
- lang/bn/spm_unigram8000_st.txt +0 -0
- lang/gj/config_st.yaml +19 -0
- lang/gj/spm_unigram8000_st.model +3 -0
- lang/gj/spm_unigram8000_st.txt +0 -0
- lang/hi/config_st.yaml +19 -0
- lang/hi/spm_unigram8000_st.model +3 -0
- lang/hi/spm_unigram8000_st.txt +0 -0
- lang/mt/config_st.yaml +19 -0
- lang/mt/spm_unigram8000_st.model +3 -0
- lang/mt/spm_unigram8000_st.txt +0 -0
- lang/ne/config_st.yaml +19 -0
- lang/ne/spm_unigram8000_st.model +3 -0
- lang/ne/spm_unigram8000_st.txt +0 -0
- lang/tm/config_st.yaml +19 -0
- lang/tm/spm_unigram8000_st.model +3 -0
- lang/tm/spm_unigram8000_st.txt +0 -0
- models/bn_m.pt +3 -0
- models/de_m.pt +3 -0
- models/fr_m.pt +3 -0
- models/gj_m.pt +3 -0
- models/hi_m.pt +3 -0
- models/mt_m.pt +3 -0
- models/ne_m.pt +3 -0
- models/tm_m.pt +3 -0
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gradio app that translates a recorded English audio clip into text in a selected Indic language
|
3 |
+
Usage : python app.py
|
4 |
+
"""
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
import subprocess
|
12 |
+
from pydub import AudioSegment
|
13 |
+
from huggingface_hub import snapshot_download
|
14 |
+
|
15 |
+
def install_fairseq():
    """Install the packages the translation pipeline needs, using pip.

    Installs ``fairseq``, ``sentencepiece`` and ``soundfile`` one after the
    other and stops at the first package whose installation fails.

    Returns:
        A status string: a success message when every package installed,
        otherwise a message that embeds the pip failure.
    """
    try:
        for package in ("fairseq", "sentencepiece", "soundfile"):
            # Invoke pip through the running interpreter so the packages are
            # installed into this app's environment rather than into whatever
            # "pip" executable happens to be first on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
|
24 |
+
|
25 |
+
def convert_audio_to_16k_wav(audio_input):
    """Write a mono, 16 kHz WAV copy of an audio file and return its path.

    Args:
        audio_input: Path of the audio file to load with
            ``AudioSegment.from_file``.

    Returns:
        Path of the exported file, ``<input stem>_16k.wav``, relative to the
        current working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    print("original file is at:", audio_input)

    # Down-mix / resample only when the source does not already match.
    if sound.channels > 1:
        sound = sound.set_channels(1)
    if sound.frame_rate != 16000:
        sound = sound.set_frame_rate(16000)

    # Derive the output name from the input's stem; splitext only strips the
    # real extension, unlike a blanket str.replace(".wav", "").
    stem, _ = os.path.splitext(os.path.basename(audio_input))
    filename = f"{stem}_16k.wav"

    # Always export, so the returned path exists even when the input was
    # already mono 16 kHz and lives in another directory.
    sound.export(filename, format="wav")
    return filename
|
41 |
+
|
42 |
+
|
43 |
+
def run_my_code(input_text, language):
    """Translate a recorded audio file into text in the selected language.

    The audio is normalised with ``convert_audio_to_16k_wav``, its path is
    written to ``input.txt``, and ``fairseq-interactive`` is run on that file
    with the checkpoint and data root of the chosen language.

    Args:
        input_text: Path of the recorded audio file (a file path, despite the
            parameter name).
        language: Target language label, ``"Hindi"`` or ``"Gujrati"``.

    Returns:
        The hypothesis text of the first ``D-0`` line printed by
        ``fairseq-interactive``, or ``""`` when the language is unsupported or
        no such line was produced.
    """
    # Target language label -> (model checkpoint, fairseq data root).
    # Each checkpoint must be paired with its own language's data root.
    language_assets = {
        "Hindi": ("./models/hi_m.pt", "./lang/hi"),
        "Gujrati": ("./models/gj_m.pt", "./lang/gj"),
    }
    if language not in language_assets:
        # Without a checkpoint there is nothing to run; keep returning an
        # empty translation instead of invoking fairseq with empty paths.
        print("unsupported language:", language)
        return ""
    model_checkpoint, data_root = language_assets[language]

    hi_wav = convert_audio_to_16k_wav(input_text)

    # Close the file before launching the subprocess so the path is actually
    # on disk when fairseq-interactive reads it.
    with open('input.txt', 'w') as f:
        f.write(hi_wav)

    print("------Performing translation...")
    try:
        translation_result = subprocess.run(
            [
                "fairseq-interactive", data_root,
                "--config-yaml", "config_st.yaml",
                "--task", "speech_to_text",
                "--path", model_checkpoint,
                "--max-tokens", "50000",
                "--beam", "5",
                "--input", "input.txt",
            ],
            capture_output=True,
            text=True,
        )
    finally:
        # Reset the request file whether or not the subprocess could start.
        with open('input.txt', 'w') as f:
            f.write("")

    if translation_result.returncode != 0:
        # Surface decoder failures in the logs; the caller still gets "".
        print("fairseq-interactive failed:", translation_result.stderr)

    output_text = ""
    print("\n\n------Translation results are:")
    for line in translation_result.stdout.split("\n"):
        if line.startswith("D-0"):
            fields = line.split("\t")
            # The hypothesis text is the third tab-separated field.
            if len(fields) > 2:
                output_text = fields[2]
                print(output_text)
                break

    return output_text
|
86 |
+
|
87 |
+
# Install the fairseq toolchain before the UI starts and log the outcome, so a
# failed installation is visible instead of being silently discarded.
print(install_fairseq())

# NOTE(review): gr.inputs / gr.outputs are legacy Gradio namespaces -- confirm
# the Gradio version this app runs on still provides them.
output_textbox = gr.outputs.Textbox(label="Output Text")

# Create a Gradio interface: microphone recording + language choice in,
# translated text out.
iface = gr.Interface(
    fn=run_my_code,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in American English accent)"),
        gr.inputs.Radio(["Hindi", "Gujrati"], label="Language"),
    ],
    outputs=output_textbox,
    title="English to Indic Language Translator")

# Launch the interface
iface.launch()
|
input.txt.txt
ADDED
File without changes
|
lang/bn/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/bn/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:892dd0398e561af3bd035798ff1682f9a35c2736378e041922a46e111c3d7a72
|
3 |
+
size 467219
|
lang/bn/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lang/gj/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/gj/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af23c356de872a60a32cfd6eacd7d5313934d7252b1b4ccc011bfc6992c2e904
|
3 |
+
size 454913
|
lang/gj/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lang/hi/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/hi/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf7b26c17db61dcd76400fbb74c5395d5f13837ed0fd5fa1098930de4f2a8202
|
3 |
+
size 449800
|
lang/hi/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lang/mt/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/mt/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ba6aa66df08e7b5614deadfe3fc08d5473dc7dcf672d15134ce0e4db6dd99e1
|
3 |
+
size 458987
|
lang/mt/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lang/ne/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/ne/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9b431e41320a5738c0af5368d23c5071a71899c897887f06a22f2efc087dd80
|
3 |
+
size 459775
|
lang/ne/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lang/tm/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
lang/tm/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8111dca119a0b896f1a2f371fbe60682b804cf1e0f99281dd4cf410ea9e8bd29
|
3 |
+
size 500276
|
lang/tm/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/bn_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:97a2b6d13570a7296bb8530ff4a97306c643dddfa8abff9197df53d20cd8b735
|
3 |
+
size 373237256
|
models/de_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dbd8f22a2b8d90dc24ba9d4fc84df3c3b0bcf711366ac93bef27e0fe2deaa6cd
|
3 |
+
size 373237192
|
models/fr_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34199f96a7194ed36bdde18cf9137df39fff82f725e57923627909c369d75433
|
3 |
+
size 373237448
|
models/gj_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a7d7a21002847ec8e16dd1737b35495574e266fdf39aecfa9bb9126d8444a62
|
3 |
+
size 373237448
|
models/hi_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47e8bfef22034ac859da3a2726b142876793113cf18ac18bb6f6eb85415a7893
|
3 |
+
size 373227272
|
models/mt_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5b300d014121494e5583ac83df275038b7a5728e25c25caf2d0a566f482f33a6
|
3 |
+
size 373237192
|
models/ne_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8ac42610702980cf090b41356e4b525ac1999b147c0564d8a45605b571b3018
|
3 |
+
size 373237192
|
models/tm_m.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:487b409230de732f76fad7bb40581490ff207054b13dcabf8cd52d6ed1334668
|
3 |
+
size 373237448
|