Spaces:
Runtime error
Runtime error
Upload 26 files
Browse files- .gitattributes +2 -0
- MUSTC_ROOT_french/en-fr/config_st.yaml +19 -0
- MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.en +8 -0
- MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.fr +8 -0
- MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.yaml +8 -0
- MUSTC_ROOT_french/en-fr/data/tst-COMMON/wav/ted_1096.wav +3 -0
- MUSTC_ROOT_french/en-fr/spm_unigram8000_st.model +3 -0
- MUSTC_ROOT_french/en-fr/spm_unigram8000_st.txt +0 -0
- MUSTC_ROOT_french/en-fr/tst-COMMON_st.tsv +0 -0
- MUSTC_ROOT_hindi/en-hi/config_st.yaml +19 -0
- MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.en +8 -0
- MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.hi +8 -0
- MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.yaml +8 -0
- MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/wav/ted_1096.wav +3 -0
- MUSTC_ROOT_hindi/en-hi/fbank80.zip +3 -0
- MUSTC_ROOT_hindi/en-hi/spm_unigram8000_st.model +3 -0
- MUSTC_ROOT_hindi/en-hi/spm_unigram8000_st.txt +0 -0
- MUSTC_ROOT_hindi/en-hi/tst-COMMON_st.tsv +33 -0
- app.py +107 -0
- data_utils.py +383 -0
- models/french_model.pt +3 -0
- models/hindi_model.pt +3 -0
- prep_mustc_data_hindi_single.py +263 -0
- s2t_en2hi.py +32 -0
- s2t_en2hi_nolog.py +32 -0
- test.wav +0 -0
- test2.wav +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
MUSTC_ROOT_french/en-fr/data/tst-COMMON/wav/ted_1096.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/wav/ted_1096.wav filter=lfs diff=lfs merge=lfs -text
|
MUSTC_ROOT_french/en-fr/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: /media/lab202/BALARAM_HDD/MUSTC_v1.0_en-fr/en-fr/spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.en
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Back in New York, I am the head of development for a non-profit called Robin Hood.
|
2 |
+
When I'm not fighting poverty, I'm fighting fires as the assistant captain of a volunteer fire company.
|
3 |
+
Now in our town, where the volunteers supplement a highly skilled career staff, you have to get to the fire scene pretty early to get in on any action.
|
4 |
+
I remember my first fire.
|
5 |
+
I was the second volunteer on the scene, so there was a pretty good chance I was going to get in.
|
6 |
+
But still it was a real footrace against the other volunteers to get to the captain in charge to find out what our assignments would be.
|
7 |
+
When I found the captain, he was having a very engaging conversation with the homeowner, who was surely having one of the worst days of her life.
|
8 |
+
Here it was, the middle of the night, she was standing outside in the pouring rain, under an umbrella, in her pajamas, barefoot, while her house was in flames.
|
MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.fr
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
A New York, je suis responsable du développment pour un organisme à but non lucratif appelé Robin Hood.
|
2 |
+
Quand je ne suis pas en train de combattre la pauvreté, je combat des incendies en tant qu'assistant capitaine d'une compagnie de pompiers volontaires.
|
3 |
+
Et dans notre ville, où les volontaires viennent renforcer une équipe professionnelle hautement qualifiée, il faut arriver sur le lieu de l'incendie très tôt pour prendre part à l'action.
|
4 |
+
Je me souviens de mon premier incendie.
|
5 |
+
J'étais le deuxième volontaire sur les lieux, et donc j'avais de bonnes chances d'y aller.
|
6 |
+
Mais pourtant c'était une vrai course à pied contre les autres volontaires pour arriver jusqu'au capitaine responsable pour découvrir ce que seraient nos missions.
|
7 |
+
Quand j'ai trouvé le capitaine, il était en pleine conversation avec la propriétaire, qui était surement en train de vivre la pire journée de sa vie.
|
8 |
+
C'était en pleine nuit, elle était là dehors sous la pluie battante, sous un parapluie, en pyjama, pieds nus, pendant que sa maison était en flammes.
|
MUSTC_ROOT_french/en-fr/data/tst-COMMON/txt/tst-COMMON.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
- {duration: 5.0, offset: 0.0, rW: 17, uW: 0, speaker_id: spk.1096, wav: test.wav}
|
2 |
+
- {duration: 5.160000, offset: 20.290000, rW: 17, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
3 |
+
- {duration: 8.110000, offset: 25.930000, rW: 29, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
4 |
+
- {duration: 1.560000, offset: 34.920000, rW: 5, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
5 |
+
- {duration: 4.180000, offset: 36.730000, rW: 21, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
6 |
+
- {duration: 5.580000, offset: 41.880000, rW: 26, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
7 |
+
- {duration: 8.610001, offset: 48.309999, rW: 27, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
8 |
+
- {duration: 9.680000, offset: 57.510000, rW: 29, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
MUSTC_ROOT_french/en-fr/data/tst-COMMON/wav/ted_1096.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69a122c3ad89320ec24cad84b622a01f26c3138b3e5869dc033e65bd0ab73fe1
|
3 |
+
size 8990102
|
MUSTC_ROOT_french/en-fr/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0ee92c9ab8210fb21647509df3f65ae36527a3659005b05491da04008939098
|
3 |
+
size 381797
|
MUSTC_ROOT_french/en-fr/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
MUSTC_ROOT_french/en-fr/tst-COMMON_st.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
MUSTC_ROOT_hindi/en-hi/config_st.yaml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bpe_tokenizer:
|
2 |
+
bpe: sentencepiece
|
3 |
+
sentencepiece_model: ./spm_unigram8000_st.model
|
4 |
+
input_channels: 1
|
5 |
+
input_feat_per_channel: 80
|
6 |
+
specaugment:
|
7 |
+
freq_mask_F: 27
|
8 |
+
freq_mask_N: 1
|
9 |
+
time_mask_N: 1
|
10 |
+
time_mask_T: 100
|
11 |
+
time_mask_p: 1.0
|
12 |
+
time_wrap_W: 0
|
13 |
+
transforms:
|
14 |
+
'*':
|
15 |
+
- utterance_cmvn
|
16 |
+
_train:
|
17 |
+
- utterance_cmvn
|
18 |
+
- specaugment
|
19 |
+
vocab_filename: spm_unigram8000_st.txt
|
MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.en
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Back in New York, I am the head of development for a non-profit called Robin Hood.
|
2 |
+
When I'm not fighting poverty, I'm fighting fires as the assistant captain of a volunteer fire company.
|
3 |
+
Now in our town, where the volunteers supplement a highly skilled career staff, you have to get to the fire scene pretty early to get in on any action.
|
4 |
+
I remember my first fire.
|
5 |
+
I was the second volunteer on the scene, so there was a pretty good chance I was going to get in.
|
6 |
+
But still it was a real footrace against the other volunteers to get to the captain in charge to find out what our assignments would be.
|
7 |
+
When I found the captain, he was having a very engaging conversation with the homeowner, who was surely having one of the worst days of her life.
|
8 |
+
Here it was, the middle of the night, she was standing outside in the pouring rain, under an umbrella, in her pajamas, barefoot, while her house was in flames.
|
MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.hi
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
न्यूयॉर्क में वापस, मैं रॉबिन हुड नामक एक गैर-लाभकारी संस्था के विकास का प्रमुख हूं।
|
2 |
+
जब मैं गरीबी से नहीं लड़ रहा हूं, तो मैं स्वयंसेवी फायर कंपनी के सहायक कप्तान के रूप में आग से लड़ रहा हूं।
|
3 |
+
अब हमारे शहर में, जहां स्वयंसेवक एक अत्यधिक कुशल कैरियर स्टाफ के पूरक हैं, आपको किसी भी कार्रवाई में शामिल होने के लिए आग के दृश्य पर बहुत जल्दी पहुंचना होगा।
|
4 |
+
मुझे अपनी पहली आग याद है।
|
5 |
+
मैं इस दृश्य पर दूसरा स्वयंसेवक था, इसलिए मेरे अंदर आने का एक अच्छा मौका था।
|
6 |
+
लेकिन फिर भी यह अन्य स्वयंसेवकों के खिलाफ एक वास्तविक पदयात्रा थी जो प्रभारी कप्तान के पास यह पता लगाने के लिए थी कि हमारा कार्य क्या होगा।
|
7 |
+
जब मैंने कप्तान को पाया, तो वह गृहस्वामी के साथ बहुत ही आकर्षक बातचीत कर रहा था, जो निश्चित रूप से उसके जीवन के सबसे बुरे दिनों में से एक था।
|
8 |
+
यहाँ यह आधी रात थी, वह बारिश में बाहर, एक छतरी के नीचे, अपने पजामे में, नंगे पाँव खड़ी थी, जबकि उसका घर आग की लपटों में था।
|
MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/txt/tst-COMMON.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
- {duration: 5.0, offset: 0.0, rW: 17, uW: 0, speaker_id: spk.1096, wav: test.wav}
|
2 |
+
- {duration: 5.160000, offset: 20.290000, rW: 17, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
3 |
+
- {duration: 8.110000, offset: 25.930000, rW: 29, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
4 |
+
- {duration: 1.560000, offset: 34.920000, rW: 5, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
5 |
+
- {duration: 4.180000, offset: 36.730000, rW: 21, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
6 |
+
- {duration: 5.580000, offset: 41.880000, rW: 26, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
7 |
+
- {duration: 8.610001, offset: 48.309999, rW: 27, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
8 |
+
- {duration: 9.680000, offset: 57.510000, rW: 29, uW: 0, speaker_id: spk.1096, wav: ted_1096.wav}
|
MUSTC_ROOT_hindi/en-hi/data/tst-COMMON/wav/ted_1096.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69a122c3ad89320ec24cad84b622a01f26c3138b3e5869dc033e65bd0ab73fe1
|
3 |
+
size 8990102
|
MUSTC_ROOT_hindi/en-hi/fbank80.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0bef03a45d7514d5018c4de30d352c736359248e6e8d70d586796aa32b30f4e2
|
3 |
+
size 5242360
|
MUSTC_ROOT_hindi/en-hi/spm_unigram8000_st.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf7b26c17db61dcd76400fbb74c5395d5f13837ed0fd5fa1098930de4f2a8202
|
3 |
+
size 449800
|
MUSTC_ROOT_hindi/en-hi/spm_unigram8000_st.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
MUSTC_ROOT_hindi/en-hi/tst-COMMON_st.tsv
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id audio n_frames tgt_text speaker
|
2 |
+
test_0 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3674535:136768 427 न्यूयॉर्क में वापस, मैं रॉबिन हुड नामक एक गैर-लाभकारी संस्था के विकास का प्रमुख हूं। spk.1096
|
3 |
+
ted_1096_0 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:44:44928 140 कप्तान ने मुझे लहराया। spk.1096
|
4 |
+
ted_1096_1 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:296095:272128 850 उन्होंने कहा, "बेज़ोस, मैं चाहता हूं कि आप घर में जाएं। मैं चाहता हूं कि आप ऊपर जाएं, आग को पार करें, और मैं चाहता हूं कि आप इस महिला को एक जोड़ी जूते दिलवाएं।" spk.1096
|
5 |
+
ted_1096_2 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4231705:6208 19 (हँसी) spk.1096
|
6 |
+
ted_1096_3 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:5032741:14848 46 कसम है। spk.1096
|
7 |
+
ted_1096_4 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:1725316:447168 1397 तो, ठीक वैसा नहीं जैसा मैं उम्मीद कर रहा था, लेकिन मैं चला गया - सीढ़ियों से ऊपर, हॉल के नीचे, 'असली' अग्निशामकों के पीछे, जो इस बिंदु पर आग बुझाने के लिए बहुत कुछ कर चुके थे, मास्टर बेडरूम में spk.1096
|
8 |
+
ted_1096_5 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3811347:152768 477 अब मुझे पता है कि तुम क्या सोच रहे हो, लेकिन मैं हीरो नहीं हूं। spk.1096
|
9 |
+
ted_1096_6 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4237957:194048 606 मैं अपना पेलोड वापस नीचे की ओर ले गया जहाँ मैं अपने दास और कीमती कुत्ते से सामने के दरवाजे से मिला। spk.1096
|
10 |
+
ted_1096_7 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:725413:236928 740 हम अपने खजानों को बाहर गृहस्वामी के पास ले गए, जहां आश्चर्य की बात नहीं कि मेरे खजानों की तुलना में उनका अधिक ध्यान गया। spk.1096
|
11 |
+
ted_1096_8 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:5047633:192768 602 कुछ सप्ताह बाद, विभाग को गृहस्वामी की ओर से एक पत्र प्राप्त हुआ जिसमें उन्होंने उसके घर को बचाने के लिए किए गए साहसिक प्रयास के लिए हमें धन्यवाद दिया। spk.1096
|
12 |
+
ted_1096_9 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:1184318:188928 590 दयालुता का कार्य उसने अन्य सभी से ऊपर देखा: किसी ने उसे एक जोड़ी जूते भी दिलवाए थे। spk.1096
|
13 |
+
ted_1096_10 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:2172529:425408 1329 रॉबिन हुड में मेरे व्यवसाय और स्वयंसेवी फायर फाइटर के रूप में मेरे व्यवसाय दोनों में, मैं एक बड़े पैमाने पर उदारता और दयालुता के कृत्यों का साक्षी हूं, लेकिन मैं व्यक्तिगत आधार पर अनुग्रह और साहस के कार्यों का भी गवाह हूं। spk.1096
|
14 |
+
ted_1096_11 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3267448:28608 89 और आप जानते हैं कि मैंने क्या सीखा है? spk.1096
|
15 |
+
ted_1096_12 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:701561:23808 74 वे सब मायने रखते हैं। spk.1096
|
16 |
+
ted_1096_13 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:2597982:338688 1058 इसलिए जब मैं इस कमरे के चारों ओर ऐसे लोगों को देखता हूं, जिन्होंने या तो सफलता के उल्लेखनीय स्तर हासिल किए हैं, या हासिल करने के रास्ते पर हैं, तो मैं यह याद दिलाता हूं: प्रतीक्षा न करें। spk.1096
|
17 |
+
ted_1096_14 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4432050:164608 514 जब मैं गरीबी से नहीं लड़ रहा हूं, तो मैं स्वयंसेवी फायर कंपनी के सहायक कप्तान के रूप में आग से लड़ रहा हूं। spk.1096
|
18 |
+
ted_1096_15 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:198963:97088 303 किसी के जीवन में बदलाव लाने के लिए अपना पहला मिलियन बनाने तक प्रतीक्षा न करें। spk.1096
|
19 |
+
ted_1096_16 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3964160:89408 279 अगर आपके पास देने के लिए कुछ है, तो अभी दे दो। spk.1096
|
20 |
+
ted_1096_17 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:1373291:92928 290 सूप किचन में खाना परोसें। spk.1096
|
21 |
+
ted_1096_18 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4596703:24448 76 एक संरक्षक बनें। spk.1096
|
22 |
+
ted_1096_19 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:1466264:259008 809 अब हमारे शहर में, जहां स्वयंसेवक एक अत्यधिक कुशल कैरियर स्टाफ के पूरक हैं, आपको किसी भी कार्रवाई में शामिल होने के लिए आग के दृश्य पर बहुत जल्दी पहुंचना होगा। spk.1096
|
23 |
+
ted_1096_20 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:45017:49408 154 मुझे अपनी पहली आग याद है। spk.1096
|
24 |
+
ted_1096_21 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:568268:133248 416 मैं इस दृश्य पर दूसरा स्वयंसेवक था, इसलिए मेरे अंदर आने का एक अच्छा मौका था। spk.1096
|
25 |
+
ted_1096_22 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4053613:178048 556 लेकिन फिर भी यह अन्य स्वयंसेवकों के खिलाफ एक वास्तविक पदयात्रा थी जो प्रभारी कप्तान के पास यह पता लगाने के लिए थी कि हमारा कार्य क्या होगा। spk.1096
|
26 |
+
ted_1096_23 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4757689:275008 859 जब मैंने कप्तान को पाया, तो वह गृहस्वामी के साथ बहुत ही आकर्षक बातचीत कर रहा था, जो निश्चित रूप से उसके जीवन के सबसे बुरे दिनों में से एक था। spk.1096
|
27 |
+
ted_1096_24 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3365247:309248 966 यहाँ यह आधी रात थी, वह बारिश में बाहर, एक छतरी के नीचे, अपने पजामे में, नंगे पाँव खड़ी थी, जबकि उसका घर आग की लपटों में था। spk.1096
|
28 |
+
ted_1096_25 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:4621196:136448 426 दूसरा स्वयंसेवक जो मुझसे ठीक पहले आया था -- चलो उसे लेक्स लूथर कहते हैं -- spk.1096
|
29 |
+
ted_1096_26 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3296101:3968 12 (हँसी) spk.1096
|
30 |
+
ted_1096_27 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:962386:221888 693 पहले कप्तान के पास गया और उसे अंदर जाकर गृहस्वामी के कुत्ते को बचाने के लिए कहा गया। spk.1096
|
31 |
+
ted_1096_28 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:94470:104448 326 कुत्ता! spk.1096
|
32 |
+
ted_1096_29 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:2936715:330688 1033 यहाँ कोई वकील या मनी मैनेजर था, जो अपने पूरे जीवन के लिए लोगों को बताता है कि वह एक जलती हुई इमारत में एक जीवित प्राणी को बचाने के लिए गया था, सिर्फ इसलिए कि उसने मुझे पाँच सेकंड से पीटा। spk.1096
|
33 |
+
ted_1096_30 /home/deepakprasad/nlp_code/fairseq_mustc_single_inference/MUSTC_ROOT/en-hi/fbank80.zip:3300114:65088 203 खैर, मैं अगला था। spk.1096
|
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Script to translate given single english audio file to corresponding hindi text
|
3 |
+
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path>
|
4 |
+
"""
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
import subprocess
|
12 |
+
from pydub import AudioSegment
|
13 |
+
from huggingface_hub import snapshot_download
|
14 |
+
|
15 |
+
def install_fairseq():
    """Install the runtime dependencies (fairseq, sentencepiece, soundfile) via pip.

    Returns:
        A human-readable status string; never raises — errors from pip are
        captured and reported in the returned message.
    """
    packages = ["fairseq", "sentencepiece", "soundfile"]
    try:
        for package in packages:
            # Use the current interpreter's pip (python -m pip) so the packages
            # land in the environment this script is actually running under;
            # a bare "pip" on PATH may belong to a different interpreter.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return "fairseq successfully installed!"
    except subprocess.CalledProcessError as e:
        return f"An error occurred while installing fairseq: {str(e)}"
|
24 |
+
|
25 |
+
def convert_audio_to_16k_wav(audio_input):
    """Convert an arbitrary audio file to mono-channel 16 kHz WAV if needed.

    Args:
        audio_input: path to the source audio file (any format pydub can read).

    Returns:
        The basename of the audio file to use downstream: the original
        basename when it is already mono 16 kHz, otherwise "<name>_16k.wav"
        written to the current working directory.
    """
    sound = AudioSegment.from_file(audio_input)
    sample_rate = sound.frame_rate
    num_channels = sound.channels
    filename = audio_input.split("/")[-1]
    print("original file is at:", audio_input)
    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
        if num_channels > 1:
            sound = sound.set_channels(1)  # downmix to mono
        if sample_rate != 16000:
            sound = sound.set_frame_rate(16000)  # resample to 16 kHz
        filename = filename.replace(".wav", "") + "_16k.wav"
        # BUG FIX: export to the computed filename. The previous code exported
        # to a hard-coded placeholder path, so the returned filename pointed
        # at a file that was never actually written.
        sound.export(filename, format="wav")
    return filename
|
41 |
+
|
42 |
+
|
43 |
+
def run_my_code(input_text, language):
    """Translate a single English audio file to text in `language`.

    Args:
        input_text: path to the recorded English audio file.
        language: target language, either "Hindi" or "French" (the values
            offered by the Gradio radio widget).

    Returns:
        The first decoded hypothesis from fairseq-generate, or "" when no
        hypothesis line ("D-0...") is found in the output.

    Pipeline: stage the (16 kHz mono) audio into the MuST-C style data
    directory, run feature preparation, run fairseq-generate with the
    language-specific checkpoint, then clean up the staged wav.
    """
    import shutil

    # TODO better argument handling
    audio = convert_audio_to_16k_wav(input_text)

    data_root = ""
    model_checkpoint = ""
    d_r = ""
    if language == "Hindi":
        model_checkpoint = "./models/hindi_model.pt"
        data_root = "./MUSTC_ROOT_hindi/en-hi/"
        d_r = "MUSTC_ROOT_hindi/"
    elif language == "French":
        model_checkpoint = "./models/french_model.pt"
        data_root = "./MUSTC_ROOT_french/en-fr/"
        d_r = "MUSTC_ROOT_french/"

    staged_wav = f"{data_root}data/tst-COMMON/wav/test.wav"
    # shutil.copy instead of os.system("cp ..."): no shell is involved, so
    # filenames containing spaces or shell metacharacters are handled safely.
    shutil.copy(audio, staged_wav)

    print("------Starting data preparation...")
    subprocess.run(
        ["python", "prep_mustc_data_hindi_single.py", "--data-root", d_r,
         "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
    )

    print("------Performing translation...")
    translation_result = subprocess.run(
        ["fairseq-generate", data_root, "--config-yaml", "config_st.yaml",
         "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text",
         "--path", model_checkpoint, "--max-tokens", "50000",
         "--beam", "5", "--scoring", "sacrebleu"],
        capture_output=True, text=True,
    )

    output_text = ""
    print("\n\n------Translation results are:")
    for line in translation_result.stdout.split("\n"):
        # fairseq-generate prefixes detokenized hypotheses with "D-<id>\t<score>\t<text>";
        # there is only one utterance here, so take the first D-0 line.
        if line.startswith("D-0"):
            print(line.split("\t")[2])
            output_text = line.split("\t")[2]
            break

    os.remove(staged_wav)  # remove the staged wav so the next request starts clean
    return output_text
|
85 |
+
|
86 |
+
# Install runtime dependencies at import time: this module is the Space's
# entry point, so nothing else gets a chance to install them first.
install_fairseq()

# Define the input and output interfaces for Gradio
# NOTE(review): the commented-out blocks below are earlier experiments with
# the input widgets, kept for reference.
#inputs = [
#        gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."),
#        gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="Hindi", label="From English to Languages X..."),
#        ]

#input_textbox = gr.inputs.Textbox(label="test2.wav")
#input=gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)...")
#audio=convert_audio_to_16k_wav(input)

# Single output widget: the translated sentence as plain text.
output_textbox = gr.outputs.Textbox(label="Output Text")

# Create a Gradio interface wiring the microphone recorder and the language
# radio button into run_my_code.  (Uses the legacy gr.inputs / gr.outputs API
# of Gradio 3.x.)
iface = gr.Interface(
    fn=run_my_code,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."), gr.inputs.Radio(["Hindi", "French"], label="Language")],
    outputs=output_textbox,
    title="English to Hindi Translator")

# Launch the interface
iface.launch()
|
data_utils.py
ADDED
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import csv
|
7 |
+
from pathlib import Path
|
8 |
+
import zipfile
|
9 |
+
from functools import reduce
|
10 |
+
from multiprocessing import cpu_count
|
11 |
+
from typing import Any, Dict, List, Optional, Union
|
12 |
+
import io
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import sentencepiece as sp
|
17 |
+
from fairseq.data.audio.audio_utils import (
|
18 |
+
convert_waveform, _get_kaldi_fbank, _get_torchaudio_fbank, is_npy_data,
|
19 |
+
is_sf_audio_data
|
20 |
+
)
|
21 |
+
import torch
|
22 |
+
import soundfile as sf
|
23 |
+
from tqdm import tqdm
|
24 |
+
|
25 |
+
|
26 |
+
UNK_TOKEN, UNK_TOKEN_ID = "<unk>", 3
|
27 |
+
BOS_TOKEN, BOS_TOKEN_ID = "<s>", 0
|
28 |
+
EOS_TOKEN, EOS_TOKEN_ID = "</s>", 2
|
29 |
+
PAD_TOKEN, PAD_TOKEN_ID = "<pad>", 1
|
30 |
+
|
31 |
+
|
32 |
+
def gen_vocab(
    input_path: Path, output_path_prefix: Path, model_type="bpe",
    vocab_size=1000, special_symbols: Optional[List[str]] = None
):
    """Train a SentencePiece model and export a fairseq-style dictionary.

    Trains on the text in ``input_path`` (writing ``<prefix>.model`` /
    ``<prefix>.vocab`` via SentencePiece), then dumps the learned pieces —
    minus the four reserved special tokens — to ``<prefix>.txt`` in fairseq
    dictionary format ("<piece> 1" per line, in id order).
    """
    prefix = output_path_prefix.as_posix()
    # Train SentencePiece Model
    train_opts = {
        "input": input_path.as_posix(),
        "model_prefix": prefix,
        "model_type": model_type,
        "vocab_size": vocab_size,
        "character_coverage": "1.0",
        "num_threads": cpu_count(),
        "unk_id": UNK_TOKEN_ID,
        "bos_id": BOS_TOKEN_ID,
        "eos_id": EOS_TOKEN_ID,
        "pad_id": PAD_TOKEN_ID,
    }
    if special_symbols is not None:
        train_opts["user_defined_symbols"] = ",".join(special_symbols)
    sp.SentencePieceTrainer.Train(
        " ".join(f"--{key}={val}" for key, val in train_opts.items())
    )

    # Load the trained model back and sanity-check that the reserved token
    # ids came out where we pinned them.
    spm = sp.SentencePieceProcessor()
    spm.Load(prefix + ".model")
    id_to_piece = {i: spm.IdToPiece(i) for i in range(spm.GetPieceSize())}
    assert (
        id_to_piece.get(UNK_TOKEN_ID) == UNK_TOKEN
        and id_to_piece.get(PAD_TOKEN_ID) == PAD_TOKEN
        and id_to_piece.get(BOS_TOKEN_ID) == BOS_TOKEN
        and id_to_piece.get(EOS_TOKEN_ID) == EOS_TOKEN
    )

    # Export fairseq dictionary: fairseq adds its own specials, so the
    # reserved tokens are excluded here.
    reserved = {UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, PAD_TOKEN}
    with open(prefix + ".txt", "w") as f_out:
        for idx in sorted(id_to_piece):
            piece = id_to_piece[idx]
            if piece not in reserved:
                f_out.write(f"{piece} 1\n")
|
71 |
+
|
72 |
+
|
73 |
+
def extract_fbank_features(
    waveform: torch.FloatTensor,
    sample_rate: int,
    output_path: Optional[Path] = None,
    n_mel_bins: int = 80,
    overwrite: bool = False,
):
    """Compute log-mel filterbank features for one waveform.

    Tries the pyKaldi backend first and falls back to torchaudio.  When
    ``output_path`` is given the features are also saved as ``.npy``; an
    existing file short-circuits the computation unless ``overwrite`` is set.

    Returns the feature matrix, or None when the cached-file short-circuit
    was taken.
    """
    if output_path is not None and output_path.is_file() and not overwrite:
        return  # already extracted on a previous run

    mono_waveform, _ = convert_waveform(waveform, sample_rate, to_mono=True)
    # Kaldi compliance: scale to 16-bit signed integer range before fbank.
    samples = (mono_waveform * (2 ** 15)).numpy()

    features = _get_kaldi_fbank(samples, sample_rate, n_mel_bins)
    if features is None:
        features = _get_torchaudio_fbank(samples, sample_rate, n_mel_bins)
    if features is None:
        raise ImportError(
            "Please install pyKaldi or torchaudio to enable fbank feature extraction"
        )

    if output_path is not None:
        np.save(output_path.as_posix(), features)
    return features
|
99 |
+
|
100 |
+
|
101 |
+
def create_zip(data_root: Path, zip_path: Path):
    """Bundle every .npy / .flac file directly under ``data_root`` into a zip.

    ZIP_STORED (no compression) keeps each member's bytes contiguous in the
    archive, so entries can later be read back by raw offset/length.
    """
    members = list(data_root.glob("*.npy")) + list(data_root.glob("*.flac"))
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as archive:
        for member in tqdm(members):
            archive.write(member, arcname=member.name)
|
107 |
+
|
108 |
+
|
109 |
+
def get_zip_manifest(
    zip_path: Path, zip_root: Optional[Path] = None, is_audio=False
):
    """Build per-utterance byte-range and length manifests for a feature zip.

    Each member must be stored uncompressed (see create_zip), so the bytes at
    the computed offset are the file content itself.

    Args:
        zip_path: archive path, relative to ``zip_root`` when that is given.
        zip_root: optional root directory prepended to ``zip_path``.
        is_audio: True for audio members (length = soundfile frame count),
            False for .npy feature members (length = first-axis size).

    Returns:
        (paths, lengths): ``paths[utt_id] = "<zip>:<offset>:<size>"`` and
        ``lengths[utt_id]`` as described above.
    """
    _zip_path = Path.joinpath(zip_root or Path(""), zip_path)
    with zipfile.ZipFile(_zip_path, mode="r") as f:
        info = f.infolist()
    paths, lengths = {}, {}
    # PERF FIX: open the archive once and seek per entry; the previous
    # version re-opened the file inside the loop for every member.
    with open(_zip_path, "rb") as f:
        for i in tqdm(info):
            utt_id = Path(i.filename).stem
            # 30 bytes of fixed local-file-header plus the filename gives the
            # start of the stored payload.  NOTE(review): assumes no extra
            # field in the local header — holds for zips written by
            # create_zip; confirm before using with other writers.
            offset, file_size = i.header_offset + 30 + len(i.filename), i.file_size
            paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}"
            f.seek(offset)
            byte_data = f.read(file_size)
            assert len(byte_data) > 1
            if is_audio:
                assert is_sf_audio_data(byte_data), i
                lengths[utt_id] = sf.info(io.BytesIO(byte_data)).frames
            else:
                assert is_npy_data(byte_data), i
                lengths[utt_id] = np.load(io.BytesIO(byte_data)).shape[0]
    return paths, lengths
|
134 |
+
|
135 |
+
|
136 |
+
def gen_config_yaml(
    manifest_root: Path,
    spm_filename: Optional[str] = None,
    vocab_name: Optional[str] = None,
    yaml_filename: str = "config.yaml",
    specaugment_policy: Optional[str] = "lb",
    prepend_tgt_lang_tag: bool = False,
    sampling_alpha: Optional[float] = None,
    input_channels: Optional[int] = 1,
    input_feat_per_channel: Optional[int] = 80,
    audio_root: str = "",
    cmvn_type: str = "utterance",
    gcmvn_path: Optional[Path] = None,
    extra=None
):
    """Write a fairseq speech-to-text data config YAML under ``manifest_root``.

    At least one of ``spm_filename`` / ``vocab_name`` must be given; when
    ``vocab_name`` is None it is derived from the SentencePiece model name
    (".model" -> ".txt").  ``cmvn_type`` must be "utterance" or "global";
    "global" additionally requires ``gcmvn_path``.  ``extra`` is an optional
    mapping of additional keys passed straight to the writer.

    NOTE(review): S2TDataConfigWriter is defined elsewhere in this file; the
    config is only written out by the final writer.flush() call.
    """
    manifest_root = manifest_root.absolute()
    writer = S2TDataConfigWriter(manifest_root / yaml_filename)
    assert spm_filename is not None or vocab_name is not None
    vocab_name = spm_filename.replace(".model", ".txt") if vocab_name is None \
        else vocab_name
    writer.set_vocab_filename(vocab_name)
    if input_channels is not None:
        writer.set_input_channels(input_channels)
    if input_feat_per_channel is not None:
        writer.set_input_feat_per_channel(input_feat_per_channel)
    # Map the short policy name to the corresponding SpecAugment preset;
    # an unknown/None policy silently writes no specaugment section.
    specaugment_setters = {
        "lb": writer.set_specaugment_lb_policy,
        "ld": writer.set_specaugment_ld_policy,
        "sm": writer.set_specaugment_sm_policy,
        "ss": writer.set_specaugment_ss_policy,
    }
    specaugment_setter = specaugment_setters.get(specaugment_policy, None)
    if specaugment_setter is not None:
        specaugment_setter()
    if spm_filename is not None:
        # The tokenizer model path is written as an absolute path under
        # manifest_root (manifest_root was made absolute above).
        writer.set_bpe_tokenizer(
            {
                "bpe": "sentencepiece",
                "sentencepiece_model": (manifest_root / spm_filename).as_posix(),
            }
        )
    if prepend_tgt_lang_tag:
        writer.set_prepend_tgt_lang_tag(True)
    if sampling_alpha is not None:
        writer.set_sampling_alpha(sampling_alpha)

    if cmvn_type not in ["global", "utterance"]:
        raise NotImplementedError

    # Training splits get CMVN + SpecAugment; all other splits ("*") get
    # CMVN only.
    if specaugment_policy is not None:
        writer.set_feature_transforms(
            "_train", [f"{cmvn_type}_cmvn", "specaugment"]
        )
    writer.set_feature_transforms("*", [f"{cmvn_type}_cmvn"])

    if cmvn_type == "global":
        if gcmvn_path is None:
            raise ValueError("Please provide path of global cmvn file.")
        else:
            writer.set_global_cmvn(gcmvn_path.as_posix())

    if len(audio_root) > 0:
        writer.set_audio_root(audio_root)

    if extra is not None:
        writer.set_extra(extra)
    writer.flush()
|
203 |
+
|
204 |
+
|
205 |
+
def load_df_from_tsv(path: Union[str, Path]) -> pd.DataFrame:
    """Read a tab-separated manifest file into a :class:`pandas.DataFrame`.

    The tab is the only delimiter: quoting is disabled and backslash is the
    escape character, mirroring how ``save_df_to_tsv`` writes these files.
    NA filtering is turned off so empty fields survive as empty strings
    rather than becoming NaN.
    """
    tsv_path = path if isinstance(path, str) else path.as_posix()
    read_options = {
        "sep": "\t",
        "header": 0,
        "encoding": "utf-8",
        "escapechar": "\\",
        "quoting": csv.QUOTE_NONE,
        "na_filter": False,
    }
    return pd.read_csv(tsv_path, **read_options)
|
216 |
+
|
217 |
+
|
218 |
+
def save_df_to_tsv(dataframe, path: Union[str, Path]):
    """Write *dataframe* to *path* as a tab-separated file.

    Mirrors ``load_df_from_tsv``: no quoting, backslash escaping, UTF-8,
    a header row, and no index column.
    """
    out_path = path if isinstance(path, str) else path.as_posix()
    write_options = {
        "sep": "\t",
        "header": True,
        "index": False,
        "encoding": "utf-8",
        "escapechar": "\\",
        "quoting": csv.QUOTE_NONE,
    }
    dataframe.to_csv(out_path, **write_options)
|
229 |
+
|
230 |
+
|
231 |
+
def load_tsv_to_dicts(path: Union[str, Path]) -> List[dict]:
    """Load a TSV file as a list of dicts, one per row, keyed by the header.

    Quoting is fully disabled so tabs are the only structural characters,
    consistent with the other TSV helpers in this module.
    """
    with open(path, "r") as fin:
        tsv_reader = csv.DictReader(
            fin,
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        return [dict(row) for row in tsv_reader]
|
243 |
+
|
244 |
+
|
245 |
+
def filter_manifest_df(
    df, is_train_split=False, extra_filters=None, min_n_frames=5, max_n_frames=3000
):
    """Drop manifest rows that are unusable for training/evaluation.

    Rows are removed when the audio path is empty, the utterance is shorter
    than *min_n_frames*, or the target text is empty. For training splits,
    utterances longer than *max_n_frames* are dropped as well. Callers can
    supply *extra_filters* as a mapping of name -> boolean mask to extend
    the criteria. A one-line summary of what was filtered is printed.
    """
    filters = {
        "no speech": df["audio"] == "",
        f"short speech (<{min_n_frames} frames)": df["n_frames"] < min_n_frames,
        "empty sentence": df["tgt_text"] == "",
    }
    if is_train_split:
        filters[f"long speech (>{max_n_frames} frames)"] = df["n_frames"] > max_n_frames
    if extra_filters is not None:
        filters.update(extra_filters)

    # OR all masks together without functools.reduce.
    invalid = None
    for mask in filters.values():
        invalid = mask if invalid is None else (invalid | mask)
    keep = ~invalid

    per_filter = ", ".join(f"{n}: {f.sum()}" for n, f in filters.items())
    print(
        "| "
        + per_filter
        + f", total {invalid.sum()} filtered, {keep.sum()} remained."
    )
    return df[keep]
|
265 |
+
|
266 |
+
|
267 |
+
def cal_gcmvn_stats(features_list):
    """Compute global CMVN statistics over a list of feature arrays.

    Args:
        features_list: sequence of 2-D arrays (frames x feature-dim) that
            are concatenated along the frame axis.

    Returns:
        dict with float32 ``"mean"`` and ``"std"`` vectors (one value per
        feature dimension). The variance is computed as E[x^2] - E[x]^2 and
        clamped at 1e-8 before the square root to avoid NaNs from tiny
        negative values caused by floating-point cancellation.
    """
    features = np.concatenate(features_list)
    square_sums = (features ** 2).sum(axis=0)
    mean = features.mean(axis=0)
    # BUG FIX: the original also did `features = np.subtract(features, mean)`
    # here, but that result was never used (the variance below is derived
    # from `square_sums`), so the dead O(n*d) pass is removed.
    var = square_sums / features.shape[0] - mean ** 2
    std = np.sqrt(np.maximum(var, 1e-8))
    return {"mean": mean.astype("float32"), "std": std.astype("float32")}
|
275 |
+
|
276 |
+
|
277 |
+
class S2TDataConfigWriter(object):
    """Builder for the speech-to-text data-config YAML used by fairseq.

    Settings are accumulated into an in-memory dict via the ``set_*``
    methods and serialized to ``yaml_path`` by :meth:`flush`.
    """

    DEFAULT_VOCAB_FILENAME = "dict.txt"
    DEFAULT_INPUT_FEAT_PER_CHANNEL = 80
    DEFAULT_INPUT_CHANNELS = 1

    def __init__(self, yaml_path: Path):
        try:
            import yaml
        except ImportError:
            print("Please install PyYAML for S2T data config YAML files")
            # BUG FIX: the original fell through here and then crashed with
            # an unrelated NameError on `self.yaml = yaml`; re-raise so the
            # caller sees the real cause (missing PyYAML).
            raise
        self.yaml = yaml
        self.yaml_path = yaml_path
        self.config = {}

    def flush(self):
        """Serialize the accumulated config dict to ``yaml_path``."""
        with open(self.yaml_path, "w") as f:
            self.yaml.dump(self.config, f)

    def set_audio_root(self, audio_root=""):
        """Set the root directory that relative audio paths resolve against."""
        self.config["audio_root"] = audio_root

    def set_vocab_filename(self, vocab_filename: str = "dict.txt"):
        """Set the vocabulary (dictionary) file name."""
        self.config["vocab_filename"] = vocab_filename

    def set_specaugment(
        self,
        time_wrap_w: int,
        freq_mask_n: int,
        freq_mask_f: int,
        time_mask_n: int,
        time_mask_t: int,
        time_mask_p: float,
    ):
        """Set raw SpecAugment hyperparameters (W, N/F masks, p)."""
        self.config["specaugment"] = {
            "time_wrap_W": time_wrap_w,
            "freq_mask_N": freq_mask_n,
            "freq_mask_F": freq_mask_f,
            "time_mask_N": time_mask_n,
            "time_mask_T": time_mask_t,
            "time_mask_p": time_mask_p,
        }

    def set_specaugment_lb_policy(self):
        """SpecAugment "LibriSpeech basic" preset."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=1,
            freq_mask_f=27,
            time_mask_n=1,
            time_mask_t=100,
            time_mask_p=1.0,
        )

    def set_specaugment_ld_policy(self):
        """SpecAugment "LibriSpeech double" preset."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=27,
            time_mask_n=2,
            time_mask_t=100,
            time_mask_p=1.0,
        )

    def set_specaugment_sm_policy(self):
        """SpecAugment "Switchboard mild" preset."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=15,
            time_mask_n=2,
            time_mask_t=70,
            time_mask_p=0.2,
        )

    def set_specaugment_ss_policy(self):
        """SpecAugment "Switchboard strong" preset."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=27,
            time_mask_n=2,
            time_mask_t=70,
            time_mask_p=0.2,
        )

    def set_input_channels(self, input_channels: int = 1):
        """Set the number of input audio channels."""
        self.config["input_channels"] = input_channels

    def set_input_feat_per_channel(self, input_feat_per_channel: int = 80):
        """Set the per-channel feature dimension (e.g. 80 mel bins)."""
        self.config["input_feat_per_channel"] = input_feat_per_channel

    def set_bpe_tokenizer(self, bpe_tokenizer: Dict[str, Any]):
        """Set the subword tokenizer spec (e.g. sentencepiece model path)."""
        self.config["bpe_tokenizer"] = bpe_tokenizer

    def set_global_cmvn(self, stats_npz_path: str):
        """Point the config at a precomputed global CMVN stats .npz file."""
        self.config["global_cmvn"] = {"stats_npz_path": stats_npz_path}

    def set_feature_transforms(self, split: str, transforms: List[str]):
        """Assign a list of feature transforms to a dataset split pattern."""
        if "transforms" not in self.config:
            self.config["transforms"] = {}
        self.config["transforms"][split] = transforms

    def set_prepend_tgt_lang_tag(self, flag: bool = True):
        """Toggle prepending a target-language tag token to targets."""
        self.config["prepend_tgt_lang_tag"] = flag

    def set_sampling_alpha(self, sampling_alpha: float = 1.0):
        """Set the temperature-based dataset sampling alpha."""
        self.config["sampling_alpha"] = sampling_alpha

    def set_extra(self, data):
        """Merge arbitrary extra key/value pairs into the config."""
        self.config.update(data)
|
models/french_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10c940349cedf8dd3611e7d585cd36b544f9d7a379328147b96d057292dab359
|
3 |
+
size 373015859
|
models/hindi_model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47e8bfef22034ac859da3a2726b142876793113cf18ac18bb6f6eb85415a7893
|
3 |
+
size 373227272
|
prep_mustc_data_hindi_single.py
ADDED
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the MIT license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import argparse
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
from pathlib import Path
|
11 |
+
import shutil
|
12 |
+
from itertools import groupby
|
13 |
+
from tempfile import NamedTemporaryFile
|
14 |
+
from typing import Tuple
|
15 |
+
|
16 |
+
import numpy as np
|
17 |
+
import pandas as pd
|
18 |
+
import soundfile as sf
|
19 |
+
from examples.speech_to_text.data_utils import (
|
20 |
+
create_zip,
|
21 |
+
extract_fbank_features,
|
22 |
+
filter_manifest_df,
|
23 |
+
gen_config_yaml,
|
24 |
+
gen_vocab,
|
25 |
+
get_zip_manifest,
|
26 |
+
load_df_from_tsv,
|
27 |
+
save_df_to_tsv,
|
28 |
+
cal_gcmvn_stats,
|
29 |
+
)
|
30 |
+
import torch
|
31 |
+
from torch.utils.data import Dataset
|
32 |
+
from tqdm import tqdm
|
33 |
+
|
34 |
+
from fairseq.data.audio.audio_utils import get_waveform, convert_waveform
|
35 |
+
|
36 |
+
|
37 |
+
log = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
|
40 |
+
MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]
|
41 |
+
|
42 |
+
|
43 |
+
class MUSTC(Dataset):
    """
    Create a Dataset for MuST-C. Each item is a tuple of the form:
    waveform, sample_rate, source utterance, target utterance, speaker_id,
    utterance_id
    """

    # This trimmed variant serves only the tst-COMMON split (single-file
    # inference); the upstream MuST-C script also handles train/dev.
    SPLITS = ["tst-COMMON"]
    # Supported target languages; "hi" is an addition to the upstream list.
    LANGUAGES = ["de", "es", "fr", "it", "nl", "pt", "ro", "ru", "hi"]

    def __init__(self, root: str, lang: str, split: str) -> None:
        assert split in self.SPLITS and lang in self.LANGUAGES
        # Expected layout: <root>/en-<lang>/data/<split>/{wav,txt}
        _root = Path(root) / f"en-{lang}" / "data" / split
        wav_root, txt_root = _root / "wav", _root / "txt"
        #print(_root, wav_root, txt_root)
        assert _root.is_dir() and wav_root.is_dir() and txt_root.is_dir()
        # Load audio segments
        try:
            import yaml
        except ImportError:
            # NOTE(review): on missing PyYAML this prints a hint but then
            # fails below with a NameError on `yaml` — consider re-raising.
            print("Please install PyYAML to load the MuST-C YAML files")
        with open(txt_root / f"{split}.yaml") as f:
            # BaseLoader keeps all scalars (offset/duration) as strings;
            # they are converted with float() further down.
            segments = yaml.load(f, Loader=yaml.BaseLoader)
        # Load source and target utterances
        for _lang in ["en", lang]:
            with open(txt_root / f"{split}.{_lang}") as f:
                utterances = [r.strip() for r in f]
            print(len(segments), len(utterances))
            # One transcript/translation line per YAML segment, in order.
            assert len(segments) == len(utterances)
            for i, u in enumerate(utterances):
                segments[i][_lang] = u
        # Gather info
        self.data = []
        # Group by source wav so the sample rate is probed only once per
        # file; groupby assumes each wav's segments are contiguous in YAML.
        for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]):
            wav_path = wav_root / wav_filename
            sample_rate = sf.info(wav_path.as_posix()).samplerate
            # NOTE(review): offsets are strings under BaseLoader, so this
            # sort is lexicographic, not numeric — confirm that is intended.
            seg_group = sorted(_seg_group, key=lambda x: x["offset"])
            for i, segment in enumerate(seg_group):
                # Convert second-based offset/duration into sample counts.
                offset = int(float(segment["offset"]) * sample_rate)
                n_frames = int(float(segment["duration"]) * sample_rate)
                _id = f"{wav_path.stem}_{i}"
                self.data.append(
                    (
                        wav_path.as_posix(),
                        offset,
                        n_frames,
                        sample_rate,
                        segment["en"],
                        segment[lang],
                        segment["speaker_id"],
                        _id,
                    )
                )

    def __getitem__(
        self, n: int
    ) -> Tuple[torch.Tensor, int, str, str, str, str]:
        # Lazily load just this segment's samples from the source wav.
        wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, \
            utt_id = self.data[n]
        waveform, _ = get_waveform(wav_path, frames=n_frames, start=offset)
        waveform = torch.from_numpy(waveform)
        return waveform, sr, src_utt, tgt_utt, spk_id, utt_id

    def __len__(self) -> int:
        # Number of (segment) examples, not number of wav files.
        return len(self.data)
|
108 |
+
|
109 |
+
|
110 |
+
def process(args):
    """Prepare each available en-<lang> pair: extract features (or convert
    audio), pack them into a ZIP, and emit one TSV manifest per split."""
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        # Language pairs that were not downloaded are skipped silently
        # (aside from this message) rather than treated as errors.
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                # Raw-audio mode: resample to mono 16 kHz FLAC per utterance.
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _wavform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _wavform.T.numpy(), tgt_sample_rate
                    )
            else:
                # Feature mode: one .npy of log-mel filterbanks per utterance.
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                # NOTE(review): SPLITS here only contains "tst-COMMON", so
                # these split == 'train' branches never fire in this variant.
                if split == 'train' and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")

                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
                    if split == 'train' and args.cmvn_type == "global":
                        # Cap the number of feature matrices kept in memory
                        # for global CMVN estimation.
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)

                if split == 'train' and args.cmvn_type == "global":
                    # Estimate and save cmv
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        # Maps utt_id -> in-zip path and utt_id -> length, used below.
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            # Re-instantiate the dataset to iterate text/speaker metadata.
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                # ASR targets are the English transcript; ST targets are
                # the translation.
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                # NOTE(review): train_text is collected but never used in
                # this trimmed script (upstream used it for vocab training).
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Clean up
        # Remove the unzipped per-utterance files; the ZIP is kept.
        shutil.rmtree(audio_root)
|
187 |
+
|
188 |
+
|
189 |
+
def process_joint(args):
    """Build one joint vocabulary/config over all language pairs and symlink
    each pair's manifests under the data root."""
    cur_root = Path(args.data_root)
    # NOTE(review): message says "8 languages" but MUSTC.LANGUAGES lists 9
    # (hi was appended) — the message is stale.
    assert all(
        (cur_root / f"en-{lang}").is_dir() for lang in MUSTC.LANGUAGES
    ), "do not have downloaded data available for all 8 languages"
    # Generate vocab
    # char vocabs don't encode a size in the sentencepiece filename.
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        # Concatenate every pair's training targets into one temp corpus
        # for joint sentencepiece training.
        for lang in MUSTC.LANGUAGES:
            tsv_path = cur_root / f"en-{lang}" / f"train_{args.task}.tsv"
            df = load_df_from_tsv(tsv_path)
            for t in df["tgt_text"]:
                f.write(t + "\n")
        special_symbols = None
        if args.task == 'st':
            # One <lang:xx> tag per target language for multilingual ST.
            special_symbols = [f'<lang:{lang}>' for lang in MUSTC.LANGUAGES]
        gen_vocab(
            Path(f.name),
            cur_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
            special_symbols=special_symbols
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename=spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="ld",
        prepend_tgt_lang_tag=(args.task == "st"),
    )
    # Make symbolic links to manifests
    for lang in MUSTC.LANGUAGES:
        for split in MUSTC.SPLITS:
            src_path = cur_root / f"en-{lang}" / f"{split}_{args.task}.tsv"
            desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv"
            if not desc_path.is_symlink():
                os.symlink(src_path, desc_path)
|
228 |
+
|
229 |
+
|
230 |
+
def main():
    """Parse CLI arguments and dispatch to joint or per-language processing."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", "-d", required=True, type=str)
    # NOTE(review): the argument is required, so `default` can never apply;
    # kept for interface compatibility.
    # BUG FIX: the original call ended with a stray trailing comma, turning
    # the statement into a useless one-tuple expression.
    parser.add_argument(
        "--vocab-type",
        default="unigram",
        required=True,
        type=str,
        choices=["bpe", "unigram", "char"],
    )
    parser.add_argument("--vocab-size", default=8000, type=int)
    parser.add_argument("--task", type=str, choices=["asr", "st"])
    parser.add_argument("--joint", action="store_true", help="")
    parser.add_argument(
        "--cmvn-type", default="utterance",
        choices=["global", "utterance"],
        help="The type of cepstral mean and variance normalization"
    )
    parser.add_argument(
        "--gcmvn-max-num", default=150000, type=int,
        help="Maximum number of sentences to use to estimate global mean and "
             "variance"
    )
    parser.add_argument("--use-audio-input", action="store_true")
    args = parser.parse_args()

    # --joint builds one shared vocab/config across all language pairs;
    # otherwise each available pair is processed independently.
    if args.joint:
        process_joint(args)
    else:
        process(args)


if __name__ == "__main__":
    main()
|
s2t_en2hi.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Script to translate given single english audio file to corresponding hindi text
|
3 |
+
|
4 |
+
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path>
|
5 |
+
"""
|
6 |
+
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import subprocess
|
10 |
+
|
11 |
+
# TODO better argument handling
|
12 |
+
hi_wav = sys.argv[1]
|
13 |
+
en2hi_model_checkpoint = sys.argv[2]
|
14 |
+
|
15 |
+
os.system(f"cp {hi_wav} ./MUSTC_ROOT/en-hi/data/tst-COMMON/wav/test.wav")
|
16 |
+
|
17 |
+
print("------Starting data prepration...")
|
18 |
+
subprocess.run(["python", "prep_mustc_data_hindi_single.py", "--data-root", "MUSTC_ROOT/", "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"], stdout=subprocess.DEVNULL)
|
19 |
+
|
20 |
+
print("------Performing translation...")
|
21 |
+
translation_result = subprocess.run(["fairseq-generate", "./MUSTC_ROOT/en-hi/", "--config-yaml", "config_st.yaml", "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text", "--path", sys.argv[2], "--max-tokens", "50000", "--beam", "5", "--scoring", "sacrebleu"], capture_output=True, text=True)
|
22 |
+
translation_result_text = translation_result.stdout
|
23 |
+
print(translation_result.std)
|
24 |
+
lines = translation_result_text.split("\n")
|
25 |
+
|
26 |
+
print("\n\n------Translation results are:")
|
27 |
+
for i in lines:
|
28 |
+
if (i.startswith("D-0")):
|
29 |
+
print(i)
|
30 |
+
break
|
31 |
+
|
32 |
+
os.system("rm ./MUSTC_ROOT/en-hi/data/tst-COMMON/wav/test.wav")
|
s2t_en2hi_nolog.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Script to translate given single english audio file to corresponding hindi text
|
3 |
+
|
4 |
+
Usage : python s2t_en2hi.py <audio_file_path> <averaged_checkpoints_file_path>
|
5 |
+
"""
|
6 |
+
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import subprocess
|
10 |
+
|
11 |
+
# TODO better argument handling
|
12 |
+
hi_wav = sys.argv[1]
|
13 |
+
en2hi_model_checkpoint = sys.argv[2]
|
14 |
+
|
15 |
+
os.system(f"cp {hi_wav} ./MUSTC_ROOT/en-hi/data/tst-COMMON/wav/test.wav")
|
16 |
+
|
17 |
+
print("------Starting data prepration...")
|
18 |
+
subprocess.run(["python", "prep_mustc_data_hindi_single.py", "--data-root", "MUSTC_ROOT/", "--task", "st", "--vocab-type", "unigram", "--vocab-size", "8000"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
19 |
+
|
20 |
+
print("------Performing translation...")
|
21 |
+
translation_result = subprocess.run(["fairseq-generate", "./MUSTC_ROOT/en-hi/", "--config-yaml", "config_st.yaml", "--gen-subset", "tst-COMMON_st", "--task", "speech_to_text", "--path", sys.argv[2], "--max-tokens", "50000", "--beam", "5", "--scoring", "sacrebleu"], capture_output=True, text=True)
|
22 |
+
translation_result_text = translation_result.stdout
|
23 |
+
print(translation_result.std)
|
24 |
+
lines = translation_result_text.split("\n")
|
25 |
+
|
26 |
+
print("\n\n------Translation results are:")
|
27 |
+
for i in lines:
|
28 |
+
if (i.startswith("D-0")):
|
29 |
+
print(i.split("\t")[2])
|
30 |
+
break
|
31 |
+
|
32 |
+
os.system("rm ./MUSTC_ROOT/en-hi/data/tst-COMMON/wav/test.wav")
|
test.wav
ADDED
Binary file (141 kB). View file
|
|
test2.wav
ADDED
Binary file (126 kB). View file
|
|