linhqyy commited on
Commit
6c7896d
·
1 Parent(s): a327fa8

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: vi
3
+ datasets:
4
+ - youtube-vi-13k-hours
5
+ tags:
6
+ - speech
7
+ license: cc-by-nc-4.0
8
+ ---
9
+
10
+ # Vietnamese Self-Supervised Learning Wav2Vec2 model
11
+
12
+ ## Model
13
+
14
+ We use the wav2vec2 architecture for self-supervised learning.
15
+
16
+ <img src="https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/wav2vec2.png" width=75% height=75%>
17
+
18
+ ## Data
19
+
20
+ Our self-supervised model is pre-trained on a massive audio set of 13k hours of Vietnamese YouTube audio, which includes:
21
+ - Clean audio
22
+ - Noisy audio
23
+ - Conversation
24
+ - Multi-gender and dialects
25
+
26
+
27
+ ## Download
28
+
29
+ We have already uploaded our pre-trained models to the Hugging Face Hub. The base model was trained for 35 epochs and the large model for 20 epochs, in about 30 days using a TPU v3-8.
30
+
31
+ - [Base version](https://huggingface.co/nguyenvulebinh/wav2vec2-base-vi) ~ 95M params
32
+ - [Large version](https://huggingface.co/nguyenvulebinh/wav2vec2-large-vi) ~ 317M params
33
+
34
+ ## Usage
35
+
36
+ ```python
37
+ from transformers import Wav2Vec2ForPreTraining, Wav2Vec2Processor
38
+
39
+ model_name = 'nguyenvulebinh/wav2vec2-base-vi'
40
+ # model_name = 'nguyenvulebinh/wav2vec2-large-vi'
41
+
42
+ model = Wav2Vec2ForPreTraining.from_pretrained(model_name)
43
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
44
+
45
+ ```
46
+
47
+ Since our model has the same architecture as the English wav2vec2 version, you can use [this notebook](https://colab.research.google.com/drive/1FjTsqbYKphl9kL-eILgUc-bl4zVThL8F?usp=sharing) for more information on how to fine-tune the model.
48
+
49
+ ## Finetuned version
50
+
51
+ ### VLSP 2020 ASR dataset
52
+
53
+ Benchmark WER results on the VLSP T1 test set:
54
+
55
+ | | [base model](https://huggingface.co/nguyenvulebinh/wav2vec2-base-vi-vlsp2020) | [large model](https://huggingface.co/nguyenvulebinh/wav2vec2-large-vi-vlsp2020) |
56
+ |---|---|---|
57
+ |without LM| 8.66 | 6.90 |
58
+ |with 5-gram LM| 6.53 | 5.32 |
59
+
60
+ Usage
61
+
62
+ ```python
63
+ #pytorch
64
+ #!pip install transformers==4.20.0
65
+ #!pip install https://github.com/kpu/kenlm/archive/master.zip
66
+ #!pip install pyctcdecode==0.4.0
67
+ from transformers.file_utils import cached_path, hf_bucket_url
68
+ from importlib.machinery import SourceFileLoader
69
+ from transformers import Wav2Vec2ProcessorWithLM
70
+ from IPython.lib.display import Audio
71
+ import torchaudio
72
+ import torch
73
+
74
+ # Load model & processor
75
+ model_name = "nguyenvulebinh/wav2vec2-base-vi-vlsp2020"
76
+ # model_name = "nguyenvulebinh/wav2vec2-large-vi-vlsp2020"
77
+ model = SourceFileLoader("model", cached_path(hf_bucket_url(model_name,filename="model_handling.py"))).load_module().Wav2Vec2ForCTC.from_pretrained(model_name)
78
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
79
+
80
+ # Load an example audio (16k)
81
+ audio, sample_rate = torchaudio.load(cached_path(hf_bucket_url(model_name, filename="t2_0000006682.wav")))
82
+ input_data = processor.feature_extractor(audio[0], sampling_rate=16000, return_tensors='pt')
83
+
84
+ # Infer
85
+ output = model(**input_data)
86
+
87
+ # Output transcript without LM
88
+ print(processor.tokenizer.decode(output.logits.argmax(dim=-1)[0].detach().cpu().numpy()))
89
+
90
+ # Output transcript with LM
91
+ print(processor.decode(output.logits.cpu().detach().numpy()[0], beam_width=100).text)
92
+ ```
93
+
94
+ ## Acknowledgment
95
+
96
+ - We would like to thank the Google TPU Research Cloud (TRC) program and Soonson Kwon (Google ML Ecosystem programs Lead) for their support.
97
+ - Special thanks to my colleagues at [VietAI](https://vietai.org/) and [VAIS](https://vais.vn/) for their advice.
98
+
99
+ ## Contact
100
+
101
102
+
103
+ [![Follow](https://img.shields.io/twitter/follow/nguyenvulebinh?style=social)](https://twitter.com/intent/follow?screen_name=nguyenvulebinh)
104
+
105
+
106
+
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 97,
3
+ "<s>": 96
4
+ }
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e8", "\u00e9", "\u00ea", "\u00ec", "\u00ed", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f9", "\u00fa", "\u00fd", "\u0103", "\u0111", "\u0129", "\u0169", "\u01a1", "\u01b0", "\u1ea1", "\u1ea3", "\u1ea5", "\u1ea7", "\u1ea9", "\u1eab", "\u1ead", "\u1eaf", "\u1eb1", "\u1eb3", "\u1eb5", "\u1eb7", "\u1eb9", "\u1ebb", "\u1ebd", "\u1ebf", "\u1ec1", "\u1ec3", "\u1ec5", "\u1ec7", "\u1ec9", "\u1ecb", "\u1ecd", "\u1ecf", "\u1ed1", "\u1ed3", "\u1ed5", "\u1ed7", "\u1ed9", "\u1edb", "\u1edd", "\u1edf", "\u1ee1", "\u1ee3", "\u1ee5", "\u1ee7", "\u1ee9", "\u1eeb", "\u1eed", "\u1eef", "\u1ef1", "\u1ef3", "\u1ef5", "\u1ef7", "\u1ef9", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./model-bin/wav2vec_pretrained/large/",
3
+ "activation_dropout": 0.0,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForPreTraining"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 768,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "gradient_checkpointing": false,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 1024,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.1,
63
+ "mask_channel_length": 10,
64
+ "mask_channel_min_space": 1,
65
+ "mask_channel_other": 0.0,
66
+ "mask_channel_prob": 0.0,
67
+ "mask_channel_selection": "static",
68
+ "mask_feature_length": 10,
69
+ "mask_feature_min_masks": 0,
70
+ "mask_feature_prob": 0.0,
71
+ "mask_time_length": 10,
72
+ "mask_time_min_masks": 2,
73
+ "mask_time_min_space": 1,
74
+ "mask_time_other": 0.0,
75
+ "mask_time_prob": 0.075,
76
+ "mask_time_selection": "static",
77
+ "model_type": "wav2vec2",
78
+ "num_adapter_layers": 3,
79
+ "num_attention_heads": 16,
80
+ "num_codevector_groups": 2,
81
+ "num_codevectors_per_group": 320,
82
+ "num_conv_pos_embedding_groups": 16,
83
+ "num_conv_pos_embeddings": 128,
84
+ "num_feat_extract_layers": 7,
85
+ "num_hidden_layers": 24,
86
+ "num_negatives": 100,
87
+ "output_hidden_size": 1024,
88
+ "pad_token_id": 0,
89
+ "proj_codevector_dim": 768,
90
+ "tdnn_dilation": [
91
+ 1,
92
+ 2,
93
+ 3,
94
+ 1,
95
+ 1
96
+ ],
97
+ "tdnn_dim": [
98
+ 512,
99
+ 512,
100
+ 512,
101
+ 512,
102
+ 1500
103
+ ],
104
+ "tdnn_kernel": [
105
+ 5,
106
+ 3,
107
+ 3,
108
+ 1,
109
+ 1
110
+ ],
111
+ "torch_dtype": "float32",
112
+ "transformers_version": "4.23.1",
113
+ "use_weighted_layer_sum": false,
114
+ "vocab_size": 96,
115
+ "xvector_output_dim": 512
116
+ }
language_model/5gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd50eff6ccdeedf6f5672c824cd9c8ca3775a16d7e04962ae464fc56db656c2a
3
+ size 2906312
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /21
2
+ /47
3
+ /giảm
4
+ /hè
5
+ 0
6
+ 1
7
+ 10
8
+ 11
9
+ 12
10
+ 13
11
+ 14
12
+ 15
13
+ 16
14
+ 17
15
+ 18
16
+ 19
17
+ 2
18
+ 20
19
+ 21
20
+ 22
21
+ 23
22
+ 24
23
+ 25
24
+ 26
25
+ 27
26
+ 28
27
+ 29
28
+ 3
29
+ 30
30
+ 31
31
+ 32
32
+ 33
33
+ 34
34
+ 35
35
+ 36
36
+ 37
37
+ 38
38
+ 39
39
+ 4
40
+ 40
41
+ 41
42
+ 42
43
+ 43
44
+ 44
45
+ 45
46
+ 46
47
+ 47
48
+ 48
49
+ 49
50
+ 5
51
+ 50
52
+ 51
53
+ 52
54
+ 53
55
+ 54
56
+ 55
57
+ 56
58
+ 57
59
+ 58
60
+ 59
61
+ 6
62
+ 61
63
+ 62
64
+ 64
65
+ 65
66
+ 66
67
+ 7
68
+ 71
69
+ 72
70
+ 73
71
+ 74
72
+ 76
73
+ 77
74
+ 78
75
+ 8
76
+ 82
77
+ 83
78
+ 84
79
+ 85
80
+ 87
81
+ 88
82
+ 89
83
+ 9
84
+ 92
85
+ 95
86
+ 96
87
+ 97
88
+ 99
89
+ </s>
90
+ <s>
91
+ a
92
+ ai
93
+ alo
94
+ anh
95
+ ayo
96
+ ban
97
+ bao
98
+ biết
99
+ buồn
100
+ buổi
101
+
102
+ bài
103
+ bàn
104
+ bách
105
+ bánh
106
+ báo
107
+ bát
108
+ bây
109
+
110
+
111
+ béng
112
+ bên
113
+ bình
114
+
115
+ bóng
116
+ bạn
117
+ bảo
118
+ bẩn
119
+ bận
120
+ bật
121
+ bắn
122
+ bắt
123
+ bếp
124
+ bể
125
+ bị
126
+ bọn
127
+ bỏ
128
+ bố
129
+ bồn
130
+ bớt
131
+ bởi
132
+ bụi
133
+ bữa
134
+ c
135
+ ca
136
+ cafe
137
+ camera
138
+ chiếc
139
+ chiếu
140
+ chiều
141
+ cho
142
+ choạng
143
+ chung
144
+ chuyện
145
+ chuẩn
146
+ chà
147
+ chào
148
+ chán
149
+ cháu
150
+ cháy
151
+ chín
152
+ chính
153
+ chói
154
+ chùm
155
+ chú
156
+ chúng
157
+ chút
158
+ chơi
159
+ chưa
160
+ chạy
161
+ chả
162
+ chảy
163
+ chậm
164
+ chập
165
+ chậu
166
+ chắc
167
+ chẳng
168
+ chế
169
+ chết
170
+ chỉ
171
+ chị
172
+ chịu
173
+ chồng
174
+ chỗ
175
+ chờ
176
+ chờn
177
+ chủ
178
+ chứ
179
+ chức
180
+ chứng
181
+ coi
182
+ compact
183
+ con
184
+ cu
185
+ cuối
186
+ cuốn
187
+ cuộc
188
+ cài
189
+
190
+ các
191
+ cái
192
+ cánh
193
+ cáo
194
+ cây
195
+ còn
196
+
197
+
198
+ công
199
+ cúng
200
+ cũng
201
+
202
+ cơm
203
+ cường
204
+ cạnh
205
+ cả
206
+ cảm
207
+ cảnh
208
+ cất
209
+ cần
210
+ cầu
211
+ cẩn
212
+ cậu
213
+ cổng
214
+ cỡ
215
+ của
216
+ cứ
217
+ cửa
218
+ da
219
+ do
220
+ duy
221
+ dõi
222
+ dùm
223
+ dùng
224
+ dưng
225
+ dướ
226
+ dưới
227
+ dược
228
+ dạo
229
+ dần
230
+ dậy
231
+ dẹp
232
+ dến
233
+ dễ
234
+ dọn
235
+ dở
236
+ dụng
237
+ e
238
+ em
239
+ eo
240
+ fax
241
+ game
242
+ garage
243
+ ghê
244
+ gia
245
+ giai
246
+ gian
247
+ giá
248
+ giãn
249
+ gió
250
+ giùm
251
+ giúp
252
+ giảm
253
+ giản
254
+ giặt
255
+ giờ
256
+ giời
257
+ giữ
258
+ gym
259
+
260
+ gác
261
+ gái
262
+
263
+ góc
264
+ gúp
265
+ gấp
266
+ gần
267
+ gặp
268
+ haizz
269
+ hanh
270
+ hay
271
+ hey
272
+ hiên
273
+ hình
274
+ hiểu
275
+ hiện
276
+ hoa
277
+ hoang
278
+ hom
279
+ hoà
280
+ hoàng
281
+ hoạt
282
+ huy
283
+ huỳnh
284
+ huỷ
285
+
286
+ hành
287
+ hát
288
+ hân
289
+ hãng
290
+ hãy
291
+
292
+ héo
293
+ hình
294
+ hòa
295
+ hóng
296
+ hôi
297
+ hôm
298
+ hôn
299
+ hộ
300
+
301
+ hút
302
+ hơi
303
+ hơn
304
+
305
+ hầm
306
+ hắt
307
+ hẳn
308
+ hẵng
309
+ hẹn
310
+ hết
311
+ họ
312
+ học
313
+ họp
314
+ hỏng
315
+ hồ
316
+ hồng
317
+ hỗ
318
+ hộ
319
+ hủy
320
+ in
321
+ karaoke
322
+ kem
323
+ khi
324
+ khoàng
325
+ khoảng
326
+ khuya
327
+ khá
328
+ khách
329
+ khét
330
+ khí
331
+ khó
332
+ khói
333
+ khô
334
+ khôi
335
+ không
336
+ khả
337
+ khắp
338
+ khỏi
339
+ kia
340
+ kinh
341
+ kiểm
342
+ kiệm
343
+ ko
344
+ kéo
345
+ kêu
346
+ kìa
347
+
348
+ kẹt
349
+ kết
350
+ kịch
351
+ kịp
352
+ lang
353
+ laptop
354
+ lau
355
+ led
356
+ len
357
+ linh
358
+ liệu
359
+ loa
360
+ long
361
+ luôn
362
+
363
+ làm
364
+ láng
365
+ lát
366
+ lâu
367
+ lãng
368
+ lên
369
+ lênh
370
+
371
+
372
+
373
+ lùa
374
+ lùi
375
+ lúc
376
+
377
+ lưng
378
+ lượng
379
+ lạ
380
+ lại
381
+ lạnh
382
+ lấy
383
+ lập
384
+ lắm
385
+ lặng
386
+ lịch
387
+ lỗi
388
+ lức
389
+ mau
390
+ minh
391
+ miếng
392
+ mua
393
+ muốn
394
+ muỗi
395
+ muộn
396
+ my
397
+
398
+ màn
399
+ mành
400
+ màu
401
+ mày
402
+ mát
403
+ máy
404
+ mãi
405
+ mình
406
+ mính
407
+ mùa
408
+ mùi
409
+ mưa
410
+ mạn
411
+ mạnh
412
+ mất
413
+ mấy
414
+ mắt
415
+ mặt
416
+ mẹ
417
+ mến
418
+ mệt
419
+ mọi
420
+ mỏi
421
+ một
422
+ mới
423
+ mờ
424
+ mời
425
+ mở
426
+ mức
427
+ nay
428
+ ngay
429
+ nghe
430
+ nghi
431
+ nghĩ
432
+ nghơi
433
+ nghỉ
434
+ ngon
435
+ ngoài
436
+ ngày
437
+ ngã
438
+ ngôi
439
+ ngơi
440
+ người
441
+ ngại
442
+ ngạt
443
+ ngập
444
+ ngắt
445
+ ngồi
446
+ ngột
447
+ ngủ
448
+ nha
449
+ nhanh
450
+ nhà
451
+ nhe
452
+ nhi
453
+ nhiên
454
+ nhiều
455
+ nhiệt
456
+ nho
457
+ nhà
458
+ nhá
459
+ nháp
460
+ nhávào
461
+ nháy
462
+ nhân
463
+ nhé
464
+ nhìn
465
+ nhòe
466
+ nhó
467
+ như
468
+ nhưng
469
+ nhạc
470
+ nhảy
471
+ nhấp
472
+ nhập
473
+ nhằng
474
+ nhể
475
+ nhỉ
476
+ nhỏ
477
+ nhớ
478
+ nhớp
479
+ nhờ
480
+ nhở
481
+ nhỡ
482
+ nào
483
+ này
484
+ náy
485
+ nãy
486
+ nên
487
+
488
+ nóng
489
+ nôi
490
+ nơi
491
+ nước
492
+ nướng
493
+ nấc
494
+ nấu
495
+ nắng
496
+ nặc
497
+ nếu
498
+ nối
499
+ nồm
500
+ nồng
501
+ nổ
502
+ nổi
503
+ nội
504
+ nửa
505
+ nữ
506
+ nữa
507
+ nực
508
+ oi
509
+ ok
510
+ okay
511
+ oke
512
+ okei
513
+ om
514
+ online
515
+ pha
516
+ phim
517
+ phiền
518
+ phát
519
+ phía
520
+ phòng
521
+ phóng
522
+ phút
523
+ phải
524
+ phỏng
525
+ pin
526
+ qua
527
+ quang
528
+ quay
529
+ que
530
+ quyết
531
+ quá
532
+ quân
533
+ quên
534
+ quạt
535
+ quả
536
+ quản
537
+ quần
538
+ ra
539
+ radio
540
+ riêng
541
+
542
+ rèm
543
+ rét
544
+ ròi
545
+
546
+ rùi
547
+ rượu
548
+ rất
549
+ rồi
550
+ rời
551
+ rủ
552
+ rửa
553
+ sa
554
+ sai
555
+ sang
556
+ sao
557
+ sau
558
+ sen
559
+ set
560
+ sinh
561
+ siri
562
+ smarthome
563
+ su
564
+ suất
565
+ suốt
566
+ sàn
567
+ sách
568
+ sáng
569
+ sân
570
+ sóng
571
+ sôi
572
+
573
+ sưởi
574
+ sạch
575
+ sảnh
576
+ sắp
577
+ sẵn
578
+ sẽ
579
+ số
580
+ sổ
581
+ sớm
582
+ sợ
583
+ sợi
584
+ sử
585
+ sự
586
+ ta
587
+ tai
588
+ tao
589
+ tau
590
+ tay
591
+ thang
592
+ thank
593
+ thay
594
+ theo
595
+ thi
596
+ thiết
597
+ thiếu
598
+ thiệt
599
+ thoáng
600
+ thu
601
+ thui
602
+ thành
603
+ thân
604
+ thèm
605
+ thêm
606
+ thì
607
+ thôi
608
+ thông
609
+ thúc
610
+ thăng
611
+ thư
612
+ thượng
613
+ thả
614
+ thấp
615
+ thấy
616
+ thật
617
+ thắng
618
+ thằng
619
+ thế
620
+ thể
621
+ thịt
622
+ thống
623
+ thờ
624
+ thời
625
+ thức
626
+ thử
627
+ ti
628
+ tin
629
+ tiếng
630
+ tiếp
631
+ tiết
632
+ tiền
633
+ tiệc
634
+ to
635
+ toang
636
+ toàn
637
+ toé
638
+ tra
639
+ tranh
640
+ treo
641
+ trogn
642
+ trong
643
+ trung
644
+ truyền
645
+ tràn
646
+ trái
647
+ tráng
648
+ trên
649
+ trì
650
+ tròn
651
+ tròng
652
+ trông
653
+ trưa
654
+ trưng
655
+ trước
656
+ trường
657
+ trạng
658
+ trầm
659
+ trần
660
+ trặc
661
+ trặng
662
+ trẻ
663
+ trị
664
+ trọ
665
+ trộm
666
+ trời
667
+ trợ
668
+ trụ
669
+ trục
670
+ trực
671
+ tung
672
+ tuyệt
673
+ tuýp
674
+ tài
675
+ tâm
676
+ tên
677
+ tìm
678
+ tình
679
+
680
+ tính
681
+ tôi
682
+ tùng
683
+
684
+ tăng
685
+ tĩnh
686
+
687
+ tưới
688
+ tường
689
+ tại
690
+ tất
691
+ tầm
692
+ tầng
693
+ tập
694
+ tắm
695
+ tắt
696
+ tẹo
697
+ tỉnh
698
+ tốc
699
+ tối
700
+ tốn
701
+ tốt
702
+ tổ
703
+ tớ
704
+ tới
705
+ tủ
706
+ tức
707
+ từ
708
+ tự
709
+ ui
710
+ up
711
+ uây
712
+ uống
713
+ van
714
+ vi
715
+ việc
716
+ viện
717
+ vui
718
+ vy
719
+
720
+ vài
721
+ vào
722
+ ván
723
+ vãi
724
+
725
+ vòi
726
+ vòng
727
+
728
+ với
729
+ vườn
730
+ vấn
731
+ vẫn
732
+ vậy
733
+ vắng
734
+ vẻ
735
+ về
736
+ vệ
737
+ vội
738
+ với
739
+ vời
740
+ vợ
741
+ vừa
742
+ wc
743
+ xa
744
+ xe
745
+ xem
746
+ xin
747
+ xong
748
+ xuân
749
+ xuống
750
+ xài
751
+ xách
752
+ xép
753
+ xíu
754
+ xông
755
+ xả
756
+ xảy
757
+ xịt
758
+ you
759
+ yên
760
+ yêu
761
+ yếu
762
+ zai
763
+ à
764
+ ào
765
+ á
766
+ ánh
767
+ áo
768
+ áp
769
+ âm
770
+ ây
771
+ ê
772
+ í
773
+ ít
774
+ ô
775
+ ôi
776
+ ông
777
+ ý
778
+ ăn
779
+ đang
780
+ đau
781
+ đi
782
+ điên
783
+ điều
784
+ điện
785
+ đâu
786
+ đây
787
+ đã
788
+ đèn
789
+ đê
790
+ đêm
791
+ đình
792
+ đó
793
+ đón
794
+ đóng
795
+ đông
796
+ đúng
797
+ đũa
798
+ đơn
799
+ đường
800
+ được
801
+ đại
802
+ đạt
803
+ đất
804
+ đấy
805
+ đầu
806
+ đầy
807
+ đắng
808
+ đặt
809
+ đẹp
810
+ đến
811
+ đề
812
+ đền
813
+ để
814
+ định
815
+ đọc
816
+ đống
817
+ đốt
818
+ đồ
819
+ đồng
820
+ độ
821
+ động
822
+ đỡ
823
+ đợi
824
+ đủ
825
+ đứa
826
+ đứng
827
+ đừng
828
+ ơ
829
+ ơi
830
+ ơigiảm
831
+ ơitắt
832
+ ơn
833
+ ưi
834
+ ướt
835
+
836
+ ảnh
837
+ ảo
838
+ ấm
839
+ ấy
840
+ ẩm
841
+ ế
842
+
843
+ ốp
844
+ ồn
845
+ ổn
846
+
847
+ ới
848
+
849
+ ờm
850
+
851
+ ủi
852
+ ừm
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7936feb834fdea55f0968b2235bd9fd7367a0f954e412a6a8205be803943a89
3
+ size 1269701863
special_tokens_map.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<s>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</s>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<s>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "</s>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<s>",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "</s>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<s>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "</s>",
68
+ "lstrip": false,
69
+ "normalized": true,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<s>",
75
+ "lstrip": false,
76
+ "normalized": true,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "</s>",
82
+ "lstrip": false,
83
+ "normalized": true,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<s>",
89
+ "lstrip": false,
90
+ "normalized": true,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "</s>",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<s>",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "</s>",
110
+ "lstrip": false,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<s>",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "</s>",
124
+ "lstrip": false,
125
+ "normalized": true,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<s>",
131
+ "lstrip": false,
132
+ "normalized": true,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "</s>",
138
+ "lstrip": false,
139
+ "normalized": true,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ }
143
+ ],
144
+ "bos_token": "<s>",
145
+ "eos_token": "</s>",
146
+ "pad_token": "<pad>",
147
+ "unk_token": "<unk>"
148
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "<pad>",
8
+ "processor_class": "Wav2Vec2ProcessorWithLM",
9
+ "replace_word_delimiter_char": " ",
10
+ "target_lang": null,
11
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
12
+ "tokenizer_file": null,
13
+ "unk_token": "<unk>",
14
+ "word_delimiter_token": "|"
15
+ }
vocab.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 95,
3
+ "<unk>": 94,
4
+ "a": 1,
5
+ "b": 2,
6
+ "c": 3,
7
+ "d": 4,
8
+ "e": 5,
9
+ "f": 6,
10
+ "g": 7,
11
+ "h": 8,
12
+ "i": 9,
13
+ "j": 10,
14
+ "k": 11,
15
+ "l": 12,
16
+ "m": 13,
17
+ "n": 14,
18
+ "o": 15,
19
+ "p": 16,
20
+ "q": 17,
21
+ "r": 18,
22
+ "s": 19,
23
+ "t": 20,
24
+ "u": 21,
25
+ "v": 22,
26
+ "w": 23,
27
+ "x": 24,
28
+ "y": 25,
29
+ "z": 26,
30
+ "|": 0,
31
+ "à": 27,
32
+ "á": 28,
33
+ "â": 29,
34
+ "ã": 30,
35
+ "è": 31,
36
+ "é": 32,
37
+ "ê": 33,
38
+ "ì": 34,
39
+ "í": 35,
40
+ "ò": 36,
41
+ "ó": 37,
42
+ "ô": 38,
43
+ "õ": 39,
44
+ "ù": 40,
45
+ "ú": 41,
46
+ "ý": 42,
47
+ "ă": 43,
48
+ "đ": 44,
49
+ "ĩ": 45,
50
+ "ũ": 46,
51
+ "ơ": 47,
52
+ "ư": 48,
53
+ "ạ": 49,
54
+ "ả": 50,
55
+ "ấ": 51,
56
+ "ầ": 52,
57
+ "ẩ": 53,
58
+ "ẫ": 54,
59
+ "ậ": 55,
60
+ "ắ": 56,
61
+ "ằ": 57,
62
+ "ẳ": 58,
63
+ "ẵ": 59,
64
+ "ặ": 60,
65
+ "ẹ": 61,
66
+ "ẻ": 62,
67
+ "ẽ": 63,
68
+ "ế": 64,
69
+ "ề": 65,
70
+ "ể": 66,
71
+ "ễ": 67,
72
+ "ệ": 68,
73
+ "ỉ": 69,
74
+ "ị": 70,
75
+ "ọ": 71,
76
+ "ỏ": 72,
77
+ "ố": 73,
78
+ "ồ": 74,
79
+ "ổ": 75,
80
+ "ỗ": 76,
81
+ "ộ": 77,
82
+ "ớ": 78,
83
+ "ờ": 79,
84
+ "ở": 80,
85
+ "ỡ": 81,
86
+ "ợ": 82,
87
+ "ụ": 83,
88
+ "ủ": 84,
89
+ "ứ": 85,
90
+ "ừ": 86,
91
+ "ử": 87,
92
+ "ữ": 88,
93
+ "ự": 89,
94
+ "ỳ": 90,
95
+ "ỵ": 91,
96
+ "ỷ": 92,
97
+ "ỹ": 93
98
+ }