JovialValley committed on
Commit
e834d47
·
1 Parent(s): 2d32b73

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +6 -0
  2. tokenizer_config.json +11 -0
  3. vocab.json +149 -0
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "[PAD]",
7
+ "replace_word_delimiter_char": " ",
8
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
9
+ "unk_token": "[UNK]",
10
+ "word_delimiter_token": "|"
11
+ }
vocab.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "@_c": 69,
3
+ "@_j": 141,
4
+ "@_n": 30,
5
+ "@_t": 89,
6
+ "[PAD]": 146,
7
+ "[UNK]": 145,
8
+ "a_c": 20,
9
+ "a_j": 3,
10
+ "a_n": 67,
11
+ "a_t": 16,
12
+ "b_c": 51,
13
+ "b_j": 134,
14
+ "b_n": 140,
15
+ "b_t": 26,
16
+ "d_c": 58,
17
+ "d_j": 45,
18
+ "d_n": 64,
19
+ "d_t": 9,
20
+ "e_c": 63,
21
+ "e_j": 25,
22
+ "e_n": 13,
23
+ "e_t": 41,
24
+ "f_c": 39,
25
+ "f_j": 128,
26
+ "f_n": 29,
27
+ "f_t": 109,
28
+ "i_c": 112,
29
+ "i_j": 119,
30
+ "i_n": 93,
31
+ "i_t": 139,
32
+ "j_c": 106,
33
+ "j_j": 87,
34
+ "j_n": 122,
35
+ "j_t": 1,
36
+ "k_c": 107,
37
+ "k_j": 15,
38
+ "k_n": 99,
39
+ "k_t": 101,
40
+ "l_c": 129,
41
+ "l_j": 12,
42
+ "l_n": 65,
43
+ "l_t": 60,
44
+ "m_c": 143,
45
+ "m_j": 84,
46
+ "m_n": 102,
47
+ "m_t": 73,
48
+ "n_c": 27,
49
+ "n_j": 22,
50
+ "n_n": 32,
51
+ "n_t": 6,
52
+ "o_c": 108,
53
+ "o_j": 121,
54
+ "o_n": 110,
55
+ "o_t": 33,
56
+ "p_c": 54,
57
+ "p_j": 132,
58
+ "p_n": 117,
59
+ "p_t": 127,
60
+ "s_c": 85,
61
+ "s_j": 74,
62
+ "s_n": 72,
63
+ "s_t": 28,
64
+ "t_c": 42,
65
+ "t_j": 142,
66
+ "t_n": 52,
67
+ "t_t": 62,
68
+ "u_c": 68,
69
+ "u_j": 135,
70
+ "u_n": 83,
71
+ "u_t": 66,
72
+ "v_c": 35,
73
+ "v_j": 86,
74
+ "v_n": 90,
75
+ "v_t": 81,
76
+ "w_c": 136,
77
+ "w_j": 40,
78
+ "w_n": 144,
79
+ "w_t": 38,
80
+ "y_c": 48,
81
+ "y_j": 36,
82
+ "y_n": 0,
83
+ "y_t": 4,
84
+ "z_c": 53,
85
+ "z_j": 114,
86
+ "z_n": 131,
87
+ "z_t": 126,
88
+ "|": 138,
89
+ "ø_c": 56,
90
+ "ø_j": 113,
91
+ "ø_n": 21,
92
+ "ø_t": 8,
93
+ "œ_c": 61,
94
+ "œ_j": 137,
95
+ "œ_n": 77,
96
+ "œ_t": 124,
97
+ "œ̃_c": 97,
98
+ "œ̃_j": 59,
99
+ "œ̃_n": 130,
100
+ "œ̃_t": 10,
101
+ "ɑ_c": 82,
102
+ "ɑ_j": 18,
103
+ "ɑ_n": 103,
104
+ "ɑ_t": 91,
105
+ "ɑ̃_c": 123,
106
+ "ɑ̃_j": 120,
107
+ "ɑ̃_n": 44,
108
+ "ɑ̃_t": 43,
109
+ "ɔ_c": 105,
110
+ "ɔ_j": 50,
111
+ "ɔ_n": 76,
112
+ "ɔ_t": 31,
113
+ "ɔ̃_c": 14,
114
+ "ɔ̃_j": 88,
115
+ "ɔ̃_n": 19,
116
+ "ɔ̃_t": 5,
117
+ "ɛ_c": 125,
118
+ "ɛ_j": 7,
119
+ "ɛ_n": 95,
120
+ "ɛ_t": 47,
121
+ "ɛ̃_c": 96,
122
+ "ɛ̃_j": 111,
123
+ "ɛ̃_n": 24,
124
+ "ɛ̃_t": 116,
125
+ "ɜ_c": 2,
126
+ "ɜ_j": 100,
127
+ "ɜ_n": 92,
128
+ "ɜ_t": 133,
129
+ "ɡ_c": 70,
130
+ "ɡ_j": 49,
131
+ "ɡ_n": 37,
132
+ "ɡ_t": 75,
133
+ "ʁ_c": 71,
134
+ "ʁ_j": 17,
135
+ "ʁ_n": 55,
136
+ "ʁ_t": 57,
137
+ "ʃ_c": 78,
138
+ "ʃ_j": 98,
139
+ "ʃ_n": 94,
140
+ "ʃ_t": 23,
141
+ "ʊ_c": 79,
142
+ "ʊ_j": 11,
143
+ "ʊ_n": 34,
144
+ "ʊ_t": 115,
145
+ "ʒ_c": 118,
146
+ "ʒ_j": 80,
147
+ "ʒ_n": 46,
148
+ "ʒ_t": 104
149
+ }