English
pszemraj committed on
Commit
fb40c4a
1 Parent(s): c40464d
Files changed (3) hide show
  1. README.md +2 -2
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +28 -28
README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
 
 
2
  license: apache-2.0
3
  datasets:
4
  - BEE-spoke-data/bees-internal
5
- language:
6
- - en
7
  ---
8
 
9
  # BeeTokenizer
 
1
  ---
2
+ language:
3
+ - en
4
  license: apache-2.0
5
  datasets:
6
  - BEE-spoke-data/bees-internal
 
 
7
  ---
8
 
9
  # BeeTokenizer
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -64,7 +64,7 @@
64
  "single_word": false,
65
  "special": true
66
  },
67
- "32100": {
68
  "content": " ",
69
  "lstrip": false,
70
  "normalized": true,
@@ -72,7 +72,7 @@
72
  "single_word": false,
73
  "special": false
74
  },
75
- "32101": {
76
  "content": " ",
77
  "lstrip": false,
78
  "normalized": true,
@@ -80,7 +80,7 @@
80
  "single_word": false,
81
  "special": false
82
  },
83
- "32102": {
84
  "content": " ",
85
  "lstrip": false,
86
  "normalized": true,
@@ -88,7 +88,7 @@
88
  "single_word": false,
89
  "special": false
90
  },
91
- "32103": {
92
  "content": " ",
93
  "lstrip": false,
94
  "normalized": true,
@@ -96,7 +96,7 @@
96
  "single_word": false,
97
  "special": false
98
  },
99
- "32104": {
100
  "content": " ",
101
  "lstrip": false,
102
  "normalized": true,
@@ -104,7 +104,7 @@
104
  "single_word": false,
105
  "special": false
106
  },
107
- "32105": {
108
  "content": " ",
109
  "lstrip": false,
110
  "normalized": true,
@@ -112,7 +112,7 @@
112
  "single_word": false,
113
  "special": false
114
  },
115
- "32106": {
116
  "content": " ",
117
  "lstrip": false,
118
  "normalized": true,
@@ -120,7 +120,7 @@
120
  "single_word": false,
121
  "special": false
122
  },
123
- "32107": {
124
  "content": " ",
125
  "lstrip": false,
126
  "normalized": true,
@@ -128,7 +128,7 @@
128
  "single_word": false,
129
  "special": false
130
  },
131
- "32108": {
132
  "content": " ",
133
  "lstrip": false,
134
  "normalized": true,
@@ -136,7 +136,7 @@
136
  "single_word": false,
137
  "special": false
138
  },
139
- "32109": {
140
  "content": " ",
141
  "lstrip": false,
142
  "normalized": true,
@@ -144,7 +144,7 @@
144
  "single_word": false,
145
  "special": false
146
  },
147
- "32110": {
148
  "content": " ",
149
  "lstrip": false,
150
  "normalized": true,
@@ -152,7 +152,7 @@
152
  "single_word": false,
153
  "special": false
154
  },
155
- "32111": {
156
  "content": " ",
157
  "lstrip": false,
158
  "normalized": true,
@@ -160,7 +160,7 @@
160
  "single_word": false,
161
  "special": false
162
  },
163
- "32112": {
164
  "content": " ",
165
  "lstrip": false,
166
  "normalized": true,
@@ -168,7 +168,7 @@
168
  "single_word": false,
169
  "special": false
170
  },
171
- "32113": {
172
  "content": " ",
173
  "lstrip": false,
174
  "normalized": true,
@@ -176,7 +176,7 @@
176
  "single_word": false,
177
  "special": false
178
  },
179
- "32114": {
180
  "content": " ",
181
  "lstrip": false,
182
  "normalized": true,
@@ -184,7 +184,7 @@
184
  "single_word": false,
185
  "special": false
186
  },
187
- "32115": {
188
  "content": " ",
189
  "lstrip": false,
190
  "normalized": true,
@@ -192,7 +192,7 @@
192
  "single_word": false,
193
  "special": false
194
  },
195
- "32116": {
196
  "content": " ",
197
  "lstrip": false,
198
  "normalized": true,
@@ -200,7 +200,7 @@
200
  "single_word": false,
201
  "special": false
202
  },
203
- "32117": {
204
  "content": " ",
205
  "lstrip": false,
206
  "normalized": true,
@@ -208,7 +208,7 @@
208
  "single_word": false,
209
  "special": false
210
  },
211
- "32118": {
212
  "content": " ",
213
  "lstrip": false,
214
  "normalized": true,
@@ -216,7 +216,7 @@
216
  "single_word": false,
217
  "special": false
218
  },
219
- "32119": {
220
  "content": " ",
221
  "lstrip": false,
222
  "normalized": true,
@@ -224,7 +224,7 @@
224
  "single_word": false,
225
  "special": false
226
  },
227
- "32120": {
228
  "content": " ",
229
  "lstrip": false,
230
  "normalized": true,
@@ -232,7 +232,7 @@
232
  "single_word": false,
233
  "special": false
234
  },
235
- "32121": {
236
  "content": " ",
237
  "lstrip": false,
238
  "normalized": true,
@@ -240,7 +240,7 @@
240
  "single_word": false,
241
  "special": false
242
  },
243
- "32122": {
244
  "content": " ",
245
  "lstrip": false,
246
  "normalized": true,
@@ -248,7 +248,7 @@
248
  "single_word": false,
249
  "special": false
250
  },
251
- "32123": {
252
  "content": " ",
253
  "lstrip": false,
254
  "normalized": true,
@@ -256,7 +256,7 @@
256
  "single_word": false,
257
  "special": false
258
  },
259
- "32124": {
260
  "content": " ",
261
  "lstrip": false,
262
  "normalized": true,
@@ -264,7 +264,7 @@
264
  "single_word": false,
265
  "special": false
266
  },
267
- "32125": {
268
  "content": " ",
269
  "lstrip": false,
270
  "normalized": true,
@@ -272,7 +272,7 @@
272
  "single_word": false,
273
  "special": false
274
  },
275
- "32126": {
276
  "content": " ",
277
  "lstrip": false,
278
  "normalized": true,
@@ -280,7 +280,7 @@
280
  "single_word": false,
281
  "special": false
282
  },
283
- "32127": {
284
  "content": " ",
285
  "lstrip": false,
286
  "normalized": true,
 
64
  "single_word": false,
65
  "special": true
66
  },
67
+ "32072": {
68
  "content": " ",
69
  "lstrip": false,
70
  "normalized": true,
 
72
  "single_word": false,
73
  "special": false
74
  },
75
+ "32073": {
76
  "content": " ",
77
  "lstrip": false,
78
  "normalized": true,
 
80
  "single_word": false,
81
  "special": false
82
  },
83
+ "32074": {
84
  "content": " ",
85
  "lstrip": false,
86
  "normalized": true,
 
88
  "single_word": false,
89
  "special": false
90
  },
91
+ "32075": {
92
  "content": " ",
93
  "lstrip": false,
94
  "normalized": true,
 
96
  "single_word": false,
97
  "special": false
98
  },
99
+ "32076": {
100
  "content": " ",
101
  "lstrip": false,
102
  "normalized": true,
 
104
  "single_word": false,
105
  "special": false
106
  },
107
+ "32077": {
108
  "content": " ",
109
  "lstrip": false,
110
  "normalized": true,
 
112
  "single_word": false,
113
  "special": false
114
  },
115
+ "32078": {
116
  "content": " ",
117
  "lstrip": false,
118
  "normalized": true,
 
120
  "single_word": false,
121
  "special": false
122
  },
123
+ "32079": {
124
  "content": " ",
125
  "lstrip": false,
126
  "normalized": true,
 
128
  "single_word": false,
129
  "special": false
130
  },
131
+ "32080": {
132
  "content": " ",
133
  "lstrip": false,
134
  "normalized": true,
 
136
  "single_word": false,
137
  "special": false
138
  },
139
+ "32081": {
140
  "content": " ",
141
  "lstrip": false,
142
  "normalized": true,
 
144
  "single_word": false,
145
  "special": false
146
  },
147
+ "32082": {
148
  "content": " ",
149
  "lstrip": false,
150
  "normalized": true,
 
152
  "single_word": false,
153
  "special": false
154
  },
155
+ "32083": {
156
  "content": " ",
157
  "lstrip": false,
158
  "normalized": true,
 
160
  "single_word": false,
161
  "special": false
162
  },
163
+ "32084": {
164
  "content": " ",
165
  "lstrip": false,
166
  "normalized": true,
 
168
  "single_word": false,
169
  "special": false
170
  },
171
+ "32085": {
172
  "content": " ",
173
  "lstrip": false,
174
  "normalized": true,
 
176
  "single_word": false,
177
  "special": false
178
  },
179
+ "32086": {
180
  "content": " ",
181
  "lstrip": false,
182
  "normalized": true,
 
184
  "single_word": false,
185
  "special": false
186
  },
187
+ "32087": {
188
  "content": " ",
189
  "lstrip": false,
190
  "normalized": true,
 
192
  "single_word": false,
193
  "special": false
194
  },
195
+ "32088": {
196
  "content": " ",
197
  "lstrip": false,
198
  "normalized": true,
 
200
  "single_word": false,
201
  "special": false
202
  },
203
+ "32089": {
204
  "content": " ",
205
  "lstrip": false,
206
  "normalized": true,
 
208
  "single_word": false,
209
  "special": false
210
  },
211
+ "32090": {
212
  "content": " ",
213
  "lstrip": false,
214
  "normalized": true,
 
216
  "single_word": false,
217
  "special": false
218
  },
219
+ "32091": {
220
  "content": " ",
221
  "lstrip": false,
222
  "normalized": true,
 
224
  "single_word": false,
225
  "special": false
226
  },
227
+ "32092": {
228
  "content": " ",
229
  "lstrip": false,
230
  "normalized": true,
 
232
  "single_word": false,
233
  "special": false
234
  },
235
+ "32093": {
236
  "content": " ",
237
  "lstrip": false,
238
  "normalized": true,
 
240
  "single_word": false,
241
  "special": false
242
  },
243
+ "32094": {
244
  "content": " ",
245
  "lstrip": false,
246
  "normalized": true,
 
248
  "single_word": false,
249
  "special": false
250
  },
251
+ "32095": {
252
  "content": " ",
253
  "lstrip": false,
254
  "normalized": true,
 
256
  "single_word": false,
257
  "special": false
258
  },
259
+ "32096": {
260
  "content": " ",
261
  "lstrip": false,
262
  "normalized": true,
 
264
  "single_word": false,
265
  "special": false
266
  },
267
+ "32097": {
268
  "content": " ",
269
  "lstrip": false,
270
  "normalized": true,
 
272
  "single_word": false,
273
  "special": false
274
  },
275
+ "32098": {
276
  "content": " ",
277
  "lstrip": false,
278
  "normalized": true,
 
280
  "single_word": false,
281
  "special": false
282
  },
283
+ "32099": {
284
  "content": " ",
285
  "lstrip": false,
286
  "normalized": true,