Maxime62 committed on
Commit
892f2c4
·
verified ·
1 Parent(s): f5bc145

Upload tokenizer

Browse files
Files changed (2) hide show
  1. added_tokens.json +2 -48
  2. tokenizer_config.json +1 -401
added_tokens.json CHANGED
@@ -1,54 +1,8 @@
1
  {
2
- "[email protected]": 32049,
3
- "12 rue Norbert Ségard, 59800": 32039,
4
- "13 Rue de Toul, 59000": 32036,
5
- "2 rue Norbert Ségard 59014": 32035,
6
- "2 rue Norbert Ségard, 59800": 32038,
7
- "41 boulevard Vauban, 59800": 32037,
8
- "47 bis rue du Port": 32041,
9
- "47 boulevard Vauban, 59000": 32043,
10
- "55 Rue Saint-Jean-Baptiste de la Salle, 59800": 32040,
11
- "67 Boulevard Vauban": 32042,
12
  "<pad>": 32000,
13
- "Adimaker": 32023,
14
- "All Logement": 32033,
15
- "All-Solidarité": 32028,
16
- "Alumnis": 32024,
17
- "Aurion": 32045,
18
- "Bordeaux": 32009,
19
- "CGE": 32021,
20
- "CGSI": 32017,
21
- "CIR": 32013,
22
- "CNB": 32014,
23
- "CPG": 32016,
24
- "CPSU": 32031,
25
- "CSI": 32015,
26
- "Career Center": 32027,
27
- "Châteauroux": 32011,
28
- "ECTS": 32018,
29
- "ERASMUS+": 32025,
30
- "FFSU": 32032,
31
- "FHESCti": 32020,
32
- "Fésic": 32022,
33
  "HEI": 32003,
34
  "ISA": 32002,
35
  "ISEN": 32001,
36
- "ISIC": 32029,
37
- "JUNIA": 32005,
38
- "Junia-learning": 32046,
39
- "Lille": 32007,
40
- "MERMOZ": 32026,
41
- "SAFE": 32030,
42
- "Seekube0328384858": 32034,
43
- "VES": 32019,
44
- "Vauban": 32012,
45
- "[email protected]": 32050,
46
- "all-lacatho.fr0328040240": 32048,
47
- "bordeaux": 32010,
48
- "[email protected]": 32051,
49
- "hei": 32004,
50
- "https://aurion.junia.com/": 32044,
51
- "junia": 32006,
52
- "junia.jobteaser.com": 32047,
53
- "lille": 32008
54
  }
 
1
  {
 
 
 
 
 
 
 
 
 
 
2
  "<pad>": 32000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "HEI": 32003,
4
  "ISA": 32002,
5
  "ISEN": 32001,
6
+ "JUNIA": 32004,
7
+ "Lille": 32005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  }
tokenizer_config.json CHANGED
@@ -26,38 +26,6 @@
26
  "single_word": false,
27
  "special": true
28
  },
29
- "4462": {
30
- "content": "UE",
31
- "lstrip": false,
32
- "normalized": true,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "7674": {
38
- "content": "isen",
39
- "lstrip": false,
40
- "normalized": true,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "8069": {
46
- "content": "isa",
47
- "lstrip": false,
48
- "normalized": true,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "28283": {
54
- "content": "RU",
55
- "lstrip": false,
56
- "normalized": true,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
  "32000": {
62
  "content": "<pad>",
63
  "lstrip": false,
@@ -91,14 +59,6 @@
91
  "special": false
92
  },
93
  "32004": {
94
- "content": "hei",
95
- "lstrip": false,
96
- "normalized": true,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "32005": {
102
  "content": "JUNIA",
103
  "lstrip": false,
104
  "normalized": true,
@@ -106,373 +66,13 @@
106
  "single_word": false,
107
  "special": false
108
  },
109
- "32006": {
110
- "content": "junia",
111
- "lstrip": false,
112
- "normalized": true,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "32007": {
118
  "content": "Lille",
119
  "lstrip": false,
120
  "normalized": true,
121
  "rstrip": false,
122
  "single_word": false,
123
  "special": false
124
- },
125
- "32008": {
126
- "content": "lille",
127
- "lstrip": false,
128
- "normalized": true,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "32009": {
134
- "content": "Bordeaux",
135
- "lstrip": false,
136
- "normalized": true,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "32010": {
142
- "content": "bordeaux",
143
- "lstrip": false,
144
- "normalized": true,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "32011": {
150
- "content": "Châteauroux",
151
- "lstrip": false,
152
- "normalized": true,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "32012": {
158
- "content": "Vauban",
159
- "lstrip": false,
160
- "normalized": true,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "32013": {
166
- "content": "CIR",
167
- "lstrip": false,
168
- "normalized": true,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "32014": {
174
- "content": "CNB",
175
- "lstrip": false,
176
- "normalized": true,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "32015": {
182
- "content": "CSI",
183
- "lstrip": false,
184
- "normalized": true,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "32016": {
190
- "content": "CPG",
191
- "lstrip": false,
192
- "normalized": true,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "32017": {
198
- "content": "CGSI",
199
- "lstrip": false,
200
- "normalized": true,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "32018": {
206
- "content": "ECTS",
207
- "lstrip": false,
208
- "normalized": true,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- },
213
- "32019": {
214
- "content": "VES",
215
- "lstrip": false,
216
- "normalized": true,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": false
220
- },
221
- "32020": {
222
- "content": "FHESCti",
223
- "lstrip": false,
224
- "normalized": true,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": false
228
- },
229
- "32021": {
230
- "content": "CGE",
231
- "lstrip": false,
232
- "normalized": true,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": false
236
- },
237
- "32022": {
238
- "content": "Fésic",
239
- "lstrip": false,
240
- "normalized": true,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": false
244
- },
245
- "32023": {
246
- "content": "Adimaker",
247
- "lstrip": false,
248
- "normalized": true,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": false
252
- },
253
- "32024": {
254
- "content": "Alumnis",
255
- "lstrip": false,
256
- "normalized": true,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": false
260
- },
261
- "32025": {
262
- "content": "ERASMUS+",
263
- "lstrip": false,
264
- "normalized": true,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": false
268
- },
269
- "32026": {
270
- "content": "MERMOZ",
271
- "lstrip": false,
272
- "normalized": true,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": false
276
- },
277
- "32027": {
278
- "content": "Career Center",
279
- "lstrip": false,
280
- "normalized": true,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": false
284
- },
285
- "32028": {
286
- "content": "All-Solidarité",
287
- "lstrip": false,
288
- "normalized": true,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": false
292
- },
293
- "32029": {
294
- "content": "ISIC",
295
- "lstrip": false,
296
- "normalized": true,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": false
300
- },
301
- "32030": {
302
- "content": "SAFE",
303
- "lstrip": false,
304
- "normalized": true,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": false
308
- },
309
- "32031": {
310
- "content": "CPSU",
311
- "lstrip": false,
312
- "normalized": true,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": false
316
- },
317
- "32032": {
318
- "content": "FFSU",
319
- "lstrip": false,
320
- "normalized": true,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": false
324
- },
325
- "32033": {
326
- "content": "All Logement",
327
- "lstrip": false,
328
- "normalized": true,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": false
332
- },
333
- "32034": {
334
- "content": "Seekube0328384858",
335
- "lstrip": false,
336
- "normalized": true,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": false
340
- },
341
- "32035": {
342
- "content": "2 rue Norbert Ségard 59014",
343
- "lstrip": false,
344
- "normalized": true,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": false
348
- },
349
- "32036": {
350
- "content": "13 Rue de Toul, 59000",
351
- "lstrip": false,
352
- "normalized": true,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": false
356
- },
357
- "32037": {
358
- "content": "41 boulevard Vauban, 59800",
359
- "lstrip": false,
360
- "normalized": true,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": false
364
- },
365
- "32038": {
366
- "content": "2 rue Norbert Ségard, 59800",
367
- "lstrip": false,
368
- "normalized": true,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": false
372
- },
373
- "32039": {
374
- "content": "12 rue Norbert Ségard, 59800",
375
- "lstrip": false,
376
- "normalized": true,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": false
380
- },
381
- "32040": {
382
- "content": "55 Rue Saint-Jean-Baptiste de la Salle, 59800",
383
- "lstrip": false,
384
- "normalized": true,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": false
388
- },
389
- "32041": {
390
- "content": "47 bis rue du Port",
391
- "lstrip": false,
392
- "normalized": true,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": false
396
- },
397
- "32042": {
398
- "content": "67 Boulevard Vauban",
399
- "lstrip": false,
400
- "normalized": true,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": false
404
- },
405
- "32043": {
406
- "content": "47 boulevard Vauban, 59000",
407
- "lstrip": false,
408
- "normalized": true,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": false
412
- },
413
- "32044": {
414
- "content": "https://aurion.junia.com/",
415
- "lstrip": false,
416
- "normalized": true,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": false
420
- },
421
- "32045": {
422
- "content": "Aurion",
423
- "lstrip": false,
424
- "normalized": true,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": false
428
- },
429
- "32046": {
430
- "content": "Junia-learning",
431
- "lstrip": false,
432
- "normalized": true,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": false
436
- },
437
- "32047": {
438
- "content": "junia.jobteaser.com",
439
- "lstrip": false,
440
- "normalized": true,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": false
444
- },
445
- "32048": {
446
- "content": "all-lacatho.fr0328040240",
447
- "lstrip": false,
448
- "normalized": true,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": false
452
- },
453
- "32049": {
454
- "content": "[email protected]",
455
- "lstrip": false,
456
- "normalized": true,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": false
460
- },
461
- "32050": {
462
- "content": "[email protected]",
463
- "lstrip": false,
464
- "normalized": true,
465
- "rstrip": false,
466
- "single_word": false,
467
- "special": false
468
- },
469
- "32051": {
470
- "content": "[email protected]",
471
- "lstrip": false,
472
- "normalized": true,
473
- "rstrip": false,
474
- "single_word": false,
475
- "special": false
476
  }
477
  },
478
  "bos_token": "<s>",
 
26
  "single_word": false,
27
  "special": true
28
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  "32000": {
30
  "content": "<pad>",
31
  "lstrip": false,
 
59
  "special": false
60
  },
61
  "32004": {
 
 
 
 
 
 
 
 
62
  "content": "JUNIA",
63
  "lstrip": false,
64
  "normalized": true,
 
66
  "single_word": false,
67
  "special": false
68
  },
69
+ "32005": {
 
 
 
 
 
 
 
 
70
  "content": "Lille",
71
  "lstrip": false,
72
  "normalized": true,
73
  "rstrip": false,
74
  "single_word": false,
75
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  }
77
  },
78
  "bos_token": "<s>",