kiansheik committed on
Commit
f3f0897
1 Parent(s): 952ad60

Adding multiple orthographies

Browse files

No Japanese yet; still only Latin-based characters.

config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "models/t5-1.3_base_nouns/",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
1
  {
2
+ "_name_or_path": "models/t5-1.5_special_chars_full/",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f15ee67ce40176628e800736efe8b306d9184767fa5404cf75a0d2cf5a627014
3
  size 242250792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95f56b3ca947f730118bea445d4fac2ab563df7413b9d37c86af323b4fd54679
3
  size 242250792
special_tokens_map.json CHANGED
@@ -140,13 +140,6 @@
140
  "rstrip": false,
141
  "single_word": false
142
  },
143
- {
144
- "content": "abo",
145
- "lstrip": false,
146
- "normalized": false,
147
- "rstrip": false,
148
- "single_word": false
149
- },
150
  {
151
  "content": "[GERUND_SUFFIX:CLASS_1]",
152
  "lstrip": false,
@@ -168,13 +161,6 @@
168
  "rstrip": false,
169
  "single_word": false
170
  },
171
- {
172
- "content": "[GERUND_SUFFIX:CLASS_1:IYU]",
173
- "lstrip": false,
174
- "normalized": false,
175
- "rstrip": false,
176
- "single_word": false
177
- },
178
  {
179
  "content": "[NEGATION_SUFFIX]",
180
  "lstrip": false,
@@ -246,28 +232,28 @@
246
  "single_word": false
247
  },
248
  {
249
- "content": "[SUBJECT:1ppe]",
250
  "lstrip": false,
251
  "normalized": false,
252
  "rstrip": false,
253
  "single_word": false
254
  },
255
  {
256
- "content": "[SUBJECT:1ps]",
257
  "lstrip": false,
258
  "normalized": false,
259
  "rstrip": false,
260
  "single_word": false
261
  },
262
  {
263
- "content": "a",
264
  "lstrip": false,
265
  "normalized": false,
266
  "rstrip": false,
267
  "single_word": false
268
  },
269
  {
270
- "content": "[w1q]ep[w4q]",
271
  "lstrip": false,
272
  "normalized": false,
273
  "rstrip": false,
@@ -288,14 +274,14 @@
288
  "single_word": false
289
  },
290
  {
291
- "content": "[OBJECT:2ps]",
292
  "lstrip": false,
293
  "normalized": false,
294
  "rstrip": false,
295
  "single_word": false
296
  },
297
  {
298
- "content": "pe[w10q]",
299
  "lstrip": false,
300
  "normalized": false,
301
  "rstrip": false,
@@ -337,14 +323,14 @@
337
  "single_word": false
338
  },
339
  {
340
- "content": "[NEGATION_PREFIX]",
341
  "lstrip": false,
342
  "normalized": false,
343
  "rstrip": false,
344
  "single_word": false
345
  },
346
  {
347
- "content": "ix[w4q]",
348
  "lstrip": false,
349
  "normalized": false,
350
  "rstrip": false,
@@ -393,14 +379,14 @@
393
  "single_word": false
394
  },
395
  {
396
- "content": "[OBJECT_MARKER:3p:DEFAULT]",
397
  "lstrip": false,
398
  "normalized": false,
399
  "rstrip": false,
400
  "single_word": false
401
  },
402
  {
403
- "content": "opo",
404
  "lstrip": false,
405
  "normalized": false,
406
  "rstrip": false,
@@ -568,14 +554,14 @@
568
  "single_word": false
569
  },
570
  {
571
- "content": "pa",
572
  "lstrip": false,
573
  "normalized": false,
574
  "rstrip": false,
575
  "single_word": false
576
  },
577
  {
578
- "content": "ramo",
579
  "lstrip": false,
580
  "normalized": false,
581
  "rstrip": false,
@@ -595,13 +581,6 @@
595
  "rstrip": false,
596
  "single_word": false
597
  },
598
- {
599
- "content": "[w5q]",
600
- "lstrip": false,
601
- "normalized": false,
602
- "rstrip": false,
603
- "single_word": false
604
- },
605
  {
606
  "content": "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
607
  "lstrip": false,
@@ -638,14 +617,14 @@
638
  "single_word": false
639
  },
640
  {
641
- "content": "oro",
642
  "lstrip": false,
643
  "normalized": false,
644
  "rstrip": false,
645
  "single_word": false
646
  },
647
  {
648
- "content": "[OBJECT:2ps:SUBJECT_1P]",
649
  "lstrip": false,
650
  "normalized": false,
651
  "rstrip": false,
@@ -666,14 +645,14 @@
666
  "single_word": false
667
  },
668
  {
669
- "content": "[PLURIFORM_PREFIX:R]",
670
  "lstrip": false,
671
  "normalized": false,
672
  "rstrip": false,
673
  "single_word": false
674
  },
675
  {
676
- "content": "[SUBJECT:2ps]",
677
  "lstrip": false,
678
  "normalized": false,
679
  "rstrip": false,
 
140
  "rstrip": false,
141
  "single_word": false
142
  },
 
 
 
 
 
 
 
143
  {
144
  "content": "[GERUND_SUFFIX:CLASS_1]",
145
  "lstrip": false,
 
161
  "rstrip": false,
162
  "single_word": false
163
  },
 
 
 
 
 
 
 
164
  {
165
  "content": "[NEGATION_SUFFIX]",
166
  "lstrip": false,
 
232
  "single_word": false
233
  },
234
  {
235
+ "content": "[SUBJECT:1ps]",
236
  "lstrip": false,
237
  "normalized": false,
238
  "rstrip": false,
239
  "single_word": false
240
  },
241
  {
242
+ "content": "[SUBJECT:1ppe]",
243
  "lstrip": false,
244
  "normalized": false,
245
  "rstrip": false,
246
  "single_word": false
247
  },
248
  {
249
+ "content": "[w1q]ep[w4q]",
250
  "lstrip": false,
251
  "normalized": false,
252
  "rstrip": false,
253
  "single_word": false
254
  },
255
  {
256
+ "content": "a",
257
  "lstrip": false,
258
  "normalized": false,
259
  "rstrip": false,
 
274
  "single_word": false
275
  },
276
  {
277
+ "content": "pe[w10q]",
278
  "lstrip": false,
279
  "normalized": false,
280
  "rstrip": false,
281
  "single_word": false
282
  },
283
  {
284
+ "content": "[OBJECT:2ps]",
285
  "lstrip": false,
286
  "normalized": false,
287
  "rstrip": false,
 
323
  "single_word": false
324
  },
325
  {
326
+ "content": "ix[w4q]",
327
  "lstrip": false,
328
  "normalized": false,
329
  "rstrip": false,
330
  "single_word": false
331
  },
332
  {
333
+ "content": "[NEGATION_PREFIX]",
334
  "lstrip": false,
335
  "normalized": false,
336
  "rstrip": false,
 
379
  "single_word": false
380
  },
381
  {
382
+ "content": "opo",
383
  "lstrip": false,
384
  "normalized": false,
385
  "rstrip": false,
386
  "single_word": false
387
  },
388
  {
389
+ "content": "[OBJECT_MARKER:3p:DEFAULT]",
390
  "lstrip": false,
391
  "normalized": false,
392
  "rstrip": false,
 
554
  "single_word": false
555
  },
556
  {
557
+ "content": "ramo",
558
  "lstrip": false,
559
  "normalized": false,
560
  "rstrip": false,
561
  "single_word": false
562
  },
563
  {
564
+ "content": "pa",
565
  "lstrip": false,
566
  "normalized": false,
567
  "rstrip": false,
 
581
  "rstrip": false,
582
  "single_word": false
583
  },
 
 
 
 
 
 
 
584
  {
585
  "content": "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
586
  "lstrip": false,
 
617
  "single_word": false
618
  },
619
  {
620
+ "content": "[OBJECT:2ps:SUBJECT_1P]",
621
  "lstrip": false,
622
  "normalized": false,
623
  "rstrip": false,
624
  "single_word": false
625
  },
626
  {
627
+ "content": "oro",
628
  "lstrip": false,
629
  "normalized": false,
630
  "rstrip": false,
 
645
  "single_word": false
646
  },
647
  {
648
+ "content": "[SUBJECT:2ps]",
649
  "lstrip": false,
650
  "normalized": false,
651
  "rstrip": false,
652
  "single_word": false
653
  },
654
  {
655
+ "content": "[PLURIFORM_PREFIX:R]",
656
  "lstrip": false,
657
  "normalized": false,
658
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -1975,11 +1975,9 @@
1975
  "e[w15q]ym",
1976
  "[OBJECT_MARKER:3p:PLURIFORM_PREFIX:MONOSYLLABIC]",
1977
  "[OBJECT:1ppi]",
1978
- "abo",
1979
  "[GERUND_SUFFIX:CLASS_1]",
1980
  "[SUB_VERB]",
1981
  "n[w15q]",
1982
- "[GERUND_SUFFIX:CLASS_1:IYU]",
1983
  "[NEGATION_SUFFIX]",
1984
  "[GERUND_SUBJECT_PREFIX:1ppe]",
1985
  "i",
@@ -1990,29 +1988,29 @@
1990
  "[w1q]",
1991
  "xe",
1992
  "t",
1993
- "[SUBJECT:1ppe]",
1994
  "[SUBJECT:1ps]",
1995
- "a",
1996
  "[w1q]ep[w4q]",
 
1997
  "[SUBJECT_PREFIX:1ppe]",
1998
  "amo",
1999
- "[OBJECT:2ps]",
2000
  "pe[w10q]",
 
2001
  "[OBJECT:1ppe]",
2002
  "[NEGATION_SUFFIX:CONSONANT_ENDING]",
2003
  "[OBJECT:MUTUAL]",
2004
  "[GERUND_SUFFIX:CLASS_2:ORAL_VOWEL_ENDING]",
2005
  "[w1q]o",
2006
- "[NEGATION_PREFIX]",
2007
  "ix[w4q]",
 
2008
  "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
2009
  "[SUBJECT:1ppi]",
2010
  "[PERMISSIVE_PREFIX:VOWEL]",
2011
  "[w1q]e",
2012
  "pe",
2013
  "nde",
2014
- "[OBJECT_MARKER:3p:DEFAULT]",
2015
  "opo",
 
2016
  "or[w4q]",
2017
  "[GERUND_SUBJECT_PREFIX:2ps]",
2018
  "[w1q]a",
@@ -2036,22 +2034,21 @@
2036
  "[NEGATION_SUFFIX:VOWEL_ENDING]",
2037
  "[SUBJECT_PREFIX:1ps]",
2038
  "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
2039
- "pa",
2040
  "ramo",
 
2041
  "ere",
2042
  "[OBJECT:REFLEXIVE]",
2043
- "[w5q]",
2044
  "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
2045
  "na",
2046
  "[GERUND_SUFFIX:CLASS_1:B]",
2047
  "[OBJECT:3p:MONOSYLLABIC]",
2048
  "[GERUND_SUFFIX:CLASS_1:CONSONANT]",
2049
- "oro",
2050
  "[OBJECT:2ps:SUBJECT_1P]",
 
2051
  "[GERUND_SUBJECT_PREFIX:1ps]",
2052
  "[PERMISSIVE_PREFIX:CONSONANT]",
2053
- "[PLURIFORM_PREFIX:R]",
2054
  "[SUBJECT:2ps]",
 
2055
  "o",
2056
  "[SUBJECT:2ps:OBJECT_1P]",
2057
  "end[w4q]",
 
1975
  "e[w15q]ym",
1976
  "[OBJECT_MARKER:3p:PLURIFORM_PREFIX:MONOSYLLABIC]",
1977
  "[OBJECT:1ppi]",
 
1978
  "[GERUND_SUFFIX:CLASS_1]",
1979
  "[SUB_VERB]",
1980
  "n[w15q]",
 
1981
  "[NEGATION_SUFFIX]",
1982
  "[GERUND_SUBJECT_PREFIX:1ppe]",
1983
  "i",
 
1988
  "[w1q]",
1989
  "xe",
1990
  "t",
 
1991
  "[SUBJECT:1ps]",
1992
+ "[SUBJECT:1ppe]",
1993
  "[w1q]ep[w4q]",
1994
+ "a",
1995
  "[SUBJECT_PREFIX:1ppe]",
1996
  "amo",
 
1997
  "pe[w10q]",
1998
+ "[OBJECT:2ps]",
1999
  "[OBJECT:1ppe]",
2000
  "[NEGATION_SUFFIX:CONSONANT_ENDING]",
2001
  "[OBJECT:MUTUAL]",
2002
  "[GERUND_SUFFIX:CLASS_2:ORAL_VOWEL_ENDING]",
2003
  "[w1q]o",
 
2004
  "ix[w4q]",
2005
+ "[NEGATION_PREFIX]",
2006
  "[CIRCUMSTANTIAL_SUFFIX:NULL_ENDING]",
2007
  "[SUBJECT:1ppi]",
2008
  "[PERMISSIVE_PREFIX:VOWEL]",
2009
  "[w1q]e",
2010
  "pe",
2011
  "nde",
 
2012
  "opo",
2013
+ "[OBJECT_MARKER:3p:DEFAULT]",
2014
  "or[w4q]",
2015
  "[GERUND_SUBJECT_PREFIX:2ps]",
2016
  "[w1q]a",
 
2034
  "[NEGATION_SUFFIX:VOWEL_ENDING]",
2035
  "[SUBJECT_PREFIX:1ps]",
2036
  "[GERUND_SUFFIX:CLASS_1:ORAL_VOWEL]",
 
2037
  "ramo",
2038
+ "pa",
2039
  "ere",
2040
  "[OBJECT:REFLEXIVE]",
 
2041
  "[GERUND_SUFFIX:CLASS_1:NASAL_VOWEL]",
2042
  "na",
2043
  "[GERUND_SUFFIX:CLASS_1:B]",
2044
  "[OBJECT:3p:MONOSYLLABIC]",
2045
  "[GERUND_SUFFIX:CLASS_1:CONSONANT]",
 
2046
  "[OBJECT:2ps:SUBJECT_1P]",
2047
+ "oro",
2048
  "[GERUND_SUBJECT_PREFIX:1ps]",
2049
  "[PERMISSIVE_PREFIX:CONSONANT]",
 
2050
  "[SUBJECT:2ps]",
2051
+ "[PLURIFORM_PREFIX:R]",
2052
  "o",
2053
  "[SUBJECT:2ps:OBJECT_1P]",
2054
  "end[w4q]",