BERT-MLM / tokenizer.json
AmalNlal's picture
Upload tokenizer
bc08043 verified
raw
history blame
20.7 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"!": 5,
"\"": 6,
"#": 7,
"$": 8,
"%": 9,
"&": 10,
"'": 11,
"(": 12,
")": 13,
"*": 14,
"+": 15,
",": 16,
"-": 17,
".": 18,
"/": 19,
"0": 20,
"1": 21,
"2": 22,
"3": 23,
"4": 24,
"5": 25,
"6": 26,
"7": 27,
"8": 28,
"9": 29,
":": 30,
";": 31,
"<": 32,
"=": 33,
">": 34,
"?": 35,
"@": 36,
"A": 37,
"B": 38,
"C": 39,
"D": 40,
"E": 41,
"F": 42,
"G": 43,
"H": 44,
"I": 45,
"J": 46,
"K": 47,
"L": 48,
"M": 49,
"N": 50,
"O": 51,
"P": 52,
"Q": 53,
"R": 54,
"S": 55,
"T": 56,
"U": 57,
"V": 58,
"W": 59,
"X": 60,
"Y": 61,
"Z": 62,
"[": 63,
"\\": 64,
"]": 65,
"^": 66,
"_": 67,
"`": 68,
"a": 69,
"b": 70,
"c": 71,
"d": 72,
"e": 73,
"f": 74,
"g": 75,
"h": 76,
"i": 77,
"j": 78,
"k": 79,
"l": 80,
"m": 81,
"n": 82,
"o": 83,
"p": 84,
"q": 85,
"r": 86,
"s": 87,
"t": 88,
"u": 89,
"v": 90,
"w": 91,
"x": 92,
"y": 93,
"z": 94,
"{": 95,
"|": 96,
"}": 97,
"~": 98,
"¡": 99,
"¢": 100,
"£": 101,
"¦": 102,
"§": 103,
"¨": 104,
"©": 105,
"«": 106,
"¬": 107,
"®": 108,
"¯": 109,
"°": 110,
"±": 111,
"³": 112,
"´": 113,
"µ": 114,
"¶": 115,
"·": 116,
"º": 117,
"»": 118,
"½": 119,
"¾": 120,
"À": 121,
"Ä": 122,
"Å": 123,
"Ç": 124,
"È": 125,
"É": 126,
"Ê": 127,
"Í": 128,
"Î": 129,
"Ï": 130,
"Ð": 131,
"Ñ": 132,
"Ó": 133,
"Ô": 134,
"Ö": 135,
"×": 136,
"Ø": 137,
"Ü": 138,
"ß": 139,
"à": 140,
"á": 141,
"â": 142,
"ä": 143,
"å": 144,
"æ": 145,
"ç": 146,
"è": 147,
"é": 148,
"ê": 149,
"ë": 150,
"ì": 151,
"í": 152,
"î": 153,
"ï": 154,
"ð": 155,
"ñ": 156,
"ò": 157,
"ó": 158,
"ô": 159,
"ö": 160,
"÷": 161,
"ø": 162,
"ù": 163,
"ú": 164,
"û": 165,
"ü": 166,
"ý": 167,
"ā": 168,
"Ă": 169,
"ă": 170,
"ć": 171,
"Ċ": 172,
"ċ": 173,
"Č": 174,
"č": 175,
"ē": 176,
"ě": 177,
"Ğ": 178,
"ğ": 179,
"Ĥ": 180,
"ĥ": 181,
"ħ": 182,
"ī": 183,
"ł": 184,
"ń": 185,
"œ": 186,
"ś": 187,
"Ŝ": 188,
"ŝ": 189,
"ş": 190,
"Š": 191,
"š": 192,
"ŵ": 193,
"Ž": 194,
"ǂ": 195,
"ɛ": 196,
"ʰ": 197,
"ʽ": 198,
"˘": 199,
"˙": 200,
"˚": 201,
"˜": 202,
"̃": 203,
"̄": 204,
"̅": 205,
"Β": 206,
"Γ": 207,
"Δ": 208,
"Ε": 209,
"Θ": 210,
"Λ": 211,
"Μ": 212,
"Π": 213,
"Σ": 214,
"Τ": 215,
"Φ": 216,
"Ψ": 217,
"Ω": 218,
"ά": 219,
"ί": 220,
"α": 221,
"β": 222,
"γ": 223,
"δ": 224,
"ε": 225,
"ζ": 226,
"η": 227,
"θ": 228,
"ι": 229,
"κ": 230,
"λ": 231,
"μ": 232,
"ν": 233,
"ξ": 234,
"ο": 235,
"π": 236,
"ρ": 237,
"ς": 238,
"σ": 239,
"τ": 240,
"υ": 241,
"φ": 242,
"χ": 243,
"ψ": 244,
"ω": 245,
"ϑ": 246,
"ϕ": 247,
"ϖ": 248,
"ϰ": 249,
"ϱ": 250,
"Ё": 251,
"А": 252,
"Б": 253,
"В": 254,
"Г": 255,
"Д": 256,
"Е": 257,
"З": 258,
"И": 259,
"К": 260,
"Л": 261,
"М": 262,
"Н": 263,
"О": 264,
"П": 265,
"Р": 266,
"С": 267,
"Т": 268,
"У": 269,
"Ф": 270,
"Х": 271,
"Ц": 272,
"Ч": 273,
"Ш": 274,
"Щ": 275,
"Ъ": 276,
"Э": 277,
"Я": 278,
"а": 279,
"б": 280,
"в": 281,
"г": 282,
"д": 283,
"е": 284,
"ж": 285,
"з": 286,
"и": 287,
"й": 288,
"к": 289,
"л": 290,
"м": 291,
"н": 292,
"о": 293,
"п": 294,
"р": 295,
"с": 296,
"т": 297,
"у": 298,
"ф": 299,
"х": 300,
"ц": 301,
"ч": 302,
"ш": 303,
"щ": 304,
"ъ": 305,
"ы": 306,
"ь": 307,
"э": 308,
"ю": 309,
"я": 310,
"ё": 311,
"׀": 312,
"Ḥ": 313,
"ṭ": 314,
"‐": 315,
"–": 316,
"—": 317,
"‖": 318,
"‘": 319,
"’": 320,
"‚": 321,
"“": 322,
"”": 323,
"„": 324,
"†": 325,
"‡": 326,
"•": 327,
"‥": 328,
"…": 329,
"‰": 330,
"′": 331,
"″": 332,
"‴": 333,
"‹": 334,
"›": 335,
"※": 336,
"₫": 337,
"€": 338,
"⃄": 339,
"ℏ": 340,
"ℓ": 341,
"ℝ": 342,
"™": 343,
"ℴ": 344,
"ℵ": 345,
"←": 346,
"↑": 347,
"→": 348,
"↓": 349,
"↔": 350,
"↶": 351,
"↷": 352,
"⇄": 353,
"⇆": 354,
"⇋": 355,
"⇌": 356,
"⇒": 357,
"⇘": 358,
"∁": 359,
"∂": 360,
"∄": 361,
"∆": 362,
"∇": 363,
"∈": 364,
"∉": 365,
"∎": 366,
"∏": 367,
"∑": 368,
"−": 369,
"∗": 370,
"∘": 371,
"∙": 372,
"√": 373,
"∝": 374,
"∞": 375,
"∠": 376,
"∣": 377,
"∥": 378,
"∩": 379,
"∫": 380,
"∮": 381,
"∶": 382,
"∸": 383,
"∼": 384,
"≃": 385,
"≅": 386,
"≈": 387,
"≊": 388,
"≍": 389,
"≏": 390,
"≙": 391,
"≠": 392,
"≡": 393,
"≤": 394,
"≥": 395,
"≦": 396,
"≧": 397,
"≪": 398,
"≫": 399,
"≳": 400,
"≶": 401,
"≽": 402,
"⊂": 403,
"⊕": 404,
"⊖": 405,
"⊗": 406,
"⊙": 407,
"⊥": 408,
"⊵": 409,
"⋅": 410,
"⋆": 411,
"⋍": 412,
"⋯": 413,
"⌈": 414,
"⌉": 415,
"⌢": 416,
"⌽": 417,
"␣": 418,
"║": 419,
"■": 420,
"□": 421,
"△": 422,
"▵": 423,
"▶": 424,
"►": 425,
"●": 426,
"◘": 427,
"◦": 428,
"◯": 429,
"♪": 430,
"✝": 431,
"⟶": 432,
"⨁": 433,
"⩽": 434,
"⩾": 435,
"⪍": 436,
"⪞": 437,
"⪯": 438,
"〈": 439,
"〉": 440,
"##i": 441,
"##n": 442,
"##e": 443,
"##l": 444,
"##y": 445,
"##u": 446,
"##t": 447,
"##o": 448,
"##m": 449,
"##a": 450,
"##s": 451,
"##r": 452,
"##g": 453,
"##h": 454,
"##c": 455,
"##d": 456,
"##b": 457,
"##w": 458,
"##а": 459,
"##з": 460,
"##в": 461,
"##е": 462,
"##р": 463,
"##т": 464,
"##к": 465,
"##и": 466,
"##ä": 467,
"##p": 468,
"##z": 469,
"##k": 470,
"##4": 471,
"##0": 472,
"##2": 473,
"##é": 474,
"##ü": 475,
"##F": 476,
"##I": 477,
"##P": 478,
"##á": 479,
"##x": 480,
"##X": 481,
"##è": 482,
"##f": 483,
"##с": 484,
"##ы": 485,
"##о": 486,
"##л": 487,
"##у": 488,
"##ч": 489,
"##н": 490,
"##й": 491,
"##D": 492,
"##S": 493,
"##E": 494,
"##A": 495,
"##C": 496,
"##v": 497,
"##ß": 498,
"##г": 499,
"##ö": 500,
"##q": 501,
"##д": 502,
"##R": 503,
"##B": 504,
"##L": 505,
"##U": 506,
"##N": 507,
"##G": 508,
"##W": 509,
"##O": 510,
"##Z": 511,
"##M": 512,
"##T": 513,
"##9": 514,
"##8": 515,
"##V": 516,
"##ю": 517,
"##щ": 518,
"##я": 519,
"##3": 520,
"##H": 521,
"##β": 522,
"##−": 523,
"##С": 524,
"##∶": 525,
"##5": 526,
"##7": 527,
"##6": 528,
"##м": 529,
"##ь": 530,
"##1": 531,
"##°": 532,
"##σ": 533,
"##б": 534,
"##à": 535,
"##≪": 536,
"##ж": 537,
"##±": 538,
"##j": 539,
"##ц": 540,
"##≤": 541,
"##х": 542,
"##И": 543,
"##Э": 544,
"##K": 545,
"##×": 546,
"##п": 547,
"##í": 548,
"##Е": 549,
"##Н": 550,
"##А": 551,
"##О": 552,
"##ï": 553,
"##Y": 554,
"##Т": 555,
"##®": 556,
"##J": 557,
"##ф": 558,
"##→": 559,
"##η": 560,
"##Р": 561,
"##î": 562,
"##ó": 563,
"##ñ": 564,
"##Ü": 565,
"##Ч": 566,
"##™": 567,
"##Q": 568,
"##М": 569,
"##Ф": 570,
"##У": 571,
"##μ": 572,
"##ν": 573,
"##δ": 574,
"##Д": 575,
"##α": 576,
"##В": 577,
"##⊕": 578,
"##´": 579,
"##ʰ": 580,
"##≡": 581,
"##←": 582,
"##ш": 583,
"##ë": 584,
"##Б": 585,
"##Щ": 586,
"##≽": 587,
"##â": 588,
"##̄": 589,
"##˚": 590,
"##λ": 591,
"##γ": 592,
"##ç": 593,
"##␣": 594,
"##Ä": 595,
"##ò": 596,
"##ê": 597,
"##Г": 598,
"##ô": 599,
"##↑": 600,
"##ú": 601,
"##∥": 602,
"##≅": 603,
"##æ": 604,
"##ċ": 605,
"##∼": 606,
"##µ": 607,
"##≥": 608,
"##ø": 609,
"##φ": 610,
"##э": 611,
"##ε": 612,
"##ρ": 613,
"##ο": 614,
"##ς": 615,
"##π": 616,
"##Å": 617,
"##∁": 618,
"##¯": 619,
"##¦": 620,
"##Ω": 621,
"##¨": 622,
"##Σ": 623,
"##ă": 624,
"##⋅": 625,
"##ά": 626,
"##Ñ": 627,
"##œ": 628,
"##ъ": 629,
"##Δ": 630,
"##ć": 631,
"##≫": 632,
"##Ö": 633,
"##û": 634,
"##≃": 635,
"##∞": 636,
"##ń": 637,
"##Β": 638,
"##å": 639,
"##ð": 640,
"##⩽": 641,
"##↔": 642,
"##ù": 643,
"##⋯": 644,
"##Ё": 645,
"##⇌": 646,
"##√": 647,
"##ì": 648,
"##κ": 649,
"##ψ": 650,
"##º": 651,
"##υ": 652,
"##⇋": 653,
"##ě": 654,
"##ω": 655,
"##ℓ": 656,
"##ξ": 657,
"##К": 658,
"##▵": 659,
"##≈": 660,
"##É": 661,
"##Î": 662,
"##Í": 663,
"##∘": 664,
"##ğ": 665,
"##ī": 666,
"##ṭ": 667,
"##ś": 668,
"##П": 669,
"##˘": 670,
"##ā": 671,
"##÷": 672,
"##⊥": 673,
"##˜": 674,
"##Ô": 675,
"##⪯": 676,
"##Ъ": 677,
"##⪍": 678,
"##̅": 679,
"##θ": 680,
"##Ê": 681,
"##Ψ": 682,
"##∸": 683,
"##≦": 684,
"##₫": 685,
"##Ï": 686,
"##⊖": 687,
"##ł": 688,
"##Φ": 689,
"##ℴ": 690,
"##Ŝ": 691,
"##ý": 692,
"##Ċ": 693,
"##⇘": 694,
"##≠": 695,
"##⌢": 696,
"##ι": 697,
"##Π": 698,
"##ё": 699,
"##¢": 700,
"##∏": 701,
"##↓": 702,
"##ϕ": 703,
"##ϰ": 704,
"##š": 705,
"##⇒": 706,
"##ŝ": 707,
"##č": 708,
"##⊂": 709,
"##Ç": 710,
"##∙": 711,
"##∣": 712,
"##³": 713,
"##⨁": 714,
"##⊙": 715,
"##τ": 716,
"##ϑ": 717,
"##ş": 718,
"##≳": 719,
"##ϱ": 720,
"##ζ": 721,
"##Μ": 722,
"##˙": 723,
"##∗": 724,
"##◦": 725,
"##¬": 726,
"##ɛ": 727,
"##er": 728,
"##en": 729,
"##on": 730,
"##ti": 731,
"##he": 732,
"##in": 733,
"##es": 734,
"##nd": 735,
"the": 736,
"##ed": 737,
"##al": 738,
"of": 739,
"##tion": 740,
"##ch": 741,
"##or": 742,
"##ro": 743,
"##at": 744,
"##it": 745,
"##an": 746,
"in": 747,
"##ar": 748,
"##et": 749,
"##as": 750,
"##is": 751,
"##ol": 752,
"##el": 753,
"and": 754,
"##ur": 755,
"##ic": 756,
"##ation": 757,
"##ec": 758,
"##st": 759,
"##ent": 760,
"##ng": 761,
"##le": 762,
"##id": 763,
"##om": 764,
"##re": 765,
"##ing": 766,
"##ac": 767,
"##ie": 768,
"to": 769,
"##ul": 770,
"##ung": 771,
"##il": 772,
"##tr": 773,
"##em": 774,
"##ich": 775,
"##ig": 776,
"##us": 777,
"The": 778,
"##ter": 779,
"##am": 780,
"th": 781,
"##um": 782,
"for": 783,
"con": 784,
"##od": 785,
"##uc": 786,
"##os": 787,
"pro": 788,
"der": 789,
"##yl": 790,
"##ith": 791,
"##ff": 792,
"##im": 793,
"be": 794,
"und": 795,
"##tiv": 796,
"##te": 797,
"an": 798,
"with": 799,
"is": 800,
"##ts": 801,
"##ly": 802,
"##ere": 803,
"##ine": 804,
"##omp": 805,
"##tic": 806,
"##th": 807,
"re": 808,
"##ir": 809,
"##ph": 810,
"by": 811,
"##ta": 812,
"##rom": 813,
"al": 814,
"##ge": 815,
"die": 816,
"ac": 817,
"##per": 818,
"##eth": 819,
"##ten": 820,
"on": 821,
"##ow": 822,
"##ate": 823,
"di": 824,
"##ra": 825,
"de": 826,
"##ect": 827,
"was": 828,
"##si": 829,
"##ut": 830,
"##gen": 831,
"comp": 832,
"##erm": 833,
"ex": 834,
"##qu": 835,
"##ity": 836,
"##ure": 837,
"##ell": 838,
"##ce": 839,
"as": 840,
"##ated": 841,
"ch": 842,
"##ab": 843,
"##se": 844,
"##aly": 845,
"##di": 846,
"##ri": 847,
"##den": 848,
"##ers": 849,
"##op": 850,
"##iz": 851,
"##ess": 852,
"are": 853,
"sp": 854,
"##ver": 855,
"st": 856,
"##ou": 857,
"##yd": 858,
"at": 859,
"en": 860,
"were": 861,
"##oly": 862,
"##pl": 863,
"res": 864,
"ein": 865,
"su": 866,
"or": 867,
"##che": 868,
"von": 869,
"##hr": 870,
"##og": 871,
"##ical": 872,
"##action": 873,
"pr": 874,
"##ase": 875,
"ph": 876,
"des": 877,
"##ve": 878,
"##ind": 879,
"##yst": 880,
"##ot": 881,
"##ib": 882,
"##uf": 883,
"im": 884,
"meth": 885,
"that": 886,
"sol": 887,
"##tive": 888,
"In": 889,
"##chen": 890,
"##ide": 891,
"wh": 892,
"##ier": 893,
"##igh": 894,
"##ran": 895,
"from": 896,
"##lu": 897,
"##de": 898,
"##ations": 899,
"##ment": 900,
"##if": 901,
"##sis": 902,
"##and": 903,
"un": 904,
"us": 905,
"##ium": 906,
"##ydro": 907,
"##ak": 908,
"##orm": 909,
"det": 910,
"##ap": 911,
"str": 912,
"##duc": 913,
"##all": 914,
"##eit": 915,
"##ati": 916,
"##ens": 917,
"##ant": 918,
"##oc": 919,
"me": 920,
"##ous": 921,
"##ined": 922,
"dis": 923,
"se": 924,
"##ene": 925,
"##der": 926,
"##nt": 927,
"##ass": 928,
"##ay": 929,
"##lich": 930,
"##tions": 931,
"el": 932,
"##we": 933,
"##un": 934,
"##ach": 935,
"##ber": 936,
"##ungen": 937,
"par": 938,
"##ox": 939,
"##etr": 940,
"##pp": 941,
"poly": 942,
"##bs": 943,
"##ad": 944,
"##end": 945,
"##ectro": 946,
"##ater": 947,
"##ence": 948,
"##hem": 949,
"acid": 950,
"mit": 951,
"##ents": 952,
"##arb": 953,
"diff": 954,
"##ies": 955,
"analy": 956,
"##act": 957,
"##uct": 958,
"##ound": 959,
"##ied": 960,
"er": 961,
"mol": 962,
"An": 963,
"den": 964,
"##chn": 965,
"inter": 966,
"form": 967,
"ne": 968,
"##uch": 969,
"##ons": 970,
"##est": 971,
"als": 972,
"sh": 973,
"##ystem": 974,
"ad": 975,
"##form": 976,
"##ial": 977,
"am": 978,
"stu": 979,
"method": 980,
"##par": 981,
"hydro": 982,
"##esti": 983,
"co": 984,
"##ag": 985,
"##ür": 986,
"ox": 987,
"Die": 988,
"##iv": 989,
"##lex": 990,
"##iel": 991,
"Th": 992,
"Ver": 993,
"been": 994,
"inc": 995,
"Re": 996,
"##ble": 997,
"##entr": 998,
"##iqu": 999
}
}
}