musiclang / tokenizer.json
floriangardin's picture
Upload tokenizer
a1f66da
raw
history blame
23.5 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 821,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<s>": 0,
"</s>": 1,
"<unk>": 2,
"<mask>": 3,
"!": 4,
"\"": 5,
"#": 6,
"$": 7,
"%": 8,
"&": 9,
"'": 10,
"(": 11,
")": 12,
"*": 13,
"+": 14,
",": 15,
"-": 16,
".": 17,
"/": 18,
"0": 19,
"1": 20,
"2": 21,
"3": 22,
"4": 23,
"5": 24,
"6": 25,
"7": 26,
"8": 27,
"9": 28,
":": 29,
";": 30,
"<": 31,
"=": 32,
">": 33,
"?": 34,
"@": 35,
"A": 36,
"B": 37,
"C": 38,
"D": 39,
"E": 40,
"F": 41,
"G": 42,
"H": 43,
"I": 44,
"J": 45,
"K": 46,
"L": 47,
"M": 48,
"N": 49,
"O": 50,
"P": 51,
"Q": 52,
"R": 53,
"S": 54,
"T": 55,
"U": 56,
"V": 57,
"W": 58,
"X": 59,
"Y": 60,
"Z": 61,
"[": 62,
"\\": 63,
"]": 64,
"^": 65,
"_": 66,
"`": 67,
"a": 68,
"b": 69,
"c": 70,
"d": 71,
"e": 72,
"f": 73,
"g": 74,
"h": 75,
"i": 76,
"j": 77,
"k": 78,
"l": 79,
"m": 80,
"n": 81,
"o": 82,
"p": 83,
"q": 84,
"r": 85,
"s": 86,
"t": 87,
"u": 88,
"v": 89,
"w": 90,
"x": 91,
"y": 92,
"z": 93,
"{": 94,
"|": 95,
"}": 96,
"~": 97,
"¡": 98,
"¢": 99,
"£": 100,
"¤": 101,
"¥": 102,
"¦": 103,
"§": 104,
"¨": 105,
"©": 106,
"ª": 107,
"«": 108,
"¬": 109,
"®": 110,
"¯": 111,
"°": 112,
"±": 113,
"²": 114,
"³": 115,
"´": 116,
"µ": 117,
"¶": 118,
"·": 119,
"¸": 120,
"¹": 121,
"º": 122,
"»": 123,
"¼": 124,
"½": 125,
"¾": 126,
"¿": 127,
"À": 128,
"Á": 129,
"Â": 130,
"Ã": 131,
"Ä": 132,
"Å": 133,
"Æ": 134,
"Ç": 135,
"È": 136,
"É": 137,
"Ê": 138,
"Ë": 139,
"Ì": 140,
"Í": 141,
"Î": 142,
"Ï": 143,
"Ð": 144,
"Ñ": 145,
"Ò": 146,
"Ó": 147,
"Ô": 148,
"Õ": 149,
"Ö": 150,
"×": 151,
"Ø": 152,
"Ù": 153,
"Ú": 154,
"Û": 155,
"Ü": 156,
"Ý": 157,
"Þ": 158,
"ß": 159,
"à": 160,
"á": 161,
"â": 162,
"ã": 163,
"ä": 164,
"å": 165,
"æ": 166,
"ç": 167,
"è": 168,
"é": 169,
"ê": 170,
"ë": 171,
"ì": 172,
"í": 173,
"î": 174,
"ï": 175,
"ð": 176,
"ñ": 177,
"ò": 178,
"ó": 179,
"ô": 180,
"õ": 181,
"ö": 182,
"÷": 183,
"ø": 184,
"ù": 185,
"ú": 186,
"û": 187,
"ü": 188,
"ý": 189,
"þ": 190,
"ÿ": 191,
"Ā": 192,
"ā": 193,
"Ă": 194,
"ă": 195,
"Ą": 196,
"ą": 197,
"Ć": 198,
"ć": 199,
"Ĉ": 200,
"ĉ": 201,
"Ċ": 202,
"ċ": 203,
"Č": 204,
"č": 205,
"Ď": 206,
"ď": 207,
"Đ": 208,
"đ": 209,
"Ē": 210,
"ē": 211,
"Ĕ": 212,
"ĕ": 213,
"Ė": 214,
"ė": 215,
"Ę": 216,
"ę": 217,
"Ě": 218,
"ě": 219,
"Ĝ": 220,
"ĝ": 221,
"Ğ": 222,
"ğ": 223,
"Ġ": 224,
"ġ": 225,
"Ģ": 226,
"ģ": 227,
"Ĥ": 228,
"ĥ": 229,
"Ħ": 230,
"ħ": 231,
"Ĩ": 232,
"ĩ": 233,
"Ī": 234,
"ī": 235,
"Ĭ": 236,
"ĭ": 237,
"Į": 238,
"į": 239,
"İ": 240,
"ı": 241,
"Ĳ": 242,
"ĳ": 243,
"Ĵ": 244,
"ĵ": 245,
"Ķ": 246,
"ķ": 247,
"ĸ": 248,
"Ĺ": 249,
"ĺ": 250,
"Ļ": 251,
"ļ": 252,
"Ľ": 253,
"ľ": 254,
"Ŀ": 255,
"ŀ": 256,
"Ł": 257,
"ł": 258,
"Ń": 259,
").": 260,
"(-": 261,
"__": 262,
"mp": 263,
"no": 264,
"pi": 265,
"ano": 266,
"piano": 267,
")+": 268,
"pp": 269,
")(": 270,
")+(": 271,
"II": 272,
"ed": 273,
"']": 274,
"['": 275,
"']%": 276,
"qd": 277,
"sd": 278,
"ff": 279,
"en": 280,
"ra": 281,
"au": 282,
"fra": 283,
"gm": 284,
"ent": 285,
"augm": 286,
"frac": 287,
"augment": 288,
"ri": 289,
"VII": 290,
"ht": 291,
"bri": 292,
"ght": 293,
"bright": 294,
"IV": 295,
"),": 296,
"11": 297,
"hd": 298,
"10": 299,
"VI": 300,
"III": 301,
")).": 302,
"))+": 303,
"fff": 304,
"24": 305,
")()+(": 306,
"su": 307,
"on": 308,
"64": 309,
"ba": 310,
"sa": 311,
"sax": 312,
"21": 313,
"))+(": 314,
"65": 315,
"te": 316,
"ppp": 317,
"wd": 318,
"tr": 319,
"43": 320,
"ol": 321,
"ss": 322,
"et": 323,
"ar": 324,
"or": 325,
"one": 326,
"bass": 327,
"ti": 328,
"hor": 329,
"horn": 330,
"el": 331,
"iol": 332,
"viol": 333,
"mb": 334,
"56": 335,
"ump": 336,
"trump": 337,
"trumpet": 338,
"gu": 339,
"it": 340,
"guit": 341,
"guitar": 342,
"to": 343,
"omb": 344,
"tromb": 345,
"trombone": 346,
"40": 347,
"is": 348,
"rin": 349,
"12": 350,
"ch": 351,
"nor": 352,
"tenor": 353,
"13": 354,
"al": 355,
"alto": 356,
"in": 357,
"15": 358,
"op": 359,
"rano": 360,
"sop": 361,
"soprano": 362,
"an": 363,
"gl": 364,
"har": 365,
"harp": 366,
"cel": 367,
"engl": 368,
"ish": 369,
"english": 370,
"fr": 371,
"tu": 372,
"tuba": 373,
"bo": 374,
"tone": 375,
"ritone": 376,
"baritone": 377,
"lo": 378,
"obo": 379,
"oboe": 380,
"cc": 381,
"cello": 382,
"cl": 383,
"arin": 384,
"violin": 385,
"ench": 386,
"french": 387,
"picc": 388,
"olo": 389,
"piccolo": 390,
"oon": 391,
"bassoon": 392,
"clarin": 393,
"clarinet": 394,
"dr": 395,
"ms": 396,
"ums": 397,
"drums": 398,
"viola": 399,
"fl": 400,
"ute": 401,
"flute": 402,
"con": 403,
"tra": 404,
"contra": 405,
"contrabass": 406,
"mpan": 407,
"timpan": 408,
"timpani": 409,
"35": 410,
")),": 411,
"se": 412,
"14": 413,
"tion": 414,
"ste": 415,
"steel": 416,
"16": 417,
")<": 418,
"do": 419,
"fte": 420,
"xt": 421,
"|>": 422,
"endo": 423,
")<|": 424,
"ftext": 425,
"endoftext": 426,
"|>(": 427,
"17": 428,
"ac": 429,
"ou": 430,
"sti": 431,
"acou": 432,
"stic": 433,
"acoustic": 434,
"ad": 435,
"{-": 436,
"le": 437,
"}']%": 438,
"30": 439,
"42": 440,
"18": 441,
"st": 442,
"32": 443,
"bra": 444,
"brass": 445,
"add": 446,
"ense": 447,
"mble": 448,
"ring": 449,
"string": 450,
"ensemble": 451,
"ction": 452,
"section": 453,
")']%": 454,
"dis": 455,
"tor": 456,
"distor": 457,
"distortion": 458,
"19": 459,
"hh": 460,
"20": 461,
"22": 462,
"(+": 463,
"(+)']%": 464,
"]{-": 465,
"23": 466,
"sn": 467,
"25": 468,
"ic": 469,
"ato": 470,
"zz": 471,
"pizz": 472,
"icato": 473,
"pizzicato": 474,
"bd": 475,
"ct": 476,
"ect": 477,
"ric": 478,
"elect": 479,
"electric": 480,
"27": 481,
"26": 482,
"ir": 483,
"oir": 484,
"choir": 485,
"hs": 486,
"aa": 487,
"aahs": 488,
"er": 489,
"td": 490,
"28": 491,
"29": 492,
"mm": 493,
")))+(": 494,
"31": 495,
"oc": 496,
"fin": 497,
"ger": 498,
"finger": 499,
"gan": 500,
"organ": 501,
"33": 502,
"ock": 503,
"['(+)']%": 504,
"jo": 505,
"njo": 506,
"banjo": 507,
"pad": 508,
"34": 509,
"]']%": 510,
"rock": 511,
"37": 512,
"39": 513,
"41": 514,
"44": 515,
"rs": 516,
"84": 517,
"69": 518,
")()<|": 519,
"36": 520,
"war": 521,
"warm": 522,
"etle": 523,
"fretle": 524,
"fretless": 525,
"53": 526,
"['[": 527,
"mba": 528,
"45": 529,
"ari": 530,
"mari": 531,
"marimba": 532,
"47": 533,
"gpi": 534,
"pe": 535,
"bagpi": 536,
"bagpipe": 537,
"))<|": 538,
"48": 539,
"38": 540,
"][": 541,
"est": 542,
"celest": 543,
"celesta": 544,
"fi": 545,
"th": 546,
"cp": 547,
"168": 548,
"52": 549,
"fx": 550,
"ean": 551,
"clean": 552,
"49": 553,
"arina": 554,
"ocarina": 555,
"46": 556,
"oh": 557,
"60": 558,
"ci": 559,
"sci": 560,
"51": 561,
"acc": 562,
"di": 563,
"ordi": 564,
"accordi": 565,
"accordion": 566,
"50": 567,
"54": 568,
"nth": 569,
"sy": 570,
"synth": 571,
"55": 572,
"59": 573,
"lead": 574,
"ver": 575,
"onk": 576,
"rch": 577,
"urch": 578,
"church": 579,
"120": 580,
"dri": 581,
"over": 582,
"ven": 583,
"driven": 584,
"overdriven": 585,
"70": 586,
"}{-": 587,
"73": 588,
"bt": 589,
"61": 590,
"ne": 591,
"87": 592,
"80": 593,
"ap": 594,
"lap": 595,
"slap": 596,
"ag": 597,
"dd": 598,
"oic": 599,
"voic": 600,
"fidd": 601,
"new": 602,
"age": 603,
"voice": 604,
"fiddle": 605,
"83": 606,
"honk": 607,
"spi": 608,
"tonk": 609,
"enspi": 610,
"glock": 611,
"honky": 612,
"enspiel": 613,
"glockenspiel": 614,
"67": 615,
"oth": 616,
"wto": 617,
"sawto": 618,
"sawtooth": 619,
"81": 620,
"77": 621,
"mt": 622,
"57": 623,
"101": 624,
"109": 625,
"79": 626,
"112": 627,
"89": 628,
"62": 629,
"113": 630,
"140": 631,
"lt": 632,
"71": 633,
"103": 634,
"97": 635,
"blo": 636,
"tt": 637,
"wn": 638,
"bott": 639,
"blown": 640,
"bottle": 641,
"95": 642,
"115": 643,
"151": 644,
"85": 645,
"280": 646,
"137": 647,
"143": 648,
"240": 649,
"91": 650,
"125": 651,
"241": 652,
"121": 653,
"92": 654,
"116": 655,
"123": 656,
"187": 657,
"at": 658,
"her": 659,
"mo": 660,
"pher": 661,
"spher": 662,
"127": 663,
"131": 664,
"229": 665,
"atmo": 666,
"sphere": 667,
"atmosphere": 668,
"63": 669,
"82": 670,
"119": 671,
"173": 672,
"223": 673,
"179": 674,
"93": 675,
"129": 676,
"840": 677,
"105": 678,
"293": 679,
"ara": 680,
"ng": 681,
"brightne": 682,
"chara": 683,
"brightness": 684,
"charang": 685,
"117": 686,
"139": 687,
"169": 688,
")))<|": 689,
"167": 690,
"185": 691,
"133": 692,
"157": 693,
"181": 694,
"365": 695,
"mu": 696,
"ted": 697,
"muted": 698,
"210": 699,
"221": 700,
"76": 701,
"oo": 702,
"199": 703,
"oohs": 704,
"cy": 705,
"ever": 706,
"rever": 707,
"207": 708,
"289": 709,
"336": 710,
"mbal": 711,
"1094": 712,
"cymbal": 713,
"reverse": 714,
"197": 715,
"107": 716,
"147": 717,
"99": 718,
"75": 719,
"153": 720,
"257": 721,
"281": 722,
"389": 723,
"211": 724,
"529": 725,
"553": 726,
"68": 727,
"1087": 728,
"213": 729,
"191": 730,
"209": 731,
"271": 732,
"295": 733,
"467": 734,
"343": 735,
"719": 736,
"737": 737,
"843": 738,
"917": 739,
"247": 740,
"437": 741,
"409": 742,
"128": 743,
"359": 744,
"145": 745,
"205": 746,
"297": 747,
"371": 748,
"391": 749,
"697": 750,
"451": 751,
"671": 752,
"679": 753,
"319": 754,
"623": 755,
"669": 756,
"733": 757,
"767": 758,
"757": 759,
"811": 760,
"883": 761,
"913": 762,
"935": 763,
"243": 764,
"212": 765,
"217": 766,
"560": 767,
"569": 768,
"152": 769,
"155": 770,
"1521": 771,
"161": 772,
"163": 773,
"1627": 774,
"177": 775,
"186": 776,
"323": 777,
"327": 778,
"255": 779,
"263": 780,
"268": 781,
"334": 782,
"345": 783,
"392": 784,
"399": 785,
"695": 786,
"699": 787,
"475": 788,
"489": 789,
"491": 790,
"591": 791,
"705": 792,
"801": 793,
"3653": 794,
"9943": 795,
"149": 796,
"922": 797,
"159": 798,
"bl": 799,
"obl": 800,
"ins": 801,
"globl": 802,
"193": 803,
"377": 804,
"globlins": 805,
"58": 806,
"227": 807,
"463": 808,
"235": 809,
"311": 810,
"111": 811,
"141": 812,
"148": 813,
"1691": 814,
"170": 815,
"422": 816,
"188": 817,
"201": 818,
"269": 819,
"481": 820
},
"merges": [
") .",
"( -",
"_ _",
"m p",
"n o",
"p i",
"a no",
"pi ano",
") +",
"p p",
") (",
")+ (",
"I I",
"e d",
"' ]",
"[ '",
"'] %",
"q d",
"s d",
"f f",
"e n",
"r a",
"a u",
"f ra",
"g m",
"en t",
"au gm",
"fra c",
"augm ent",
"r i",
"V II",
"h t",
"b ri",
"g ht",
"bri ght",
"I V",
") ,",
"1 1",
"h d",
"1 0",
"V I",
"II I",
") ).",
") )+",
"ff f",
"2 4",
")( )+(",
"s u",
"o n",
"6 4",
"b a",
"s a",
"sa x",
"2 1",
") )+(",
"6 5",
"t e",
"pp p",
"w d",
"t r",
"4 3",
"o l",
"s s",
"e t",
"a r",
"o r",
"on e",
"ba ss",
"t i",
"h or",
"hor n",
"e l",
"i ol",
"v iol",
"m b",
"5 6",
"u mp",
"tr ump",
"trump et",
"g u",
"i t",
"gu it",
"guit ar",
"t o",
"o mb",
"tr omb",
"tromb one",
"4 0",
"i s",
"ri n",
"1 2",
"c h",
"no r",
"te nor",
"1 3",
"a l",
"al to",
"i n",
"1 5",
"o p",
"r ano",
"s op",
"sop rano",
"a n",
"g l",
"h ar",
"har p",
"c el",
"en gl",
"is h",
"engl ish",
"f r",
"t u",
"tu ba",
"b o",
"t one",
"ri tone",
"ba ritone",
"l o",
"o bo",
"obo e",
"c c",
"cel lo",
"c l",
"a rin",
"viol in",
"en ch",
"fr ench",
"pi cc",
"ol o",
"picc olo",
"o on",
"bass oon",
"cl arin",
"clarin et",
"d r",
"m s",
"u ms",
"dr ums",
"viol a",
"f l",
"u te",
"fl ute",
"c on",
"t ra",
"con tra",
"contra bass",
"mp an",
"ti mpan",
"timpan i",
"3 5",
") ),",
"s e",
"1 4",
"ti on",
"s te",
"ste el",
"1 6",
") <",
"d o",
"f te",
"x t",
"| >",
"en do",
")< |",
"fte xt",
"endo ftext",
"|> (",
"1 7",
"a c",
"o u",
"s ti",
"ac ou",
"sti c",
"acou stic",
"a d",
"{ -",
"l e",
"} ']%",
"3 0",
"4 2",
"1 8",
"s t",
"3 2",
"b ra",
"bra ss",
"ad d",
"en se",
"mb le",
"rin g",
"st ring",
"ense mble",
"c tion",
"se ction",
") ']%",
"d is",
"t or",
"dis tor",
"distor tion",
"1 9",
"h h",
"2 0",
"2 2",
"( +",
"(+ )']%",
"] {-",
"2 3",
"s n",
"2 5",
"i c",
"a to",
"z z",
"pi zz",
"ic ato",
"pizz icato",
"b d",
"c t",
"e ct",
"ri c",
"el ect",
"elect ric",
"2 7",
"2 6",
"i r",
"o ir",
"ch oir",
"h s",
"a a",
"aa hs",
"e r",
"t d",
"2 8",
"2 9",
"m m",
") ))+(",
"3 1",
"o c",
"f in",
"g er",
"fin ger",
"g an",
"or gan",
"3 3",
"oc k",
"[' (+)']%",
"j o",
"n jo",
"ba njo",
"p ad",
"3 4",
"] ']%",
"r ock",
"3 7",
"3 9",
"4 1",
"4 4",
"r s",
"8 4",
"6 9",
")( )<|",
"3 6",
"w ar",
"war m",
"et le",
"fr etle",
"fretle ss",
"5 3",
"[' [",
"m ba",
"4 5",
"a ri",
"m ari",
"mari mba",
"4 7",
"g pi",
"p e",
"ba gpi",
"bagpi pe",
") )<|",
"4 8",
"3 8",
"] [",
"e st",
"cel est",
"celest a",
"f i",
"t h",
"c p",
"16 8",
"5 2",
"f x",
"e an",
"cl ean",
"4 9",
"arin a",
"oc arina",
"4 6",
"o h",
"6 0",
"c i",
"s ci",
"5 1",
"a cc",
"d i",
"or di",
"acc ordi",
"accordi on",
"5 0",
"5 4",
"n th",
"s y",
"sy nth",
"5 5",
"5 9",
"le ad",
"v er",
"on k",
"r ch",
"u rch",
"ch urch",
"12 0",
"d ri",
"o ver",
"v en",
"dri ven",
"over driven",
"7 0",
"} {-",
"7 3",
"b t",
"6 1",
"n e",
"8 7",
"8 0",
"a p",
"l ap",
"s lap",
"a g",
"d d",
"o ic",
"v oic",
"fi dd",
"ne w",
"ag e",
"voic e",
"fidd le",
"8 3",
"h onk",
"s pi",
"t onk",
"en spi",
"gl ock",
"honk y",
"enspi el",
"glock enspiel",
"6 7",
"o th",
"w to",
"sa wto",
"sawto oth",
"8 1",
"7 7",
"m t",
"5 7",
"10 1",
"10 9",
"7 9",
"11 2",
"8 9",
"6 2",
"11 3",
"1 40",
"l t",
"7 1",
"10 3",
"9 7",
"b lo",
"t t",
"w n",
"bo tt",
"blo wn",
"bott le",
"9 5",
"11 5",
"15 1",
"8 5",
"28 0",
"13 7",
"1 43",
"24 0",
"9 1",
"12 5",
"24 1",
"1 21",
"9 2",
"11 6",
"12 3",
"18 7",
"a t",
"h er",
"m o",
"p her",
"s pher",
"12 7",
"13 1",
"22 9",
"at mo",
"spher e",
"atmo sphere",
"6 3",
"8 2",
"11 9",
"17 3",
"22 3",
"17 9",
"9 3",
"12 9",
"8 40",
"10 5",
"29 3",
"a ra",
"n g",
"bright ne",
"ch ara",
"brightne ss",
"chara ng",
"11 7",
"13 9",
"16 9",
") ))<|",
"16 7",
"18 5",
"13 3",
"15 7",
"18 1",
"3 65",
"m u",
"t ed",
"mu ted",
"2 10",
"2 21",
"7 6",
"o o",
"19 9",
"oo hs",
"c y",
"e ver",
"r ever",
"20 7",
"28 9",
"33 6",
"mba l",
"109 4",
"cy mbal",
"rever se",
"19 7",
"10 7",
"14 7",
"9 9",
"7 5",
"15 3",
"25 7",
"28 1",
"38 9",
"2 11",
"5 29",
"5 53",
"6 8",
"10 87",
"21 3",
"19 1",
"20 9",
"27 1",
"29 5",
"46 7",
"3 43",
"7 19",
"7 37",
"8 43",
"9 17",
"24 7",
"43 7",
"40 9",
"12 8",
"35 9",
"14 5",
"20 5",
"29 7",
"37 1",
"39 1",
"69 7",
"45 1",
"67 1",
"67 9",
"3 19",
"6 23",
"6 69",
"7 33",
"7 67",
"7 57",
"8 11",
"8 83",
"9 13",
"9 35",
"24 3",
"21 2",
"21 7",
"56 0",
"56 9",
"15 2",
"15 5",
"15 21",
"16 1",
"16 3",
"16 27",
"17 7",
"18 6",
"32 3",
"32 7",
"25 5",
"26 3",
"26 8",
"33 4",
"34 5",
"39 2",
"39 9",
"69 5",
"69 9",
"47 5",
"48 9",
"49 1",
"59 1",
"70 5",
"80 1",
"365 3",
"99 43",
"14 9",
"9 22",
"15 9",
"b l",
"o bl",
"in s",
"gl obl",
"19 3",
"37 7",
"globl ins",
"5 8",
"22 7",
"46 3",
"2 35",
"3 11",
"11 1",
"14 1",
"14 8",
"16 91",
"17 0",
"42 2",
"18 8",
"20 1",
"26 9",
"48 1"
]
}
}