yhavinga committed on
Commit
c78da21
·
1 Parent(s): 7d83c88

Add Llama tokenizer creation for Dutch, English, Code, Markdown and TeX.

Browse files
app.py CHANGED
@@ -8,7 +8,7 @@ from patcher.gr_interface import TabbedInterface
8
  demo = TabbedInterface(
9
  [tab_playground, tab_compression],
10
  [" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
11
- title='<div align="center">Tokenizer Arena ⚔️</div>',
12
  css="css/style.css"
13
  )
14
 
 
8
  demo = TabbedInterface(
9
  [tab_playground, tab_compression],
10
  [" ⚔️ Playground", "🏆 Compression Leaderboard",], # 编码速度,解码速度,字符分类(zh、num等,支持正则),支持的语言,机构,。
11
+ title='Tokenizer Arena ⚔️ (with some Dutch 🇳🇱🇧🇪🇸🇷 hacked in)',
12
  css="css/style.css"
13
  )
14
 
app_compression.py CHANGED
@@ -59,7 +59,7 @@ with gr.Blocks() as demo:
59
  with gr.Row():
60
  compress_rate_corpus = gr.Dropdown(
61
  common_corpuses, # , "code"
62
- value=["cc100-en", "cc100-zh-Hans"],
63
  label="corpus",
64
  multiselect=True
65
  # info=""
 
59
  with gr.Row():
60
  compress_rate_corpus = gr.Dropdown(
61
  common_corpuses, # , "code"
62
+ value=["cc100-nl", "cc100-en"],
63
  label="corpus",
64
  multiselect=True
65
  # info=""
config.py CHANGED
@@ -11,10 +11,22 @@ LAZY_IMPORT = True
11
  # DEBUG: 设置环境变量 RUST_BACKTRACE=full
12
  #
13
 
14
- default_user_input = """\
15
- Replace this text in the input field to see how tokenization works.
16
- Buenos días!
17
- 华为发布Mate60手机。
 
 
 
 
 
 
 
 
 
 
 
18
  ラグビーワールドカップ2023フランス"""
19
  default_tokenizer_type_1 = "llama3"
20
- default_tokenizer_type_2 = "gpt_4"
 
 
11
  # DEBUG: 设置环境变量 RUST_BACKTRACE=full
12
  #
13
 
14
+
15
+ default_user_input = """“We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
16
+ Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
17
+ def load_image_file(file, mode='RGB'):
18
+ im = PIL.Image.open(file)
19
+ if mode:
20
+ im = im.convert(mode)
21
+ return np.array(im)
22
+ \section{The expected number of intervening \mbox{H\,{\sc i}}
23
+ absorbers}\label{section:expected_number}
24
+ \begin{equation}\label{equation:expected_number}
25
+ \mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
26
+ \end{equation}
27
+ Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
28
+ 华为发布Mate60手机
29
  ラグビーワールドカップ2023フランス"""
30
  default_tokenizer_type_1 = "llama3"
31
+ # default_tokenizer_type_2 = "internlm_chat_7b"
32
+ default_tokenizer_type_2 = "mistral_7b"
stats/compress_rate.json CHANGED
@@ -4282,5 +4282,509 @@
4282
  "n_bytes": 2633047,
4283
  "n_tokens": 757405,
4284
  "n_chars": 927311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4285
  }
4286
  }
 
4282
  "n_bytes": 2633047,
4283
  "n_tokens": 757405,
4284
  "n_chars": 927311
4285
+ },
4286
+ "dutch_llama_tokenizer.cc100-en": {
4287
+ "vocab_size": 32000,
4288
+ "n_bytes": 1124813,
4289
+ "n_tokens": 291975,
4290
+ "n_chars": 1121360
4291
+ },
4292
+ "gronlp-gpt2-small-dutch.cc100-en": {
4293
+ "vocab_size": 40000,
4294
+ "n_bytes": 1124813,
4295
+ "n_tokens": 361710,
4296
+ "n_chars": 1121360
4297
+ },
4298
+ "yhavinga-gpt2-medium-dutch.cc100-en": {
4299
+ "vocab_size": 50257,
4300
+ "n_bytes": 1124813,
4301
+ "n_tokens": 361847,
4302
+ "n_chars": 1121360
4303
+ },
4304
+ "yhavinga-ul2-large-en-nl.cc100-en": {
4305
+ "vocab_size": 32128,
4306
+ "n_bytes": 1124813,
4307
+ "n_tokens": 297641,
4308
+ "n_chars": 1121360
4309
+ },
4310
+ "dutch_llama_tokenizer.cc100-zh-Hans": {
4311
+ "vocab_size": 32000,
4312
+ "n_bytes": 2633047,
4313
+ "n_tokens": 2621293,
4314
+ "n_chars": 927311
4315
+ },
4316
+ "gronlp-gpt2-small-dutch.cc100-zh-Hans": {
4317
+ "vocab_size": 40000,
4318
+ "n_bytes": 2633047,
4319
+ "n_tokens": 1350320,
4320
+ "n_chars": 927311
4321
+ },
4322
+ "yhavinga-gpt2-medium-dutch.cc100-zh-Hans": {
4323
+ "vocab_size": 50257,
4324
+ "n_bytes": 2633047,
4325
+ "n_tokens": 2600872,
4326
+ "n_chars": 927311
4327
+ },
4328
+ "yhavinga-ul2-large-en-nl.cc100-zh-Hans": {
4329
+ "vocab_size": 32128,
4330
+ "n_bytes": 2633047,
4331
+ "n_tokens": 2519719,
4332
+ "n_chars": 927311
4333
+ },
4334
+ "aya_101.cc100-nl": {
4335
+ "vocab_size": 250100,
4336
+ "n_bytes": 1513030,
4337
+ "n_tokens": 423616,
4338
+ "n_chars": 1508067
4339
+ },
4340
+ "baichuan.cc100-nl": {
4341
+ "vocab_size": 64000,
4342
+ "n_bytes": 1513030,
4343
+ "n_tokens": 574927,
4344
+ "n_chars": 1508067
4345
+ },
4346
+ "baichuan2.cc100-nl": {
4347
+ "vocab_size": 125696,
4348
+ "n_bytes": 1513030,
4349
+ "n_tokens": 540387,
4350
+ "n_chars": 1508067
4351
+ },
4352
+ "bert_base_cased.cc100-nl": {
4353
+ "vocab_size": 28996,
4354
+ "n_bytes": 1513030,
4355
+ "n_tokens": 630793,
4356
+ "n_chars": 1508067
4357
+ },
4358
+ "bert_base_chinese.cc100-nl": {
4359
+ "vocab_size": 21128,
4360
+ "n_bytes": 1513030,
4361
+ "n_tokens": 626052,
4362
+ "n_chars": 1508067
4363
+ },
4364
+ "bert_base_uncased.cc100-nl": {
4365
+ "vocab_size": 30522,
4366
+ "n_bytes": 1513030,
4367
+ "n_tokens": 574651,
4368
+ "n_chars": 1508067
4369
+ },
4370
+ "bloom.cc100-nl": {
4371
+ "vocab_size": 250680,
4372
+ "n_bytes": 1513030,
4373
+ "n_tokens": 488924,
4374
+ "n_chars": 1508067
4375
+ },
4376
+ "byt5_small.cc100-nl": {
4377
+ "vocab_size": 384,
4378
+ "n_bytes": 1513030,
4379
+ "n_tokens": 1523030,
4380
+ "n_chars": 1508067
4381
+ },
4382
+ "character_glm_6b.cc100-nl": {
4383
+ "vocab_size": 64789,
4384
+ "n_bytes": 1513030,
4385
+ "n_tokens": 559014,
4386
+ "n_chars": 1508067
4387
+ },
4388
+ "chatglm2_6b.cc100-nl": {
4389
+ "vocab_size": 64787,
4390
+ "n_bytes": 1513030,
4391
+ "n_tokens": 559017,
4392
+ "n_chars": 1508067
4393
+ },
4394
+ "chatglm3_6b.cc100-nl": {
4395
+ "vocab_size": 64796,
4396
+ "n_bytes": 1513030,
4397
+ "n_tokens": 559014,
4398
+ "n_chars": 1508067
4399
+ },
4400
+ "chatglm_6b.cc100-nl": {
4401
+ "vocab_size": 150344,
4402
+ "n_bytes": 1513030,
4403
+ "n_tokens": 533174,
4404
+ "n_chars": 1508067
4405
+ },
4406
+ "chatyuan_large_v2.cc100-nl": {
4407
+ "vocab_size": 32128,
4408
+ "n_bytes": 1513030,
4409
+ "n_tokens": 837963,
4410
+ "n_chars": 1508067
4411
+ },
4412
+ "chinese_llama.cc100-nl": {
4413
+ "vocab_size": 49953,
4414
+ "n_bytes": 1513030,
4415
+ "n_tokens": 488766,
4416
+ "n_chars": 1508067
4417
+ },
4418
+ "chinese_llama2.cc100-nl": {
4419
+ "vocab_size": 55296,
4420
+ "n_bytes": 1513030,
4421
+ "n_tokens": 495966,
4422
+ "n_chars": 1508067
4423
+ },
4424
+ "code_davinci_002.cc100-nl": {
4425
+ "vocab_size": 50281,
4426
+ "n_bytes": 1513030,
4427
+ "n_tokens": 559119,
4428
+ "n_chars": 1508067
4429
+ },
4430
+ "crystal_coder.cc100-nl": {
4431
+ "vocab_size": 32022,
4432
+ "n_bytes": 1513030,
4433
+ "n_tokens": 485966,
4434
+ "n_chars": 1508067
4435
+ },
4436
+ "dbrx_instruct.cc100-nl": {
4437
+ "vocab_size": 100280,
4438
+ "n_bytes": 1513030,
4439
+ "n_tokens": 449343,
4440
+ "n_chars": 1508067
4441
+ },
4442
+ "deepseek_coder_33b_instruct.cc100-nl": {
4443
+ "vocab_size": 32022,
4444
+ "n_bytes": 1513030,
4445
+ "n_tokens": 603966,
4446
+ "n_chars": 1508067
4447
+ },
4448
+ "deepseek_llm_7b_base.cc100-nl": {
4449
+ "vocab_size": 100015,
4450
+ "n_bytes": 1513030,
4451
+ "n_tokens": 536746,
4452
+ "n_chars": 1508067
4453
+ },
4454
+ "dutch_llama_tokenizer.cc100-nl": {
4455
+ "vocab_size": 32000,
4456
+ "n_bytes": 1513030,
4457
+ "n_tokens": 366481,
4458
+ "n_chars": 1508067
4459
+ },
4460
+ "falcon_180b.cc100-nl": {
4461
+ "vocab_size": 65024,
4462
+ "n_bytes": 1513030,
4463
+ "n_tokens": 438112,
4464
+ "n_chars": 1508067
4465
+ },
4466
+ "falcon_7b.cc100-nl": {
4467
+ "vocab_size": 65024,
4468
+ "n_bytes": 1513030,
4469
+ "n_tokens": 438112,
4470
+ "n_chars": 1508067
4471
+ },
4472
+ "fastchat_t5_3b.cc100-nl": {
4473
+ "vocab_size": 32110,
4474
+ "n_bytes": 1513030,
4475
+ "n_tokens": 933018,
4476
+ "n_chars": 1508067
4477
+ },
4478
+ "flan_t5_base.cc100-nl": {
4479
+ "vocab_size": 32100,
4480
+ "n_bytes": 1513030,
4481
+ "n_tokens": 696337,
4482
+ "n_chars": 1508067
4483
+ },
4484
+ "gemma_7b.cc100-nl": {
4485
+ "vocab_size": 256000,
4486
+ "n_bytes": 1513030,
4487
+ "n_tokens": 387522,
4488
+ "n_chars": 1508067
4489
+ },
4490
+ "gpt2.cc100-nl": {
4491
+ "vocab_size": 50257,
4492
+ "n_bytes": 1513030,
4493
+ "n_tokens": 559119,
4494
+ "n_chars": 1508067
4495
+ },
4496
+ "gpt2_chinese.cc100-nl": {
4497
+ "vocab_size": 21128,
4498
+ "n_bytes": 1513030,
4499
+ "n_tokens": 676651,
4500
+ "n_chars": 1508067
4501
+ },
4502
+ "gpt_35_turbo.cc100-nl": {
4503
+ "vocab_size": 100277,
4504
+ "n_bytes": 1513030,
4505
+ "n_tokens": 449343,
4506
+ "n_chars": 1508067
4507
+ },
4508
+ "gpt_4.cc100-nl": {
4509
+ "vocab_size": 100277,
4510
+ "n_bytes": 1513030,
4511
+ "n_tokens": 449343,
4512
+ "n_chars": 1508067
4513
+ },
4514
+ "gpt_neox_japanese_2_7b.cc100-nl": {
4515
+ "vocab_size": 32000,
4516
+ "n_bytes": 1513030,
4517
+ "n_tokens": 1509448,
4518
+ "n_chars": 1508067
4519
+ },
4520
+ "gpt_nexo_20b.cc100-nl": {
4521
+ "vocab_size": 50277,
4522
+ "n_bytes": 1513030,
4523
+ "n_tokens": 497728,
4524
+ "n_chars": 1508067
4525
+ },
4526
+ "grok_1.cc100-nl": {
4527
+ "vocab_size": 131072,
4528
+ "n_bytes": 1513030,
4529
+ "n_tokens": 457359,
4530
+ "n_chars": 1508067
4531
+ },
4532
+ "gronlp-gpt2-small-dutch.cc100-nl": {
4533
+ "vocab_size": 40000,
4534
+ "n_bytes": 1513030,
4535
+ "n_tokens": 332376,
4536
+ "n_chars": 1508067
4537
+ },
4538
+ "internlm2_chat_7b.cc100-nl": {
4539
+ "vocab_size": 92544,
4540
+ "n_bytes": 1513030,
4541
+ "n_tokens": 494821,
4542
+ "n_chars": 1508067
4543
+ },
4544
+ "internlm2_math_7b.cc100-nl": {
4545
+ "vocab_size": 92544,
4546
+ "n_bytes": 1513030,
4547
+ "n_tokens": 494821,
4548
+ "n_chars": 1508067
4549
+ },
4550
+ "internlm_chat_7b.cc100-nl": {
4551
+ "vocab_size": 103168,
4552
+ "n_bytes": 1513030,
4553
+ "n_tokens": 494108,
4554
+ "n_chars": 1508067
4555
+ },
4556
+ "internlm_xcomposer_7b.cc100-nl": {
4557
+ "vocab_size": 103168,
4558
+ "n_bytes": 1513030,
4559
+ "n_tokens": 494108,
4560
+ "n_chars": 1508067
4561
+ },
4562
+ "jamba_v0_1.cc100-nl": {
4563
+ "vocab_size": 65536,
4564
+ "n_bytes": 1513030,
4565
+ "n_tokens": 442176,
4566
+ "n_chars": 1508067
4567
+ },
4568
+ "kplug.cc100-nl": {
4569
+ "vocab_size": 10261,
4570
+ "n_bytes": 1513030,
4571
+ "n_tokens": 678131,
4572
+ "n_chars": 1508067
4573
+ },
4574
+ "llama.cc100-nl": {
4575
+ "vocab_size": 32000,
4576
+ "n_bytes": 1513030,
4577
+ "n_tokens": 495966,
4578
+ "n_chars": 1508067
4579
+ },
4580
+ "llama2.cc100-nl": {
4581
+ "vocab_size": 32001,
4582
+ "n_bytes": 1513030,
4583
+ "n_tokens": 495966,
4584
+ "n_chars": 1508067
4585
+ },
4586
+ "llama3.cc100-nl": {
4587
+ "vocab_size": 128256,
4588
+ "n_bytes": 1513030,
4589
+ "n_tokens": 448173,
4590
+ "n_chars": 1508067
4591
+ },
4592
+ "llama_3_chinese_8b.cc100-nl": {
4593
+ "vocab_size": 128256,
4594
+ "n_bytes": 1513030,
4595
+ "n_tokens": 458173,
4596
+ "n_chars": 1508067
4597
+ },
4598
+ "mistral_7b.cc100-nl": {
4599
+ "vocab_size": 32000,
4600
+ "n_bytes": 1513030,
4601
+ "n_tokens": 515884,
4602
+ "n_chars": 1508067
4603
+ },
4604
+ "mixtral_8_7b.cc100-nl": {
4605
+ "vocab_size": 32000,
4606
+ "n_bytes": 1513030,
4607
+ "n_tokens": 515884,
4608
+ "n_chars": 1508067
4609
+ },
4610
+ "mobilebert_uncased.cc100-nl": {
4611
+ "vocab_size": 30522,
4612
+ "n_bytes": 1513030,
4613
+ "n_tokens": 574651,
4614
+ "n_chars": 1508067
4615
+ },
4616
+ "moss.cc100-nl": {
4617
+ "vocab_size": 106072,
4618
+ "n_bytes": 1513030,
4619
+ "n_tokens": 557984,
4620
+ "n_chars": 1508067
4621
+ },
4622
+ "mt5_large.cc100-nl": {
4623
+ "vocab_size": 250100,
4624
+ "n_bytes": 1513030,
4625
+ "n_tokens": 423616,
4626
+ "n_chars": 1508067
4627
+ },
4628
+ "dutch_llama_tokenizer.cc100-es": {
4629
+ "vocab_size": 32000,
4630
+ "n_bytes": 1664455,
4631
+ "n_tokens": 610314,
4632
+ "n_chars": 1630297
4633
+ },
4634
+ "gronlp-gpt2-small-dutch.cc100-es": {
4635
+ "vocab_size": 40000,
4636
+ "n_bytes": 1664455,
4637
+ "n_tokens": 608465,
4638
+ "n_chars": 1630297
4639
+ },
4640
+ "yhavinga-gpt2-medium-dutch.cc100-es": {
4641
+ "vocab_size": 50257,
4642
+ "n_bytes": 1664455,
4643
+ "n_tokens": 605886,
4644
+ "n_chars": 1630297
4645
+ },
4646
+ "yhavinga-ul2-large-en-nl.cc100-es": {
4647
+ "vocab_size": 32128,
4648
+ "n_bytes": 1664455,
4649
+ "n_tokens": 686255,
4650
+ "n_chars": 1630297
4651
+ },
4652
+ "olmo_7b.cc100-nl": {
4653
+ "vocab_size": 50280,
4654
+ "n_bytes": 1513030,
4655
+ "n_tokens": 497728,
4656
+ "n_chars": 1508067
4657
+ },
4658
+ "orion_14b_chat.cc100-nl": {
4659
+ "vocab_size": 84608,
4660
+ "n_bytes": 1513030,
4661
+ "n_tokens": 599429,
4662
+ "n_chars": 1508067
4663
+ },
4664
+ "phi_1.cc100-nl": {
4665
+ "vocab_size": 50295,
4666
+ "n_bytes": 1513030,
4667
+ "n_tokens": 559124,
4668
+ "n_chars": 1508067
4669
+ },
4670
+ "phi_2.cc100-nl": {
4671
+ "vocab_size": 50295,
4672
+ "n_bytes": 1513030,
4673
+ "n_tokens": 559124,
4674
+ "n_chars": 1508067
4675
+ },
4676
+ "phi_3_mini.cc100-nl": {
4677
+ "vocab_size": 32011,
4678
+ "n_bytes": 1513030,
4679
+ "n_tokens": 495966,
4680
+ "n_chars": 1508067
4681
+ },
4682
+ "pko_t5_large.cc100-nl": {
4683
+ "vocab_size": 50358,
4684
+ "n_bytes": 1513030,
4685
+ "n_tokens": 1017288,
4686
+ "n_chars": 1508067
4687
+ },
4688
+ "prompt_clue.cc100-nl": {
4689
+ "vocab_size": 32128,
4690
+ "n_bytes": 1513030,
4691
+ "n_tokens": 837963,
4692
+ "n_chars": 1508067
4693
+ },
4694
+ "qwen1_5_14b_chat.cc100-nl": {
4695
+ "vocab_size": 151646,
4696
+ "n_bytes": 1513030,
4697
+ "n_tokens": 453342,
4698
+ "n_chars": 1508067
4699
+ },
4700
+ "qwen_1_8b_chat.cc100-nl": {
4701
+ "vocab_size": 151851,
4702
+ "n_bytes": 1513030,
4703
+ "n_tokens": 453342,
4704
+ "n_chars": 1508067
4705
+ },
4706
+ "qwen_72b_chat.cc100-nl": {
4707
+ "vocab_size": 151851,
4708
+ "n_bytes": 1513030,
4709
+ "n_tokens": 453342,
4710
+ "n_chars": 1508067
4711
+ },
4712
+ "qwen_7b_chat.cc100-nl": {
4713
+ "vocab_size": 151851,
4714
+ "n_bytes": 1513030,
4715
+ "n_tokens": 453342,
4716
+ "n_chars": 1508067
4717
+ },
4718
+ "roberta_chinese_clue.cc100-nl": {
4719
+ "vocab_size": 8021,
4720
+ "n_bytes": 1513030,
4721
+ "n_tokens": 821246,
4722
+ "n_chars": 1508067
4723
+ },
4724
+ "skywork_13b_base.cc100-nl": {
4725
+ "vocab_size": 65519,
4726
+ "n_bytes": 1513030,
4727
+ "n_tokens": 495958,
4728
+ "n_chars": 1508067
4729
+ },
4730
+ "skywork_13b_math.cc100-nl": {
4731
+ "vocab_size": 65519,
4732
+ "n_bytes": 1513030,
4733
+ "n_tokens": 495958,
4734
+ "n_chars": 1508067
4735
+ },
4736
+ "solar_10_7b.cc100-nl": {
4737
+ "vocab_size": 32000,
4738
+ "n_bytes": 1513030,
4739
+ "n_tokens": 515884,
4740
+ "n_chars": 1508067
4741
+ },
4742
+ "starchat_alpha.cc100-nl": {
4743
+ "vocab_size": 49156,
4744
+ "n_bytes": 1513030,
4745
+ "n_tokens": 532871,
4746
+ "n_chars": 1508067
4747
+ },
4748
+ "switch_c_2048.cc100-nl": {
4749
+ "vocab_size": 32100,
4750
+ "n_bytes": 1513030,
4751
+ "n_tokens": 696333,
4752
+ "n_chars": 1508067
4753
+ },
4754
+ "t5_base.cc100-nl": {
4755
+ "vocab_size": 32100,
4756
+ "n_bytes": 1513030,
4757
+ "n_tokens": 696333,
4758
+ "n_chars": 1508067
4759
+ },
4760
+ "t5_large.cc100-nl": {
4761
+ "vocab_size": 32100,
4762
+ "n_bytes": 1513030,
4763
+ "n_tokens": 696333,
4764
+ "n_chars": 1508067
4765
+ },
4766
+ "t5_small.cc100-nl": {
4767
+ "vocab_size": 32100,
4768
+ "n_bytes": 1513030,
4769
+ "n_tokens": 696333,
4770
+ "n_chars": 1508067
4771
+ },
4772
+ "text_davinci_003.cc100-nl": {
4773
+ "vocab_size": 50281,
4774
+ "n_bytes": 1513030,
4775
+ "n_tokens": 559119,
4776
+ "n_chars": 1508067
4777
+ },
4778
+ "tigerbot_13b_chat_v2.cc100-nl": {
4779
+ "vocab_size": 60515,
4780
+ "n_bytes": 1513030,
4781
+ "n_tokens": 486271,
4782
+ "n_chars": 1508067
4783
+ },
4784
+ "tigerbot_70b_chat_v4_4k.cc100-nl": {
4785
+ "vocab_size": 65110,
4786
+ "n_bytes": 1513030,
4787
+ "n_tokens": 486472,
4788
+ "n_chars": 1508067
4789
  }
4790
  }
utils/compression_util.py CHANGED
@@ -20,7 +20,7 @@ from typing import List, Optional, Union, Literal
20
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
21
 
22
  common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
23
- common_corpuses = sorted(["cc100-en", "cc100-zh-Hans", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
24
  "cc100-fa", "cc100-ar", "cc100-ja"])
25
 
26
  VALID_CODES_CC100 = [
@@ -155,7 +155,7 @@ def tokenize_corpus(
155
 
156
 
157
  def get_compression_leaderboard(
158
- corpuses: List[str] = ['cc100-en'],
159
  unit: str = "b_tokens/g_bytes",
160
  tokenizer_filter: Optional[str] = None,
161
  return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
 
20
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
21
 
22
  common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
23
+ common_corpuses = sorted(["cc100-nl", "cc100-en", "cc100-es", "cc100-fr", "cc100-de", "cc100-ko",
24
  "cc100-fa", "cc100-ar", "cc100-ja"])
25
 
26
  VALID_CODES_CC100 = [
 
155
 
156
 
157
  def get_compression_leaderboard(
158
+ corpuses: List[str] = ['cc100-nl'],
159
  unit: str = "b_tokens/g_bytes",
160
  tokenizer_filter: Optional[str] = None,
161
  return_type: Optional[Literal["dict", "dataframe"]] = "dataframe"
vocab/wizardcoder_15b_v1/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
- from transformers import AutoTokenizer
3
-
4
- tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
 
1
+ #
2
+ # from transformers import AutoTokenizer
3
+ #
4
+ # tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)