yhavinga committed on
Commit
55df72d
·
1 Parent(s): f331792

Hack some Dutch tokenizers into it

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Tokenizer Arena
3
  emoji: ⚡
4
  colorFrom: red
5
  colorTo: gray
 
1
  ---
2
+ title: Dutch Tokenizer Arena
3
  emoji: ⚡
4
  colorFrom: red
5
  colorTo: gray
util.py CHANGED
@@ -125,6 +125,29 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
125
  return overlap_token_size, overlap_token_size
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  def on_load(url_params, request: gr.Request):
130
  """
 
125
  return overlap_token_size, overlap_token_size
126
 
127
 
128
+ default_user_input = """
129
+ “We apologize for any inconvenience and concern this may have caused to our customers and all concerned. We pray for the rest of the souls of those who lost their lives aboard the Japanese Coast Guard's equipment and extend our condolences to the bereaved families,” he said.
130
+
131
+ Steenvliegen of oevervliegen[2] (Plecoptera) zijn een kleine orde van gevleugelde insecten. Steenvliegen zijn te herkennen aan hun slanke, langwerpige lichaamsvorm en de doorzichtige vleugels die in rust plat op de rug worden gehouden.
132
+
133
+ def load_image_file(file, mode='RGB'):
134
+ im = PIL.Image.open(file)
135
+ if mode:
136
+ im = im.convert(mode)
137
+ return np.array(im)
138
+
139
+ \section{The expected number of intervening \mbox{H\,{\sc i}}
140
+ absorbers}\label{section:expected_number}
141
+ \begin{equation}\label{equation:expected_number}
142
+ \mu = \iint{f(N_{\rm HI},X)\,\mathrm{d}X\,\mathrm{d}N_{\rm HI}},
143
+ \end{equation}
144
+
145
+ Eerder noemde De Meij Oud en Nieuw "een soort oorlogsgebied". En hij heeft dan ook geen zin in de nieuwjaarsnacht. "Als je weet dat er collega's gewond gaan raken, kan je niet meer zeggen: het is mooi politiewerk en we gaan naar een spannende nacht. Het zijn gewoon risico's die je niet wil lopen."
146
+ 华为发布Mate60手机
147
+ ラグビーワールドカップ2023フランス"""
148
+ default_tokenizer_type_1 = "dutch_llama_tokenizer"
149
+ # default_tokenizer_type_2 = "internlm_chat_7b"
150
+ default_tokenizer_type_2 = "mistral_7b"
151
 
152
  def on_load(url_params, request: gr.Request):
153
  """
vocab/__init__.py CHANGED
@@ -94,6 +94,11 @@ all_tokenizers = [
94
  ("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
95
  ("starchat_alpha", "", "GPT2Tokenizer",),
96
 
 
 
 
 
 
97
  ####### google/sentencepiece tokenizer:
98
  # T5 llama internlm
99
  ("t5_small", "", "sentencepiece"),
 
94
  ("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
95
  ("starchat_alpha", "", "GPT2Tokenizer",),
96
 
97
+ ("gronlp-gpt2-small-dutch", "", "GPT2Tokenizer",),
98
+ ("yhavinga-gpt2-medium-dutch", "", "GPT2Tokenizer",),
99
+ ("dutch_llama_tokenizer", ),
100
+ ("yhavinga-ul2-large-en-nl", "", "sentencepiece"),
101
+
102
  ####### google/sentencepiece tokenizer:
103
  # T5 llama internlm
104
  ("t5_small", "", "sentencepiece"),
vocab/dutch_llama_tokenizer/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("yhavinga/dutch-llama-tokenizer", trust_remote_code=True)
vocab/gronlp-gpt2-small-dutch/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-dutch", trust_remote_code=True)
vocab/yhavinga-gpt2-medium-dutch/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("yhavinga/gpt2-medium-dutch", trust_remote_code=True)
vocab/yhavinga-ul2-large-en-nl/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ from transformers import AutoTokenizer
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("yhavinga/ul2-large-en-nl", trust_remote_code=True, use_fast=False)