xu-song committed on
Commit
baf4d1e
·
1 Parent(s): f1b4ae2

launch with queue

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. compression_util.py +3 -5
  3. vocab.py +6 -4
app.py CHANGED
@@ -21,4 +21,4 @@ demo = TabbedInterface(
21
  demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
22
 
23
  if __name__ == "__main__":
24
- demo.launch()
 
21
  demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
22
 
23
  if __name__ == "__main__":
24
+ demo.queue(max_size=1024, default_concurrency_limit=80).launch()
compression_util.py CHANGED
@@ -133,15 +133,14 @@ cache = {}
133
 
134
 
135
  def tokenize_corpus(
136
- tokenizer_name: str, # 可以免加载tokenizer直接出结果
137
  corpuses: List[str],
138
  cache_dir: str = "stats"
139
  ) -> dict:
140
  """
141
- 这个要独立的cache,因为速度慢。
142
- :param tokenizer_config: 可以不加载就
143
  :param corpuses:
144
- :param cache_path:
145
  :return:
146
  """
147
 
@@ -157,7 +156,6 @@ def tokenize_corpus(
157
 
158
  def _tokenize(tokenizer, datasets, detail_path=None):
159
  """
160
- export_diff: true | false
161
  :param tokenizer:
162
  :param datasets:
163
  :param detail_path:
 
133
 
134
 
135
  def tokenize_corpus(
136
+ tokenizer_name: str,
137
  corpuses: List[str],
138
  cache_dir: str = "stats"
139
  ) -> dict:
140
  """
141
+ :param tokenizer_name:
 
142
  :param corpuses:
143
+ :param cache_dir:
144
  :return:
145
  """
146
 
 
156
 
157
  def _tokenize(tokenizer, datasets, detail_path=None):
158
  """
 
159
  :param tokenizer:
160
  :param datasets:
161
  :param detail_path:
vocab.py CHANGED
@@ -182,8 +182,8 @@ class TokenizerConfig:
182
  return hash(self.name_or_path)
183
 
184
 
185
- # format: , description, hf_path, tokenizer_class/type, comments, Organization
186
  # TODO: append link and description to the end of dropdown button.
 
187
  _all_tokenizer_config = [
188
  ##### bert 系列
189
  TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
@@ -229,7 +229,9 @@ _all_tokenizer_config = [
229
  TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"), # 5万
230
  TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"), # GPTNeoXTokenizer
231
  TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
232
- TokenizerConfig("Qwen/Qwen1.5-14B-Chat", name_display="Qwen/Qwen1.5", impl=TokenizerImpl.SentencePiece, org="Alibaba"), # 15万,速度有点慢
 
 
233
  TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
234
 
235
  ####### google/sentencepiece tokenizer:
@@ -385,7 +387,7 @@ class TokenizerFactory:
385
 
386
  def get_tokenizer(self, tokenizer_name: str):
387
  """
388
- :param tokenizer_config:
389
  :return:
390
  """
391
  tokenizer_config = self.get_tokenizer_config(tokenizer_name)
@@ -407,7 +409,7 @@ class TokenizerFactory:
407
  self.tokenizer_cache[tokenizer_config] = tokenizer
408
  return tokenizer
409
 
410
- def get_name_with_hyperlink(self, tokenizer_name):
411
  def model_hyperlink(link, model_name):
412
  model_name = model_name
413
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
182
  return hash(self.name_or_path)
183
 
184
 
 
185
  # TODO: append link and description to the end of dropdown button.
186
+ # Add tokenizer_class/type, comments
187
  _all_tokenizer_config = [
188
  ##### bert 系列
189
  TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
 
229
  TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"), # 5万
230
  TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"), # GPTNeoXTokenizer
231
  TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
232
+ TokenizerConfig("Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"), # 15万,速度有点慢
233
+ TokenizerConfig("Qwen/Qwen1.5-110B ", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
234
+ TokenizerConfig("Qwen/Qwen1.5-1.8B ", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
235
  TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
236
 
237
  ####### google/sentencepiece tokenizer:
 
387
 
388
  def get_tokenizer(self, tokenizer_name: str):
389
  """
390
+ :param tokenizer_name:
391
  :return:
392
  """
393
  tokenizer_config = self.get_tokenizer_config(tokenizer_name)
 
409
  self.tokenizer_cache[tokenizer_config] = tokenizer
410
  return tokenizer
411
 
412
+ def get_name_with_hyperlink(self, tokenizer_name: str):
413
  def model_hyperlink(link, model_name):
414
  model_name = model_name
415
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'