tamang0000 commited on
Commit
80caa24
1 Parent(s): eec601a

Added Assamese tokenizer

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -15,7 +15,7 @@ def load_test_phrases(filename):
15
 
16
  models = ["Xenova/claude-tokenizer", # Anthropic
17
  "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
18
- "beomi/llama-2-ko-7b", # LLAMA-2-ko
19
  "ai4bharat/Airavata", # AIRAVATA
20
  "openaccess-ai-collective/tiny-mistral", # Mistral
21
  "gpt-3.5-turbo", # GPT3.5
@@ -23,7 +23,9 @@ models = ["Xenova/claude-tokenizer", # Anthropic
23
  "CohereForAI/aya-23-8B", # AYA
24
  "google/gemma-1.1-2b-it", # GEMMA
25
  "gpt-4o", # GPT4o
26
- "TWO/sutra-mlt256-v2"] # SUTRA
 
 
27
 
28
  test_phrase_set = [
29
  "I am going for a walk later today",
@@ -111,8 +113,9 @@ def generate_split_token_table(text):
111
  with gr.Blocks() as sutra_token_count:
112
  gr.Markdown(
113
  """
114
- # SUTRA Multilingual Tokenizer Specs & Stats.
115
  ## Tokenize paragraphs in multiple languages and compare token counts.
 
116
  """)
117
  textbox = gr.Textbox(label="Input Text")
118
  submit_button = gr.Button("Submit")
@@ -140,9 +143,10 @@ def generate_tokens_table(text):
140
  with gr.Blocks() as sutra_tokenize:
141
  gr.Markdown(
142
  """
143
- # SUTRA Multilingual Tokenizer Sentence Inspector.
144
  ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
145
- """)
 
146
  textbox = gr.Textbox(label="Input Text")
147
  submit_button = gr.Button("Submit")
148
  output = gr.Dataframe()
@@ -156,7 +160,7 @@ if __name__ == '__main__':
156
  with gr.Row():
157
  gr.Markdown(
158
  """
159
- ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
160
  """
161
  )
162
  with gr.Row():
 
15
 
16
  models = ["Xenova/claude-tokenizer", # Anthropic
17
  "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
18
+ # "beomi/llama-2-ko-7b", # LLAMA-2-ko
19
  "ai4bharat/Airavata", # AIRAVATA
20
  "openaccess-ai-collective/tiny-mistral", # Mistral
21
  "gpt-3.5-turbo", # GPT3.5
 
23
  "CohereForAI/aya-23-8B", # AYA
24
  "google/gemma-1.1-2b-it", # GEMMA
25
  "gpt-4o", # GPT4o
26
+ "TWO/sutra-mlt256-v2", # SUTRA
27
+ "tamang0000/assamese-tokenizer-50k" # Assamese
28
+ ]
29
 
30
  test_phrase_set = [
31
  "I am going for a walk later today",
 
113
  with gr.Blocks() as sutra_token_count:
114
  gr.Markdown(
115
  """
116
+ # Multilingual Tokenizer Specs & Stats.
117
  ## Tokenize paragraphs in multiple languages and compare token counts.
118
+ Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
119
  """)
120
  textbox = gr.Textbox(label="Input Text")
121
  submit_button = gr.Button("Submit")
 
143
  with gr.Blocks() as sutra_tokenize:
144
  gr.Markdown(
145
  """
146
+ # Multilingual Tokenizer Sentence Inspector.
147
  ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
148
+ Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
149
+ """)
150
  textbox = gr.Textbox(label="Input Text")
151
  submit_button = gr.Button("Submit")
152
  output = gr.Dataframe()
 
160
  with gr.Row():
161
  gr.Markdown(
162
  """
163
+ ## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="30"/>
164
  """
165
  )
166
  with gr.Row():