tamang0000 commited on
Commit
80caa24
1 Parent(s): eec601a

Added Assamese tokenizer

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -15,7 +15,7 @@ def load_test_phrases(filename):
15
 
16
  models = ["Xenova/claude-tokenizer", # Anthropic
17
  "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
18
- "beomi/llama-2-ko-7b", # LLAMA-2-ko
19
  "ai4bharat/Airavata", # AIRAVATA
20
  "openaccess-ai-collective/tiny-mistral", # Mistral
21
  "gpt-3.5-turbo", # GPT3.5
@@ -23,7 +23,9 @@ models = ["Xenova/claude-tokenizer", # Anthropic
23
  "CohereForAI/aya-23-8B", # AYA
24
  "google/gemma-1.1-2b-it", # GEMMA
25
  "gpt-4o", # GPT4o
26
- "TWO/sutra-mlt256-v2"] # SUTRA
 
 
27
 
28
  test_phrase_set = [
29
  "I am going for a walk later today",
@@ -111,8 +113,9 @@ def generate_split_token_table(text):
111
  with gr.Blocks() as sutra_token_count:
112
  gr.Markdown(
113
  """
114
- # SUTRA Multilingual Tokenizer Specs & Stats.
115
  ## Tokenize paragraphs in multiple languages and compare token counts.
 
116
  """)
117
  textbox = gr.Textbox(label="Input Text")
118
  submit_button = gr.Button("Submit")
@@ -140,9 +143,10 @@ def generate_tokens_table(text):
140
  with gr.Blocks() as sutra_tokenize:
141
  gr.Markdown(
142
  """
143
- # SUTRA Multilingual Tokenizer Sentence Inspector.
144
  ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
145
- """)
 
146
  textbox = gr.Textbox(label="Input Text")
147
  submit_button = gr.Button("Submit")
148
  output = gr.Dataframe()
@@ -156,7 +160,7 @@ if __name__ == '__main__':
156
  with gr.Row():
157
  gr.Markdown(
158
  """
159
- ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
160
  """
161
  )
162
  with gr.Row():
 
15
 
16
  models = ["Xenova/claude-tokenizer", # Anthropic
17
  "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
18
+ # "beomi/llama-2-ko-7b", # LLAMA-2-ko
19
  "ai4bharat/Airavata", # AIRAVATA
20
  "openaccess-ai-collective/tiny-mistral", # Mistral
21
  "gpt-3.5-turbo", # GPT3.5
 
23
  "CohereForAI/aya-23-8B", # AYA
24
  "google/gemma-1.1-2b-it", # GEMMA
25
  "gpt-4o", # GPT4o
26
+ "TWO/sutra-mlt256-v2", # SUTRA
27
+ "tamang0000/assamese-tokenizer-50k" # Assamese
28
+ ]
29
 
30
  test_phrase_set = [
31
  "I am going for a walk later today",
 
113
  with gr.Blocks() as sutra_token_count:
114
  gr.Markdown(
115
  """
116
+ # Multilingual Tokenizer Specs & Stats.
117
  ## Tokenize paragraphs in multiple languages and compare token counts.
118
+ Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
119
  """)
120
  textbox = gr.Textbox(label="Input Text")
121
  submit_button = gr.Button("Submit")
 
143
  with gr.Blocks() as sutra_tokenize:
144
  gr.Markdown(
145
  """
146
+ # Multilingual Tokenizer Sentence Inspector.
147
  ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
148
+ Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
149
+ """)
150
  textbox = gr.Textbox(label="Input Text")
151
  submit_button = gr.Button("Submit")
152
  output = gr.Dataframe()
 
160
  with gr.Row():
161
  gr.Markdown(
162
  """
163
+ ## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="30"/>
164
  """
165
  )
166
  with gr.Row():