File size: 1,480 Bytes
9b87e96 c53145d 9b87e96 c53145d 9b87e96 c53145d 9b87e96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import gradio as gr
import tiktoken
from tokenizer import CustomTokenizer
# Initialize tokenizers once at module load so the Gradio handler can reuse them.
# NOTE(review): assumes "bpe_tok.model" is shipped alongside this script — confirm path.
custom_tokenizer = CustomTokenizer("bpe_tok.model")
# tiktoken encoder matching GPT-4's vocabulary (cl100k_base); may download on first use.
tiktoken_encoder = tiktoken.encoding_for_model("gpt-4")
def encode_text(text):
    """Encode *text* with both tokenizers and return one summary string per tokenizer.

    Returns a pair of strings (custom tokenizer first, tiktoken second), each
    reporting the token count and the raw token id list.
    """
    summaries = []
    # Both encoders share the same .encode(...) signature, so run them in one loop;
    # "<|endoftext|>" is allowed through as a literal special token.
    for encoder in (custom_tokenizer, tiktoken_encoder):
        token_ids = encoder.encode(text, allowed_special={"<|endoftext|>"})
        summaries.append(f"Token count: {len(token_ids)}\nTokens: {token_ids}")
    return summaries[0], summaries[1]
# Build the Gradio comparison UI: one text input, two token-summary outputs.
_text_input = gr.Textbox(lines=5, label="Enter text to tokenize")
_token_outputs = [
    gr.Textbox(label="Custom Tokenizer Output", lines=4),
    gr.Textbox(label="Tiktoken Output", lines=4),
]
# Mixed-script examples (Hindi/Kannada/English) plus one with the special token.
_examples = [
    ["आज तो बहुत थक गया हूँ, ಸ್ವಲ್ಪ विश्रಾಂತಿ ಬೇಕು।"],
    ["मौसम कितना अच्छा है! ನೀವೂ ಹೊರಗೆ ಬನ್ನಿ, let's enjoy together."],
    ["My name is Jeff Bezos, and I'm the owner of Amazon.<|endoftext|>"],
]
iface = gr.Interface(
    fn=encode_text,
    inputs=_text_input,
    outputs=_token_outputs,
    title="Tokenizer Comparison",
    description="Compare custom BPE tokenizer with Tiktoken GPT-4 tokenizer",
    examples=_examples,
)
# Fix: the original line ended with a stray " |" (blame-view extraction residue),
# which is a SyntaxError; removed so the script runs.
if __name__ == "__main__":
    # Start the Gradio web server only when executed as a script, not on import.
    iface.launch()