Jordan Legg
Unified the approach so that it no longer relies on HF models and takes only input text.
b39e76c
raw
history blame
992 Bytes
import gradio as gr
from transformers import T5TokenizerFast, CLIPTokenizer
# Building the tokenizers with from_pretrained does not depend on the text
# being counted, so the pair is created once and reused by every call.
_TOKENIZER_CACHE = {}


def _get_tokenizers():
    """Return the (T5, CLIP) tokenizer pair, loading it on first use."""
    pair = _TOKENIZER_CACHE.get("pair")
    if pair is None:
        pair = (
            T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False),
            CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32"),
        )
        # Store both tokenizers under a single key so a reader never observes
        # a half-filled cache.
        _TOKENIZER_CACHE["pair"] = pair
    return pair


def count_tokens(text):
    """Count the tokens in *text* with the T5 and CLIP tokenizers.

    Args:
        text: The string to tokenize. ``None`` is treated as an empty string.

    Returns:
        A pair of display strings, ``("T5: <n> tokens", "CLIP: <n> tokens")``,
        where each ``<n>`` is the length of that tokenizer's ``encode()``
        output called with its default arguments.
    """
    if text is None:
        text = ""

    t5_tokenizer, clip_tokenizer = _get_tokenizers()

    # NOTE(review): encode() is called with defaults, so the counts presumably
    # include any special tokens the tokenizer adds -- confirm this is the
    # number users expect to see.
    t5_count = len(t5_tokenizer.encode(text))
    clip_count = len(clip_tokenizer.encode(text))
    return f"T5: {t5_count} tokens", f"CLIP: {clip_count} tokens"
# Build the UI: one text input feeding count_tokens, and one output box per
# tokenizer (in the same order as the function's return values).
text_input = gr.Textbox(label="Text", placeholder="Enter text here...")
t5_output = gr.Textbox(label="T5 Tokenizer")
clip_output = gr.Textbox(label="CLIP Tokenizer")

iface = gr.Interface(
    fn=count_tokens,
    inputs=[text_input],
    outputs=[t5_output, clip_output],
    title="Common Diffusion Model Token Counter",
    description="Enter text to count tokens using T5 and CLIP tokenizers, commonly used in diffusion models.",
)

# Start serving the interface.
iface.launch()