import json

import gradio as gr
import pandas as pd
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def process_text(text, include_special_tokens=False, show_attention_mask=False):
    # Full encoding (with special tokens) supplies the attention mask.
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)

    # encode() includes the special tokens (<s>, </s>); derive the token
    # strings from the IDs so the two lists stay aligned.
    token_ids = tokenizer.encode(text)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    if not include_special_tokens:
        tokens = tokens[1:-1]
        token_ids = token_ids[1:-1]

    token_info = []
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        if show_attention_mask:
            # If the leading <s> was stripped, shift by one so the mask
            # index still matches the token's position in `encoding`.
            mask_index = i if include_special_tokens else i + 1
            info["Attention Mask"] = int(encoding["attention_mask"][0][mask_index])
        token_info.append(info)

    df = pd.DataFrame(token_info)

    # Guard against division by zero on empty input.
    ratio = len(tokens) / len(text) if text else 0.0
    stats = f"""
Number of tokens: {len(tokens)}
Input text length: {len(text)}
Tokens/character ratio: {ratio:.2f}
Vocabulary size: {tokenizer.vocab_size}
"""

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
    )

    return df, stats, json_output


iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="RoBERTa Tokenizer Playground",
    description="An interactive demonstration of the RoBERTa tokenizer.",
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)
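
# A minimal sanity check of process_text outside the Gradio UI (a sketch;
# the expected tokens below are an assumption based on RoBERTa's byte-level
# BPE, which marks word-initial tokens with a leading "Ġ"):
#
#     df, stats, json_output = process_text(
#         "Hello world", include_special_tokens=True, show_attention_mask=True
#     )
#     print(df)        # Token / ID / Attention Mask columns,
#                      # roughly: <s>, Hello, Ġworld, </s>
#     print(stats)     # token count, text length, ratio, vocab size
#     print(json_output)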