from transformers import AutoTokenizer
from flask import Flask, request, render_template_string, jsonify
import hashlib
import random
import math
import json
import io
import os

app = Flask(__name__)

# Set maximum content length to 100MB to handle larger files
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024

# Create upload folder if it doesn't exist
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Predefined tokenizer models with aliases
TOKENIZER_MODELS = {
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-24B-Instruct-2501',
        'alias': 'Mistral Small 3'
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B'
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1'
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        'alias': 'QWQ 32B'
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B'
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B'
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased'
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2'
    }
}

# Initialize tokenizers dict
tokenizers = {}

def load_tokenizer(model_id):
    """Load tokenizer if not already loaded."""
    if model_id not in tokenizers:
        tokenizers[model_id] = AutoTokenizer.from_pretrained(TOKENIZER_MODELS[model_id]['name'])
    return tokenizers[model_id]

def get_varied_color(token: str) -> dict:
    """Generate vibrant colors with HSL for better visual distinction."""
    token_hash = hashlib.md5(token.encode()).hexdigest()
    hue = int(token_hash[:3], 16) % 360
    saturation = 70 + (int(token_hash[3:5], 16) % 20)
    lightness = 80 + (int(token_hash[5:7], 16) % 10)
    text_lightness = 20 if lightness > 50 else 90
    return {
        'background': f'hsl({hue}, {saturation}%, {lightness}%)',
        'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
    }

def fix_token(token: str) -> str:
    """Fix token for display with improved space visualization."""
    if token.startswith('Ġ'):
        space_count = token.count('Ġ')
        return '·' * space_count + token[space_count:]
    return token

def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate enhanced statistics about the tokens."""
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    avg_length = sum(len(t) for t in tokens) / total_tokens
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

    # Length distribution
    lengths = [len(t) for t in tokens]
    mean_length = sum(lengths) / len(lengths)
    variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
    std_dev = math.sqrt(variance)

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': sorted(lengths)[len(lengths) // 2]
        }
    }
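# --- Illustrative sketch (an assumption for documentation, not part of the app flow) ---
# A minimal way to eyeball the helpers above from a Python shell, assuming the
# 'gpt2' entry in TOKENIZER_MODELS is available locally or downloadable. Nothing
# here runs at import time; call _demo_helpers() manually if desired.
def _demo_helpers(sample: str = "Hello world, hello tokens!") -> None:
    """Tokenize a short sample with GPT-2 and print what each helper produces."""
    tok = load_tokenizer('gpt2')
    tokens = tok.tokenize(sample)
    print(tokens)                              # e.g. ['Hello', 'Ġworld', ',', ...]
    print([fix_token(t) for t in tokens])      # leading 'Ġ' rendered as '·'
    print(get_token_stats(tokens, sample)['basic_stats'])
# ----------------------------------------------------------------------------------------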
def process_text(text: str, model_id: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Process text and return tokenization data."""
    tokenizer = load_tokenizer(model_id)

    # For file uploads, read only preview from file but process full file for stats
    if file_path and is_full_file:
        # Read the preview for display
        with open(file_path, 'r', errors='replace') as f:
            preview_text = f.read(8096)

        # Tokenize preview for display
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Process full file for stats in chunks to avoid memory issues
        total_tokens = []
        token_set = set()
        total_length = 0
        chunk_size = 1024 * 1024  # 1MB chunks

        with open(file_path, 'r', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # Calculate stats
        stats = get_token_stats(total_tokens, ' ' * total_length)  # Approximation for original text
    else:
        # Standard processing for normal text input
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)

        # For display: if it's a preview, only take first 8096 chars
        preview_text = text[:8096] if is_full_file else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Always use full text for stats
        stats = get_token_stats(all_tokens, text)

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token = fix_token(token)
        # Compute the numerical token ID from the tokenizer
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_data.append({
            'original': token,
            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
            'colors': colors,
            'newline': fixed_token.endswith('Ċ'),
            'token_id': token_id,
            'token_index': idx
        })

    # Use the appropriate token count based on processing method
    total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file
    }

# HTML template with enhanced modern styling
HTML_TEMPLATE = """
Drop your file here
Advanced tokenization analysis and visualization