from transformers import AutoTokenizer
from flask import Flask, request, render_template_string, jsonify
from werkzeug.utils import secure_filename
import hashlib
import math
import os

app = Flask(__name__)

# Set maximum content length to 100MB to handle larger files
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024

# Create the upload folder if it doesn't exist
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Predefined tokenizer models with display aliases
TOKENIZER_MODELS = {
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-24B-Instruct-2501',
        'alias': 'Mistral Small 3'
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B'
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1'
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        'alias': 'QWQ 32B'
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B'
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B'
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased'
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2'
    }
}

# Cache of loaded tokenizers, keyed by model id
tokenizers = {}


def load_tokenizer(model_id):
    """Load a tokenizer on first use and cache it for later requests."""
    if model_id not in tokenizers:
        tokenizers[model_id] = AutoTokenizer.from_pretrained(TOKENIZER_MODELS[model_id]['name'])
    return tokenizers[model_id]


def get_varied_color(token: str) -> dict:
    """Generate vibrant HSL colors for better visual distinction between tokens."""
    token_hash = hashlib.md5(token.encode()).hexdigest()
    hue = int(token_hash[:3], 16) % 360
    saturation = 70 + (int(token_hash[3:5], 16) % 20)
    lightness = 80 + (int(token_hash[5:7], 16) % 10)
    text_lightness = 20 if lightness > 50 else 90
    return {
        'background': f'hsl({hue}, {saturation}%, {lightness}%)',
        'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
    }


def fix_token(token: str) -> str:
    """Fix a token for display, visualizing leading spaces ('Ġ') as middle dots."""
    if token.startswith('Ġ'):
        space_count = token.count('Ġ')
        return '·' * space_count + token[space_count:]
    return token


def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate enhanced statistics about the tokens."""
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    avg_length = sum(len(t) for t in tokens) / total_tokens
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

    # Length distribution
    lengths = [len(t) for t in tokens]
    mean_length = sum(lengths) / len(lengths)
    variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths)
    std_dev = math.sqrt(variance)

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': sorted(lengths)[len(lengths) // 2]
        }
    }
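
# Worked example for the display helpers above (a sketch; the token strings below are
# made up to mimic GPT-2-style byte-level tokens, where 'Ġ' marks a leading space):
#
#   toks = ['Hello', ',', 'Ġworld', '!']
#   fix_token('Ġworld')                                    # -> '·world'
#   get_token_stats(toks, 'Hello, world!')['basic_stats']  # total_tokens: 4, compression_ratio: 3.25
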
def process_text(text: str, model_id: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Process text and return tokenization data."""
    tokenizer = load_tokenizer(model_id)

    # For file uploads, tokenize only a preview for display but process the full file for stats
    if file_path and is_full_file:
        # Read the preview for display
        with open(file_path, 'r', errors='replace') as f:
            preview_text = f.read(8096)

        # Tokenize the preview for display
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Process the full file for stats in chunks to avoid memory issues
        total_tokens = []
        token_set = set()
        total_length = 0
        chunk_size = 1024 * 1024  # 1MB chunks

        with open(file_path, 'r', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)

        # Calculate stats; the placeholder string only stands in for the original character count
        stats = get_token_stats(total_tokens, ' ' * total_length)
    else:
        # Standard processing for normal text input
        all_tokens = tokenizer.tokenize(text)

        # For display: if it's a preview, only take the first 8096 characters
        preview_text = text[:8096] if is_full_file else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]

        # Always use the full text for stats
        stats = get_token_stats(all_tokens, text)

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token = fix_token(token)
        # Look up the numerical token ID from the tokenizer vocabulary
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_data.append({
            'original': token,
            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
            'colors': colors,
            'newline': fixed_token.endswith('Ċ'),
            'token_id': token_id,
            'token_index': idx
        })

    # Use the appropriate token count based on the processing method
    total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file
    }
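
# Example call (a sketch; the first run needs network access so that
# AutoTokenizer.from_pretrained can download the 'gpt2' tokenizer files):
#
#   result = process_text("Hello, tokenizer world!", "gpt2")
#   result["total_tokens"]                                # number of tokens in the input
#   result["stats"]["basic_stats"]["compression_ratio"]   # characters per token
#   [t["display"] for t in result["tokens"]]              # tokens as rendered in the UI
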
# HTML template for the web UI (Jinja2, rendered with render_template_string)
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Token Visualizer Pro</title>
</head>
<body>
    <div class="header">
        <h1>Token Visualizer</h1>
        <p>Advanced tokenization analysis and visualization</p>
    </div>

    <!-- Input form: model selection, free text, and file upload -->
    <form method="POST" enctype="multipart/form-data">
        <select name="model">
            {% for key, model in models.items() %}
            <option value="{{ key }}" {% if key == selected_model %}selected{% endif %}>{{ model.alias }}</option>
            {% endfor %}
        </select>
        <textarea name="text" placeholder="Enter text to tokenize...">{{ text }}</textarea>
        <div class="dropzone">
            <span class="icon">📄</span>
            <p>Drop your file here</p>
            <label>📎 <input type="file" name="file"></label>
        </div>
        <button type="submit">Tokenize</button>
    </form>
    {% if error %}<div class="error">{{ error }}</div>{% endif %}

    <h2>Token Visualization</h2>
    {% if token_data and token_data.preview_only %}
    <p class="notice">Note: Showing preview of first 8096 characters. Stats are calculated on the full file.</p>
    {% endif %}
    <div class="token-container">
        {% if token_data %}
        {% for token in token_data.tokens %}
        <span class="token" title="ID: {{ token.token_id }}" style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }}">{{ token.display }}</span>
        {% if token.newline %}<br>{% endif %}
        {% endfor %}
        {% endif %}
    </div>
    {% if token_data and token_data.display_limit_reached %}
    <p class="notice">Note: Only showing first 50,000 tokens. Total token count: {{ token_data.total_tokens }}</p>
    {% endif %}

    <div class="stats">
        <div class="stat-card">
            <h3>Total Tokens</h3>
            <p>{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</p>
            <p>{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique ({{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}%)</p>
        </div>
        <div class="stat-card">
            <h3>Token Types</h3>
            <p>{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</p>
            <p>special tokens</p>
        </div>
        <div class="stat-card">
            <h3>Whitespace</h3>
            <p>{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</p>
            <p>spaces: {{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}, newlines: {{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</p>
        </div>
        <div class="stat-card">
            <h3>Token Length</h3>
            <p>{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</p>
            <p>median: {{ token_data.stats.length_stats.median_length if token_data else 0 }}, ±{{ token_data.stats.length_stats.std_dev if token_data else 0 }} std</p>
        </div>
        <div class="stat-card">
            <h3>Compression</h3>
            <p>{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</p>
            <p>characters per token</p>
        </div>
    </div>
</body>
</html>
"""


@app.route('/', methods=['GET', 'POST'])
def index():
    text = ""
    token_data = None
    selected_model = request.args.get('model', request.form.get('model', 'mistral-small'))

    if request.method == 'POST':
        # File upload path
        if 'file' in request.files and request.files['file'].filename:
            uploaded_file = request.files['file']
            # Save the file to the upload directory under a sanitized name
            filename = secure_filename(uploaded_file.filename)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            uploaded_file.save(file_path)

            # Read a small preview of the file to refill the text box
            with open(file_path, 'r', errors='replace') as f:
                text = f.read(8096)

            try:
                # Process the file
                token_data = process_text("", selected_model, is_full_file=True, file_path=file_path)

                # Clean up the file after processing
                if os.path.exists(file_path):
                    os.remove(file_path)

                # If the request is AJAX, return JSON
                if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
                    return jsonify(token_data)
            except Exception as e:
                error_message = str(e)
                # Clean up the file even when processing fails
                if os.path.exists(file_path):
                    os.remove(file_path)

                if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
                    return jsonify({"error": error_message}), 400

                return render_template_string(
                    HTML_TEMPLATE,
                    text=text,
                    token_data=None,
                    models=TOKENIZER_MODELS,
                    selected_model=selected_model,
                    error=error_message
                )
        # Regular text processing
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, selected_model)

                    # If the request is AJAX, return JSON
                    if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
                        return jsonify(token_data)
                except Exception as e:
                    error_message = str(e)
                    if request.headers.get('X-Requested-With') == 'XMLHttpRequest':
                        return jsonify({"error": error_message}), 400

                    return render_template_string(
                        HTML_TEMPLATE,
                        text=text,
                        token_data=None,
                        models=TOKENIZER_MODELS,
                        selected_model=selected_model,
                        error=error_message
                    )

    return render_template_string(
        HTML_TEMPLATE,
        text=text,
        token_data=token_data,
        models=TOKENIZER_MODELS,
        selected_model=selected_model
    )


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=7860)