# Hugging Face Space (status: Running)
import hashlib
import io
import json
import math
import os
import random
import tempfile

from flask import Flask, request, render_template_string, jsonify
from transformers import AutoTokenizer
# Flask application object for this module.
app = Flask(__name__)

# Set maximum content length to 100MB to handle larger files
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024

# Scratch directory for uploaded files. ``exist_ok=True`` makes creation
# idempotent and avoids the check-then-create race when several workers
# import this module at the same time.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Predefined tokenizer models.
# Maps a short model id to a dict with:
#   'name'  - the repository id handed to the tokenizer loader
#   'alias' - the human-readable label for that model
TOKENIZER_MODELS = {
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-24B-Instruct-2501',
        'alias': 'Mistral Small 3',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # The label previously read 'QWQ 32B', which does not match the
        # Qwen2.5-72B-Instruct repository this entry points at.
        'alias': 'Qwen2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
# Cache of already-loaded tokenizers, keyed by model id.
tokenizers = {}


def load_tokenizer(model_id):
    """Return the tokenizer for ``model_id``, loading it on first use.

    A tokenizer that is already cached is returned as-is. Otherwise the
    repository name is looked up in ``TOKENIZER_MODELS`` (an unknown id
    raises ``KeyError``), the tokenizer is loaded with
    ``AutoTokenizer.from_pretrained`` and stored in the cache.
    """
    if model_id in tokenizers:
        return tokenizers[model_id]

    repo_name = TOKENIZER_MODELS[model_id]['name']
    loaded = AutoTokenizer.from_pretrained(repo_name)
    tokenizers[model_id] = loaded
    return tokenizers[model_id]
def get_varied_color(token: str) -> dict:
    """Derive a deterministic HSL colour pair from a token string.

    The MD5 hex digest of the token supplies three fields: the first three
    hex digits give the hue (0-359), the next two the saturation (70-89 %)
    and the following two the background lightness (80-89 %). The text
    colour reuses hue and saturation with a lightness of 20 % for light
    backgrounds (lightness above 50) and 90 % otherwise.

    Returns:
        A dict with CSS colour strings under 'background' and 'text'.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    hue_hex, saturation_hex, lightness_hex = digest[:3], digest[3:5], digest[5:7]

    hue = int(hue_hex, 16) % 360
    saturation = int(saturation_hex, 16) % 20 + 70
    lightness = int(lightness_hex, 16) % 10 + 80

    if lightness > 50:
        text_lightness = 20
    else:
        text_lightness = 90

    background_css = f'hsl({hue}, {saturation}%, {lightness}%)'
    text_css = f'hsl({hue}, {saturation}%, {text_lightness}%)'
    return {'background': background_css, 'text': text_css}
def fix_token(token: str) -> str:
    """Make leading space markers in a token visible.

    Each leading 'Ġ' character is replaced by a middle dot ('·'). Only the
    leading run is rewritten: counting every 'Ġ' in the token (as the code
    did before) would consume non-marker characters whenever a marker also
    appeared later in the token.

    Args:
        token: Raw token string as produced by the tokenizer.

    Returns:
        The token with its leading 'Ġ' run replaced by the same number of
        '·' characters; tokens without a leading 'Ġ' are returned unchanged.
    """
    remainder = token.lstrip('Ġ')
    leading_markers = len(token) - len(remainder)
    if not leading_markers:
        return token
    return '·' * leading_markers + remainder
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Compute summary statistics for a list of token strings.

    Args:
        tokens: Token strings produced by a tokenizer.
        original_text: The text the tokens came from. Only its length is
            used (for the characters-per-token ratio).

    Returns:
        An empty dict when ``tokens`` is empty, otherwise a dict with:

        * 'basic_stats': total/unique token counts, 'compression_ratio'
          (characters per token), counts of tokens starting with 'Ġ',
          containing 'Ċ', containing any of ``<>[]{}`` ('special_tokens'),
          containing any of ``.,!?;:()`` ('punctuation_tokens'), and
          'unique_percentage'.
        * 'length_stats': mean, population standard deviation, min, max
          and median of the token lengths.
    """
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    special_chars = ('<', '>', '[', ']', '{', '}')
    punctuation_chars = '.,!?;:()'
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in special_chars))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in punctuation_chars))

    # Length distribution (lengths are computed once and reused)
    lengths = sorted(len(t) for t in tokens)
    avg_length = sum(lengths) / total_tokens
    variance = sum((x - avg_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    # True median: for an even count, average the two middle values rather
    # than reporting the upper-middle element.
    middle = total_tokens // 2
    if total_tokens % 2:
        median_length = lengths[middle]
    else:
        median_length = (lengths[middle - 1] + lengths[middle]) / 2

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1),
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': lengths[0],
            'max_length': lengths[-1],
            'median_length': median_length,
        },
    }
def process_text(text: str, model_id: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Tokenize text (or an uploaded file) and build the visualisation payload.

    Args:
        text: Text to tokenize. Ignored when a file is processed.
        model_id: Key identifying the tokenizer, passed to ``load_tokenizer``.
        is_full_file: When true, only the first 8096 characters are
            tokenized for display, while stats cover the whole input.
        file_path: Path of a file to process. Only used together with
            ``is_full_file=True``; the file is then read in chunks for the
            stats instead of using ``text``.

    Returns:
        A dict with:

        * 'tokens': per-token display records (at most 50,000), each with
          the original token, display text, colours, newline flag, token id
          and index.
        * 'stats': the result of ``get_token_stats`` over all tokens.
        * 'display_limit_reached': true when more than 50,000 tokens exist
          and the input is not a file preview.
        * 'total_tokens', 'is_full_file', 'preview_only'.
    """
    preview_chars = 8096      # characters tokenized for display in preview mode
    display_limit = 50000     # maximum number of tokens returned for display
    chunk_size = 1024 * 1024  # characters read per chunk when scanning a file

    tokenizer = load_tokenizer(model_id)

    if file_path and is_full_file:
        # Files are decoded explicitly as UTF-8 (undecodable bytes replaced)
        # so results do not depend on the server's locale encoding.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_tokens = tokenizer.tokenize(f.read(preview_chars))

        # Tokenize the whole file chunk by chunk so the full text is never
        # held in memory at once.
        # NOTE(review): a token that straddles a chunk boundary is tokenized
        # as two pieces, so totals may differ slightly from tokenizing the
        # file in one go - confirm this approximation is acceptable.
        all_tokens = []
        total_length = 0
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                all_tokens.extend(tokenizer.tokenize(chunk))

        # The full text is not kept, so a placeholder string of the same
        # length stands in for it when computing the stats.
        stats = get_token_stats(all_tokens, ' ' * total_length)
    else:
        all_tokens = tokenizer.tokenize(text)
        if is_full_file:
            preview_tokens = tokenizer.tokenize(text[:preview_chars])
        else:
            # Preview and full text are the same string here; reuse the
            # tokens instead of tokenizing the text a second time.
            preview_tokens = all_tokens
        # Always use full text for stats
        stats = get_token_stats(all_tokens, text)

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(preview_tokens[:display_limit]):
        fixed_token = fix_token(token)
        ends_with_newline = fixed_token.endswith('Ċ')
        token_data.append({
            'original': token,
            # A trailing newline marker is dropped from the display text and
            # reported through the 'newline' flag instead.
            'display': fixed_token[:-1] if ends_with_newline else fixed_token,
            'colors': get_varied_color(token),
            'newline': ends_with_newline,
            'token_id': tokenizer.convert_tokens_to_ids(token),
            'token_index': idx,
        })

    total_token_count = len(all_tokens)
    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > display_limit and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
    }
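# Single-page HTML template (rendered with render_template_string): dark-theme CSS, Jinja2 placeholders for server-side rendering, and jQuery code that submits the form via AJAX.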
HTML_TEMPLATE = """ | |
<!DOCTYPE html> | |
<html> | |
<head> | |
<title>Token Visualizer Pro</title> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script> | |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css"> | |
<style> | |
:root { | |
--primary-color: #0f4f9b; /* Blue accent */ | |
--primary-hover: #0c3e7a; /* Darker blue accent */ | |
--bg-color: #121212; /* Dark background */ | |
--card-bg: #1e1e1e; /* Dark card background */ | |
--card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7), | |
0 2px 4px -1px rgba(0, 0, 0, 0.6); | |
--transition: all 0.3s ease; | |
--text-color: #E0E0E0; /* Main text color */ | |
--secondary-text: #A0A0A0;/* Secondary text color */ | |
--input-bg: #2a2a2a; /* Input/textarea background */ | |
--input-border: #444444; /* Input/textarea border */ | |
--input-focus: #0f4f9b; /* Focus border color */ | |
} | |
* { | |
margin: 0; | |
padding: 0; | |
box-sizing: border-box; | |
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; | |
scrollbar-width: thin; | |
scrollbar-color: #0f4f9b #121212 | |
} | |
/* Width and height of the scrollbar */ | |
::-webkit-scrollbar { | |
width: 12px; | |
height: 12px; | |
} | |
/* Track (background) */ | |
::-webkit-scrollbar-track { | |
background: #121212; | |
border-radius: 10px; | |
} | |
/* Handle (draggable part) */ | |
::-webkit-scrollbar-thumb { | |
background: #0f4f9b; | |
border-radius: 10px; | |
border: 2px solid #121212; | |
} | |
/* Handle on hover */ | |
::-webkit-scrollbar-thumb:hover { | |
background: #0c3e7a; | |
} | |
body { | |
background-color: var(--bg-color); | |
padding: 2rem; | |
min-height: 100vh; | |
background-image: | |
radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%), | |
radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%); | |
color: var(--text-color); | |
} | |
.container { | |
max-width: 1200px; | |
margin: 0 auto; | |
} | |
.header { | |
display: flex; | |
justify-content: space-between; | |
align-items: center; | |
margin-bottom: 2rem; | |
position: relative; | |
} | |
.title-section { | |
flex-grow: 1; | |
} | |
.title { | |
font-size: 2.5rem; | |
font-weight: 800; | |
color: var(--primary-color); | |
margin-bottom: 0.5rem; | |
} | |
.subtitle { | |
color: var(--secondary-text); | |
font-size: 1.1rem; | |
} | |
.model-selector { | |
position: relative; | |
min-width: 200px; | |
} | |
select { | |
width: 100%; | |
padding: 0.75rem 1rem; | |
border: 2px solid var(--input-border); | |
border-radius: 0.5rem; | |
font-size: 1rem; | |
color: var(--text-color); | |
background-color: var(--input-bg); | |
cursor: pointer; | |
transition: var(--transition); | |
appearance: none; | |
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E"); | |
background-repeat: no-repeat; | |
background-position: right 1rem center; | |
background-size: 1.5rem; | |
} | |
select:hover { | |
border-color: var(--primary-color); | |
} | |
select:focus { | |
outline: none; | |
border-color: var(--primary-color); | |
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1); | |
} | |
.input-section { | |
margin-bottom: 2rem; | |
} | |
textarea { | |
width: 100%; | |
height: 150px; | |
padding: 1.25rem; | |
border: 2px solid var(--input-border); | |
border-radius: 0.75rem; | |
resize: vertical; | |
font-size: 1rem; | |
margin-bottom: 1rem; | |
transition: var(--transition); | |
background-color: var(--input-bg); | |
color: var(--text-color); | |
} | |
textarea:focus { | |
outline: none; | |
border-color: var(--input-focus); | |
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1); | |
} | |
.button-container { | |
display: flex; | |
justify-content: center; | |
width: 100%; | |
gap: 1rem; | |
} | |
button { | |
padding: 0.875rem 2.5rem; | |
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%); | |
color: #fff; | |
border: none; | |
border-radius: 0.75rem; | |
font-size: 1.1rem; | |
font-weight: 600; | |
cursor: pointer; | |
transition: var(--transition); | |
box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2); | |
} | |
button:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3); | |
} | |
button:active { | |
transform: translateY(0); | |
} | |
button:disabled { | |
opacity: 0.7; | |
cursor: not-allowed; | |
} | |
.card { | |
background-color: var(--card-bg); | |
border-radius: 1rem; | |
box-shadow: var(--card-shadow); | |
padding: 1.5rem; | |
margin-bottom: 2rem; | |
transition: var(--transition); | |
} | |
.card:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1); | |
} | |
.card-title { | |
font-size: 1.25rem; | |
font-weight: 700; | |
color: var(--text-color); | |
margin-bottom: 1.25rem; | |
display: flex; | |
align-items: center; | |
gap: 0.5rem; | |
cursor: pointer; | |
} | |
.card-title::before { | |
content: ''; | |
display: block; | |
width: 4px; | |
height: 1.25rem; | |
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%); | |
border-radius: 2px; | |
} | |
.token-container { | |
display: flex; | |
flex-wrap: wrap; | |
gap: 0.375rem; | |
margin-bottom: 1rem; | |
padding: 1rem; | |
background-color: #2a2a2a; | |
border-radius: 0.5rem; | |
max-height: 200px; | |
overflow-y: auto; | |
transition: max-height 0.3s ease; | |
} | |
.token-container.expanded { | |
max-height: none; | |
} | |
.token { | |
padding: 0.375rem 0.75rem; | |
border-radius: 0.375rem; | |
background-color: var(--input-bg); | |
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace; | |
font-size: 0.875rem; | |
color: var(--text-color); | |
cursor: default; | |
transition: var(--transition); | |
box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); | |
} | |
.token:hover { | |
transform: translateY(-1px); | |
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); | |
} | |
.stats-grid { | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
gap: 1.5rem; | |
margin-bottom: 2rem; | |
} | |
.stat-card { | |
background-color: var(--card-bg); | |
padding: 1.5rem; | |
border-radius: 1rem; | |
box-shadow: var(--card-shadow); | |
transition: var(--transition); | |
} | |
.stat-card:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1); | |
} | |
.stat-title { | |
color: var(--secondary-text); | |
font-size: 0.875rem; | |
font-weight: 500; | |
margin-bottom: 0.5rem; | |
text-transform: uppercase; | |
letter-spacing: 0.05em; | |
} | |
.stat-value { | |
color: var(--text-color); | |
font-size: 2rem; | |
font-weight: 700; | |
line-height: 1.2; | |
margin-bottom: 0.25rem; | |
} | |
.stat-description { | |
color: var(--secondary-text); | |
font-size: 0.875rem; | |
} | |
.expand-button { | |
background: none; | |
border: none; | |
color: var(--primary-color); | |
font-size: 0.875rem; | |
padding: 0.5rem; | |
cursor: pointer; | |
display: block; | |
margin: 0 auto; | |
box-shadow: none; | |
} | |
.expand-button:hover { | |
text-decoration: underline; | |
transform: none; | |
box-shadow: none; | |
} | |
.error-message { | |
color: #EF4444; | |
background-color: #3a1f1f; | |
border: 1px solid #562626; | |
padding: 1rem; | |
border-radius: 0.5rem; | |
margin-bottom: 1rem; | |
display: none; | |
} | |
.display-limit-notice { | |
background-color: #4b2b07; | |
border: 1px solid #7c4a02; | |
color: #FFD591; | |
padding: 0.75rem; | |
border-radius: 0.5rem; | |
margin-top: 1rem; | |
font-size: 0.875rem; | |
display: none; | |
} | |
/* File drop zone styles */ | |
.file-drop-zone { | |
position: fixed; | |
top: 0; | |
left: 0; | |
width: 100%; | |
height: 100%; | |
background-color: rgba(15, 79, 155, 0.15); | |
z-index: 1000; | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
opacity: 0; | |
pointer-events: none; | |
transition: opacity 0.3s ease; | |
} | |
.file-drop-zone.active { | |
opacity: 1; | |
pointer-events: all; | |
} | |
.drop-indicator { | |
background-color: var(--card-bg); | |
border: 2px dashed var(--primary-color); | |
border-radius: 1rem; | |
padding: 2rem; | |
text-align: center; | |
width: 60%; | |
max-width: 400px; | |
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25); | |
animation: pulse 2s infinite; | |
} | |
@keyframes pulse { | |
0% { transform: scale(1); } | |
50% { transform: scale(1.05); } | |
100% { transform: scale(1); } | |
} | |
.drop-indicator p { | |
margin-bottom: 0.5rem; | |
color: var(--text-color); | |
font-size: 1.2rem; | |
} | |
.file-icon { | |
font-size: 3rem; | |
margin-bottom: 1rem; | |
color: var(--primary-color); | |
} | |
.file-upload-icon { | |
position: fixed; | |
bottom: 20px; | |
left: 20px; | |
width: 45px; | |
height: 45px; | |
background-color: var(--card-bg); | |
border-radius: 50%; | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
cursor: pointer; | |
z-index: 100; | |
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2); | |
transition: transform 0.2s ease, box-shadow 0.2s ease; | |
} | |
.file-upload-icon:hover { | |
transform: translateY(-2px); | |
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3); | |
} | |
.file-upload-icon span { | |
font-size: 1.5rem; | |
color: var(--primary-color); | |
} | |
.file-info { | |
position: fixed; | |
bottom: 20px; | |
left: 75px; | |
background-color: var(--card-bg); | |
color: var(--primary-color); | |
font-weight: 500; | |
padding: 0.5rem 1rem; | |
border-radius: 1rem; | |
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2); | |
max-width: 270px; | |
white-space: nowrap; | |
overflow: hidden; | |
text-overflow: ellipsis; | |
z-index: 100; | |
display: none; | |
} | |
.file-detach { | |
margin-left: 8px; | |
display: inline-block; | |
width: 18px; | |
height: 18px; | |
background-color: rgba(255, 255, 255, 0.1); | |
color: var(--text-color); | |
border-radius: 50%; | |
text-align: center; | |
line-height: 16px; | |
font-size: 12px; | |
cursor: pointer; | |
transition: all 0.2s ease; | |
} | |
.file-detach:hover { | |
background-color: rgba(255, 0, 0, 0.2); | |
color: #ff6b6b; | |
transform: scale(1.1); | |
} | |
.preview-notice { | |
background-color: #273c56; | |
border: 1px solid #365a82; | |
color: #89b4e8; | |
padding: 0.75rem; | |
border-radius: 0.5rem; | |
margin-top: 1rem; | |
font-size: 0.875rem; | |
display: none; | |
} | |
@media (max-width: 768px) { | |
.header { | |
flex-direction: column; | |
align-items: stretch; | |
gap: 1rem; | |
} | |
.model-selector { | |
width: 100%; | |
} | |
.stats-grid { | |
grid-template-columns: 1fr; | |
} | |
} | |
</style> | |
</head> | |
<body> | |
<!-- Hidden File Drop Zone that appears when dragging files --> | |
<div id="fileDropZone" class="file-drop-zone"> | |
<div class="drop-indicator"> | |
<div class="file-icon">📄</div> | |
<p>Drop your file here</p> | |
</div> | |
</div> | |
<!-- File upload icon in bottom left corner --> | |
<div id="fileUploadIcon" class="file-upload-icon"> | |
<span>📎</span> | |
</div> | |
<p class="file-info" id="fileInfo"></p> | |
<div class="container"> | |
<div class="header"> | |
<div class="title-section"> | |
<h1 class="title">Token Visualizer</h1> | |
<p class="subtitle">Advanced tokenization analysis and visualization</p> | |
</div> | |
<div class="model-selector"> | |
<select id="modelSelect" name="model"> | |
{% for model_id, info in models.items() %} | |
<option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}> | |
{{ info.alias }} | |
</option> | |
{% endfor %} | |
</select> | |
</div> | |
</div> | |
<div class="error-message" id="errorMessage"></div> | |
<div class="input-section"> | |
<form id="analyzeForm" method="POST" enctype="multipart/form-data"> | |
<textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea> | |
<input type="hidden" name="model" id="modelInput" value="{{ selected_model }}"> | |
<input type="file" name="file" id="fileInput" style="display: none;"> | |
<div class="button-container"> | |
<button type="submit" id="analyzeButton">Analyze Text</button> | |
</div> | |
</form> | |
</div> | |
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}> | |
<div class="card"> | |
<h2 class="card-title">Token Visualization</h2> | |
<div class="preview-notice" id="previewNotice"> | |
Note: Showing preview of first 8096 characters. Stats are calculated on the full file. | |
</div> | |
<div class="token-container" id="tokenContainer"> | |
{% if token_data %} | |
{% for token in token_data.tokens %} | |
<span class="token" | |
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};" | |
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}"> | |
{{ token.display }} | |
</span> | |
{% if token.newline %}<br>{% endif %} | |
{% endfor %} | |
{% endif %} | |
</div> | |
<button class="expand-button" id="expandButton">Show More</button> | |
<div class="display-limit-notice" id="displayLimitNotice"> | |
Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span> | |
</div> | |
</div> | |
<div class="stats-grid"> | |
<div class="stat-card"> | |
<div class="stat-title">Total Tokens</div> | |
<div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div> | |
<div class="stat-description"> | |
<span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span> | |
(<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%) | |
</div> | |
</div> | |
<div class="stat-card"> | |
<div class="stat-title">Token Types</div> | |
<div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div> | |
<div class="stat-description">special tokens</div> | |
</div> | |
<div class="stat-card"> | |
<div class="stat-title">Whitespace</div> | |
<div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div> | |
<div class="stat-description"> | |
spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>, | |
newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span> | |
</div> | |
</div> | |
<div class="stat-card"> | |
<div class="stat-title">Token Length</div> | |
<div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div> | |
<div class="stat-description"> | |
median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>, | |
±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std | |
</div> | |
</div> | |
<div class="stat-card"> | |
<div class="stat-title">Compression</div> | |
<div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div> | |
<div class="stat-description">characters per token</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<script> | |
$(document).ready(function() { | |
// File handling variables | |
let currentFile = null; | |
let originalTextContent = null; | |
let lastUploadedFileName = null; | |
let fileJustUploaded = false; // Flag to prevent immediate detachment | |
function showError(message) { | |
const errorDiv = $('#errorMessage'); | |
errorDiv.text(message); | |
errorDiv.show(); | |
setTimeout(() => errorDiv.fadeOut(), 5000); | |
} | |
function updateResults(data) { | |
$('#results').show(); | |
// Update tokens | |
const tokenContainer = $('#tokenContainer'); | |
tokenContainer.empty(); | |
data.tokens.forEach(token => { | |
const span = $('<span>') | |
.addClass('token') | |
.css({ | |
'background-color': token.colors.background, | |
'color': token.colors.text | |
}) | |
// Include token id in the tooltip on hover | |
.attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`) | |
.text(token.display); | |
tokenContainer.append(span); | |
if (token.newline) { | |
tokenContainer.append('<br>'); | |
} | |
}); | |
// Update display limit notice | |
if (data.display_limit_reached) { | |
$('#displayLimitNotice').show(); | |
$('#totalTokenCount').text(data.total_tokens); | |
} else { | |
$('#displayLimitNotice').hide(); | |
} | |
// Update preview notice | |
if (data.preview_only) { | |
$('#previewNotice').show(); | |
} else { | |
$('#previewNotice').hide(); | |
} | |
// Update basic stats | |
$('#totalTokens').text(data.stats.basic_stats.total_tokens); | |
$('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`); | |
$('#uniquePercentage').text(data.stats.basic_stats.unique_percentage); | |
$('#specialTokens').text(data.stats.basic_stats.special_tokens); | |
$('#spaceTokens').text(data.stats.basic_stats.space_tokens); | |
$('#spaceCount').text(data.stats.basic_stats.space_tokens); | |
$('#newlineCount').text(data.stats.basic_stats.newline_tokens); | |
$('#compressionRatio').text(data.stats.basic_stats.compression_ratio); | |
// Update length stats | |
$('#avgLength').text(data.stats.length_stats.avg_length); | |
$('#medianLength').text(data.stats.length_stats.median_length); | |
$('#stdDev').text(data.stats.length_stats.std_dev); | |
} | |
// Handle text changes to detach file | |
$('#textInput').on('input', function() { | |
// Skip if file was just uploaded (prevents immediate detachment) | |
if (fileJustUploaded) { | |
fileJustUploaded = false; | |
return; | |
} | |
const currentText = $(this).val(); | |
const fileInput = document.getElementById('fileInput'); | |
// Only detach if a file exists and text has been substantially modified | |
if (fileInput.files.length > 0 && originalTextContent !== null) { | |
// Check if the text is completely different or has been significantly changed | |
// This allows for small edits without detaching | |
const isMajorChange = | |
currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20% | |
(currentText.length > 0 && | |
currentText !== originalTextContent.substring(0, currentText.length) && | |
currentText.substring(0, Math.min(20, currentText.length)) !== | |
originalTextContent.substring(0, Math.min(20, currentText.length))); | |
if (isMajorChange) { | |
detachFile(); | |
} | |
} | |
}); | |
// Function to detach file | |
function detachFile() { | |
// Clear the file input | |
$('#fileInput').val(''); | |
// Hide file info | |
$('#fileInfo').fadeOut(300); | |
// Reset the original content tracker | |
originalTextContent = $('#textInput').val(); | |
// Reset last uploaded filename | |
lastUploadedFileName = null; | |
} | |
// For model changes | |
$('#modelSelect').change(function() { | |
const selectedModel = $(this).val(); | |
$('#modelInput').val(selectedModel); | |
// If text exists, submit the form | |
if ($('#textInput').val().trim()) { | |
$('#analyzeForm').submit(); | |
} | |
}); | |
// File drop handling | |
const fileDropZone = $('#fileDropZone'); | |
const fileUploadIcon = $('#fileUploadIcon'); | |
// Prevent default drag behaviors | |
['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => { | |
fileDropZone[0].addEventListener(eventName, preventDefaults, false); | |
document.body.addEventListener(eventName, preventDefaults, false); | |
}); | |
function preventDefaults(e) { | |
e.preventDefault(); | |
e.stopPropagation(); | |
} | |
// Show drop zone when file is dragged over the document | |
document.addEventListener('dragenter', showDropZone, false); | |
document.addEventListener('dragover', showDropZone, false); | |
fileDropZone[0].addEventListener('dragleave', hideDropZone, false); | |
fileDropZone[0].addEventListener('drop', hideDropZone, false); | |
function showDropZone(e) { | |
fileDropZone.addClass('active'); | |
} | |
function hideDropZone() { | |
fileDropZone.removeClass('active'); | |
} | |
// Handle dropped files | |
fileDropZone[0].addEventListener('drop', handleDrop, false); | |
function handleDrop(e) { | |
const dt = e.dataTransfer; | |
const files = dt.files; | |
handleFiles(files); | |
} | |
// Also handle file selection via click on the icon | |
fileUploadIcon.on('click', function() { | |
const input = document.createElement('input'); | |
input.type = 'file'; | |
input.onchange = e => { | |
handleFiles(e.target.files); | |
}; | |
input.click(); | |
}); | |
function handleFiles(files) { | |
if (files.length) { | |
const file = files[0]; | |
currentFile = file; | |
lastUploadedFileName = file.name; | |
fileJustUploaded = true; // Set flag to prevent immediate detachment | |
// Show file info with animation and add detach button | |
$('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300); | |
// Add click handler for detach button | |
$('#fileDetach').on('click', function(e) { | |
e.stopPropagation(); // Prevent event bubbling | |
detachFile(); | |
return false; | |
}); | |
// Set the file to the file input | |
const dataTransfer = new DataTransfer(); | |
dataTransfer.items.add(file); | |
document.getElementById('fileInput').files = dataTransfer.files; | |
// Preview in textarea (first 8096 chars) | |
const reader = new FileReader(); | |
reader.onload = function(e) { | |
const previewText = e.target.result.slice(0, 8096); | |
$('#textInput').val(previewText); | |
// Store this as the original content AFTER setting the value | |
// to prevent the input event from firing and detaching immediately | |
setTimeout(() => { | |
originalTextContent = previewText; | |
// Automatically submit for analysis | |
$('#analyzeForm').submit(); | |
}, 50); | |
}; | |
reader.readAsText(file); | |
} | |
} | |
function formatFileSize(bytes) { | |
if (bytes < 1024) return bytes + ' bytes'; | |
else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB'; | |
else return (bytes / 1048576).toFixed(1) + ' MB'; | |
} | |
// Make sure to check if there's still a file when analyzing | |
$('#analyzeForm').on('submit', function(e) { | |
e.preventDefault(); | |
// Skip detachment check if file was just uploaded | |
if (!fileJustUploaded) { | |
// Check if text has been changed but file is still attached | |
const textInput = $('#textInput').val(); | |
const fileInput = document.getElementById('fileInput'); | |
if (fileInput.files.length > 0 && | |
originalTextContent !== null && | |
textInput !== originalTextContent && | |
textInput.length < originalTextContent.length * 0.8) { | |
// Text was significantly changed but file is still attached, detach it | |
detachFile(); | |
} | |
} else { | |
// Reset flag after first submission | |
fileJustUploaded = false; | |
} | |
const formData = new FormData(this); | |
$('#analyzeButton').prop('disabled', true); | |
$.ajax({ | |
url: '/', | |
method: 'POST', | |
data: formData, | |
processData: false, | |
contentType: false, | |
success: function(response) { | |
if (response.error) { | |
showError(response.error); | |
} else { | |
updateResults(response); | |
} | |
}, | |
error: function(xhr) { | |
showError(xhr.responseText || 'An error occurred while processing the text'); | |
}, | |
complete: function() { | |
$('#analyzeButton').prop('disabled', false); | |
} | |
}); | |
}); | |
$('#expandButton').click(function() { | |
const container = $('#tokenContainer'); | |
const isExpanded = container.hasClass('expanded'); | |
container.toggleClass('expanded'); | |
$(this).text(isExpanded ? 'Show More' : 'Show Less'); | |
}); | |
}); | |
</script> | |
</body> | |
</html> | |
""" | |
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the visualiser page and handle tokenization requests.

    GET renders the page. POST tokenizes either an uploaded file (form field
    'file') or the submitted 'text' field with the model named by the
    'model' parameter (default 'mistral-small').

    Requests carrying ``X-Requested-With: XMLHttpRequest`` receive JSON: the
    ``process_text`` payload on success, or ``{"error": ...}`` with status
    400 on failure. All other requests receive the rendered HTML page.
    """
    text = ""
    token_data = None
    selected_model = request.args.get('model', request.form.get('model', 'mistral-small'))
    is_ajax = request.headers.get('X-Requested-With') == 'XMLHttpRequest'

    if request.method == 'POST':
        uploaded_file = request.files.get('file')
        try:
            if uploaded_file is not None and uploaded_file.filename:
                # Store the upload under a server-generated unique name. The
                # client-supplied filename is never used in a path, which
                # rules out path traversal and collisions between uploads.
                fd, file_path = tempfile.mkstemp(dir=app.config['UPLOAD_FOLDER'])
                os.close(fd)
                try:
                    uploaded_file.save(file_path)
                    # Read a small preview of the file
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        text = f.read(8096)
                    token_data = process_text("", selected_model, is_full_file=True, file_path=file_path)
                finally:
                    # Remove the temporary file whether processing succeeded or not.
                    if os.path.exists(file_path):
                        os.remove(file_path)
            else:
                # Regular text processing
                text = request.form.get('text', '')
                if text:
                    token_data = process_text(text, selected_model)
        except Exception as e:
            # Request boundary: report any processing failure to the client
            # instead of letting it surface as an unhandled server error.
            error_message = str(e)
            if is_ajax:
                return jsonify({"error": error_message}), 400
            return render_template_string(
                HTML_TEMPLATE,
                text=text,
                token_data=None,
                models=TOKENIZER_MODELS,
                selected_model=selected_model,
                error=error_message
            )

        # If request is AJAX, return JSON
        if is_ajax and token_data is not None:
            return jsonify(token_data)

    return render_template_string(
        HTML_TEMPLATE,
        text=text,
        token_data=token_data,
        models=TOKENIZER_MODELS,
        selected_model=selected_model
    )
if __name__ == "__main__":
    # Started directly as a script: run Flask's built-in server, listening
    # on every network interface at port 7860.
    server_options = {'host': '0.0.0.0', 'port': 7860}
    app.run(**server_options)