# tokenizers / app.py
# Page-header residue from the hosting site: uploaded by barttee,
# commit f74c8d6 ("Update app.py", verified), 38.7 kB.
import hashlib
import io
import json
import math
import os
import random
import tempfile

from flask import Flask, request, render_template_string, jsonify
from transformers import AutoTokenizer
app = Flask(__name__)

# Allow request bodies of up to 100 MB so that larger files can be uploaded.
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024

# Directory where uploaded files are staged while they are being analyzed.
# exist_ok=True avoids the check-then-create race when several workers
# import this module at the same time.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Predefined tokenizer models.
# Each key is the model id submitted by the page; 'name' is the identifier
# passed to AutoTokenizer.from_pretrained and 'alias' is the label shown in
# the model drop-down. Insertion order is the order of the drop-down.
TOKENIZER_MODELS = {
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-24B-Instruct-2501',
        'alias': 'Mistral Small 3',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # The label must describe the tokenizer that is actually loaded.
        'alias': 'Qwen 2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
# Cache of already-loaded tokenizers, keyed by model id.
tokenizers = {}


def load_tokenizer(model_id):
    """Return the tokenizer for *model_id*, loading it on first use.

    A tokenizer that is already in the module-level ``tokenizers`` cache is
    returned as-is. Otherwise the model's 'name' is looked up in
    TOKENIZER_MODELS (an unknown id raises KeyError), the tokenizer is
    created with AutoTokenizer.from_pretrained, cached and returned.
    """
    try:
        return tokenizers[model_id]
    except KeyError:
        pass

    pretrained_name = TOKENIZER_MODELS[model_id]['name']
    loaded = AutoTokenizer.from_pretrained(pretrained_name)
    tokenizers[model_id] = loaded
    return loaded
def get_varied_color(token: str) -> dict:
    """Map a token to a deterministic HSL background/text colour pair.

    The colour components are taken from the hex MD5 digest of the token, so
    the same token always gets the same colours.

    Returns:
        A dict with the CSS colour strings 'background' and 'text'.
    """
    digest = hashlib.md5(token.encode()).hexdigest()

    def component(start, stop):
        # Interpret a slice of the hex digest as an integer.
        return int(digest[start:stop], 16)

    hue = component(0, 3) % 360
    saturation = 70 + component(3, 5) % 20
    lightness = 80 + component(5, 7) % 10

    # Lightness is in the range 80-89 here, so the dark text shade is chosen.
    if lightness > 50:
        text_lightness = 20
    else:
        text_lightness = 90

    background = f'hsl({hue}, {saturation}%, {lightness}%)'
    foreground = f'hsl({hue}, {saturation}%, {text_lightness}%)'
    return {'background': background, 'text': foreground}
def fix_token(token: str) -> str:
    """Return *token* with its leading space markers shown as '·'.

    Only the run of markers at the start of the token is replaced, one '·'
    per marker; markers that occur later in the token are left untouched.
    Both the byte-level BPE marker 'Ġ' and the SentencePiece marker '▁' are
    recognised. A token without a leading marker is returned unchanged.
    """
    space_markers = 'Ġ▁'
    remainder = token.lstrip(space_markers)
    # Length difference == number of leading markers (not the total count).
    leading = len(token) - len(remainder)
    return '·' * leading + remainder
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate summary statistics for a list of token strings.

    Args:
        tokens: Token strings as produced by the tokenizer.
        original_text: The text the tokens came from; only its length is
            used (for the characters-per-token ratio).

    Returns:
        An empty dict when *tokens* is empty, otherwise a dict with two
        sections:
        'basic_stats' - total/unique counts, unique percentage, compression
        ratio and counts of space, newline, special and punctuation tokens;
        'length_stats' - mean, population standard deviation, min, max and
        median of the token lengths. For an even number of tokens the median
        is the upper of the two middle values.
    """
    if not tokens:
        return {}

    # Space/newline markers of byte-level BPE ('Ġ', 'Ċ') and of
    # SentencePiece-style tokenizers ('▁', literal newline).
    space_markers = ('Ġ', '▁')
    newline_markers = ('Ċ', '\n')
    special_chars = frozenset('<>[]{}')
    punctuation_chars = frozenset('.,!?;:()')

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    space_tokens = sum(1 for t in tokens if t.startswith(space_markers))
    newline_tokens = sum(
        1 for t in tokens if any(marker in t for marker in newline_markers)
    )
    special_tokens = sum(1 for t in tokens if not special_chars.isdisjoint(t))
    punctuation_tokens = sum(
        1 for t in tokens if not punctuation_chars.isdisjoint(t)
    )

    # Length distribution (the lengths are computed once and reused)
    lengths = [len(t) for t in tokens]
    avg_length = sum(lengths) / total_tokens
    variance = sum((n - avg_length) ** 2 for n in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1),
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': sorted(lengths)[total_tokens // 2],
        },
    }
def process_text(text: str, model_id: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Tokenize text (or an uploaded file) and build the data for the page.

    Args:
        text: Text to tokenize. Ignored when a file is processed.
        model_id: Key of the tokenizer to use, passed to load_tokenizer.
        is_full_file: When true, only the first 8096 characters are
            tokenized for display while the statistics cover everything.
        file_path: Path of an uploaded file. It is read only when
            *is_full_file* is also true; the file is decoded as UTF-8 with
            undecodable bytes replaced.

    Returns:
        A dict with 'tokens' (display entries for at most 50,000 tokens),
        'stats' (result of get_token_stats), 'display_limit_reached',
        'total_tokens', 'is_full_file' and 'preview_only'.
    """
    preview_chars = 8096      # characters of input shown in the token view
    display_limit = 50000     # maximum number of tokens returned for display
    chunk_size = 1024 * 1024  # characters read per pass over an uploaded file

    tokenizer = load_tokenizer(model_id)

    if file_path and is_full_file:
        all_tokens = []
        total_length = 0
        # An explicit encoding keeps decoding independent of the platform
        # default; the file is opened once for both preview and statistics.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_tokens = tokenizer.tokenize(f.read(preview_chars))
            f.seek(0)
            # Tokenize chunk by chunk so the whole file text is never held
            # in memory at once. Chunks are cut at a fixed character count.
            for chunk in iter(lambda: f.read(chunk_size), ''):
                total_length += len(chunk)
                all_tokens.extend(tokenizer.tokenize(chunk))
        # get_token_stats only uses the length of its text argument, so a
        # placeholder of the right length stands in for the file contents.
        stats = get_token_stats(all_tokens, ' ' * total_length)
    else:
        all_tokens = tokenizer.tokenize(text)
        preview_text = text[:preview_chars] if is_full_file else text
        if len(preview_text) == len(text):
            # The preview is the whole text: reuse the tokens already made.
            preview_tokens = all_tokens
        else:
            preview_tokens = tokenizer.tokenize(preview_text)
        # Statistics always cover the full text.
        stats = get_token_stats(all_tokens, text)

    display_tokens = preview_tokens[:display_limit]

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        fixed_token = fix_token(token)
        ends_with_newline = fixed_token.endswith('Ċ')
        token_data.append({
            'original': token,
            # The trailing newline marker is dropped from the label; the
            # 'newline' flag tells the page to start a new line instead.
            'display': fixed_token[:-1] if ends_with_newline else fixed_token,
            'colors': get_varied_color(token),
            'newline': ends_with_newline,
            # Numerical token id from the tokenizer
            'token_id': tokenizer.convert_tokens_to_ids(token),
            'token_index': idx,
        })

    total_token_count = len(all_tokens)
    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > display_limit and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
    }
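# Single-page Jinja2 template (HTML, CSS and the jQuery client script) rendered by index().
# It reads the context variables `models`, `selected_model`, `text` and `token_data`.
# NOTE: index() also passes `error` on failures, but the template does not display it.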
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>Token Visualizer Pro</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<style>
:root {
--primary-color: #0f4f9b; /* Blue accent */
--primary-hover: #0c3e7a; /* Darker blue accent */
--bg-color: #121212; /* Dark background */
--card-bg: #1e1e1e; /* Dark card background */
--card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
0 2px 4px -1px rgba(0, 0, 0, 0.6);
--transition: all 0.3s ease;
--text-color: #E0E0E0; /* Main text color */
--secondary-text: #A0A0A0;/* Secondary text color */
--input-bg: #2a2a2a; /* Input/textarea background */
--input-border: #444444; /* Input/textarea border */
--input-focus: #0f4f9b; /* Focus border color */
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
scrollbar-width: thin;
scrollbar-color: #0f4f9b #121212
}
/* Width and height of the scrollbar */
::-webkit-scrollbar {
width: 12px;
height: 12px;
}
/* Track (background) */
::-webkit-scrollbar-track {
background: #121212;
border-radius: 10px;
}
/* Handle (draggable part) */
::-webkit-scrollbar-thumb {
background: #0f4f9b;
border-radius: 10px;
border: 2px solid #121212;
}
/* Handle on hover */
::-webkit-scrollbar-thumb:hover {
background: #0c3e7a;
}
body {
background-color: var(--bg-color);
padding: 2rem;
min-height: 100vh;
background-image:
radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
color: var(--text-color);
}
.container {
max-width: 1200px;
margin: 0 auto;
}
.header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 2rem;
position: relative;
}
.title-section {
flex-grow: 1;
}
.title {
font-size: 2.5rem;
font-weight: 800;
color: var(--primary-color);
margin-bottom: 0.5rem;
}
.subtitle {
color: var(--secondary-text);
font-size: 1.1rem;
}
.model-selector {
position: relative;
min-width: 200px;
}
select {
width: 100%;
padding: 0.75rem 1rem;
border: 2px solid var(--input-border);
border-radius: 0.5rem;
font-size: 1rem;
color: var(--text-color);
background-color: var(--input-bg);
cursor: pointer;
transition: var(--transition);
appearance: none;
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
background-repeat: no-repeat;
background-position: right 1rem center;
background-size: 1.5rem;
}
select:hover {
border-color: var(--primary-color);
}
select:focus {
outline: none;
border-color: var(--primary-color);
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
}
.input-section {
margin-bottom: 2rem;
}
textarea {
width: 100%;
height: 150px;
padding: 1.25rem;
border: 2px solid var(--input-border);
border-radius: 0.75rem;
resize: vertical;
font-size: 1rem;
margin-bottom: 1rem;
transition: var(--transition);
background-color: var(--input-bg);
color: var(--text-color);
}
textarea:focus {
outline: none;
border-color: var(--input-focus);
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
}
.button-container {
display: flex;
justify-content: center;
width: 100%;
gap: 1rem;
}
button {
padding: 0.875rem 2.5rem;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
color: #fff;
border: none;
border-radius: 0.75rem;
font-size: 1.1rem;
font-weight: 600;
cursor: pointer;
transition: var(--transition);
box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
}
button:hover {
transform: translateY(-2px);
box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
}
button:active {
transform: translateY(0);
}
button:disabled {
opacity: 0.7;
cursor: not-allowed;
}
.card {
background-color: var(--card-bg);
border-radius: 1rem;
box-shadow: var(--card-shadow);
padding: 1.5rem;
margin-bottom: 2rem;
transition: var(--transition);
}
.card:hover {
transform: translateY(-2px);
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
}
.card-title {
font-size: 1.25rem;
font-weight: 700;
color: var(--text-color);
margin-bottom: 1.25rem;
display: flex;
align-items: center;
gap: 0.5rem;
cursor: pointer;
}
.card-title::before {
content: '';
display: block;
width: 4px;
height: 1.25rem;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
border-radius: 2px;
}
.token-container {
display: flex;
flex-wrap: wrap;
gap: 0.375rem;
margin-bottom: 1rem;
padding: 1rem;
background-color: #2a2a2a;
border-radius: 0.5rem;
max-height: 200px;
overflow-y: auto;
transition: max-height 0.3s ease;
}
.token-container.expanded {
max-height: none;
}
.token {
padding: 0.375rem 0.75rem;
border-radius: 0.375rem;
background-color: var(--input-bg);
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
font-size: 0.875rem;
color: var(--text-color);
cursor: default;
transition: var(--transition);
box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
}
.token:hover {
transform: translateY(-1px);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1.5rem;
margin-bottom: 2rem;
}
.stat-card {
background-color: var(--card-bg);
padding: 1.5rem;
border-radius: 1rem;
box-shadow: var(--card-shadow);
transition: var(--transition);
}
.stat-card:hover {
transform: translateY(-2px);
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
}
.stat-title {
color: var(--secondary-text);
font-size: 0.875rem;
font-weight: 500;
margin-bottom: 0.5rem;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.stat-value {
color: var(--text-color);
font-size: 2rem;
font-weight: 700;
line-height: 1.2;
margin-bottom: 0.25rem;
}
.stat-description {
color: var(--secondary-text);
font-size: 0.875rem;
}
.expand-button {
background: none;
border: none;
color: var(--primary-color);
font-size: 0.875rem;
padding: 0.5rem;
cursor: pointer;
display: block;
margin: 0 auto;
box-shadow: none;
}
.expand-button:hover {
text-decoration: underline;
transform: none;
box-shadow: none;
}
.error-message {
color: #EF4444;
background-color: #3a1f1f;
border: 1px solid #562626;
padding: 1rem;
border-radius: 0.5rem;
margin-bottom: 1rem;
display: none;
}
.display-limit-notice {
background-color: #4b2b07;
border: 1px solid #7c4a02;
color: #FFD591;
padding: 0.75rem;
border-radius: 0.5rem;
margin-top: 1rem;
font-size: 0.875rem;
display: none;
}
/* File drop zone styles */
.file-drop-zone {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(15, 79, 155, 0.15);
z-index: 1000;
display: flex;
justify-content: center;
align-items: center;
opacity: 0;
pointer-events: none;
transition: opacity 0.3s ease;
}
.file-drop-zone.active {
opacity: 1;
pointer-events: all;
}
.drop-indicator {
background-color: var(--card-bg);
border: 2px dashed var(--primary-color);
border-radius: 1rem;
padding: 2rem;
text-align: center;
width: 60%;
max-width: 400px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
animation: pulse 2s infinite;
}
@keyframes pulse {
0% { transform: scale(1); }
50% { transform: scale(1.05); }
100% { transform: scale(1); }
}
.drop-indicator p {
margin-bottom: 0.5rem;
color: var(--text-color);
font-size: 1.2rem;
}
.file-icon {
font-size: 3rem;
margin-bottom: 1rem;
color: var(--primary-color);
}
.file-upload-icon {
position: fixed;
bottom: 20px;
left: 20px;
width: 45px;
height: 45px;
background-color: var(--card-bg);
border-radius: 50%;
display: flex;
justify-content: center;
align-items: center;
cursor: pointer;
z-index: 100;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
.file-upload-icon:hover {
transform: translateY(-2px);
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
}
.file-upload-icon span {
font-size: 1.5rem;
color: var(--primary-color);
}
.file-info {
position: fixed;
bottom: 20px;
left: 75px;
background-color: var(--card-bg);
color: var(--primary-color);
font-weight: 500;
padding: 0.5rem 1rem;
border-radius: 1rem;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
max-width: 270px;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
z-index: 100;
display: none;
}
.file-detach {
margin-left: 8px;
display: inline-block;
width: 18px;
height: 18px;
background-color: rgba(255, 255, 255, 0.1);
color: var(--text-color);
border-radius: 50%;
text-align: center;
line-height: 16px;
font-size: 12px;
cursor: pointer;
transition: all 0.2s ease;
}
.file-detach:hover {
background-color: rgba(255, 0, 0, 0.2);
color: #ff6b6b;
transform: scale(1.1);
}
.preview-notice {
background-color: #273c56;
border: 1px solid #365a82;
color: #89b4e8;
padding: 0.75rem;
border-radius: 0.5rem;
margin-top: 1rem;
font-size: 0.875rem;
display: none;
}
@media (max-width: 768px) {
.header {
flex-direction: column;
align-items: stretch;
gap: 1rem;
}
.model-selector {
width: 100%;
}
.stats-grid {
grid-template-columns: 1fr;
}
}
</style>
</head>
<body>
<!-- Hidden File Drop Zone that appears when dragging files -->
<div id="fileDropZone" class="file-drop-zone">
<div class="drop-indicator">
<div class="file-icon">📄</div>
<p>Drop your file here</p>
</div>
</div>
<!-- File upload icon in bottom left corner -->
<div id="fileUploadIcon" class="file-upload-icon">
<span>📎</span>
</div>
<p class="file-info" id="fileInfo"></p>
<div class="container">
<div class="header">
<div class="title-section">
<h1 class="title">Token Visualizer</h1>
<p class="subtitle">Advanced tokenization analysis and visualization</p>
</div>
<div class="model-selector">
<select id="modelSelect" name="model">
{% for model_id, info in models.items() %}
<option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
{{ info.alias }}
</option>
{% endfor %}
</select>
</div>
</div>
<div class="error-message" id="errorMessage"></div>
<div class="input-section">
<form id="analyzeForm" method="POST" enctype="multipart/form-data">
<textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
<input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
<input type="file" name="file" id="fileInput" style="display: none;">
<div class="button-container">
<button type="submit" id="analyzeButton">Analyze Text</button>
</div>
</form>
</div>
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
<div class="card">
<h2 class="card-title">Token Visualization</h2>
<div class="preview-notice" id="previewNotice">
Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
</div>
<div class="token-container" id="tokenContainer">
{% if token_data %}
{% for token in token_data.tokens %}
<span class="token"
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
{{ token.display }}
</span>
{% if token.newline %}<br>{% endif %}
{% endfor %}
{% endif %}
</div>
<button class="expand-button" id="expandButton">Show More</button>
<div class="display-limit-notice" id="displayLimitNotice">
Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
</div>
</div>
<div class="stats-grid">
<div class="stat-card">
<div class="stat-title">Total Tokens</div>
<div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
<div class="stat-description">
<span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
(<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
</div>
</div>
<div class="stat-card">
<div class="stat-title">Token Types</div>
<div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
<div class="stat-description">special tokens</div>
</div>
<div class="stat-card">
<div class="stat-title">Whitespace</div>
<div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
<div class="stat-description">
spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
</div>
</div>
<div class="stat-card">
<div class="stat-title">Token Length</div>
<div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
<div class="stat-description">
median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
</div>
</div>
<div class="stat-card">
<div class="stat-title">Compression</div>
<div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
<div class="stat-description">characters per token</div>
</div>
</div>
</div>
</div>
<script>
$(document).ready(function() {
// File handling variables
let currentFile = null;
let originalTextContent = null;
let lastUploadedFileName = null;
let fileJustUploaded = false; // Flag to prevent immediate detachment
function showError(message) {
const errorDiv = $('#errorMessage');
errorDiv.text(message);
errorDiv.show();
setTimeout(() => errorDiv.fadeOut(), 5000);
}
function updateResults(data) {
$('#results').show();
// Update tokens
const tokenContainer = $('#tokenContainer');
tokenContainer.empty();
data.tokens.forEach(token => {
const span = $('<span>')
.addClass('token')
.css({
'background-color': token.colors.background,
'color': token.colors.text
})
// Include token id in the tooltip on hover
.attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
.text(token.display);
tokenContainer.append(span);
if (token.newline) {
tokenContainer.append('<br>');
}
});
// Update display limit notice
if (data.display_limit_reached) {
$('#displayLimitNotice').show();
$('#totalTokenCount').text(data.total_tokens);
} else {
$('#displayLimitNotice').hide();
}
// Update preview notice
if (data.preview_only) {
$('#previewNotice').show();
} else {
$('#previewNotice').hide();
}
// Update basic stats
$('#totalTokens').text(data.stats.basic_stats.total_tokens);
$('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
$('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
$('#specialTokens').text(data.stats.basic_stats.special_tokens);
$('#spaceTokens').text(data.stats.basic_stats.space_tokens);
$('#spaceCount').text(data.stats.basic_stats.space_tokens);
$('#newlineCount').text(data.stats.basic_stats.newline_tokens);
$('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
// Update length stats
$('#avgLength').text(data.stats.length_stats.avg_length);
$('#medianLength').text(data.stats.length_stats.median_length);
$('#stdDev').text(data.stats.length_stats.std_dev);
}
// Handle text changes to detach file
$('#textInput').on('input', function() {
// Skip if file was just uploaded (prevents immediate detachment)
if (fileJustUploaded) {
fileJustUploaded = false;
return;
}
const currentText = $(this).val();
const fileInput = document.getElementById('fileInput');
// Only detach if a file exists and text has been substantially modified
if (fileInput.files.length > 0 && originalTextContent !== null) {
// Check if the text is completely different or has been significantly changed
// This allows for small edits without detaching
const isMajorChange =
currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
(currentText.length > 0 &&
currentText !== originalTextContent.substring(0, currentText.length) &&
currentText.substring(0, Math.min(20, currentText.length)) !==
originalTextContent.substring(0, Math.min(20, currentText.length)));
if (isMajorChange) {
detachFile();
}
}
});
// Function to detach file
function detachFile() {
// Clear the file input
$('#fileInput').val('');
// Hide file info
$('#fileInfo').fadeOut(300);
// Reset the original content tracker
originalTextContent = $('#textInput').val();
// Reset last uploaded filename
lastUploadedFileName = null;
}
// For model changes
$('#modelSelect').change(function() {
const selectedModel = $(this).val();
$('#modelInput').val(selectedModel);
// If text exists, submit the form
if ($('#textInput').val().trim()) {
$('#analyzeForm').submit();
}
});
// File drop handling
const fileDropZone = $('#fileDropZone');
const fileUploadIcon = $('#fileUploadIcon');
// Prevent default drag behaviors
['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
fileDropZone[0].addEventListener(eventName, preventDefaults, false);
document.body.addEventListener(eventName, preventDefaults, false);
});
function preventDefaults(e) {
e.preventDefault();
e.stopPropagation();
}
// Show drop zone when file is dragged over the document
document.addEventListener('dragenter', showDropZone, false);
document.addEventListener('dragover', showDropZone, false);
fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
fileDropZone[0].addEventListener('drop', hideDropZone, false);
function showDropZone(e) {
fileDropZone.addClass('active');
}
function hideDropZone() {
fileDropZone.removeClass('active');
}
// Handle dropped files
fileDropZone[0].addEventListener('drop', handleDrop, false);
function handleDrop(e) {
const dt = e.dataTransfer;
const files = dt.files;
handleFiles(files);
}
// Also handle file selection via click on the icon
fileUploadIcon.on('click', function() {
const input = document.createElement('input');
input.type = 'file';
input.onchange = e => {
handleFiles(e.target.files);
};
input.click();
});
function handleFiles(files) {
if (files.length) {
const file = files[0];
currentFile = file;
lastUploadedFileName = file.name;
fileJustUploaded = true; // Set flag to prevent immediate detachment
// Show file info with animation and add detach button
$('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
// Add click handler for detach button
$('#fileDetach').on('click', function(e) {
e.stopPropagation(); // Prevent event bubbling
detachFile();
return false;
});
// Set the file to the file input
const dataTransfer = new DataTransfer();
dataTransfer.items.add(file);
document.getElementById('fileInput').files = dataTransfer.files;
// Preview in textarea (first 8096 chars)
const reader = new FileReader();
reader.onload = function(e) {
const previewText = e.target.result.slice(0, 8096);
$('#textInput').val(previewText);
// Store this as the original content AFTER setting the value
// to prevent the input event from firing and detaching immediately
setTimeout(() => {
originalTextContent = previewText;
// Automatically submit for analysis
$('#analyzeForm').submit();
}, 50);
};
reader.readAsText(file);
}
}
function formatFileSize(bytes) {
if (bytes < 1024) return bytes + ' bytes';
else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
else return (bytes / 1048576).toFixed(1) + ' MB';
}
// Make sure to check if there's still a file when analyzing
$('#analyzeForm').on('submit', function(e) {
e.preventDefault();
// Skip detachment check if file was just uploaded
if (!fileJustUploaded) {
// Check if text has been changed but file is still attached
const textInput = $('#textInput').val();
const fileInput = document.getElementById('fileInput');
if (fileInput.files.length > 0 &&
originalTextContent !== null &&
textInput !== originalTextContent &&
textInput.length < originalTextContent.length * 0.8) {
// Text was significantly changed but file is still attached, detach it
detachFile();
}
} else {
// Reset flag after first submission
fileJustUploaded = false;
}
const formData = new FormData(this);
$('#analyzeButton').prop('disabled', true);
$.ajax({
url: '/',
method: 'POST',
data: formData,
processData: false,
contentType: false,
success: function(response) {
if (response.error) {
showError(response.error);
} else {
updateResults(response);
}
},
error: function(xhr) {
showError(xhr.responseText || 'An error occurred while processing the text');
},
complete: function() {
$('#analyzeButton').prop('disabled', false);
}
});
});
$('#expandButton').click(function() {
const container = $('#tokenContainer');
const isExpanded = container.hasClass('expanded');
container.toggleClass('expanded');
$(this).text(isExpanded ? 'Show More' : 'Show Less');
});
});
</script>
</body>
</html>
"""
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the visualizer page and handle analysis requests.

    GET renders the empty page. POST analyzes either an uploaded file
    (form field 'file') or, if no file is attached, the 'text' field, using
    the tokenizer chosen by the 'model' query/form parameter (default
    'mistral-small').

    Requests carrying 'X-Requested-With: XMLHttpRequest' get JSON: the
    process_text result on success, or {"error": ...} with status 400 on
    failure or when neither text nor a file was sent. Other requests get the
    rendered HTML page.
    """
    text = ""
    token_data = None
    selected_model = request.args.get('model', request.form.get('model', 'mistral-small'))
    is_ajax = request.headers.get('X-Requested-With') == 'XMLHttpRequest'

    def render_page(data, **extra):
        # Render the page with the current text/model plus any extra context.
        return render_template_string(
            HTML_TEMPLATE,
            text=text,
            token_data=data,
            models=TOKENIZER_MODELS,
            selected_model=selected_model,
            **extra
        )

    def fail(message):
        # Report an error in the format the caller expects.
        if is_ajax:
            return jsonify({"error": message}), 400
        return render_page(None, error=message)

    if request.method == 'POST':
        uploaded_file = request.files.get('file')
        if uploaded_file is not None and uploaded_file.filename:
            # Store the upload under a server-generated name. The client's
            # filename is untrusted (it may contain '../' or be absolute)
            # and identical names from concurrent requests would collide.
            upload_dir = app.config['UPLOAD_FOLDER']
            os.makedirs(upload_dir, exist_ok=True)
            fd, file_path = tempfile.mkstemp(prefix='upload_', dir=upload_dir)
            os.close(fd)
            try:
                uploaded_file.save(file_path)
                # Read a small preview of the file for the text area
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read(8096)
                token_data = process_text("", selected_model, is_full_file=True, file_path=file_path)
            except Exception as e:
                app.logger.exception("Failed to analyze uploaded file")
                return fail(str(e))
            finally:
                # Always remove the staged file, whether or not analysis worked.
                if os.path.exists(file_path):
                    os.remove(file_path)
            if is_ajax:
                return jsonify(token_data)
        else:
            # Regular text processing
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, selected_model)
                except Exception as e:
                    app.logger.exception("Failed to analyze text")
                    return fail(str(e))
                if is_ajax:
                    return jsonify(token_data)
            elif is_ajax:
                # An AJAX caller expects JSON, never the HTML page.
                return fail("No text or file provided")

    return render_page(token_data)
if __name__ == "__main__":
    # Script entry point: start the Flask server on port 7860, bound to
    # all interfaces (0.0.0.0).
    app.run(port=7860, host='0.0.0.0')