# tokenizers/app.py — Token Visualizer (Hugging Face Space "barttee/tokenizers")
from transformers import AutoTokenizer
from flask import Flask, request, render_template_string, jsonify
import hashlib
import sys
import math
import os
import time
app = Flask(__name__)
# Allow request bodies up to 25MB so larger files can be tokenized.
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024
# Directory where uploaded files are staged for chunked tokenization.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
# exist_ok=True avoids the check-then-create race of the previous
# `if not os.path.exists(...): os.makedirs(...)` guard.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Predefined tokenizer models with aliases
# Maps a short internal id -> {'name': HuggingFace hub repo path,
# 'alias': human-readable label shown in the UI dropdown}.
TOKENIZER_MODELS = {
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', #same as meta-llama/Llama-4-Maverick-17B-128E-Instruct or meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 or meta-llama/Llama-4-Scout-17B-16E etc
        'alias': 'Llama 4'
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1'
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B'
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1'
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # NOTE(review): alias reads "QWQ 32B" but the repo is Qwen2.5-72B.
        # Looks like a leftover from a model swap — confirm intended label.
        'alias': 'QWQ 32B'
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B'
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B'
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased'
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2'
    }
}
# Initialize tokenizers dict
# Cache of predefined tokenizers keyed by short model id (never expires).
tokenizers = {}
# Dictionary to store custom model loading errors
# Last load error per custom model path; cleared on a successful reload.
custom_model_errors = {}
# Cache for custom tokenizers with timestamp
# Maps custom model path -> (tokenizer, load_timestamp) for expiry checks.
custom_tokenizers = {}
# Cache for tokenizer info
# Memoized get_tokenizer_info() results, keyed like the caches above.
tokenizer_info_cache = {}
# Cache expiration time (1 hour)
CACHE_EXPIRATION = 3600 # seconds
def get_tokenizer_info(tokenizer):
    """Collect displayable metadata about a tokenizer.

    Returns a dict that may contain 'vocab_size', 'model_max_length',
    'tokenizer_type' and 'special_tokens'. Never raises: any failure is
    reported under an 'error' key instead.
    """
    details = {}
    try:
        # Vocabulary size: prefer the attribute, fall back to counting vocab.
        if hasattr(tokenizer, 'vocab_size'):
            details['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            details['vocab_size'] = len(tokenizer.get_vocab())
        # Context window, skipping the huge sentinel values some tokenizers
        # report when no real limit is configured.
        if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
            details['model_max_length'] = tokenizer.model_max_length
        # Record the concrete tokenizer class name.
        details['tokenizer_type'] = type(tokenizer).__name__
        # Gather whichever special tokens are actually set and non-empty.
        special_names = ('pad_token', 'eos_token', 'bos_token', 'sep_token',
                         'cls_token', 'unk_token', 'mask_token')
        specials = {}
        for name in special_names:
            value = getattr(tokenizer, name, None)
            if value and str(value).strip():
                specials[name] = str(value)
        details['special_tokens'] = specials
    except Exception as exc:
        details['error'] = f"Error extracting tokenizer info: {str(exc)}"
    return details
# SECURITY FIX: this assignment previously held obfuscated code that, when
# exec'd later in this file, warmed up every predefined tokenizer and
# deliberately hard-crashed the whole process (null-pointer read via
# `__import__('ctypes').string_at(0)`) if any single download failed.
# The payload is replaced with an equivalent warm-up that logs failures.
c = (
    "for _key, _cfg in TOKENIZER_MODELS.items():\n"
    "    try:\n"
    "        AutoTokenizer.from_pretrained(_cfg['name'])\n"
    "    except Exception as _err:\n"
    "        print('Tokenizer warm-up failed for ' + _cfg['name'] + ': ' + str(_err))\n"
)
def load_tokenizer(model_id_or_name):
    """
    Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.
    Returns a tuple of (tokenizer, tokenizer_info, error_message)

    Predefined ids resolve through TOKENIZER_MODELS and are cached in the
    module-level `tokenizers` dict indefinitely. Any other value is treated
    as a HuggingFace repo path and cached in `custom_tokenizers` for
    CACHE_EXPIRATION seconds. Extracted metadata is memoized in
    `tokenizer_info_cache` keyed by the same identifier. On failure the
    tokenizer element is None and the message is also recorded in
    `custom_model_errors`.
    """
    error_message = None
    tokenizer_info = {}
    # Check if we have cached tokenizer info (kept so it can be returned
    # even when the load below fails).
    if model_id_or_name in tokenizer_info_cache:
        tokenizer_info = tokenizer_info_cache[model_id_or_name]
    try:
        # Check if it's a predefined model ID
        if model_id_or_name in TOKENIZER_MODELS:
            model_name = TOKENIZER_MODELS[model_id_or_name]['name']
            if model_id_or_name not in tokenizers:
                tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
            tokenizer = tokenizers[model_id_or_name]
            # Get tokenizer info if not already cached
            if model_id_or_name not in tokenizer_info_cache:
                tokenizer_info = get_tokenizer_info(tokenizer)
                tokenizer_info_cache[model_id_or_name] = tokenizer_info
            return tokenizer, tokenizer_info, None
        # It's a custom model path
        # Check if we have it in the custom cache and it's not expired
        current_time = time.time()
        if model_id_or_name in custom_tokenizers:
            cached_tokenizer, timestamp = custom_tokenizers[model_id_or_name]
            if current_time - timestamp < CACHE_EXPIRATION:
                # Get tokenizer info if not already cached
                if model_id_or_name not in tokenizer_info_cache:
                    tokenizer_info = get_tokenizer_info(cached_tokenizer)
                    tokenizer_info_cache[model_id_or_name] = tokenizer_info
                return cached_tokenizer, tokenizer_info, None
        # Not in cache or expired, load it (may hit the network)
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
        # Store in cache with timestamp
        custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
        # Clear any previous errors for this model
        if model_id_or_name in custom_model_errors:
            del custom_model_errors[model_id_or_name]
        # Get tokenizer info (always refreshed after a (re)load)
        tokenizer_info = get_tokenizer_info(tokenizer)
        tokenizer_info_cache[model_id_or_name] = tokenizer_info
        return tokenizer, tokenizer_info, None
    except Exception as e:
        error_message = f"Failed to load tokenizer: {str(e)}"
        # Store error for future reference
        custom_model_errors[model_id_or_name] = error_message
        return None, tokenizer_info, error_message
# SECURITY FIX: was `exec(c)`, which ran an obfuscated string that crashed
# the interpreter (ctypes null-pointer read) whenever any predefined
# tokenizer failed to download. Warm the HF cache explicitly instead and
# log failures rather than crashing the process.
for _warm_key, _warm_cfg in TOKENIZER_MODELS.items():
    try:
        AutoTokenizer.from_pretrained(_warm_cfg['name'])
    except Exception as _warm_err:
        print(f"Tokenizer warm-up failed for {_warm_cfg['name']}: {_warm_err}")
def get_varied_color(token: str) -> dict:
    """Derive a deterministic HSL color pair for a token.

    The token's MD5 digest seeds hue, saturation and lightness so that equal
    tokens always render with the same background, and the text color is
    chosen to contrast with the background lightness.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    hue = int(digest[0:3], 16) % 360
    saturation = 70 + int(digest[3:5], 16) % 20
    lightness = 80 + int(digest[5:7], 16) % 10
    # Dark text on light backgrounds, light text on dark ones.
    if lightness > 50:
        text_lightness = 20
    else:
        text_lightness = 90
    background = f'hsl({hue}, {saturation}%, {lightness}%)'
    foreground = f'hsl({hue}, {saturation}%, {text_lightness}%)'
    return {'background': background, 'text': foreground}
def fix_token(token: str) -> str:
    """Prepare a token for display, replacing leading 'Ġ' markers with '·'.

    GPT-2-style BPE tokenizers encode a leading space as 'Ġ'; each leading
    marker is rendered as a middle dot so spacing stays visible.

    Fixes a bug where `token.count('Ġ')` counted *every* occurrence but the
    code stripped a prefix of that length, dropping real characters from
    tokens containing 'Ġ' in the middle (e.g. 'ĠaĠb' -> '··Ġb', losing 'a').
    """
    if token.startswith('Ġ'):
        # Count only the leading run of markers, not every occurrence.
        leading = len(token) - len(token.lstrip('Ġ'))
        return '·' * leading + token[leading:]
    return token
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate summary statistics for a tokenization result.

    Args:
        tokens: token strings produced by the tokenizer.
        original_text: text the tokens were derived from; only its length is
            used (characters-per-token compression ratio).

    Returns:
        Dict with 'basic_stats' (counts and ratios) and 'length_stats'
        (token-length distribution), or {} when there are no tokens.
    """
    if not tokens:
        return {}
    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens
    # Token type analysis ('Ġ' marks a leading space, 'Ċ' a newline in
    # GPT-2-style BPE vocabularies).
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in '<>[]{}'))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))
    # Length distribution: compute the lengths once and reuse them for mean,
    # variance and order statistics (the mean was previously computed twice
    # as both `avg_length` and `mean_length`).
    lengths = [len(t) for t in tokens]
    mean_length = sum(lengths) / total_tokens
    variance = sum((x - mean_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)
    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(mean_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': sorted(lengths)[total_tokens // 2]
        }
    }
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Process text and return tokenization data.

    Args:
        text: raw input text (ignored for stats when file_path is given).
        model_id_or_name: predefined model id or a HuggingFace repo path.
        is_full_file: True when the input came from an uploaded file; only a
            preview is rendered but stats cover the whole content.
        file_path: path to the uploaded file on disk, if any.

    Returns:
        Dict with display-ready token data, aggregate stats, token counts
        and tokenizer metadata for the UI template.

    Raises:
        Exception: when the tokenizer cannot be loaded.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)
    if error:
        raise Exception(error)
    # For file uploads, read only preview from file but process full file for stats
    if file_path and is_full_file:
        # Read the preview for display (first 8096 chars — matches the
        # preview notice text in the HTML template).
        with open(file_path, 'r', errors='replace') as f:
            preview_text = f.read(8096)
        # Tokenize preview for display
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]
        # Process full file for stats in chunks to avoid memory issues
        total_tokens = []
        token_set = set()
        total_length = 0
        chunk_size = 1024 * 1024  # 1MB chunks
        with open(file_path, 'r', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                # NOTE(review): tokenizing each chunk independently can split
                # tokens at chunk boundaries, so counts are approximate for
                # very large files.
                chunk_tokens = tokenizer.tokenize(chunk)
                total_tokens.extend(chunk_tokens)
                token_set.update(chunk_tokens)
        # Calculate stats
        stats = get_token_stats(total_tokens, ' ' * total_length)  # Approximation for original text
    else:
        # Standard processing for normal text input
        all_tokens = tokenizer.tokenize(text)
        total_token_count = len(all_tokens)
        # For display: if it's a preview, only take first 8096 chars
        preview_text = text[:8096] if is_full_file else text
        preview_tokens = tokenizer.tokenize(preview_text)
        display_tokens = preview_tokens[:50000]
        # Always use full text for stats
        stats = get_token_stats(all_tokens, text)
    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        fixed_token = fix_token(token)
        # Compute the numerical token ID from the tokenizer
        token_id = tokenizer.convert_tokens_to_ids(token)
        token_data.append({
            'original': token,
            # A trailing 'Ċ' (BPE newline marker) is stripped from the
            # display text; the 'newline' flag lets the template emit <br>.
            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
            'colors': colors,
            'newline': fixed_token.endswith('Ċ'),
            'token_id': token_id,
            'token_index': idx
        })
    # Use the appropriate token count based on processing method
    total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens)
    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info  # Include tokenizer info
    }
# HTML template with enhanced modern styling
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>Token Visualizer</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<style>
:root {
--primary-color: #0f4f9b; /* Blue accent */
--primary-hover: #0c3e7a; /* Darker blue accent */
--bg-color: #121212; /* Dark background */
--card-bg: #1e1e1e; /* Dark card background */
--card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
0 2px 4px -1px rgba(0, 0, 0, 0.6);
--transition: all 0.3s ease;
--text-color: #E0E0E0; /* Main text color */
--secondary-text: #A0A0A0;/* Secondary text color */
--input-bg: #2a2a2a; /* Input/textarea background */
--input-border: #444444; /* Input/textarea border */
--input-focus: #0f4f9b; /* Focus border color */
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
scrollbar-width: thin;
scrollbar-color: #0f4f9b #121212
}
/* Width and height of the scrollbar */
::-webkit-scrollbar {
width: 12px;
height: 12px;
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
/* Track (background) */
::-webkit-scrollbar-track {
background: #121212;
border-radius: 10px;
}
/* Handle (draggable part) */
::-webkit-scrollbar-thumb {
background: #0f4f9b;
border-radius: 10px;
border: 2px solid #121212;
}
/* Handle on hover */
::-webkit-scrollbar-thumb:hover {
background: #0c3e7a;
}
body {
background-color: var(--bg-color);
padding: 2rem;
min-height: 100vh;
background-image:
radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
color: var(--text-color);
}
.container {
max-width: 1200px;
margin: 0 auto;
}
.header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 2rem;
position: relative;
}
.title-section {
flex-grow: 1;
}
.title {
font-size: 2.5rem;
font-weight: 800;
color: var(--primary-color);
margin-bottom: 0.5rem;
}
.subtitle {
color: var(--secondary-text);
font-size: 1.1rem;
}
.model-selector {
position: relative;
min-width: 200px;
}
.model-selector-header {
display: flex;
gap: 0.5rem;
margin-bottom: 0.5rem;
}
.model-type-toggle {
display: flex;
background-color: var(--card-bg);
border-radius: 0.5rem;
padding: 0.25rem;
overflow: hidden;
}
.toggle-option {
padding: 0.5rem 0.75rem;
font-size: 0.8rem;
font-weight: 500;
cursor: pointer;
transition: var(--transition);
border-radius: 0.375rem;
color: var(--secondary-text);
}
.toggle-option.active {
background-color: var(--primary-color);
color: white;
}
select {
width: 100%;
padding: 0.75rem 1rem;
border: 2px solid var(--input-border);
border-radius: 0.5rem;
font-size: 1rem;
color: var(--text-color);
background-color: var(--input-bg);
cursor: pointer;
transition: var(--transition);
appearance: none;
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
background-repeat: no-repeat;
background-position: right 1rem center;
background-size: 1.5rem;
}
select:hover, .custom-model-input:hover {
border-color: var(--primary-color);
}
select:focus, .custom-model-input:focus {
outline: none;
border-color: var(--primary-color);
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
}
.custom-model-input {
width: 100%;
padding: 0.75rem 1rem;
border: 2px solid var(--input-border);
border-radius: 0.5rem;
font-size: 1rem;
color: var(--text-color);
background-color: var(--input-bg);
transition: var(--transition);
}
.input-section {
margin-bottom: 2rem;
}
textarea {
width: 100%;
height: 150px;
padding: 1.25rem;
border: 2px solid var(--input-border);
border-radius: 0.75rem;
resize: vertical;
font-size: 1rem;
margin-bottom: 1rem;
transition: var(--transition);
background-color: var(--input-bg);
color: var(--text-color);
}
textarea:focus {
outline: none;
border-color: var(--input-focus);
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
}
.button-container {
display: flex;
justify-content: center;
width: 100%;
gap: 1rem;
}
button {
padding: 0.875rem 2.5rem;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
color: #fff;
border: none;
border-radius: 0.75rem;
font-size: 1.1rem;
font-weight: 600;
cursor: pointer;
transition: var(--transition);
box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
}
button:hover {
transform: translateY(-2px);
box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
}
button:active {
transform: translateY(0);
}
button:disabled {
opacity: 0.7;
cursor: not-allowed;
}
.card {
background-color: var(--card-bg);
border-radius: 1rem;
box-shadow: var(--card-shadow);
padding: 1.5rem;
margin-bottom: 2rem;
transition: var(--transition);
}
.card:hover {
transform: translateY(-2px);
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
}
.card-title {
font-size: 1.25rem;
font-weight: 700;
color: var(--text-color);
margin-bottom: 1.25rem;
display: flex;
align-items: center;
gap: 0.5rem;
cursor: pointer;
}
.card-title::before {
content: '';
display: block;
width: 4px;
height: 1.25rem;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
border-radius: 2px;
}
.token-container {
display: flex;
flex-wrap: wrap;
gap: 0.375rem;
margin-bottom: 1rem;
padding: 1rem;
background-color: #2a2a2a;
border-radius: 0.5rem;
max-height: 200px;
overflow-y: auto;
transition: max-height 0.3s ease;
}
.token-container.expanded {
max-height: none;
}
.token {
padding: 0.375rem 0.75rem;
border-radius: 0.375rem;
background-color: var(--input-bg);
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
font-size: 0.875rem;
color: var(--text-color);
cursor: default;
transition: var(--transition);
box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
}
.token:hover {
transform: translateY(-1px);
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1.5rem;
margin-bottom: 2rem;
}
.stat-card {
background-color: var(--card-bg);
padding: 1.5rem;
border-radius: 1rem;
box-shadow: var(--card-shadow);
transition: var(--transition);
}
.stat-card:hover {
transform: translateY(-2px);
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
}
.stat-title {
color: var(--secondary-text);
font-size: 0.875rem;
font-weight: 500;
margin-bottom: 0.5rem;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.stat-value {
color: var(--text-color);
font-size: 2rem;
font-weight: 700;
line-height: 1.2;
margin-bottom: 0.25rem;
}
.stat-description {
color: var(--secondary-text);
font-size: 0.875rem;
}
.expand-button {
background: none;
border: none;
color: var(--primary-color);
font-size: 0.875rem;
padding: 0.5rem;
cursor: pointer;
display: block;
margin: 0 auto;
box-shadow: none;
}
.expand-button:hover {
text-decoration: underline;
transform: none;
box-shadow: none;
}
.error-message {
color: #EF4444;
background-color: #3a1f1f;
border: 1px solid #562626;
padding: 1rem;
border-radius: 0.5rem;
margin-bottom: 1rem;
display: none;
}
.display-limit-notice {
background-color: #4b2b07;
border: 1px solid #7c4a02;
color: #FFD591;
padding: 0.75rem;
border-radius: 0.5rem;
margin-top: 1rem;
font-size: 0.875rem;
display: none;
}
/* File drop zone styles */
.file-drop-zone {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(15, 79, 155, 0.15);
z-index: 1000;
display: flex;
justify-content: center;
align-items: center;
opacity: 0;
pointer-events: none;
transition: opacity 0.3s ease;
}
.file-drop-zone.active {
opacity: 1;
pointer-events: all;
}
.drop-indicator {
background-color: var(--card-bg);
border: 2px dashed var(--primary-color);
border-radius: 1rem;
padding: 2rem;
text-align: center;
width: 60%;
max-width: 400px;
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
animation: pulse 2s infinite;
}
@keyframes pulse {
0% { transform: scale(1); }
50% { transform: scale(1.05); }
100% { transform: scale(1); }
}
.drop-indicator p {
margin-bottom: 0.5rem;
color: var(--text-color);
font-size: 1.2rem;
}
.file-icon {
font-size: 3rem;
margin-bottom: 1rem;
color: var(--primary-color);
}
.file-upload-icon {
position: fixed;
bottom: 20px;
left: 20px;
width: 45px;
height: 45px;
background-color: var(--card-bg);
border-radius: 50%;
display: flex;
justify-content: center;
align-items: center;
cursor: pointer;
z-index: 100;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
.file-upload-icon:hover {
transform: translateY(-2px);
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
}
.file-upload-icon span {
font-size: 1.5rem;
color: var(--primary-color);
}
.file-info {
position: fixed;
bottom: 20px;
left: 75px;
background-color: var(--card-bg);
color: var(--primary-color);
font-weight: 500;
padding: 0.5rem 1rem;
border-radius: 1rem;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
max-width: 270px;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
z-index: 100;
display: none;
}
.file-detach {
margin-left: 8px;
display: inline-block;
width: 18px;
height: 18px;
background-color: rgba(255, 255, 255, 0.1);
color: var(--text-color);
border-radius: 50%;
text-align: center;
line-height: 16px;
font-size: 12px;
cursor: pointer;
transition: all 0.2s ease;
}
.file-detach:hover {
background-color: rgba(255, 0, 0, 0.2);
color: #ff6b6b;
transform: scale(1.1);
}
.preview-notice {
background-color: #273c56;
border: 1px solid #365a82;
color: #89b4e8;
padding: 0.75rem;
border-radius: 0.5rem;
margin-top: 1rem;
font-size: 0.875rem;
display: none;
}
.custom-model-wrapper {
position: relative;
}
.model-badge {
position: absolute;
top: -10px;
right: -5px;
background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
color: white;
font-size: 0.7rem;
font-weight: 700;
padding: 0.25rem 0.5rem;
border-radius: 999px;
transform: scale(0);
transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
z-index: 10;
}
.model-badge.show {
transform: scale(1);
}
.custom-model-help {
display: inline-block;
width: 16px;
height: 16px;
line-height: 16px;
font-size: 11px;
font-weight: bold;
text-align: center;
background-color: var(--secondary-text);
color: var(--card-bg);
border-radius: 50%;
margin-left: 5px;
cursor: help;
vertical-align: middle;
}
.tooltip {
position: absolute;
top: 100%;
left: 0;
width: 280px;
background-color: #333;
color: #fff;
padding: 0.75rem;
border-radius: 0.5rem;
font-size: 0.8rem;
margin-top: 0.5rem;
z-index: 100;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
opacity: 0;
visibility: hidden;
transition: opacity 0.2s, visibility 0.2s;
}
.custom-model-help:hover + .tooltip {
opacity: 1;
visibility: visible;
}
/* Tokenizer info icon and tooltip styles */
.tokenizer-info-icon {
display: inline-flex;
align-items: center;
justify-content: center;
width: 24px;
height: 24px;
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
color: white;
border-radius: 50%;
position: absolute;
left: -32px; /* Position to the left of the selector */
top: 50%;
transform: translateY(-50%);
cursor: pointer;
font-size: 12px;
font-weight: bold;
transition: all 0.2s ease;
z-index: 10;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
}
.tokenizer-info-icon:hover {
transform: translateY(-50%) scale(1.1);
box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
}
/* Watermark styles */
.watermark {
position: fixed;
bottom: 20px;
right: 20px;
color: var(--primary-color);
font-size: 1.4rem;
font-weight: 700;
opacity: 0.25; /* Semi-transparent */
z-index: 100;
transition: opacity 0.3s ease;
text-decoration: none;
pointer-events: auto; /* Ensure it remains clickable */
}
.watermark:hover {
opacity: 0.6; /* Increase opacity on hover */
}
.tokenizer-info-tooltip {
position: absolute;
top: calc(100% + 8px);
left: -30px; /* Adjust position to align with the icon */
width: 300px;
background-color: var(--card-bg);
color: var(--text-color);
border: 1px solid var(--primary-color);
border-radius: 0.75rem;
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
padding: 1rem;
z-index: 1000; /* Increase z-index to ensure visibility */
opacity: 0;
visibility: hidden;
transition: opacity 0.3s, visibility 0.3s;
pointer-events: none; /* Initially disable pointer events */
}
.tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
opacity: 1;
visibility: visible;
pointer-events: auto;
}
.tokenizer-info-tooltip:hover {
opacity: 1;
visibility: visible;
pointer-events: auto;
}
.tokenizer-info-header {
font-size: 1.1rem;
font-weight: 600;
margin-bottom: 0.5rem;
padding-bottom: 0.5rem;
border-bottom: 1px solid rgba(255, 255, 255, 0.1);
color: var(--primary-color);
}
.tokenizer-info-grid {
display: grid;
grid-template-columns: repeat(2, 1fr);
gap: 0.75rem;
margin: 0.75rem 0;
}
.tokenizer-info-item {
display: flex;
flex-direction: column;
}
.tokenizer-info-label {
font-size: 0.75rem;
color: var(--secondary-text);
margin-bottom: 0.25rem;
}
.tokenizer-info-value {
font-size: 0.95rem;
font-weight: 500;
}
.special-tokens-container {
margin-top: 0.75rem;
background-color: rgba(15, 79, 155, 0.1);
border-radius: 0.5rem;
padding: 0.5rem;
max-height: 100px;
overflow-y: auto;
}
.special-token-item {
display: flex;
justify-content: space-between;
margin-bottom: 0.25rem;
font-size: 0.8rem;
}
.token-name {
color: var(--secondary-text);
}
.token-value {
background-color: rgba(255, 255, 255, 0.1);
padding: 1px 4px;
border-radius: 2px;
font-family: monospace;
}
.tokenizer-info-loading {
display: flex;
justify-content: center;
align-items: center;
height: 100px;
}
.tokenizer-info-spinner {
width: 30px;
height: 30px;
border: 3px solid var(--primary-color);
border-radius: 50%;
border-top-color: transparent;
animation: spin 1s linear infinite;
}
.tokenizer-info-error {
color: #f87171;
font-size: 0.9rem;
text-align: center;
padding: 1rem;
}
@media (max-width: 768px) {
.header {
flex-direction: column;
align-items: stretch;
gap: 1rem;
}
.model-selector {
width: 100%;
}
.stats-grid {
grid-template-columns: 1fr;
}
.tokenizer-info-tooltip {
width: 250px;
}
}
</style>
</head>
<body>
<!-- Hidden File Drop Zone that appears when dragging files -->
<div id="fileDropZone" class="file-drop-zone">
<div class="drop-indicator">
<div class="file-icon">📄</div>
<p>Drop your file here</p>
</div>
</div>
<!-- File upload icon in bottom left corner -->
<div id="fileUploadIcon" class="file-upload-icon">
<span>📎</span>
</div>
<p class="file-info" id="fileInfo"></p>
<div class="container">
<div class="header">
<div class="title-section">
<h1 class="title">Token Visualizer</h1>
<p class="subtitle">Advanced tokenization analysis and visualization</p>
</div>
<div class="model-selector">
<div class="model-selector-header">
<div class="model-type-toggle">
<div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
<div class="toggle-option custom-toggle" data-type="custom">Custom</div>
</div>
</div>
<div id="predefinedModelSelector">
<div style="position: relative;">
<div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
<!-- TOOLTIP MOVED HERE -->
<div class="tokenizer-info-tooltip" id="modelInfoTooltip">
<div id="tokenizerInfoContent">
<div class="tokenizer-info-loading">
<div class="tokenizer-info-spinner"></div>
</div>
</div>
</div>
<!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
<select id="modelSelect" name="model">
{% for model_id, info in models.items() %}
<option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
{{ info.alias }}
</option>
{% endfor %}
</select>
</div>
</div>
<div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
<div style="position: relative;">
<div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
<div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
<div id="customTokenizerInfoContent">
<div class="tokenizer-info-loading">
<div class="tokenizer-info-spinner"></div>
</div>
</div>
</div>
<input type="text" id="customModelInput" class="custom-model-input"
placeholder="Enter HuggingFace model path"
value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
</div>
<span class="custom-model-help">?</span>
<div class="tooltip">
Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
The model must have a tokenizer available and must be not restricted. (with some exceptions)
Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
</div>
<div class="model-badge" id="modelSuccessBadge">Loaded</div>
</div>
</div>
</div>
<div class="error-message" id="errorMessage">{{ error }}</div>
<div class="input-section">
<form id="analyzeForm" method="POST" enctype="multipart/form-data">
<textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
<input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
<input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
<input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
<input type="file" name="file" id="fileInput" style="display: none;">
<div class="button-container">
<button type="submit" id="analyzeButton">Analyze Text</button>
</div>
</form>
</div>
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
<div class="card">
<h2 class="card-title">Token Visualization</h2>
<div class="preview-notice" id="previewNotice">
Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
</div>
<div class="token-container" id="tokenContainer">
{% if token_data %}
{% for token in token_data.tokens %}
<span class="token"
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
{{ token.display }}
</span>
{% if token.newline %}<br>{% endif %}
{% endfor %}
{% endif %}
</div>
<button class="expand-button" id="expandButton">Show More</button>
<div class="display-limit-notice" id="displayLimitNotice">
Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
</div>
</div>
<div class="stats-grid">
<div class="stat-card">
<div class="stat-title">Total Tokens</div>
<div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
<div class="stat-description">
<span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
(<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
</div>
</div>
<div class="stat-card">
<div class="stat-title">Token Types</div>
<div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
<div class="stat-description">special tokens</div>
</div>
<div class="stat-card">
<div class="stat-title">Whitespace</div>
<div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
<div class="stat-description">
spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
</div>
</div>
<div class="stat-card">
<div class="stat-title">Token Length</div>
<div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
<div class="stat-description">
median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
</div>
</div>
<div class="stat-card">
<div class="stat-title">Compression</div>
<div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
<div class="stat-description">characters per token</div>
</div>
</div>
</div>
</div>
<a href="https://huggingface.co/spaces/barttee/tokenizers" target="_blank" class="watermark">
@barttee/tokenizers
</a>
<script>
$(document).ready(function() {
// File handling variables
let currentFile = null;
let originalTextContent = null;
let lastUploadedFileName = null;
let fileJustUploaded = false; // Flag to prevent immediate detachment
let currentModelType = "{{ model_type if model_type else 'predefined' }}";
let currentTokenizerInfo = null;
// Try to parse tokenizer info if available from server
try {
currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
if (currentTokenizerInfo) {
updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
}
} catch(e) {
console.error("Error parsing tokenizer info:", e);
}
// Show error if exists
if ("{{ error }}".length > 0) {
showError("{{ error }}");
}
// Setup model type based on initial state
if (currentModelType === "custom") {
$('.toggle-option').removeClass('active');
$('.custom-toggle').addClass('active');
$('#predefinedModelSelector').hide();
$('#customModelSelector').show();
}
// Show success badge if custom model loaded successfully
if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
$('#modelSuccessBadge').addClass('show');
setTimeout(() => {
$('#modelSuccessBadge').removeClass('show');
}, 3000);
}
// Toggle between predefined and custom model inputs
$('.toggle-option').click(function() {
const modelType = $(this).data('type');
$('.toggle-option').removeClass('active');
$(this).addClass('active');
currentModelType = modelType;
if (modelType === 'predefined') {
$('#predefinedModelSelector').show();
$('#customModelSelector').hide();
$('#modelTypeInput').val('predefined');
// Set the model input value to the selected predefined model
$('#modelInput').val($('#modelSelect').val());
} else {
$('#predefinedModelSelector').hide();
$('#customModelSelector').show();
$('#modelTypeInput').val('custom');
}
// Clear tokenizer info if switching models
if (modelType === 'predefined') {
$('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
fetchTokenizerInfo($('#modelSelect').val(), false);
} else {
$('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
// Only fetch if there's a custom model value
const customModel = $('#customModelInput').val();
if (customModel) {
fetchTokenizerInfo(customModel, true);
}
}
});
// Update hidden input when custom model input changes
$('#customModelInput').on('input', function() {
$('#customModelInputHidden').val($(this).val());
});
function showError(message) {
const errorDiv = $('#errorMessage');
errorDiv.text(message);
errorDiv.show();
setTimeout(() => errorDiv.fadeOut(), 5000);
}
// Function to update tokenizer info display in tooltip
function updateTokenizerInfoDisplay(info, isCustom = false) {
const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
let htmlContent = '';
if (info.error) {
$(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
return;
}
// Start building the tooltip content
htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
<div class="tokenizer-info-grid">`;
// Dictionary size
if (info.vocab_size) {
htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Dictionary Size</span>
<span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
</div>`;
}
// Tokenizer type
if (info.tokenizer_type) {
htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Tokenizer Type</span>
<span class="tokenizer-info-value">${info.tokenizer_type}</span>
</div>`;
}
// Max length
if (info.model_max_length) {
htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Max Length</span>
<span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
</div>`;
}
htmlContent += `</div>`; // Close tokenizer-info-grid
// Special tokens section
if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
htmlContent += `
<div class="tokenizer-info-item" style="margin-top: 0.75rem;">
<span class="tokenizer-info-label">Special Tokens</span>
<div class="special-tokens-container">`;
// Add each special token with proper escaping for HTML special characters
for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
// Properly escape HTML special characters
const escapedValue = tokenValue
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#039;');
htmlContent += `
<div class="special-token-item">
<span class="token-name">${tokenName}:</span>
<span class="token-value">${escapedValue}</span>
</div>`;
}
htmlContent += `
</div>
</div>`;
}
$(targetSelector).html(htmlContent);
}
// Function to fetch tokenizer info
function fetchTokenizerInfo(modelId, isCustom = false) {
if (!modelId) return;
const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
$(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
$.ajax({
url: '/tokenizer-info',
method: 'GET',
data: {
model_id: modelId,
is_custom: isCustom
},
success: function(response) {
if (response.error) {
$(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
} else {
currentTokenizerInfo = response;
updateTokenizerInfoDisplay(response, isCustom);
}
},
error: function(xhr) {
$(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
}
});
}
function updateResults(data) {
$('#results').show();
// Update tokens
const tokenContainer = $('#tokenContainer');
tokenContainer.empty();
data.tokens.forEach(token => {
const span = $('<span>')
.addClass('token')
.css({
'background-color': token.colors.background,
'color': token.colors.text
})
// Include token id in the tooltip on hover
.attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
.text(token.display);
tokenContainer.append(span);
if (token.newline) {
tokenContainer.append('<br>');
}
});
// Update display limit notice
if (data.display_limit_reached) {
$('#displayLimitNotice').show();
$('#totalTokenCount').text(data.total_tokens);
} else {
$('#displayLimitNotice').hide();
}
// Update preview notice
if (data.preview_only) {
$('#previewNotice').show();
} else {
$('#previewNotice').hide();
}
// Update basic stats
$('#totalTokens').text(data.stats.basic_stats.total_tokens);
$('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
$('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
$('#specialTokens').text(data.stats.basic_stats.special_tokens);
$('#spaceTokens').text(data.stats.basic_stats.space_tokens);
$('#spaceCount').text(data.stats.basic_stats.space_tokens);
$('#newlineCount').text(data.stats.basic_stats.newline_tokens);
$('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
// Update length stats
$('#avgLength').text(data.stats.length_stats.avg_length);
$('#medianLength').text(data.stats.length_stats.median_length);
$('#stdDev').text(data.stats.length_stats.std_dev);
// Update tokenizer info if available
if (data.tokenizer_info) {
currentTokenizerInfo = data.tokenizer_info;
updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
}
}
// Handle text changes to detach file
$('#textInput').on('input', function() {
// Skip if file was just uploaded (prevents immediate detachment)
if (fileJustUploaded) {
fileJustUploaded = false;
return;
}
const currentText = $(this).val();
const fileInput = document.getElementById('fileInput');
// Only detach if a file exists and text has been substantially modified
if (fileInput.files.length > 0 && originalTextContent !== null) {
// Check if the text is completely different or has been significantly changed
// This allows for small edits without detaching
const isMajorChange =
currentText.length < originalTextContent.length * 0.8 || // Text reduced by at least 20%
(currentText.length > 0 &&
currentText !== originalTextContent.substring(0, currentText.length) &&
currentText.substring(0, Math.min(20, currentText.length)) !==
originalTextContent.substring(0, Math.min(20, currentText.length)));
if (isMajorChange) {
detachFile();
}
}
});
// Function to detach file
function detachFile() {
// Clear the file input
$('#fileInput').val('');
// Hide file info
$('#fileInfo').fadeOut(300);
// Reset the original content tracker
originalTextContent = $('#textInput').val();
// Reset last uploaded filename
lastUploadedFileName = null;
}
// For model changes
$('#modelSelect').change(function() {
const selectedModel = $(this).val();
$('#modelInput').val(selectedModel);
// Fetch tokenizer info for the selected model
fetchTokenizerInfo(selectedModel, false);
// If text exists, submit the form
if ($('#textInput').val().trim()) {
$('#analyzeForm').submit();
}
});
// File drop handling
const fileDropZone = $('#fileDropZone');
const fileUploadIcon = $('#fileUploadIcon');
// Prevent default drag behaviors
['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
fileDropZone[0].addEventListener(eventName, preventDefaults, false);
document.body.addEventListener(eventName, preventDefaults, false);
});
function preventDefaults(e) {
e.preventDefault();
e.stopPropagation();
}
// Show drop zone when file is dragged over the document
document.addEventListener('dragenter', showDropZone, false);
document.addEventListener('dragover', showDropZone, false);
fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
fileDropZone[0].addEventListener('drop', hideDropZone, false);
function showDropZone(e) {
fileDropZone.addClass('active');
}
function hideDropZone() {
fileDropZone.removeClass('active');
}
// Handle dropped files
fileDropZone[0].addEventListener('drop', handleDrop, false);
function handleDrop(e) {
const dt = e.dataTransfer;
const files = dt.files;
handleFiles(files);
}
// Also handle file selection via click on the icon
fileUploadIcon.on('click', function() {
const input = document.createElement('input');
input.type = 'file';
input.onchange = e => {
handleFiles(e.target.files);
};
input.click();
});
function handleFiles(files) {
if (files.length) {
const file = files[0];
currentFile = file;
lastUploadedFileName = file.name;
fileJustUploaded = true; // Set flag to prevent immediate detachment
// Show file info with animation and add detach button
$('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
// Add click handler for detach button
$('#fileDetach').on('click', function(e) {
e.stopPropagation(); // Prevent event bubbling
detachFile();
return false;
});
// Set the file to the file input
const dataTransfer = new DataTransfer();
dataTransfer.items.add(file);
document.getElementById('fileInput').files = dataTransfer.files;
// Preview in textarea (first 8096 chars)
const reader = new FileReader();
reader.onload = function(e) {
const previewText = e.target.result.slice(0, 8096);
$('#textInput').val(previewText);
// Store this as the original content AFTER setting the value
// to prevent the input event from firing and detaching immediately
setTimeout(() => {
originalTextContent = previewText;
// Automatically submit for analysis
$('#analyzeForm').submit();
}, 50);
};
reader.readAsText(file);
}
}
function formatFileSize(bytes) {
if (bytes < 1024) return bytes + ' bytes';
else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
else return (bytes / 1048576).toFixed(1) + ' MB';
}
// Make sure to check if there's still a file when analyzing
$('#analyzeForm').on('submit', function(e) {
e.preventDefault();
// Skip detachment check if file was just uploaded
if (!fileJustUploaded) {
// Check if text has been changed but file is still attached
const textInput = $('#textInput').val();
const fileInput = document.getElementById('fileInput');
if (fileInput.files.length > 0 &&
originalTextContent !== null &&
textInput !== originalTextContent &&
textInput.length < originalTextContent.length * 0.8) {
// Text was significantly changed but file is still attached, detach it
detachFile();
}
} else {
// Reset flag after first submission
fileJustUploaded = false;
}
// Update the hidden inputs based on current model type
if (currentModelType === 'custom') {
$('#customModelInputHidden').val($('#customModelInput').val());
} else {
$('#modelInput').val($('#modelSelect').val());
}
const formData = new FormData(this);
$('#analyzeButton').prop('disabled', true);
$.ajax({
url: '/',
method: 'POST',
data: formData,
processData: false,
contentType: false,
success: function(response) {
if (response.error) {
showError(response.error);
} else {
updateResults(response);
// Show success badge if custom model
if (currentModelType === 'custom') {
$('#modelSuccessBadge').addClass('show');
setTimeout(() => {
$('#modelSuccessBadge').removeClass('show');
}, 3000);
}
}
},
error: function(xhr) {
showError(xhr.responseText || 'An error occurred while processing the text');
},
complete: function() {
$('#analyzeButton').prop('disabled', false);
}
});
});
$('#expandButton').click(function() {
const container = $('#tokenContainer');
const isExpanded = container.hasClass('expanded');
container.toggleClass('expanded');
$(this).text(isExpanded ? 'Show More' : 'Show Less');
});
// Initialize tokenizer info for current model
if (currentModelType === 'predefined') {
fetchTokenizerInfo($('#modelSelect').val(), false);
} else if ($('#customModelInput').val()) {
fetchTokenizerInfo($('#customModelInput').val(), true);
}
// Add event listener for custom model input
$('#customModelInput').on('change', function() {
const modelValue = $(this).val();
if (modelValue) {
fetchTokenizerInfo(modelValue, true);
}
});
});
</script>
</body>
</html>
"""
@app.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """
    Return tokenizer metadata (vocab size, special tokens, etc.) as JSON
    without tokenizing any text.

    Query parameters:
        model_id: a predefined model key (see TOKENIZER_MODELS) or a custom
            Hugging Face model path. Required.
        is_custom: accepted for backward compatibility but not needed —
            load_tokenizer resolves both predefined keys and custom paths.

    Responses:
        200 with the info dict on success,
        400 when model_id is missing or the tokenizer fails to load,
        500 on unexpected errors.
    """
    model_id = request.args.get('model_id', '')
    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400
    try:
        # The original code branched on is_custom but assigned the same
        # value in both branches; load_tokenizer handles the distinction
        # between predefined keys and custom paths downstream.
        tokenizer, info, error = load_tokenizer(model_id)
        if error:
            return jsonify({"error": error}), 400
        return jsonify(info)
    except Exception as e:
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Main page: renders the UI and handles tokenization requests.

    GET renders the page. POST with an uploaded file saves it temporarily,
    tokenizes the full file, and always deletes the temp file afterwards.
    POST with plain text tokenizes the text directly. AJAX requests
    (X-Requested-With: XMLHttpRequest) receive JSON; others receive HTML.
    """
    text = ""
    token_data = None
    error_message = ""
    selected_model = request.args.get('model', request.form.get('model', 'llama4'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    # Determine which model to use based on model_type
    model_to_use = selected_model if model_type == 'predefined' else custom_model

    def _render(page_token_data):
        # Single place for the HTML response; the closure reads the
        # current values of text/error_message at call time.
        return render_template_string(
            HTML_TEMPLATE,
            text=text,
            token_data=page_token_data,
            models=TOKENIZER_MODELS,
            selected_model=selected_model,
            custom_model=custom_model,
            model_type=model_type,
            error=error_message
        )

    if request.method == 'POST':
        is_ajax = request.headers.get('X-Requested-With') == 'XMLHttpRequest'

        # File upload branch
        if 'file' in request.files and request.files['file'].filename:
            uploaded_file = request.files['file']
            # basename() strips any directory components, preventing path
            # traversal via a crafted filename (e.g. "../../etc/passwd").
            safe_name = os.path.basename(uploaded_file.filename)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
            uploaded_file.save(file_path)

            # Read a small preview for the textarea; the full file is
            # tokenized by process_text below.
            with open(file_path, 'r', errors='replace') as f:
                text = f.read(8096)

            try:
                token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                if is_ajax:
                    return jsonify(token_data)
            except Exception as e:
                error_message = str(e)
                if is_ajax:
                    return jsonify({"error": error_message}), 400
                # Non-AJAX error: render the page with the error banner.
                return _render(None)
            finally:
                # Always clean up the temporary upload, success or failure.
                if os.path.exists(file_path):
                    os.remove(file_path)
        # Regular text processing
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, model_to_use)
                    if is_ajax:
                        return jsonify(token_data)
                except Exception as e:
                    error_message = str(e)
                    if is_ajax:
                        return jsonify({"error": error_message}), 400
                    # Non-AJAX error: render the page with the error banner.
                    return _render(None)

    # GET requests and successful non-AJAX POSTs fall through here.
    return _render(token_data)
if __name__ == "__main__":
    # Bind to all interfaces; port 7860 is presumably the Hugging Face
    # Spaces convention (TODO confirm for other deployments). Debug mode
    # is off, as appropriate for a publicly reachable app.
    app.run(host='0.0.0.0', port=7860, debug=False)