#!/usr/bin/env python
"""check_tokenization.py

Compare how the Phi-4 and DeepSeek tokenizers tokenize a JSONL training
dataset: count tokens per entry with each tokenizer, report summary
statistics, and save a histogram of the differences.
"""
import json
import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer

def load_tokenizers():
    """Load both tokenizers."""
    print("Loading tokenizers...")
    phi_tokenizer = AutoTokenizer.from_pretrained(
        "unsloth/phi-4-unsloth-bnb-4bit",
        trust_remote_code=True
    )
    deepseek_tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/deepseek-llama-7b-base",
        trust_remote_code=True
    )
    return phi_tokenizer, deepseek_tokenizer
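
# A minimal sanity-check sketch, not called by main(): encode one string with
# both tokenizers to eyeball that they loaded correctly. The helper name and
# sample string are illustrative assumptions, not part of the original script.
def sanity_check(phi_tokenizer, deepseek_tokenizer, text="Hello, world!"):
    """Print the token ids each tokenizer produces for a short sample string."""
    print("Phi-4 ids:", phi_tokenizer.encode(text))
    print("DeepSeek ids:", deepseek_tokenizer.encode(text))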

def analyze_token_counts(jsonl_path, phi_tokenizer, deepseek_tokenizer, sample_size=100):
    """Analyze token count differences between the two tokenizers."""
    token_counts = {
        'phi': [],
        'deepseek': [],
        'differences': []
    }
    print(f"Analyzing token counts from {jsonl_path}")
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    # Take a random sample if sample_size is specified. random.sample keeps the
    # entries as plain dicts, unlike np.random.choice, which would wrap them in
    # a NumPy object array.
    if sample_size and sample_size < len(data):
        data = random.sample(data, sample_size)
    for item in tqdm(data, desc="Processing entries"):
        # Entries may store their text under either 'text' or 'content'.
        text = item.get('text', '') or item.get('content', '')
        # Get token counts
        phi_tokens = len(phi_tokenizer.encode(text))
        deepseek_tokens = len(deepseek_tokenizer.encode(text))
        token_counts['phi'].append(phi_tokens)
        token_counts['deepseek'].append(deepseek_tokens)
        token_counts['differences'].append(phi_tokens - deepseek_tokens)
    return token_counts
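
# A hedged addition, not in the original script: summarize a list of counts
# with percentiles, which is often more informative than mean/max alone for
# skewed length distributions. The function name and default percentile
# choices are assumptions for illustration.
def percentile_summary(counts, percentiles=(50, 90, 99)):
    """Return {percentile: value} for the given token-count list."""
    return {p: float(np.percentile(counts, p)) for p in percentiles}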

def plot_comparison(token_counts):
    """Create visualization of token count differences."""
    plt.figure(figsize=(12, 6))
    # Plot token count distributions
    plt.subplot(1, 2, 1)
    plt.hist([token_counts['phi'], token_counts['deepseek']],
             label=['Phi-4', 'DeepSeek'], alpha=0.6)
    plt.title('Token Count Distribution')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.legend()
    # Plot differences
    plt.subplot(1, 2, 2)
    plt.hist(token_counts['differences'], bins=30)
    plt.title('Token Count Differences\n(Phi-4 minus DeepSeek)')
    plt.xlabel('Difference in Tokens')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig('tokenization_analysis.png')
    print("Saved visualization to tokenization_analysis.png")

def main():
    # Load tokenizers
    phi_tokenizer, deepseek_tokenizer = load_tokenizers()
    # Analyze token counts
    token_counts = analyze_token_counts(
        "../../../../data_processing/data/training_data.jsonl",
        phi_tokenizer,
        deepseek_tokenizer
    )
    # Calculate statistics
    phi_mean = np.mean(token_counts['phi'])
    deepseek_mean = np.mean(token_counts['deepseek'])
    diff_mean = np.mean(token_counts['differences'])
    diff_std = np.std(token_counts['differences'])
    print("\nAnalysis Results:")
    print(f"Phi-4 average tokens: {phi_mean:.1f}")
    print(f"DeepSeek average tokens: {deepseek_mean:.1f}")
    print(f"Average difference: {diff_mean:.1f} ± {diff_std:.1f}")
    print(f"Max Phi-4 tokens: {max(token_counts['phi'])}")
    print(f"Max DeepSeek tokens: {max(token_counts['deepseek'])}")
    # Create visualization
    plot_comparison(token_counts)


if __name__ == "__main__":
    main()
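
# Usage (assuming the relative JSONL path in main() exists from the
# directory where the script is run):
#   python check_tokenization.py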