#!/usr/bin/env python
"""Compare per-document token counts between the Phi-4 and DeepSeek tokenizers."""
import json
import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer


def load_tokenizers():
    """Load both tokenizers."""
    print("Loading tokenizers...")
    phi_tokenizer = AutoTokenizer.from_pretrained(
        "unsloth/phi-4-unsloth-bnb-4bit",
        trust_remote_code=True,
    )
    # DeepSeek LLM 7B base; the Hugging Face repo id is "deepseek-llm-7b-base"
    deepseek_tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/deepseek-llm-7b-base",
        trust_remote_code=True,
    )
    return phi_tokenizer, deepseek_tokenizer


def analyze_token_counts(jsonl_path, phi_tokenizer, deepseek_tokenizer, sample_size=100):
    """Analyze token count differences between tokenizers.

    Reads one JSON object per line, tokenizes its 'text' (or 'content') field
    with each tokenizer, and records both counts plus their difference.
    """
    token_counts = {
        'phi': [],
        'deepseek': [],
        'differences': []
    }

    print(f"Analyzing token counts from {jsonl_path}")
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        # Skip blank lines so a trailing newline doesn't crash json.loads
        data = [json.loads(line) for line in f if line.strip()]

    # Take a random sample if sample_size is specified. random.sample works
    # directly on a list of dicts; np.random.choice would first coerce the
    # list into a NumPy object array.
    if sample_size and sample_size < len(data):
        data = random.sample(data, sample_size)

    for item in tqdm(data, desc="Processing entries"):
        text = item.get('text', '') or item.get('content', '')

        # encode() includes each tokenizer's special tokens (e.g. BOS) in the count
        phi_tokens = len(phi_tokenizer.encode(text))
        deepseek_tokens = len(deepseek_tokenizer.encode(text))

        token_counts['phi'].append(phi_tokens)
        token_counts['deepseek'].append(deepseek_tokens)
        token_counts['differences'].append(phi_tokens - deepseek_tokens)

    return token_counts


def plot_comparison(token_counts):
    """Create visualization of token count differences."""
    plt.figure(figsize=(12, 6))

    # Left panel: token count distribution per tokenizer
    plt.subplot(1, 2, 1)
    plt.hist([token_counts['phi'], token_counts['deepseek']],
             label=['Phi-4', 'DeepSeek'],
             alpha=0.6)
    plt.title('Token Count Distribution')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.legend()

    # Right panel: per-document differences (positive = Phi-4 used more tokens)
    plt.subplot(1, 2, 2)
    plt.hist(token_counts['differences'], bins=30)
    plt.title('Token Count Differences\n(Phi-4 minus DeepSeek)')
    plt.xlabel('Difference in Tokens')
    plt.ylabel('Frequency')

    plt.tight_layout()
    plt.savefig('tokenization_analysis.png')
    print("Saved visualization to tokenization_analysis.png")


def main():
    # Load tokenizers
    phi_tokenizer, deepseek_tokenizer = load_tokenizers()

    # Analyze token counts
    token_counts = analyze_token_counts(
        "../../../../data_processing/data/training_data.jsonl",
        phi_tokenizer,
        deepseek_tokenizer,
    )

    # Summary statistics over the sampled documents
    phi_mean = np.mean(token_counts['phi'])
    deepseek_mean = np.mean(token_counts['deepseek'])
    diff_mean = np.mean(token_counts['differences'])
    diff_std = np.std(token_counts['differences'])

    print("\nAnalysis Results:")
    print(f"Phi-4 average tokens: {phi_mean:.1f}")
    print(f"DeepSeek average tokens: {deepseek_mean:.1f}")
    print(f"Average difference: {diff_mean:.1f} ± {diff_std:.1f}")
    print(f"Max Phi-4 tokens: {max(token_counts['phi'])}")
    print(f"Max DeepSeek tokens: {max(token_counts['deepseek'])}")

    # Create visualization
    plot_comparison(token_counts)


if __name__ == "__main__":
    main()
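
# --- Illustrative sketch (an addition, not part of the analysis above) -----
# When the aggregate counts diverge, it can help to inspect the actual token
# splits the two tokenizers produce for the same string. tokenize() returns
# string pieces rather than ids, so the segmentation is easy to eyeball.
# Kept as a comment so importing or running this script is unaffected:
#
#   phi_tokenizer, deepseek_tokenizer = load_tokenizers()
#   sample = "Tokenization differences compound over long documents."
#   print(phi_tokenizer.tokenize(sample))
#   print(deepseek_tokenizer.tokenize(sample))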