#!/usr/bin/env python
import json
import random

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
def load_tokenizers(
    phi_model="unsloth/phi-4-unsloth-bnb-4bit",
    deepseek_model="deepseek-ai/deepseek-llama-7b-base",
):
    """Load the Phi-4 and DeepSeek tokenizers from the Hugging Face Hub.

    Args:
        phi_model: Hub model id to load the Phi-4 tokenizer from.
        deepseek_model: Hub model id to load the DeepSeek tokenizer from.

    Returns:
        Tuple ``(phi_tokenizer, deepseek_tokenizer)``.
    """
    print("Loading tokenizers...")
    # NOTE: trust_remote_code executes code shipped in the model repo —
    # only point these at trusted model ids.
    phi_tokenizer = AutoTokenizer.from_pretrained(
        phi_model,
        trust_remote_code=True,
    )
    deepseek_tokenizer = AutoTokenizer.from_pretrained(
        deepseek_model,
        trust_remote_code=True,
    )
    return phi_tokenizer, deepseek_tokenizer
def analyze_token_counts(jsonl_path, phi_tokenizer, deepseek_tokenizer, sample_size=100):
    """Compare per-entry token counts produced by two tokenizers.

    Args:
        jsonl_path: Path to a JSONL file; each non-blank line is a JSON
            object whose text lives under a 'text' or 'content' key.
        phi_tokenizer: Tokenizer exposing ``encode(text)`` (Phi-4).
        deepseek_tokenizer: Tokenizer exposing ``encode(text)`` (DeepSeek).
        sample_size: If truthy and smaller than the corpus, analyze a random
            sample of this many entries; otherwise analyze every entry.

    Returns:
        Dict with lists 'phi' and 'deepseek' (token counts per entry) and
        'differences' (phi count minus deepseek count, per entry).
    """
    token_counts = {
        'phi': [],
        'deepseek': [],
        'differences': []
    }
    print(f"Analyzing token counts from {jsonl_path}")
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline) — json.loads('')
        # would raise JSONDecodeError.
        data = [json.loads(line) for line in f if line.strip()]
    # Take a random sample if sample_size is specified. random.sample keeps
    # the entries as plain dicts, unlike np.random.choice which would
    # round-trip them through a NumPy object array.
    if sample_size and sample_size < len(data):
        data = random.sample(data, sample_size)
    for item in tqdm(data, desc="Processing entries"):
        text = item.get('text', '') or item.get('content', '')
        # Get token counts from each tokenizer for the same text.
        phi_tokens = len(phi_tokenizer.encode(text))
        deepseek_tokens = len(deepseek_tokenizer.encode(text))
        token_counts['phi'].append(phi_tokens)
        token_counts['deepseek'].append(deepseek_tokens)
        token_counts['differences'].append(phi_tokens - deepseek_tokens)
    return token_counts
def plot_comparison(token_counts):
    """Render token-count histograms and save them as a PNG.

    Produces a two-panel figure: overlaid count distributions for both
    tokenizers on the left, the per-entry difference histogram on the
    right. Writes the result to 'tokenization_analysis.png'.
    """
    plt.figure(figsize=(12, 6))

    # Left panel: overlaid token-count distributions.
    left = plt.subplot(1, 2, 1)
    both = [token_counts['phi'], token_counts['deepseek']]
    left.hist(both, label=['Phi-4', 'DeepSeek'], alpha=0.6)
    left.set_title('Token Count Distribution')
    left.set_xlabel('Number of Tokens')
    left.set_ylabel('Frequency')
    left.legend()

    # Right panel: distribution of per-entry differences.
    right = plt.subplot(1, 2, 2)
    right.hist(token_counts['differences'], bins=30)
    right.set_title('Token Count Differences\n(Phi-4 minus DeepSeek)')
    right.set_xlabel('Difference in Tokens')
    right.set_ylabel('Frequency')

    plt.tight_layout()
    plt.savefig('tokenization_analysis.png')
    print("Saved visualization to tokenization_analysis.png")
def main():
    """Run the end-to-end tokenizer comparison and report results."""
    phi_tok, ds_tok = load_tokenizers()

    counts = analyze_token_counts(
        "../../../../data_processing/data/training_data.jsonl",
        phi_tok,
        ds_tok,
    )

    # Summary statistics over the analyzed entries.
    diffs = counts['differences']
    print("\nAnalysis Results:")
    print(f"Phi-4 average tokens: {np.mean(counts['phi']):.1f}")
    print(f"DeepSeek average tokens: {np.mean(counts['deepseek']):.1f}")
    print(f"Average difference: {np.mean(diffs):.1f} ± {np.std(diffs):.1f}")
    print(f"Max Phi-4 tokens: {max(counts['phi'])}")
    print(f"Max DeepSeek tokens: {max(counts['deepseek'])}")

    plot_comparison(counts)


if __name__ == "__main__":
    main()