from tokenizers import models, trainers, Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset

# Step 1: Download the dataset and save it locally
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Save the dataset locally to a text file
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        if "text" in example:  # Ensure the 'text' column exists
            file.write(example["text"] + "\n")
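
# Note (not in the original script): the full English Wikipedia dump is tens
# of gigabytes, so materializing it all can be slow. A hedged alternative is
# to stream the dataset and cap the number of articles; the 100_000 cap below
# is an arbitrary illustration, not a value from the original.
# streamed = load_dataset("wikimedia/wikipedia", "20231101.en",
#                         split="train", streaming=True)
# with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
#     for i, example in enumerate(streamed):
#         if i >= 100_000:
#             break
#         file.write(example["text"] + "\n")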

# Step 2: Initialize the tokenizer
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# Split on whitespace and punctuation before training; without a
# pre-tokenizer, the trainer would treat each whole line as a single "word"
tokenizer.pre_tokenizer = Whitespace()

# Special tokens and trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)
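
# Optional (not in the original script): persist the trained tokenizer so it
# can be reloaded later without retraining.
# tokenizer.save("wordpiece_tokenizer.json")
# reloaded = Tokenizer.from_file("wordpiece_tokenizer.json")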

# Step 3: Test the tokenizer on a sentence pair
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)