# Hugging Face Space (status when this page was captured: Runtime error)
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
# Step 1: Download the dataset and save it locally
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Save the dataset locally to a text file; the trainer in step 2 reads this
# file back. Records are written one after another, each followed by a
# newline.
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        if "text" in example:  # Ensure the 'text' column exists
            file.write(example["text"] + "\n")

# Step 2: Initialize the tokenizer
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Split the raw text into words before it reaches the WordPiece model.
# Without a pre-tokenizer every line of the corpus is handed to the trainer
# as a single "word", so learned tokens can span several words and include
# spaces instead of being sub-word pieces.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Special tokens and trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)

# Step 3: Test the tokenizer on a pair of sentences.
# NOTE(review): no post-processor is configured, so [CLS]/[SEP] are presumably
# not inserted around the pair -- confirm whether that is intended.
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)