from tokenizers import models, trainers, Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset

# Step 1: Download the dataset and save it locally
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Save the dataset locally to a text file
with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
    for example in dataset:
        if "text" in example:  # Ensure the 'text' column exists
            file.write(example["text"] + "\n")
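
# Note (not in the original script): the full English Wikipedia dump is tens
# of gigabytes, so materializing it all can be slow. A hedged alternative is
# to stream the dataset and cap the number of articles; the 100_000 cap below
# is an arbitrary illustration, not a value from the original.
# streamed = load_dataset("wikimedia/wikipedia", "20231101.en",
#                         split="train", streaming=True)
# with open("wikipedia_data.txt", "w", encoding="utf-8") as file:
#     for i, example in enumerate(streamed):
#         if i >= 100_000:
#             break
#         file.write(example["text"] + "\n")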

# Step 2: Initialize the tokenizer
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# Split on whitespace and punctuation before training; without a
# pre-tokenizer, the trainer would treat each whole line as a single "word"
tokenizer.pre_tokenizer = Whitespace()

# Special tokens and trainer
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Train the tokenizer using the local text file
tokenizer.train(["wikipedia_data.txt"], trainer=trainer)
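
# Optional (not in the original script): persist the trained tokenizer so it
# can be reloaded later without retraining.
# tokenizer.save("wordpiece_tokenizer.json")
# reloaded = Tokenizer.from_file("wordpiece_tokenizer.json")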

# Step 3: Test the tokenizer on a sentence pair
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print("Token IDs:", encoding.ids)