# Generative-Sequence-Chatbot / preprocess_dialogs.py
# Uploaded by Hyacinthax (commit 0bdb125, verified)
import os
import convokit
import re
import matplotlib.pyplot as plt
import json
import random
class DialogProcessor:
    """Load a ConvoKit corpus and convert it into cleaned, grouped dialog pairs.

    Pipeline: load_corpus() -> group_conversations() ->
    save_grouped_conversations(), which writes a ``dialog_data`` dict of
    (input, response) tuples to ``preprocessed_dialogs.py`` and reports
    token statistics.
    """

    def __init__(self):
        # Set by load_corpus(); every other corpus-reading method
        # requires this to be non-None.
        self.corpus = None

    def load_corpus(self, corpus_path):
        """Load the corpus at ``corpus_path``, or download 'movie-corpus' if missing."""
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean text by removing tags, quotes, extra spaces, and expanding contractions.

        Returns a lowercase string containing only letters, digits,
        apostrophes and single spaces; falsy input (None, "") yields "".
        """
        if not text:
            return ""  # Handle missing data gracefully
        # Remove XML-like tags (more general than just <u>...</u>)
        cleaned_text = re.sub(r'<[^>]+>', '', text)
        # Remove double quotes
        cleaned_text = cleaned_text.replace('"', '')
        # Collapse whitespace runs to single spaces
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
        # Lowercase before contraction matching (mapping keys are lowercase)
        txt = cleaned_text.lower()
        # Contraction -> expansion mapping
        contractions = {
            "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
            "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
            "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
            "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
            "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
            "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
            "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
            "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
            "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
            "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
            "haven't": "have not", "hasn't": "has not", "hadn't": "had not"
        }
        # Expand each contraction on word boundaries only
        for contraction, expansion in contractions.items():
            txt = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, txt)
        # Drop everything except letters, digits, apostrophes and spaces
        txt = re.sub(r"[^a-zA-Z0-9' ]", " ", txt)
        txt = re.sub(r"\s+", " ", txt).strip()  # Remove extra spaces
        return txt

    def group_conversations(self):
        """Process and structure conversations into distinct, numbered groups.

        Returns a dict mapping conversation numbers ("1", "2", ...) to flat
        lists of cleaned sentences. Conversations with fewer than 4 usable
        utterances are collected under key "0" as a list of lists.

        Raises:
            ValueError: if no corpus has been loaded.
        """
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")
        grouped_dialogues = {}  # Format: {conversation_id: [sentence, ...]}
        misc_dialogues = []     # Conversations with fewer than 4 usable utterances
        current_dialog_id = 1   # Start numbering from 1
        for conversation_id in self.corpus.get_conversation_ids():
            conversation = self.corpus.get_conversation(conversation_id)
            current_dialog = []
            # BUGFIX: was range(len(utterances) - 1), which silently dropped
            # the final utterance (the last response) of every conversation.
            for utterance_id in conversation.get_utterance_ids():
                utterance = self.corpus.get_utterance(utterance_id)
                # Ensure valid text before processing
                if not utterance.text:
                    continue
                current_dialog.append(self.preprocess_text(utterance.text))
            if len(current_dialog) >= 4:  # Save only if the convo has at least 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)
        # NOTE: "0" holds a list of conversations (list of lists), unlike the
        # numbered keys which hold flat sentence lists — kept for compatibility
        # with the original output format.
        grouped_dialogues["0"] = list(misc_dialogues)
        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        """Plot per-sentence token counts against the running top token count."""
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages,
                 marker='o', linestyle='-', label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top,
                 marker='s', linestyle='--', label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        """Write consecutive (input, response) pairs to preprocessed_dialogs.py
        and report token statistics.

        BUGFIX: the original statistics loop compared raw sentences against
        token *lists* (a dedup check that never matched), re-seeded the
        running-max list with 0 on every iteration, and dumped the entire
        per-sentence count list to stdout. Replaced with a straightforward
        token-count / running-maximum computation; output file format and
        the printed summary lines are unchanged.
        """
        token_counts = []  # tokens per sentence, in encounter order
        running_max = []   # running maximum token count (for plotting)
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for key, sentences in grouped_dialogues.items():
                for sentence in sentences:
                    count = len(str(sentence).split(" "))
                    token_counts.append(count)
                    running_max.append(max(running_max[-1], count) if running_max else count)
                # Pair each sentence with its successor: (input, response)
                pairs = [(sentences[i], sentences[i + 1]) for i in range(len(sentences) - 1)]
                f.write(f'    "{key}": {pairs},\n')
            f.write("}")
        if not token_counts:
            print("No sentences found; skipping statistics.")
            return
        average = sum(token_counts) / len(token_counts)
        print(f"Top Tokens: {max(running_max)}")
        print(f"Average: {average} ")
        self.plot_token_statistics(token_counts, running_max)
if __name__ == "__main__":
    processor = DialogProcessor()
    # Point at a local copy of the corpus; load_corpus() falls back to
    # downloading 'movie-corpus' when the path does not exist.
    local_corpus_path = "D:\\movie-corpus"  # Change to your actual local path
    processor.load_corpus(local_corpus_path)
    dialog_groups = processor.group_conversations()
    # Persist the grouped dialog pairs (written out as a Python module).
    processor.save_grouped_conversations(dialog_groups)