import os
import re

import convokit
import matplotlib.pyplot as plt


class DialogProcessor:
    def __init__(self):
        self.corpus = None

    def load_corpus(self, corpus_path):
        """Load the corpus from a local path, or download it via Convokit."""
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean text by removing tags, quotes, extra spaces, and expanding contractions."""
        if not text:
            return ""  # Handle missing data gracefully

        # Remove XML/HTML-like tags (matches any <...> tag rather than specific ones)
        cleaned_text = re.sub(r'<[^>]+>', '', text)
        # Remove double quotes
        cleaned_text = cleaned_text.replace('"', '')
        # Normalize whitespace
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
        # Convert to lowercase
        txt = cleaned_text.lower()

        # Contractions mapping
        contractions = {
            "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
            "what's": "what is", "where's": "where is", "who's": "who is",
            "how's": "how is", "it's": "it is", "let's": "let us",
            "they're": "they are", "we're": "we are", "you're": "you are",
            "i've": "i have", "you've": "you have", "we've": "we have",
            "they've": "they have", "i'd": "i would", "you'd": "you would",
            "he'd": "he would", "she'd": "she would", "we'd": "we would",
            "they'd": "they would", "i'll": "i will", "you'll": "you will",
            "he'll": "he will", "she'll": "she will", "we'll": "we will",
            "they'll": "they will", "don't": "do not", "doesn't": "does not",
            "didn't": "did not", "won't": "will not", "wouldn't": "would not",
            "can't": "cannot", "couldn't": "could not", "shouldn't": "should not",
            "mightn't": "might not", "mustn't": "must not", "isn't": "is not",
            "aren't": "are not", "wasn't": "was not", "weren't": "were not",
            "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
        }

        # Expand contractions
        for contraction, expansion in contractions.items():
            txt = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, txt)

        # Keep letters, numbers, apostrophes, and spaces; replace everything else with a space
        txt = re.sub(r"[^a-zA-Z0-9' ]", " ", txt)
        # Collapse any extra spaces introduced above
        txt = re.sub(r"\s+", " ", txt).strip()
        return txt

    def group_conversations(self):
        """Process and structure conversations into distinct, numbered groups."""
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")

        grouped_dialogues = {}  # Format: {dialog_id: [sentence, sentence, ...]}
        misc_dialogues = []     # Shorter conversations go here
        conversation_ids = list(self.corpus.get_conversation_ids())
        current_dialog_id = 1   # Start numbering from 1

        for conversation_id in conversation_ids:
            conversation = self.corpus.get_conversation(conversation_id)
            utterances = conversation.get_utterance_ids()
            current_dialog = []
            for i in range(len(utterances) - 1):
                utterance = self.corpus.get_utterance(utterances[i])
                # Ensure valid text before processing
                if not utterance.text:
                    continue
                current_dialog.append(self.preprocess_text(utterance.text))

            if len(current_dialog) >= 4:  # Keep only conversations with at least 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)

        # Bundle the shorter conversations under the reserved key "0"
        grouped_dialogues["0"] = list(misc_dialogues)
        print(f"Processed {len(grouped_dialogues)} grouped conversations, "
              f"including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues
miscellaneous.") return grouped_dialogues def plot_token_statistics(self, conversation_averages, conversation_top): plt.figure(figsize=(10, 5)) plt.plot(range(1, len(conversation_averages) + 1), conversation_averages, marker='o', linestyle='-', label="Average Tokens per Input") plt.plot(range(1, len(conversation_top) + 1), conversation_top, marker='s', linestyle='--', label="Top Token Count") plt.xlabel("Total Number of Conversations") plt.ylabel("Token Count") plt.title("Token Statistics Over Conversations") plt.legend() plt.grid(True) plt.show() def save_grouped_conversations(self, grouped_dialogues): average_list = [] with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f: f.write("dialog_data = {\n") for keys, values in grouped_dialogues.items(): # print(list(values)) [average_list.append(str(sentences).split(" ")) for sentences in values if sentences not in average_list] organized = [(values[i], values[i + 1]) for i in range(len(values)- 1)] f.write(f' "{keys}": {organized},\n') f.write("}") average_numbers = [] top_number = [] for items in average_list: new_data = len(items) top_number.append(0) if new_data > max(top_number) and new_data not in top_number: top_number.append(new_data) average_numbers.append(new_data) else: average_numbers.append(new_data) print(list(average_numbers)) data = sum(average_numbers)/len(average_numbers) print(f"Top Tokens: {max(top_number)}") print(f"Average: {data} ") self.plot_token_statistics(average_numbers, top_number) if __name__ == "__main__": dialog_processor = DialogProcessor() # Specify the corpus path or use 'movie-corpus' to download corpus_path = "D:\\movie-corpus" # Change to your actual local path dialog_processor.load_corpus(corpus_path) grouped_dialogues = dialog_processor.group_conversations() # Save the processed dialogues as JSON dialog_processor.save_grouped_conversations(grouped_dialogues)