Hyacinthax committed on
Commit 0bdb125 · verified · 1 Parent(s): d35be68

Upload preprocess_dialogs.py


Forgot to add this. This is how I preprocessed the dialogs for the corpus to be local instead of pulling from convokit constantly
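For context: the script writes the grouped pairs into preprocessed_dialogs.py as a plain Python dict named dialog_data, so later runs can import that file instead of hitting ConvoKit. A minimal sketch of one way the generated file could be consumed (assuming preprocess_dialogs.py has already been run in the same directory):

    # hypothetical usage sketch, not part of this commit
    from preprocessed_dialogs import dialog_data  # produced by preprocess_dialogs.py

    # keys "1", "2", ... map to lists of (input, response) tuples for each kept
    # conversation; key "0" collects the shorter, miscellaneous conversations
    pairs = dialog_data["1"]
    print(pairs[0])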

Files changed (1)
  preprocess_dialogs.py  +160 -0
preprocess_dialogs.py ADDED
@@ -0,0 +1,160 @@
import os
import convokit
import re
import matplotlib.pyplot as plt
import json
import random


class DialogProcessor:
    def __init__(self):
        self.corpus = None

    def load_corpus(self, corpus_path):
        # Load the corpus from a local file or download it
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean text by removing tags, quotes, extra spaces, and expanding contractions."""
        if not text:
            return ""  # Handle missing data gracefully

        # Remove XML-like tags
        cleaned_text = re.sub(r'<[^>]+>', '', text)  # More general than <u>...</u>

        # Remove double quotes
        cleaned_text = cleaned_text.replace('"', '')

        # Normalize spaces
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

        # Convert to lowercase
        txt = cleaned_text.lower()

        # Expanded contractions mapping
        contractions = {
            "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
            "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
            "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
            "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
            "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
            "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
            "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
            "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
            "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
            "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
            "haven't": "have not", "hasn't": "has not", "hadn't": "had not"
        }

        # Expand contractions
        for contraction, expansion in contractions.items():
            txt = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, txt)

        # Remove non-alphanumeric characters except apostrophes
        txt = re.sub(r"[^a-zA-Z0-9' ]", " ", txt)  # Keeps letters, numbers, and apostrophes
        txt = re.sub(r"\s+", " ", txt).strip()  # Remove extra spaces

        return txt

    def group_conversations(self):
        """Process and structure conversations into distinct, numbered groups with tuple pairs."""
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")

        grouped_dialogues = {}  # Format: {conversation_id: [(input, response), ...]}
        misc_dialogues = []     # Shorter convos go here
        conversation_ids = list(self.corpus.get_conversation_ids())
        current_dialog_id = 1   # Start numbering from 1

        for conversation_id in conversation_ids:
            conversation = self.corpus.get_conversation(conversation_id)
            utterances = conversation.get_utterance_ids()
            current_dialog = []

            for i in range(len(utterances) - 1):
                utterance = self.corpus.get_utterance(utterances[i])

                # Ensure valid text before processing
                if not utterance.text:
                    continue

                data = self.preprocess_text(utterance.text)
                current_dialog.append(data)

            if len(current_dialog) >= 4:  # Save only if the convo has at least 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)
            current_dialog = []  # Reset for the next conversation

        grouped_dialogues["0"] = [convo for convo in misc_dialogues]

        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages, marker='o', linestyle='-',
                 label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top, marker='s', linestyle='--',
                 label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        average_list = []
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for keys, values in grouped_dialogues.items():
                # Collect token lists for the statistics below
                for sentences in values:
                    if sentences not in average_list:
                        average_list.append(str(sentences).split(" "))
                # Pair each utterance with the one that follows it
                organized = [(values[i], values[i + 1]) for i in range(len(values) - 1)]
                f.write(f'    "{keys}": {organized},\n')

            f.write("}")

        average_numbers = []
        top_number = []

        for items in average_list:
            new_data = len(items)
            top_number.append(0)
            if new_data > max(top_number) and new_data not in top_number:
                top_number.append(new_data)
                average_numbers.append(new_data)
            else:
                average_numbers.append(new_data)

        print(list(average_numbers))
        data = sum(average_numbers) / len(average_numbers)
        print(f"Top Tokens: {max(top_number)}")
        print(f"Average: {data}")
        self.plot_token_statistics(average_numbers, top_number)


if __name__ == "__main__":
    dialog_processor = DialogProcessor()

    # Specify the corpus path or use 'movie-corpus' to download
    corpus_path = "D:\\movie-corpus"  # Change to your actual local path
    dialog_processor.load_corpus(corpus_path)

    grouped_dialogues = dialog_processor.group_conversations()

    # Write the processed dialogues to preprocessed_dialogs.py as a Python dict
    dialog_processor.save_grouped_conversations(grouped_dialogues)