Upload preprocess_dialogs.py
Browse files
Forgot to add this. This is how I preprocessed the dialogs so that the corpus is stored locally instead of being pulled from convokit constantly.
- preprocess_dialogs.py +160 -0
preprocess_dialogs.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import convokit
|
3 |
+
import re
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import json
|
6 |
+
import random
|
7 |
+
|
8 |
+
|
9 |
+
class DialogProcessor:
    """Load the convokit 'movie-corpus', clean utterance text, and group
    conversations so downstream code can read them from a local Python
    module instead of pulling from convokit on every run.
    """

    # Contraction -> expansion table, built once at class-definition time
    # instead of on every preprocess_text() call.
    CONTRACTIONS = {
        "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
        "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
        "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
        "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
        "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
        "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
        "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
        "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
        "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
        "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    }

    # One compiled pattern matching any contraction as a whole word; the
    # replacement is looked up in CONTRACTIONS. Equivalent to substituting
    # each key in turn (the original did ~44 re.sub passes per call):
    # input is lowercased first, and no expansion contains an apostrophe,
    # so one substitution can never enable another.
    _CONTRACTION_RE = re.compile(
        r"\b(?:" + "|".join(re.escape(c) for c in CONTRACTIONS) + r")\b"
    )
    _TAG_RE = re.compile(r"<[^>]+>")          # XML-like tags, e.g. <u>...</u>
    _SPACE_RE = re.compile(r"\s+")
    _NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9' ]")

    def __init__(self):
        # Set by load_corpus(); None until then.
        self.corpus = None

    def load_corpus(self, corpus_path):
        """Load the corpus from `corpus_path` if it exists locally;
        otherwise download the convokit 'movie-corpus' release.
        """
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean one utterance: strip XML-like tags and double quotes,
        lowercase, expand contractions, drop punctuation (apostrophes kept),
        and collapse whitespace. Returns "" for missing/empty input.
        """
        if not text:
            return ""  # Handle missing data gracefully

        cleaned_text = DialogProcessor._TAG_RE.sub('', text)
        cleaned_text = cleaned_text.replace('"', '')
        cleaned_text = DialogProcessor._SPACE_RE.sub(" ", cleaned_text).strip()
        txt = cleaned_text.lower()

        # Expand all contractions in a single pass.
        txt = DialogProcessor._CONTRACTION_RE.sub(
            lambda m: DialogProcessor.CONTRACTIONS[m.group(0)], txt
        )

        # Keep letters, numbers, apostrophes and spaces, then re-collapse.
        txt = DialogProcessor._NON_ALNUM_RE.sub(" ", txt)
        txt = DialogProcessor._SPACE_RE.sub(" ", txt).strip()

        return txt

    def group_conversations(self):
        """Group cleaned utterances by conversation.

        Conversations with at least 4 cleaned utterances get their own
        numeric string key ("1", "2", ...); shorter ones are collected
        under the reserved key "0" as a list of utterance lists.

        Returns:
            dict mapping str id -> list of cleaned utterance strings
            (for key "0": a list of such lists).
        Raises:
            ValueError: if load_corpus() has not been called.
        """
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")

        grouped_dialogues = {}   # {conversation_id: [utterance, ...]}
        misc_dialogues = []      # Shorter convos go here
        current_dialog_id = 1    # "0" is reserved for the misc bucket

        for conversation_id in self.corpus.get_conversation_ids():
            conversation = self.corpus.get_conversation(conversation_id)
            utterances = conversation.get_utterance_ids()
            current_dialog = []

            # NOTE(review): the last utterance of every conversation is
            # skipped (range stops at len - 1). This looks unintentional,
            # since (input, response) pairing happens later in
            # save_grouped_conversations — confirm before changing.
            for i in range(len(utterances) - 1):
                utterance = self.corpus.get_utterance(utterances[i])
                if not utterance.text:  # skip empty/missing lines
                    continue
                current_dialog.append(self.preprocess_text(utterance.text))

            if len(current_dialog) >= 4:  # Save only convos with >= 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)

        grouped_dialogues["0"] = list(misc_dialogues)

        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        """Plot per-sentence token counts alongside the running maximum."""
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages,
                 marker='o', linestyle='-', label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top,
                 marker='s', linestyle='--', label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        """Write grouped_dialogues to preprocessed_dialogs.py as a Python
        module defining `dialog_data` (each conversation stored as
        overlapping (input, response) pairs), then print and plot token
        statistics over every sentence seen.
        """
        token_lists = []  # whitespace-split tokens of every sentence seen

        # NOTE: output is a Python module (not JSON) so it can be imported
        # directly via `from preprocessed_dialogs import dialog_data`.
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for keys, values in grouped_dialogues.items():
                # NOTE(review): the original dedupe check compared a raw
                # sentence string against token *lists* and so was always
                # True; every sentence is counted, as before.
                for sentences in values:
                    token_lists.append(str(sentences).split(" "))
                organized = [(values[i], values[i + 1]) for i in range(len(values) - 1)]
                f.write(f' "{keys}": {organized},\n')

            f.write("}")

        if not token_lists:
            # Guard: the original divided by len() and would crash here.
            print("No sentences found; skipping token statistics.")
            return

        token_counts = [len(items) for items in token_lists]

        # Running maximum, one point per sentence. (The original interleaved
        # zeros into this series and rescanned it with max() every step,
        # which was O(n^2) and plotted a mostly-zero line.)
        running_top = []
        top = 0
        for count in token_counts:
            top = max(top, count)
            running_top.append(top)

        print(list(token_counts))
        print(f"Top Tokens: {max(running_top)}")
        print(f"Average: {sum(token_counts) / len(token_counts)} ")
        self.plot_token_statistics(token_counts, running_top)
150 |
+
if __name__ == "__main__":
    import sys

    dialog_processor = DialogProcessor()

    # First CLI argument overrides the default local corpus location;
    # load_corpus() falls back to downloading 'movie-corpus' if the
    # path does not exist.
    corpus_path = sys.argv[1] if len(sys.argv) > 1 else "D:\\movie-corpus"
    dialog_processor.load_corpus(corpus_path)

    grouped_dialogues = dialog_processor.group_conversations()

    # Save the processed dialogues as an importable Python module
    # (preprocessed_dialogs.py) — note: not JSON, despite the old comment.
    dialog_processor.save_grouped_conversations(grouped_dialogues)
|