Hyacinthax committed on
Commit 0bdb125 · verified · 1 Parent(s): d35be68

Upload preprocess_dialogs.py


Forgot to add this. This is how I preprocessed the dialogs for the corpus to be local instead of pulling from convokit constantly
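For context: the script writes the grouped pairs into preprocessed_dialogs.py as a plain Python dict named dialog_data, so later runs can import that file instead of hitting ConvoKit. A minimal sketch of one way the generated file could be consumed (assuming preprocess_dialogs.py has already been run in the same directory):

    # hypothetical usage sketch, not part of this commit
    from preprocessed_dialogs import dialog_data  # produced by preprocess_dialogs.py

    # keys "1", "2", ... map to lists of (input, response) tuples for each kept
    # conversation; key "0" collects the shorter, miscellaneous conversations
    pairs = dialog_data["1"]
    print(pairs[0])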

Files changed (1)
  preprocess_dialogs.py  +160 -0
preprocess_dialogs.py ADDED
@@ -0,0 +1,160 @@
import os
import convokit
import re
import matplotlib.pyplot as plt
import json
import random


class DialogProcessor:
    def __init__(self):
        self.corpus = None

    def load_corpus(self, corpus_path):
        # Load the corpus from a local file or download it
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean text by removing tags, quotes, extra spaces, and expanding contractions."""
        if not text:
            return ""  # Handle missing data gracefully

        # Remove XML-like tags
        cleaned_text = re.sub(r'<[^>]+>', '', text)  # More general than <u>...</u>

        # Remove double quotes
        cleaned_text = cleaned_text.replace('"', '')

        # Normalize spaces
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

        # Convert to lowercase
        txt = cleaned_text.lower()

        # Expanded contractions mapping
        contractions = {
            "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
            "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
            "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
            "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
            "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
            "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
            "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
            "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
            "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
            "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
            "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
            "haven't": "have not", "hasn't": "has not", "hadn't": "had not"
        }

        # Expand contractions
        for contraction, expansion in contractions.items():
            txt = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, txt)

        # Remove non-alphanumeric characters except apostrophes
        txt = re.sub(r"[^a-zA-Z0-9' ]", " ", txt)  # Keeps letters, numbers, and apostrophes
        txt = re.sub(r"\s+", " ", txt).strip()  # Remove extra spaces

        return txt

    def group_conversations(self):
        """Process and structure conversations into distinct, numbered groups with tuple pairs."""
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")

        grouped_dialogues = {}  # Format: {conversation_id: [(input, response), ...]}
        misc_dialogues = []     # Shorter convos go here
        conversation_ids = list(self.corpus.get_conversation_ids())
        current_dialog_id = 1   # Start numbering from 1

        for conversation_id in conversation_ids:
            conversation = self.corpus.get_conversation(conversation_id)
            utterances = conversation.get_utterance_ids()
            current_dialog = []

            for i in range(len(utterances) - 1):
                utterance = self.corpus.get_utterance(utterances[i])

                # Ensure valid text before processing
                if not utterance.text:
                    continue

                data = self.preprocess_text(utterance.text)
                current_dialog.append(data)

            if len(current_dialog) >= 4:  # Save only if the convo has at least 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)
            current_dialog = []  # Reset for the next conversation

        grouped_dialogues["0"] = [convo for convo in misc_dialogues]

        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages, marker='o', linestyle='-',
                 label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top, marker='s', linestyle='--',
                 label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        average_list = []
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for keys, values in grouped_dialogues.items():
                # Collect token lists for the statistics below
                for sentences in values:
                    if sentences not in average_list:
                        average_list.append(str(sentences).split(" "))
                # Pair each utterance with the one that follows it
                organized = [(values[i], values[i + 1]) for i in range(len(values) - 1)]
                f.write(f'    "{keys}": {organized},\n')

            f.write("}")

        average_numbers = []
        top_number = []

        for items in average_list:
            new_data = len(items)
            top_number.append(0)
            if new_data > max(top_number) and new_data not in top_number:
                top_number.append(new_data)
                average_numbers.append(new_data)
            else:
                average_numbers.append(new_data)

        print(list(average_numbers))
        data = sum(average_numbers) / len(average_numbers)
        print(f"Top Tokens: {max(top_number)}")
        print(f"Average: {data}")
        self.plot_token_statistics(average_numbers, top_number)


if __name__ == "__main__":
    dialog_processor = DialogProcessor()

    # Specify the corpus path or use 'movie-corpus' to download
    corpus_path = "D:\\movie-corpus"  # Change to your actual local path
    dialog_processor.load_corpus(corpus_path)

    grouped_dialogues = dialog_processor.group_conversations()

    # Write the processed dialogues to preprocessed_dialogs.py as a Python dict
    dialog_processor.save_grouped_conversations(grouped_dialogues)