Raubachm committed
Commit b78862e · verified · 1 Parent(s): c8bd9c5

Update README.md

Files changed (1):
1. README.md (+84 -80)

README.md CHANGED
@@ -66,90 +66,94 @@ from sklearn.metrics.pairwise import cosine_similarity
  import numpy as np
  import matplotlib.pyplot as plt

- # Text to be chunked
- with open("/path to text") as f:
-     text = f.read()
-
- # Tokenize the text into sentences
- sentences = sent_tokenize(text)
-
- # Generate embeddings for each sentence using a sentence-transformers model of choice
- model = SentenceTransformer('sentence-transformers/all-mpnet-base-v1')
- embeddings = model.encode(sentences)
-
- # Combine the sentences with their neighbors
- # Adjust buffer_size to change how many neighboring sentences on either side of a target sentence are included in the combined text (1 = one before and one after)
- def combine_sentences(sentences, buffer_size=1):
-     combined_sentences = []
-     for i in range(len(sentences)):
-         combined_sentence = ' '.join(sentences[max(0, i - buffer_size):min(len(sentences), i + 1 + buffer_size)])
-         combined_sentences.append(combined_sentence)
-     return combined_sentences
-
- combined_sentences = combine_sentences(sentences)
- combined_embeddings = model.encode(combined_sentences)
+ # Load and tokenize the text
+ def load_and_tokenize(file_path):
+     with open(file_path, 'r') as f:
+         text = f.read()
+     return sent_tokenize(text)
+
+ # Combine sentences with their neighbors
+ def combine_sentences(sentences, buffer=1):
+     combined = []
+     for i in range(len(sentences)):
+         start = max(0, i - buffer)
+         end = min(len(sentences), i + buffer + 1)
+         combined.append(' '.join(sentences[start:end]))
+     return combined

  # Calculate cosine distances between embeddings
- def calculate_cosine_distances(embeddings):
-     distances = []
-     for i in range(len(embeddings) - 1):
-         similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
-         distance = 1 - similarity
-         distances.append(distance)
-     return distances
-
- distances = calculate_cosine_distances(combined_embeddings)
-
- # Identify breakpoints
- # Adjust the breakpoint threshold to change the level of dissimilarity between chunk embeddings (higher for greater dissimilarity)
- breakpoint_percentile_threshold = 95
- breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
- breakpoint_indices = [i for i, distance in enumerate(distances) if distance > breakpoint_distance_threshold]
+ def calc_cosine_distances(embeddings):
+     distances = []
+     for i in range(len(embeddings) - 1):
+         sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+         distances.append(1 - sim)
+     return distances
+
+ # Find breakpoints based on distance threshold
+ def find_breakpoints(distances, percentile=95):
+     threshold = np.percentile(distances, percentile)
+     return [i for i, d in enumerate(distances) if d > threshold]

  # Create chunks based on breakpoints
- chunks = []
- start_index = 0
- for breakpoint_index in breakpoint_indices:
-     chunk = ' '.join(sentences[start_index:breakpoint_index + 1])
-     chunks.append(chunk)
-     start_index = breakpoint_index + 1
- chunks.append(' '.join(sentences[start_index:]))
-
- # Set a minimum number of sentences per chunk
- min_chunk_size = 3
-
- # Merge small chunks with their most semantically similar neighbor
- def merge_small_chunks_with_neighbors(chunks, embeddings):
-     merged_chunks = [chunks[0]]  # Start with the first chunk
-     merged_embeddings = [embeddings[0]]  # And its embedding
-
-     for i in range(1, len(chunks) - 1):  # Iterate through chunks, excluding the first and last
-         # If the current chunk is small, consider merging it with a neighbor
-         if len(chunks[i].split('. ')) < min_chunk_size:
-             prev_similarity = cosine_similarity([embeddings[i]], [merged_embeddings[-1]])[0][0]
-             next_similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
-
-             # Merge with the most similar neighbor
-             if prev_similarity > next_similarity:
-                 merged_chunks[-1] += ' ' + chunks[i]
-                 merged_embeddings[-1] = (merged_embeddings[-1] + embeddings[i]) / 2
-             else:
-                 chunks[i + 1] = chunks[i] + ' ' + chunks[i + 1]
-                 embeddings[i + 1] = (embeddings[i] + embeddings[i + 1]) / 2
-         else:
-             merged_chunks.append(chunks[i])
-             merged_embeddings.append(embeddings[i])
-
-     merged_chunks.append(chunks[-1])
-     merged_embeddings.append(embeddings[-1])
-
-     return merged_chunks, merged_embeddings
-
- # Generate embeddings for each initial chunk and merge the most semantically similar neighbors
- chunk_embeddings = model.encode(chunks)
- chunks, chunk_embeddings = merge_small_chunks_with_neighbors(chunks, chunk_embeddings)
-
- print(chunks[0])
+ def create_chunks(sentences, breakpoints):
+     chunks = []
+     start = 0
+     for bp in breakpoints:
+         chunks.append(' '.join(sentences[start:bp + 1]))
+         start = bp + 1
+     chunks.append(' '.join(sentences[start:]))
+     return chunks
+
+ # Merge small chunks with their most similar neighbor
+ def merge_small_chunks(chunks, embeddings, min_size=3):
+     merged = [chunks[0]]
+     merged_emb = [embeddings[0]]
+
+     for i in range(1, len(chunks) - 1):
+         if len(chunks[i].split('. ')) < min_size:
+             prev_sim = cosine_similarity([embeddings[i]], [merged_emb[-1]])[0][0]
+             next_sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+
+             if prev_sim > next_sim:
+                 merged[-1] += ' ' + chunks[i]
+                 merged_emb[-1] = (merged_emb[-1] + embeddings[i]) / 2
+             else:
+                 chunks[i + 1] = chunks[i] + ' ' + chunks[i + 1]
+                 embeddings[i + 1] = (embeddings[i] + embeddings[i + 1]) / 2
+         else:
+             merged.append(chunks[i])
+             merged_emb.append(embeddings[i])
+
+     merged.append(chunks[-1])
+     merged_emb.append(embeddings[-1])
+     return merged, merged_emb
+
+ # Main process
+ def chunk_text(file_path):
+     # Load the model
+     model = SentenceTransformer('sentence-transformers/all-mpnet-base-v1')
+
+     # Process the text
+     sentences = load_and_tokenize(file_path)
+     combined = combine_sentences(sentences)
+     embeddings = model.encode(combined)
+
+     # Find breakpoints and create initial chunks
+     distances = calc_cosine_distances(embeddings)
+     breakpoints = find_breakpoints(distances)
+     chunks = create_chunks(sentences, breakpoints)
+
+     # Merge small chunks
+     chunk_embeddings = model.encode(chunks)
+     final_chunks, _ = merge_small_chunks(chunks, chunk_embeddings)
+
+     return final_chunks
+
+ if __name__ == "__main__":
+     file_path = "/path/to/your/text/file.txt"
+     result = chunk_text(file_path)
+     print(f"Number of chunks: {len(result)}")
+     print("First chunk:", result[0][:100] + "...")
  ```
  ## Evaluation Results
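
For a quick sanity check of the refactored logic in this commit, the windowing and breakpoint steps can be run without downloading the embedding model. A minimal sketch: `combine_sentences`, `find_breakpoints`, and `create_chunks` are copied from the new version of the diff above, while the sentences and distance values are made-up placeholders standing in for real embedding distances.

```python
import numpy as np

# Definitions copied from the updated README (new side of the hunk above).
def combine_sentences(sentences, buffer=1):
    combined = []
    for i in range(len(sentences)):
        start = max(0, i - buffer)
        end = min(len(sentences), i + buffer + 1)
        combined.append(' '.join(sentences[start:end]))
    return combined

def find_breakpoints(distances, percentile=95):
    threshold = np.percentile(distances, percentile)
    return [i for i, d in enumerate(distances) if d > threshold]

def create_chunks(sentences, breakpoints):
    chunks = []
    start = 0
    for bp in breakpoints:
        chunks.append(' '.join(sentences[start:bp + 1]))
        start = bp + 1
    chunks.append(' '.join(sentences[start:]))
    return chunks

# Hypothetical inputs: five sentences; with buffer=1 each window holds
# the target sentence plus one neighbor on either side.
sentences = ["S1.", "S2.", "S3.", "S4.", "S5."]
print(combine_sentences(sentences)[0])   # 'S1. S2.'
print(combine_sentences(sentences)[2])   # 'S2. S3. S4.'

# Made-up cosine distances between consecutive window embeddings;
# the spike at index 2 stands in for a topic shift.
distances = [0.10, 0.12, 0.80, 0.11]
breakpoints = find_breakpoints(distances, percentile=75)
print(breakpoints)                        # [2]
print(create_chunks(sentences, breakpoints))
# ['S1. S2. S3.', 'S4. S5.']
```

The percentile parameter trades chunk length for topical purity: raising it means fewer distances clear the threshold, so the text splits into fewer, longer chunks.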