Spaces:
Running
on
Zero
Running
on
Zero
Upload indexador.py
Browse files- indexador.py +79 -0
indexador.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
import faiss
|
4 |
+
import pickle
|
5 |
+
import numpy as np
|
6 |
+
from transformers import AutoTokenizer, AutoModel
|
7 |
+
import torch
|
8 |
+
|
9 |
+
def cargar_pdfs(ruta="."):
    """Extract the text of every PDF found directly inside a directory.

    Only the top level of ``ruta`` is scanned (no recursion). Each PDF is
    opened with PyMuPDF, the text of all its pages is concatenated, and
    whitespace is normalised so that fixed-size chunking is not wasted on
    runs of spaces and line breaks.

    Args:
        ruta: Directory to scan. Defaults to the current working directory.

    Returns:
        A list with one whitespace-normalised string per PDF that yielded
        any text, ordered by file name. PDFs with no extractable text are
        skipped.
    """
    textos = []
    # Sort the listing: os.listdir() returns entries in arbitrary order, and
    # a stable order keeps chunk positions reproducible between runs.
    for archivo in sorted(os.listdir(ruta)):
        # Compare case-insensitively so "INFORME.PDF" is not silently ignored.
        if not archivo.lower().endswith(".pdf"):
            continue

        ruta_pdf = os.path.join(ruta, archivo)
        print(f"Procesando: {archivo}")

        doc = fitz.open(ruta_pdf)
        try:
            # join() avoids rebuilding the accumulated string on every page.
            texto = "".join(pagina.get_text() for pagina in doc)
        finally:
            # Release the file handle even if a page fails to extract.
            doc.close()

        # split() + join() collapses every run of whitespace (newlines, tabs,
        # repeated spaces) into a single space and trims both ends.
        texto = " ".join(texto.split())
        if texto:
            textos.append(texto)
    return textos
|
24 |
+
|
25 |
+
def chunk_texto(texto, longitud=800):
    """Split a string into consecutive, non-overlapping fixed-size pieces.

    Args:
        texto: The text to split.
        longitud: Maximum number of characters per piece. Must be positive.

    Returns:
        A list of substrings of ``texto`` in order. Every piece has exactly
        ``longitud`` characters except possibly the last one, which holds
        the remainder. An empty ``texto`` yields an empty list.

    Raises:
        ValueError: If ``longitud`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # text; a zero step raises an unrelated-looking error. Fail clearly.
    if longitud <= 0:
        raise ValueError(
            f"longitud debe ser un entero positivo, se recibió {longitud}"
        )
    return [
        texto[inicio:inicio + longitud]
        for inicio in range(0, len(texto), longitud)
    ]
|
27 |
+
|
28 |
+
def generar_embedding(textos, tokenizer, model, batch_size=32):
    """Embed texts by mean-pooling the model's last hidden state.

    The texts are processed in batches. Each batch is tokenized with padding
    and truncation, run through ``model`` without gradient tracking, and the
    token vectors are averaged using the attention mask so that padding
    positions do not contribute to the mean.

    Args:
        textos: Non-empty list of strings to embed.
        tokenizer: Callable accepting ``(batch, return_tensors="pt",
            padding=True, truncation=True)`` and returning a mapping of
            tensors that includes ``"attention_mask"``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass.

    Returns:
        A float32 NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``textos`` is empty.
    """
    if not textos:
        # np.vstack([]) would raise a ValueError about concatenation; raise
        # the same exception type with a message that names the real cause.
        raise ValueError("No hay textos para generar embeddings.")

    # Find where the model's weights live so the inputs can be sent to the
    # same device. Models without parameters are left untouched.
    try:
        dispositivo = next(model.parameters()).device
    except (AttributeError, StopIteration):
        dispositivo = None

    all_embeddings = []
    for inicio in range(0, len(textos), batch_size):
        lote = textos[inicio:inicio + batch_size]
        inputs = tokenizer(lote, return_tensors="pt", padding=True, truncation=True)
        if dispositivo is not None:
            inputs = {
                nombre: tensor.to(dispositivo)
                for nombre, tensor in inputs.items()
            }

        with torch.no_grad():
            last_hidden = model(**inputs).last_hidden_state
            # Broadcast the mask over the hidden dimension: 1.0 for real
            # tokens, 0.0 for padding.
            mask = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden.size()).float()
            summed = torch.sum(last_hidden * mask, 1)
            # Clamp so a row with no attended tokens cannot divide by zero.
            counted = torch.clamp(mask.sum(1), min=1e-9)
            mean_pooled = summed / counted

        # .cpu() is required before .numpy() when the tensor is not on the
        # CPU; the float32 cast matches the dtype FAISS indexes work with.
        all_embeddings.append(
            mean_pooled.cpu().numpy().astype(np.float32, copy=False)
        )

    return np.vstack(all_embeddings)
|
44 |
+
|
45 |
+
def crear_index_y_guardar(ruta=".", modelo_id="jinaai/jina-embeddings-v2-base-es", archivo_salida="index.pkl"):
    """Build a FAISS index over the PDFs in a directory and pickle it.

    Steps, in order: load the PDF texts with ``cargar_pdfs``, cut each text
    into pieces with ``chunk_texto``, embed the pieces with the Hugging Face
    tokenizer/model named by ``modelo_id``, add the vectors to a flat L2
    index, and pickle the ``(index, chunks)`` pair to ``archivo_salida``.
    Progress is reported with ``print``.

    Args:
        ruta: Directory passed to ``cargar_pdfs``.
        modelo_id: Identifier passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.
        archivo_salida: Path of the pickle file to write.

    Returns:
        The ``(index, chunks)`` tuple that was written to disk.

    Raises:
        ValueError: If no chunks were produced from the PDFs.
    """
    print("Cargando PDFs...")
    documentos = cargar_pdfs(ruta)

    print("Dividiendo en chunks...")
    chunks = [
        trozo
        for documento in documentos
        for trozo in chunk_texto(documento)
    ]
    if not chunks:
        raise ValueError("No se generaron chunks. Revisa tus PDFs.")
    print(f"Total de chunks generados: {len(chunks)}")

    print("Generando embeddings...")
    # NOTE(review): the model card for the default modelo_id appears to load
    # it with trust_remote_code=True — confirm whether that is needed here.
    # Arguments are evaluated left to right, so the tokenizer is still
    # loaded before the model.
    vectores = generar_embedding(
        chunks,
        AutoTokenizer.from_pretrained(modelo_id),
        AutoModel.from_pretrained(modelo_id),
        batch_size=32,
    )
    dimension = vectores.shape[1]
    print(f"Dimensión de embeddings: {dimension}")

    print("Creando índice FAISS...")
    index = faiss.IndexFlatL2(dimension)
    index.add(vectores)

    print(f"Guardando índice en: {archivo_salida}")
    # The index and the chunk texts are stored together; row i of the index
    # corresponds to chunks[i].
    with open(archivo_salida, "wb") as salida:
        pickle.dump((index, chunks), salida)

    print("Indexación completada.")
    return index, chunks
|
77 |
+
|
78 |
+
# Script entry point: when this file is run directly (not imported), build
# the index using the default arguments of crear_index_y_guardar.
if __name__ == "__main__":
    crear_index_y_guardar()
|