Spaces:

thejarll
/

TALLER_1_3

Running on Zero

App Files Files Community

thejarll committed 10 days ago

Commit

2af6f6f

verified ·

1 Parent(s): 1b7a077

Upload indexador.py

Browse files

Files changed (1) hide show

indexador.py +79 -0

indexador.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import os
+import fitz  # PyMuPDF
+import faiss
+import pickle
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
+import torch
def cargar_pdfs(ruta="."):
    """Extract the text of every PDF found directly inside ``ruta``.

    Files are visited in sorted name order so the returned list (and any
    index built from it) is reproducible across runs. The extension check is
    case-insensitive, so ``.PDF`` files are picked up as well. The scan is
    not recursive.

    Args:
        ruta: Directory to scan. Defaults to the current working directory.

    Returns:
        A list with one string per PDF that yielded text. Every run of
        whitespace (newlines, tabs, repeated spaces) is collapsed to a single
        space and the ends are trimmed. PDFs without extractable text are
        skipped.
    """
    textos = []
    for archivo in sorted(os.listdir(ruta)):
        if not archivo.lower().endswith(".pdf"):
            continue
        ruta_pdf = os.path.join(ruta, archivo)
        print(f"Procesando: {archivo}")
        doc = fitz.open(ruta_pdf)
        try:
            # Collect pages in a list and join once instead of repeated `+=`.
            paginas = [pagina.get_text() for pagina in doc]
        finally:
            # Release the file handle even if a page fails to extract.
            doc.close()
        # split()/join collapses *all* whitespace runs; a single
        # replace("  ", " ") pass would leave runs of 3+ spaces behind.
        texto = " ".join("".join(paginas).split())
        if texto:
            textos.append(texto)
    return textos
def chunk_texto(texto, longitud=800, solapamiento=0):
    """Split ``texto`` into consecutive character windows.

    Args:
        texto: String to split.
        longitud: Maximum number of characters per chunk. Must be positive.
        solapamiento: Number of characters shared between one chunk and the
            next. Must satisfy ``0 <= solapamiento < longitud``. The default
            of 0 yields back-to-back, non-overlapping chunks.

    Returns:
        A list of substrings covering ``texto`` from start to end. Every
        chunk has ``longitud`` characters except possibly the last one. An
        empty ``texto`` yields an empty list.

    Raises:
        ValueError: If ``longitud`` is not positive or ``solapamiento`` is
            outside ``[0, longitud)``.
    """
    if longitud <= 0:
        raise ValueError("longitud debe ser un entero positivo")
    if not 0 <= solapamiento < longitud:
        raise ValueError("solapamiento debe cumplir 0 <= solapamiento < longitud")

    paso = longitud - solapamiento
    total = len(texto)
    chunks = []
    for inicio in range(0, total, paso):
        chunks.append(texto[inicio:inicio + longitud])
        # Stop once a window reaches the end of the text; with overlap the
        # next window would only repeat characters already emitted.
        if inicio + longitud >= total:
            break
    return chunks
def generar_embedding(textos, tokenizer, model, batch_size=32):
    """Embed ``textos`` by attention-masked mean pooling of the model output.

    Args:
        textos: Sequence of strings to embed.
        tokenizer: Callable invoked as ``tokenizer(batch, return_tensors="pt",
            padding=True, truncation=True)``; its result must contain an
            ``"attention_mask"`` entry and be usable as ``model(**inputs)``.
        model: Callable whose output exposes ``last_hidden_state``. If it has
            a ``device`` attribute, the tokenized inputs are moved there.
        batch_size: Number of texts encoded per forward pass. Must be
            positive.

    Returns:
        A C-contiguous ``float32`` NumPy array with one row per input text.
        For empty input the result has zero rows; its width is taken from
        ``model.config.hidden_size`` when that attribute exists, else 0.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size debe ser un entero positivo")
    if len(textos) == 0:
        # np.vstack([]) raises, so return an explicit empty matrix instead.
        dim = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.zeros((0, dim), dtype=np.float32)

    device = getattr(model, "device", None)
    lotes = []
    with torch.no_grad():
        for inicio in range(0, len(textos), batch_size):
            batch = textos[inicio:inicio + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            # Keep inputs on the same device as the model (no-op on CPU).
            if device is not None and hasattr(inputs, "to"):
                inputs = inputs.to(device)
            outputs = model(**inputs)
            # Pool in float32 so reduced-precision models still produce
            # embeddings with a stable dtype.
            last_hidden = outputs.last_hidden_state.float()
            mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
            summed = (last_hidden * mask).sum(dim=1)
            # Clamp avoids division by zero for a row with no real tokens.
            counted = mask.sum(dim=1).clamp(min=1e-9)
            # .cpu() is required before .numpy() when tensors live on a GPU.
            lotes.append((summed / counted).cpu().numpy())
    return np.ascontiguousarray(np.vstack(lotes), dtype=np.float32)
def crear_index_y_guardar(ruta=".", modelo_id="jinaai/jina-embeddings-v2-base-es", archivo_salida="index.pkl"):
    """Index the PDFs in ``ruta`` with FAISS and pickle the result.

    Steps: load the PDF texts, split them into chunks, embed every chunk
    with the model identified by ``modelo_id``, add the embeddings to a flat
    L2 FAISS index, and write ``(index, chunks)`` to ``archivo_salida``.

    Args:
        ruta: Directory scanned for PDFs.
        modelo_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        archivo_salida: Path of the pickle file to write.

    Returns:
        The tuple ``(index, chunks)`` that was also written to disk.

    Raises:
        ValueError: If no chunk could be produced from the PDFs.
    """
    print("Cargando PDFs...")
    textos = cargar_pdfs(ruta)
    print("Dividiendo en chunks...")
    chunks = [chunk for texto in textos for chunk in chunk_texto(texto)]
    if not chunks:
        raise ValueError("No se generaron chunks. Revisa tus PDFs.")
    print(f"Total de chunks generados: {len(chunks)}")
    print("Generando embeddings...")
    # NOTE(review): some embedding checkpoints ship custom model code and are
    # documented to be loaded with trust_remote_code=True — confirm whether
    # the default modelo_id needs it before relying on these embeddings.
    tokenizer = AutoTokenizer.from_pretrained(modelo_id)
    model = AutoModel.from_pretrained(modelo_id)
    # Make inference mode explicit so dropout can never affect embeddings.
    model.eval()
    embeddings = generar_embedding(chunks, tokenizer, model, batch_size=32)
    # FAISS operates on C-contiguous float32 matrices; enforce that here
    # rather than depending on the model's output dtype.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    print(f"Dimensión de embeddings: {embeddings.shape[1]}")
    print("Creando índice FAISS...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    print(f"Guardando índice en: {archivo_salida}")
    # NOTE(review): this relies on the installed faiss build supporting
    # pickling of index objects — confirm. Whoever loads this file must only
    # unpickle it from a trusted source.
    with open(archivo_salida, "wb") as f:
        pickle.dump((index, chunks), f)
    print("Indexación completada.")
    return index, chunks
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    crear_index_y_guardar()  # defaults: PDFs in the current directory, output written to index.pkl