Spaces:

adarsh-maurya
/

ApnaLawyer

Running

App Files Community

adarsh-maurya committed 8 days ago

Commit

5402334

verified ·

1 Parent(s): 0dae560

Update Ingest.py

Browse files

Files changed (1) hide show

Ingest.py +35 -25

Ingest.py CHANGED Viewed

@@ -6,59 +6,69 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-# Initialize Ray
-ray.init()
-# Set up basic configuration for logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-# Directory where the FAISS index is saved
 index_directory = 'ipc_embed_db'
 index_path_faiss = os.path.join(index_directory, 'index.faiss')
 index_path_pkl = os.path.join(index_directory, 'index.pkl')
-# Ensure the index directory exists
 os.makedirs(index_directory, exist_ok=True)
 # Load documents
-logging.info("Loading documents...")
-loader = DirectoryLoader('data', glob="./*.txt")
 documents = loader.load()
-# Split documents into manageable chunks
-logging.info("Splitting documents into chunks...")
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = text_splitter.split_documents(documents)
-# Load embedding model once
-logging.info("Loading embedding model...")
 embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
-# Function to create and save FAISS index
 def create_faiss_index():
-    logging.info("Creating new FAISS index from documents...")
     faiss_db = FAISS.from_documents(texts, embeddings)
     faiss_db.save_local(index_directory)
-    logging.info("FAISS index created and saved.")
     return faiss_db
-# Function to load or create FAISS index
 def load_or_create_faiss_index():
     if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
-        logging.info("Loading existing FAISS index...")
-        faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
-        logging.info("FAISS index loaded successfully.")
-        return faiss_db
     else:
-        logging.info("FAISS index not found. Creating a new one...")
-        return create_faiss_index()
-# Load or create the index
 faiss_db = load_or_create_faiss_index()
-# Optional: If you want to use the retriever later
 # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
-# Shutdown Ray after the process
 ray.shutdown()
-logging.info("Process completed successfully.")

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+# Initialize Ray (safe even if already running)
+ray.init(ignore_reinit_error=True)
+# Logging setup
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Define FAISS index paths
 index_directory = 'ipc_embed_db'
 index_path_faiss = os.path.join(index_directory, 'index.faiss')
 index_path_pkl = os.path.join(index_directory, 'index.pkl')
+# Ensure index directory exists
 os.makedirs(index_directory, exist_ok=True)
 # Load documents
+logging.info("📁 Loading legal documents from 'data/' directory...")
+loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
 documents = loader.load()
+# Abort with a non-zero exit status when the loader returned no documents
+if not documents:
+    logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
+    ray.shutdown()
+    raise SystemExit(1)  # not exit(): that is the interactive site helper and would report status 0
+# Split documents into chunks
+logging.info("✂️ Splitting documents for embedding...")
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = text_splitter.split_documents(documents)
+# Load the InLegalBERT embedding model
+logging.info("📦 Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
 embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+# Create and save the FAISS index
 def create_faiss_index():
+    logging.info("⚙️ Creating new FAISS index...")
     faiss_db = FAISS.from_documents(texts, embeddings)
     faiss_db.save_local(index_directory)
+    logging.info("✅ FAISS index saved in '%s'.", index_directory)
     return faiss_db
+# Load existing index or create if missing
 def load_or_create_faiss_index():
     if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
+        logging.info("📂 Loading existing FAISS index...")
+        try:
+            faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
+            logging.info("✅ FAISS index loaded successfully.")
+            return faiss_db
+        except Exception as e:
+            logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
     else:
+        logging.info("❌ FAISS index files not found. Creating new index...")
+    return create_faiss_index()
+# Build the index
 faiss_db = load_or_create_faiss_index()
+# Optional: if you want to use the retriever later
 # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
+# Shut down Ray
 ray.shutdown()
+logging.info("✅ Indexing process completed successfully.")