adarsh-maurya committed on
Commit
5402334
·
verified ·
1 Parent(s): 0dae560

Update Ingest.py

Browse files
Files changed (1) hide show
  1. Ingest.py +35 -25
Ingest.py CHANGED
@@ -6,59 +6,69 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
 
9
- # Initialize Ray
10
- ray.init()
11
 
12
- # Set up basic configuration for logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
 
15
- # Directory where the FAISS index is saved
16
  index_directory = 'ipc_embed_db'
17
  index_path_faiss = os.path.join(index_directory, 'index.faiss')
18
  index_path_pkl = os.path.join(index_directory, 'index.pkl')
19
 
20
- # Ensure the index directory exists
21
  os.makedirs(index_directory, exist_ok=True)
22
 
23
  # Load documents
24
- logging.info("Loading documents...")
25
- loader = DirectoryLoader('data', glob="./*.txt")
26
  documents = loader.load()
27
 
28
- # Split documents into manageable chunks
29
- logging.info("Splitting documents into chunks...")
 
 
 
 
 
 
30
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
31
  texts = text_splitter.split_documents(documents)
32
 
33
- # Load embedding model once
34
- logging.info("Loading embedding model...")
35
  embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
36
 
37
- # Function to create and save FAISS index
38
  def create_faiss_index():
39
- logging.info("Creating new FAISS index from documents...")
40
  faiss_db = FAISS.from_documents(texts, embeddings)
41
  faiss_db.save_local(index_directory)
42
- logging.info("FAISS index created and saved.")
43
  return faiss_db
44
 
45
- # Function to load or create FAISS index
46
  def load_or_create_faiss_index():
47
  if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
48
- logging.info("Loading existing FAISS index...")
49
- faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
50
- logging.info("FAISS index loaded successfully.")
51
- return faiss_db
 
 
 
52
  else:
53
- logging.info("FAISS index not found. Creating a new one...")
54
- return create_faiss_index()
 
55
 
56
- # Load or create the index
57
  faiss_db = load_or_create_faiss_index()
58
 
59
- # Optional: If you want to use the retriever later
60
  # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
61
 
62
- # Shutdown Ray after the process
63
  ray.shutdown()
64
- logging.info("Process completed successfully.")
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
 
9
+ # Initialize Ray (safe even if already running)
10
+ ray.init(ignore_reinit_error=True)
11
 
12
+ # Logging setup
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
 
15
+ # Define FAISS index paths
16
  index_directory = 'ipc_embed_db'
17
  index_path_faiss = os.path.join(index_directory, 'index.faiss')
18
  index_path_pkl = os.path.join(index_directory, 'index.pkl')
19
 
20
+ # Ensure index directory exists
21
  os.makedirs(index_directory, exist_ok=True)
22
 
23
  # Load documents
24
+ logging.info("πŸ“ Loading legal documents from 'data/' directory...")
25
+ loader = DirectoryLoader('data', glob="**/*.txt") # Recursively load .txt files
26
  documents = loader.load()
27
 
28
+ # Check if any documents were found
29
+ if not documents:
30
+ logging.error("❌ No documents found in 'data/'. Please add .txt files to proceed.")
31
+ ray.shutdown()
32
+ exit()
33
+
34
+ # Split documents into chunks
35
+ logging.info("βœ‚οΈ Splitting documents for embedding...")
36
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
37
  texts = text_splitter.split_documents(documents)
38
 
39
+ # Load the InLegalBERT embedding model
40
+ logging.info("πŸ“¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
41
  embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
42
 
43
+ # Create and save the FAISS index
44
  def create_faiss_index():
45
+ logging.info("βš™οΈ Creating new FAISS index...")
46
  faiss_db = FAISS.from_documents(texts, embeddings)
47
  faiss_db.save_local(index_directory)
48
+ logging.info("βœ… FAISS index saved in '%s'.", index_directory)
49
  return faiss_db
50
 
51
+ # Load existing index or create if missing
52
  def load_or_create_faiss_index():
53
  if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
54
+ logging.info("πŸ“‚ Loading existing FAISS index...")
55
+ try:
56
+ faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
57
+ logging.info("βœ… FAISS index loaded successfully.")
58
+ return faiss_db
59
+ except Exception as e:
60
+ logging.warning("⚠️ Failed to load existing index. Recreating... (%s)", str(e))
61
  else:
62
+ logging.info("❌ FAISS index files not found. Creating new index...")
63
+
64
+ return create_faiss_index()
65
 
66
+ # Build the index
67
  faiss_db = load_or_create_faiss_index()
68
 
69
+ # Optional: if you want to use the retriever later
70
  # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
71
 
72
+ # Shut down Ray
73
  ray.shutdown()
74
+ logging.info("βœ… Indexing process completed successfully.")