import json
import re
from difflib import get_close_matches

import pandas as pd
from huggingface_hub import InferenceClient

from config import BASE_MODEL, MY_MODEL, HF_TOKEN


class SchoolChatbot:
    """Boston School Chatbot integrating structured data, vector context, and model completion.

    Answers are produced in one of three ways (see ``get_response``):
    a list of schools filtered by a feature, a model completion grounded in
    facts looked up for a named school, or a plain model completion.
    """

    # Columns of bps_data.csv whose values are registered as school aliases.
    _NAME_COLUMNS = ("BPS_School_Name", "BPS_Historical_Name", "SMMA_Abbreviated_Name")

    # Whole-word patterns applied to lower-cased cell values. The negative
    # pattern exists so that "not accessible" is not counted as positive just
    # because it contains the word "accessible".
    _POSITIVE_PATTERN = r"\b(?:yes|accessible|adequate|good|excellent|present)\b"
    _NEGATIVE_PATTERN = r"\b(?:no|not accessible|inadequate|poor|bad|limited)\b"

    # Query words that flip a feature search to "schools WITHOUT the feature".
    _INVERSE_TOKENS = frozenset({"not", "inaccessible", "bad", "poor", "lacking"})

    def __init__(self):
        """Create the inference client and load the school data and keyword map.

        Reads ``bps_data.csv`` and ``cleaned_keyword_to_column_map.json`` from
        the current working directory.
        """
        model_id = MY_MODEL or BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv("bps_data.csv")

        with open("cleaned_keyword_to_column_map.json", encoding="utf-8") as f:
            self.keyword_map = json.load(f)

        # Build name variants for school matching: lower-cased alias -> primary name.
        self.school_name_map = {}
        for _, row in self.df.iterrows():
            primary = row.get("BPS_School_Name")
            if pd.isna(primary):
                # Without a primary name there is nothing valid to map aliases to.
                continue
            for column in self._NAME_COLUMNS:
                alias = row.get(column)
                if pd.notna(alias):
                    key = str(alias).strip().lower()
                    if key:  # an empty alias would match every query
                        self.school_name_map[key] = primary

        # Hand-maintained shorthand aliases.
        self.school_name_map.update({
            "acc": "Another Course to College*",
            "baldwin": "Baldwin Early Learning Pilot Academy",
            "adams elementary": "Adams, Samuel Elementary",
            "alighieri montessori": "Alighieri, Dante Montessori School",
            "phineas bates": "Bates, Phineas Elementary",
        })

    def format_prompt(self, user_input):
        """Return the chat-formatted prompt for *user_input* with no extra context."""
        return (
            "<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
            f"<|user|>{user_input}<|end|>\n"
            "<|assistant|>"
        )

    @staticmethod
    def _tokenize(text):
        """Return the lower-cased word tokens of *text*."""
        return re.findall(r'\b\w+\b', text.lower())

    def _match_keywords(self, text):
        """Return the set of ``keyword_map`` keys that closely match a word in *text*."""
        matched = set()
        for token in self._tokenize(text):
            matched.update(get_close_matches(token, self.keyword_map.keys(), cutoff=0.85))
        return matched

    def match_school_name(self, query):
        """Return the primary school name whose alias appears in *query*, else ``None``.

        Aliases are matched case-insensitively as whole words, so a short
        alias such as "acc" does not fire inside "accessible". Longer aliases
        are tried first so the most specific name wins.
        """
        lowered = query.lower()
        for key in sorted(self.school_name_map, key=len, reverse=True):
            if not key:
                continue
            # Lookarounds instead of \b so aliases that start or end with
            # punctuation (e.g. a trailing "*") can still match.
            if re.search(rf"(?<!\w){re.escape(key)}(?!\w)", lowered):
                return self.school_name_map[key]
        return None

    def extract_context_with_keywords(self, prompt, school_name=None):
        """Return fact sentences about *school_name* for keywords found in *prompt*.

        Args:
            prompt: The user's question; scanned for keywords from ``keyword_map``.
            school_name: Primary school name to look up. When omitted there is
                no school to describe, so an empty list is returned rather
                than facts about an arbitrary row.

        Returns:
            A list of sentences of the form "The school's <keyword> is <value>.",
            ordered by keyword; empty when the school or keywords are not found.
        """
        if not school_name:
            return []

        names = self.df["BPS_School_Name"]
        rows = self.df[names == school_name]
        if rows.empty:
            # Fall back to a literal (non-regex) substring match; school names
            # may contain regex metacharacters such as "*" or "(".
            rows = self.df[
                names.astype(str).str.contains(str(school_name), case=False, na=False, regex=False)
            ]
        if rows.empty:
            return []

        row = rows.iloc[0]
        context_items = []
        # Sorted so the generated context is stable from run to run.
        for kw in sorted(self._match_keywords(prompt)):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str() because cells may hold numbers, which have no .lower().
                context_items.append(f"The school's {kw} is {str(val).lower()}.")
        return context_items

    def query_schools_by_feature(self, query):
        """Return a bulleted list of schools having (or lacking) the queried features.

        Keywords in *query* are mapped to data columns via ``keyword_map``. A
        cell counts as positive when it contains a positive term as a whole
        word and no negative term. If the query contains a negation word, the
        schools with a recorded, non-positive value are returned instead.

        Returns:
            The formatted list as a string, or ``None`` when nothing matches.
        """
        tokens = self._tokenize(query)
        # Whole-token test: a substring test would see "not" inside "another".
        inverse = any(token in self._INVERSE_TOKENS for token in tokens)

        matching_schools = set()
        for keyword in self._match_keywords(query):
            col = self.keyword_map.get(keyword)
            if not col or col not in self.df.columns:
                continue
            column = self.df[col]
            values = column.astype(str).str.lower()
            is_positive = (
                values.str.contains(self._POSITIVE_PATTERN, na=False)
                & ~values.str.contains(self._NEGATIVE_PATTERN, na=False)
            )
            if inverse:
                # Missing data is unknown, not evidence that the feature is absent.
                mask = column.notna() & ~is_positive
            else:
                mask = is_positive
            schools = self.df.loc[mask, "BPS_School_Name"].dropna().unique().tolist()
            matching_schools.update(schools)

        if not matching_schools:
            return None

        return (
            "The following schools match your criteria:\n" +
            "\n".join(f"- {s}" for s in sorted(matching_schools))
        )

    def get_response(self, user_input):
        """Answer *user_input* and return the reply text.

        If the question names a known school, facts for that school are looked
        up and supplied to the model as context. Otherwise a school-wide
        feature filter is tried and, if it matches, its list is returned
        directly. When neither applies the model is queried with the plain
        prompt.
        """
        matched_school = self.match_school_name(user_input)

        # School-wide filter query: only when no specific school is named,
        # otherwise a question about one school would be answered with a list.
        if matched_school is None:
            school_filter = self.query_schools_by_feature(user_input)
            if school_filter:
                return school_filter

        # Per-school context query
        structured_facts = self.extract_context_with_keywords(user_input, matched_school)

        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school or 'a Boston public school'}:\n" +
                "\n".join(f"- {fact}" for fact in structured_facts)
            )
            prompt = (
                "<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
                "Use any known facts about the school to answer helpfully.<|end|>\n"
                f"<|user|>{user_input}<|end|>\n"
                f"<|context|>{natural_context}<|end|>\n"
                "<|assistant|>"
            )
        else:
            prompt = self.format_prompt(user_input)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"]
        )
        return response.strip()