Spaces:
Build error
Build error
from huggingface_hub import InferenceClient | |
from config import BASE_MODEL, MY_MODEL, HF_TOKEN | |
import pandas as pd | |
import json | |
import re | |
from difflib import get_close_matches | |
class SchoolChatbot:
    """Boston School Chatbot combining structured school data with model completion.

    The bot answers in one of three ways, decided in ``get_response``:

    * the question names a known school -> facts for that school are looked
      up in the data frame and handed to the model as context;
    * no school is named but a known feature keyword is -> a list of the
      schools whose column value looks positive (or not positive) is
      returned directly, without calling the model;
    * otherwise -> the plain prompt is sent to the model.
    """

    # Fuzzy-match threshold used when mapping query tokens onto keywords.
    _KEYWORD_CUTOFF = 0.85

    # Cell values are classified with whole-word patterns so that
    # "inaccessible" / "inadequate" are not mistaken for their positive
    # counterparts, and "not accessible" overrides the "accessible" hit.
    _POSITIVE_PATTERN = r"\b(?:yes|accessible|adequate|good|excellent|present)\b"
    _NEGATIVE_PATTERN = r"\b(?:no|not accessible|inadequate|poor|bad|limited)\b"

    # Query words that flip a feature search to "schools WITHOUT the feature".
    _NEGATION_TOKENS = frozenset({"not", "inaccessible", "bad", "poor", "lacking"})

    # Columns whose values are accepted as names for a school, in priority order.
    _NAME_COLUMNS = ("BPS_School_Name", "BPS_Historical_Name", "SMMA_Abbreviated_Name")

    # Hand-maintained aliases; these override anything derived from the CSV.
    _MANUAL_ALIASES = {
        "acc": "Another Course to College*",
        "baldwin": "Baldwin Early Learning Pilot Academy",
        "adams elementary": "Adams, Samuel Elementary",
        "alighieri montessori": "Alighieri, Dante Montessori School",
        "phineas bates": "Bates, Phineas Elementary",
    }

    def __init__(self, data_path="bps_data.csv",
                 keyword_map_path="cleaned_keyword_to_column_map.json"):
        """Create the inference client and load the school data.

        Args:
            data_path: CSV file with one row per school.
            keyword_map_path: JSON file mapping a keyword to a CSV column name.
        """
        model_id = MY_MODEL or BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv(data_path)
        with open(keyword_map_path, encoding="utf-8") as f:
            self.keyword_map = json.load(f)
        self.school_name_map = self._build_school_name_map(self.df)

    @classmethod
    def _build_school_name_map(cls, df):
        """Return a dict mapping every lower-cased name variant to the primary name."""
        name_map = {}
        for _, row in df.iterrows():
            primary = row.get("BPS_School_Name")
            if pd.isna(primary):
                # Without a primary name there is nothing to map the
                # variants to; mapping them to NaN would poison lookups.
                continue
            for column in cls._NAME_COLUMNS:
                variant = row.get(column)
                if pd.notna(variant):
                    name_map[str(variant).lower()] = primary
        name_map.update(cls._MANUAL_ALIASES)
        return name_map

    def _match_keywords(self, text):
        """Return the set of known keywords that fuzzily match a word in ``text``."""
        candidates = list(self.keyword_map)
        matched = set()
        for token in re.findall(r'\b\w+\b', text.lower()):
            matched.update(
                get_close_matches(token, candidates, cutoff=self._KEYWORD_CUTOFF)
            )
        return matched

    def format_prompt(self, user_input):
        """Wrap ``user_input`` in the chat template used when no context is available."""
        return (
            "<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
            f"<|user|>{user_input}<|end|>\n"
            "<|assistant|>"
        )

    def match_school_name(self, query):
        """Return the primary name of the school mentioned in ``query``, or ``None``.

        Aliases are matched as whole words (so the alias "acc" does not fire
        inside "accessible") and the longest alias is tried first (so
        "boston latin academy" wins over "boston latin").
        """
        query_lc = query.lower()
        for alias in sorted(self.school_name_map, key=len, reverse=True):
            if not alias:
                continue  # an empty alias would match every query
            pattern = rf"(?<!\w){re.escape(alias)}(?!\w)"
            if re.search(pattern, query_lc):
                return self.school_name_map[alias]
        return None

    def extract_context_with_keywords(self, prompt, school_name=None):
        """Return fact sentences about ``school_name`` for keywords found in ``prompt``.

        Args:
            prompt: The user's question; scanned for known keywords.
            school_name: Name of the school to look up. An exact
                (case-insensitive) match on ``BPS_School_Name`` is preferred;
                failing that, the first row whose name contains it literally.

        Returns:
            A list of sentences, sorted by keyword. Empty when no school is
            given, the school is not found, or no keyword has a value.
        """
        if not school_name:
            # Without a school there is no meaningful row to describe.
            return []

        names = self.df["BPS_School_Name"]
        known = names.notna()
        names_lc = names.astype(str).str.lower()
        target = str(school_name).lower()

        rows = self.df[known & (names_lc == target)]
        if rows.empty:
            # regex=False: real school names contain '*', '(' and similar.
            rows = self.df[known & names_lc.str.contains(target, regex=False)]
        if rows.empty:
            return []
        row = rows.iloc[0]

        context_items = []
        for kw in sorted(self._match_keywords(prompt)):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str(): cell values may be numeric.
                context_items.append(f"The school's {kw} is {str(val).lower()}.")
        return context_items

    def query_schools_by_feature(self, query):
        """Return a bullet list of schools matching the feature named in ``query``.

        A school matches when the keyword's column holds a positive value, or,
        if the query contains a negation word, a recorded value that is not
        positive. Schools with no recorded value are never listed.

        Returns:
            The formatted list as a string, or ``None`` when nothing matches.
        """
        tokens = set(re.findall(r'\b\w+\b', query.lower()))
        # Whole-token test: a substring test would see "not" in "another".
        inverse = not tokens.isdisjoint(self._NEGATION_TOKENS)

        matching_schools = set()
        for keyword in self._match_keywords(query):
            col = self.keyword_map.get(keyword)
            if not col or col not in self.df.columns:
                continue
            values = self.df[col]
            text = values.astype(str).str.lower()
            is_positive = (
                text.str.contains(self._POSITIVE_PATTERN)
                & ~text.str.contains(self._NEGATIVE_PATTERN)
            )
            mask = (values.notna() & ~is_positive) if inverse else is_positive
            schools = self.df.loc[mask, "BPS_School_Name"].dropna().unique().tolist()
            matching_schools.update(schools)

        if not matching_schools:
            return None
        return (
            "The following schools match your criteria:\n" +
            "\n".join(f"- {s}" for s in sorted(matching_schools))
        )

    def get_response(self, user_input):
        """Answer ``user_input`` and return the reply text.

        A question that names a school is answered about that school; the
        school-wide feature filter only runs when no school is named, so
        that "Is Baldwin accessible?" is not answered with a list of every
        accessible school.
        """
        matched_school = self.match_school_name(user_input)

        structured_facts = []
        if matched_school is None:
            # School-wide filter query
            school_filter = self.query_schools_by_feature(user_input)
            if school_filter:
                return school_filter
        else:
            # Per-school context query
            structured_facts = self.extract_context_with_keywords(
                user_input, matched_school
            )

        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
                + "\n".join(f"- {fact}" for fact in structured_facts)
            )
            prompt = (
                "<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
                "Use any known facts about the school to answer helpfully.<|end|>\n"
                f"<|user|>{user_input}<|end|>\n"
                f"<|context|>{natural_context}<|end|>\n"
                "<|assistant|>"
            )
        else:
            prompt = self.format_prompt(user_input)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"]
        )
        return response.strip()