# ellawang9's picture
# Update src/chat.py
# 45931f5 verified
from huggingface_hub import InferenceClient
from config import BASE_MODEL, MY_MODEL, HF_TOKEN
import pandas as pd
import json
import re
from difflib import get_close_matches
class SchoolChatbot:
    """Boston School Chatbot integrating structured data, vector context, and model completion.

    The bot answers a question in one of three ways (see ``get_response``):

    * a question that names a school is answered by the language model, with
      facts looked up in ``self.df`` for that school added to the prompt;
    * a question that names no school but mentions a known keyword is answered
      directly with the list of schools whose value for that keyword's column
      looks positive (or, for a negated question, not positive);
    * anything else is sent to the language model as a plain prompt.

    Attributes set by ``__init__``:
        client: ``InferenceClient`` used for text generation.
        df: DataFrame loaded from ``bps_data.csv``.
        keyword_map: dict loaded from ``keyword_to_column_map.json``; its keys
            are the keywords matched against user text and its values are
            column names of ``df``.
        school_name_map: lower-cased name variant -> primary school name.
    """

    # Columns whose values are registered as recognisable names of a school.
    _NAME_COLUMNS = ("BPS_School_Name", "BPS_Historical_Name", "SMMA_Abbreviated_Name")

    # Cell values are classified with whole-word patterns so that, e.g.,
    # "inaccessible" is not mistaken for "accessible".
    _POSITIVE_PATTERN = r"\b(?:yes|accessible|adequate|good|excellent|present)\b"
    # A value that also matches this pattern is never positive ("not accessible").
    _NEGATIVE_PATTERN = r"\b(?:no|not|inadequate|poor|bad|limited)\b"

    # Query words that flip a school-wide search to "schools WITHOUT the feature".
    _INVERSE_TOKENS = frozenset({"not", "inaccessible", "bad", "poor", "lacking"})

    def __init__(self):
        """Create the inference client and load the data files.

        Reads ``bps_data.csv`` and ``keyword_to_column_map.json`` relative to
        the current working directory.
        """
        model_id = MY_MODEL or BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv("bps_data.csv")
        with open("keyword_to_column_map.json", encoding="utf-8") as f:
            self.keyword_map = json.load(f)
        self.school_name_map = self._build_school_name_map(self.df)

    @classmethod
    def _build_school_name_map(cls, df):
        """Return a dict of lower-cased name variants -> primary school name."""
        name_map = {}
        for _, row in df.iterrows():
            primary = row.get("BPS_School_Name")
            if pd.isna(primary):
                # Without a primary name there is nothing valid to map the
                # historical/abbreviated variants to.
                continue
            for column in cls._NAME_COLUMNS:
                variant = row.get(column)
                if pd.isna(variant):
                    continue
                key = str(variant).strip().lower()
                if key:
                    name_map[key] = primary
        # Hand-written aliases; applied last so they override derived entries.
        name_map.update({
            "acc": "Another Course to College*",
            "baldwin": "Baldwin Early Learning Pilot Academy",
            "adams elementary": "Adams, Samuel Elementary",
            "alighieri montessori": "Alighieri, Dante Montessori School",
            "phineas bates": "Bates, Phineas Elementary",
        })
        return name_map

    def _match_keywords(self, text):
        """Return the set of ``keyword_map`` keys fuzzily matched by a word of *text*."""
        vocabulary = list(self.keyword_map)
        matched = set()
        for token in re.findall(r"\b\w+\b", text.lower()):
            matched.update(get_close_matches(token, vocabulary, cutoff=0.85))
        return matched

    def format_prompt(self, user_input):
        """Return the plain (context-free) chat prompt for *user_input*."""
        return (
            "<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
            f"<|user|>{user_input}<|end|>\n"
            "<|assistant|>"
        )

    def match_school_name(self, query):
        """Return the primary name of the school mentioned in *query*, or None.

        A name variant only matches as a whole word/phrase, so a short alias
        such as "acc" is not triggered by "accessible". Longer variants are
        tried first so the most specific name wins.
        """
        text = query.lower()
        for key in sorted(self.school_name_map, key=len, reverse=True):
            if re.search(rf"(?<!\w){re.escape(key)}(?!\w)", text):
                return self.school_name_map[key]
        return None

    def extract_context_with_keywords(self, prompt, school_name=None):
        """Return fact sentences about *school_name* for keywords found in *prompt*.

        Args:
            prompt: user text scanned for keywords of ``keyword_map``.
            school_name: school to look up in the ``BPS_School_Name`` column.

        Returns:
            A list of strings ``"The school's <keyword> is <value>."``, ordered
            by keyword. The list is empty when no school is given or found, or
            when no keyword has a non-missing value for that school.
        """
        if not school_name:
            # Without a school there is no row the facts could truthfully
            # describe; never fall back to an arbitrary row of the table.
            return []

        names = self.df["BPS_School_Name"]
        # Prefer an exact (case-insensitive) name match; fall back to a literal
        # substring match. regex=False keeps characters such as "(" or "*" in
        # school names from being interpreted as regex syntax.
        rows = self.df[names.str.lower() == school_name.lower()]
        if rows.empty:
            rows = self.df[names.str.contains(school_name, case=False, na=False, regex=False)]
        if rows.empty:
            return []

        row = rows.iloc[0]
        context_items = []
        # sorted() makes the fact order independent of set iteration order.
        for kw in sorted(self._match_keywords(prompt)):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str() first: cell values may be numbers, not strings.
                context_items.append(f"The school's {kw} is {str(val).lower()}.")
        return context_items

    def query_schools_by_feature(self, query):
        """Return a bulleted list of schools matching the feature asked about.

        Every keyword found in *query* selects a column. A cell counts as
        positive when it contains a positive term and no negative term. A
        normal query collects schools with a positive cell; a negated query
        (one containing a word of ``_INVERSE_TOKENS``) collects schools whose
        cell is filled in but not positive. Missing cells never match.

        Returns:
            The formatted list as a string, or None when nothing matches.
        """
        tokens = set(re.findall(r"\b\w+\b", query.lower()))
        # Whole-token test: "another" must not count as the negation "not".
        inverse = not tokens.isdisjoint(self._INVERSE_TOKENS)

        matching_schools = set()
        for keyword in self._match_keywords(query):
            col = self.keyword_map.get(keyword)
            if not col or col not in self.df.columns:
                continue
            column = self.df[col]
            values = column.astype(str).str.lower()
            has_positive = values.str.contains(self._POSITIVE_PATTERN, regex=True, na=False)
            has_negative = values.str.contains(self._NEGATIVE_PATTERN, regex=True, na=False)
            is_positive = has_positive & ~has_negative
            wanted = ~is_positive if inverse else is_positive
            subset = self.df[column.notna() & wanted]
            matching_schools.update(subset["BPS_School_Name"].dropna().unique().tolist())

        if not matching_schools:
            return None
        return (
            "The following schools match your criteria:\n" +
            "\n".join(f"- {s}" for s in sorted(matching_schools))
        )

    def get_response(self, user_input):
        """Answer *user_input* and return the reply text.

        A question that names a school goes to the model (with structured
        facts about that school when available). Only questions that name no
        school are tried as a school-wide filter, so a question about one
        school is not answered with a list of all schools.
        """
        matched_school = self.match_school_name(user_input)

        # School-wide filter query
        if matched_school is None:
            school_filter = self.query_schools_by_feature(user_input)
            if school_filter:
                return school_filter

        # Per-school context query
        structured_facts = self.extract_context_with_keywords(user_input, matched_school)
        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
                + "\n".join(f"- {fact}" for fact in structured_facts)
            )
            prompt = (
                "<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
                "Use any known facts about the school to answer helpfully.<|end|>\n"
                f"<|user|>{user_input}<|end|>\n"
                f"<|context|>{natural_context}<|end|>\n"
                "<|assistant|>"
            )
        else:
            prompt = self.format_prompt(user_input)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"]
        )
        return response.strip()