Spaces:

ellawang9
/

bps-school-chatbot

Build error

App Files Files Community

bps-school-chatbot / src /chat.py

ellawang9

Update src/chat.py

2b3dacc verified about 1 month ago

raw

history blame

5.52 kB


	from huggingface_hub import InferenceClient
	from config import BASE_MODEL, MY_MODEL, HF_TOKEN
	import pandas as pd
	import json
	import re
	from difflib import get_close_matches

	class SchoolChatbot:
	"""Boston School Chatbot integrating structured data, vector context, and model completion."""

	def __init__(self):
	model_id = MY_MODEL if MY_MODEL else BASE_MODEL
	self.client = InferenceClient(model=model_id, token=HF_TOKEN)
	self.df = pd.read_csv("bps_data.csv")
	with open("cleaned_keyword_to_column_map.json") as f:
	self.keyword_map = json.load(f)

	# Build name variants for school matching
	self.school_name_map = {}
	for _, row in self.df.iterrows():
	primary = row.get("BPS_School_Name")
	hist = row.get("BPS_Historical_Name")
	abbrev = row.get("SMMA_Abbreviated_Name")
	if pd.notna(primary):
	self.school_name_map[primary.lower()] = primary
	if pd.notna(hist):
	self.school_name_map[hist.lower()] = primary
	if pd.notna(abbrev):
	self.school_name_map[abbrev.lower()] = primary

	self.school_name_map.update({
	"acc": "Another Course to College*",
	"baldwin": "Baldwin Early Learning Pilot Academy",
	"adams elementary": "Adams, Samuel Elementary",
	"alighieri montessori": "Alighieri, Dante Montessori School",
	"phineas bates": "Bates, Phineas Elementary",
	})

	def format_prompt(self, user_input):
	return (
	"<\|system\|>You are a helpful assistant that specializes in Boston public school enrollment.<\|end\|>\n"
	f"<\|user\|>{user_input}<\|end\|>\n"
	"<\|assistant\|>"
	)

	def match_school_name(self, query):
	for key in self.school_name_map:
	if key in query.lower():
	return self.school_name_map[key]
	return None

	def extract_context_with_keywords(self, prompt, school_name=None):
	def extract_keywords(text):
	tokens = re.findall(r'\b\w+\b', text.lower())
	matched = set()
	for token in tokens:
	matched.update(get_close_matches(token, self.keyword_map.keys(), cutoff=0.85))
	return matched

	matched_keywords = extract_keywords(prompt)
	df_filtered = self.df
	if school_name:
	df_filtered = self.df[self.df["BPS_School_Name"].str.contains(school_name, case=False, na=False)]
	if df_filtered.empty:
	return []

	row = df_filtered.iloc[0]
	context_items = []
	for kw in matched_keywords:
	col = self.keyword_map.get(kw)
	val = row.get(col) if col else None
	if col and pd.notna(val):
	context_items.append(f"The school's {kw} is {val.lower()}.")
	return context_items

	def query_schools_by_feature(self, query):
	tokens = re.findall(r'\b\w+\b', query.lower())
	matched_keywords = set()
	for token in tokens:
	matched_keywords.update(get_close_matches(token, self.keyword_map.keys(), cutoff=0.85))

	positive_terms = "yes\|accessible\|adequate\|good\|excellent\|present"
	negative_terms = "no\|not accessible\|inadequate\|poor\|bad\|limited"

	matching_schools = set()
	inverse = any(t in query.lower() for t in ["not", "inaccessible", "bad", "poor", "lacking"])

	for keyword in matched_keywords:
	col = self.keyword_map.get(keyword)
	if col and col in self.df.columns:
	if inverse:
	subset = self.df[~self.df[col].astype(str).str.lower().str.contains(positive_terms, na=False)]
	else:
	subset = self.df[self.df[col].astype(str).str.lower().str.contains(positive_terms, na=False)]
	schools = subset["BPS_School_Name"].dropna().unique().tolist()
	matching_schools.update(schools)

	if not matching_schools:
	return None
	return (
	"The following schools match your criteria:\n" +
	"\n".join(f"- {s}" for s in sorted(matching_schools))
	)

	def get_response(self, user_input):
	# School-wide filter query
	school_filter = self.query_schools_by_feature(user_input)
	if school_filter:
	return school_filter

	# Per-school context query
	matched_school = self.match_school_name(user_input)
	structured_facts = self.extract_context_with_keywords(user_input, matched_school)

	if structured_facts:
	natural_context = (
	f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
	+ "\n".join(f"- {fact}" for fact in structured_facts)
	)
	prompt = (
	"<\|system\|>You are a helpful assistant that specializes in Boston public school enrollment. "
	"Use any known facts about the school to answer helpfully.<\|end\|>\n"
	f"<\|user\|>{user_input}<\|end\|>\n"
	f"<\|context\|>{natural_context}<\|end\|>\n"
	"<\|assistant\|>"
	)
	else:
	prompt = self.format_prompt(user_input)

	response = self.client.text_generation(
	prompt,
	max_new_tokens=512,
	temperature=0.7,
	top_p=0.9,
	stop_sequences=["<\|end\|>"]
	)
	return response.strip()