import json
import re
from difflib import get_close_matches

import pandas as pd
from huggingface_hub import InferenceClient

from config import BASE_MODEL, MY_MODEL, HF_TOKEN


class SchoolChatbot:
    """
    A chatbot that integrates structured school data and language generation
    to assist with Boston Public School queries.

    On construction it loads ``bps_data.csv`` (one row per school, with a
    ``BPS_School_Name`` column) and ``cleaned_keyword_to_column_map.json``
    (a mapping from a user-facing keyword to a column name of that CSV).
    """

    def __init__(self):
        """Create the inference client and load the school data files."""
        # Prefer the custom model when one is configured.
        model_id = MY_MODEL if MY_MODEL else BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv("bps_data.csv")
        with open("cleaned_keyword_to_column_map.json", encoding="utf-8") as f:
            self.keyword_map = json.load(f)

    def format_prompt(self, user_input: str) -> str:
        """Return the plain chat prompt (no school facts) for *user_input*."""
        return (
            "<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
            f"<|user|>{user_input}<|end|>\n"
            "<|assistant|>"
        )

    def _match_keywords(self, text: str) -> set:
        """Return the keyword-map keys that fuzzily match any word of *text*."""
        # Materialize the candidates once instead of once per token.
        candidates = list(self.keyword_map)
        matched = set()
        for token in re.findall(r'\b\w+\b', text.lower()):
            matched.update(get_close_matches(token, candidates, cutoff=0.85))
        return matched

    def _find_school(self, user_input: str):
        """Return the school name mentioned in *user_input*, or ``None``.

        A name matches when it occurs (case-insensitively) as a substring of
        the input. If several names match, the longest one is returned so
        that the most specific school wins; ties keep the CSV order.
        """
        haystack = user_input.lower()
        hits = [
            name
            for name in self.df["BPS_School_Name"].dropna()
            # Skip blank names: an empty string is a substring of everything.
            if isinstance(name, str) and name.strip() and name.lower() in haystack
        ]
        return max(hits, key=len, default=None)

    def extract_context_with_keywords(self, prompt, school_name=None) -> list:
        """Build fact sentences for the keywords mentioned in *prompt*.

        Args:
            prompt: The user's question; its words are fuzzily matched
                against the keys of the keyword map.
            school_name: Optional name used to select rows whose
                ``BPS_School_Name`` contains it (case-insensitive, literal
                match). When omitted, no filtering is applied.

        Returns:
            A list of sentences of the form ``"The school's <keyword> is
            <value>."`` taken from the first selected row, ordered by
            keyword. Empty when no row is selected or nothing matches.
        """
        matched_keywords = self._match_keywords(prompt)

        df_filtered = self.df
        if school_name:
            # regex=False: school names may contain regex metacharacters
            # such as parentheses or dots and must be matched literally.
            name_mask = self.df["BPS_School_Name"].str.contains(
                school_name, case=False, na=False, regex=False
            )
            df_filtered = self.df[name_mask]
        if df_filtered.empty:
            return []

        # NOTE(review): without a school name this is simply the first row of
        # the CSV, so the facts describe an arbitrary school — confirm that
        # this fallback is intended.
        row = df_filtered.iloc[0]

        context_items = []
        # Sorted so the generated context is identical from run to run
        # (set iteration order is not stable across processes).
        for kw in sorted(matched_keywords):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str(): cells may hold numbers, which have no .lower().
                context_items.append(f"The school's {kw} is {str(val).lower()}.")
        return context_items

    def get_response(self, user_input: str) -> str:
        """Generate the chatbot's reply to *user_input*.

        Looks for a known school name in the input, gathers matching facts
        from the data file, and adds them to the prompt as a context section
        when any exist. Otherwise the plain prompt is used.

        Returns:
            The generated text with surrounding whitespace stripped.
        """
        matched_school = self._find_school(user_input)
        structured_facts = self.extract_context_with_keywords(user_input, matched_school)

        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
                + "\n".join(f"- {fact}" for fact in structured_facts)
            )
            prompt = (
                "<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
                "Use any known facts about the school to answer helpfully.<|end|>\n"
                f"<|user|>{user_input}<|end|>\n"
                f"<|context|>{natural_context}<|end|>\n"
                "<|assistant|>"
            )
        else:
            prompt = self.format_prompt(user_input)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"],
        )
        return response.strip()