from huggingface_hub import InferenceClient
from config import BASE_MODEL, MY_MODEL, HF_TOKEN
import pandas as pd
import json
import re
from difflib import get_close_matches

class SchoolChatbot:
    """Boston school chatbot combining tabular school data with model completion.

    Answers are produced in one of three ways (see ``get_response``):

    1. a question naming a specific school is answered by the model, with any
       matching facts from the data table supplied as context;
    2. a question that names no school but mentions a known feature keyword is
       answered directly with a list of schools whose value for that feature
       is positive (or negative, for negated questions);
    3. everything else is sent to the model with no extra context.

    Attributes:
        client: ``InferenceClient`` used for text generation.
        df: ``DataFrame`` loaded from the school data CSV.
        keyword_map: dict mapping a user-facing keyword to a column of ``df``.
        school_name_map: dict mapping a lower-cased alias to the canonical
            ``BPS_School_Name`` value.
    """

    _NAME_COLUMN = "BPS_School_Name"
    # Columns whose values are registered as aliases of the canonical name.
    _ALIAS_COLUMNS = (
        "BPS_School_Name",
        "BPS_Historical_Name",
        "SMMA_Abbreviated_Name",
    )
    # Hand-maintained aliases; these override aliases derived from the data.
    _MANUAL_ALIASES = {
        "acc": "Another Course to College*",
        "baldwin": "Baldwin Early Learning Pilot Academy",
        "adams elementary": "Adams, Samuel Elementary",
        "alighieri montessori": "Alighieri, Dante Montessori School",
        "phineas bates": "Bates, Phineas Elementary",
    }
    # Word-bounded so that e.g. "inaccessible" is not read as "accessible".
    # Non-capturing groups avoid pandas' "match groups" warning.
    _POSITIVE_VALUE_PATTERN = (
        r"\b(?:yes|accessible|adequate|good|excellent|present)\b"
    )
    _NEGATIVE_VALUE_PATTERN = (
        r"\b(?:no|not accessible|inaccessible|inadequate|poor|bad|limited)\b"
    )
    # Query words that flip a feature search to "schools lacking the feature".
    _INVERSE_QUERY_TERMS = frozenset(
        {"not", "inaccessible", "bad", "poor", "lacking"}
    )
    _TOKEN_RE = re.compile(r"\b\w+\b")
    _KEYWORD_CUTOFF = 0.85

    def __init__(
        self,
        data_path="bps_data.csv",
        keyword_map_path="cleaned_keyword_to_column_map.json",
    ):
        """Create the inference client and load the data files.

        Args:
            data_path: CSV file with one row per school.
            keyword_map_path: JSON file mapping keywords to CSV column names.
        """
        model_id = MY_MODEL or BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv(data_path)
        with open(keyword_map_path, encoding="utf-8") as f:
            self.keyword_map = json.load(f)
        self.school_name_map = self._build_school_name_map(self.df)

    def _build_school_name_map(self, df):
        """Return a dict of lower-cased name variants -> canonical school name."""
        name_map = {}
        for _, row in df.iterrows():
            primary = row.get(self._NAME_COLUMN)
            if pd.isna(primary):
                # Without a canonical name there is nothing to map aliases to.
                continue
            for column in self._ALIAS_COLUMNS:
                alias = row.get(column)
                if pd.isna(alias):
                    continue
                key = str(alias).strip().lower()
                if key:
                    name_map[key] = primary
        name_map.update(self._MANUAL_ALIASES)
        return name_map

    def _tokenize(self, text):
        """Return the lower-cased word tokens of ``text`` (empty for None)."""
        return self._TOKEN_RE.findall((text or "").lower())

    def _extract_keywords(self, text):
        """Return the set of ``keyword_map`` keys fuzzily matched by ``text``.

        Each word of ``text`` is compared against the known keywords with
        ``difflib.get_close_matches`` so that small typos still match.
        """
        vocabulary = list(self.keyword_map)
        matched = set()
        for token in set(self._tokenize(text)):
            matched.update(
                get_close_matches(token, vocabulary, cutoff=self._KEYWORD_CUTOFF)
            )
        return matched

    def format_prompt(self, user_input, context=None):
        """Build the chat-template prompt sent to the model.

        Args:
            user_input: The user's question.
            context: Optional text with known facts. When given, the system
                message asks the model to use it and a ``<|context|>`` turn is
                inserted before the assistant turn.

        Returns:
            The prompt string, ending with the open ``<|assistant|>`` tag.
        """
        system = (
            "You are a helpful assistant that specializes in "
            "Boston public school enrollment."
        )
        if context:
            system += " Use any known facts about the school to answer helpfully."
        parts = [
            f"<|system|>{system}<|end|>\n",
            f"<|user|>{user_input}<|end|>\n",
        ]
        if context:
            parts.append(f"<|context|>{context}<|end|>\n")
        parts.append("<|assistant|>")
        return "".join(parts)

    def match_school_name(self, query):
        """Return the canonical name of the school mentioned in ``query``.

        An alias only counts when it appears as a whole word or phrase, so a
        short alias such as "acc" does not fire inside "accessible". When
        several aliases occur, the longest (most specific) one wins.

        Returns:
            The canonical school name, or ``None`` if no alias is mentioned.
        """
        lowered = (query or "").lower()
        best_alias = ""
        best_name = None
        for alias, canonical in self.school_name_map.items():
            if len(alias) <= len(best_alias) or alias not in lowered:
                continue
            # Lookarounds rather than \b: aliases may end in punctuation
            # (e.g. "college*"), where \b would never match.
            if re.search(r"(?<!\w)" + re.escape(alias) + r"(?!\w)", lowered):
                best_alias, best_name = alias, canonical
        return best_name

    def extract_context_with_keywords(self, prompt, school_name=None):
        """Return fact sentences about one school for the keywords in ``prompt``.

        Args:
            prompt: User text scanned for known keywords.
            school_name: School to look up. An exact, case-insensitive match on
                the name column is preferred; otherwise the first row whose
                name contains ``school_name`` literally is used.

        Returns:
            A list of sentences such as ``"The school's <keyword> is <value>."``
            ordered by keyword. Empty when no school is given or found, or when
            no keyword has a value for that school.
        """
        if not school_name or self._NAME_COLUMN not in self.df.columns:
            # Without a school there is no meaningful row to describe.
            return []

        names = self.df[self._NAME_COLUMN]
        has_name = names.notna()
        normalized = names.astype(str).str.strip().str.lower()
        target = str(school_name).strip().lower()

        mask = has_name & (normalized == target)
        if not mask.any():
            # Literal (non-regex) containment: names may hold characters such
            # as "*" that are regex metacharacters.
            mask = has_name & normalized.str.contains(
                target, regex=False, na=False
            )
        candidates = self.df[mask]
        if candidates.empty:
            return []

        row = candidates.iloc[0]
        context_items = []
        # Sorted so the facts come out in a deterministic order.
        for kw in sorted(self._extract_keywords(prompt)):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str() first: numeric cells have no .lower().
                context_items.append(
                    f"The school's {kw} is {str(val).lower()}."
                )
        return context_items

    def query_schools_by_feature(self, query):
        """List schools whose value for a mentioned feature is positive.

        A cell counts as positive when it contains a positive term and no
        negative term (so "Not Accessible" is negative). If the query contains
        a negation word, schools with an explicitly negative value are listed
        instead; cells that are missing or unclassifiable are never listed.

        Returns:
            A bulleted, alphabetically sorted list as a string, or ``None``
            when no keyword matches or no school qualifies.
        """
        matched_keywords = self._extract_keywords(query)
        if not matched_keywords:
            return None

        inverse = not self._INVERSE_QUERY_TERMS.isdisjoint(self._tokenize(query))

        matching_schools = set()
        for keyword in matched_keywords:
            col = self.keyword_map.get(keyword)
            if not col or col not in self.df.columns:
                continue
            values = self.df[col].astype(str).str.lower()
            is_negative = values.str.contains(
                self._NEGATIVE_VALUE_PATTERN, na=False
            )
            if inverse:
                mask = is_negative
            else:
                is_positive = values.str.contains(
                    self._POSITIVE_VALUE_PATTERN, na=False
                )
                mask = is_positive & ~is_negative
            subset = self.df[mask]
            matching_schools.update(
                subset[self._NAME_COLUMN].dropna().unique().tolist()
            )

        if not matching_schools:
            return None
        return (
            "The following schools match your criteria:\n"
            + "\n".join(f"- {s}" for s in sorted(matching_schools))
        )

    def get_response(self, user_input):
        """Answer ``user_input`` from the data table and/or the model.

        A question that names a school is treated as being about that school;
        the school-wide feature filter is only tried when no school is named,
        so "Is Baldwin accessible?" is not answered with a list of every
        accessible school.

        Returns:
            The answer text.
        """
        matched_school = self.match_school_name(user_input)

        if matched_school is None:
            # School-wide filter query
            school_filter = self.query_schools_by_feature(user_input)
            if school_filter:
                return school_filter

        # Per-school context query
        structured_facts = self.extract_context_with_keywords(
            user_input, matched_school
        )
        natural_context = None
        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school}:\n"
                + "\n".join(f"- {fact}" for fact in structured_facts)
            )
        prompt = self.format_prompt(user_input, natural_context)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"],
        )
        return response.strip()