Spaces:
Sleeping
Sleeping
Update src/chat.py
Browse files- src/chat.py +41 -66
src/chat.py
CHANGED
@@ -1,103 +1,78 @@
|
|
|
|
1 |
from huggingface_hub import InferenceClient
|
2 |
from config import BASE_MODEL, MY_MODEL, HF_TOKEN
|
3 |
import pandas as pd
|
4 |
import json
|
|
|
|
|
5 |
|
6 |
class SchoolChatbot:
|
7 |
"""
|
8 |
-
|
9 |
-
|
10 |
-
Example usage:
|
11 |
-
chatbot = SchoolChatbot()
|
12 |
-
response = chatbot.get_response("What schools offer Spanish programs?")
|
13 |
"""
|
14 |
|
15 |
def __init__(self):
|
16 |
-
|
17 |
-
Initialize the chatbot with a HF model ID
|
18 |
-
"""
|
19 |
-
model_id = MY_MODEL if MY_MODEL else BASE_MODEL # define MY_MODEL in config.py if you create a new model in the HuggingFace Hub
|
20 |
self.client = InferenceClient(model=model_id, token=HF_TOKEN)
|
21 |
self.df = pd.read_csv("bps_data.csv")
|
22 |
-
with open("
|
23 |
self.keyword_map = json.load(f)
|
24 |
-
|
25 |
-
def format_prompt(self, user_input):
|
26 |
-
"""
|
27 |
-
TODO: Implement this method to format the user's input into a proper prompt.
|
28 |
-
|
29 |
-
This method should:
|
30 |
-
1. Add any necessary system context or instructions
|
31 |
-
2. Format the user's input appropriately
|
32 |
-
3. Add any special tokens or formatting the model expects
|
33 |
|
34 |
-
|
35 |
-
user_input (str): The user's question about Boston schools
|
36 |
-
|
37 |
-
Returns:
|
38 |
-
str: A formatted prompt ready for the model
|
39 |
-
|
40 |
-
Example prompt format:
|
41 |
-
"You are a helpful assistant that specializes in Boston schools...
|
42 |
-
User: {user_input}
|
43 |
-
Assistant:"
|
44 |
-
"""
|
45 |
return (
|
46 |
"<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
|
47 |
f"<|user|>{user_input}<|end|>\n"
|
48 |
"<|assistant|>"
|
49 |
)
|
50 |
|
51 |
-
def
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
|
|
|
|
56 |
if school_name:
|
57 |
df_filtered = self.df[self.df["BPS_School_Name"].str.contains(school_name, case=False, na=False)]
|
58 |
-
|
59 |
if df_filtered.empty:
|
60 |
-
return
|
61 |
|
62 |
row = df_filtered.iloc[0]
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
def get_response(self, user_input):
|
71 |
-
"""
|
72 |
-
TODO: Implement this method to generate responses to user questions.
|
73 |
-
|
74 |
-
This method should:
|
75 |
-
1. Use format_prompt() to prepare the input
|
76 |
-
2. Generate a response using the model
|
77 |
-
3. Clean up and return the response
|
78 |
|
79 |
-
|
80 |
-
user_input (str): The user's question about Boston schools
|
81 |
-
|
82 |
-
Returns:
|
83 |
-
str: The chatbot's response
|
84 |
-
|
85 |
-
Implementation tips:
|
86 |
-
- Use self.format_prompt() to format the user's input
|
87 |
-
- Use self.client to generate responses
|
88 |
-
"""
|
89 |
matched_school = None
|
90 |
for name in self.df["BPS_School_Name"].dropna():
|
91 |
if name.lower() in user_input.lower():
|
92 |
matched_school = name
|
93 |
break
|
94 |
|
95 |
-
|
96 |
|
97 |
-
if
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
prompt = self.format_prompt(user_input)
|
101 |
response = self.client.text_generation(
|
102 |
prompt,
|
103 |
max_new_tokens=512,
|
@@ -105,4 +80,4 @@ class SchoolChatbot:
|
|
105 |
top_p=0.9,
|
106 |
stop_sequences=["<|end|>"]
|
107 |
)
|
108 |
-
return response.strip()
|
|
|
1 |
+
|
2 |
from huggingface_hub import InferenceClient
|
3 |
from config import BASE_MODEL, MY_MODEL, HF_TOKEN
|
4 |
import pandas as pd
|
5 |
import json
|
6 |
+
import re
|
7 |
+
from difflib import get_close_matches
|
8 |
|
9 |
class SchoolChatbot:
|
10 |
"""
|
11 |
+
A chatbot that integrates structured school data and language generation to assist with Boston Public School queries.
|
|
|
|
|
|
|
|
|
12 |
"""
|
13 |
|
14 |
def __init__(self):
    """Set up the inference client, the school dataframe, and the keyword map.

    Prefers the fine-tuned MY_MODEL from config when it is defined,
    otherwise falls back to BASE_MODEL.
    """
    # `or` picks BASE_MODEL whenever MY_MODEL is empty/None.
    self.client = InferenceClient(model=MY_MODEL or BASE_MODEL, token=HF_TOKEN)
    self.df = pd.read_csv("bps_data.csv")
    with open("cleaned_keyword_to_column_map.json") as f:
        self.keyword_map = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
def format_prompt(self, user_input):
    """Wrap *user_input* in the chat-template special tokens the model expects.

    Args:
        user_input (str): The user's question about Boston schools.

    Returns:
        str: A prompt string ending with the assistant turn marker, ready
        for text generation.
    """
    system_msg = "You are a helpful assistant that specializes in Boston public school enrollment."
    return f"<|system|>{system_msg}<|end|>\n<|user|>{user_input}<|end|>\n<|assistant|>"
|
27 |
|
28 |
+
def extract_context_with_keywords(self, prompt, school_name=None):
    """Pull structured facts from the school dataframe that match *prompt*.

    Tokenizes the prompt, fuzzy-matches each token against the
    keyword-to-column map, and renders every matching column value of the
    first matching school row as a short English sentence.

    Args:
        prompt (str): The user's question.
        school_name (str | None): Optional case-insensitive substring
            filter on the "BPS_School_Name" column.

    Returns:
        list[str]: Fact sentences (possibly empty), in sorted-keyword order.
    """
    def _extract_keywords(text):
        # \b\w+\b keeps alphanumeric word tokens; cutoff=0.85 tolerates
        # small misspellings without matching unrelated words.
        tokens = re.findall(r'\b\w+\b', text.lower())
        keys = list(self.keyword_map.keys())  # hoisted out of the token loop
        matched = set()
        for token in tokens:
            matched.update(get_close_matches(token, keys, cutoff=0.85))
        return matched

    matched_keywords = _extract_keywords(prompt)

    df_filtered = self.df
    if school_name:
        df_filtered = self.df[self.df["BPS_School_Name"].str.contains(school_name, case=False, na=False)]
    if df_filtered.empty:
        return []

    row = df_filtered.iloc[0]
    context_items = []
    # sorted(): sets are unordered, so sort for deterministic output.
    for kw in sorted(matched_keywords):
        col = self.keyword_map.get(kw)
        val = row.get(col) if col else None
        if col and pd.notna(val):
            # str(val): columns may hold numbers; calling .lower() directly
            # on a non-string raised AttributeError in the original.
            context_items.append(f"The school's {kw} is {str(val).lower()}.")
    return context_items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
+
def get_response(self, user_input):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
matched_school = None
|
54 |
for name in self.df["BPS_School_Name"].dropna():
|
55 |
if name.lower() in user_input.lower():
|
56 |
matched_school = name
|
57 |
break
|
58 |
|
59 |
+
structured_facts = self.extract_context_with_keywords(user_input, matched_school)
|
60 |
|
61 |
+
if structured_facts:
|
62 |
+
natural_context = (
|
63 |
+
f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
|
64 |
+
+ "\n".join(f"- {fact}" for fact in structured_facts)
|
65 |
+
)
|
66 |
+
prompt = (
|
67 |
+
"<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
|
68 |
+
"Use any known facts about the school to answer helpfully.<|end|>\n"
|
69 |
+
f"<|user|>{user_input}<|end|>\n"
|
70 |
+
f"<|context|>{natural_context}<|end|>\n"
|
71 |
+
"<|assistant|>"
|
72 |
+
)
|
73 |
+
else:
|
74 |
+
prompt = self.format_prompt(user_input)
|
75 |
|
|
|
76 |
response = self.client.text_generation(
|
77 |
prompt,
|
78 |
max_new_tokens=512,
|
|
|
80 |
top_p=0.9,
|
81 |
stop_sequences=["<|end|>"]
|
82 |
)
|
83 |
+
return response.strip()
|