flask files
- Dockerfile +13 -0
- app.py +149 -0
- requirements.txt +44 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.9

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD gunicorn app:app -b :8080 --timeout 120 --workers=3 --threads=3 --worker-connections=1000
app.py
ADDED
@@ -0,0 +1,149 @@
from flask import Flask, request, jsonify
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, pipeline, AutoTokenizer, AutoModelForTokenClassification
import re
from flask_cors import CORS


app = Flask(__name__)
CORS(app)


# Load chatbot model
model_name = "facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)

# Load POS tagging pipeline
pos_pipe = pipeline("token-classification", model="TweebankNLP/bertweet-tb2-pos-tagging")

# Load NER model
model_checkpoint = "huggingface-course/bert-finetuned-ner"
ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
token_classifier = pipeline(
    "token-classification", model=ner_model, aggregation_strategy="simple", tokenizer=ner_tokenizer,
)

# Clean a raw chat message before feeding it to the models
def clean_message(text):
    # Remove emojis and special characters (keep word characters and spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse letters repeated three or more times at the end of a word (e.g. "carrr" -> "car")
    text = re.sub(r'(\w*?)(\w)\2{2,}\b', r'\1\2', text)

    # Perform POS tagging
    pos_tags = pos_pipe(text)

    # Convert words to title case selectively
    words = text.split()
    cleaned_words = []

    for word in words:
        tag = next((tag_info["entity"] for tag_info in pos_tags if tag_info["word"] == word), None)

        if tag in ["ADJ", "ADP"]:  # keep adjectives and adpositions lowercase
            cleaned_words.append(word.lower())
        else:  # title case for other words
            cleaned_words.append(word.title())

    # Remove single-letter words (note: this also drops "I" and "A")
    cleaned_words = [word for word in cleaned_words if len(word) > 1]

    return " ".join(cleaned_words)

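# Illustrative sketch of the above, not part of the original commit (assumes
# the POS model tags "red" as ADJ and the remaining words as non-ADJ/ADP):
#   clean_message("i love my red carrr!!")
#   -> punctuation stripped, "carrr" collapsed to "car", selective
#      title-casing applied, single-letter "i" dropped: "Love My red Car"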
# Extract named entities from a single message
def extract_entities(text, message_index, existing_entities=None, threshold=0.85):
    entities_dict = {"PER": [], "ORG": [], "LOC": [], "MISC": []}
    # Copy previously noted entities (None default avoids a shared mutable argument)
    seen_words = set(existing_entities) if existing_entities else set()

    results = token_classifier(text)

    for entity in results:
        word = entity["word"]
        entity_type = entity["entity_group"]
        score = entity["score"]

        # Ignore low-confidence entities
        if score < threshold:
            continue

        # Ignore subword tokens (split pieces like "##word")
        if word.startswith("##"):
            continue

        # Ignore single-letter words
        if len(word) == 1:
            continue

        # Keep multi-word locations intact; split other entity types into words
        if entity_type == "LOC":
            processed_words = [word]
        else:
            processed_words = word.split()

        for single_word in processed_words:
            # Record only words that have not been noted before
            if single_word not in seen_words:
                seen_words.add(single_word)
                if entity_type in entities_dict:
                    entities_dict[entity_type].append({
                        "index": message_index,
                        "word": single_word,
                        "substring": (text.find(single_word), text.find(single_word) + len(single_word))
                    })

    return entities_dict

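# Illustrative output shape, not part of the original commit (assumes the NER
# model tags "Alice" as PER and "Paris" as LOC above the confidence threshold):
#   extract_entities("Alice Visited Paris", 0)
#   -> {"PER": [{"index": 0, "word": "Alice", "substring": (0, 5)}],
#       "ORG": [],
#       "LOC": [{"index": 0, "word": "Paris", "substring": (14, 19)}],
#       "MISC": []}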

@app.route("/")
def home():
    return "Hello, World!"

@app.route("/api/home", methods=['POST', 'GET'])
def receive_message():
    data = request.get_json()
    message_index = data.get("index")
    message = data.get("message", "")

    print(f"Received message at index {message_index}: {message}")

    # Clean the user message
    cleaned_message = clean_message(message)
    print("Cleaned Message:", cleaned_message)

    # Extract named entities from the user message
    user_entities = extract_entities(cleaned_message, message_index)
    print("Extracted Entities from User's Message:", user_entities)

    # Generate the chatbot response
    inputs = tokenizer(cleaned_message, return_tensors="pt")
    reply_ids = model.generate(**inputs)
    bot_response = tokenizer.decode(reply_ids[0], skip_special_tokens=True)

    print(f"Chatbot Response: {bot_response}")

    # The bot's response index is the user message index + 1
    bot_index = message_index + 1

    # Extract named entities from the chatbot response
    bot_entities = extract_entities(bot_response, bot_index)
    print("Extracted Entities from Chatbot's Response:", bot_entities)

    return jsonify({
        'response': bot_response,
        'person_user': user_entities.get("PER", []),
        'location_user': user_entities.get("LOC", []),
        'person_bot': bot_entities.get("PER", []),
        'location_bot': bot_entities.get("LOC", [])
    })


if __name__ == "__main__":
    app.run(host="0.0.0.0", debug=True)
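A minimal client sketch for exercising the /api/home endpoint once the Space is running (the base URL is a placeholder for the deployed Space, and the requests package is assumed to be installed separately):

import requests

# Placeholder URL; substitute the actual Space endpoint.
API_URL = "http://localhost:8080/api/home"

payload = {"index": 0, "message": "hello, my name is Alice!"}
resp = requests.post(API_URL, json=payload)

# Expected keys: response, person_user, location_user, person_bot, location_bot
print(resp.json())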
requirements.txt
ADDED
@@ -0,0 +1,44 @@
blinker==1.9.0
gunicorn==20.1.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
filelock==3.18.0
Flask==3.1.0
flask-cors==5.0.1
fsspec==2025.3.0
huggingface-hub==0.29.3
idna==3.10
importlib_metadata==8.6.1
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.2.1
numpy==2.0.2
packaging==24.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
sympy==1.13.1
tokenizers==0.21.1
torch==2.6.0
tqdm==4.67.1
transformers==4.49.0
typing_extensions==4.12.2
urllib3==2.3.0
Werkzeug==3.1.3
zipp==3.21.0
vercel==0.2.1

#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
#   pip-compile --output-file=-
pip==23.3.1
python-dotenv