innovation1007 committed
Commit 4b43eb7 · verified · Parent(s): 7866c59

flask files

Files changed (3)
  1. Dockerfile +13 -0
  2. app.py +149 -0
  3. requirements.txt +44 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD gunicorn app:app -b :8080 --timeout 120 --workers=3 --threads=3 --worker-connections=1000
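For a quick local check of this image, something like the following should work (the image tag is illustrative; the published port 8080 matches the gunicorn bind above):

    docker build -t flask-chatbot .
    docker run -p 8080:8080 flask-chatbot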
app.py ADDED
@@ -0,0 +1,149 @@
+ from flask import Flask, request, jsonify
+ from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, pipeline, AutoTokenizer, AutoModelForTokenClassification
+ import re
+ from flask_cors import CORS
+
+
+ app = Flask(__name__)
+ CORS(app)
+
+
+ # Load chatbot model
+ model_name = "facebook/blenderbot-400M-distill"
+ tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
+ model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
+
+ # Load POS tagging pipeline
+ pos_pipe = pipeline("token-classification", model="TweebankNLP/bertweet-tb2-pos-tagging")
+
+ # Load NER model
+ model_checkpoint = "huggingface-course/bert-finetuned-ner"
+ ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
+ ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ token_classifier = pipeline(
+     "token-classification", model=ner_model, aggregation_strategy="simple", tokenizer=ner_tokenizer,
+ )
+
+ # Function to clean messages
+ def clean_message(text):
+     # Remove emojis and special characters (keep word characters and whitespace)
+     text = re.sub(r'[^\w\s]', '', text)
+
+     # Collapse letters repeated three or more times at the end of a word (e.g. "heyyy" -> "hey")
+     text = re.sub(r'(\w*?)(\w)\2{2,}\b', r'\1\2', text)
+
+     # Perform POS tagging
+     pos_tags = pos_pipe(text)
+
+     # Convert words to title case selectively
+     words = text.split()
+     cleaned_words = []
+
+     for word in words:
+         tag = next((tag_info["entity"] for tag_info in pos_tags if tag_info["word"] == word), None)
+
+         if tag in ["ADJ", "ADP"]:  # Keep adjectives and adpositions lowercase
+             cleaned_words.append(word.lower())
+         else:  # Title case for other words
+             cleaned_words.append(word.title())
+
+     # Remove single-letter words
+     cleaned_words = [word for word in cleaned_words if len(word) > 1]
+
+     return " ".join(cleaned_words)
+
+ # Function to extract named entities from a single message
+ def extract_entities(text, message_index, existing_entities=None, threshold=0.85):
+     entities_dict = {"PER": [], "ORG": [], "LOC": [], "MISC": []}
+     # Initialize from previously noted entities; None default avoids a shared mutable argument
+     seen_words = set(existing_entities) if existing_entities else set()
+
+     results = token_classifier(text)
+
+     for entity in results:
+         word = entity["word"]
+         entity_type = entity["entity_group"]
+         score = entity["score"]
+
+         # Ignore low-confidence entities
+         if score < threshold:
+             continue
+
+         # Ignore subword tokens (split words like "##word")
+         if word.startswith("##"):
+             continue
+
+         # Ignore short words (e.g., single letters)
+         if len(word) == 1:
+             continue
+
+         # Keep multi-word locations intact
+         if entity_type == "LOC":
+             processed_words = [word]
+         else:
+             processed_words = word.split()
+
+         for single_word in processed_words:
+             # Only record words that have not been noted already
+             if single_word not in seen_words:
+                 seen_words.add(single_word)
+                 # Add the new word to the respective entity list
+                 if entity_type in entities_dict:
+                     entities_dict[entity_type].append({
+                         "index": message_index,
+                         "word": single_word,
+                         "substring": (text.find(single_word), text.find(single_word) + len(single_word))
+                     })
+
+     return entities_dict
+
+
+ @app.route("/")
+ def home():
+     return "Hello, World!"
+
+
+ @app.route("/api/home", methods=['POST', 'GET'])
+ def receive_message():
+     data = request.get_json()
+     message_index = data.get("index")
+     message = data.get("message", "")
+
+     print(f"Received message at index {message_index}: {message}")
+
+     # Clean user message
+     cleaned_message = clean_message(message)
+     print("Cleaned Message:", cleaned_message)
+
+     # Extract named entities from user message
+     user_entities = extract_entities(cleaned_message, message_index)
+     print("Extracted Entities from User's Message:", user_entities)
+
+     # Generate chatbot response
+     inputs = tokenizer(cleaned_message, return_tensors="pt")
+     reply_ids = model.generate(**inputs)
+     bot_response = tokenizer.decode(reply_ids[0], skip_special_tokens=True)
+
+     print(f"Chatbot Response: {bot_response}")
+
+     # The bot's response index is the user message index + 1
+     bot_index = message_index + 1
+
+     # Extract named entities from the chatbot response (bot index)
+     bot_entities = extract_entities(bot_response, bot_index)
+     print("Extracted Entities from Chatbot's Response:", bot_entities)
+
+     return jsonify({
+         'response': bot_response,
+         'person_user': user_entities.get("PER", []),
+         'location_user': user_entities.get("LOC", []),
+         'person_bot': bot_entities.get("PER", []),
+         'location_bot': bot_entities.get("LOC", [])
+     })
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", debug=True)
+
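Once the server is up, a sketch of a request against /api/home (host, port, and payload values are illustrative; the handler reads "index" and "message" from the JSON body):

    curl -X POST http://localhost:8080/api/home \
      -H "Content-Type: application/json" \
      -d '{"index": 0, "message": "I am visiting Paris with John"}'

The JSON response carries 'response' plus the PER/LOC entity lists for both sides ('person_user', 'location_user', 'person_bot', 'location_bot').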
requirements.txt ADDED
@@ -0,0 +1,44 @@
+
+ blinker==1.9.0
+ gunicorn==20.1.0
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ colorama==0.4.6
+ filelock==3.18.0
+ Flask==3.1.0
+ flask-cors==5.0.1
+ fsspec==2025.3.0
+ huggingface-hub==0.29.3
+ idna==3.10
+ importlib_metadata==8.6.1
+ itsdangerous==2.2.0
+ Jinja2==3.1.6
+ MarkupSafe==3.0.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ numpy==2.0.2
+ packaging==24.2
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.3
+ safetensors==0.5.3
+ sympy==1.13.1
+ tokenizers==0.21.1
+ torch==2.6.0
+ tqdm==4.67.1
+ transformers==4.49.0
+ typing_extensions==4.12.2
+ urllib3==2.3.0
+ Werkzeug==3.1.3
+ zipp==3.21.0
+ vercel==0.2.1
+
+ #
+ # This file is autogenerated by pip-compile with Python 3.9
+ # by the following command:
+ #
+ # pip-compile --output-file=-
+ pip==23.3.1
+ python-dotenv
+
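Per the autogenerated footer above, the pins can be refreshed with pip-compile; assuming the loose dependencies live in a requirements.in (hypothetical file name, not part of this commit):

    pip-compile --output-file=requirements.txt requirements.in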