flask files
- Dockerfile +13 -0
- app.py +149 -0
- requirements.txt +44 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.9

RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD gunicorn app:app -b :8080 --timeout 120 --workers=3 --threads=3 --worker-connections=1000
app.py
ADDED
@@ -0,0 +1,149 @@
from flask import Flask, request, jsonify
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, pipeline, AutoTokenizer, AutoModelForTokenClassification
import re
from flask_cors import CORS


app = Flask(__name__)
CORS(app)


# Load chatbot model
model_name = "facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)

# Load POS tagging pipeline
pos_pipe = pipeline("token-classification", model="TweebankNLP/bertweet-tb2-pos-tagging")

# Load NER model
model_checkpoint = "huggingface-course/bert-finetuned-ner"
ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
token_classifier = pipeline(
    "token-classification", model=ner_model, aggregation_strategy="simple", tokenizer=ner_tokenizer,
)

# Clean a raw chat message before feeding it to the models
def clean_message(text):
    # Remove emojis and special characters (keep word characters and spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse letters repeated three or more times at the end of a word (e.g. "carrr" -> "car")
    text = re.sub(r'(\w*?)(\w)\2{2,}\b', r'\1\2', text)

    # Perform POS tagging
    pos_tags = pos_pipe(text)

    # Convert words to title case selectively
    words = text.split()
    cleaned_words = []

    for word in words:
        tag = next((tag_info["entity"] for tag_info in pos_tags if tag_info["word"] == word), None)

        if tag in ["ADJ", "ADP"]:  # keep adjectives and adpositions lowercase
            cleaned_words.append(word.lower())
        else:  # title case for other words
            cleaned_words.append(word.title())

    # Remove single-letter words (note: this also drops "I" and "A")
    cleaned_words = [word for word in cleaned_words if len(word) > 1]

    return " ".join(cleaned_words)

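# Illustrative sketch of the above, not part of the original commit (assumes
# the POS model tags "red" as ADJ and the remaining words as non-ADJ/ADP):
#   clean_message("i love my red carrr!!")
#   -> punctuation stripped, "carrr" collapsed to "car", selective
#      title-casing applied, single-letter "i" dropped: "Love My red Car"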
# Extract named entities from a single message
def extract_entities(text, message_index, existing_entities=None, threshold=0.85):
    entities_dict = {"PER": [], "ORG": [], "LOC": [], "MISC": []}
    # Copy previously noted entities (None default avoids a shared mutable argument)
    seen_words = set(existing_entities) if existing_entities else set()

    results = token_classifier(text)

    for entity in results:
        word = entity["word"]
        entity_type = entity["entity_group"]
        score = entity["score"]

        # Ignore low-confidence entities
        if score < threshold:
            continue

        # Ignore subword tokens (split pieces like "##word")
        if word.startswith("##"):
            continue

        # Ignore single-letter words
        if len(word) == 1:
            continue

        # Keep multi-word locations intact; split other entity types into words
        if entity_type == "LOC":
            processed_words = [word]
        else:
            processed_words = word.split()

        for single_word in processed_words:
            # Record only words that have not been noted before
            if single_word not in seen_words:
                seen_words.add(single_word)
                if entity_type in entities_dict:
                    entities_dict[entity_type].append({
                        "index": message_index,
                        "word": single_word,
                        "substring": (text.find(single_word), text.find(single_word) + len(single_word))
                    })

    return entities_dict

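# Illustrative output shape, not part of the original commit (assumes the NER
# model tags "Alice" as PER and "Paris" as LOC above the confidence threshold):
#   extract_entities("Alice Visited Paris", 0)
#   -> {"PER": [{"index": 0, "word": "Alice", "substring": (0, 5)}],
#       "ORG": [],
#       "LOC": [{"index": 0, "word": "Paris", "substring": (14, 19)}],
#       "MISC": []}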

@app.route("/")
def home():
    return "Hello, World!"

@app.route("/api/home", methods=['POST', 'GET'])
def receive_message():
    data = request.get_json()
    message_index = data.get("index")
    message = data.get("message", "")

    print(f"Received message at index {message_index}: {message}")

    # Clean the user message
    cleaned_message = clean_message(message)
    print("Cleaned Message:", cleaned_message)

    # Extract named entities from the user message
    user_entities = extract_entities(cleaned_message, message_index)
    print("Extracted Entities from User's Message:", user_entities)

    # Generate the chatbot response
    inputs = tokenizer(cleaned_message, return_tensors="pt")
    reply_ids = model.generate(**inputs)
    bot_response = tokenizer.decode(reply_ids[0], skip_special_tokens=True)

    print(f"Chatbot Response: {bot_response}")

    # The bot's response index is the user message index + 1
    bot_index = message_index + 1

    # Extract named entities from the chatbot response
    bot_entities = extract_entities(bot_response, bot_index)
    print("Extracted Entities from Chatbot's Response:", bot_entities)

    return jsonify({
        'response': bot_response,
        'person_user': user_entities.get("PER", []),
        'location_user': user_entities.get("LOC", []),
        'person_bot': bot_entities.get("PER", []),
        'location_bot': bot_entities.get("LOC", [])
    })


if __name__ == "__main__":
    app.run(host="0.0.0.0", debug=True)
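A minimal client sketch for exercising the /api/home endpoint once the Space is running (the base URL is a placeholder for the deployed Space, and the requests package is assumed to be installed separately):

import requests

# Placeholder URL; substitute the actual Space endpoint.
API_URL = "http://localhost:8080/api/home"

payload = {"index": 0, "message": "hello, my name is Alice!"}
resp = requests.post(API_URL, json=payload)

# Expected keys: response, person_user, location_user, person_bot, location_bot
print(resp.json())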
requirements.txt
ADDED
@@ -0,0 +1,44 @@
blinker==1.9.0
gunicorn==20.1.0
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
filelock==3.18.0
Flask==3.1.0
flask-cors==5.0.1
fsspec==2025.3.0
huggingface-hub==0.29.3
idna==3.10
importlib_metadata==8.6.1
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.2.1
numpy==2.0.2
packaging==24.2
PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
sympy==1.13.1
tokenizers==0.21.1
torch==2.6.0
tqdm==4.67.1
transformers==4.49.0
typing_extensions==4.12.2
urllib3==2.3.0
Werkzeug==3.1.3
zipp==3.21.0
vercel==0.2.1

#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
#   pip-compile --output-file=-
pip==23.3.1
python-dotenv