Update app.py
Browse files
app.py
CHANGED
@@ -1,24 +1,14 @@
|
|
1 |
import streamlit as st
|
2 |
import torch
|
3 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
4 |
-
from nltk.tokenize import sent_tokenize
|
5 |
from collections import defaultdict
|
6 |
import fitz # PyMuPDF for PDF reading
|
7 |
import re
|
8 |
import os
|
9 |
-
import
|
10 |
|
11 |
-
#
|
12 |
-
|
13 |
-
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
|
14 |
-
nltk.data.path.append(NLTK_DATA_PATH)
|
15 |
-
|
16 |
-
# Ensure punkt is downloaded
|
17 |
-
try:
|
18 |
-
nltk.data.find('tokenizers/punkt')
|
19 |
-
except LookupError:
|
20 |
-
print("Downloading punkt tokenizer...")
|
21 |
-
nltk.download('punkt', download_dir=NLTK_DATA_PATH)
|
22 |
|
23 |
# Streamlit App Configuration
|
24 |
st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
|
@@ -89,10 +79,11 @@ if uploaded_file:
|
|
89 |
label_idx = torch.argmax(probs, dim=1).item()
|
90 |
return label_mapping[label_idx], probs.tolist()[0]
|
91 |
|
92 |
-
# ✅ Extract Sentences Matching Financial Keywords
|
93 |
def extract_sentences(text, keywords):
|
94 |
try:
|
95 |
-
|
|
|
96 |
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
|
97 |
return [s for s in sentences if pattern.search(s)]
|
98 |
except Exception as e:
|
|
|
1 |
import streamlit as st
|
2 |
import torch
|
3 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
4 |
from collections import defaultdict
|
5 |
import fitz # PyMuPDF for PDF reading
|
6 |
import re
|
7 |
import os
|
8 |
+
import spacy # Replace NLTK with spaCy for sentence tokenization
|
9 |
|
10 |
+
# Load spaCy model for sentence tokenization
|
11 |
+
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
# Streamlit App Configuration
|
14 |
st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
|
|
|
79 |
label_idx = torch.argmax(probs, dim=1).item()
|
80 |
return label_mapping[label_idx], probs.tolist()[0]
|
81 |
|
82 |
+
# ✅ Extract Sentences Matching Financial Keywords (using spaCy)
|
83 |
def extract_sentences(text, keywords):
|
84 |
try:
|
85 |
+
doc = nlp(text)
|
86 |
+
sentences = [sent.text for sent in doc.sents] # Use spaCy for sentence tokenization
|
87 |
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
|
88 |
return [s for s in sentences if pattern.search(s)]
|
89 |
except Exception as e:
|