import functools
import html
import os
import re
from collections import defaultdict

import fitz
import spacy
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


@st.cache_resource(show_spinner=False)
def _load_spacy_pipeline():
    """Load the small English spaCy pipeline used for sentence splitting.

    Cached in the same way as ``load_model`` so the pipeline is not reloaded
    from disk every time Streamlit re-executes the script. The spinner is
    disabled so that loading does not emit a UI element ahead of
    ``st.set_page_config``.
    """
    return spacy.load("en_core_web_sm")


nlp = _load_spacy_pipeline()


# --- Page chrome -----------------------------------------------------------
# set_page_config is issued before any other visible Streamlit element.
st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
st.title("📊 Financial Report Sentiment Analyzer")

# Short introduction shown above the uploader.
st.markdown("""
### What is FinBERT?

**FinBERT** is a language model fine-tuned for financial text analysis. It classifies sentiment as **Positive, Neutral, or Negative** for key financial aspects:

1. **Assets** – What the company owns
2. **Liabilities** – What the company owes
3. **Equity** – Net worth (Assets - Liabilities)

---
""")

# Only PDF and plain-text reports are accepted.
uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])

# Styling for the scrollable report preview box (class "report-preview").
st.markdown("""
<style>
.report-preview {
    border: 1px solid #ccc;
    padding: 10px;
    max-height: 300px;
    overflow-y: scroll;
    background-color: #f9f9f9;
    color: #333 !important;
    white-space: pre-wrap;
    line-height: 1.6;
    font-family: Arial, sans-serif;
}
</style>
""", unsafe_allow_html=True)


@st.cache_resource
def load_model():
    """Load the FinBERT-tone tokenizer and sequence-classification model.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``yiyanghkust/finbert-tone`` checkpoint.
    """
    checkpoint = "yiyanghkust/finbert-tone"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )


tokenizer, model = load_model()

# Class index -> label for the yiyanghkust/finbert-tone classification head.
# The model card documents the order as 0 = neutral, 1 = positive,
# 2 = negative; any other order mislabels every prediction.
label_mapping = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}


def extract_text(file):
    """Return the text content of an uploaded PDF or plain-text file.

    Args:
        file: File-like object exposing ``name`` and ``read()`` (for example
            the object returned by ``st.file_uploader``).

    Returns:
        The extracted text. PDF pages are joined with newlines. If anything
        goes wrong the error is shown in the UI and ``""`` is returned.
    """
    try:
        # Case-insensitive so that "REPORT.PDF" is not decoded as plain text.
        if file.name.lower().endswith('.pdf'):
            with fitz.open(stream=file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading BOM,
        # which would otherwise end up inside the first sentence.
        return file.read().decode('utf-8-sig')
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"❌ Error reading file: {e}")
        return ""


if uploaded_file:
    report_text = extract_text(uploaded_file)
    st.write("### 📄 Uploaded Report Preview:")
    # The report is untrusted input and the preview is rendered with
    # unsafe_allow_html=True, so escape it: markup inside the document must
    # be displayed as text, not interpreted by the browser.
    preview_html = html.escape(report_text[:5000])
    st.markdown(f"<div class='report-preview'>{preview_html}</div>", unsafe_allow_html=True)


def analyze_sentiment(sentence):
    """Classify one sentence with the loaded FinBERT model.

    Args:
        sentence: Text to classify; tokenised with truncation at 512 tokens.

    Returns:
        ``(label, probabilities)`` where ``label`` is the ``label_mapping``
        entry for the highest-probability class and ``probabilities`` is the
        softmax output for the sentence as a plain list of floats, in the
        model's class-index order.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    distribution = torch.nn.functional.softmax(logits, dim=-1)
    winner = torch.argmax(distribution, dim=1).item()
    return label_mapping[winner], distribution.tolist()[0]


@functools.lru_cache(maxsize=2)
def _split_sentences(text):
    """Return the spaCy sentence segmentation of *text* as a tuple of strings.

    Memoised because every category is matched against the same report text,
    so the parse only needs to run once per text.
    """
    return tuple(sent.text for sent in nlp(text).sents)


def _keyword_pattern(keywords):
    """Compile a case-insensitive whole-word pattern for *keywords*.

    Whitespace inside a multi-word keyword matches any run of whitespace, so
    a phrase that is wrapped across lines in the source text still matches.
    Returns ``None`` when there is no non-blank keyword to match.
    """
    alternatives = []
    for keyword in keywords:
        words = keyword.split()
        if words:  # skip empty / whitespace-only keywords
            alternatives.append(r'\s+'.join(map(re.escape, words)))
    if not alternatives:
        return None
    return re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b', re.IGNORECASE)


def extract_sentences(text, keywords):
    """Return the sentences of *text* that mention any of *keywords*.

    Args:
        text: Text to split into sentences with the spaCy pipeline.
        keywords: Iterable of words or phrases, matched case-insensitively on
            word boundaries.

    Returns:
        A list of matching sentence strings in document order. The list is
        empty when there are no usable keywords, and also when sentence
        splitting fails (the error is then shown in the UI).
    """
    try:
        pattern = _keyword_pattern(keywords)
        if pattern is None:
            # An empty alternation would match at every word boundary and
            # return practically the whole document.
            return []
        return [sentence for sentence in _split_sentences(text) if pattern.search(sentence)]
    except Exception as e:
        # UI boundary: report the problem instead of crashing the app.
        st.error(f"❌ Error in sentence tokenization: {e}")
        return []


def analyze_category(text, category_name, keywords):
    """Summarise the sentiment of the sentences in *text* that match *keywords*.

    Args:
        text: Full report text.
        category_name: Display name used in the "no sentences" warning.
        keywords: Words or phrases passed on to ``extract_sentences``.

    Returns:
        ``(percentages, negative_sentences)``. ``percentages`` maps
        ``'Positive'``, ``'Negative'`` and ``'Neutral'`` to their share
        (0-100) of the matched sentences. ``negative_sentences`` is a list of
        ``(sentence, probs)`` pairs for the sentences labelled ``'Negative'``.
        When no sentence matches, a warning is shown and ``(None, [])`` is
        returned.
    """
    sentences = extract_sentences(text, keywords)
    if not sentences:
        st.warning(f"⚠️ No relevant sentences found for {category_name}")
        return None, []

    sentiment_counts = defaultdict(int)
    negative_sentences = []
    for sentence in sentences:
        label, probs = analyze_sentiment(sentence)
        sentiment_counts[label] += 1
        if label == 'Negative':
            negative_sentences.append((sentence, probs))

    # Every sentence was counted exactly once and the empty case returned
    # above, so the denominator is never zero.
    total = len(sentences)
    sentiment_percentages = {
        label: sentiment_counts.get(label, 0) / total * 100
        for label in ('Positive', 'Negative', 'Neutral')
    }
    return sentiment_percentages, negative_sentences


# Keywords per balance-sheet category. They are matched case-insensitively as
# whole words/phrases, so a singular form does not match its plural: both
# forms of the headline terms are listed explicitly.
categories = {
    'Assets': [
        'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
        'inventory', 'receivables', 'property', 'investments',
    ],
    'Liabilities': [
        'liability', 'liabilities', 'debt', 'debts', 'accounts payable',
        'loans payable', 'taxes payable', 'borrowings', 'creditors', 'obligations',
    ],
    'Equity': [
        'equity', 'shareholders equity', 'stockholders equity', 'common stock',
        'retained earnings', 'net worth', 'share capital',
    ],
}


# Results are only rendered once a report has been uploaded and produced some
# text; report_text does not exist before that.
if uploaded_file and report_text:
    st.write("## 📊 Sentiment Analysis Results:")

    # Position of each label in the probability vector, derived from
    # label_mapping so the caption cannot disagree with the predicted labels.
    label_index = {name: idx for idx, name in label_mapping.items()}

    for category, keywords in categories.items():
        st.write(f"### 🔍 {category}")
        sentiment_percentages, negative_sentences = analyze_category(report_text, category, keywords)
        if sentiment_percentages is None:
            # analyze_category already showed a warning for this category.
            continue

        # One metric tile per sentiment class.
        cols = st.columns(3)
        cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
        cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
        cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")

        if negative_sentences:
            with st.expander("🔻 View Negative Sentences"):
                for idx, (sentence, probs) in enumerate(negative_sentences, 1):
                    st.write(f"**{idx}.** *{sentence}*")
                    st.caption(
                        f"Probabilities → Positive: {probs[label_index['Positive']]:.2f}, "
                        f"Negative: {probs[label_index['Negative']]:.2f}, "
                        f"Neutral: {probs[label_index['Neutral']]:.2f}"
                    )
        else:
            st.success("✅ No negative sentences detected.")