import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import nltk
import fitz # PyMuPDF
import re
# sent_tokenize needs NLTK's Punkt sentence model. Newer NLTK releases load it
# from the "punkt_tab" package while older ones use "punkt", so request both;
# a package unknown to the installed version is reported, not raised.
# quiet=True keeps the downloader from printing on every Streamlit rerun.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Page chrome: browser-tab title, on-page title, a short explainer and the
# upload widget.
# NOTE(review): the leading "π" in the labels below looks like a mis-decoded
# emoji; the intended glyph cannot be recovered from this file - confirm and
# restore it.
st.set_page_config(page_title="π Financial Report Sentiment Analyzer", layout="wide")
st.title("π Financial Report Sentiment Analyzer")
st.markdown("""
### What is FinBERT?
**FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.
We analyze three key financial aspects:
1. **Assets** – What the company owns
2. **Liabilities** – What the company owes
3. **Equity** – Net worth (Assets - Liabilities)
---
""")
uploaded_file = st.file_uploader("π Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
# NOTE(review): this call renders an empty element; presumably it once carried
# custom HTML/CSS - confirm whether it is still needed.
st.markdown("""
""", unsafe_allow_html=True)
# The analysis further down reads report_text, so end this script run here
# until the user has uploaded a file.
if not uploaded_file:
    st.stop()

# Text extraction
# getvalue() returns the whole upload regardless of the buffer's current read
# position.
file_bytes = uploaded_file.getvalue()
# Compare the extension case-insensitively so "REPORT.PDF" is parsed as a PDF
# instead of being decoded as plain text.
if uploaded_file.name.lower().endswith('.pdf'):
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        report_text = "".join(page.get_text() for page in doc)
else:
    # Text files are not always UTF-8; substitute undecodable bytes rather
    # than crashing the app.
    report_text = file_bytes.decode('utf-8', errors='replace')

st.write("### π Uploaded Report Preview:")
# The preview is untrusted file content, so it is rendered without
# unsafe_allow_html: markup inside the report is displayed, not injected.
st.markdown(f'''
{report_text[:5000]}
''')
# Load FinBERT Model
@st.cache_resource
def load_model():
    """Load the FinBERT tone tokenizer and sequence classifier.

    Decorated with st.cache_resource so the pair is created once and reused
    by later calls instead of being loaded again.

    Returns:
        A (tokenizer, model) tuple for the "yiyanghkust/finbert-tone"
        checkpoint.
    """
    checkpoint = "yiyanghkust/finbert-tone"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )


tokenizer, model = load_model()
# Class index -> label, in the order published for yiyanghkust/finbert-tone:
# 0 = neutral, 1 = positive, 2 = negative. The order matters: index 0 is the
# neutral class, not the positive one.
label_mapping = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
# Sentiment Analysis Function
def analyze_sentiment(sentence):
    """Classify a single sentence with the loaded FinBERT model.

    Args:
        sentence: Text to score; input beyond 512 tokens is truncated.

    Returns:
        A (label, probabilities) tuple: the label_mapping name of the
        highest-probability class, and the softmax probabilities as a list
        of floats in the model's class-index order.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    best_idx = torch.argmax(probs, dim=1).item()
    return label_mapping[best_idx], probs.tolist()[0]
# Enhanced sentence extraction with regex
def extract_sentences(text, keywords):
    """Return the sentences of *text* that mention any of *keywords*.

    Keywords are matched literally, case-insensitively, and only at word
    boundaries.

    Args:
        text: Document text; split into sentences with sent_tokenize.
        keywords: Iterable of words or phrases to look for.

    Returns:
        The matching sentences in document order. Empty when no usable
        keyword is given.
    """
    # Drop empty entries: an empty alternative matches at every word boundary
    # and would select practically every sentence.
    terms = [k for k in keywords if k]
    if not terms:
        return []
    # IGNORECASE already makes matching case-insensitive, so the keywords do
    # not need to be lower-cased first.
    pattern = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, terms)) + r')\b',
        re.IGNORECASE,
    )
    return [s for s in sent_tokenize(text) if pattern.search(s)]
def analyze_category(text, category_name, keywords):
    """Summarise sentiment for the sentences of one financial category.

    Args:
        text: Full report text.
        category_name: Display name, used only in the "nothing found" warning.
        keywords: Words/phrases that select the category's sentences.

    Returns:
        (sentiment_percentages, negative_sentences). sentiment_percentages
        maps 'Positive', 'Negative' and 'Neutral' to their share of the
        matched sentences (0-100). negative_sentences is a list of
        (sentence, probabilities) pairs for sentences labelled 'Negative'.
        When no sentence matches, a Streamlit warning is shown and
        (None, []) is returned.
    """
    sentences = extract_sentences(text, keywords)
    if not sentences:
        st.warning(f"⚠️ No relevant sentences found for {category_name}")
        return None, []

    sentiment_counts = defaultdict(int)
    negative_sentences = []
    for sentence in sentences:
        label, probs = analyze_sentiment(sentence)
        sentiment_counts[label] += 1
        if label == 'Negative':
            negative_sentences.append((sentence, probs))

    # Every sentence contributes exactly one label and the list is non-empty
    # here, so the denominator is never zero.
    total = len(sentences)
    sentiment_percentages = {
        name: sentiment_counts.get(name, 0) / total * 100
        for name in ('Positive', 'Negative', 'Neutral')
    }
    return sentiment_percentages, negative_sentences
# Expanded financial categories
# Maps each category heading to the words/phrases that mark a sentence as
# belonging to it.
categories = {
    'Assets': [
        'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
        'inventory', 'receivables', 'property', 'equipment', 'investments',
        'prepaid expenses', 'securities', 'liquid assets', 'capital assets',
    ],
    'Liabilities': [
        'liability', 'liabilities', 'debt', 'accounts payable', 'accrued expenses',
        'loans payable', 'bonds payable', 'mortgage', 'taxes payable', 'leases',
        'borrowings', 'creditors', 'obligations', 'outstanding debt',
    ],
    'Equity': [
        'equity', 'shareholders equity', 'stockholders equity', 'common stock',
        'preferred stock', 'retained earnings', 'treasury stock', 'paid-in capital',
        # Both apostrophe forms are listed because reports use either the
        # straight (') or the typographic (’) character.
        'net worth', "owner's equity", 'owner’s equity', 'share capital',
        'accumulated deficit',
    ],
}
# Render one results section per category.
# NOTE(review): the "π" in the two headings below looks like a mis-decoded
# emoji; the intended glyph cannot be recovered from this file - confirm and
# restore it.
st.write("## π Sentiment Analysis Results:")
for category, keywords in categories.items():
    st.write(f"### π {category}")
    sentiment_percentages, negative_sentences = analyze_category(report_text, category, keywords)
    # analyze_category has already shown a warning when nothing matched.
    if sentiment_percentages is None:
        continue

    cols = st.columns(3)
    cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
    cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
    cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")

    if negative_sentences:
        with st.expander("🔻 View Negative Sentences"):
            for idx, (sentence, probs) in enumerate(negative_sentences, 1):
                st.write(f"**{idx}.** *{sentence}*")
                # Name each probability through label_mapping so the caption
                # always agrees with the labels used for classification,
                # whatever the model's class-index order is.
                breakdown = ", ".join(
                    f"{label_mapping[i]}: {p:.2f}" for i, p in enumerate(probs)
                )
                st.caption(f"Probabilities → {breakdown}")
    else:
        st.success("✅ No negative sentences detected.")