import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
import fitz  # PyMuPDF for PDF reading
import re
import os
import spacy  # spaCy (instead of NLTK) for sentence tokenization

# Load spaCy model for sentence tokenization
nlp = spacy.load("en_core_web_sm")

# Streamlit App Configuration
st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
st.title("📊 Financial Report Sentiment Analyzer")

st.markdown("""
### What is FinBERT?
**FinBERT** is a language model fine-tuned for financial text analysis.
It classifies sentiment as **Positive, Neutral, or Negative** for key financial aspects:

1. **Assets** – What the company owns
2. **Liabilities** – What the company owes
3. **Equity** – Net worth (Assets - Liabilities)

---
""")

# File Upload
uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])

# ✅ Custom CSS for Better Report Preview
st.markdown("""
""", unsafe_allow_html=True)

# ✅ Load FinBERT Model (Optimized with Streamlit Caching)
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
    return tokenizer, model

tokenizer, model = load_model()

# finbert-tone label order (per the model card): 0 = Neutral, 1 = Positive, 2 = Negative
label_mapping = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}

# ✅ Extract Text from Uploaded File
def extract_text(file):
    try:
        if file.name.endswith('.pdf'):
            with fitz.open(stream=file.read(), filetype="pdf") as doc:
                return "\n".join([page.get_text() for page in doc])
        else:
            return file.read().decode('utf-8')
    except Exception as e:
        st.error(f"❌ Error reading file: {e}")
        return ""

if uploaded_file:
    report_text = extract_text(uploaded_file)
    st.write("### 📄 Uploaded Report Preview:")
    st.markdown(f"{report_text[:5000]}", unsafe_allow_html=True)
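# The analysis pipeline defined below: extract_sentences() pulls out sentences that mention
# a category's keywords, analyze_sentiment() classifies each sentence with FinBERT, and
# analyze_category() aggregates the per-sentence labels into percentages for display.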
", unsafe_allow_html=True) # ✅ Sentiment Analysis Function def analyze_sentiment(sentence): inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512) with torch.no_grad(): outputs = model(**inputs) probs = torch.nn.functional.softmax(outputs.logits, dim=-1) label_idx = torch.argmax(probs, dim=1).item() return label_mapping[label_idx], probs.tolist()[0] # ✅ Extract Sentences Matching Financial Keywords (using spaCy) def extract_sentences(text, keywords): try: doc = nlp(text) sentences = [sent.text for sent in doc.sents] # Use spaCy for sentence tokenization pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE) return [s for s in sentences if pattern.search(s)] except Exception as e: st.error(f"❌ Error in sentence tokenization: {e}") return [] # ✅ Analyze Sentiment for a Specific Financial Category def analyze_category(text, category_name, keywords): sentences = extract_sentences(text, keywords) if not sentences: st.warning(f"âš ī¸ No relevant sentences found for {category_name}") return None, [] sentiment_scores = defaultdict(int) negative_sentences = [] for sentence in sentences: label, probs = analyze_sentiment(sentence) sentiment_scores[label] += 1 if label == 'Negative': negative_sentences.append((sentence, probs)) total = sum(sentiment_scores.values()) sentiment_percentages = { 'Positive': (sentiment_scores.get('Positive', 0) / total) * 100 if total else 0, 'Negative': (sentiment_scores.get('Negative', 0) / total) * 100 if total else 0, 'Neutral': (sentiment_scores.get('Neutral', 0) / total) * 100 if total else 0 } return sentiment_percentages, negative_sentences # ✅ Financial Categories & Keywords categories = { 'Assets': ['asset', 'current assets', 'fixed assets', 'cash equivalents', 'inventory', 'receivables', 'property', 'investments'], 'Liabilities': ['liability', 'debt', 'accounts payable', 'loans payable', 'taxes payable', 'borrowings', 'creditors', 'obligations'], 'Equity': ['equity', 'shareholders equity', 'stockholders equity', 'common stock', 'retained earnings', 'net worth', 'share capital'] } # ✅ Sentiment Analysis Results st.write("## 📝 Sentiment Analysis Results:") for category, keywords in categories.items(): st.write(f"### 🔍 {category}") result = analyze_category(report_text, category, keywords) if result[0] is None: continue sentiment_percentages, negative_sentences = result # Display Sentiment Metrics cols = st.columns(3) cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%") cols[1].metric(label="âš ī¸ Negative", value=f"{sentiment_percentages['Negative']:.1f}%") cols[2].metric(label="â„šī¸ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%") # Show Negative Sentences (if any) if negative_sentences: with st.expander("đŸ”ģ View Negative Sentences"): for idx, (sentence, probs) in enumerate(negative_sentences, 1): st.write(f"**{idx}.** *{sentence}*") st.caption(f"Probabilities → Positive: {probs[0]:.2f}, Negative: {probs[1]:.2f}, Neutral: {probs[2]:.2f}") else: st.success("✅ No negative sentences detected.")