"""Streamlit app that scores the sentiment of a financial report with FinBERT.

The user uploads a ``.pdf`` or ``.txt`` report.  Sentences that mention
asset-, liability- or equity-related keywords are classified one by one with
the ``yiyanghkust/finbert-tone`` model, and the share of Positive / Negative /
Neutral sentences is displayed per category.
"""

import html
import re
from collections import defaultdict

import fitz  # PyMuPDF
import nltk
import streamlit as st
import torch
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "yiyanghkust/finbert-tone"
PREVIEW_CHARS = 5000
MAX_TOKENS = 512
SENTIMENT_LABELS = ("Positive", "Negative", "Neutral")

# Class order documented on the finbert-tone model card.  Only used when the
# checkpoint's own config does not carry readable label names.
DEFAULT_LABEL_MAPPING = {0: "Neutral", 1: "Positive", 2: "Negative"}

# Keywords that decide which sentences belong to which financial category.
categories = {
    'Assets': [
        'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
        'inventory', 'receivables', 'property', 'equipment', 'investments',
        'prepaid expenses', 'securities', 'liquid assets', 'capital assets',
    ],
    'Liabilities': [
        'liability', 'liabilities', 'debt', 'accounts payable',
        'accrued expenses', 'loans payable', 'bonds payable', 'mortgage',
        'taxes payable', 'leases', 'borrowings', 'creditors', 'obligations',
        'outstanding debt',
    ],
    'Equity': [
        'equity', 'shareholders equity', 'stockholders equity', 'common stock',
        'preferred stock', 'retained earnings', 'treasury stock',
        'paid-in capital', 'net worth',
        # Both apostrophe forms: extracted PDF text may use either one.
        "owner's equity", 'owner’s equity',
        'share capital', 'accumulated deficit',
    ],
}

# Must be the first Streamlit command executed by the script.
st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")


@st.cache_resource
def ensure_nltk_data():
    """Fetch the NLTK sentence-tokenizer data once per server process.

    ``punkt`` serves older NLTK releases and ``punkt_tab`` newer ones;
    ``nltk.download`` reports a failed download instead of raising, so asking
    for both is harmless.
    """
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)


@st.cache_resource
def load_model():
    """Load and cache the FinBERT tokenizer and classification model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    model.eval()
    return tokenizer, model


def get_label_mapping():
    """Return ``{class index: 'Positive' | 'Negative' | 'Neutral'}`` for the model.

    The mapping is read from the checkpoint's config so it always follows the
    model's real output order; a hard-coded order silently mislabels every
    sentence if it does not match the checkpoint.
    """
    _, model = load_model()
    raw = getattr(model.config, "id2label", None) or {}
    mapping = {int(idx): str(name).capitalize() for idx, name in raw.items()}
    if sorted(mapping.values()) == sorted(SENTIMENT_LABELS):
        return mapping
    return dict(DEFAULT_LABEL_MAPPING)


def probabilities_by_label(probs):
    """Turn a probability list in model output order into a ``{label: prob}`` dict."""
    mapping = get_label_mapping()
    return {mapping[idx]: prob for idx, prob in enumerate(probs)}


def analyze_sentiment(sentence):
    """Classify one sentence with FinBERT.

    Args:
        sentence: Text to classify; input beyond ``MAX_TOKENS`` tokens is
            truncated.

    Returns:
        ``(label, probs)`` where ``label`` is the winning sentiment name and
        ``probs`` is the list of class probabilities in model output order.
    """
    tokenizer, model = load_model()
    inputs = tokenizer(
        sentence, return_tensors="pt", truncation=True, max_length=MAX_TOKENS
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    label_idx = torch.argmax(probs, dim=1).item()
    return get_label_mapping()[label_idx], probs.tolist()[0]


def extract_sentences(text, keywords):
    """Return the sentences of ``text`` that contain any keyword as a whole word.

    Matching is case-insensitive.  An empty keyword list yields no sentences
    (the alternation would otherwise be empty and match everything).
    """
    if not keywords:
        return []
    keywords_lower = [k.lower() for k in keywords]
    pattern = re.compile(
        r'\b(' + '|'.join(map(re.escape, keywords_lower)) + r')\b',
        re.IGNORECASE,
    )
    return [s for s in sent_tokenize(text) if pattern.search(s)]


def analyze_category(text, category_name, keywords):
    """Score the sentiment of every sentence in ``text`` relevant to a category.

    Args:
        text: Full report text.
        category_name: Name used in the "nothing found" warning.
        keywords: Keywords that select the relevant sentences.

    Returns:
        ``(percentages, negative_sentences)``.  ``percentages`` maps each
        sentiment label to its share (0-100) of the matched sentences, and
        ``negative_sentences`` is a list of ``(sentence, probs)`` pairs.
        When no sentence matches, a warning is shown and ``(None, [])`` is
        returned.
    """
    sentences = extract_sentences(text, keywords)
    if not sentences:
        st.warning(f"⚠️ No relevant sentences found for {category_name}")
        return None, []

    sentiment_scores = defaultdict(int)
    negative_sentences = []
    for sentence in sentences:
        label, probs = analyze_sentiment(sentence)
        sentiment_scores[label] += 1
        if label == 'Negative':
            negative_sentences.append((sentence, probs))

    # ``sentences`` is non-empty here, so ``total`` is always positive.
    total = sum(sentiment_scores.values())
    sentiment_percentages = {
        label: sentiment_scores.get(label, 0) / total * 100
        for label in SENTIMENT_LABELS
    }
    return sentiment_percentages, negative_sentences


def read_report_text(uploaded):
    """Extract plain text from an uploaded ``.pdf`` or ``.txt`` file."""
    data = uploaded.read()
    # Compare case-insensitively so "REPORT.PDF" is not decoded as text.
    if uploaded.name.lower().endswith('.pdf'):
        with fitz.open(stream=data, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
    # Undecodable bytes become U+FFFD instead of crashing the app.
    return data.decode('utf-8', errors='replace')


ensure_nltk_data()

st.title("📊 Financial Report Sentiment Analyzer")
st.markdown("""
### What is FinBERT?
**FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.

We analyze three key financial aspects:
1. **Assets** – What the company owns
2. **Liabilities** – What the company owes
3. **Equity** – Net worth (Assets - Liabilities)

---
""")

uploaded_file = st.file_uploader(
    "📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"]
)
st.markdown("""
""", unsafe_allow_html=True)

if uploaded_file is not None:
    report_text = read_report_text(uploaded_file)
    if not report_text.strip():
        st.warning("⚠️ No text could be extracted from the uploaded file.")
        st.stop()

    st.write("### 📄 Uploaded Report Preview:")
    # The report is untrusted input: escape it so markup inside the document
    # is displayed literally rather than injected into the page as HTML.
    st.markdown(f'''
{html.escape(report_text[:PREVIEW_CHARS])}
''', unsafe_allow_html=True)

    st.write("## 📝 Sentiment Analysis Results:")
    for category, keywords in categories.items():
        st.write(f"### 🔍 {category}")
        sentiment_percentages, negative_sentences = analyze_category(
            report_text, category, keywords
        )
        if sentiment_percentages is None:
            continue

        cols = st.columns(3)
        cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
        cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
        cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")

        if negative_sentences:
            with st.expander("🔻 View Negative Sentences"):
                for idx, (sentence, probs) in enumerate(negative_sentences, 1):
                    # Look probabilities up by label; their position in
                    # ``probs`` follows the model's class order.
                    by_label = probabilities_by_label(probs)
                    st.write(f"**{idx}.** *{sentence}*")
                    st.caption(
                        f"Probabilities → Positive: {by_label['Positive']:.2f}, "
                        f"Negative: {by_label['Negative']:.2f}, "
                        f"Neutral: {by_label['Neutral']:.2f}"
                    )
        else:
            st.success("✅ No negative sentences detected.")