RAHULJUNEJA33 committed on
Commit
23f3ac6
·
verified ·
1 Parent(s): baefe99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -15
app.py CHANGED
@@ -1,24 +1,14 @@
1
  import streamlit as st
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
- from nltk.tokenize import sent_tokenize
5
  from collections import defaultdict
6
  import fitz # PyMuPDF for PDF reading
7
  import re
8
  import os
9
- import nltk
10
 
11
- # ✅ Fix NLTK Issue: Set Custom Download Path
12
- NLTK_DATA_PATH = "/root/nltk_data"
13
- os.makedirs(NLTK_DATA_PATH, exist_ok=True)
14
- nltk.data.path.append(NLTK_DATA_PATH)
15
-
16
- # Ensure punkt is downloaded
17
- try:
18
- nltk.data.find('tokenizers/punkt')
19
- except LookupError:
20
- print("Downloading punkt tokenizer...")
21
- nltk.download('punkt', download_dir=NLTK_DATA_PATH)
22
 
23
  # Streamlit App Configuration
24
  st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
@@ -89,10 +79,11 @@ if uploaded_file:
89
  label_idx = torch.argmax(probs, dim=1).item()
90
  return label_mapping[label_idx], probs.tolist()[0]
91
 
92
- # ✅ Extract Sentences Matching Financial Keywords
93
  def extract_sentences(text, keywords):
94
  try:
95
- sentences = sent_tokenize(text)
 
96
  pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
97
  return [s for s in sentences if pattern.search(s)]
98
  except Exception as e:
 
1
  import streamlit as st
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
4
  from collections import defaultdict
5
  import fitz # PyMuPDF for PDF reading
6
  import re
7
  import os
8
+ import spacy # Replace NLTK with spaCy for sentence tokenization
9
 
10
+ # Load spaCy model for sentence tokenization
11
+ nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
 
 
12
 
13
  # Streamlit App Configuration
14
  st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
 
79
  label_idx = torch.argmax(probs, dim=1).item()
80
  return label_mapping[label_idx], probs.tolist()[0]
81
 
82
+ # ✅ Extract Sentences Matching Financial Keywords (using spaCy)
83
  def extract_sentences(text, keywords):
84
  try:
85
+ doc = nlp(text)
86
+ sentences = [sent.text for sent in doc.sents] # Use spaCy for sentence tokenization
87
  pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
88
  return [s for s in sentences if pattern.search(s)]
89
  except Exception as e: