import os
import time
import json
from collections import Counter

import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

def sentiment_analysis(querystring, headers):
    """Fetch recent news for a stock and score each article with FinBERT."""

    # Load FinBERT (finbert-tone classifies text as Positive/Negative/Neutral)
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    def calculate_sentiment_scores(sentiment_data):
        # Per-label counts; drop the per-article 'details' list
        counts = {k: v for k, v in sentiment_data.items() if k != 'details'}
        total = sum(counts.values())
        confidences = [d["confidence"] for d in sentiment_data.get("details", [])]

        return {
            "overall": max(counts, key=counts.get) if total > 0 else "neutral",
            "positive_percent": counts.get("positive", 0) / total * 100 if total > 0 else 0,
            "negative_percent": counts.get("negative", 0) / total * 100 if total > 0 else 0,
            # Positive-to-negative article ratio; infinite when nothing is negative
            "sentiment_ratio": counts.get("positive", 0) / counts["negative"] if counts.get("negative", 0) else float('inf'),
            "average_confidence": sum(confidences) / len(confidences) if confidences else 0
        }
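
    # For example, counts {"positive": 3, "negative": 1, "neutral": 1} give
    # overall "positive", positive_percent 60.0, negative_percent 20.0, and
    # sentiment_ratio 3.0.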

    # API setup
    url = "https://indian-stock-exchange-api2.p.rapidapi.com/stock"

    # Step 1: Get stock data
    print("Fetching stock data...")
    response = requests.get(url, headers=headers, params=querystring)
    response.raise_for_status()  # surface HTTP errors immediately
    data = response.json()
    news_data = data.get("recentNews", [])  # expected to be a list of dicts
    print(f"Found {len(news_data)} news articles")

    # Step 2: Extract URLs (not every item necessarily carries one)
    urls = [item["url"] for item in news_data if isinstance(item, dict) and "url" in item]
    print(f"Processing {len(news_data)} articles ({len(urls)} with URLs)...")

    # Step 3: Analyze sentiment for each article
    summary = Counter()
    details = []

    for i, news_item in enumerate(news_data):
        news_url = news_item.get("url")
        headline = news_item.get("headline", "")
        intro = news_item.get("intro", "")

        content_for_sentiment = ""
        if news_url:
            try:
                print(f"\n[{i+1}/{len(news_data)}] Analyzing: {news_url[:60]}...")
                html = requests.get(news_url, timeout=10).text
                soup = BeautifulSoup(html, "html.parser")

                # Grab <p> tags and keep only substantial paragraphs
                paragraphs = soup.find_all("p")
                if not paragraphs:
                    raise ValueError("No content found in paragraphs")

                content_for_sentiment = " ".join(p.get_text() for p in paragraphs if len(p.get_text()) > 40)
                content_for_sentiment = content_for_sentiment.strip()
                if len(content_for_sentiment) < 100:
                    print("→ Content too short from web scraping, falling back to headline/intro")
                    content_for_sentiment = headline + ". " + intro

            except Exception as e:
                print(f"❌ Error scraping {news_url}: {str(e)}. Falling back to headline/intro for sentiment analysis.")
                content_for_sentiment = headline + ". " + intro
        else:
            print(f"\n[{i+1}/{len(news_data)}] No URL provided, using headline/intro for sentiment analysis.")
            content_for_sentiment = headline + ". " + intro

        if not content_for_sentiment.strip():
            print("→ No content available for sentiment analysis, skipping.")
            continue

        # Cap length by characters, then let the tokenizer truncate to
        # FinBERT's 512-token input limit
        content_for_sentiment = content_for_sentiment[:1000]
        result = classifier(content_for_sentiment, truncation=True)[0]
        label = result['label'].lower()
        score = round(result['score'], 3)

        summary[label] += 1
        details.append({
            "url": news_url,
            "title": news_item.get("title", "No title"),
            "sentiment": label,
            "confidence": score,
            "content_length": len(content_for_sentiment),
            "image_222x148": news_item.get("image_222x148"),
            "intro": intro,
            "headline": headline
        })

        print(f"→ Sentiment: {label.upper()} (confidence: {score:.1%})")
        time.sleep(1.2)  # pause between requests to be polite to news sites

    # Step 4: Generate comprehensive output
    sentiment_scores = calculate_sentiment_scores({
        "positive": summary["positive"],
        "negative": summary["negative"],
        "neutral": summary["neutral"],
        "details": details
    })

    output = {
        "metadata": {
            "total_articles": len(urls),
            "processed_articles": len(details),
            "processing_time": time.strftime("%Y-%m-%d %H:%M:%S")
        },
        "sentiment_metrics": {
            "overall_score": sentiment_scores["overall"], # Removed round() for string label
            "positive_score": round(sentiment_scores["positive_percent"], 2),
            "negative_score": round(sentiment_scores["negative_percent"], 2),
            "sentiment_ratio": round(sentiment_scores["sentiment_ratio"], 2),
            "average_confidence": round(sentiment_scores["average_confidence"], 2)
        },
        "article_details": details
    }

    # Print formatted results
    print("\n=== SENTIMENT ANALYSIS RESULTS ===")
    print(f"Overall Sentiment Score: {output['sentiment_metrics']['overall_score']}") # Updated print statement
    print(f"Positive/Negative Ratio: {output['sentiment_metrics']['sentiment_ratio']:.2f}")
    print(f"Average Confidence: {output['sentiment_metrics']['average_confidence']:.1f}%")

    import json
    with open("sentiment_results.json", "w") as f:
        json.dump(output, f, indent=2)
    print("Results saved to sentiment_results.json")
    return output

def mainOne(querystring):
    """
    Entry point: runs the sentiment analysis for the given query.

    Args:
        querystring: Dictionary containing the stock name, e.g. {'name': 'HDFC BANK'}
    Returns:
        Dictionary containing the sentiment analysis results, or an
        {'error': ...} dictionary if something went wrong
    """
    try:
        headers = {
            "x-rapidapi-host": "indian-stock-exchange-api2.p.rapidapi.com",
            # Read the API key from the environment rather than hardcoding a secret
            "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY", "")
        }
        
        # Run the sentiment analysis
        results = sentiment_analysis(querystring, headers)
        return results
        
    except Exception as e:
        print(f"Error in main function: {str(e)}")
        return {"error": str(e)}