import os
import time
import json
from collections import Counter

import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

def sentiment_analysis(querystring, headers):
    """Fetch recent news for a stock and score each article with FinBERT."""

    # Load FinBERT (finbert-tone classifies text as Positive/Negative/Neutral)
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    def calculate_sentiment_scores(sentiment_data):
        # Per-label counts; drop the per-article 'details' list
        counts = {k: v for k, v in sentiment_data.items() if k != 'details'}
        total = sum(counts.values())
        confidences = [d["confidence"] for d in sentiment_data.get("details", [])]

        return {
            "overall": max(counts, key=counts.get) if total > 0 else "neutral",
            "positive_percent": counts.get("positive", 0) / total * 100 if total > 0 else 0,
            "negative_percent": counts.get("negative", 0) / total * 100 if total > 0 else 0,
            # Positive-to-negative article ratio; infinite when nothing is negative
            "sentiment_ratio": counts.get("positive", 0) / counts["negative"] if counts.get("negative", 0) else float('inf'),
            "average_confidence": sum(confidences) / len(confidences) if confidences else 0
        }
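
    # For example, counts {"positive": 3, "negative": 1, "neutral": 1} give
    # overall "positive", positive_percent 60.0, negative_percent 20.0, and
    # sentiment_ratio 3.0.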

    # API setup
    url = "https://indian-stock-exchange-api2.p.rapidapi.com/stock"

    # Step 1: Get stock data
    print("Fetching stock data...")
    response = requests.get(url, headers=headers, params=querystring)
    response.raise_for_status()  # surface HTTP errors immediately
    data = response.json()
    news_data = data.get("recentNews", [])  # expected to be a list of dicts
    print(f"Found {len(news_data)} news articles")

    # Step 2: Extract URLs (not every item necessarily carries one)
    urls = [item["url"] for item in news_data if isinstance(item, dict) and "url" in item]
    print(f"Processing {len(news_data)} articles ({len(urls)} with URLs)...")

    # Step 3: Analyze sentiment for each article
    summary = Counter()
    details = []

    for i, news_item in enumerate(news_data):
        news_url = news_item.get("url")
        headline = news_item.get("headline", "")
        intro = news_item.get("intro", "")

        content_for_sentiment = ""
        if news_url:
            try:
                print(f"\n[{i+1}/{len(news_data)}] Analyzing: {news_url[:60]}...")
                html = requests.get(news_url, timeout=10).text
                soup = BeautifulSoup(html, "html.parser")

                # Grab <p> tags and keep only substantial paragraphs
                paragraphs = soup.find_all("p")
                if not paragraphs:
                    raise ValueError("No content found in paragraphs")

                content_for_sentiment = " ".join(p.get_text() for p in paragraphs if len(p.get_text()) > 40)
                content_for_sentiment = content_for_sentiment.strip()
                if len(content_for_sentiment) < 100:
                    print("→ Content too short from web scraping, falling back to headline/intro")
                    content_for_sentiment = headline + ". " + intro

            except Exception as e:
                print(f"❌ Error scraping {news_url}: {str(e)}. Falling back to headline/intro for sentiment analysis.")
                content_for_sentiment = headline + ". " + intro
        else:
            print(f"\n[{i+1}/{len(news_data)}] No URL provided, using headline/intro for sentiment analysis.")
            content_for_sentiment = headline + ". " + intro

        if not content_for_sentiment.strip():
            print("→ No content available for sentiment analysis, skipping.")
            continue

        # Cap length by characters, then let the tokenizer truncate to
        # FinBERT's 512-token input limit
        content_for_sentiment = content_for_sentiment[:1000]
        result = classifier(content_for_sentiment, truncation=True)[0]
        label = result['label'].lower()
        score = round(result['score'], 3)

        summary[label] += 1
        details.append({
            "url": news_url,
            "title": news_item.get("title", "No title"),
            "sentiment": label,
            "confidence": score,
            "content_length": len(content_for_sentiment),
            "image_222x148": news_item.get("image_222x148"),
            "intro": intro,
            "headline": headline
        })

        print(f"→ Sentiment: {label.upper()} (confidence: {score:.1%})")
        time.sleep(1.2)  # pause between requests to be polite to news sites

    # Step 4: Generate comprehensive output
    sentiment_scores = calculate_sentiment_scores({
        "positive": summary["positive"],
        "negative": summary["negative"],
        "neutral": summary["neutral"],
        "details": details
    })

    output = {
        "metadata": {
            "total_articles": len(urls),
            "processed_articles": len(details),
            "processing_time": time.strftime("%Y-%m-%d %H:%M:%S")
        },
        "sentiment_metrics": {
            "overall_score": sentiment_scores["overall"], # Removed round() for string label
            "positive_score": round(sentiment_scores["positive_percent"], 2),
            "negative_score": round(sentiment_scores["negative_percent"], 2),
            "sentiment_ratio": round(sentiment_scores["sentiment_ratio"], 2),
            "average_confidence": round(sentiment_scores["average_confidence"], 2)
        },
        "article_details": details
    }

    # Print formatted results
    print("\n=== SENTIMENT ANALYSIS RESULTS ===")
    print(f"Overall Sentiment Score: {output['sentiment_metrics']['overall_score']}") # Updated print statement
    print(f"Positive/Negative Ratio: {output['sentiment_metrics']['sentiment_ratio']:.2f}")
    print(f"Average Confidence: {output['sentiment_metrics']['average_confidence']:.1f}%")

    import json
    with open("sentiment_results.json", "w") as f:
        json.dump(output, f, indent=2)
    print("Results saved to sentiment_results.json")
    return output

def mainOne(querystring):
    """
    Entry point: runs the sentiment analysis for the given query.

    Args:
        querystring: Dictionary containing the stock name, e.g. {'name': 'HDFC BANK'}
    Returns:
        Dictionary containing the sentiment analysis results, or an
        {'error': ...} dictionary if something went wrong
    """
    try:
        headers = {
            "x-rapidapi-host": "indian-stock-exchange-api2.p.rapidapi.com",
            # Read the API key from the environment rather than hardcoding a secret
            "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY", "")
        }
        
        # Run the sentiment analysis
        results = sentiment_analysis(querystring, headers)
        return results
        
    except Exception as e:
        print(f"Error in main function: {str(e)}")
        return {"error": str(e)}