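"""News sentiment analysis pipeline: scrapes Economic Times coverage for a company,
classifies per-article sentiment and topics with Hugging Face models, and uses a
Groq-hosted LLM to produce comparative and overall sentiment summaries."""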
from bs4 import BeautifulSoup
import requests
from langchain.schema import HumanMessage
from langchain_groq import ChatGroq
import json
from dotenv import load_dotenv
import os
from transformers import pipeline

# Load the Groq API key from a local .env file.
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

# Groq-hosted Llama 3.1 model used for the summarization and comparison prompts.
llm = ChatGroq(api_key=GROQ_API_KEY, model="llama-3.1-8b-instant")


def extract_titles_and_summaries(company_name, num_articles=10):
    """Scrape up to num_articles titles and summaries for company_name from Economic Times."""
    url = f"https://economictimes.indiatimes.com/topic/{company_name}/news"
    try:
        # timeout guards against the request hanging indefinitely
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            return {"Company": company_name, "Articles": []}

        soup = BeautifulSoup(response.content, "html.parser")
        articles = soup.find_all('div', class_='clr flt topicstry story_list', limit=num_articles)
        extracted_articles = []

        for article in articles:
            title_tag = article.find('h2')
            if title_tag:
                link_tag = title_tag.find('a')
                title = link_tag.get_text(strip=True) if link_tag else "No Title Found"
            else:
                title = "No Title Found"

            summary_tag = article.find('p')
            summary = summary_tag.get_text(strip=True) if summary_tag else "No Summary Found"

            extracted_articles.append({
                "Title": title,
                "Summary": summary
            })

        return {
            "Company": company_name,
            "Articles": extracted_articles
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        # Return the same structure as the success path so downstream calls to .get() still work.
        return {"Company": company_name, "Articles": []}


def perform_sentiment_analysis(news_data):
    """Classify each article's sentiment with a Hugging Face model and tally the counts."""
    articles = news_data.get("Articles", [])
    # device=1 selects GPU index 1; use device=-1 to run on CPU.
    pipe = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis", device=1)
    sentiment_counts = {"Positive": 0, "Negative": 0, "Neutral": 0}

    # Collapse the model's finer-grained labels into three buckets.
    sentiment_map = {
        "positive": "Positive",
        "negative": "Negative",
        "neutral": "Neutral",
        "very positive": "Positive",
        "very negative": "Negative"
    }

    for article in articles:
        content = f"{article['Title']} {article['Summary']}"
        sentiment_result = pipe(content)[0]

        sentiment = sentiment_map.get(sentiment_result["label"].lower(), "Unknown")
        score = float(sentiment_result["score"])

        article["Sentiment"] = sentiment
        article["Score"] = score

        if sentiment in sentiment_counts:
            sentiment_counts[sentiment] += 1

    return news_data, sentiment_counts


def extract_topics_with_hf(news_data):
    structured_data = {
        "Company": news_data.get("Company", "Unknown"),
        "Articles": []
    }
    topic_pipe = pipeline("text-classification", model="valurank/distilroberta-topic-classification", device=1)
    articles = news_data.get("Articles", [])

    for article in articles:
        content = f"{article['Title']} {article['Summary']}"
        topics_result = topic_pipe(content, top_k=3)
        topics = [topic["label"] for topic in topics_result] if topics_result else ["Unknown"]

        structured_data["Articles"].append({
            "Title": article["Title"],
            "Summary": article["Summary"],
            "Sentiment": article.get("Sentiment", "Unknown"),
            "Score": article.get("Score", 0.0),
            "Topics": topics
        })
    return structured_data


def generate_final_sentiment(news_data, sentiment_counts):
    """Ask the LLM for a single-paragraph overall sentiment summary across all articles."""
    company_name = news_data["Company"]
    total_articles = sum(sentiment_counts.values())
    combined_summaries = " ".join([article["Summary"] for article in news_data["Articles"]])

    prompt = f"""
    Based on the analysis of {total_articles} articles about the company "{company_name}":
    - Positive articles: {sentiment_counts['Positive']}
    - Negative articles: {sentiment_counts['Negative']}
    - Neutral articles: {sentiment_counts['Neutral']}
    The following are the summarized key points from the articles: "{combined_summaries}".
    Provide a single, concise summary that integrates the overall sentiment analysis and key news highlights while maintaining a natural flow. Explain its implications for the company's reputation, stock potential, and public perception.
    Respond **ONLY** with a well-structured, concise paragraph in plain text, focusing on overall sentiment.
    """

    response = llm.invoke([HumanMessage(content=prompt)], max_tokens=200)
    # Fall back to a fixed message if the model returns nothing.
    return response.content if response else "Sentiment analysis summary not available."


def extract_json(response):
    """Parse an LLM response as JSON, returning {} if it is not valid JSON."""
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        return {}


def compare_articles(news_data, sentiment_counts):
    """Compare articles on topics and sentiment, then assemble the final structured report."""
    articles = news_data.get("Articles", [])
    all_topics = [set(article["Topics"]) for article in articles]
    common_topics = set.intersection(*all_topics) if all_topics else set()

    topics_prompt = f"""
    Analyze the following article topics and identify **only three** key themes that are common across multiple articles,
    even if they are phrased differently. The topics from each article are:
    {all_topics}

    Respond **ONLY** with a JSON format:
    {{"CommonTopics": ["topic1", "topic2", "topic3"]}}
    """

    # Fall back to the literal set intersection if the LLM response is not valid JSON.
    response = llm.invoke([HumanMessage(content=topics_prompt)]).content
    contextual_common_topics = extract_json(response).get("CommonTopics", list(common_topics))[:3]

    total_articles = sum(sentiment_counts.values())

    comparison_prompt = f"""
    Provide a high-level summary comparing {total_articles} news articles about "{news_data['Company']}":
    - Sentiment distribution: {sentiment_counts}
    - Commonly discussed topics across articles: {contextual_common_topics}

    Consider the following:
    1. Notable contrasts between articles (e.g., major differences in topics and perspectives).
    2. Overall implications for the company's reputation, stock potential, and public perception.
    3. How sentiment varies across articles and its impact.

    Respond **ONLY** with a concise and insightful summary in this JSON format:
    {{
        "Coverage Differences": [
            {{"Comparison": "Brief contrast between Articles 1 & 2", "Impact": "Concise impact statement"}},
            {{"Comparison": "Brief contrast between Articles 3 & 4", "Impact": "Concise impact statement"}}
        ]
    }}
    """

    response = llm.invoke([HumanMessage(content=comparison_prompt)]).content
    coverage_differences = extract_json(response).get("Coverage Differences", [])

    final_sentiment = generate_final_sentiment(news_data, sentiment_counts)

    return {
        "Company": news_data["Company"],
        "Articles": articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": sentiment_counts,
            "Coverage Differences": coverage_differences,
            "Topic Overlap": {
                "Common Topics": contextual_common_topics,
                "Unique Topics": {
                    f"Article {i+1}": list(topics - set(contextual_common_topics))
                    for i, topics in enumerate(all_topics)
                }
            }
        },
        "Final Sentiment Analysis": final_sentiment
    }
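

# Minimal usage sketch (not part of the original script): chains the functions above
# for a single company and prints the final report. "Reliance" is a hypothetical
# example query; swap in any company covered by Economic Times.
if __name__ == "__main__":
    news = extract_titles_and_summaries("Reliance")
    news, counts = perform_sentiment_analysis(news)
    news = extract_topics_with_hf(news)
    report = compare_articles(news, counts)
    print(json.dumps(report, indent=2))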