# News-App/utils.py
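"""Utility helpers for the News-App pipeline: scrape Economic Times articles for a
company, score their sentiment, tag topics, and build an LLM-based comparison report.
"""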
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.schema import HumanMessage
from langchain_groq import ChatGroq
from transformers import pipeline

# Load the Groq API key from a local .env file.
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
llm = ChatGroq(api_key=GROQ_API_KEY, model="llama-3.1-8b-instant")


def extract_titles_and_summaries(company_name, num_articles=10):
    """Scrape up to num_articles titles and summaries for a company from Economic Times."""
    url = f"https://economictimes.indiatimes.com/topic/{company_name}/news"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch the webpage. Status code: {response.status_code}")
            return {"Company": company_name, "Articles": []}
soup = BeautifulSoup(response.content, "html.parser")
articles = soup.find_all('div', class_='clr flt topicstry story_list', limit=num_articles)
extracted_articles = []
for article in articles:
title_tag = article.find('h2')
if title_tag:
link_tag = title_tag.find('a')
title = link_tag.get_text(strip=True) if link_tag else "No Title Found"
else:
title = "No Title Found"
summary_tag = article.find('p')
summary = summary_tag.get_text(strip=True) if summary_tag else "No Summary Found"
extracted_articles.append({
"Title": title,
"Summary": summary
})
return {
"Company": company_name,
"Articles": extracted_articles
}
    except Exception as e:
        print(f"An error occurred: {e}")
        # Return the same shape as the success path so callers can rely on .get("Articles").
        return {"Company": company_name, "Articles": []}


def perform_sentiment_analysis(news_data):
    """Classify each article's sentiment and tally Positive/Negative/Neutral counts."""
    articles = news_data.get("Articles", [])
    # device=-1 runs on CPU; use a CUDA device index (e.g., 0) if a GPU is available.
    pipe = pipeline(
        "text-classification",
        model="tabularisai/multilingual-sentiment-analysis",
        device=-1,
    )
    sentiment_counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    # Map the model's five-way labels down to three buckets.
    sentiment_map = {
        "positive": "Positive",
        "negative": "Negative",
        "neutral": "Neutral",
        "very positive": "Positive",
        "very negative": "Negative",
    }
    for article in articles:
        content = f"{article['Title']} {article['Summary']}"
        sentiment_result = pipe(content)[0]
        sentiment = sentiment_map.get(sentiment_result["label"].lower(), "Unknown")
score = float(sentiment_result["score"])
article["Sentiment"] = sentiment
article["Score"] = score
if sentiment in sentiment_counts:
sentiment_counts[sentiment] += 1
return news_data, sentiment_counts


def extract_topics_with_hf(news_data):
    """Tag each article with its top three topics from a DistilRoBERTa topic classifier."""
    structured_data = {
        "Company": news_data.get("Company", "Unknown"),
        "Articles": []
    }
    # device=-1 runs on CPU; use a CUDA device index (e.g., 0) if a GPU is available.
    topic_pipe = pipeline(
        "text-classification",
        model="valurank/distilroberta-topic-classification",
        device=-1,
    )
articles = news_data.get("Articles", [])
for article in articles:
content = f"{article['Title']} {article['Summary']}"
topics_result = topic_pipe(content, top_k=3)
topics = [topic["label"] for topic in topics_result] if topics_result else ["Unknown"]
structured_data["Articles"].append({
"Title": article["Title"],
"Summary": article["Summary"],
"Sentiment": article.get("Sentiment", "Unknown"),
"Score": article.get("Score", 0.0),
"Topics": topics
})
return structured_data


def generate_final_sentiment(news_data, sentiment_counts):
    """Ask the LLM for a short overall-sentiment summary across all scraped articles."""
    company_name = news_data["Company"]
    total_articles = sum(sentiment_counts.values())
    combined_summaries = " ".join(article["Summary"] for article in news_data["Articles"])
prompt = f"""
Based on the analysis of {total_articles} articles about the company "{company_name}":
- Positive articles: {sentiment_counts['Positive']}
- Negative articles: {sentiment_counts['Negative']}
- Neutral articles: {sentiment_counts['Neutral']}
The following are the summarized key points from the articles: "{combined_summaries}".
Provide a single, concise summary that integrates the overall sentiment analysis and key news highlights while maintaining a natural flow. Explain its implications for the company's reputation, stock potential, and public perception.
    Respond **ONLY** with a well-structured, very concise, short paragraph in plain text, focusing on the overall sentiment.
"""
    response = llm.invoke([HumanMessage(content=prompt)], max_tokens=200)
    # response is an AIMessage; fall back to a plain string if the call returned nothing.
    return response.content if response else "Sentiment analysis summary not available."


def extract_json(response):
    """Parse an LLM response as JSON, salvaging the first {...} block if prose surrounds it."""
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        # Models sometimes wrap the JSON in extra text despite the prompt instructions.
        match = re.search(r"\{.*\}", response, re.DOTALL)
        try:
            return json.loads(match.group(0)) if match else {}
        except json.JSONDecodeError:
            return {}


def compare_articles(news_data, sentiment_counts):
    """Compare topics and sentiment across articles and build the final structured report."""
    articles = news_data.get("Articles", [])
    all_topics = [set(article.get("Topics", [])) for article in articles]
    # Exact-string intersection; used as a fallback if the LLM's JSON can't be parsed.
    common_topics = set.intersection(*all_topics) if all_topics else set()
topics_prompt = f"""
Analyze the following article topics and identify **only three** key themes that are common across multiple articles,
even if they are phrased differently. The topics from each article are:
{all_topics}
Respond **ONLY** with a JSON format:
{{"CommonTopics": ["topic1", "topic2", "topic3"]}}
"""
response = llm.invoke([HumanMessage(content=topics_prompt)]).content
contextual_common_topics = extract_json(response).get("CommonTopics", list(common_topics))[:3] # Limit to 3 topics
total_articles = sum(sentiment_counts.values())
comparison_prompt = f"""
Provide a high-level summary comparing {total_articles} news articles about "{news_data['Company']}":
- Sentiment distribution: {sentiment_counts}
- Commonly discussed topics across articles: {contextual_common_topics}
Consider the following:
1. Notable contrasts between articles (e.g., major differences in topics and perspectives).
2. Overall implications for the company's reputation, stock potential, and public perception.
3. How sentiment varies across articles and its impact.
Respond **ONLY** with a concise and insightful summary in this JSON format:
{{
"Coverage Differences": [
{{"Comparison": "Brief contrast between Articles 1 & 2", "Impact": "Concise impact statement"}},
{{"Comparison": "Brief contrast between Articles 3 & 4", "Impact": "Concise impact statement"}}
]
}}
"""
response = llm.invoke([HumanMessage(content=comparison_prompt)]).content
coverage_differences = extract_json(response).get("Coverage Differences", [])
final_sentiment = generate_final_sentiment(news_data, sentiment_counts)
return {
"Company": news_data["Company"],
"Articles": articles,
"Comparative Sentiment Score": {
"Sentiment Distribution": sentiment_counts,
"Coverage Differences": coverage_differences,
"Topic Overlap": {
"Common Topics": contextual_common_topics,
"Unique Topics": {
f"Article {i+1}": list(topics - set(contextual_common_topics))
for i, topics in enumerate(all_topics)
}
}
},
"Final Sentiment Analysis": final_sentiment
}
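

# Minimal usage sketch (illustrative, not part of the original module): chains the
# helpers end to end for one company. "Tesla" and num_articles=5 are placeholder
# inputs; assumes GROQ_API_KEY is set in .env and that the Economic Times page
# layout still matches the selectors above.
if __name__ == "__main__":
    news = extract_titles_and_summaries("Tesla", num_articles=5)
    if news.get("Articles"):
        news, counts = perform_sentiment_analysis(news)
        news = extract_topics_with_hf(news)
        report = compare_articles(news, counts)
        print(json.dumps(report, indent=2))
    else:
        print("No articles found.")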