import logging
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, NavigableString

from ..utils.logging_config import setup_logging


class ArticleScraper:
    """Scrapes fact-checking articles and converts them to Markdown-style text."""

    def __init__(self):
        self.session = requests.Session()
        # Use a browser-like User-Agent, since some sites block the default
        # python-requests one; update() preserves requests' other default headers.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract the domain (netloc) from a URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content, returning None on any request error."""
        try:
            # A timeout keeps a stalled server from hanging the scraper indefinitely.
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _process_element(self, element) -> str:
        """Recursively process an HTML element, preserving its structure and formatting."""
        if isinstance(element, NavigableString):
            return str(element)

        # Handle different types of elements
        tag_name = element.name
        if tag_name in ['p', 'div']:
            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
        elif tag_name in ['ul', 'ol']:
            items = []
            for li in element.find_all('li', recursive=False):
                # Bullet for unordered lists; running counter for ordered lists.
                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
            return '\n' + '\n'.join(items) + '\n'
        elif tag_name == 'br':
            return '\n'
        elif tag_name in ['strong', 'b']:
            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
        elif tag_name in ['em', 'i']:
            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
        elif tag_name == 'a':
            text = ''.join(self._process_element(child) for child in element.children)
            href = element.get('href', '')
            return f'[{text}]({href})'
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag_name[1])
            prefix = '#' * (level + 1)  # Demote headings one level (h1 -> ##) to match test expectations
            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'

        # For any other tag, fall through and process its children.
        return ''.join(self._process_element(child) for child in element.children)
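
    # Illustrative only (a small sketch, not part of the class API): for the
    # fragment `<p>See <a href="https://example.test">this <b>claim</b></a></p>`,
    # _process_element yields "\n\nSee [this **claim**](https://example.test)".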

    def _extract_content(self, container) -> str:
        """Extract and format content from a container element."""
        if not container:
            return ''

        # Remove non-content elements before processing.
        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
            unwanted.decompose()

        # Process the container
        content = self._process_element(container)

        # Strip per-line whitespace, then drop blank lines.
        content = '\n'.join(line.strip() for line in content.split('\n'))
        content = '\n'.join(filter(None, content.split('\n')))
        return content.strip()

    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
        """Extract content from any article, with special handling for known domains."""
        try:
            # Find the headline: domain-specific selectors first, then any <h1>.
            headline = None
            headline_selectors = {
                'politifact.com': ['h1.article__title'],
                'snopes.com': ['header h1', 'article h1']
            }
            if domain in headline_selectors:
                for selector in headline_selectors[domain]:
                    headline = soup.select_one(selector)
                    if headline:
                        break
            if not headline:
                headline = soup.find('h1')
            headline_text = headline.get_text().strip() if headline else "No headline found"
            self.logger.info(f"Found headline: {headline_text}")

            # Find the content container: domain-specific selectors first, then generic ones.
            content_div = None
            content_selectors = {
                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
                'snopes.com': ['article']
            }
            if domain in content_selectors:
                for selector in content_selectors[domain]:
                    content_div = soup.select_one(selector)
                    if content_div:
                        break
            if not content_div:
                for selector in ['article', 'main', '.content', '.article-content']:
                    content_div = soup.select_one(selector)
                    if content_div:
                        break

            content = self._extract_content(content_div) if content_div else "No content found"
            if not content:
                self.logger.warning("No content found in article")
                self.logger.debug(f"Domain: {domain}")

            return {"headline": headline_text, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting article content: {e}")
            return {"headline": "Error", "content": f"Failed to extract content: {e}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main entry point for scraping fact-checking articles.

        Returns a dictionary with 'headline' and 'content' keys, or None if
        the page could not be fetched.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")
        return self._extract_article(soup, domain)
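

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration; the package name in the
    # usage string is a placeholder). Because of the relative import above,
    # run this file as part of its package, e.g. `python -m <package>.scraper URL`.
    import sys

    if len(sys.argv) < 2:
        raise SystemExit("usage: python -m <package>.scraper <article-url>")

    scraper = ArticleScraper()
    result = scraper.scrape_article(sys.argv[1])
    if result:
        print(result["headline"])
        print()
        print(result["content"])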