import logging
import re
from typing import Tuple

import arxiv
import gradio as gr
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def clean_spaces(text: str) -> str:
    """Collapse runs of spaces and of blank lines, then strip surrounding whitespace."""
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n\n+", "\n\n", text)
    return text.strip()
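# Illustrative example (not from the original source):
#     clean_spaces("Attention  is\n\n\n\nall you need.  ")
# returns
#     "Attention is\n\nall you need."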


def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract the article text from an arXiv HTML page and normalize its whitespace."""
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove the LaTeXML conversion-warning banner, if present, so its text
    # does not end up in the extracted article text.
    alerts = soup.find("div", class_="package-alerts ltx_document")
    if alerts is not None:
        alerts.decompose()
    article = soup.find("article")
    if article is None:
        raise gr.Error("Could not find an <article> element in the arXiv HTML page")
    return clean_spaces(article.get_text())
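# Illustrative example with hypothetical inline HTML (not from the original source):
#     get_cleaned_arxiv_paper_text("<article><h1>Title</h1>\n<p>Body  text</p></article>")
# returns "Title\nBody text".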


def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch a paper from arXiv and return a (text, url) pair.

    If abstract_only is True, return the abstract and the entry URL; otherwise
    fetch the paper's HTML rendering and return the cleaned full text and the
    HTML URL.
    """
    # Look up the paper metadata via the arXiv API.
    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        results = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        raise gr.Error(f"Failed to fetch arXiv data: {e}")
    if len(results) == 0:
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = results[0]
    if abstract_only:
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    # The entry ID looks like "http://arxiv.org/abs/<id>"; derive the HTML URL from it.
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    return get_cleaned_arxiv_paper_text(request_result.text), html_url
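

# A minimal manual check (a sketch, not part of the original module): assumes
# network access and that arXiv serves an HTML rendering for the paper.
# "1706.03762" ("Attention Is All You Need") is used purely as an example ID.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    abstract, entry_url = load_text_from_arxiv("1706.03762", abstract_only=True)
    print(f"Entry: {entry_url}")
    print(abstract[:300])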