ArneBinder's picture
https://github.com/ArneBinder/pie-document-level/pull/312
3133b5e verified
raw
history blame
2.27 kB
import logging
import re
from typing import Tuple
import arxiv
import gradio as gr
import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def clean_spaces(text: str) -> str:
    """Normalize whitespace in *text*.

    Runs of spaces collapse to a single space, three or more consecutive
    newlines shrink to exactly two (i.e. at most one blank line), and
    leading/trailing whitespace is stripped.
    """
    without_space_runs = re.sub(" +", " ", text)
    without_blank_runs = re.sub("\n\n+", "\n\n", without_space_runs)
    return without_blank_runs.strip()
def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract and normalize the main article text from an arXiv HTML page.

    Args:
        html_content: raw HTML of an arXiv paper page (the /html/ rendering).

    Returns:
        The text content of the page's ``<article>`` element with whitespace
        normalized via :func:`clean_spaces`.

    Raises:
        ValueError: if the HTML contains no ``<article>`` element.
    """
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # the main paper content lives in the <article> element
    article = soup.find("article")
    if article is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError("could not find an <article> element in the provided HTML")
    article_text = article.get_text()
    # cleanup the text
    return clean_spaces(article_text)
def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch the text of an arXiv paper (or just its abstract).

    Args:
        arxiv_id: the arXiv identifier of the paper, e.g. "2105.12345".
        abstract_only: if True, return only the abstract (with the feed's
            hard line-wrapping undone) instead of the full paper text.

    Returns:
        A tuple ``(text, url)``: the cleaned text and the URL it came from
        (the abstract entry URL, or the /html/ rendering URL for full text).

    Raises:
        gr.Error: if the arXiv lookup fails, no paper matches the ID, the
            entry ID has an unexpected format, or the HTML fetch fails.
    """
    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        results = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        # chain the cause so the underlying HTTP failure stays debuggable
        raise gr.Error(f"Failed to fetch arXiv data: {e}") from e
    if not results:
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = results[0]
    if abstract_only:
        # the Atom feed hard-wraps the abstract; undo the wrapping
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    # the HTML rendering lives at the /abs/ URL with /abs/ swapped for /html/
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # timeout so a stalled connection cannot hang the UI indefinitely
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url