ArneBinder's picture
https://github.com/ArneBinder/pie-document-level/pull/312
3133b5e verified
raw
history blame
2.27 kB
import logging
import re
from typing import Tuple
import arxiv
import gradio as gr
import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def clean_spaces(text: str) -> str:
    """Normalize whitespace in *text*.

    Runs of spaces collapse to a single space, three or more consecutive
    newlines shrink to exactly two (i.e. at most one blank line), and
    leading/trailing whitespace is stripped.
    """
    without_space_runs = re.sub(" +", " ", text)
    without_blank_runs = re.sub("\n\n+", "\n\n", without_space_runs)
    return without_blank_runs.strip()
def get_cleaned_arxiv_paper_text(html_content: str) -> str:
    """Extract and normalize the main article text from an arXiv HTML page.

    Args:
        html_content: raw HTML of an arXiv paper page (the /html/ rendering).

    Returns:
        The text content of the page's ``<article>`` element with whitespace
        normalized via :func:`clean_spaces`.

    Raises:
        ValueError: if the HTML contains no ``<article>`` element.
    """
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # the main paper content lives in the <article> element
    article = soup.find("article")
    if article is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError("could not find an <article> element in the provided HTML")
    article_text = article.get_text()
    # cleanup the text
    return clean_spaces(article_text)
def load_text_from_arxiv(arxiv_id: str, abstract_only: bool = False) -> Tuple[str, str]:
    """Fetch the text of an arXiv paper (or just its abstract).

    Args:
        arxiv_id: the arXiv identifier of the paper, e.g. "2105.12345".
        abstract_only: if True, return only the abstract (with the feed's
            hard line-wrapping undone) instead of the full paper text.

    Returns:
        A tuple ``(text, url)``: the cleaned text and the URL it came from
        (the abstract entry URL, or the /html/ rendering URL for full text).

    Raises:
        gr.Error: if the arXiv lookup fails, no paper matches the ID, the
            entry ID has an unexpected format, or the HTML fetch fails.
    """
    search_by_id = arxiv.Search(id_list=[arxiv_id])
    try:
        results = list(arxiv.Client().results(search_by_id))
    except arxiv.HTTPError as e:
        # chain the cause so the underlying HTTP failure stays debuggable
        raise gr.Error(f"Failed to fetch arXiv data: {e}") from e
    if not results:
        raise gr.Error(f"Could not find any paper with arXiv ID '{arxiv_id}'")
    first_result = results[0]
    if abstract_only:
        # the Atom feed hard-wraps the abstract; undo the wrapping
        abstract_clean = first_result.summary.replace("\n", " ")
        return abstract_clean, first_result.entry_id
    # the HTML rendering lives at the /abs/ URL with /abs/ swapped for /html/
    if "/abs/" not in first_result.entry_id:
        raise gr.Error(
            f"Could not create the HTML URL for arXiv ID '{arxiv_id}' because its entry ID has "
            f"an unexpected format: {first_result.entry_id}"
        )
    html_url = first_result.entry_id.replace("/abs/", "/html/")
    # timeout so a stalled connection cannot hang the UI indefinitely
    request_result = requests.get(html_url, timeout=30)
    if request_result.status_code != 200:
        raise gr.Error(
            f"Could not fetch the HTML content for arXiv ID '{arxiv_id}', status code: "
            f"{request_result.status_code}"
        )
    html_content = request_result.text
    text_clean = get_cleaned_arxiv_paper_text(html_content)
    return text_clean, html_url