# reproduce/data/fetch_arxiv.py
import os
import re
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pdfplumber
import requests

# Scrape one arXiv abstract page and its PDF, returning metadata and the first GitHub link found
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        # Title and submission year are parsed from the abstract page HTML
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        # Download the PDF for this article
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        response = requests.get(pdf_url)
        if response.status_code != 200:
            print(f"Failed to fetch pdf for {article_id}")
            return None
        with open(f"{article_id}.pdf", 'wb') as file:
            file.write(response.content)
        # Extract all GitHub links from the PDF text
        urls = []
        link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
        with pdfplumber.open(f"{article_id}.pdf") as pdf:
            for page in pdf.pages:
                # extract_text() may return None for pages without extractable text
                text = page.extract_text() or ""
                urls.extend(re.findall(link_pattern, text))
        os.remove(f"{article_id}.pdf")
        # Drop links to framework repositories (pytorch, fchollet) and keep only links
        # that point to a specific repository (at least a user/repo path)
        urls = [url for url in urls
                if "pytorch" not in url
                and "fchollet" not in url
                and len(url.split("github.com")[1].split("/")) >= 3]
        print(urls)
        url = urls[0] if len(urls) > 0 else ""
        # Return a dictionary of the results
        return {"venue": "arXiv", "title": title, "url": url, "year": year}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
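
# A minimal sketch of calling the function by hand; the arXiv id below is
# hypothetical and only illustrates the shape of the returned dictionary:
#
#     result = process_arxiv_paper("https://arxiv.org/abs/2101.00001")
#     # -> {"venue": "arXiv", "title": "...", "url": "https://github.com/user/repo", "year": "2021"}
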
# Set debug mode
debug = False
# Collect article links from the arXiv advanced search, one result page at a time
page_size = 50
search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
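# The query above searches abstracts containing both "deep learning" and "cancer",
# restricted to submissions from 2018 through 2024, 50 results per page; the
# trailing "start=" parameter is filled in with the page offset below.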
articles = []
for search_query in search_queries:
    page = 0
    while page <= 100:
        start_idx = page_size * page
        url = f"{search_query}{start_idx}"
        current_page = requests.get(url).text
        # Each search result is listed with a link to its abstract page
        pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
        matches = re.findall(pattern, current_page)
        if len(matches) == 0:
            break
        page += 1
        articles += matches
articles = np.unique(articles)
# Parallel processing using Pool
if __name__ == "__main__":
    with Pool(processes=4) as pool:
        results = pool.map(process_arxiv_paper, articles)
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('arxiv.csv')
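
# The resulting arxiv.csv has one row per paper with the columns produced above:
# venue ("arXiv"), title, url (first qualifying GitHub link found in the PDF,
# or "" when none was found), and year of submission.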