# reproduce/data/fetch_arxiv.py
import os
import re
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pdfplumber
import requests

# Scrape one arXiv abstract page and its PDF, returning metadata and the first GitHub link found
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text
        # Title and submission year are parsed from the abstract page HTML
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
        # Download the PDF for this article
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        response = requests.get(pdf_url)
        if response.status_code != 200:
            print(f"Failed to fetch pdf for {article_id}")
            return None
        with open(f"{article_id}.pdf", 'wb') as file:
            file.write(response.content)
        # Extract all GitHub links from the PDF text
        urls = []
        link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
        with pdfplumber.open(f"{article_id}.pdf") as pdf:
            for page in pdf.pages:
                # extract_text() may return None for pages without extractable text
                text = page.extract_text() or ""
                urls.extend(re.findall(link_pattern, text))
        os.remove(f"{article_id}.pdf")
        # Drop links to framework repositories (pytorch, fchollet) and keep only links
        # that point to a specific repository (at least a user/repo path)
        urls = [url for url in urls
                if "pytorch" not in url
                and "fchollet" not in url
                and len(url.split("github.com")[1].split("/")) >= 3]
        print(urls)
        url = urls[0] if len(urls) > 0 else ""
        # Return a dictionary of the results
        return {"venue": "arXiv", "title": title, "url": url, "year": year}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
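
# A minimal sketch of calling the function by hand; the arXiv id below is
# hypothetical and only illustrates the shape of the returned dictionary:
#
#     result = process_arxiv_paper("https://arxiv.org/abs/2101.00001")
#     # -> {"venue": "arXiv", "title": "...", "url": "https://github.com/user/repo", "year": "2021"}
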
# Set debug mode
debug = False
# Collect article links from the arXiv advanced search, one result page at a time
page_size = 50
search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
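# The query above searches abstracts containing both "deep learning" and "cancer",
# restricted to submissions from 2018 through 2024, 50 results per page; the
# trailing "start=" parameter is filled in with the page offset below.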
articles = []
for search_query in search_queries:
    page = 0
    while page <= 100:
        start_idx = page_size * page
        url = f"{search_query}{start_idx}"
        current_page = requests.get(url).text
        # Each search result is listed with a link to its abstract page
        pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
        matches = re.findall(pattern, current_page)
        if len(matches) == 0:
            break
        page += 1
        articles += matches
articles = np.unique(articles)
# Parallel processing using Pool
if __name__ == "__main__":
    with Pool(processes=4) as pool:
        results = pool.map(process_arxiv_paper, articles)
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    # Convert the list of dictionaries to a DataFrame and save it
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('arxiv.csv')
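
# The resulting arxiv.csv has one row per paper with the columns produced above:
# venue ("arXiv"), title, url (first qualifying GitHub link found in the PDF,
# or "" when none was found), and year of submission.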