import os
import re
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pdfplumber
import requests
# Process a single arXiv abstract page: scrape the title and submission year,
# download the PDF, and pull the first usable GitHub link out of it.
def process_arxiv_paper(article_link):
    try:
        article_text = requests.get(article_link).text

        # Title sits inside the <h1 class="title mathjax"> element
        title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0].strip()

        # Submission year comes from the "[Submitted on 1 Jan 2020 ...]" banner
        year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
        year = re.findall(year_pattern, article_text)[0].split(" ")[-1]

        # Download the PDF for this article
        article_id = article_link.split("/")[-1]
        pdf_url = f'https://arxiv.org/pdf/{article_id}'
        response = requests.get(pdf_url)
        if response.status_code != 200:
            print(f"Failed to fetch PDF for {article_id} (status {response.status_code})")
            return None
        with open(f"{article_id}.pdf", 'wb') as file:
            file.write(response.content)

        # Scan every page of the PDF for GitHub links
        urls = []
        link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
        with pdfplumber.open(f"{article_id}.pdf") as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:  # extract_text() can return None on image-only pages
                    urls.extend(re.findall(link_pattern, text))
        os.remove(f"{article_id}.pdf")

        # Keep only links that point at an actual user/repo path, dropping
        # generic framework links (pytorch, fchollet/keras)
        urls = [
            url for url in urls
            if "pytorch" not in url
            and "fchollet" not in url
            and len(url.split("github.com")[1].split("/")) >= 3
        ]
        url = urls[0] if urls else ""

        # Return a dictionary of the results
        return {"venue": "arXiv", "title": title, "url": url, "year": year}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
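# Optional sanity check (illustration only, left commented out): run the scraper
# on a single abstract page before launching the full pool. The URL below is a
# placeholder, not a specific paper used by this script.
# example = process_arxiv_paper("https://arxiv.org/abs/XXXX.XXXXX")
# print(example)  # -> {"venue": "arXiv", "title": ..., "url": ..., "year": ...} or None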
# Set debug mode
debug = False

# Fetch the abstract-page links for every search result, 50 results per page
all_year_urls = []
page_size = 50
search_queries = [
    'https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start='
]

articles = []
for search_query in search_queries:
    page = 0
    while page <= 100:
        start_idx = page_size * page
        url = f"{search_query}{start_idx}"
        current_page = requests.get(url).text

        # Each search result links to its abstract page from a "list-title" paragraph
        pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
        matches = re.findall(pattern, current_page)
        if len(matches) == 0:
            break
        page += 1
        articles += matches

# Deduplicate article links gathered across pages
articles = np.unique(articles)
# Parallel processing using Pool
if __name__ == "__main__":
    with Pool(processes=4) as pool:
        results = pool.map(process_arxiv_paper, articles)

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame and write it to disk
    arxiv = pd.DataFrame(results)
    arxiv.to_csv('arxiv.csv')
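    # The resulting arxiv.csv has one row per processed paper with the columns
    # venue ("arXiv"), title, url (first surviving GitHub link, or an empty
    # string), and year, plus pandas' default index column.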