# reproduce/data/fetch_nature.py
# Author: attilasimko
# Commit: new evaluations (8ac76ef)
import pandas as pd
import requests
import re
from multiprocessing import Pool, cpu_count
from functools import partial
def process_nature_paper(article_link):
    """Fetch one Nature article page and extract publication metadata.

    Parameters
    ----------
    article_link : str
        Article slug, appended to https://www.nature.com/articles/.

    Returns
    -------
    dict or None
        Keys: "title", "url" (code-availability link, GitHub preferred),
        "year", "public" ("Yes"/"No" for a data-availability link), "pdf"
        (always ""). Returns None when the page cannot be fetched or a
        required field (year, title) is absent.
    """
    page_url = f'https://www.nature.com/articles/{article_link}'
    try:
        # Timeout prevents a hung request from stalling a pool worker forever.
        article_text = requests.get(page_url, timeout=30).text

        # First link(s) after the "Code availability" heading; prefer GitHub.
        code_links = re.findall(r'Code availability.*?<a href="([^"]+)"',
                                article_text, re.DOTALL)
        github_links = [link for link in code_links if "github" in link]
        code_url = github_links[0] if github_links else (code_links[0] if code_links else "")

        # Publication year: first 4-digit datetime attribute on the page.
        # Raises IndexError (caught below) when absent — such articles are
        # deliberately skipped by returning None.
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Page <title>, trailing whitespace before </title> stripped.
        title = re.findall(r'<title>(.*?)\s*</title>', article_text, re.DOTALL)[0]

        # Any link under "Data availability" counts as publicly available data.
        data_links = re.findall(r'Data availability.*?<a href="([^"]+)"',
                                article_text, re.DOTALL)
        dataset_info = "Yes" if data_links else "No"

        return {"title": title, "url": code_url, "year": year,
                "public": dataset_info, "pdf": ""}
    except Exception as e:
        # Report against the article page URL; the original reused `url` for
        # the code link, so errors could name the wrong address.
        print(f"Error processing {page_url}: {e}")
        return None
# Set debug mode (kept for parity with the original script; currently unused).
debug = False

# Nature search result pages to crawl; the page number is appended per request.
search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]


def _collect_article_links():
    """Crawl the search result pages and return sorted, deduplicated slugs.

    Walks up to 100 result pages per query and stops early when a page
    yields no article links. Uses sorted(set(...)) — the original called
    np.unique without importing numpy, which raised NameError.
    """
    articles = []
    for search_query in search_queries:
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            # Timeout so a stalled search page cannot hang the crawl.
            current_page = requests.get(url, timeout=30).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if not matches:
                break
            articles += matches
            page += 1
    return sorted(set(articles))


if __name__ == "__main__":
    # All work lives behind the __main__ guard: with the "spawn" start method
    # (Windows/macOS) each Pool worker re-imports this module, and unguarded
    # module-level code would re-run the entire crawl in every worker.
    articles = _collect_article_links()

    # Parallel scrape of each article page.
    with Pool(processes=12) as pool:
        results = pool.map(process_nature_paper, articles)

    # Drop articles that failed to fetch/parse (process_nature_paper -> None).
    results = [result for result in results if result is not None]

    # Persist as CSV with a fixed column order.
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    nature.to_csv('nature.csv')