# reproduce/data/fetch_nature.py
# Author: attilasimko
# Commit: new evaluations (8ac76ef)
import pandas as pd
import requests
import re
from multiprocessing import Pool, cpu_count
from functools import partial
def process_nature_paper(article_link):
    """Fetch one Nature article page and extract publication metadata.

    Parameters
    ----------
    article_link : str
        Article slug, appended to https://www.nature.com/articles/.

    Returns
    -------
    dict or None
        Keys: "title", "url" (code-availability link, GitHub preferred),
        "year", "public" ("Yes"/"No" for a data-availability link), "pdf"
        (always ""). Returns None when the page cannot be fetched or a
        required field (year, title) is absent.
    """
    page_url = f'https://www.nature.com/articles/{article_link}'
    try:
        # Timeout prevents a hung request from stalling a pool worker forever.
        article_text = requests.get(page_url, timeout=30).text

        # First link(s) after the "Code availability" heading; prefer GitHub.
        code_links = re.findall(r'Code availability.*?<a href="([^"]+)"',
                                article_text, re.DOTALL)
        github_links = [link for link in code_links if "github" in link]
        code_url = github_links[0] if github_links else (code_links[0] if code_links else "")

        # Publication year: first 4-digit datetime attribute on the page.
        # Raises IndexError (caught below) when absent — such articles are
        # deliberately skipped by returning None.
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Page <title>, trailing whitespace before </title> stripped.
        title = re.findall(r'<title>(.*?)\s*</title>', article_text, re.DOTALL)[0]

        # Any link under "Data availability" counts as publicly available data.
        data_links = re.findall(r'Data availability.*?<a href="([^"]+)"',
                                article_text, re.DOTALL)
        dataset_info = "Yes" if data_links else "No"

        return {"title": title, "url": code_url, "year": year,
                "public": dataset_info, "pdf": ""}
    except Exception as e:
        # Report against the article page URL; the original reused `url` for
        # the code link, so errors could name the wrong address.
        print(f"Error processing {page_url}: {e}")
        return None
# Set debug mode (kept for parity with the original script; currently unused).
debug = False

# Nature search result pages to crawl; the page number is appended per request.
search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]


def _collect_article_links():
    """Crawl the search result pages and return sorted, deduplicated slugs.

    Walks up to 100 result pages per query and stops early when a page
    yields no article links. Uses sorted(set(...)) — the original called
    np.unique without importing numpy, which raised NameError.
    """
    articles = []
    for search_query in search_queries:
        page = 1
        while page <= 100:
            url = f"{search_query}{page}"
            # Timeout so a stalled search page cannot hang the crawl.
            current_page = requests.get(url, timeout=30).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if not matches:
                break
            articles += matches
            page += 1
    return sorted(set(articles))


if __name__ == "__main__":
    # All work lives behind the __main__ guard: with the "spawn" start method
    # (Windows/macOS) each Pool worker re-imports this module, and unguarded
    # module-level code would re-run the entire crawl in every worker.
    articles = _collect_article_links()

    # Parallel scrape of each article page.
    with Pool(processes=12) as pool:
        results = pool.map(process_nature_paper, articles)

    # Drop articles that failed to fetch/parse (process_nature_paper -> None).
    results = [result for result in results if result is not None]

    # Persist as CSV with a fixed column order.
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    nature.to_csv('nature.csv')