import numpy as np
import pandas as pd
import requests
import re
from multiprocessing import Pool
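
# Scrape Nature search results for deep-learning/AI papers in selected journals,
# then pull the title, year, code link, and data-availability flag from each
# article page and save everything to nature.csv.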
# Function to process each article: fetch the page and scrape metadata
def process_nature_paper(article_link):
    try:
        url = f'https://www.nature.com/articles/{article_link}'
        article_text = requests.get(url).text

        # First link in the "Code availability" section, preferring GitHub links
        pattern = r'Code availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        urls = [link for link in matches if "github" in link]
        url = urls[0] if urls else (matches[0] if matches else "")

        # Publication year: first datetime attribute on the page
        year = re.findall(r'datetime="(\d{4})', article_text)[0]

        # Find title
        title_pattern = r'<title>(.*?)\s*</title>'
        title = re.findall(title_pattern, article_text, re.DOTALL)[0]

        # A linked "Data availability" section counts as public data
        pattern = r'Data availability.*?<a href="([^"]+)"'
        matches = re.findall(pattern, article_text, re.DOTALL)
        dataset_info = "Yes" if matches else "No"

        # Return a dictionary of the results
        return {"title": title, "url": url, "year": year, "public": dataset_info, "pdf": ""}
    except Exception as e:
        print(f"Error processing {article_link}: {e}")
        return None
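
# Usage sketch (the slug below is a hypothetical placeholder, not a real paper):
#   row = process_nature_paper("s41591-000-00000-0")
#   # -> {"title": ..., "url": ..., "year": ..., "public": "Yes"/"No", "pdf": ""}
#   # or None if any lookup above fails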
# Set debug mode
debug = False

# Everything below runs once in the parent process only; on platforms that
# spawn workers (Windows/macOS), module-level code would otherwise re-execute
# in every Pool worker and re-scrape the search pages.
if __name__ == "__main__":
    # Fetch article slugs from the paginated search results for each query
    search_queries = [
        "https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=",
        "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page=",
    ]
    articles = []
    for search_query in search_queries:
        page = 1
        # An empty results page signals the end; 100 pages is a safety bound
        while page <= 100:
            url = f"{search_query}{page}"
            current_page = requests.get(url).text
            matches = re.findall(r'href="/articles/([^"]+)"', current_page)
            if len(matches) == 0:
                break
            articles += matches
            page += 1

    # Deduplicate slugs (np.unique also sorts them)
    articles = np.unique(articles)
    # Process the articles in parallel using Pool
    with Pool(processes=12) as pool:
        results = pool.map(process_nature_paper, articles)

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    # Convert the list of dictionaries to a DataFrame and write it out
    nature = pd.DataFrame(results)
    nature = nature[['title', 'year', 'pdf', 'url', 'public']]
    nature.to_csv('nature.csv')
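
    # Note: to_csv also writes pandas' default integer index as an extra first
    # column; pass index=False if only the five named columns are wanted.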