import re
from multiprocessing import Pool

import pandas as pd
import requests
# Process a single paper page: extract the title, the code-repository link,
# and whether dataset information is listed.
def process_paper(year, url):
    try:
        paper_page = requests.get(url).text

        # Find the paper title
        title_pattern = r'<title>(.*?)\s*</title>'
        title_match = re.search(title_pattern, paper_page, re.DOTALL)
        title = title_match.group(1) if title_match else ""

        # Find the code repository link
        code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
        code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1) if code_repo_match else ""

        # Find the dataset information
        dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
        dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"

        # Return a dictionary of the results
        return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
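# Usage sketch (the paper URL below is illustrative, not taken from the lists
# further down): a successful call returns a dict, and None on any failure.
#   process_paper(2023, "https://conferences.miccai.org/2023/papers/0001-SomePaper.html")
#   -> {"title": "...", "url": "https://github.com/...", "year": 2023, "public": "Yes"}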
current_year = 2024  # Update with the current year

MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/",
                "https://conferences.miccai.org/2022/papers/",
                "https://conferences.miccai.org/2023/papers/"]
MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/",
               "https://conferences.miccai.org",
               "https://conferences.miccai.org"]
years = [2021, 2022, 2023]

# Set debug mode
debug = False
# Fetch all paper URLs for each year
all_year_urls = []
for i in range(len(MICCAI_pages)):
    year_page = requests.get(MICCAI_pages[i]).text
    if debug:
        print(year_page)
    # Paper links are the lines containing an "&bullet" marker; join the
    # extracted href with the per-year root to form an absolute URL.
    urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0]
            for line in year_page.split('\n') if "&bullet" in line]
    all_year_urls.extend([(years[i], url) for url in urls])
if debug:
    print(all_year_urls)
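# all_year_urls now holds (year, url) pairs, e.g.
#   [(2021, "https://miccai2021.org/openaccess/paperlinks/....html"), ...]
# matching the (year, url) signature of process_paper for starmap below.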
# Parallel processing using Pool
if __name__ == "__main__":
    with Pool(processes=12) as pool:  # Use 12 worker processes
        results = pool.starmap(process_paper, all_year_urls)
    # Filter out any None results due to errors
    results = [result for result in results if result is not None]
    miccai = pd.DataFrame(results)
    miccai.to_csv('miccai.csv')
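# A sketch of an alternative pool size (an assumption, not in the original
# script): multiprocessing.cpu_count() scales the pool to the machine
# instead of a fixed 12.
#   with Pool(processes=cpu_count()) as pool:
#       results = pool.starmap(process_paper, all_year_urls)
# Note that on spawn-based platforms (Windows, and macOS on recent Pythons),
# workers re-import this module, so the top-level URL fetching above would run
# in every worker; moving it under the __main__ guard avoids that.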