# reproduce/data/fetch_miccai.py
# Repository page header: attilasimko, "new evaluations", commit 8ac76ef
import pandas as pd
import requests
import re
from multiprocessing import Pool, cpu_count
from functools import partial
# Patterns for the parts of a paper page that are extracted below.
# Compiled once at import time; DOTALL lets a match span line breaks.
_TITLE_RE = re.compile(r'<title>(.*?)\s*</title>', re.DOTALL)
_CODE_REPO_RE = re.compile(
    r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">', re.DOTALL
)
_DATASET_RE = re.compile(
    r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />', re.DOTALL
)


# Function to process each URL
def process_paper(year, url, timeout=30):
    """Download one paper page and extract its title, code link and dataset flag.

    Args:
        year: Value stored unchanged under the "year" key of the result.
        url: Address of the paper page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A dict with the keys "title", "url", "year" and "public", or None
        when the page could not be downloaded or has no <title> element.
        "url" is the first link following the code heading ("" if there is
        none); "public" is "Yes" when a dataset section matching the
        expected markup is present and "No" otherwise.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors as failures instead of parsing an error page
        # as if it were a paper.
        response.raise_for_status()
        paper_page = response.text

        # Find title
        title_match = _TITLE_RE.search(paper_page)
        if title_match is None:
            raise ValueError("no <title> element found")
        title = title_match.group(1).strip()

        # Find the code repository link
        code_repo_match = _CODE_REPO_RE.search(paper_page)
        code_repo_link = code_repo_match.group(1) if code_repo_match else ""

        # Find the dataset information
        dataset_info = "Yes" if _DATASET_RE.search(paper_page) else "No"

        # Return a dictionary of the results
        return {
            "title": title,
            "url": code_repo_link,
            "year": year,
            "public": dataset_info,
        }
    except Exception as e:
        # Deliberately broad: a single bad page is reported and skipped
        # (None) rather than aborting a whole batch of downloads.
        print(f"Error processing {url}: {e}")
        return None
# NOTE: current_year and debug are not read anywhere in the visible part of
# this file; they are kept so that any other code relying on them still works.
current_year = 2024  # Update with the current year
# Three parallel lists: entry i of each describes the same conference edition.
# MICCAI_pages holds the index page listing that edition's papers,
# MICCAI_root the prefix each href found on that page is appended to.
MICCAI_pages = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org/2022/papers/",
    "https://conferences.miccai.org/2023/papers/",
]
MICCAI_root = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org",
    "https://conferences.miccai.org",
]
years = [2021, 2022, 2023]
# Set debug mode
debug = False


def _extract_paper_links(index_html, root):
    """Return root + href for every line of index_html containing "&bullet".

    Only the first href on each such line is used. Lines that contain the
    bullet marker but no href are skipped instead of raising IndexError.
    """
    links = []
    for line in index_html.split('\n'):
        if "&bullet" not in line:
            continue
        _, marker, tail = line.partition('href="')
        if not marker:
            continue
        links.append(root + tail.split('"')[0])
    return links


# Fetch all URLs for each year
all_year_urls = []
for index_url, root, year in zip(MICCAI_pages, MICCAI_root, years):
    # The timeout keeps an unresponsive server from hanging the script.
    year_page = requests.get(index_url, timeout=30).text
    print(year_page)
    urls = _extract_paper_links(year_page, root)
    all_year_urls.extend((year, url) for url in urls)
print(all_year_urls)
# Parallel processing using Pool
# if __name__ == "__main__":
#     with Pool(processes=12) as pool:  # Use 12 processes
#         results = pool.starmap(process_paper, all_year_urls)
#     # Filter out any None results due to errors
#     results = [result for result in results if result is not None]
#     miccai = pd.DataFrame(results)
#     # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
#     miccai.to_csv('miccai.csv')