"""Collect code-repository links and dataset availability for MICCAI papers.

The script downloads the per-year paper index pages, derives one URL per
listed paper and (in the currently disabled section at the bottom) scrapes
every paper page in parallel and writes the result to ``miccai.csv``.
"""
import re
from functools import partial
from multiprocessing import Pool, cpu_count

import pandas as pd
import requests

# Seconds to wait for any single HTTP request before giving up; without a
# timeout one stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# NOTE(review): in this copy of the file the three patterns below consist of
# wildcards and whitespace only, and the code-repository pattern has no
# capture group even though ``group(1)`` is read from its match. The literal
# page markup they anchored on appears to have been lost. They are reproduced
# exactly as found; restore them from the original script before relying on
# the scraped values.
_TITLE_RE = re.compile(r'(.*?)\s*', re.DOTALL)
_CODE_REPO_RE = re.compile(r'''

.*?

\s*

''', re.DOTALL)
_DATASET_RE = re.compile(r'''

.*?

\s*

(.*?)\s*
''', re.DOTALL)


# Function to process each URL
def process_paper(year, url):
    """Download one paper page and extract title, code link and dataset flag.

    Args:
        year: Conference year; copied unchanged into the result.
        url: Address of the paper page to download.

    Returns:
        A dict with the keys ``"title"``, ``"url"`` (the code-repository
        link, or ``""`` when the code pattern does not match), ``"year"`` and
        ``"public"`` (``"Yes"`` when the dataset pattern matches, otherwise
        ``"No"``). Returns ``None`` after printing a message when the
        download or the extraction fails.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP error pages as failures instead of scraping them.
        response.raise_for_status()
        paper_page = response.text

        # Find title
        title_match = _TITLE_RE.search(paper_page)
        if title_match is None:
            print(f"Error processing {url}: title not found")
            return None
        title = title_match.group(1)

        # Find the code repository link
        code_repo_match = _CODE_REPO_RE.search(paper_page)
        code_repo_link = code_repo_match.group(1) if code_repo_match else ""

        # Find the dataset information
        dataset_match = _DATASET_RE.search(paper_page)
        dataset_info = "Yes" if dataset_match else "No"

        # Return a dictionary of the results
        return {
            "title": title,
            "url": code_repo_link,
            "year": year,
            "public": dataset_info,
        }
    except Exception as e:
        # Best effort: one bad page must not abort the whole crawl.
        print(f"Error processing {url}: {e}")
        return None


def _extract_paper_urls(index_html, root):
    """Return ``root + href`` for each index line that contains "&bullet".

    Lines that carry the marker but no ``href="`` attribute are skipped
    rather than raising ``IndexError``.
    """
    href_marker = 'href="'
    return [
        root + line.split(href_marker)[1].split('"')[0]
        for line in index_html.split('\n')
        if "&bullet" in line and href_marker in line
    ]


current_year = 2024  # Update with the current year
MICCAI_pages = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org/2022/papers/",
    "https://conferences.miccai.org/2023/papers/",
]
MICCAI_root = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org",
    "https://conferences.miccai.org",
]
years = [2021, 2022, 2023]

# Set debug mode
debug = False

# Fetch all URLs for each year.
# NOTE(review): this runs at import time. If the Pool section below is
# re-enabled, consider moving it under the ``__main__`` guard as well, since
# worker processes started with the "spawn" method re-import this module.
all_year_urls = []
for year, index_url, root in zip(years, MICCAI_pages, MICCAI_root):
    year_page = requests.get(index_url, timeout=REQUEST_TIMEOUT).text
    print(year_page)
    all_year_urls.extend(
        (year, url) for url in _extract_paper_urls(year_page, root)
    )
# NOTE(review): indentation was lost in this copy of the file; this print is
# assumed to follow the loop rather than run once per year.
print(all_year_urls)

# Parallel processing using Pool
# if __name__ == "__main__":
#     with Pool(processes=12) as pool:  # Use 12 processes
#         results = pool.starmap(process_paper, all_year_urls)
#     # Filter out any None results due to errors
#     results = [result for result in results if result is not None]
#     miccai = pd.DataFrame(results)
#     # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
#     miccai.to_csv('miccai.csv')