        # NOTE: the exact HTML-tag patterns were lost in extraction; the regexes below are a
        # plausible reconstruction of the MICCAI paper-page markup, not the verified originals.
        code_repo_pattern = r'Code.*?<a\s+href="(.*?)"'
        code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1) if code_repo_match else ""

        # Find the dataset information (is a public dataset mentioned on the page?)
        dataset_pattern = r'Dataset.*?<p>\s*(.*?)\s*</p>'
        dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"

        # Return a dictionary of the results
        return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
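
# Quick single-paper check (the URL below is a hypothetical example of a 2021 paper page;
# substitute a real link from the listing pages collected further down):
# print(process_paper(2021, "https://miccai2021.org/openaccess/paperlinks/Example_Paper.html"))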
current_year = 2024 # Update with the current year
MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org/2022/papers/", "https://conferences.miccai.org/2023/papers/"]
MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org", "https://conferences.miccai.org"]
years = [2021, 2022, 2023]
# Set debug mode
debug = False
# Fetch all URLs for each year
all_year_urls = []
for i in range(len(MICCAI_pages)):
    year_page = requests.get(MICCAI_pages[i]).text
    if debug:
        print(year_page)
    # Paper links are listed as bullet items; pull the href out of every matching line
    urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0] for line in year_page.split('\n') if "&bullet" in line]
    all_year_urls.extend([(years[i], url) for url in urls])
print(all_year_urls)
# Parallel processing using Pool
# if __name__ == "__main__":
#     with Pool(processes=12) as pool:  # Use 12 processes
#         results = pool.starmap(process_paper, all_year_urls)
#         # Filter out any None results due to errors
#         results = [result for result in results if result is not None]
#         miccai = pd.DataFrame(results)
#         # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
#         miccai.to_csv('miccai.csv')
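
# Sequential sketch for quick testing (an assumption, not part of the original pipeline): run a
# handful of papers through process_paper when debug is enabled, leaving the default run untouched.
if debug:
    sample = [process_paper(year, url) for year, url in all_year_urls[:5]]
    print([r for r in sample if r is not None])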