import re
from multiprocessing import Pool

import pandas as pd
import requests

# Fetch one paper page and extract its title, code-repository link, and dataset info
def process_paper(year, url):
    try:
        paper_page = requests.get(url, timeout=30).text

        # Find title
        title_pattern = r'<title>(.*?)\s*</title>'
        title_match = re.search(title_pattern, paper_page, re.DOTALL)
        title = title_match.group(1) if title_match else ""

        # Find the code repository link
        code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
        code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
        code_repo_link = code_repo_match.group(1) if code_repo_match else ""

        # Find the dataset information
        dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
        dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
        dataset_info = "Yes" if dataset_match else "No"

        # Return a dictionary of the results
        return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

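# A sketch of the page structure the regexes above assume (inferred from the
# patterns themselves, not verified against the live sites):
#
#   <title>Paper Title </title>
#   ...
#   <h1 id="code-id">Code</h1>
#   <p><a href="https://github.com/...">...</a></p>
#   ...
#   <h1 id="dataset-id">Dataset</h1>
#   <p>Dataset description <br />
#
# If a page deviates from this, the fields fall back to "" (title, url) and
# "No" (public) instead of raising.
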
MICCAI_pages = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org/2022/papers/",
    "https://conferences.miccai.org/2023/papers/",
]
# Roots differ per year: the 2021 page presumably uses hrefs relative to the
# paperlinks page, while 2022/2023 presumably use site-absolute paths.
MICCAI_root = [
    "https://miccai2021.org/openaccess/paperlinks/",
    "https://conferences.miccai.org",
    "https://conferences.miccai.org",
]
years = [2021, 2022, 2023]

# Set debug mode
debug = False

# The scrape runs under the __main__ guard so that multiprocessing workers
# (which re-import this module) do not repeat it.
if __name__ == "__main__":
    # Fetch all paper URLs for each year; paper links sit on "&bullet" lines
    all_year_urls = []
    for year, page, root in zip(years, MICCAI_pages, MICCAI_root):
        year_page = requests.get(page, timeout=30).text
        if debug:
            print(year_page)
        urls = [root + line.split('href="')[1].split('"')[0]
                for line in year_page.split('\n') if "&bullet" in line]
        all_year_urls.extend((year, url) for url in urls)

    if debug:
        print(all_year_urls)

    # Scrape all paper pages in parallel (12 worker processes)
    with Pool(processes=12) as pool:
        results = pool.starmap(process_paper, all_year_urls)

    # Filter out any None results due to errors
    results = [result for result in results if result is not None]

    miccai = pd.DataFrame(results)
    miccai.to_csv('miccai.csv')
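
# Expected output (a sketch, assuming the scrape succeeds): miccai.csv with one
# row per paper and columns title, url (code-repository link, "" when none was
# found), year, and public ("Yes"/"No" flag for a detected dataset section).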