attilasimko committed
Commit 77f290b · 1 Parent(s): cd14e4d

What did I do before then?
.env ADDED
@@ -0,0 +1 @@
+ githubToken="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"
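Note: midl.py (added later in this commit) reads this token with python-dotenv. A minimal sketch of how the variable is consumed, assuming the script is run from the repository root so the .env file is found:

    from dotenv import load_dotenv
    import os

    load_dotenv()                      # reads the .env file in the working directory
    token = os.getenv("githubToken")   # same key as defined above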
.gitignore ADDED
@@ -0,0 +1,2 @@
+ data/*.csv
+ data/*.zip
data/fetch_arxiv.py ADDED
@@ -0,0 +1,85 @@
+ import pandas as pd
+ import numpy as np
+ import requests
+ import pdfplumber
+ import re
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+ import os
+ # Function to process each URL
+ def process_arxiv_paper(article_link):
+     try:
+         article_text = requests.get(article_link).text
+         title_pattern = r'<h1 class="title mathjax"><span class="descriptor">Title:</span>(.*?)</h1>'
+         title = re.findall(title_pattern, article_text, re.DOTALL)[0]
+         year_pattern = r'\[Submitted on(?:.*?(\d{1,2} \w+ \d{4}))(?:.*?)]'
+         year = re.findall(year_pattern, article_text)[0].split(" ")[-1]
+
+         article_id = article_link.split("/")[-1]
+         pdf_url = f'https://arxiv.org/pdf/{article_id}'
+         response = requests.get(pdf_url)
+         if response.status_code == 200:
+             with open(f"{article_id}.pdf", 'wb') as file:
+                 file.write(response.content)
+         if (response.status_code == 404):
+             print("Failed to fetch pdf")
+             return None
+
+         urls = []
+         link_pattern = r'(https?://(?:www\.)?github\.com[^\s]+)'
+         with pdfplumber.open(f"{article_id}.pdf") as pdf:
+             # Loop through all pages
+             for page_num, page in enumerate(pdf.pages):
+                 # Extract text from the page
+                 text = page.extract_text()
+
+                 # Search for a specific word or phrase
+                 found_urls = re.findall(link_pattern, text)
+                 urls.extend(found_urls)
+         os.remove(f"{article_id}.pdf")
+         urls = [url for url in urls if ("pytorch" not in url) & ("fchollet" not in url) & (len(url.split("github.com")[1].split("/")) >= 3)]
+         print(urls)
+         url = urls[0] if len(urls) > 0 else ""
+
+         # Return a dictionary of the results
+         return {"venue": "arXiv", "title": title, "url": url, "year": year}
+
+     except Exception as e:
+         print(f"Error processing {article_link}: {e}")
+         return None
+
+ # Set debug mode
+ debug = False
+ # Fetch all URLs for each year
+ all_year_urls = []
+
+ page_size = 50
+ search_queries = ['https://arxiv.org/search/advanced?advanced=1&terms-0-operator=AND&terms-0-term=deep+learning&terms-0-field=abstract&terms-1-operator=AND&terms-1-term=cancer&terms-1-field=abstract&classification-physics_archives=all&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2018&date-to_date=2024&date-date_type=submitted_date&abstracts=show&size=50&order=-announced_date_first&start=']
+ articles = []
+ for search_query in search_queries:
+     page = 0
+     while (page <= 100):
+         start_idx = page_size * page
+         url = f"{search_query}{start_idx}"
+         current_page = requests.get(url).text
+         pattern = r'<p class="list-title is-inline-block">.*?<a href="([^"]+)"'
+         matches = re.findall(pattern, current_page)
+         if (len(matches) == 0):
+             break
+         else:
+             page += 1
+
+         articles += matches
+ articles = np.unique(articles)
+
+ # Parallel processing using Pool
+ if __name__ == "__main__":
+     with Pool(processes=4) as pool:
+         results = pool.starmap(process_arxiv_paper, [[article] for article in articles])
+
+     # Filter out any None results due to errors
+     results = [result for result in results if result is not None]
+
+     # Convert the list of dictionaries to a DataFrame
+     arxiv = pd.DataFrame(results)
+     arxiv.to_csv('arxiv.csv')
data/fetch_miccai.py ADDED
@@ -0,0 +1,60 @@
+ import pandas as pd
+ import requests
+ import re
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+
+ # Function to process each URL
+ def process_paper(year, url):
+     try:
+         paper_page = requests.get(url).text
+
+         # Find title
+         title_pattern = r'<title>(.*?)\s*</title>'
+         title_match = re.search(title_pattern, paper_page, re.DOTALL)
+         title = title_match.group(1)
+
+         # Find the code repository link
+         code_repo_pattern = r'<h1 id="code-id">.*?</h1>\s*<p><a href="(.*?)">'
+         code_repo_match = re.search(code_repo_pattern, paper_page, re.DOTALL)
+         code_repo_link = code_repo_match.group(1) if code_repo_match else ""
+
+         # Find the dataset information
+         dataset_pattern = r'<h1 id="dataset-id">.*?</h1>\s*<p>(.*?)\s*<br />'
+         dataset_match = re.search(dataset_pattern, paper_page, re.DOTALL)
+         dataset_info = "Yes" if dataset_match else "No"
+
+         # Return a dictionary of the results
+         return {"title": title, "url": code_repo_link, "year": year, "public": dataset_info}
+
+     except Exception as e:
+         print(f"Error processing {url}: {e}")
+         return None
+
+ current_year = 2024 # Update with the current year
+ MICCAI_pages = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org/2022/papers/", "https://conferences.miccai.org/2023/papers/"]
+ MICCAI_root = ["https://miccai2021.org/openaccess/paperlinks/", "https://conferences.miccai.org", "https://conferences.miccai.org"]
+ years = [2021, 2022, 2023]
+ # Set debug mode
+ debug = False
+
+ # Fetch all URLs for each year
+ all_year_urls = []
+ for i in range(len(MICCAI_pages)):
+     year_page = requests.get(MICCAI_pages[i]).text
+     print(year_page)
+     urls = [MICCAI_root[i] + line.split('href="')[1].split('"')[0] for line in year_page.split('\n') if "&bullet" in line]
+     all_year_urls.extend([(years[i], url) for url in urls])
+
+ print(all_year_urls)
+ # Parallel processing using Pool
+ # if __name__ == "__main__":
+ #     with Pool(processes=12) as pool: # Use 12 processes
+ #         results = pool.starmap(process_paper, all_year_urls)
+
+ #         # Filter out any None results due to errors
+ #         results = [result for result in results if result is not None]
+
+ #         miccai = pd.DataFrame(results)
+ #         # miccai = pd.DataFrame( OrderedDict( { 'title': pd.Series(a), 'b': pd.Series(b), 'c': pd.Series(c) } ) )
+ #         miccai.to_csv('miccai.csv')
data/fetch_nature.py ADDED
@@ -0,0 +1,69 @@
+ import pandas as pd
+ import numpy as np
+ import requests
+ import re
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+
+ # Function to process each URL
+ def process_nature_paper(article_link):
+     try:
+         url = f'https://www.nature.com/articles/{article_link}'
+         article_text = requests.get(url).text
+
+         pattern = r'Code availability.*?<a href="([^"]+)"'
+         matches = re.findall(pattern, article_text, re.DOTALL)
+         urls = [link for link in matches if "github" in link]
+         url = urls[0] if len(urls) > 0 else (matches[0] if len(matches) > 0 else "")
+
+         year = re.findall(r'datetime="(\d{4})', article_text)[0]
+         # # Find title
+         title_pattern = r'<title>(.*?)\s*</title>'
+         title = re.findall(title_pattern, article_text, re.DOTALL)[0]
+
+         pattern = r'Data availability.*?<a href="([^"]+)"'
+         matches = re.findall(pattern, article_text, re.DOTALL)
+         dataset_info = "Yes" if (len(matches) > 0) else "No"
+
+         # # Return a dictionary of the results
+         return {"title": title, "url": url, "year": year, "public": dataset_info, "pdf": ""}
+
+     except Exception as e:
+         print(f"Error processing {url}: {e}")
+         return None
+
+ # Set debug mode
+ debug = False
+
+ # Fetch all URLs for each year
+ all_year_urls = []
+ search_queries = ["https://www.nature.com/search?q=deep+learning&order=relevance&journal=commsmed%2Cnm&page=", "https://www.nature.com/search?q=AI&order=relevance&journal=commsmed%2Cnm&page="]
+ articles = []
+ for search_query in search_queries:
+     page = 1
+     while (page <= 100):
+         url = f"{search_query}{page}"
+         current_page = requests.get(url).text
+         pattern = r'href="/articles/([^"]+)"'
+         matches = re.findall(pattern, current_page)
+         if (len(matches) == 0):
+             break
+         else:
+             page += 1
+
+         articles += matches
+ articles = np.unique(articles)
+
+
+ # Parallel processing using Pool
+ if __name__ == "__main__":
+     with Pool(processes=12) as pool:
+         results = pool.starmap(process_nature_paper, [[article] for article in articles])
+
+     # Filter out any None results due to errors
+     results = [result for result in results if result is not None]
+
+     # Convert the list of dictionaries to a DataFrame
+     nature = pd.DataFrame(results)
+     nature = nature[['title', 'year', 'pdf', 'url', 'public']]
+     nature.to_csv('nature.csv')
data/fetch_processed.py ADDED
@@ -0,0 +1,30 @@
+ import csv
+ import numpy as np
+ import pandas as pd
+ import re
+
+ current_year = 2024
+ MIDL_years = range(2018, current_year + 1, 1)
+ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
+
+ for venue in custom_order:
+     df = pd.read_excel("https://docs.google.com/spreadsheets/d/e/2PACX-1vQjpsSYcEcYUVB-88bCQ01UfQf0z9m16ax7p1ft03G68Nr-DdXHpPt-xOFSrXFj1N49AjK5nYhmKBfo/pub?output=xlsx", sheet_name=venue)
+     df.to_csv(f'data/{venue}.csv', sep="\t")
+
+ # Store all evaluations here
+ paper_dump = pd.DataFrame()
+ # Official color codes for conferences
+ MIDL_colors = ["#506775", "#4E7268", "#5170B1", "#004B5A", "#268BCC", "#B18630", "#AA0000"]
+
+ for venue in custom_order:
+     with open(f'data/{venue}.csv') as file:
+         tsv_file = csv.reader(file, delimiter="\t")
+         for row in tsv_file:
+             if (row[0] == ""):
+                 continue
+
+             if (row[1] == ""):
+                 continue
+
+             paper_dump = pd.concat([paper_dump, pd.DataFrame({"venue": venue, "title": [row[1]], "year": [row[2]], "pdf": [row[3]], "url": [row[4]], "public": [row[5]], "dependencies": [row[6]], "training": [row[7]], "evaluation": [row[8]], "weights": [row[9]], "readme": [row[10]], "license": [row[11]]})], ignore_index=True)
+ paper_dump.to_csv(f'data/dump.csv', sep="\t")
evaluations/__pycache__/documentation.cpython-310.pyc ADDED
Binary file (3.11 kB)
evaluations/__pycache__/license.cpython-310.pyc ADDED
Binary file (1.27 kB)
evaluations/__pycache__/repo_evaluations.cpython-310.pyc ADDED
Binary file (4.63 kB)
evaluations/__pycache__/requirements.cpython-310.pyc ADDED
Binary file (1.56 kB)
evaluations/__pycache__/training.cpython-310.pyc ADDED
Binary file (1.17 kB)
evaluations/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.01 kB)
evaluations/__pycache__/validating.cpython-310.pyc ADDED
Binary file (1.23 kB)
evaluations/__pycache__/weights.cpython-310.pyc ADDED
Binary file (2.41 kB)
evaluations/documentation.py ADDED
@@ -0,0 +1,113 @@
+ from .utils import log
+ import re
+ import numpy as np
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nEvaluating code documentation...")
+     overall = "No"
+
+
+     code_to_comment_ratio = get_code_to_comment_ratio(zip)
+     log(verbose, "LOG", f"Your python scripts have a comment-to-code ratio of {np.round(code_to_comment_ratio, 2)}%.")
+
+
+     if (readme):
+         non_empty_rows = [row for row in readme.split("\n") if row != ""]
+         if (len(non_empty_rows) < 5):
+             log(verbose, "WARNING", "Readme file has very few lines")
+             return overall
+
+         if (count_code_lines(non_empty_rows) > 5):
+             log(verbose, "LOG", "Readme file contains python examples.")
+             overall = "Yes"
+             return overall
+
+         if (llm):
+             prompt = f'{readme}\n \
+                 Is this README file enough to find what \
+                 package dependencies you need to install and how to train \
+                 and evaluate the proposed model? Please strictly \
+                 answer yes or no.\n\nA:'
+
+
+         manual_fail = False
+         if ((len(re.findall("train", readme, re.IGNORECASE)) == 0)):
+             log(verbose, "ERROR", "Readme file missing training information")
+             overall = "No"
+         if ((len(re.findall("demo", readme, re.IGNORECASE)) == 0) | (len(re.findall("evaluat", readme, re.IGNORECASE)) == 0)):
+             log(verbose, "ERROR", "Readme file missing testing information")
+             overall = "No"
+
+         if ((len(re.findall("example", readme, re.IGNORECASE)) > 0)):
+             log(verbose, "LOG", "Readme file contains links to examples")
+             overall = "Yes"
+
+         if ((len(re.findall("package", readme, re.IGNORECASE)) == 0) & \
+             (len(re.findall("dependenc", readme, re.IGNORECASE)) == 0) & \
+             (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
+             log(verbose, "ERROR", "Readme file missing information about package dependencies")
+             overall = "No"
+
+     log(verbose, "ERROR", "Found no useful information in README file.")
+     return overall
+
+ def count_comment_lines(lines):
+     # Initialize counters
+     single_line_comments = 0
+     multi_line_comments = 0
+     in_multiline_comment = False
+
+     for line in lines:
+         stripped_line = line.strip()
+
+         # Check for single-line comments
+         if stripped_line.startswith('#'):
+             single_line_comments += 1
+
+         # Check for multi-line comment (docstring) start or end
+         if stripped_line.startswith('"""') or stripped_line.startswith("'''"):
+             if not in_multiline_comment:
+                 # Starting a new multi-line comment
+                 in_multiline_comment = True
+                 multi_line_comments += 1  # Count the start line itself
+             else:
+                 # Ending an existing multi-line comment
+                 in_multiline_comment = False
+                 multi_line_comments += 1  # Count the end line itself
+         elif in_multiline_comment:
+             # Continue counting lines within a multi-line comment
+             multi_line_comments += 1
+
+     return single_line_comments, multi_line_comments
+
+ def get_code_to_comment_ratio(zip):
+     python_files = [file_path for file_path in zip.namelist() if (file_path.endswith(".py") | file_path.endswith(".ipynb"))]
+     code_line_count = 0
+     comment_line_count = 0
+     for file in python_files:
+         file_lines = zip.open(file).read().decode("utf-8").split('\n')
+         sl_comm, ml_comm = count_comment_lines(file_lines)
+         comment_line_count += sl_comm + ml_comm
+         code_line_count += len(file_lines) - (sl_comm + ml_comm)
+     code_to_comment_ratio = 100 * comment_line_count / code_line_count
+
+     return code_to_comment_ratio
+
+ def count_code_lines(lines):
+     is_code_snippet = False
+     code_line_count = 0
+
+     for line in lines:
+         stripped_line = line.strip()
+
+         if stripped_line.startswith('```'):
+             if not is_code_snippet:
+                 is_code_snippet = True
+                 code_line_count += 1
+             else:
+                 is_code_snippet = False
+                 code_line_count += 1
+         elif is_code_snippet:
+             code_line_count += 1
+
+     return int(code_line_count / 2)
evaluations/license.py ADDED
@@ -0,0 +1,29 @@
+ from .utils import log, model_predict
+ import re
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nEvaluating repository licensing...")
+     overall = "No"
+     license_files = [license for license in zip.namelist() if ((("LICENSE" in license) | ("license" in license)) & (len(license.split("/")) == 2))]
+     if (len(license_files) > 0):
+         license = zip.open(license_files[0]).read().decode("utf-8")
+         ans = [row for row in license.split("\n") if row != ""]
+
+         if (llm):
+             license = license[:50]
+             prompt = f"Q: {license}. This was an excerpt from a license \
+                 file. Do you know the name of this license?"
+             ans = model_predict(prompt)
+             log(verbose, "LOG", f"Found license: {ans}")
+         else:
+             log(verbose, "LOG", f"Found license file: {license_files[0]}")
+
+         overall = "Yes"
+
+     if (readme):
+         if ("License" in readme):
+             log(verbose, "LOG", "License found in README.")
+             overall = "Yes"
+
+     log(verbose, "ERROR", "LICENSE file not found.")
+     return overall
evaluations/repo_evaluations.py ADDED
@@ -0,0 +1,150 @@
+ import pandas as pd
+ import os
+ from evaluations import documentation, requirements, training, validating, license, weights
+ from evaluations.utils import *
+ import zipfile
+ import os
+ import numpy as np
+ from huggingface_hub import InferenceClient
+
+ API_URL = "https://api-inference.huggingface.co/models/openlm-research/open_llama_3b_v2"
+ headers = {"Authorization": "Bearer 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", "x-wait-for-model": "true"}
+
+
+ client = InferenceClient(
+     "meta-llama/Llama-3.1-8B-Instruct",
+     token="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",
+ )
+
+ def init_llm(verbose):
+     log(verbose, "LOG", "Initializing LLM...")
+
+ def evaluate(llm, verbose, repo_url, title=None, year=None):
+     repo_name = "data/repo.zip"
+     token = os.getenv("githubToken")
+     # token = userdata.get('githubToken')
+
+     if (llm):
+         init_llm(verbose)
+     else:
+         log(verbose, "LOG", "No LLM will be used for the evaluation.")
+
+     results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
+
+     try:
+         if (get_api_link(repo_url) != ""):
+             results["pred_valid"] = True
+             username, repo_name = decompose_url(repo_url)
+             log(verbose, "LOG", f"Fetching github repository: https://github.com/{username}/{repo_name}")
+
+             fetch_repo(verbose, repo_url, repo_name, token)
+
+         if ((title != None) & (year != None) & (title != "") & (year != "")):
+             res = fetch_openalex(verbose, title, year)
+             if (res != None):
+                 res = res["results"]
+                 if (len(res) > 0):
+                     res = res[0]
+                     results["pred_citations"] = res["cited_by_count"]
+
+         if (not(os.path.exists(repo_name))):
+             results["pred_live"] = "No"
+             return results
+
+         zip = zipfile.ZipFile(repo_name)
+         readme = fetch_readme(zip)
+         results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
+
+
+         if (len(zip.namelist()) <= 2):
+             log(verbose, "LOG", "Empty repository")
+             results["pred_live"] = "No"
+             results["pred_training"] = "No"
+             results["pred_evaluation"] = "No"
+             results["pred_weights"] = "No"
+             results["pred_dependencies"] = "No"
+         else:
+             results["pred_dependencies"] = requirements.evaluate(verbose, llm, zip, readme)
+             results["pred_training"] = training.evaluate(verbose, llm, zip, readme)
+             results["pred_evaluation"] = validating.evaluate(verbose, llm, zip, readme)
+             results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
+             results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
+             results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
+             results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
+
+         return results
+     except Exception as e:
+         log(verbose, "ERROR", "Evaluating repository failed: " + str(e))
+         results["pred_live"] = "No"
+         return results
+
+ def full_evaluations():
+     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
+     repro = evaluate(None, False)
+     full_results = []
+
+     nth = 1
+     for idx, row in paper_dump.iterrows():
+         if (idx % nth != 0):
+             continue
+
+         if (row["url"] == ""):
+             continue
+
+         print(str(int(100 * idx / paper_dump["title"].count())) + "% done")
+         result = evaluate(None, False, row["url"], row["title"], row["year"])
+         for column in result.keys():
+             row[column] = result[column]
+
+         full_results.append(row)
+
+ def midl_evaluations():
+     compare_to_gt = True
+     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
+     verbose = 1
+
+     eval_readme = []
+     eval_training = []
+     eval_evaluating = []
+     eval_licensing = []
+     eval_weights = []
+     eval_dependencies = []
+     full_results = []
+     for idx, row in paper_dump.iterrows():
+         if (row["venue"] != "MIDL"):
+             continue
+
+         if (row["year"] == 2024):
+             continue
+
+         if (row["url"] == ""):
+             continue
+
+
+         print(f"\nEvaluating {idx+1} out of {len(paper_dump.index)} papers...")
+         print(f'Paper title - "{row["title"]}" ({row["year"]})')
+         print(f'Repository link - {row["url"]}')
+         result = evaluate(None, verbose, row["url"])
+         for column in result.keys():
+             row[column] = result[column]
+         full_results.append(row)
+         if (compare_to_gt):
+             print("\nSummary:")
+             if ((row["pred_dependencies"] is not None) & (row["dependencies"] != "")):
+                 eval_dependencies.append(row["pred_dependencies"] == row["dependencies"])
+                 print(f"Dependencies acc. - {row['pred_dependencies']} (GT:{row['dependencies']}) / {int(100 * np.mean(eval_dependencies))}%")
+             if ((row["pred_training"] is not None) & (row["training"] != "")):
+                 eval_training.append(row["training"] == row["pred_training"])
+                 print(f"Training acc. - {row['pred_training']} (GT:{row['training']}) / {int(100 * np.mean(eval_training))}%")
+             if ((row["pred_evaluation"] is not None) & (row["evaluation"] != "")):
+                 eval_evaluating.append(row["evaluation"] == row["pred_evaluation"])
+                 print(f"Evaluating acc. - {row['pred_evaluation']} (GT:{row['evaluation']}) / {int(100 * np.mean(eval_evaluating))}%")
+             if ((row["pred_weights"] is not None) & (row["weights"] != "")):
+                 eval_weights.append(row["weights"] == row["pred_weights"])
+                 print(f"Weights acc. - {row['pred_weights']} (GT:{row['weights']}) / {int(100 * np.mean(eval_weights))}%")
+             if ((row["pred_readme"] is not None) & (row["readme"] != "")):
+                 eval_readme.append(row["readme"] == row["pred_readme"])
+                 print(f"README acc. - {row['pred_readme']} (GT:{row['readme']}) / {int(100 * np.mean(eval_readme))}%")
+             if ((row["pred_license"] is not None) & (row["license"] != "")):
+                 eval_licensing.append(("No" if row["license"] == "No" else "Yes") == row["pred_license"])
+                 print(f"LICENSE acc. - {row['pred_license']} (GT:{row['license']}) / {int(100 * np.mean(eval_licensing))}%")
evaluations/requirements.py ADDED
@@ -0,0 +1,24 @@
+ from .utils import log
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nLooking for package dependencies for running the code...")
+     overall = "No"
+
+     scripts = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
+
+     files = [file_path for file_path in zip.namelist() if (file_path.endswith(".yml") | file_path.endswith("setup.py") | file_path.endswith("requirements.txt") | ("requirement" in file_path) | ("package" in file_path))]
+     files = [file_path for file_path in files if len(file_path.split("/")) == 2]
+     for file in files:
+         log(verbose, "LOG", f"Found requirements file: {file}")
+         requirements = zip.open(file).read().decode("utf-8")
+         overall = "Yes"
+         if (len(requirements.split("\n")) < 5):
+             log(verbose, "WARNING", "Requirements file contains too few lines.")
+             overall = "No"
+
+     if (readme):
+         if (("requirement" in readme) | ("Requirement" in readme) | ("Dependenc" in readme) | ("dependenc" in readme) | (len([row for row in readme.split("\n") if (("#" in row) & (("environment" in row) | ("Environment" in row)))]) > 0)):
+             log(verbose, "LOG", "Found dependencies in README file")
+             overall = "Yes"
+
+     return overall
evaluations/training.py ADDED
@@ -0,0 +1,35 @@
+ from .utils import log
+ import re
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nLooking for code to train the model...")
+     overall = "No"
+
+
+     patterns = {
+         'tensorflow': [
+             r'model\.(fit|compile|train_on_batch)',
+             r'tf\.GradientTape'
+         ],
+         'pytorch': [
+             r'model\.(train|forward)',
+             r'loss\.backward',
+             r'optimizer\.step',
+         ]
+     }
+     files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
+     for file_path in files:
+         code = zip.open(file_path).read().decode("utf-8")
+         for framework, regex_list in patterns.items():
+             for pattern in regex_list:
+                 if re.search(pattern, code):
+                     log(verbose, "LOG", f"Found code for training a model in {framework} framework in file: {file_path}")
+                     overall = "Yes"
+
+
+     if (readme):
+         if (("train" in readme)):
+             log(verbose, "LOG", "Found something about training in README file")
+             overall = "Yes"
+
+     return overall
evaluations/utils.py ADDED
@@ -0,0 +1,131 @@
+ import time
+ import requests
+ import time
+ import os
+ import json
+ import streamlit as st
+
+
+ def model_predict(client, prompt):
+     for message in client.chat_completion(
+         messages=[{"role": "system", "content": "You are a chatbot evaluating github repositories, their python codes and corresponding readme files. Strictly answer the questions with Yes or No."}, {"role": "user", "content": prompt}],
+         max_tokens=500,
+         stream=True,
+     ):
+         return message.choices[0].delta.content
+
+     return ""
+
+
+ def get_api_link(url):
+     username, repo_name = decompose_url(url)
+     if (username == None):
+         return ""
+     return f"https://api.github.com/repos/{username}/{repo_name}/zipball/"
+
+ def decompose_url(url):
+     try:
+         url = url.split("github.com")[1]
+         url = url.strip(".")
+         url = url.split(".git")[0]
+         url = url.strip("/")
+         parts = url.split("/")
+         username = parts[0]
+         repo_name = parts[1]
+         return username, repo_name
+     except:
+         return None, None
+
+
+ def fetch_repo_stars(verbose, repo_url, token):
+     headers = {"Authorization": f"token {token}"}
+     api_url = get_api_link(repo_url)
+     api_url = api_url.replace("/zipball/", "")
+
+     # Sending GET request to GitHub API
+     response = requests.get(api_url, headers=headers)
+
+     if response.status_code == 200:
+         return json.loads(response.content)["stargazers_count"]
+     if (response.status_code == 404):
+         log(verbose, "ERROR", "Repository private.")
+
+ def fetch_repo(verbose, repo_url, repo_name, token):
+     if (os.path.exists(repo_name)):
+         os.remove(repo_name)
+
+
+     if ("github.com" not in repo_url):
+         log(verbose, "ERROR", f"URL not for github repo, please evaluate manually ({repo_url}).")
+         return
+
+     headers = {"Authorization": f"token {token}"}
+     api_url = get_api_link(repo_url)
+
+     if (api_url == ""):
+         log(verbose, "ERROR", f"Failed to parse the URL, please evaluate manually ({repo_url}).")
+         return
+
+     # Sending GET request to GitHub API
+     response = requests.get(api_url, headers=headers)
+
+     if response.status_code == 200:
+         with open(repo_name, 'wb') as file:
+             file.write(response.content)
+
+         log(verbose, "LOG", "Repository downloaded successfully")
+     if (response.status_code == 404):
+         log(verbose, "ERROR", "Repository private.")
+
+ def fetch_readme(zip):
+     readme_files = [readme for readme in zip.namelist() if ((readme.endswith("README.MD") | readme.endswith("README.md") | readme.endswith("readme.md")) & (len(readme.split("/")) == 2))]
+     readme = ""
+     for readme_file in readme_files:
+         readme += zip.open(readme_file).read().decode("utf-8") + "\n\n"
+     return readme
+
+ def fetch_license(zip):
+     license_files = [license for license in zip.namelist() if (("LICENSE" in license) & (len(license.split("/")) == 2))]
+     license = None
+     if (len(license_files) > 0):
+         license = zip.open(license_files[0]).read().decode("utf-8")
+     return license
+
+ def fetch_openalex(verbose, paper_name, year):
+     api_url = f"https://api.openalex.org/works?filter=default.search:{paper_name},publication_year:{year}"
+
+     response = requests.get(api_url)
+
+     if response.status_code == 200:
+         return response.json()
+     else:
+         log(verbose, "WARNING", "Could not find OpenAlex information for paper.")
+
+
+ def log(verbose, log_type, log_text, hf=False):
+     if (verbose == 0):
+         return
+
+
+
+     if (log_type == "LOG"):
+         log_text = f"LOG: {log_text}"
+     if (log_type == "ERROR"):
+         log_text = f"ERROR: {log_text}"
+     if (log_type == "WARNING"):
+         log_text = f"WARNING: {log_text}"
+
+     # Align line-break
+     if (log_text.startswith("\n")):
+         print("\n")
+         log_text = log_text.lstrip('\n')
+
+     if (verbose == 1):
+         print(log_text)
+         return
+
+     if (verbose == 2):
+         st.write(log_text)
+         return
+
+     raise Exception(log_text)
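As a usage note for the URL helpers above: decompose_url() strips the github.com prefix, a trailing .git, and surrounding slashes, and get_api_link() builds the zipball endpoint from the result. A small illustrative sketch (the user and repository names are placeholders):

    decompose_url("https://github.com/<user>/<repo>.git")
    # -> ("<user>", "<repo>")
    get_api_link("https://github.com/<user>/<repo>")
    # -> "https://api.github.com/repos/<user>/<repo>/zipball/"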
evaluations/validating.py ADDED
@@ -0,0 +1,36 @@
+ from .utils import log
+ import re
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nLooking for examples for running the model...")
+     overall = "No"
+     patterns = {
+         'tensorflow': [
+             r'tf\.keras\.models\.load_model',  # TensorFlow model loading
+             r'tf\.saved_model\.load',
+             r'model\.predict',  # Running inference
+             r'model\(.+\)'  # Direct model invocation for inference
+         ],
+         'pytorch': [
+             r'torch\.load',  # PyTorch model loading
+             r'torch\.jit\.load',  # PyTorch JIT model loading
+             r'model\.eval',  # Running inference
+             r'model\(.+\)'  # Direct model invocation for inference
+         ]
+     }
+
+     files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".py") | file_path.endswith(".ipynb")))]
+     for file_path in files:
+         code = zip.open(file_path).read().decode("utf-8")
+         for framework, regex_list in patterns.items():
+             for pattern in regex_list:
+                 if re.search(pattern, code):
+                     log(verbose, "LOG", f"Found code for evaluating a model in {framework} framework in file: {file_path}")
+                     overall = "Yes"
+
+     if (readme):
+         if ((len(re.findall("testing", readme)) > 0)):
+             log(verbose, "LOG", "Found information about evaluations in readme")
+             overall = "Yes"
+
+     return overall
evaluations/weights.py ADDED
@@ -0,0 +1,52 @@
+ from .utils import log, model_predict
+ import re
+
+ def evaluate(verbose, llm, zip, readme):
+     log(verbose, "LOG", "\nLooking for pre-trained model weights...")
+     overall = "No"
+     files = [file_path for file_path in zip.namelist() if ((file_path.endswith(".h5") | file_path.endswith(".pth") | file_path.endswith(".torch") | file_path.endswith(".pt") | file_path.endswith(".tar.gz") | file_path.endswith("checkpoint.pt") | ("weights" in file_path) | file_path.endswith("ckpt")))]
+     if (len(files) > 0):
+         log(verbose, "LOG", f"Found model weights: {files}")
+         overall = "Yes"
+         return overall
+
+     if (readme):
+
+         url_pattern = r'(https?://[^\s]+)'
+         urls = re.findall(url_pattern, readme)
+         if (len([url for url in urls if "pth" in url]) > 0):
+             log(verbose, "LOG", "Found a link to pre-trained weights in readme")
+             overall = "Yes"
+             return overall
+
+         readme_lines = readme.split("\n")
+         if (len([row for row in readme_lines if ((len(re.findall("pretrained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
+             log(verbose, "LOG", "Found a link for 'pretrained' something in readme")
+             overall = "Yes"
+             return overall
+
+         if (len([row for row in readme_lines if ((len(re.findall("pre-trained", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
+             log(verbose, "LOG", "Found a link for 'pre-trained' something in readme")
+             overall = "Yes"
+             return overall
+
+         if (len([row for row in readme_lines if ((len(re.findall("weight", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
+             log(verbose, "LOG", "Found a link for 'weight' something in readme")
+             overall = "Yes"
+             return overall
+
+         if (len([row for row in readme_lines if ((len(re.findall("download", row, re.IGNORECASE)) > 0) & (len(re.findall("model", row, re.IGNORECASE)) > 0) & (len(re.findall("http", row, re.IGNORECASE)) > 0))]) > 0):
+             log(verbose, "LOG", "Found a link for 'model' something in readme")
+             overall = "Yes"
+             return overall
+
+         if (llm):
+             prompt = f"{readme}\nQ: Does this text contain a download link for the model pre-trained weights?"
+             ans = model_predict(prompt)
+             if (("Yes" in ans) & ("No" not in ans)):
+                 log(verbose, "LOG", "The LLM found signs for accessing the pre-trained weights from the readme")
+                 overall = "Yes"
+                 return overall
+
+     log(verbose, "ERROR", "Found no pre-trained model weights.")
+     return overall
midl.py ADDED
@@ -0,0 +1,10 @@
+ from evaluations.repo_evaluations import midl_evaluations
+ # importing os module for environment variables
+ import os
+ # importing necessary functions from dotenv library
+ from dotenv import load_dotenv, dotenv_values
+ # loading variables from .env file
+ load_dotenv()
+ token = os.getenv("githubToken")
+
+ midl_evaluations()
plotting/paper_plots.py ADDED
@@ -0,0 +1,90 @@
+ import plotly.express as px
+ import numpy as np
+ import pandas as pd
+
+ paper_dump = pd.read_csv('data/dump.csv', sep="\t")
+ # Calculate total number of URLs per year and venue
+ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
+ total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
+
+ # Calculate the number of URLs with errors per year and venue
+ total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
+
+ # Merge the DataFrames to calculate the error rate
+ merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
+ merged_df['repo_rate'] = merged_df['total_urls'] / merged_df['total_titles']
+
+ # Plot the error rates using Plotly, with year on x-axis and color by venue
+ fig = px.bar(
+     merged_df,
+     x='year',
+     y='total_titles',
+     color='venue',
+     barmode='group',
+     title=f'Number of papers per venue',
+     labels={'error_rate': 'Success Rate', 'year': 'Year'},
+     category_orders={'venue': custom_order}
+ )
+
+ fig.update_xaxes(range=[2018, 2024])
+ fig.show()
+
+ import plotly.express as px
+ import numpy as np
+
+ # Calculate total number of URLs per year and venue
+ total_titles_per_venue = paper_dump.groupby(['year', 'venue']).size().reset_index(name='total_titles')
+
+ # Calculate the number of URLs with errors per year and venue
+ total_url_per_venue = paper_dump[paper_dump["url"] != ""].groupby(['year', 'venue']).size().reset_index(name='total_urls')
+
+ # Merge the DataFrames to calculate the error rate
+ merged_df = pd.merge(total_titles_per_venue, total_url_per_venue, on=['year', 'venue'], how='left')
+ merged_df['repo_rate'] = merged_df['total_urls'] / merged_df['total_titles']
+
+ # Plot the error rates using Plotly, with year on x-axis and color by venue
+ fig = px.bar(
+     merged_df,
+     x='year',
+     y='total_titles',
+     color='venue',
+     barmode='group',
+     title=f'Number of papers per venue',
+     labels={'error_rate': 'Success Rate', 'year': 'Year'},
+     category_orders={'venue': custom_order}
+ )
+
+ fig.update_xaxes(range=[2018, 2024])
+ fig.show()
+
+ # Plot the error rates using Plotly, with year on x-axis and color by venue
+ fig = px.bar(
+     merged_df,
+     x='year',
+     y='total_urls',
+     color='venue',
+     barmode='group',
+     title=f'Number of papers per venue',
+     labels={'error_rate': 'Success Rate', 'year': 'Year'},
+     category_orders={'venue': custom_order}
+ )
+
+ fig.update_xaxes(range=[2018, 2024])
+ fig.show()
+
+
+ # Plot the error rates using Plotly, with year on x-axis and color by venue
+ fig = px.bar(
+     merged_df,
+     x='year',
+     y='repo_rate',
+     color='venue',
+     barmode='group',
+     title=f'Number of repositories per venue',
+     labels={'error_rate': 'Success Rate', 'year': 'Year'},
+     category_orders={'venue': custom_order}
+ )
+ fig.update_xaxes(range=[2018, 2024])
+ fig.update_yaxes(range=[0, 1])
+
+ fig.show()
plotting/result_plots.py ADDED
@@ -0,0 +1,141 @@
+ import plotly.express as px
+ import pandas as pd
+
+ df = pd.read_csv('data/results.csv', sep="\t")
+ custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
+
+ # Calculate total number of URLs per year and venue
+ total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')
+
+ # Calculate the number of URLs with errors per year and venue
+ errors_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='errors')
+
+ # Merge the DataFrames to calculate the error rate
+ error_rate_df = pd.merge(total_urls_per_year_venue, errors_per_year_venue, on=['year', 'venue'], how='left')
+ error_rate_df['errors'] = error_rate_df['errors'].fillna(0)  # Replace NaN with 0 for venues with no errors
+ error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
+
+ # Plot the error rates using Plotly, with year on x-axis and color by venue
+ fig = px.bar(
+     error_rate_df,
+     x='year',
+     y='error_rate',
+     color='venue',
+     barmode='group',
+     title=f'Success Rate per Venue and Year for "valid_url"',
+     labels={'error_rate': 'Success Rate', 'year': 'Year'},
+     category_orders={'venue': custom_order}
+ )
+
+ fig.update_yaxes(range=[0, 1])
+ fig.update_xaxes(range=[2017.5, 2024.5])
+ fig.show()
+
+
+ for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
+     # Calculate total number of URLs per year and venue
+     total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')
+
+     # Calculate the number of URLs with errors per year and venue
+     passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')
+
+     # Merge the DataFrames to calculate the error rate
+     success_rate_df = pd.merge(total_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
+     success_rate_df['successes'] = success_rate_df['successes'].fillna(0)  # Replace NaN with 0 for venues with no errors
+     success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
+
+     # Plot the error rates using Plotly, with year on x-axis and color by venue
+     fig = px.bar(
+         success_rate_df,
+         x='year',
+         y='success_rate',
+         color='venue',
+         barmode='group',
+         title=f'Success Rate per Venue and Year for "{topic}"',
+         labels={'error_rate': 'Success Rate', 'year': 'Year'},
+         category_orders={'venue': custom_order}
+     )
+
+     fig.update_yaxes(range=[0, 1])
+     fig.update_xaxes(range=[2017.5, 2024.5])
+     fig.show()
+
+
+ # List of columns to check for "No"
+ columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
+
+ # Step 1: Calculate the number of "No" answers per row for the specified columns
+ df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
+
+ # Step 2: Create scatter plot with pred_stars on x-axis and no_count on y-axis, color-coded by venue
+ fig = px.scatter(
+     df,
+     x='pred_citations',
+     y='no_count',
+     color='venue',
+     title='Number of "No" Answers vs Predicted Stars, Color Coded by Venue',
+     labels={'pred_stars': 'Predicted Stars', 'no_count': 'Automated Reproducibility score (0-6)'},
+     category_orders={'venue': custom_order},  # Ensure custom order for venue if necessary
+     log_x=True
+ )
+
+ # Step 3: Display the scatter plot
+ fig.show()
+
+ # List of columns to check for "No"
+ columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]
+
+ # Step 1: Calculate the number of "No" answers per row for the specified columns
+ df['no_count'] = df[columns_to_check].apply(lambda row: (row != 'No').sum(), axis=1)
+
+ # Step 2: Create a strip plot (scatter-like) with jitter to show individual "No" counts
+ fig = px.strip(
+     df,
+     x='venue',
+     y='no_count',
+     color='venue',
+     title='Individual "No" Scores with Jitter per Venue',
+     labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
+     category_orders={'venue': custom_order},  # Ensure custom order for venues
+     stripmode='overlay'  # Allows all individual points to overlay each other
+ )
+
+ # Step 3: Add some jitter to the x-axis so points don't overlap
+ fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))
+
+ # Step 4: Optionally overlay a bar plot or box plot to show mean/median and spread
+ fig.add_trace(px.box(
+     df,
+     x='venue',
+     y='no_count',
+     category_orders={'venue': custom_order}
+ ).data[0])  # We add the first trace of the box plot to overlay
+
+ # Step 5: Show the plot
+ fig.show()
+
+ for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
+     # Calculate total number of URLs per venue
+     total_urls_per_venue = df.groupby('venue').size().reset_index(name='total_urls')
+
+     # Calculate the number of URLs with errors per venue
+     errors_per_venue = df[df[topic] != "No"].groupby('venue').size().reset_index(name='errors')
+
+     # Merge the DataFrames to calculate the error rate
+     error_rate_df = pd.merge(total_urls_per_venue, errors_per_venue, on='venue', how='left')
+     error_rate_df['errors'] = error_rate_df['errors'].fillna(0)  # Replace NaN with 0 for venues with no errors
+     error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']
+
+     # Plot the error rates using Plotly, with venue on x-axis
+     fig = px.bar(
+         error_rate_df,
+         x='venue',
+         y='error_rate',
+         color='venue',
+         title=f'Success Rate per Venue for "{topic}"',
+         labels={'error_rate': 'Success Rate', 'venue': 'Venue'},
+         category_orders={'venue': custom_order}
+     )
+
+     fig.update_yaxes(range=[0, 1])
+     fig.show()