Commit ccf0698 (parent: febd197)
"new files, posible model"

Files changed:
- app.py (+4 -1)
- evaluations/documentation.py (+2 -1)
- evaluations/license.py (+4 -5)
- evaluations/models.py (+47 -0)
- evaluations/pitfalls.py (+6 -0)
- evaluations/repo_evaluations.py (+5 -17)
- evaluations/utils.py (+1 -13)
- full_eval.py (+1 -1)
- midl.py (+7 -5)
app.py CHANGED
@@ -1,7 +1,10 @@
 import streamlit as st
 from evaluations.repo_evaluations import evaluate
+from evaluations.models import LocalLLM
 import requests
 
+model = LocalLLM("codellama/CodeLlama-7b-Instruct-hf")
+
 st.write("\n")
 st.write("Welcome to the online reproducibility evaluation tool!")
 st.write("We follow guidelines provided by Simkó et al. (2022) (https://arxiv.org/abs/2210.11146)")
@@ -11,7 +14,7 @@ repo_link = st.text_input("Github repository link:", value="", type="default", h
 
 if (repo_link):
     verbose = 4 if checkbox else 3
-    evaluate(llm=
+    evaluate(llm=model, verbose=verbose, repo_url=repo_link)
 
 with st.form("my_form"):
     st.write("Notice something wrong? Please tell us so we can improve.")
evaluations/documentation.py CHANGED
@@ -28,6 +28,7 @@ def evaluate(verbose, llm, zip, readme):
         package dependencies you need to install and how to train \
         and evaluate the proposed model? Please strictly \
         answer yes or no.\n\nA:'
+    llm.predict("HELP", prompt)
 
 
     manual_fail = False
@@ -47,7 +48,7 @@ def evaluate(verbose, llm, zip, readme):
         (len(re.findall("requirement", readme, re.IGNORECASE)) == 0)):
         log(verbose, "ERROR", "Readme file missing information about package dependencies")
         overall = "No"
-
+
     return overall
 
 def count_comment_lines(lines):
evaluations/license.py CHANGED
@@ -1,4 +1,4 @@
-from .utils import log
+from .utils import log
 import re
 
 def evaluate(verbose, llm, zip, readme):
@@ -10,10 +10,9 @@ def evaluate(verbose, llm, zip, readme):
     ans = [row for row in license.split("\n") if row != ""]
 
     if (llm):
-        license = license
-        prompt = f"
-
-        ans = model_predict(prompt)
+        license = license
+        prompt = f"{license}. Please describe this type of license, what it allows and what it doesn't."
+        ans = llm.predict("HELP", prompt)
         log(verbose, "LOG", f"Found license: {ans}")
     else:
         log(verbose, "LOG", f"Found license file: {license_files[0]}")
evaluations/models.py ADDED
@@ -0,0 +1,47 @@
+from transformers import pipeline
+from huggingface_hub import InferenceClient
+import os
+
+system_messages = { "STRICT": "You are a chatbot evaluating github repositories, their python codes and corresponding readme files. Strictly answer the questions with Yes or No.",
+                    "HELP": "You are a chatbot evaluating github repositories, their python codes and corresponding readme files. Please help me answer the following question." }
+
+class LocalLLM():
+    def __init__(self, model_name):
+        self.pipe = pipeline("text-generation", model=model_name, max_new_tokens=1000, device_map={0: 0})
+
+    def predict(self, response_type, prompt):
+        messages = [
+            {"role": "system", "content": system_messages[response_type]},
+            {"role": "user", "content": prompt},
+        ]
+        res = self.pipe(messages)
+        res = res[0]["generated_text"]
+
+        res = [response for response in res if response["role"] == "assistant"][0]["content"]
+        res = res.strip()
+
+        return res
+
+class RemoteLLM():
+    def __init__(self):
+        token = os.getenv("hfToken")
+        API_URL = "https://api-inference.huggingface.co/models/openlm-research/open_llama_3b_v2"
+        headers = {"Authorization": f"Bearer {token}", "x-wait-for-model": "true"}
+
+        self.client = InferenceClient(
+            "meta-llama/Llama-3.1-8B-Instruct",
+            token=token,
+        )
+
+
+    def predict(self, response_type, prompt):
+        for message in self.client.chat_completion(
+            messages=[{"role": "system", "content": system_messages[response_type]},
+                      {"role": "user", "content": prompt}],
+            max_tokens=500,
+            stream=True,
+        ):
+            return message.choices[0].delta.content
+
+        return ""
+
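For reference, a minimal usage sketch of the new wrapper classes outside the app (the question text and license snippet are illustrative; RemoteLLM reads the hfToken environment variable, as in the class above):

    from evaluations.models import LocalLLM, RemoteLLM

    # Local transformers pipeline; downloads the model weights on first use
    llm = LocalLLM("codellama/CodeLlama-7b-Instruct-hf")
    print(llm.predict("STRICT", "Q: Does the readme below explain how to install dependencies?\n\n<readme text>"))

    # Hosted alternative via the Hugging Face Inference API (requires hfToken in the environment)
    remote = RemoteLLM()
    print(remote.predict("HELP", "MIT License ... Please describe this type of license."))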
evaluations/pitfalls.py ADDED
@@ -0,0 +1,6 @@
+from .utils import log, model_predict
+import re
+
+def evaluate(verbose, llm, zip, readme):
+    log(verbose, "TITLE", "\nLooking for common pitfalls...")
+
evaluations/repo_evaluations.py CHANGED
@@ -7,22 +7,11 @@ import os
 import numpy as np
 from huggingface_hub import InferenceClient
 
-API_URL = "https://api-inference.huggingface.co/models/openlm-research/open_llama_3b_v2"
-headers = {"Authorization": "Bearer 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", "x-wait-for-model": "true"}
-
-client = InferenceClient(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    token="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",
-)
-
 def evaluate(llm, verbose, repo_url, title=None, year=None):
     repository_zip_name = "data/repo.zip"
     token = os.getenv("githubToken")
-    # token = userdata.get('githubToken')
 
-    if (llm):
-        init_llm(verbose)
-    else:
+    if (not(llm)):
         log(verbose, "LOG", "No LLM will be used for the evaluation.")
 
     results = { "pred_live": "Yes", "pred_dependencies": None, "pred_training": None, "pred_evaluation": None, "pred_weights": None, "pred_readme": None, "pred_license": None, "pred_stars": None, "pred_citations": None, "pred_valid": False}
@@ -54,9 +43,9 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
         readme = fetch_readme(zip)
         results["pred_stars"] = fetch_repo_stars(verbose, repo_url, token)
 
-
+        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
         if (len(zip.namelist()) <= 2):
-            log(verbose, "LOG", "
+            log(verbose, "LOG", "The repository is empty.")
             results["pred_live"] = "No"
             results["pred_training"] = "No"
             results["pred_evaluation"] = "No"
@@ -69,7 +58,6 @@ def evaluate(llm, verbose, repo_url, title=None, year=None):
         results["pred_weights"] = weights.evaluate(verbose, llm, zip, readme)
         results["pred_readme"] = documentation.evaluate(verbose, llm, zip, readme)
         results["pred_codetocomment"] = documentation.get_code_to_comment_ratio(zip)
-        results["pred_license"] = license.evaluate(verbose, llm, zip, readme)
 
         return results
     except Exception as e:
@@ -94,7 +82,7 @@ def full_evaluation():
         full_results.append(row)
     return pd.DataFrame(full_results)
 
-def midl_evaluations():
+def midl_evaluations(model):
     compare_to_gt = True
     paper_dump = pd.read_csv("data/dump.csv", sep="\t")
     verbose = 1
@@ -120,7 +108,7 @@ def midl_evaluations():
         print(f"\nEvaluating {idx+1} out of {len(paper_dump.index)} papers...")
         print(f'Paper title - "{row["title"]}" ({row["year"]})')
         print(f'Repository link - {row["url"]}')
-        result = evaluate(
+        result = evaluate(model, verbose, row["url"])
         for column in result.keys():
             row[column] = result[column]
         full_results.append(row)
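A minimal sketch of the updated call path after this commit (the repository URL is a placeholder; the LLM client is now injected by the caller rather than created at import time with a hardcoded token):

    from evaluations.models import LocalLLM
    from evaluations.repo_evaluations import evaluate

    model = LocalLLM("codellama/CodeLlama-7b-Instruct-hf")

    # With an injected model, the license/readme checks can query the LLM
    results = evaluate(llm=model, verbose=1, repo_url="https://github.com/<user>/<repo>")

    # Passing llm=None skips the LLM-backed checks and only logs a notice
    results_heuristic = evaluate(llm=None, verbose=1, repo_url="https://github.com/<user>/<repo>")
    print(results["pred_license"], results["pred_readme"])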
evaluations/utils.py CHANGED
@@ -6,16 +6,6 @@ import json
 import streamlit as st
 
 
-def model_predict(client, prompt):
-    for message in client.chat_completion(
-        messages=[{"role": "system", "content": "You are a chatbot evaluating github repositories, their python codes and corresponding readme files. Strictly answer the questions with Yes or No."}, {"role": "user", "content": prompt}],
-        max_tokens=500,
-        stream=True,
-    ):
-        return message.choices[0].delta.content
-
-    return ""
-
 
 def get_api_link(url):
     username, repo_name = decompose_url(url)
@@ -72,10 +62,8 @@ def fetch_repo(verbose, repo_url, repo_name, token):
     if response.status_code == 200:
         with open(repo_name, 'wb') as file:
             file.write(response.content)
-
-        log(verbose, "LOG", "Repository downloaded successfully")
     if (response.status_code == 404):
-        log(verbose, "ERROR", "Repository private.")
+        log(verbose, "ERROR", "Repository private / Link broken.")
 
 def fetch_readme(zip):
     readme_files = [readme for readme in zip.namelist() if ((readme.endswith("README.MD") | readme.endswith("README.md") | readme.endswith("readme.md")) & (len(readme.split("/")) == 2))]
full_eval.py CHANGED
@@ -8,4 +8,4 @@ load_dotenv()
 token = os.getenv("githubToken")
 
 res = full_evaluation()
-res.to_csv("results.csv", sep="\t", index=False)
+res.to_csv("data/results.csv", sep="\t", index=False)
midl.py CHANGED
@@ -1,11 +1,13 @@
 from evaluations.repo_evaluations import midl_evaluations
-
+from evaluations.models import LocalLLM
 import os
-
-from dotenv import load_dotenv
-# loading variables from .env file
+from dotenv import load_dotenv
 load_dotenv()
 token = os.getenv("githubToken")
 
-
+
+# Load model directly
+
+model = LocalLLM("codellama/CodeLlama-7b-Instruct-hf")
+res = midl_evaluations(model)
 res.to_csv("results_midl.csv", sep="\t", index=False)