|
from functools import partial |
|
import gradio as gr |
|
import os |
|
import csv |
|
import pandas as pd |
|
import pickle |
|
|
|
def _format_points(points):
    """Join point strings with single spaces, dropping a trailing '.0' from each."""
    return ' '.join(p[:-2] if p.endswith('.0') else p for p in points)


def load_results(gen_file, grader_file, exp_name, model_name, grader):
    """Join one generation CSV with its grading CSV into display records.

    The first row of each file is treated as a header and skipped.  Rows are
    paired by position, so both files must have the same number of rows.

    Args:
        gen_file: Path to the generation CSV.  Columns read per row: 0 (k),
            1 (comma-separated skills), 2 (topic), 4 (generation prompt),
            third-from-last (displayed generation) and last (text scanned
            for skill names).
        grader_file: Path to the grading CSV.  Columns read per row:
            1 (skills), 2 (topic), 5 (grader output), 9 (comma-separated
            points) and 11 (value that replaces the last point).
        exp_name: Experiment name, stored on every record.  For llama models
            it is split on '_' and parts 0 and 2 are read as k tag / version.
        model_name: Model name, stored on every record.
        grader: Grading run name, stored as 'grade_run'; 'grader' becomes
            'gpt-4' when that substring occurs in it, else 'llama-2-70b'.

    Returns:
        A list of dicts, one per data row.  Empty when the llama k/version
        filter excludes the experiment or when the row counts differ.

    Raises:
        AssertionError: If a generation row and its grading row disagree on
            skills or topic.
    """
    if 'llama' in model_name:
        name_parts = exp_name.split('_')
        k, version = name_parts[0], name_parts[2]
        # For llama models, k2 experiments tagged v8 and non-k2 experiments
        # tagged v9 are left out of the results.
        if (k == 'k2' and version == 'v8') or (k != 'k2' and version == 'v9'):
            return []

    # newline='' lets the csv module do its own newline handling, which is
    # what the csv docs require for quoted fields containing line breaks.
    with open(gen_file, 'r', newline='') as file:
        gen = list(csv.reader(file))
    with open(grader_file, 'r', newline='') as file:
        grade = list(csv.reader(file))

    if len(gen) != len(grade):
        return []

    record = []
    for row_no, (gen_row, grade_row) in enumerate(zip(gen[1:], grade[1:]), start=1):
        skills = [skill.strip() for skill in gen_row[1].split(',')]
        topic = gen_row[2]

        # Explicit raises (rather than `assert`) so the consistency check is
        # still enforced under `python -O`; the exception type is unchanged.
        graded_skills = [skill.strip() for skill in grade_row[1].split(',')]
        if skills != graded_skills:
            raise AssertionError(
                f"row {row_no}: skills differ between {gen_file} and {grader_file}")
        if topic != grade_row[2]:
            raise AssertionError(
                f"row {row_no}: topic differs between {gen_file} and {grader_file}")

        points = grade_row[9].split(',')
        if len(points) < len(skills):
            # Pad with zeros so every skill has a point entry.
            points += ['0.0'] * (len(skills) - len(points))
        # The last point is always taken from column 11 of the grading row.
        # NOTE(review): applied to every row here, not only to padded rows --
        # confirm that is the intended scope.
        points[-1] = grade_row[11]

        # Second score line: a skill earns nothing when its bare name (the
        # part before any parenthesis) appears verbatim in the answer text.
        answer = gen_row[-1]
        points_no_skill_name = list(points)
        for skill_id, skill in enumerate(skills):
            simple_skill = skill.split('(')[0].strip()
            # An empty name is a substring of every answer; never penalise it.
            if simple_skill and simple_skill in answer:
                points_no_skill_name[skill_id] = '0.0'

        gen_prompt = (
            gen_row[4]
            .split('examples for the concepts:')[1]
            .split('Please start the minimal natural')[0]
        )

        record.append({
            'k': gen_row[0],
            'exp_name': exp_name,
            'model': model_name,
            'grader': 'gpt-4' if 'gpt-4' in grader else 'llama-2-70b',
            'grade_run': grader,
            'skills': '\n\n'.join(skills),
            'topic': topic,
            'topic+skills': '+'.join([topic] + sorted(skills)),
            # Single newlines are doubled so Markdown renders paragraph breaks.
            'gen_prompt': gen_prompt.replace('\n', '\n\n'),
            'gen': gen_row[-3].replace('\n', '\n\n'),
            'grade': grade_row[5].replace('\n', '\n\n'),
            'points': _format_points(points),
            'points_no_skill_name': _format_points(points_no_skill_name),
        })
    return record
|
|
|
|
|
def load_all_results(path='final'):
    """Gather the records of every graded run found under *path*.

    The tree is walked as ``<path>/<exp_name>/<model_name>/``.  A model
    directory contributes only when it holds both ``records.csv`` and a
    ``graded/`` directory; each ``graded/<grader>/records.csv`` found there
    is paired with the model's ``records.csv`` and passed to
    ``load_results``.  Plain files directly under *path* are ignored.

    Args:
        path: Root directory to scan.

    Returns:
        A pandas DataFrame built from the concatenated record dicts.
    """
    collected = []
    for exp_name in os.listdir(path):
        exp_dir = os.path.join(path, exp_name)
        if os.path.isfile(exp_dir):
            continue

        for model_name in os.listdir(exp_dir):
            model_dir = os.path.join(exp_dir, model_name)
            gen_file = os.path.join(model_dir, "records.csv")
            graded_dir = os.path.join(model_dir, 'graded')
            if not (os.path.exists(gen_file) and os.path.isdir(graded_dir)):
                continue

            for grader in os.listdir(graded_dir):
                grader_file = os.path.join(graded_dir, grader, "records.csv")
                if not os.path.exists(grader_file):
                    continue
                collected.extend(
                    load_results(gen_file, grader_file, exp_name, model_name, grader)
                )
    return pd.DataFrame(collected)
|
|
|
|
|
# Stylesheet passed to ``gr.Blocks(css=...)`` in ``build_demo``.  The selectors
# #a-#d match the ``elem_id`` values assigned to the Markdown panels there
# ('a': topic / skills / prompt, 'b': generation, 'c': points, 'd': grade);
# each rule sets black 20px text on its own background colour.
block_css = """
#a {
    color: black;
    background-color: #DEEBF7;
    font-size: 20px;
}
#b {
    color: black;
    background-color: #E2F0D9;
    font-size: 20px;
}
#c {
    color: black;
    background-color: #FFF2CC;
    font-size: 20px;
}
#d {
    color: black;
    background-color: #FBE5D6;
    font-size: 20px;
}
"""
|
|
|
from Levenshtein import distance |
|
|
|
def best_match(comb, comb_list):
    """Pick the entry of *comb_list* that is closest to *comb*.

    Both strings are split on '+' and the resulting part lists are compared
    with ``distance``; the earliest entry with the smallest distance is
    returned.  An empty *comb* short-circuits to the first entry.
    """
    if comb == '':
        return comb_list[0]

    def gap(candidate):
        # Distance between the two '+'-separated part lists.
        return distance(comb.split('+'), candidate.split('+'))

    # min() keeps the first of several equally close candidates.
    return min(comb_list, key=gap)
|
|
|
class Tracker:
    """Resolves dropdown selections against the results table and wires widgets.

    Instance fields:
        df: the results DataFrame every selection is resolved against.
        value: list of 5 dicts (one per UI slot) holding the resolved
            selection, the option lists and the display strings.
        component: list of 5 dicts (one per UI slot) mapping a field name to
            the widget registered for it; '' marks a field with no widget.
    """

    # Fields chosen through dropdowns, in the order `procedure` lists them as
    # callback inputs.
    _SELECT_KEYS = ['k', 'comb', 'model', 'exp_name', 'grader', 'grader_run']
    # Every field that can have a widget, in the order `procedure` lists them
    # as callback outputs.
    _COMPONENT_KEYS = ['k', 'comb', 'model', 'exp_name', 'topic', 'skills',
                       'gen_prompt', 'gen', 'grader', 'grader_run', 'points', 'grade']

    def __init__(self, df) -> None:
        """Resolve the default selection for *df* and copy it into 5 slots."""
        self.df = df
        blank = {k: '' for k in ['k', 'k_list', 'comb', 'comb_list', 'model', 'model_list',
                                 'exp_name', 'exp_name_list', 'topic', 'skills', 'gen_prompt',
                                 'gen', 'grader', 'grader_list', 'grader_run',
                                 'grader_run_list', 'points', 'grade']}
        initial = self.update(blank)
        # Shallow copies: the option lists are shared between slots.
        self.value = [initial.copy() for _ in range(5)]
        self.component = [{k: '' for k in self._COMPONENT_KEYS} for _ in range(5)]

    @staticmethod
    def _narrow(cdf, value, key, column, current, pick_fallback=None):
        """Resolve one selection level and return the rows that match it.

        The sorted unique values of *column* become ``value[key + '_list']``.
        *current* is kept when it is one of them; otherwise it is replaced by
        ``pick_fallback(current, options)`` or, without a fallback, by the
        first option.  The resolved choice is stored in ``value[key]``.
        """
        options = sorted(cdf[column].unique())
        if current not in options:
            current = pick_fallback(current, options) if pick_fallback else options[0]
        value[key] = current
        value[key + '_list'] = options
        return cdf[cdf[column] == current]

    @staticmethod
    def _first(cdf, column):
        """Return the value of *column* in the first row of *cdf*."""
        return cdf[column].iloc[0]

    def update(self, value):
        """Fill *value* with a consistent selection, option lists and texts.

        The table is narrowed level by level (k, topic+skills, model,
        exp_name, grader, grade_run).  At each level an invalid selection is
        replaced by a valid one, so later levels only offer options that
        exist for the earlier choices.

        Args:
            value: Dict holding at least the keys 'k', 'comb', 'model',
                'exp_name', 'grader' and 'grader_run'.  It is modified in
                place.

        Returns:
            The same *value* dict, with the six selections, their '*_list'
            options and the 'topic', 'skills', 'gen_prompt', 'gen', 'points'
            and 'grade' display strings filled in.
        """
        # Read every selection up front so a missing key fails before any
        # entry of `value` has been overwritten.
        chosen = {key: value[key] for key in self._SELECT_KEYS}

        cdf = self._narrow(self.df, value, 'k', 'k', chosen['k'])
        cdf = self._narrow(
            cdf, value, 'comb', 'topic+skills', chosen['comb'],
            # Wrapped so `best_match` is only looked up when a fallback is needed.
            lambda current, options: best_match(current, options),
        )
        cdf = self._narrow(cdf, value, 'model', 'model', chosen['model'])
        cdf = self._narrow(cdf, value, 'exp_name', 'exp_name', chosen['exp_name'])

        # Generation-side texts are taken before narrowing by grader.
        value['topic'] = "*Topic*: " + self._first(cdf, 'topic')
        value['skills'] = "*Skills*: \n\n" + self._first(cdf, 'skills')
        value['gen_prompt'] = "*Skill Definition and Example*:\n\n" + self._first(cdf, 'gen_prompt')
        value['gen'] = "*Model Answer*:\n\n" + self._first(cdf, 'gen')

        cdf = self._narrow(cdf, value, 'grader', 'grader', chosen['grader'])
        cdf = self._narrow(cdf, value, 'grader_run', 'grade_run', chosen['grader_run'])

        value['points'] = (
            "Points: " + self._first(cdf, 'points')
            + "\n\n(After deducting points for explicitly mentioning skill names: "
            + self._first(cdf, 'points_no_skill_name') + ")"
        )
        value['grade'] = self._first(cdf, 'grade')
        return value

    def _locate(self, c):
        """Return ``(slot_index, key)`` of the first slot entry that is *c*."""
        for idx, slot in enumerate(self.component):
            for key, registered in slot.items():
                if registered is c:
                    return idx, key
        # Raised explicitly (same type the former `assert` produced) so the
        # check also runs under `python -O`.
        raise AssertionError("component is not registered with this Tracker")

    def procedure(self, c):
        """Describe the change handlers to attach to component *c*.

        Args:
            c: A widget previously stored in ``self.component``.

        Returns:
            ``(input_list, output_list, fn_list)`` -- three parallel lists;
            entry ``i`` of each describes one handler: its input components,
            its output components and the callback.  One refresh handler is
            produced for every slot that shares the widget *c* under the
            same key, preceded by a mirroring handler when a binding applies.

        Raises:
            AssertionError: If *c* is not registered in any slot.
        """
        idx, key = self._locate(c)

        # Groups of slots whose `key` widgets should mirror each other, as
        # (key, [slot, ...]) pairs.  None are configured, so the mirroring
        # handler below is currently never produced.
        binding = []
        sync_list = []
        for b in binding:
            if key == b[0] and idx in b[1]:
                sync_list = [j for j in b[1] if j != idx]
        sync_component = [self.component[j][key] for j in sync_list]

        def sync(v, sync_list=(0,)):
            # One identical value update per mirrored widget.
            return [gr.Dropdown.update(value=v) for _ in sync_list]

        input_list, output_list, fn_list = [], [], []
        if sync_list:
            input_list.append(c)
            output_list.append(sync_component)
            # Bind a private copy so the number of updates `sync` returns
            # always equals the number of output widgets.
            fn_list.append(partial(sync, sync_list=list(sync_list)))

        def refresh(k, comb, model, exp_name, grader, grader_run):
            # Re-resolve the selection and return one entry per output widget,
            # in `_COMPONENT_KEYS` order.
            value = self.update({
                'k': k,
                'comb': comb,
                'model': model,
                'exp_name': exp_name,
                'grader': grader,
                'grader_run': grader_run,
            })
            return [
                gr.Dropdown.update(value=value['k'], choices=value['k_list']),
                gr.Dropdown.update(value=value['comb'], choices=value['comb_list']),
                gr.Dropdown.update(value=value['model'], choices=value['model_list']),
                gr.Dropdown.update(value=value['exp_name'], choices=value['exp_name_list']),
                value['topic'],
                value['skills'],
                value['gen_prompt'],
                value['gen'],
                gr.Dropdown.update(value=value['grader'], choices=value['grader_list']),
                gr.Dropdown.update(value=value['grader_run'], choices=value['grader_run_list']),
                value['points'],
                value['grade'],
            ]

        # Slots to refresh: every slot whose widget under `key` is the very
        # same object as the one in the changed slot (or a mirrored slot).
        touched = sync_list + [idx]
        update_list = [
            i for i, slot in enumerate(self.component)
            if any(self.component[j][key] is slot[key] for j in touched)
        ]

        for j in update_list:
            slot = self.component[j]
            input_list.append([slot[name] for name in self._SELECT_KEYS])
            output_list.append([slot[name] for name in self._COMPONENT_KEYS])
            fn_list.append(refresh)

        return input_list, output_list, fn_list
|
|
|
def build_demo(df):
    """Build the Gradio Blocks app that browses the results in *df*.

    Two tabs are created.  'Browse Single Generation' registers its widgets
    in tracker slot 0.  'Compare Two Generations' registers one column each
    in slots 1 and 2, which share a single `k` and a single `topic+skills`
    dropdown.  Every registered widget then gets the change handlers that
    ``Tracker.procedure`` describes for it, chained in order.

    Args:
        df: Results table handed to ``Tracker``.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    tracker = Tracker(df)

    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from statement order -- confirm the row/column grouping
    # against the rendered page.
    with gr.Blocks(
        title="Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models",
        theme=gr.themes.Base(text_size=gr.themes.sizes.text_lg),
        css=block_css,
    ) as demo:
        gr.Markdown(
            """
# Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models
By [Princeton Language and Intelligence (PLI), Princeton University](https://pli.princeton.edu/) and [Google DeepMind](https://www.deepmind.com/)

### This is a demonstration of the Skill-Mix evaluation.

Paper link: [https://arxiv.org/abs/2310.17567](https://arxiv.org/abs/2310.17567)

### Samples are generated using 10% of the full set of skills and topics. Click the second tab for comparison between two generations.

Coming soon: generation by more models; grading by LLaMA-2.
"""
        )

        with gr.Tab('Browse Single Generation'):
            v = tracker.value[0]
            with gr.Row():
                k = gr.Dropdown(choices=v['k_list'], value=v['k'], label="k")
                tracker.component[0]['k'] = k
                comb = gr.Dropdown(choices=v['comb_list'], value=v['comb'], label="topic+skills")
                tracker.component[0]['comb'] = comb
            with gr.Row():
                # Left column: what was generated.
                with gr.Column():
                    with gr.Row():
                        model = gr.Dropdown(choices=v['model_list'], value=v['model'], label="model")
                        tracker.component[0]['model'] = model
                        exp_name = gr.Dropdown(choices=v['exp_name_list'], value=v['exp_name'], label="exp_name")
                        tracker.component[0]['exp_name'] = exp_name
                    with gr.Row():
                        topic = gr.Markdown(value=v['topic'], elem_id='a')
                        tracker.component[0]['topic'] = topic
                        skills = gr.Markdown(value=v['skills'], elem_id='a')
                        tracker.component[0]['skills'] = skills
                    gen = gr.Markdown(value=v['gen'], elem_id='b')
                    tracker.component[0]['gen'] = gen
                    gen_prompt = gr.Markdown(value=v['gen_prompt'], elem_id='a')
                    tracker.component[0]['gen_prompt'] = gen_prompt
                # Right column: how it was graded.
                with gr.Column():
                    with gr.Row():
                        grader = gr.Dropdown(choices=v['grader_list'], value=v['grader'], label="grader")
                        tracker.component[0]['grader'] = grader
                        grader_run = gr.Dropdown(choices=v['grader_run_list'], value=v['grader_run'], label="grader_run")
                        tracker.component[0]['grader_run'] = grader_run
                    points = gr.Markdown(value=v['points'], elem_id='c')
                    tracker.component[0]['points'] = points
                    grade = gr.Markdown(value=v['grade'], elem_id='d')
                    tracker.component[0]['grade'] = grade

        with gr.Tab('Compare Two Generations'):
            v = tracker.value[1]
            with gr.Row():
                # One k / topic+skills dropdown drives both comparison slots.
                k = gr.Dropdown(choices=v['k_list'], value=v['k'], label="k")
                tracker.component[1]['k'] = tracker.component[2]['k'] = k
                comb = gr.Dropdown(choices=v['comb_list'], value=v['comb'], label="topic+skills")
                tracker.component[1]['comb'] = tracker.component[2]['comb'] = comb
            with gr.Row():
                for col in range(1, 3):
                    v = tracker.value[col]
                    with gr.Column():
                        with gr.Row():
                            model = gr.Dropdown(choices=v['model_list'], value=v['model'], label="model")
                            tracker.component[col]['model'] = model
                            exp_name = gr.Dropdown(choices=v['exp_name_list'], value=v['exp_name'], label="exp_name")
                            tracker.component[col]['exp_name'] = exp_name
                        with gr.Row():
                            topic = gr.Markdown(value=v['topic'], elem_id='a')
                            tracker.component[col]['topic'] = topic
                            skills = gr.Markdown(value=v['skills'], elem_id='a')
                            tracker.component[col]['skills'] = skills
                        gen = gr.Markdown(value=v['gen'], elem_id='b')
                        tracker.component[col]['gen'] = gen
                        with gr.Row():
                            grader = gr.Dropdown(choices=v['grader_list'], value=v['grader'], label="grader")
                            tracker.component[col]['grader'] = grader
                            grader_run = gr.Dropdown(choices=v['grader_run_list'], value=v['grader_run'], label="grader_run")
                            tracker.component[col]['grader_run'] = grader_run
                        points = gr.Markdown(value=v['points'], elem_id='c')
                        tracker.component[col]['points'] = points
                        gen_prompt = gr.Markdown(value=v['gen_prompt'], elem_id='a')
                        tracker.component[col]['gen_prompt'] = gen_prompt
                        grade = gr.Markdown(value=v['grade'], elem_id='d')
                        tracker.component[col]['grade'] = grade

        # Collect each registered widget once, in registration order.  Unused
        # fields hold the '' placeholder; shared widgets appear in several
        # slots and are de-duplicated by identity.
        seen_ids = set()
        all_components = []
        for slot in tracker.component:
            for widget in slot.values():
                if isinstance(widget, str) or id(widget) in seen_ids:
                    continue
                seen_ids.add(id(widget))
                all_components.append(widget)

        # Attach the handlers for each widget: the first on its change event,
        # every further one chained with .then() so they run in sequence.
        for c in all_components:
            input_list, output_list, fn_list = tracker.procedure(c)
            event = None
            for fn, inputs, outputs in zip(fn_list, input_list, output_list):
                if event is None:
                    event = c.change(fn, inputs, outputs)
                else:
                    event = event.then(fn, inputs, outputs)

        gr.Markdown('''### Citations
```
@article{yu2023skillmix,
title={Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models},
author={Yu, Dingli and Kaur, Simran and Gupta, Arushi and Brown-Cohen, Jonah and Goyal, Anirudh and Arora, Sanjeev},
journal={arXiv preprint arXiv:2310.17567},
year={2023}
}
```
```
@misc{openai2023gpt4,
title={GPT-4 Technical Report},
author={OpenAI},
year={2023},
eprint={2303.08774},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
```
@article{touvron2023llama,
title={Llama 2: Open foundation and fine-tuned chat models},
author={Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
journal={arXiv preprint arXiv:2307.09288},
year={2023}
}
```
''')
    return demo
|
|
|
|
|
if __name__ == '__main__':
    # Load the pre-built results table.  The file is opened in a `with` block
    # so the handle is closed as soon as the table has been read.
    # NOTE: unpickling can execute arbitrary code -- only ship a pickle that
    # was produced by a trusted source.
    with open('on_released_topics_and_skills.pkl', 'rb') as pkl_file:
        df = pickle.load(pkl_file)

    demo = build_demo(df)

    demo.launch()
|
|