In [47]:
import os
import pickle

import pandas as pd
from huggingface_hub import HfFileSystem, hf_hub_download

## Prepare data

In [72]:
# Filesystem-style client for the Hugging Face Hub; the cells below use it to glob the space's files.
fs = HfFileSystem()


def extract_date(filename):
    """Return the text after the last underscore of the file's base name.

    The base name is the final ``/``-separated component of ``filename``,
    cut at its first ``.``.  For ``"a/b/elo_results_20240426.pkl"`` this is
    ``"20240426"``; if the stem contains no underscore the whole stem is
    returned.
    """
    basename = filename.rsplit("/", 1)[-1]
    stem, _, _ = basename.partition(".")
    return stem.rpartition("_")[2]


# Glob pattern for the pickled ELO results published in the space.
ELO_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.pkl"
elo_files = fs.glob(ELO_DATA_FILES)
if not elo_files:
    # Fail with a descriptive error instead of a bare IndexError further down.
    raise FileNotFoundError(f"No files match {ELO_DATA_FILES!r}")
# The newest file is the one with the largest date suffix. String comparison
# is enough as long as the suffix is a zero-padded YYYYMMDD stamp (as in
# "elo_results_20240426.pkl").  max() returns the same element as sorting in
# descending order and taking [0], without sorting the whole list.
latest_elo_file = max(elo_files, key=extract_date)

# Same selection for the CSV leaderboard tables.
LEADERBOARD_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.csv"
leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)
if not leaderboard_files:
    raise FileNotFoundError(f"No files match {LEADERBOARD_DATA_FILES!r}")
latest_leaderboard_file = max(leaderboard_files, key=extract_date)

In [73]:
# Sanity check: show only the base names of the two selected files.
latest_leaderboard_file.split("/")[-1], latest_elo_file.split("/")[-1]

('leaderboard_table_20240426.csv', 'elo_results_20240426.pkl')

In [74]:
# Download both selected files from the space (ELO results first, then the
# leaderboard table).  Each call yields a local file path that later cells
# open / read directly.
latest_elo_file_local, latest_leaderboard_file_local = (
    hf_hub_download(
        repo_id="lmsys/chatbot-arena-leaderboard",
        filename=remote_path.split("/")[-1],
        repo_type="space",
    )
    for remote_path in (latest_elo_file, latest_leaderboard_file)
)

In [76]:
# load and prepare ELO data
# Maps each category key looked up in the unpickled ELO results to the display
# name that is used as the key of `arena_dfs`.
key_to_category_name = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
# Longer human-readable description for each display name above.
# NOTE(review): not referenced in the cells visible here — presumably used
# further down (e.g. for labels); confirm.
cat_name_to_explanation = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# The pickle is downloaded from a remote space, so only run this against a
# source you trust.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# One leaderboard table per category, keyed by display name.  Categories that
# are absent from this particular results file are skipped.
arena_dfs = {
    category_name: elo_results[category_key]["leaderboard_table_df"]
    for category_key, category_name in key_to_category_name.items()
    if category_key in elo_results
}

In [77]:
# Display names of the categories that were found in the ELO results.
arena_dfs.keys()

dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])

In [78]:
# Rating table for the "Overall" category.
arena_dfs["Overall"]

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
RWKV-4-Raven-14B,927.710294,27.143015,935.717850,916.546369,5129,81
alpaca-13b,907.324482,20.736682,915.536856,899.330070,6111,85
bard-jan-24-gemini-pro,1208.505408,6.679087,1213.291358,1203.926901,12388,6
chatglm-6b,886.107553,17.110417,894.034333,878.094776,5195,86
chatglm2-6b,932.678460,33.530570,943.455598,921.346322,2880,81
...,...,...,...,...,...,...
wizardlm-70b,1107.992552,9.385887,1114.218223,1102.655575,8868,29
yi-34b-chat,1109.722447,8.596908,1115.182579,1103.991095,12252,29
zephyr-7b-alpha,1042.108710,43.900714,1052.991768,1027.160917,1901,58
zephyr-7b-beta,1053.655680,10.297607,1059.923254,1047.601629,11924,54


In [79]:
# load and prepare Leaderboard data
# NOTE(review): the displayed table uses "-" for missing scores, so columns
# such as "MT-bench (score)" are presumably read as strings — confirm before
# using them numerically.
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

In [80]:
# Peek at the leaderboard metadata table that was just loaded.
leaderboard_df

Unnamed: 0,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
0,wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
1,vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
2,wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
3,tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
4,guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b...
...,...,...,...,...,...,...,...,...
100,mixtral-8x22b-instruct-v0.1,Mixtral-8x22b-Instruct-v0.1,-,0.778,2024/4,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-8x22b/
101,llama-3-70b-instruct,Llama-3-70b-Instruct,-,0.820,2023/12,Llama 3 Community,Meta,https://llama.meta.com/llama3/
102,llama-3-8b-instruct,Llama-3-8b-Instruct,-,0.684,2023/3,Llama 3 Community,Meta,https://llama.meta.com/llama3/
103,gemini-1.5-pro-api-0409-preview,Gemini 1.5 Pro API-0409-Preview,-,0.819,2023/11,Proprietary,Google,https://blog.google/technology/ai/google-gemin...


In [82]:
# NOTE(review): same check as the earlier `arena_dfs.keys()` cell; likely redundant.
arena_dfs.keys()

dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])

In [86]:
# merge ELO and Leaderboard data
# Each arena table is joined on its index against the leaderboard's "key"
# column.  pd.merge defaults to an inner join, so models that appear on only
# one side are dropped from the merged table.  Rows are then ordered by
# rating, best first, with a fresh 0..n-1 index.
merged_dfs = {
    category: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category, arena_df in arena_dfs.items()
}

In [101]:
# Top 20 rows of the merged "Overall" table (already sorted by rating, best first).
merged_dfs["Overall"].head(20)

Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking,key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
0,1257.399407,4.283316,1261.676224,1254.003626,30562,1,gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09,-,-,2023/12,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-...
1,1253.025095,2.069534,1256.111392,1250.435207,69871,1,gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-develop...
2,1251.11422,1.862842,1253.629093,1248.362042,75684,2,claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
3,1247.662508,3.263747,1251.582645,1244.380454,33723,2,gemini-1.5-pro-api-0409-preview,Gemini 1.5 Pro API-0409-Preview,-,0.819,2023/11,Proprietary,Google,https://blog.google/technology/ai/google-gemin...
4,1247.277052,1.923014,1249.489411,1244.340257,61924,3,gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-develop...
5,1208.505408,6.679087,1213.291358,1203.926901,12388,6,bard-jan-24-gemini-pro,Bard (Gemini Pro),-,-,Online,Proprietary,Google,https://bard.google.com/
6,1207.497541,4.109466,1211.720734,1203.322762,27298,6,llama-3-70b-instruct,Llama-3-70b-Instruct,-,0.820,2023/12,Llama 3 Community,Meta,https://llama.meta.com/llama3/
7,1201.671254,2.525563,1204.862512,1198.658822,75418,6,claude-3-sonnet-20240229,Claude 3 Sonnet,-,0.790,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
8,1191.684542,3.459717,1195.080256,1188.222382,41262,9,command-r-plus,Command R+,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r-plus-microsof...
9,1188.987389,3.124792,1193.335535,1185.935928,48390,9,gpt-4-0314,GPT-4-0314,8.96,0.864,2021/9,Proprietary,OpenAI,https://openai.com/research/gpt-4


### Manually map release dates (tedious, but the leaderboard data has no release-date column)

In [113]:
# Template for the manual mapping: one row per model with an empty
# "Release Date" column to be filled in.
t = merged_dfs["Overall"][["key", "Model"]].assign(**{"Release Date": ""})

In [120]:
# Write the template as JSON Lines: one {"key", "Model", "Release Date"} object per row.
# NOTE(review): this rewrites release_date_mapping.json on every run — if the
# dates are filled in by hand in that same file, re-running this cell would
# wipe them; confirm the intended workflow.
t.to_json("release_date_mapping.json", orient="records", lines=True)

In [119]:
# Show the same template rows as a list of dicts.
t.to_dict(orient="records")

[{'key': 'gpt-4-turbo-2024-04-09',
  'Model': 'GPT-4-Turbo-2024-04-09',
  'Release Date': ''},
 {'key': 'gpt-4-1106-preview',
  'Model': 'GPT-4-1106-preview',
  'Release Date': ''},
 {'key': 'claude-3-opus-20240229',
  'Model': 'Claude 3 Opus',
  'Release Date': ''},
 {'key': 'gemini-1.5-pro-api-0409-preview',
  'Model': 'Gemini 1.5 Pro API-0409-Preview',
  'Release Date': ''},
 {'key': 'gpt-4-0125-preview',
  'Model': 'GPT-4-0125-preview',
  'Release Date': ''},
 {'key': 'bard-jan-24-gemini-pro',
  'Model': 'Bard (Gemini Pro)',
  'Release Date': ''},
 {'key': 'llama-3-70b-instruct',
  'Model': 'Llama-3-70b-Instruct',
  'Release Date': ''},
 {'key': 'claude-3-sonnet-20240229',
  'Model': 'Claude 3 Sonnet',
  'Release Date': ''},
 {'key': 'command-r-plus', 'Model': 'Command R+', 'Release Date': ''},
 {'key': 'gpt-4-0314', 'Model': 'GPT-4-0314', 'Release Date': ''},
 {'key': 'claude-3-haiku-20240307',
  'Model': 'Claude 3 Haiku',
  'Release Date': ''},
 {'key': 'gpt-4-0613', 'Model': 'GP

## Build plot