eduagarcia committed
Commit b2fe6a1 · 1 Parent(s): c2f342d

Fix update collections and dummy references

app.py CHANGED
@@ -106,7 +106,7 @@ def init_space(full_init: bool = True):
         benchmark_cols=BENCHMARK_COLS,
         show_incomplete=SHOW_INCOMPLETE_EVALS
     )
-    #update_collections(original_df.copy())
+    update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
 
     plot_df = create_plot_df(create_scores_df(raw_data))
@@ -553,10 +553,10 @@ def update_dynamic_files_wrapper():
     except Exception as e:
         print(f"Error updating dynamic files: {e}")
 
-scheduler = BackgroundScheduler()
+scheduler = BackgroundScheduler(daemon=True)
 scheduler.add_job(restart_space, "interval", seconds=10800, next_run_time=datetime.now() + timedelta(hours=3)) # restarted every 3h
-scheduler.add_job(update_dynamic_files_wrapper, "cron", minute=30) # launched every hour on the hour
-scheduler.add_job(update_collections, "date", args=(original_df.copy(),), run_date=datetime.now() + timedelta(minutes=5))
+scheduler.add_job(update_dynamic_files_wrapper, "interval", seconds=1800, next_run_time=datetime.now() + timedelta(minutes=5)) # launched every 30 minutes
+#scheduler.add_job(update_collections, "interval", args=(original_df.copy(),), seconds=3600, next_run_time=datetime.now() + timedelta(minutes=1))
 scheduler.start()
 
 demo.queue(default_concurrency_limit=40).launch()
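For readers unfamiliar with APScheduler, the new scheduling setup boils down to the minimal sketch below. The two job functions are placeholders standing in for the Space's own restart_space and update_dynamic_files_wrapper; the trigger arguments mirror the diff above.

    from datetime import datetime, timedelta
    from apscheduler.schedulers.background import BackgroundScheduler

    def restart_space():
        print("restarting space...")          # placeholder for the real restart logic

    def update_dynamic_files_wrapper():
        print("updating dynamic files...")    # placeholder for the real update logic

    # daemon=True lets the scheduler thread exit together with the main process
    scheduler = BackgroundScheduler(daemon=True)

    # every 3 hours, with the first run delayed by 3 hours
    scheduler.add_job(restart_space, "interval", seconds=10800,
                      next_run_time=datetime.now() + timedelta(hours=3))

    # every 30 minutes, with the first run delayed by 5 minutes
    scheduler.add_job(update_dynamic_files_wrapper, "interval", seconds=1800,
                      next_run_time=datetime.now() + timedelta(minutes=5))

    scheduler.start()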
external_models_results.json CHANGED
@@ -244,8 +244,8 @@
   },
   {
     "model": "llama_405b_instruct",
-    "name": "meta-llama/Meta-Llama-3.1-405B-Instruct (Vertex AI)",
-    "link": "https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct",
+    "name": "meta-llama/Llama-3.1-405B-Instruct (Vertex AI)",
+    "link": "https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct",
     "date": "2024-08-20",
     "status": "full",
     "main_language": "English",
src/display/utils.py CHANGED
@@ -111,7 +111,8 @@ baseline_row = {
     AutoEvalColumn.still_on_hub.name: False,
     AutoEvalColumn.moe.name: False,
     AutoEvalColumn.eval_time.name: 0.0,
-    AutoEvalColumn.main_language.name: "?"
+    AutoEvalColumn.main_language.name: "?",
+    'hf_path': None,
 }
 
 baseline_list = []
@@ -156,6 +157,7 @@ human_baseline_row = {
     AutoEvalColumn.moe.name: False,
     AutoEvalColumn.eval_time.name: 0.0,
     AutoEvalColumn.main_language.name: "?",
+    'hf_path': None,
 }
 
 baseline_list = []
@@ -279,7 +281,7 @@ if os.path.exists('external_models_results.json'):
         model_row[AutoEvalColumn.params.name] = model_data['params']
 
         model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
-        #convert 2025-04-03 to 2025-04-03T00:00:00Z
+        model_row['hf_path'] = None if 'huggingface.co' not in model_data['link'] else model_data['link'].split('huggingface.co/')[1]
         external_rows.append(model_row)
 
     #Create external_eval_results
@@ -356,7 +358,7 @@ if os.path.exists('external_models_results.json'):
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)]
+COLS = [c.name for c in fields(AutoEvalColumn)] + ['hf_path']
 TYPES = [c.type for c in fields(AutoEvalColumn)]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
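The new hf_path field is simply the Hub repo id parsed out of each external model's link. The standalone snippet below replays that one-liner on the renamed Llama entry (illustration only, not part of the repo):

    def hf_path_from_link(link: str):
        # mirrors the expression added in src/display/utils.py:
        # keep None for non-Hub links, otherwise take everything after "huggingface.co/"
        return None if 'huggingface.co' not in link else link.split('huggingface.co/')[1]

    print(hf_path_from_link("https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct"))
    # -> meta-llama/Llama-3.1-405B-Instruct
    print(hf_path_from_link("https://example.com/some-api-only-model"))
    # -> None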
src/leaderboard/filter_models.py CHANGED
@@ -134,7 +134,7 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.flagged.name] == True:
             flag_key = "merged"
         else:
-            flag_key = model_data[AutoEvalColumn.dummy.name]
+            flag_key = model_data['hf_path']
 
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -153,7 +153,7 @@ def flag_models(leaderboard_data: list[dict]):
 def remove_forbidden_models(leaderboard_data: list[dict]):
     indices_to_remove = []
     for ix, model in enumerate(leaderboard_data):
-        if model[AutoEvalColumn.dummy.name] in DO_NOT_SUBMIT_MODELS:
+        if model['hf_path'] in DO_NOT_SUBMIT_MODELS:
             indices_to_remove.append(ix)
 
     for ix in reversed(indices_to_remove):
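A stripped-down, self-contained version of remove_forbidden_models shows how the lookup now keys on hf_path; the forbidden set and rows below are made up for the example:

    DO_NOT_SUBMIT_MODELS = {"org/banned-model"}          # hypothetical forbidden list

    leaderboard_data = [
        {"hf_path": "org/good-model"},
        {"hf_path": "org/banned-model"},
        {"hf_path": "org/another-model"},
    ]

    indices_to_remove = [ix for ix, model in enumerate(leaderboard_data)
                         if model['hf_path'] in DO_NOT_SUBMIT_MODELS]

    # delete from the end so earlier indices stay valid
    for ix in reversed(indices_to_remove):
        leaderboard_data.pop(ix)

    print([m['hf_path'] for m in leaderboard_data])
    # -> ['org/good-model', 'org/another-model']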
src/leaderboard/read_evals.py CHANGED
@@ -182,9 +182,10 @@ class EvalResult:
             npm.append((res-task.value.baseline)*100.0 / (100.0-task.value.baseline))
         average = round(sum(average)/len(average), 2)
         npm = round(sum(npm)/len(npm), 2)
-
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
+            "hf_path": self.full_model,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
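As context for the unchanged npm lines in this hunk: each task score is rescaled so that the task baseline maps to 0 while 100 stays 100. A quick check of the formula with made-up numbers:

    def npm_single(res: float, baseline: float) -> float:
        # same expression as in the EvalResult code above:
        # rescale so that `baseline` -> 0 and 100 -> 100
        return (res - baseline) * 100.0 / (100.0 - baseline)

    print(npm_single(70.0, 25.0))   # -> 60.0  (70% on a task whose baseline is 25%)
    print(npm_single(25.0, 25.0))   # -> 0.0   (exactly at the baseline)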
src/scripts/update_all_request_files.py CHANGED
@@ -84,6 +84,7 @@ def update_models(file_path, models, original_leaderboard_files=None):
 def update_dynamic_files():
     """ This will only update metadata for models already linked in the repo, not add missing ones.
     """
+    print("update_dynamic_files running...")
     snapshot_download(
         repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
src/tools/collections.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import add_collection_item, delete_collection_item, get_col
 from huggingface_hub.utils import HfHubHTTPError
 from pandas import DataFrame
 import numpy as np
+import traceback
 
 from src.display.utils import AutoEvalColumn, ModelType, NUMERIC_INTERVALS
 from src.envs import H4_TOKEN, PATH_TO_COLLECTION
@@ -26,6 +27,7 @@ def update_collections(df: DataFrame):
     """This function updates the Open LLM Leaderboard model collection with the latest best models for
     each size category and type.
     """
+    print("Updating collections...")
     collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
 
@@ -41,17 +43,17 @@ def update_collections(df: DataFrame):
                 collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
             )
         except HfHubHTTPError:
+            traceback.print_exc()
             continue
 
     #filter quantized models
-    df = df[df[AutoEvalColumn.precision.name].isin(['bfloat16', 'float16'])]
+    #df = df[df[AutoEvalColumn.precision.name].isin(['bfloat16', 'float16', "?"])]
 
     ix = 0
+    interval_scores = []
+    interval_itens_languages = []
+    interval_itens = []
     for size in intervals:
-        interval_scores = []
-        interval_itens_languages = []
-        interval_itens = []
-
         numeric_interval = pd.IntervalIndex([intervals[size]])
         mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
         size_df = df.loc[mask]
@@ -71,6 +73,10 @@ def update_collections(df: DataFrame):
             # We add them one by one to the leaderboard
             for i, row in best_models.iterrows():
                 model = row[AutoEvalColumn.dummy.name]
+                hf_path = row['hf_path']
+                hf_path = hf_path if 'meta-llama/Meta-' not in hf_path else hf_path.replace("meta-llama/Meta-", "meta-llama/")
+                if hf_path in cur_best_models:
+                    continue
                 score = row[AutoEvalColumn.average.name]
                 language = row[AutoEvalColumn.main_language.name]
                 if language == 'Portuguese':
@@ -80,7 +86,7 @@ def update_collections(df: DataFrame):
                 try:
                     collection = add_collection_item(
                         PATH_TO_COLLECTION,
-                        item_id=model,
+                        item_id=hf_path,
                         item_type="model",
                         exists_ok=True,
                         note=note,
@@ -88,13 +94,14 @@ def update_collections(df: DataFrame):
                     )
                     ix += 1
                     item_object_id = collection.items[-1].item_object_id
-                    cur_best_models.append(model)
+                    cur_best_models.append(hf_path)
                     interval_scores.append(float(score))
                     interval_itens_languages.append(language)
                     interval_itens.append(item_object_id)
                     scores_per_type[model_type] = float(score)
                     break
                 except HfHubHTTPError:
+                    traceback.print_exc()
                     continue
         if 'Portuguese' not in interval_itens_languages:
             language = ['Portuguese']
@@ -107,6 +114,10 @@ def update_collections(df: DataFrame):
             # We add them one by one to the leaderboard
            for i, row in best_models.iterrows():
                model = row[AutoEvalColumn.dummy.name]
+               hf_path = row['hf_path']
+               hf_path = hf_path if 'meta-llama/Meta-' not in hf_path else hf_path.replace("meta-llama/Meta-", "meta-llama/")
+               if hf_path in cur_best_models:
+                   continue
                score = row[AutoEvalColumn.average.name]
                language = row[AutoEvalColumn.main_language.name]
 
@@ -117,7 +128,7 @@ def update_collections(df: DataFrame):
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
-                       item_id=model,
+                       item_id=hf_path,
                        item_type="model",
                        exists_ok=True,
                        note=note,
@@ -125,28 +136,31 @@ def update_collections(df: DataFrame):
                    )
                    ix += 1
                    item_object_id = collection.items[-1].item_object_id
-                   cur_best_models.append(model)
+                   cur_best_models.append(hf_path)
                    interval_scores.append(float(score))
                    interval_itens_languages.append(language)
                    interval_itens.append(item_object_id)
                    scores_per_type[model_type] = float(score)
                    break
                except HfHubHTTPError:
+                   traceback.print_exc()
                    continue
-        # fix order:
-        starting_idx = len(cur_best_models)
-        k = 0
-        for i in np.argsort(interval_scores):
-            if i == k:
-                continue
-            else:
-                try:
-                    update_collection_item(
-                        collection_slug=PATH_TO_COLLECTION, item_object_id=interval_itens[i], position=starting_idx+k
-                    )
-                except:
-                    pass
-            k += 1
+    # fix order:
+    starting_idx = len(cur_best_models)
+    k = 0
+    for i in np.argsort(interval_scores):
+        if i == k:
+            continue
+        else:
+            try:
+                #print(cur_best_models[i], interval_itens[i], starting_idx+k, interval_scores[i])
+                update_collection_item(
+                    collection_slug=PATH_TO_COLLECTION, item_object_id=interval_itens[i], position=starting_idx+k
+                )
+            except:
+                traceback.print_exc()
+                pass
+        k += 1
 
     collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
     for item in collection.items:
@@ -156,4 +170,5 @@ def update_collections(df: DataFrame):
                 collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
             )
         except HfHubHTTPError:
+            traceback.print_exc()
            continue
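The reinstated "fix order" block uses np.argsort to turn the collected scores into target positions for update_collection_item. The toy run below (made-up scores, no Hub calls) shows what that ordering step computes; the real loop additionally skips items that already sit in the right slot:

    import numpy as np

    interval_scores = [61.2, 48.7, 72.9, 55.0]   # made-up averages, in insertion order
    starting_idx = 0                             # hypothetical offset of this block inside the collection

    # np.argsort returns the indices that would sort the scores in ascending order;
    # each sorted index i is then assigned the slot starting_idx + k
    for k, i in enumerate(np.argsort(interval_scores)):
        print(f"item {i} (score {interval_scores[i]}) -> position {starting_idx + k}")
    # item 1 (score 48.7) -> position 0
    # item 3 (score 55.0) -> position 1
    # item 0 (score 61.2) -> position 2
    # item 2 (score 72.9) -> position 3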