AIR-Bench

Commit bf586e3 (1 parent: 23b3543), committed by nan

feat: add versioning for the long-doc

Files changed (4):
  1. app.py +25 -20
  2. src/benchmarks.py +2 -0
  3. src/loaders.py +11 -11
  4. src/utils.py +9 -44
app.py CHANGED

```diff
@@ -69,16 +69,16 @@ def update_metric_qa(
     return update_metric(datastore, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
 
-# def update_metric_long_doc(
-#         metric: str,
-#         domains: list,
-#         langs: list,
-#         reranking_model: list,
-#         query: str,
-#         show_anonymous: bool,
-#         show_revision_and_timestamp,
-# ):
-#     return update_metric(datastore.raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+def update_metric_long_doc(
+        metric: str,
+        domains: list,
+        langs: list,
+        reranking_model: list,
+        query: str,
+        show_anonymous: bool,
+        show_revision_and_timestamp,
+):
+    return update_metric(datastore, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
 
 def update_datastore(version):
@@ -274,18 +274,15 @@ with demo:
                 lb_table_reranker,
                 queue=True
             )
-        """
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column(min_width=320):
                     # select domain
                     with gr.Row():
-                        selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
+                        selected_domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
                     # select language
                     with gr.Row():
-                        selected_langs = get_language_dropdown(
-                            LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
-                        )
+                        selected_langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
                 with gr.Column():
                     # select the metric
                     with gr.Row():
@@ -301,22 +298,29 @@ with demo:
                         search_bar = get_search_bar()
                     # select reranking model
                     with gr.Column():
-                        selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
+                        selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
 
             lb_table = get_leaderboard_table(
-                data["AIR-Bench_24.04"].leaderboard_df_long_doc, data["AIR-Bench_24.04"].types_long_doc
+                datastore.leaderboard_df_long_doc, datastore.types_long_doc
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_lb_table_for_search = get_leaderboard_table(
-                data["AIR-Bench_24.04"].raw_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
+            hidden_lb_table = get_leaderboard_table(
+                datastore.raw_df_long_doc, datastore.types_long_doc, visible=False
+            )
+
+            selected_version.change(
+                update_datastore,
+                [selected_version,],
+                [selected_domains, selected_langs, selected_rerankings, lb_table, hidden_lb_table]
             )
 
             set_listeners(
                 "long-doc",
                 lb_table,
-                hidden_lb_table_for_search,
+                hidden_lb_table,
                 search_bar,
+                selected_version,
                 selected_domains,
                 selected_langs,
                 selected_rerankings,
@@ -339,6 +343,7 @@ with demo:
                 lb_table,
                 queue=True
             )
+        """
         with gr.TabItem("Retrieval Only", id=21):
             with gr.Row():
                 with gr.Column(scale=1):
```
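Taken together, the app.py changes re-enable the Long Doc tab, replace the hard-coded `data["AIR-Bench_24.04"]` lookups with the active `datastore`, and register `selected_version.change` so that switching versions refreshes the dropdowns and both tables. The sketch below only illustrates the contract that wiring implies: `update_datastore` returns one value per output component, in order. The `DATASTORES` cache, the `SimpleNamespace` stand-in, and its field values are assumptions for illustration, not the repository's actual loader.

```python
from types import SimpleNamespace

import gradio as gr

# Hypothetical stand-in datastores keyed by version name; the real
# LeaderboardDataStore carries more fields and pandas DataFrames.
DATASTORES = {
    "AIR-Bench_24.04": SimpleNamespace(
        domains=["wiki", "law"],
        langs=["en", "zh"],
        reranking_models=["NoReranker"],
        leaderboard_df_long_doc=None,
        raw_df_long_doc=None,
    ),
}


def update_datastore(version: str):
    """Return one update per component in selected_version.change outputs."""
    ds = DATASTORES[version]
    return (
        gr.update(choices=ds.domains, value=ds.domains),   # selected_domains
        gr.update(choices=ds.langs, value=ds.langs),       # selected_langs
        gr.update(choices=ds.reranking_models, value=[]),  # selected_rerankings
        ds.leaderboard_df_long_doc,                        # lb_table
        ds.raw_df_long_doc,                                # hidden_lb_table
    )
```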
src/benchmarks.py CHANGED

```diff
@@ -51,6 +51,8 @@ def get_benchmarks_enum(benchmark_version, task_type):
             benchmark_name = f"{domain}_{lang}_{dataset}"
             benchmark_name = get_safe_name(benchmark_name)
             col_name = benchmark_name
+            if "test" not in dataset_list[dataset]["splits"]:
+                continue
             for metric in METRIC_LIST:
                 benchmark_dict[benchmark_name] = \
                     Benchmark(benchmark_name, metric, col_name, domain, lang, task)
```
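The only change in src/benchmarks.py skips datasets that ship without a `test` split, so they never become leaderboard columns. A minimal runnable sketch of that guard, with an assumed `dataset_list` shape (the real metadata may carry more keys):

```python
# Assumed, illustrative shape of dataset_list; not the real AIR-Bench metadata.
dataset_list = {
    "arxiv": {"splits": ["dev", "test"]},
    "new_corpus": {"splits": ["dev"]},  # no released test split yet
}

for dataset in dataset_list:
    # Same guard as the diff: without a test split, register no benchmark.
    if "test" not in dataset_list[dataset]["splits"]:
        continue
    print(f"register benchmarks for {dataset}")  # prints only 'arxiv'
```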
src/loaders.py CHANGED

```diff
@@ -76,17 +76,17 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
         lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
     lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
-    # lb_data_store.raw_df_long_doc = get_leaderboard_df(
-    #     lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-    # print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
-    # lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
-    # shown_columns_long_doc, types_long_doc = get_default_cols(
-    #     'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
-    # lb_data_store.types_long_doc = types_long_doc
-    # lb_data_store.leaderboard_df_long_doc = \
-    #     lb_data_store.leaderboard_df_long_doc[
-    #         ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-    # lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.slug, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[
+            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
     lb_data_store.reranking_models = sorted(
         list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
```
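This hunk restores the long-doc loading path and adjusts one call: `get_default_cols` now receives the version slug (`lb_data_store.slug`) instead of the frame's columns. The anonymous-row filtering it performs is a standard pandas boolean-mask idiom; a self-contained sketch with stand-in column names and data:

```python
import pandas as pd

COL_NAME_IS_ANONYMOUS = "is_anonymous"  # stand-in for the real constant

df = pd.DataFrame({
    "model": ["bge-m3", "anonymous-submission"],
    "is_anonymous": [False, True],
    "ndcg_at_10": [0.52, 0.61],
})
shown_columns = ["model", "ndcg_at_10"]

# Mirrors df[~df[COL_NAME_IS_ANONYMOUS]][shown_columns] in the loader:
# mask out anonymous rows, then keep only the displayed columns.
visible = df[~df[COL_NAME_IS_ANONYMOUS]][shown_columns]
print(visible)  # only the non-anonymous row remains
```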
src/utils.py CHANGED

```diff
@@ -64,12 +64,12 @@ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
     types = []
     if task == "qa":
         benchmarks = QABenchmarks[version_slug]
-        cols_list, types_list = get_default_col_names_and_types(benchmarks)
-        # cols_list = COLS_QA
-        # types_list = TYPES_QA
-        benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
+    elif task == "long-doc":
+        benchmarks = LongDocBenchmarks[version_slug]
     else:
         raise NotImplemented
+    cols_list, types_list = get_default_col_names_and_types(benchmarks)
+    benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
     for col_name, col_type in zip(cols_list, types_list):
         if col_name not in benchmark_list:
             continue
@@ -90,40 +90,6 @@ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
     return cols, types
 
 
-# def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
-#     cols = []
-#     types = []
-#     if task == "qa":
-#         cols_list = COLS_QA
-#         types_list = TYPES_QA
-#         benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
-#     elif task == "long-doc":
-#         cols_list = COLS_LONG_DOC
-#         types_list = TYPES_LONG_DOC
-#         benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
-#     else:
-#         raise NotImplemented
-#     for col_name, col_type in zip(cols_list, types_list):
-#         if col_name not in benchmark_list:
-#             continue
-#         if len(columns) > 0 and col_name not in columns:
-#             continue
-#         cols.append(col_name)
-#         types.append(col_type)
-#
-#     if add_fix_cols:
-#         _cols = []
-#         _types = []
-#         for col_name, col_type in zip(cols, types):
-#             if col_name in FIXED_COLS:
-#                 continue
-#             _cols.append(col_name)
-#             _types.append(col_type)
-#         cols = FIXED_COLS + _cols
-#         types = FIXED_COLS_TYPES + _types
-#     return cols, types
-
-
 def select_columns(
     df: pd.DataFrame,
     domain_query: list,
@@ -360,14 +326,13 @@ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
     cols = [COL_NAME_IS_ANONYMOUS, ]
     if task == "qa":
         benchmarks = QABenchmarks[datastore.slug]
-        cols_qa, _ = get_default_col_names_and_types(benchmarks)
-        cols += cols_qa
-        benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
-        # elif task == "long-doc":
-        #     cols += COLS_LONG_DOC
-        #     benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
+    elif task == "long-doc":
+        benchmarks = LongDocBenchmarks[datastore.slug]
     else:
         raise NotImplemented
+    cols_qa, _ = get_default_col_names_and_types(benchmarks)
+    cols += cols_qa
+    benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
```
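Both src/utils.py hunks converge on the same versioned lookup: `QABenchmarks[slug]` or `LongDocBenchmarks[slug]` selects a version member whose value is itself an enum of `Benchmark` entries, which is why the code iterates `list(benchmarks.value)` and reads `c.value.col_name`. A toy reconstruction of that enum-of-enums nesting, with member names and fields assumed for illustration:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Benchmark:
    name: str
    metric: str
    col_name: str


# Inner enum: one member per benchmark in a given version (values assumed).
Benchmarks2404 = Enum(
    "Benchmarks2404",
    {
        "wiki_en": Benchmark("wiki_en", "ndcg_at_10", "wiki_en"),
        "law_zh": Benchmark("law_zh", "ndcg_at_10", "law_zh"),
    },
)

# Outer enum: one member per benchmark version; the member's value is the
# inner enum class, which is what `benchmarks.value` unwraps.
QABenchmarks = Enum("QABenchmarks", {"air_bench_2404": Benchmarks2404})

slug = "air_bench_2404"
benchmarks = QABenchmarks[slug]  # look up the version member by its slug
cols = [c.value.col_name for c in list(benchmarks.value)]
print(cols)  # ['wiki_en', 'law_zh']
```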