Spaces: AIR-Bench

Commit 59fa204
nan committed
1 parent: 0785fe4

feat: use dataclass to manage the dataframes

Files changed (2):
  1. app.py +60 -48
  2. src/envs.py +1 -1
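
The commit replaces the module-level `raw_data`, `original_df_qa`, `original_df_long_doc`, `leaderboard_df_qa`, `leaderboard_df_long_doc`, and `reranking_models` globals with a single `LeaderboardDataStore` dataclass, held in a `data` dict keyed by benchmark version. A minimal sketch of the pattern; giving the fields `None` defaults is a variation of mine, not part of the commit, which passes six explicit `None`s instead:

```python
from dataclasses import dataclass
from typing import Optional

import pandas as pd


@dataclass
class LeaderboardDataStore:
    # All fields default to None, so an empty store is LeaderboardDataStore()
    # rather than LeaderboardDataStore(None, None, None, None, None, None).
    raw_data: Optional[list] = None
    original_df_qa: Optional[pd.DataFrame] = None
    original_df_long_doc: Optional[pd.DataFrame] = None
    leaderboard_df_qa: Optional[pd.DataFrame] = None
    leaderboard_df_long_doc: Optional[pd.DataFrame] = None
    reranking_models: Optional[list] = None


# One store per benchmark version; the UI reads from data[<version>].
data = {}
data["AIR-Bench_24.04"] = LeaderboardDataStore()
```

Keying the stores by version string means a future benchmark release becomes one more dict entry rather than another full set of globals.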
app.py CHANGED
@@ -65,40 +65,52 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)


-try:
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception as e:
-    print(f'failed to download')
-    restart_space()
-
-raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{LATEST_BENCHMARK_VERSION}")
-
-original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-print(f'raw data: {len(raw_data)}')
-print(f'QA data loaded: {original_df_qa.shape}')
-print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
-
-leaderboard_df_qa = original_df_qa.copy()
+# try:
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception as e:
+#     print(f'failed to download')
+#     restart_space()
+
+from dataclasses import dataclass
+import pandas as pd
+from typing import Optional
+@dataclass
+class LeaderboardDataStore:
+    raw_data: Optional[list]
+    original_df_qa: Optional[pd.DataFrame]
+    original_df_long_doc: Optional[pd.DataFrame]
+    leaderboard_df_qa: Optional[pd.DataFrame]
+    leaderboard_df_long_doc: Optional[pd.DataFrame]
+    reranking_models: Optional[list]
+
+data = {}
+data["AIR-Bench_24.04"] = LeaderboardDataStore(None, None, None, None, None, None)
+data["AIR-Bench_24.04"].raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
+data["AIR-Bench_24.04"].original_df_qa = get_leaderboard_df(
+    data["AIR-Bench_24.04"].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+data["AIR-Bench_24.04"].original_df_long_doc = get_leaderboard_df(
+    data["AIR-Bench_24.04"].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+print(f'raw data: {len(data["AIR-Bench_24.04"].raw_data)}')
+print(f'QA data loaded: {data["AIR-Bench_24.04"].original_df_qa.shape}')
+print(f'Long-Doc data loaded: {len(data["AIR-Bench_24.04"].original_df_long_doc)}')
+
+data["AIR-Bench_24.04"].leaderboard_df_qa = data["AIR-Bench_24.04"].original_df_qa.copy()
 # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
 shown_columns_qa, types_qa = get_default_cols(
-    'qa', leaderboard_df_qa.columns, add_fix_cols=True)
-leaderboard_df_qa = leaderboard_df_qa[~leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+    'qa', data["AIR-Bench_24.04"].leaderboard_df_qa.columns, add_fix_cols=True)
+data["AIR-Bench_24.04"].leaderboard_df_qa = data["AIR-Bench_24.04"].leaderboard_df_qa[~data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+data["AIR-Bench_24.04"].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

-leaderboard_df_long_doc = original_df_long_doc.copy()
+data["AIR-Bench_24.04"].leaderboard_df_long_doc = data["AIR-Bench_24.04"].original_df_long_doc.copy()
 shown_columns_long_doc, types_long_doc = get_default_cols(
-    'long-doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
-leaderboard_df_long_doc = leaderboard_df_long_doc[~leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+    'long-doc', data["AIR-Bench_24.04"].leaderboard_df_long_doc.columns, add_fix_cols=True)
+data["AIR-Bench_24.04"].leaderboard_df_long_doc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[~data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+data["AIR-Bench_24.04"].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

-# select reranking model
-reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in raw_data])))
+data["AIR-Bench_24.04"].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in data["AIR-Bench_24.04"].raw_data])))


 def update_metric_qa(
@@ -110,7 +122,7 @@ def update_metric_qa(
     show_anonymous: bool,
     show_revision_and_timestamp,
 ):
-    return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(data["AIR-Bench_24.04"].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)

 def update_metric_long_doc(
     metric: str,
@@ -121,7 +133,7 @@ def update_metric_long_doc(
     show_anonymous: bool,
     show_revision_and_timestamp,
 ):
-    return update_metric(raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(data["AIR-Bench_24.04"].raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)


 demo = gr.Blocks(css=custom_css)
@@ -160,10 +172,10 @@ with demo:
                     search_bar = get_search_bar()
                 # select reranking models
                 with gr.Column():
-                    selected_rerankings = get_reranking_dropdown(reranking_models)
-            leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+                    selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
+            leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
+            hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].original_df_qa, types_qa, visible=False)

             set_listeners(
                 "qa",
@@ -198,11 +210,11 @@ with demo:
                         search_bar_retriever = get_search_bar()
                     with gr.Column(scale=1):
                         selected_noreranker = get_noreranking_dropdown()
-                lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                 lb_df_retriever = reset_rank(lb_df_retriever)
                 lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
                 # Dummy leaderboard for handling the case when the user uses backspace key
-                hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                hidden_lb_df_retriever = data["AIR-Bench_24.04"].original_df_qa[data["AIR-Bench_24.04"].original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                 hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
                 hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)

@@ -234,7 +246,7 @@ with demo:
                     queue=True
                 )
             with gr.TabItem("Reranking Only", id=12):
-                lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                lb_df_reranker = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 lb_df_reranker = reset_rank(lb_df_reranker)
                 reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                 with gr.Row():
@@ -243,7 +255,7 @@ with demo:
                     with gr.Column(scale=1):
                         search_bar_reranker = gr.Textbox(show_label=False, visible=False)
                 lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
-                hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                hidden_lb_df_reranker = data["AIR-Bench_24.04"].original_df_qa[data["AIR-Bench_24.04"].original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
                 hidden_lb_table_reranker = get_leaderboard_table(
                     hidden_lb_df_reranker, types_qa, visible=False
@@ -301,15 +313,15 @@ with demo:
                     search_bar = get_search_bar()
                 # select reranking model
                 with gr.Column():
-                    selected_rerankings = get_reranking_dropdown(reranking_models)
+                    selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)

             lb_table = get_leaderboard_table(
-                leaderboard_df_long_doc, types_long_doc
+                data["AIR-Bench_24.04"].leaderboard_df_long_doc, types_long_doc
             )

             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_lb_table_for_search = get_leaderboard_table(
-                original_df_long_doc, types_long_doc, visible=False
+                data["AIR-Bench_24.04"].original_df_long_doc, types_long_doc, visible=False
             )

             set_listeners(
@@ -345,12 +357,12 @@ with demo:
                         search_bar_retriever = get_search_bar()
                     with gr.Column(scale=1):
                         selected_noreranker = get_noreranking_dropdown()
-                lb_df_retriever_long_doc = leaderboard_df_long_doc[
-                    leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                lb_df_retriever_long_doc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[
+                    data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                 ]
                 lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
-                hidden_lb_db_retriever_long_doc = original_df_long_doc[
-                    original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].original_df_long_doc[
+                    data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                 ]
                 hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
                 lb_table_retriever_long_doc = get_leaderboard_table(
@@ -386,8 +398,8 @@ with demo:
                     queue=True
                 )
             with gr.TabItem("Reranking Only", id=22):
-                lb_df_reranker_ldoc = leaderboard_df_long_doc[
-                    leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+                lb_df_reranker_ldoc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[
+                    data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
                 ]
                 lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
                 reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -397,7 +409,7 @@ with demo:
                     with gr.Column(scale=1):
                         search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
                 lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
-                hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
                 hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                     hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
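
Every reference in app.py now hard-codes the `"AIR-Bench_24.04"` key, which matches `LATEST_BENCHMARK_VERSION` after the src/envs.py change below. If a later commit wants one store per entry in `BENCHMARK_VERSION_LIST`, the load block above factors naturally into a helper. This is a hypothetical sketch dropped into app.py (where the `get_*` helpers and constants are already imported), not code from this commit; `load_datastore` is an invented name:

```python
def load_datastore(version: str) -> LeaderboardDataStore:
    """Build a fully populated per-version store (hypothetical helper)."""
    store = LeaderboardDataStore(None, None, None, None, None, None)
    store.raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{version}")
    store.original_df_qa = get_leaderboard_df(
        store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
    store.original_df_long_doc = get_leaderboard_df(
        store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
    # Same dedup-and-sort as the inline code, via a set comprehension.
    store.reranking_models = sorted({r.reranking_model for r in store.raw_data})
    return store


data = {version: load_datastore(version) for version in BENCHMARK_VERSION_LIST}
```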
src/envs.py CHANGED
@@ -27,7 +27,7 @@ BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")

 BENCHMARK_VERSION_LIST = [
     "AIR-Bench_24.04",
-    "AIR-Bench_24.05",
+    # "AIR-Bench_24.05",
 ]

 LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
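
Because `LATEST_BENCHMARK_VERSION` is read from the tail of the list, commenting out `"AIR-Bench_24.05"` makes it resolve to `"AIR-Bench_24.04"`, consistent with the hard-coded dict key in app.py:

```python
BENCHMARK_VERSION_LIST = [
    "AIR-Bench_24.04",
    # "AIR-Bench_24.05",
]

LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]  # -> "AIR-Bench_24.04"
```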