Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
ebf3ceb
·
1 Parent(s): 59fa204

feat: refactor the data loading function

Browse files
Files changed (1) hide show
  1. app.py +55 -42
app.py CHANGED
@@ -77,41 +77,54 @@ def restart_space():
77
  from dataclasses import dataclass
78
  import pandas as pd
79
  from typing import Optional
 
 
80
  @dataclass
81
  class LeaderboardDataStore:
82
  raw_data: Optional[list]
83
- original_df_qa: Optional[pd.DataFrame]
84
  original_df_long_doc: Optional[pd.DataFrame]
85
  leaderboard_df_qa: Optional[pd.DataFrame]
86
  leaderboard_df_long_doc: Optional[pd.DataFrame]
87
  reranking_models: Optional[list]
88
-
89
- data = {}
90
- data["AIR-Bench_24.04"] = LeaderboardDataStore(None, None, None, None, None, None)
91
- data["AIR-Bench_24.04"].raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
92
- data["AIR-Bench_24.04"].original_df_qa = get_leaderboard_df(
93
- data["AIR-Bench_24.04"].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
94
- data["AIR-Bench_24.04"].original_df_long_doc = get_leaderboard_df(
95
- data["AIR-Bench_24.04"].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
96
- print(f'raw data: {len(data["AIR-Bench_24.04"].raw_data)}')
97
- print(f'QA data loaded: {data["AIR-Bench_24.04"].original_df_qa.shape}')
98
- print(f'Long-Doc data loaded: {len(data["AIR-Bench_24.04"].original_df_long_doc)}')
99
-
100
- data["AIR-Bench_24.04"].leaderboard_df_qa = data["AIR-Bench_24.04"].original_df_qa.copy()
101
- # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
102
- shown_columns_qa, types_qa = get_default_cols(
103
- 'qa', data["AIR-Bench_24.04"].leaderboard_df_qa.columns, add_fix_cols=True)
104
- data["AIR-Bench_24.04"].leaderboard_df_qa = data["AIR-Bench_24.04"].leaderboard_df_qa[~data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
105
- data["AIR-Bench_24.04"].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
106
-
107
- data["AIR-Bench_24.04"].leaderboard_df_long_doc = data["AIR-Bench_24.04"].original_df_long_doc.copy()
108
- shown_columns_long_doc, types_long_doc = get_default_cols(
109
- 'long-doc', data["AIR-Bench_24.04"].leaderboard_df_long_doc.columns, add_fix_cols=True)
110
- data["AIR-Bench_24.04"].leaderboard_df_long_doc = data["AIR-Bench_24.04"].leaderboard_df_long_doc[~data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
111
- data["AIR-Bench_24.04"].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
112
-
113
- data["AIR-Bench_24.04"].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in data["AIR-Bench_24.04"].raw_data])))
114
-
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  def update_metric_qa(
117
  metric: str,
@@ -173,9 +186,9 @@ with demo:
173
  # select reranking models
174
  with gr.Column():
175
  selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
176
- leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, types_qa)
177
  # Dummy leaderboard for handling the case when the user uses backspace key
178
- hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].original_df_qa, types_qa, visible=False)
179
 
180
  set_listeners(
181
  "qa",
@@ -212,11 +225,11 @@ with demo:
212
  selected_noreranker = get_noreranking_dropdown()
213
  lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
214
  lb_df_retriever = reset_rank(lb_df_retriever)
215
- lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
216
  # Dummy leaderboard for handling the case when the user uses backspace key
217
- hidden_lb_df_retriever = data["AIR-Bench_24.04"].original_df_qa[data["AIR-Bench_24.04"].original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
218
  hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
219
- hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
220
 
221
  set_listeners(
222
  "qa",
@@ -254,11 +267,11 @@ with demo:
254
  selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
255
  with gr.Column(scale=1):
256
  search_bar_reranker = gr.Textbox(show_label=False, visible=False)
257
- lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
258
- hidden_lb_df_reranker = data["AIR-Bench_24.04"].original_df_qa[data["AIR-Bench_24.04"].original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
259
  hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
260
  hidden_lb_table_reranker = get_leaderboard_table(
261
- hidden_lb_df_reranker, types_qa, visible=False
262
  )
263
 
264
  set_listeners(
@@ -316,12 +329,12 @@ with demo:
316
  selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
317
 
318
  lb_table = get_leaderboard_table(
319
- data["AIR-Bench_24.04"].leaderboard_df_long_doc, types_long_doc
320
  )
321
 
322
  # Dummy leaderboard for handling the case when the user uses backspace key
323
  hidden_lb_table_for_search = get_leaderboard_table(
324
- data["AIR-Bench_24.04"].original_df_long_doc, types_long_doc, visible=False
325
  )
326
 
327
  set_listeners(
@@ -366,9 +379,9 @@ with demo:
366
  ]
367
  hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
368
  lb_table_retriever_long_doc = get_leaderboard_table(
369
- lb_df_retriever_long_doc, types_long_doc)
370
  hidden_lb_table_retriever_long_doc = get_leaderboard_table(
371
- hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
372
  )
373
 
374
  set_listeners(
@@ -408,11 +421,11 @@ with demo:
408
  selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
409
  with gr.Column(scale=1):
410
  search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
411
- lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
412
  hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
413
  hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
414
  hidden_lb_table_reranker_ldoc = get_leaderboard_table(
415
- hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
416
  )
417
 
418
  set_listeners(
 
77
  from dataclasses import dataclass
78
  import pandas as pd
79
  from typing import Optional
80
+
81
+
@dataclass
class LeaderboardDataStore:
    """Container for the evaluation data of a single benchmark version.

    Every field defaults to ``None`` so that an empty store can be created
    with ``LeaderboardDataStore()`` and filled in afterwards; passing all
    values positionally (in the declared order) keeps working as before.
    """

    # Raw evaluation results as returned by ``get_raw_eval_results``.
    raw_data: Optional[list] = None
    # Full QA table built by ``get_leaderboard_df(..., task='qa')``.
    raw_qa_df: Optional[pd.DataFrame] = None
    # Full long-doc table built by ``get_leaderboard_df(..., task='long-doc')``.
    original_df_long_doc: Optional[pd.DataFrame] = None
    # QA table as displayed: anonymous rows removed, default columns only.
    leaderboard_df_qa: Optional[pd.DataFrame] = None
    # Long-doc table as displayed: anonymous rows removed, default columns only.
    leaderboard_df_long_doc: Optional[pd.DataFrame] = None
    # Sorted, de-duplicated reranking model names found in ``raw_data``.
    reranking_models: Optional[list] = None
    # Column types returned by ``get_default_cols`` for the QA table.
    types_qa: Optional[list] = None
    # Column types returned by ``get_default_cols`` for the long-doc table.
    types_long_doc: Optional[list] = None
92
+
93
+
def _build_leaderboard_view(raw_df: pd.DataFrame, task: str):
    """Derive the displayed leaderboard table for one task.

    Args:
        raw_df: Full table produced by ``get_leaderboard_df`` for ``task``.
        task: Task identifier forwarded to ``get_default_cols``
            (``'qa'`` or ``'long-doc'``).

    Returns:
        A ``(leaderboard_df, types)`` pair: the table restricted to
        non-anonymous rows and the default columns, without the revision and
        timestamp columns, plus the column types reported by
        ``get_default_cols``.
    """
    shown_columns, types = get_default_cols(task, raw_df.columns, add_fix_cols=True)
    visible_rows = raw_df[~raw_df[COL_NAME_IS_ANONYMOUS]]
    # Non-mutating drop: the frame comes from chained indexing, so dropping
    # in place would operate on a derived copy.
    leaderboard_df = visible_rows[shown_columns].drop(
        columns=[COL_NAME_REVISION, COL_NAME_TIMESTAMP]
    )
    return leaderboard_df, types


def load_eval_results(file_path: str, versions: tuple = ("AIR-Bench_24.04",)) -> dict:
    """Load the evaluation results of every requested benchmark version.

    Args:
        file_path: Directory that contains one sub-directory per version.
        versions: Benchmark version names to load; each one is read from
            ``f"{file_path}/{version}"``. Defaults to the single version the
            app currently serves.

    Returns:
        A dict mapping each version name to a fully populated
        ``LeaderboardDataStore``.
    """
    output = {}
    for version in versions:
        raw_data = get_raw_eval_results(f"{file_path}/{version}")
        raw_qa_df = get_leaderboard_df(
            raw_data, task='qa', metric=DEFAULT_METRIC_QA)
        original_df_long_doc = get_leaderboard_df(
            raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
        print(f'raw data: {len(raw_data)}')
        print(f'QA data loaded: {raw_qa_df.shape}')
        print(f'Long-Doc data loaded: {len(original_df_long_doc)}')

        leaderboard_df_qa, types_qa = _build_leaderboard_view(raw_qa_df, 'qa')
        leaderboard_df_long_doc, types_long_doc = _build_leaderboard_view(
            original_df_long_doc, 'long-doc')

        output[version] = LeaderboardDataStore(
            raw_data=raw_data,
            raw_qa_df=raw_qa_df,
            original_df_long_doc=original_df_long_doc,
            leaderboard_df_qa=leaderboard_df_qa,
            leaderboard_df_long_doc=leaderboard_df_long_doc,
            # A set comprehension de-duplicates; sorted() returns the list.
            reranking_models=sorted(
                {eval_result.reranking_model for eval_result in raw_data}
            ),
            types_qa=types_qa,
            types_long_doc=types_long_doc,
        )
    return output
125
+
126
+
127
+ data = load_eval_results(EVAL_RESULTS_PATH)
128
 
129
  def update_metric_qa(
130
  metric: str,
 
186
  # select reranking models
187
  with gr.Column():
188
  selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
189
+ leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
190
  # Dummy leaderboard for handling the case when the user uses backspace key
191
+ hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
192
 
193
  set_listeners(
194
  "qa",
 
225
  selected_noreranker = get_noreranking_dropdown()
226
  lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
227
  lb_df_retriever = reset_rank(lb_df_retriever)
228
+ lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
229
  # Dummy leaderboard for handling the case when the user uses backspace key
230
+ hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
231
  hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
232
+ hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
233
 
234
  set_listeners(
235
  "qa",
 
267
  selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
268
  with gr.Column(scale=1):
269
  search_bar_reranker = gr.Textbox(show_label=False, visible=False)
270
+ lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
271
+ hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
272
  hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
273
  hidden_lb_table_reranker = get_leaderboard_table(
274
+ hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
275
  )
276
 
277
  set_listeners(
 
329
  selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
330
 
331
  lb_table = get_leaderboard_table(
332
+ data["AIR-Bench_24.04"].leaderboard_df_long_doc, data["AIR-Bench_24.04"].types_long_doc
333
  )
334
 
335
  # Dummy leaderboard for handling the case when the user uses backspace key
336
  hidden_lb_table_for_search = get_leaderboard_table(
337
+ data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
338
  )
339
 
340
  set_listeners(
 
379
  ]
380
  hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
381
  lb_table_retriever_long_doc = get_leaderboard_table(
382
+ lb_df_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc)
383
  hidden_lb_table_retriever_long_doc = get_leaderboard_table(
384
+ hidden_lb_db_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
385
  )
386
 
387
  set_listeners(
 
421
  selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
422
  with gr.Column(scale=1):
423
  search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
424
+ lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
425
  hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
426
  hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
427
  hidden_lb_table_reranker_ldoc = get_leaderboard_table(
428
+ hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
429
  )
430
 
431
  set_listeners(