Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
2961737
·
1 Parent(s): 3014147

test: add unit tests for utils

Browse files
Files changed (4) hide show
  1. src/loaders.py +3 -1
  2. src/models.py +8 -8
  3. src/utils.py +1 -1
  4. tests/src/test_utils.py +25 -1
src/loaders.py CHANGED
@@ -1,4 +1,6 @@
1
  import os.path
 
 
2
  from typing import Dict, List
3
 
4
  import pandas as pd
@@ -11,7 +13,7 @@ from src.utils import get_default_cols, get_leaderboard_df, reset_rank
11
  pd.options.mode.copy_on_write = True
12
 
13
 
14
- def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
15
  """
16
  Load the evaluation results from a json file
17
  """
 
1
  import os.path
2
+ from pathlib import Path
3
+ from typing import Union
4
  from typing import Dict, List
5
 
6
  import pandas as pd
 
13
  pd.options.mode.copy_on_write = True
14
 
15
 
16
+ def load_raw_eval_results(results_path: Union[Path, str]) -> List[FullEvalResult]:
17
  """
18
  Load the evaluation results from a json file
19
  """
src/models.py CHANGED
@@ -141,14 +141,14 @@ class FullEvalResult:
141
  class LeaderboardDataStore:
142
  version: str
143
  slug: str
144
- raw_data: Optional[list]
145
- qa_raw_df: Optional[pd.DataFrame]
146
- doc_raw_df: Optional[pd.DataFrame]
147
- qa_fmt_df: Optional[pd.DataFrame]
148
- doc_fmt_df: Optional[pd.DataFrame]
149
- reranking_models: Optional[list]
150
- qa_types: Optional[list]
151
- doc_types: Optional[list]
152
 
153
 
154
  # Define an enum class with the name `TaskType`. There are two types of tasks, `qa` and `long-doc`.
 
141
  class LeaderboardDataStore:
142
  version: str
143
  slug: str
144
+ raw_data: list = None
145
+ qa_raw_df: pd.DataFrame = pd.DataFrame()
146
+ doc_raw_df: pd.DataFrame = pd.DataFrame()
147
+ qa_fmt_df: pd.DataFrame = pd.DataFrame()
148
+ doc_fmt_df: pd.DataFrame = pd.DataFrame()
149
+ reranking_models: list = None
150
+ qa_types: list = None
151
+ doc_types: list = None
152
 
153
 
154
  # Define an enum class with the name `TaskType`. There are two types of tasks, `qa` and `long-doc`.
src/utils.py CHANGED
@@ -354,7 +354,7 @@ def get_leaderboard_df(datastore, task: TaskType, metric: str) -> pd.DataFrame:
354
  continue
355
  benchmark_cols.append(t.value.col_name)
356
 
357
- ## filter out the columns that are not in the data
358
  df[COL_NAME_AVG] = (
359
  df[list(benchmark_cols)]
360
  .apply(calculate_mean, axis=1)
 
354
  continue
355
  benchmark_cols.append(t.value.col_name)
356
 
357
+ # filter out the columns that are not in the data
358
  df[COL_NAME_AVG] = (
359
  df[list(benchmark_cols)]
360
  .apply(calculate_mean, axis=1)
tests/src/test_utils.py CHANGED
@@ -1,10 +1,12 @@
1
  import pytest
2
  import pandas as pd
 
3
 
4
- from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem
5
  from src.models import model_hyperlink, TaskType
6
  from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
7
 
 
8
 
9
  NUM_QA_BENCHMARKS_24_05 = 53
10
  NUM_DOC_BENCHMARKS_24_05 = 11
@@ -193,3 +195,25 @@ def test__update_df_elem(toy_df, reset_rank, show_anony):
193
  assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
194
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pytest
2
  import pandas as pd
3
+ from pathlib import Path
4
 
5
+ from src.utils import remove_html, calculate_mean, filter_models, filter_queries, get_default_cols, select_columns, get_selected_cols, _update_df_elem, get_leaderboard_df
6
  from src.models import model_hyperlink, TaskType
7
  from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
8
 
9
+ cur_fp = Path(__file__)
10
 
11
  NUM_QA_BENCHMARKS_24_05 = 53
12
  NUM_DOC_BENCHMARKS_24_05 = 11
 
195
  assert df["Average ⬆️"].equals(toy_df["Average ⬆️"])
196
 
197
 
198
+ @pytest.mark.parametrize(
199
+ "version, task_type",
200
+ [
201
+ ("AIR-Bench_24.04", TaskType.qa),
202
+ ("AIR-Bench_24.04", TaskType.long_doc),
203
+ ("AIR-Bench_24.05", TaskType.qa),
204
+ ("AIR-Bench_24.05", TaskType.long_doc)
205
+ ]
206
+ )
207
+ def test_get_leaderboard_df(version, task_type):
208
+ from src.loaders import load_raw_eval_results
209
+ from src.models import LeaderboardDataStore, get_safe_name
210
+ raw_data = load_raw_eval_results(
211
+ cur_fp.parents[1] / f"toydata/eval_results/{version}"
212
+ )
213
+ ds = LeaderboardDataStore(version, get_safe_name(version), raw_data=raw_data)
214
+ df = get_leaderboard_df(
215
+ ds,
216
+ task_type,
217
+ "ndcg_at_10"
218
+ )
219
+ assert df.shape[0] == 1