Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
dfb867f
·
1 Parent(s): 2961737

test: add unit tests for loaders

Browse files
Files changed (2) hide show
  1. src/loaders.py +5 -1
  2. tests/src/test_loaders.py +54 -0
src/loaders.py CHANGED
@@ -53,7 +53,7 @@ def load_raw_eval_results(results_path: Union[Path, str]) -> List[FullEvalResult
53
 
54
 
55
  def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
56
- ds = LeaderboardDataStore(version, get_safe_name(version), None, None, None, None, None, None, None, None)
57
  ds.raw_data = load_raw_eval_results(file_path)
58
  print(f"raw data: {len(ds.raw_data)}")
59
 
@@ -61,7 +61,9 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
61
  print(f"QA data loaded: {ds.qa_raw_df.shape}")
62
  ds.qa_fmt_df = ds.qa_raw_df.copy()
63
  qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
 
64
  ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
 
65
  ds.qa_fmt_df = reset_rank(ds.qa_fmt_df)
66
  ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
67
 
@@ -69,7 +71,9 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
69
  print(f"Long-Doc data loaded: {len(ds.doc_raw_df)}")
70
  ds.doc_fmt_df = ds.doc_raw_df.copy()
71
  doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
 
72
  ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
 
73
  ds.doc_fmt_df = reset_rank(ds.doc_fmt_df)
74
  ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
75
 
 
53
 
54
 
55
  def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
56
+ ds = LeaderboardDataStore(version, get_safe_name(version))
57
  ds.raw_data = load_raw_eval_results(file_path)
58
  print(f"raw data: {len(ds.raw_data)}")
59
 
 
61
  print(f"QA data loaded: {ds.qa_raw_df.shape}")
62
  ds.qa_fmt_df = ds.qa_raw_df.copy()
63
  qa_cols, ds.qa_types = get_default_cols(TaskType.qa, ds.slug, add_fix_cols=True)
64
+ # by default, drop the anonymous submissions
65
  ds.qa_fmt_df = ds.qa_fmt_df[~ds.qa_fmt_df[COL_NAME_IS_ANONYMOUS]][qa_cols]
66
+ # reset the rank after dropping the anonymous submissions
67
  ds.qa_fmt_df = reset_rank(ds.qa_fmt_df)
68
  ds.qa_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
69
 
 
71
  print(f"Long-Doc data loaded: {len(ds.doc_raw_df)}")
72
  ds.doc_fmt_df = ds.doc_raw_df.copy()
73
  doc_cols, ds.doc_types = get_default_cols(TaskType.long_doc, ds.slug, add_fix_cols=True)
74
+ # by default, drop the anonymous submissions
75
  ds.doc_fmt_df = ds.doc_fmt_df[~ds.doc_fmt_df[COL_NAME_IS_ANONYMOUS]][doc_cols]
76
+ # reset the rank after dropping the anonymous submissions
77
  ds.doc_fmt_df = reset_rank(ds.doc_fmt_df)
78
  ds.doc_fmt_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
79
 
tests/src/test_loaders.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pytest
3
+ from pathlib import Path
4
+
5
+ from src.loaders import load_raw_eval_results, load_leaderboard_datastore, load_eval_results
6
+
7
+ cur_fp = Path(__file__)
8
+
9
+
10
+ @pytest.mark.parametrize(
11
+ "version",
12
+ ["AIR-Bench_24.04", "AIR-Bench_24.05"]
13
+ )
14
+ def test_load_raw_eval_results(version):
15
+ raw_data = load_raw_eval_results(
16
+ cur_fp.parents[1] / f"toydata/eval_results/{version}"
17
+ )
18
+ assert len(raw_data) == 1
19
+ full_eval_result = raw_data[0]
20
+ expected_attr = [
21
+ 'eval_name',
22
+ 'retrieval_model',
23
+ 'reranking_model',
24
+ 'retrieval_model_link',
25
+ 'reranking_model_link',
26
+ 'results',
27
+ 'timestamp',
28
+ 'revision',
29
+ 'is_anonymous'
30
+ ]
31
+ result_attr = [k for k in full_eval_result.__dict__.keys() if k[:2] != "__" and k[-2:] != "__"]
32
+ assert sorted(expected_attr) == sorted(result_attr)
33
+
34
+
35
+ @pytest.mark.parametrize(
36
+ "version",
37
+ ["AIR-Bench_24.04", "AIR-Bench_24.05"]
38
+ )
39
+ def test_load_leaderboard_datastore(version):
40
+ file_path = cur_fp.parents[1] / f"toydata/eval_results/{version}"
41
+ datastore = load_leaderboard_datastore(file_path, version)
42
+ for k, v in datastore.__dict__.items():
43
+ if k[:2] != "__" and k[-2:] != "__":
44
+ if isinstance(v, list):
45
+ assert v
46
+ elif isinstance(v, pd.DataFrame):
47
+ assert not v.empty
48
+
49
+
50
+ def test_load_eval_results():
51
+ file_path = cur_fp.parents[1] / "toydata/eval_results/"
52
+ datastore_dict = load_eval_results(file_path)
53
+ assert len(datastore_dict) == 2
54
+