AIR-Bench

Commit 270c122 (parent: 3fcf957), committed by nan

refactor: rename the benchmarks enum
app.py CHANGED
@@ -6,8 +6,8 @@ from src.about import (
     TITLE
 )
 from src.benchmarks import (
-    BenchmarksQA,
-    BenchmarksLongDoc
+    QABenchmarks,
+    LongDocBenchmarks
 )
 from src.display.css_html_js import custom_css
 from src.envs import (
@@ -76,11 +76,11 @@ def update_metric_long_doc(
     return update_metric(data["AIR-Bench_24.04"].raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)


-DOMAIN_COLS_QA = list(frozenset([c.value.domain for c in list(BenchmarksQA)]))
-LANG_COLS_QA = list(frozenset([c.value.lang for c in list(BenchmarksQA)]))
+DOMAIN_COLS_QA = list(frozenset([c.value.domain for c in list(QABenchmarks)]))
+LANG_COLS_QA = list(frozenset([c.value.lang for c in list(QABenchmarks)]))

-DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(BenchmarksLongDoc)]))
-LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(BenchmarksLongDoc)]))
+DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(LongDocBenchmarks)]))
+LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(LongDocBenchmarks)]))

 demo = gr.Blocks(css=custom_css)

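Note: the DOMAIN_COLS_* and LANG_COLS_* globals above only change which enum they iterate; the dedup-via-frozenset pattern stays the same. A minimal, self-contained sketch of that pattern, using an assumed namedtuple value shape and made-up entries rather than the real AIR-Bench_24.04 benchmark list:

from collections import namedtuple
from enum import Enum

# assumed value shape for each enum member; the real class carries more fields
Benchmark = namedtuple("Benchmark", ["domain", "lang"])

# made-up members standing in for the generated QABenchmarks enum
QABenchmarks = Enum("QABenchmarks", {
    "wiki_en": Benchmark("wiki", "en"),
    "wiki_zh": Benchmark("wiki", "zh"),
    "news_en": Benchmark("news", "en"),
})

# frozenset collapses duplicate tags before the lists are handed to the Gradio widgets
DOMAIN_COLS_QA = list(frozenset([c.value.domain for c in list(QABenchmarks)]))
LANG_COLS_QA = list(frozenset([c.value.lang for c in list(QABenchmarks)]))

print(sorted(DOMAIN_COLS_QA))  # ['news', 'wiki']
print(sorted(LANG_COLS_QA))    # ['en', 'zh']
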
src/benchmarks.py CHANGED
@@ -54,5 +54,5 @@ def get_benchmarks_enum(benchmark_version):

 _qa_benchmark_dict, _long_doc_benchmark_dict = get_benchmarks_enum('AIR-Bench_24.04')

-BenchmarksQA = Enum('BenchmarksQA', _qa_benchmark_dict)
-BenchmarksLongDoc = Enum('BenchmarksLongDoc', _long_doc_benchmark_dict)
+QABenchmarks = Enum('QABenchmarks', _qa_benchmark_dict)
+LongDocBenchmarks = Enum('LongDocBenchmarks', _long_doc_benchmark_dict)
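Note: both enums are built with the functional Enum API, so only the name string passed to Enum() changes in this file. A minimal sketch of what Enum('QABenchmarks', _qa_benchmark_dict) yields, assuming a simplified Benchmark dataclass and two made-up entries in place of the real dict returned by get_benchmarks_enum:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Benchmark:                      # assumed value type; a subset of the fields printed in the tests
    col_name: str
    metric: str
    domain: str
    lang: str

# stand-in for the dict returned by get_benchmarks_enum('AIR-Bench_24.04')
_qa_benchmark_dict = {
    "wiki_en": Benchmark("wiki_en", "ndcg_at_10", "wiki", "en"),
    "law_zh": Benchmark("law_zh", "ndcg_at_10", "law", "zh"),
}

QABenchmarks = Enum("QABenchmarks", _qa_benchmark_dict)

member = QABenchmarks["wiki_en"]      # members are addressable by dict key ...
print(member.name)                    # 'wiki_en'
print(member.value.domain)            # ... and the dict value sits on .value -> 'wiki'
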
src/display/columns.py CHANGED
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass

-from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
+from src.benchmarks import QABenchmarks, LongDocBenchmarks

 COL_NAME_AVG = "Average ⬆️"
 COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
@@ -66,7 +66,7 @@ def get_default_auto_eval_column_dict():
     return auto_eval_column_dict


-def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
+def make_autoevalcolumn(cls_name="QABenchmarks", benchmarks=QABenchmarks):
     auto_eval_column_dict = get_default_auto_eval_column_dict()
     # Leaderboard columns
     for benchmark in benchmarks:
@@ -79,9 +79,9 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):


 AutoEvalColumnQA = make_autoevalcolumn(
-    "AutoEvalColumnQA", BenchmarksQA)
+    "AutoEvalColumnQA", QABenchmarks)
 AutoEvalColumnLongDoc = make_autoevalcolumn(
-    "AutoEvalColumnLongDoc", BenchmarksLongDoc)
+    "AutoEvalColumnLongDoc", LongDocBenchmarks)

 fixed_cols = get_default_auto_eval_column_dict()[:-3]

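Note: this diff only shows fragments of make_autoevalcolumn; the rest of its body is unchanged and not visible here. As a rough sketch of the pattern it follows (one field per benchmark assembled with make_dataclass), using a hypothetical ColumnContent holder and a string-valued stand-in enum, not the actual implementation:

from dataclasses import dataclass, make_dataclass
from enum import Enum

@dataclass(frozen=True)
class ColumnContent:                  # hypothetical per-column metadata holder
    name: str
    type: str

def make_autoevalcolumn(cls_name, benchmarks):
    # one field per benchmark, keyed by the enum member name
    fields = [
        (b.name, ColumnContent, ColumnContent(b.value, "number")) for b in benchmarks
    ]
    return make_dataclass(cls_name, fields)

# tiny stand-in enum; member values are plain display strings here for brevity
Demo = Enum("Demo", {"wiki_en": "wiki_en (ndcg_at_10)", "law_zh": "law_zh (ndcg_at_10)"})

AutoEvalColumnQA = make_autoevalcolumn("AutoEvalColumnQA", Demo)
print(list(AutoEvalColumnQA.__dataclass_fields__))  # ['wiki_en', 'law_zh']
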
src/utils.py CHANGED
@@ -6,7 +6,7 @@ from typing import List

 import pandas as pd

-from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
+from src.benchmarks import QABenchmarks, LongDocBenchmarks
 from src.display.formatting import styled_message, styled_error
 from src.display.columns import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
     COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS, COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, \
@@ -68,11 +68,11 @@ def get_default_cols(task: str, columns: list=[], add_fix_cols: bool=True) -> li
     if task == "qa":
         cols_list = COLS_QA
         types_list = TYPES_QA
-        benchmark_list = [c.value.col_name for c in list(BenchmarksQA)]
+        benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
     elif task == "long-doc":
         cols_list = COLS_LONG_DOC
         types_list = TYPES_LONG_DOC
-        benchmark_list = [c.value.col_name for c in list(BenchmarksLongDoc)]
+        benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
     else:
         raise NotImplemented
     for col_name, col_type in zip(cols_list, types_list):
@@ -110,9 +110,9 @@ def select_columns(
     selected_cols = []
     for c in cols:
         if task == "qa":
-            eval_col = BenchmarksQA[c].value
+            eval_col = QABenchmarks[c].value
         elif task == "long-doc":
-            eval_col = BenchmarksLongDoc[c].value
+            eval_col = LongDocBenchmarks[c].value
         if eval_col.domain not in domain_query:
             continue
         if eval_col.lang not in language_query:
@@ -329,10 +329,10 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     cols = [COL_NAME_IS_ANONYMOUS, ]
     if task == "qa":
         cols += COLS_QA
-        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
+        benchmark_cols = [t.value.col_name for t in QABenchmarks]
     elif task == "long-doc":
         cols += COLS_LONG_DOC
-        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
+        benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
     else:
         raise NotImplemented
     all_data_json = []
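Note: the select_columns hunk relies on Enum lookup by member name: the column label c doubles as the enum key, so QABenchmarks[c].value recovers the benchmark metadata used for domain/language filtering. A simplified, self-contained sketch of that step (the Benchmark namedtuple, the sample entries, and the helper name select_benchmark_cols are illustrative assumptions):

from collections import namedtuple
from enum import Enum

# assumed member value shape; the real benchmarks also carry a metric and task
Benchmark = namedtuple("Benchmark", ["col_name", "domain", "lang"])

QABenchmarks = Enum("QABenchmarks", {
    "wiki_en": Benchmark("wiki_en", "wiki", "en"),
    "law_zh": Benchmark("law_zh", "law", "zh"),
})

def select_benchmark_cols(cols, domain_query, language_query):
    # keep only columns whose benchmark matches the requested domains and languages
    selected = []
    for c in cols:
        eval_col = QABenchmarks[c].value   # lookup by member name, as in select_columns
        if eval_col.domain not in domain_query:
            continue
        if eval_col.lang not in language_query:
            continue
        selected.append(eval_col.col_name)
    return selected

print(select_benchmark_cols(["wiki_en", "law_zh"], ["wiki"], ["en"]))  # ['wiki_en']
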
tests/src/test_benchmarks.py CHANGED
@@ -1,11 +1,11 @@
-from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
+from src.benchmarks import QABenchmarks, LongDocBenchmarks


 def test_qabenchmarks():
-    print(list(BenchmarksQA))
-    for benchmark in list(BenchmarksQA):
+    print(list(QABenchmarks))
+    for benchmark in list(QABenchmarks):
         print(benchmark.name, benchmark.metric, benchmark.col_name, benchmark.domain, benchmark.lang, benchmark.task)


 def test_longdocbenchmarks():
-    print(list(BenchmarksLongDoc))
+    print(list(LongDocBenchmarks))