AIR-Bench

nan committed
Commit e8879cc
1 Parent(s): 9c49811

feat: adapt UI in app.py

Files changed (5)
  1. app.py +79 -93
  2. src/benchmarks.py +4 -1
  3. src/envs.py +4 -4
  4. src/populate.py +5 -3
  5. tests/src/test_populate.py +2 -2
app.py CHANGED
@@ -18,28 +18,28 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_leaderboard_df
 from utils import update_table
+from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, metric_list
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 
 raw_data_qa, original_df_qa = get_leaderboard_df(
     EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_1')
@@ -58,7 +58,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("QA", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -67,56 +67,49 @@ with demo:
                             show_label=False,
                             elem_id="search-bar",
                         )
+                    # select domain
                     with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumnQA)
-                                if not c.hidden and not c.never_hidden
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumnQA)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
+                        selected_domains = gr.CheckboxGroup(
+                            choices=DOMAIN_COLS_QA,
+                            value=DOMAIN_COLS_QA,
+                            label="Select the domains",
+                            elem_id="domain-column-select",
                             interactive=True,
                         )
+                    # select language
+                    with gr.Row():
+                        selected_langs = gr.CheckboxGroup(
+                            choices=LANG_COLS_QA,
+                            value=LANG_COLS_QA,
+                            label="Select the languages",
+                            elem_id="language-column-select",
+                            interactive=True
+                        )
+                    # select reranking models
+                    reranking_models = list(frozenset([eval_result.retrieval_model for eval_result in raw_data_qa]))
                     with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
+                        selected_rerankings = gr.CheckboxGroup(
+                            choices=reranking_models,
+                            value=reranking_models,
+                            label="Select the reranking models",
+                            elem_id="reranking-select",
+                            interactive=True
                         )
                 with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
+                    selected_metric = gr.Dropdown(
+                        choices=metric_list,
+                        value=metric_list,
+                        label="Select the metric",
                         interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
+                        elem_id="metric-select",
                     )
+            # update shown_columns when selected_langs and selected_domains are changed
+            shown_columns = leaderboard_df.columns
 
+            # reload the leaderboard_df and raw_data when selected_metric is changed
             leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumnQA) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumnQA) if c.never_hidden] + shown_columns.value,
+                value=leaderboard_df,
+                # headers=shown_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -124,41 +117,34 @@ with demo:
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df_qa[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
-                             deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=original_df_qa[COLS],
+            #     headers=COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         selected_rerankings,
+            #         search_bar,
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [shown_columns, selected_rerankings, search_bar]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             selected_rerankings,
+            #             search_bar,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
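
Note: the search/selector wiring (search_bar.submit and selector.change) is commented out in this version of app.py, so the new controls are not yet connected to the table. As a rough sketch of the kind of filtering they would eventually feed (not part of the commit; the "<domain>_<lang>" column naming and the reranker names below are assumptions), the selection logic reduces to a pandas filter:

# Hypothetical helper, not in this commit: filter the QA leaderboard DataFrame
# by the selected domains, languages and reranking models. Assumes benchmark
# columns are named "<domain>_<lang>" (e.g. "wiki_en").
import pandas as pd

def filter_leaderboard(df: pd.DataFrame, domains: list, langs: list, rerankers: list) -> pd.DataFrame:
    fixed_cols = [c for c in df.columns if "_" not in c]  # model / average columns
    bench_cols = [c for c in df.columns
                  if "_" in c and c.split("_")[0] in domains and c.split("_")[-1] in langs]
    rows = df[df["Reranking Model"].isin(rerankers)]       # keep only the selected rerankers
    return rows[fixed_cols + bench_cols]

# Toy usage mirroring the test fixtures (the reranker names are invented):
toy = pd.DataFrame({
    "Retrieval Model": ["bge-m3", "bge-m3"],
    "Reranking Model": ["NoReranker", "bge-reranker-v2-m3"],
    "wiki_en": [0.80, 0.90],
    "wiki_zh": [0.70, 0.85],
})
print(filter_leaderboard(toy, domains=["wiki"], langs=["en"], rerankers=["NoReranker"]))
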
src/benchmarks.py CHANGED
@@ -135,4 +135,7 @@ for task, domain_dict in dataset_dict.items():
 BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
 BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
 
-BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
+BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
+
+DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
+LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
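
For reference, a self-contained sketch of the frozenset-based dedup introduced above. The Benchmark dataclass and the entries here are invented stand-ins; the real qa_benchmark_dict is built from dataset_dict earlier in src/benchmarks.py.

# Standalone illustration only; the field names mirror the attributes used
# above (col_name, domain, lang), but the example values are made up.
from dataclasses import dataclass

@dataclass(frozen=True)
class Benchmark:
    col_name: str
    domain: str
    lang: str

qa_benchmark_dict = {
    "wiki_en": Benchmark("wiki_en", "wiki", "en"),
    "wiki_zh": Benchmark("wiki_zh", "wiki", "zh"),
    "news_en": Benchmark("news_en", "news", "en"),
}

BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]            # ['wiki_en', 'wiki_zh', 'news_en']
DOMAIN_COLS_QA = list(frozenset(c.domain for c in qa_benchmark_dict.values()))  # ['wiki', 'news'] in arbitrary order
LANG_COLS_QA = list(frozenset(c.lang for c in qa_benchmark_dict.values()))      # ['en', 'zh'] in arbitrary order

frozenset removes duplicates but does not preserve insertion order, so the resulting checkbox choices come out in arbitrary order; sorted(set(...)) would give a stable ordering.
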
src/envs.py CHANGED
@@ -17,9 +17,9 @@ RESULTS_REPO = f"{OWNER}/results"
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+EVAL_REQUESTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_requests" # os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = "/Users/nanwang/Codes/huggingface/nan/leaderboard/tests/toydata/test_results" #os.path.join(CACHE_PATH, "eval-results")
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
src/populate.py CHANGED
@@ -17,13 +17,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         all_data_json += v.to_dict(task=task, metric=metric)
 
     df = pd.DataFrame.from_records(all_data_json)
-    df[AutoEvalColumnQA.average.name] = df[benchmark_cols].mean(axis=1)
+    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+    df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1)
     df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
     df.reset_index(inplace=True)
-    df = df[cols].round(decimals=2)
+    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
+    df = df[_cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df[has_no_nan_values(df, _benchmark_cols)]
     return raw_data, df
 
 
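
The intersection guard above is what lets get_leaderboard_df tolerate benchmark columns that have no results yet. A toy, self-contained illustration (the column names are taken from the test fixtures, the values are invented):

# Intersecting the requested benchmark columns with the columns actually present
# avoids a KeyError when some benchmarks have not been produced.
import pandas as pd

df = pd.DataFrame({"Retrieval Model": ["bge-m3"], "wiki_en": [0.8]})
benchmark_cols = ["wiki_en", "wiki_zh"]   # "wiki_zh" is requested but missing from the results

# df[benchmark_cols] would raise a KeyError because of "wiki_zh"
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
df["Average ⬆️"] = df[list(_benchmark_cols)].mean(axis=1)
print(df)  # the average is computed only over the columns that exist
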
tests/src/test_populate.py CHANGED
@@ -9,9 +9,9 @@ def test_get_leaderboard_df():
     results_path = cur_fp.parents[1] / "toydata" / "test_results"
     cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
     benchmark_cols = ['wiki_en', 'wiki_zh',]
-    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
+    raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols, 'qa', 'ndcg_at_1')
     assert df.shape[0] == 2
-    # the results contains only one embedding model
+    # the results contain only one embedding model
     for i in range(2):
         assert df["Retrieval Model"][i] == "bge-m3"
     # the results contains only two reranking model