yzabc007 committed
Commit da96aa6 · 1 Parent(s): 826f447

Update space

Files changed (4):
  1. app.py +8 -7
  2. src/about.py +6 -0
  3. src/leaderboard/read_evals.py +4 -0
  4. src/populate.py +20 -0
app.py CHANGED
@@ -11,6 +11,7 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    COMING_SOON_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -136,30 +137,30 @@ with demo:
 
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
             with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                leaderboard = overall_leaderboard(model_leaderboard_df)
 
             with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                leaderboard = overall_leaderboard(model_leaderboard_df)
 
             with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
 
             with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                leaderboard = overall_leaderboard(model_leaderboard_df)
 
             with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
-                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
         with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=4):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
 
 
         with gr.TabItem("🔬 Science", elem_id="science-table", id=5):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
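Note that `overall_leaderboard` and `model_leaderboard_df` are referenced here but not defined in this diff; they come from elsewhere in app.py and src/populate.py. As a minimal sketch of the assumed wiring (illustrative only, not the Space's actual implementation), `overall_leaderboard` could be a thin wrapper that renders an already-prepared DataFrame:

```python
# Illustrative sketch only -- the real overall_leaderboard lives elsewhere
# in this Space and may differ. Assumes the helper simply wraps a read-only
# gr.Dataframe around a DataFrame built by src/populate.py.
import gradio as gr
import pandas as pd


def overall_leaderboard(df: pd.DataFrame) -> gr.Dataframe:
    """Render an already-prepared leaderboard DataFrame as a static table."""
    return gr.Dataframe(
        value=df,
        datatype=["markdown"] + ["number"] * (len(df.columns) - 1),
        interactive=False,
    )
```

As committed, every sub-tab (Algebra, Geometry, Probability, Logical, Social) receives the same `model_leaderboard_df`, so all five tables show identical contents until per-domain DataFrames are passed in.
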
src/about.py CHANGED
@@ -56,6 +56,12 @@ To reproduce our results, here is the commands you can run:
 
 """
 
+COMING_SOON_TEXT = """
+# Coming soon
+We are working on adding more tasks to the leaderboard. Stay tuned!
+"""
+
+
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 
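The new constant is only consumed in app.py (the Coding and Science tabs above render it via `gr.Markdown`). A quick standalone preview of the placeholder, assuming a local Gradio install, could look like:

```python
# Minimal preview of the placeholder text, outside the full leaderboard app.
import gradio as gr

from src.about import COMING_SOON_TEXT

with gr.Blocks() as demo:
    gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```
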
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,10 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Domains
 from src.submission.check_validity import is_model_on_hub
 
+# @dataclass
+# class RankResult:
+
+
 
 @dataclass
 class ModelResult:
src/populate.py CHANGED
@@ -8,6 +8,26 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
 
 
+
+def get_overview_leaderboard_df(results_path: str) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    for col in cols:
+        if col not in df.columns:
+            df[col] = None
+        else:
+            df[col] = df[col].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+
 def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[]) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_model_results(results_path)
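One caveat with the new `get_overview_leaderboard_df`: its body references `requests_path`, `cols`, `benchmark_cols`, and `has_no_nan_values`, none of which are bound by its single `results_path` parameter (the last is presumably imported near the top of populate.py), so a call would likely fail with a NameError unless those names happen to exist at module scope. A self-contained version would mirror the signature of `get_model_leaderboard_df` directly below it; a sketch, with the import paths assumed from this repo's layout:

```python
# Sketch only -- parameters added so the function is self-contained; the
# committed version omits them and relies on names that are not in scope.
import pandas as pd

from src.display.formatting import has_no_nan_values  # assumed import path
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_overview_leaderboard_df(
    results_path: str,
    requests_path: str = "",
    cols: list = [],
    benchmark_cols: list = [],
) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results."""
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)

    # Keep requested columns, rounding the ones that are present.
    for col in cols:
        if col not in df.columns:
            df[col] = None
        else:
            df[col] = df[col].round(decimals=2)

    # Filter out rows where any benchmark has not been produced.
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
```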