Commit f745515 by chivier
Parent(s): fe8e6f7

sync from github
src/display/about.py CHANGED
@@ -10,9 +10,9 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
 
 
 Tasks:
-- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
 - **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
+- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
 
 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
src/display/utils.py CHANGED
@@ -37,9 +37,7 @@ gpu_metrics_to_name_map = {
     GPU_Mem: GPU_Mem,
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
-    GPU_Name: GPU_Name,
-    MFU: MFU,
-    MBU: MBU
+    GPU_Name: GPU_Name
 }
 
 @dataclass
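The rename map now keeps only the GPU name entry and drops MFU and MBU. A minimal sketch of how such a key-rename map is typically applied to raw metrics before display; the constant values and the raw dict below are hypothetical stand-ins, not taken from this repo:

# Hypothetical stand-ins for the repo's display constants.
GPU_Mem, GPU_Name = "Mem (GB)", "GPU"
BATCH_SIZE, PRECISION = "Batch", "Precision"

gpu_metrics_to_name_map = {
    GPU_Mem: GPU_Mem,
    "batch_size": BATCH_SIZE,
    "precision": PRECISION,
    GPU_Name: GPU_Name
}

# Rename raw metric keys to their display names, keeping unknown keys as-is.
raw = {"Mem (GB)": 40, "batch_size": 8, "precision": "bf16", "GPU": "A100"}
display = {gpu_metrics_to_name_map.get(k, k): v for k, v in raw.items()}
print(display)  # {'Mem (GB)': 40, 'Batch': 8, 'Precision': 'bf16', 'GPU': 'A100'}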
src/leaderboard/read_evals.py CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model}_{precision.value.name}_{inference_framework}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
         full_model = "/".join(org_and_model)
 
         still_on_hub, error, model_config = is_model_on_hub(
@@ -120,12 +120,13 @@ class EvalResult:
                 multiplier = 1.0
                 if "batch_" in metric or "Mem" in metric or "Util" in metric:
                     multiplier = 1
-
-
+
                 # print('RESULTS', data['results'])
                 # print('XXX', benchmark, metric, value, multiplier)
                 if value == "N/A":
-                    results[benchmark][metric] = None
+                    results[benchmark][metric] = "-"
+                elif value == "auto":
+                    results[benchmark][metric] = "auto"
                 else:
                     results[benchmark][metric] = value * multiplier
 
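Two behavioral changes here: result_key now embeds the inference framework, so two runs of the same model under different frameworks no longer collide when results are keyed, and a missing metric value is stored as the display string "-" (with "auto" passed through) instead of None. A minimal sketch of the key collision the first change avoids; the model and framework names are illustrative only:

def old_key(org, model, prec):
    return f"{org}_{model}_{prec}"

def new_key(org, model, prec, fw):
    return f"{org}_{model}_{prec}_{fw}"

runs = [
    ("mistralai", "Mixtral-8x7B", "bfloat16", "vllm"),
    ("mistralai", "Mixtral-8x7B", "bfloat16", "hf"),
]
print(len({old_key(o, m, p) for o, m, p, _ in runs}))     # 1 -- second run overwrites the first
print(len({new_key(o, m, p, f) for o, m, p, f in runs}))  # 2 -- both runs are kept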
src/populate.py CHANGED
@@ -75,7 +75,7 @@ def get_leaderboard_df(
         df[col] = np.nan
 
     if not df.empty:
-        df = df.round(decimals=4)
+        df = df.round(decimals=2)
 
         # filter out if any of the benchmarks have not been produced
         # df = df[has_no_nan_values(df, benchmark_cols)]
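Rounding to two decimals is a display-only change. Note that pandas' DataFrame.round touches numeric columns only, so the "-" placeholders introduced in read_evals.py make their column object-typed and leave it unrounded. A quick sketch with made-up values:

import pandas as pd

df = pd.DataFrame({"MMLU": [0.71234, 0.69871], "GSM8K": [0.51234, "-"]})
print(df.round(decimals=2))
#    MMLU    GSM8K
# 0  0.71  0.51234
# 1  0.70        -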