sync from github

Changed files:
- src/display/about.py           +1 -1
- src/display/utils.py           +1 -3
- src/leaderboard/read_evals.py  +6 -5
- src/populate.py                +1 -1
src/display/about.py CHANGED

@@ -10,9 +10,9 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
 
 
 Tasks:
-- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
 - **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
+- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
 
 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
src/display/utils.py CHANGED

@@ -37,9 +37,7 @@ gpu_metrics_to_name_map = {
     GPU_Mem: GPU_Mem,
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
-    GPU_Name: GPU_Name
-    MFU: MFU,
-    MBU: MBU
+    GPU_Name: GPU_Name
 }
 
 @dataclass
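For context, `gpu_metrics_to_name_map` maps raw metric keys from result files to the leaderboard's display column names; after this change only GPU memory, batch size, precision, and GPU name are mapped, so MFU and MBU entries are no longer translated. Below is a minimal sketch of how such a lookup map is typically applied; the string display names, the `raw` dict, and the `rename_gpu_metrics` helper are illustrative assumptions, not code from this repository.

```python
# Illustrative sketch, not repository code: renaming raw GPU metric keys to
# display column names via a map like gpu_metrics_to_name_map.
GPU_Mem = "GPU Memory (GB)"   # assumed display strings; the real values
GPU_Name = "GPU"              # are defined in src/display/utils.py
BATCH_SIZE = "Batch Size"
PRECISION = "Precision"

gpu_metrics_to_name_map = {
    "gpu_mem": GPU_Mem,
    "batch_size": BATCH_SIZE,
    "precision": PRECISION,
    "gpu_name": GPU_Name,
}

def rename_gpu_metrics(raw: dict) -> dict:
    """Keep only mapped metrics and translate their keys to display names."""
    return {gpu_metrics_to_name_map[k]: v
            for k, v in raw.items()
            if k in gpu_metrics_to_name_map}

# Metrics without a mapping (e.g. "mfu", "mbu" after this commit) are dropped.
print(rename_gpu_metrics({"gpu_mem": 37.4, "batch_size": 8, "mfu": 0.41}))
# {'GPU Memory (GB)': 37.4, 'Batch Size': 8}
```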
src/leaderboard/read_evals.py CHANGED

@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model}_{precision.value.name}_{inference_framework}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
         full_model = "/".join(org_and_model)
 
         still_on_hub, error, model_config = is_model_on_hub(

@@ -120,12 +120,13 @@ class EvalResult:
         multiplier = 1.0
         if "batch_" in metric or "Mem" in metric or "Util" in metric:
             multiplier = 1
-
-
+
         # print('RESULTS', data['results'])
         # print('XXX', benchmark, metric, value, multiplier)
         if value == "N/A":
-            results[benchmark][metric] =
+            results[benchmark][metric] = "-"
+        elif value == "auto":
+            results[benchmark][metric] = "auto"
         else:
             results[benchmark][metric] = value * multiplier
 
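The practical effect of the first hunk is that two runs of the same model and precision under different inference frameworks no longer collide on the same `result_key`, so both rows survive when results are collected; the second hunk stores missing values as "-" and passes "auto" through instead of trying to multiply them. A minimal sketch of the new key construction follows, assuming `inference_framework` is a plain string; the framework labels and the model name are illustrative, not values from this diff.

```python
# Illustrative sketch, not repository code: why appending the inference
# framework to result_key keeps otherwise identical runs distinct.
def make_result_key(org, model, precision, inference_framework):
    if org is None:
        return f"{model}_{precision}_{inference_framework}"
    return f"{org}_{model}_{precision}_{inference_framework}"

# Before this change both calls produced "mistralai_Mixtral-8x7B_float16",
# so the second run would have overwritten the first in the results dict.
print(make_result_key("mistralai", "Mixtral-8x7B", "float16", "vllm"))
print(make_result_key("mistralai", "Mixtral-8x7B", "float16", "hf"))
```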
src/populate.py CHANGED

@@ -75,7 +75,7 @@ def get_leaderboard_df(
             df[col] = np.nan
 
     if not df.empty:
-        df = df.round(decimals=
+        df = df.round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
    # df = df[has_no_nan_values(df, benchmark_cols)]