Commit f745515 by chivier
Parent(s): fe8e6f7

sync from github
src/display/about.py CHANGED
@@ -10,9 +10,9 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
 
 
 Tasks:
-- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
 - **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
+- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
 
 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
src/display/utils.py CHANGED
@@ -37,9 +37,7 @@ gpu_metrics_to_name_map = {
     GPU_Mem: GPU_Mem,
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
-    GPU_Name: GPU_Name,
-    MFU: MFU,
-    MBU: MBU
+    GPU_Name: GPU_Name
 }
 
 @dataclass
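The rename map now keeps only the GPU name entry and drops MFU and MBU. A minimal sketch of how such a key-rename map is typically applied to raw metrics before display; the constant values and the raw dict below are hypothetical stand-ins, not taken from this repo:

# Hypothetical stand-ins for the repo's display constants.
GPU_Mem, GPU_Name = "Mem (GB)", "GPU"
BATCH_SIZE, PRECISION = "Batch", "Precision"

gpu_metrics_to_name_map = {
    GPU_Mem: GPU_Mem,
    "batch_size": BATCH_SIZE,
    "precision": PRECISION,
    GPU_Name: GPU_Name
}

# Rename raw metric keys to their display names, keeping unknown keys as-is.
raw = {"Mem (GB)": 40, "batch_size": 8, "precision": "bf16", "GPU": "A100"}
display = {gpu_metrics_to_name_map.get(k, k): v for k, v in raw.items()}
print(display)  # {'Mem (GB)': 40, 'Batch': 8, 'Precision': 'bf16', 'GPU': 'A100'}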
src/leaderboard/read_evals.py CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
+            result_key = f"{model}_{precision.value.name}_{inference_framework}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+            result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
         full_model = "/".join(org_and_model)
 
         still_on_hub, error, model_config = is_model_on_hub(
@@ -120,12 +120,13 @@ class EvalResult:
                 multiplier = 1.0
                 if "batch_" in metric or "Mem" in metric or "Util" in metric:
                     multiplier = 1
-
-
+
                 # print('RESULTS', data['results'])
                 # print('XXX', benchmark, metric, value, multiplier)
                 if value == "N/A":
-                    results[benchmark][metric] = None
+                    results[benchmark][metric] = "-"
+                elif value == "auto":
+                    results[benchmark][metric] = "auto"
                 else:
                     results[benchmark][metric] = value * multiplier
 
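Two behavioral changes here: result_key now embeds the inference framework, so two runs of the same model under different frameworks no longer collide when results are keyed, and a missing metric value is stored as the display string "-" (with "auto" passed through) instead of None. A minimal sketch of the key collision the first change avoids; the model and framework names are illustrative only:

def old_key(org, model, prec):
    return f"{org}_{model}_{prec}"

def new_key(org, model, prec, fw):
    return f"{org}_{model}_{prec}_{fw}"

runs = [
    ("mistralai", "Mixtral-8x7B", "bfloat16", "vllm"),
    ("mistralai", "Mixtral-8x7B", "bfloat16", "hf"),
]
print(len({old_key(o, m, p) for o, m, p, _ in runs}))     # 1 -- second run overwrites the first
print(len({new_key(o, m, p, f) for o, m, p, f in runs}))  # 2 -- both runs are kept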
src/populate.py CHANGED
@@ -75,7 +75,7 @@ def get_leaderboard_df(
         df[col] = np.nan
 
     if not df.empty:
-        df = df.round(decimals=4)
+        df = df.round(decimals=2)
 
         # filter out if any of the benchmarks have not been produced
         # df = df[has_no_nan_values(df, benchmark_cols)]
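Rounding to two decimals is a display-only change. Note that pandas' DataFrame.round touches numeric columns only, so the "-" placeholders introduced in read_evals.py make their column object-typed and leave it unrounded. A quick sketch with made-up values:

import pandas as pd

df = pd.DataFrame({"MMLU": [0.71234, 0.69871], "GSM8K": [0.51234, "-"]})
print(df.round(decimals=2))
#    MMLU    GSM8K
# 0  0.71  0.51234
# 1  0.70        -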