sync from github
Browse files
- src/backend/envs.py +1 -1
- src/display/about.py +3 -0
- src/display/utils.py +1 -1
src/backend/envs.py
CHANGED
@@ -58,7 +58,7 @@ class Tasks(Enum):
|
|
58 |
# task20 = Task("race", "acc", "RACE", 0)
|
59 |
task21 = Task("mmlu", "acc", "MMLU", 5)
|
60 |
task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
|
61 |
-
task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
|
62 |
|
63 |
|
64 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
|
|
58 |
# task20 = Task("race", "acc", "RACE", 0)
|
59 |
task21 = Task("mmlu", "acc", "MMLU", 5)
|
60 |
task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
|
61 |
+
# task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
|
62 |
|
63 |
|
64 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
src/display/about.py
CHANGED
@@ -12,12 +12,15 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
|
|
12 |
Tasks:
|
13 |
- **Generation Self-consistency** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
|
14 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
|
|
15 |
|
16 |
Columns and Metrics:
|
17 |
- Method: The MOE LLMs inference framework.
|
18 |
- E2E(s): Average End to End generation time in seconds.
|
19 |
- PRE(s): Prefilling Time of input prompt in seconds.
|
20 |
- T/s: Throughput in tokens per second.
|
|
|
|
|
21 |
- Precision: The precision of the model used.
|
22 |
|
23 |
"""
|
|
|
12 |
Tasks:
|
13 |
- **Generation Self-consistency** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
|
14 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
15 |
+
- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
|
16 |
|
17 |
Columns and Metrics:
|
18 |
- Method: The MOE LLMs inference framework.
|
19 |
- E2E(s): Average End to End generation time in seconds.
|
20 |
- PRE(s): Prefilling Time of input prompt in seconds.
|
21 |
- T/s: Throughput in tokens per second.
|
22 |
+
- MBU(%): Model Bandwidth Utilization.
|
23 |
+
- MFU(%): Model FLOPs Utilization.
|
24 |
- Precision: The precision of the model used.
|
25 |
|
26 |
"""
|
src/display/utils.py
CHANGED
@@ -82,7 +82,7 @@ class Tasks(Enum):
|
|
82 |
selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
|
83 |
mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
|
84 |
gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
|
85 |
-
gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
|
86 |
|
87 |
|
88 |
# These classes are for user facing column names,
|
|
|
82 |
selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
|
83 |
mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
|
84 |
gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
|
85 |
+
# gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
|
86 |
|
87 |
|
88 |
# These classes are for user facing column names,
|