# Benchmark Scores
| Tasks         | Version | Filter | n-shot | Metric   | Value  |   | Stderr |
|---------------|--------:|--------|-------:|----------|-------:|---|-------:|
| arc_challenge |       1 | none   |      0 | acc      | 0.5247 | ± | 0.0146 |
|               |         | none   |      0 | acc_norm | 0.5623 | ± | 0.0145 |
| Tasks     | Version | Filter | n-shot | Metric   | Value  |   | Stderr |
|-----------|--------:|--------|-------:|----------|-------:|---|-------:|
| hellaswag |       1 | none   |      0 | acc      | 0.6270 | ± | 0.0048 |
|           |         | none   |      0 | acc_norm | 0.8228 | ± | 0.0038 |
| Groups            | Version | Filter | n-shot | Metric | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|-------:|---|-------:|
| mmlu              |     N/A | none   |      0 | acc    | 0.6243 | ± | 0.1341 |
| - humanities      |     N/A | none   |      0 | acc    | 0.5717 | ± | 0.1400 |
| - other           |     N/A | none   |      0 | acc    | 0.7016 | ± | 0.1143 |
| - social_sciences |     N/A | none   |      0 | acc    | 0.7342 | ± | 0.0753 |
| - stem            |     N/A | none   |      0 | acc    | 0.5192 | ± | 0.1257 |
| Tasks      | Version | Filter | n-shot | Metric | Value  |   | Stderr |
|------------|--------:|--------|-------:|--------|-------:|---|-------:|
| winogrande |       1 | none   |      0 | acc    | 0.7774 | ± | 0.0117 |
| Tasks | Version | Filter     | n-shot | Metric      | Value  |   | Stderr |
|-------|--------:|------------|-------:|-------------|-------:|---|-------:|
| gsm8k |       2 | get-answer |      5 | exact_match | 0.6732 | ± | 0.0129 |
| Tasks          | Version | Filter | n-shot | Metric | Value  |   | Stderr |
|----------------|--------:|--------|-------:|--------|-------:|---|-------:|
| truthfulqa_mc2 |       2 | none   |      0 | acc    | 0.4795 | ± | 0.0148 |
**Average:** 65.658