abhinavnmagic commited on
Commit
3b4c6eb
·
verified ·
1 Parent(s): 7ed1b70

Upload folder using huggingface_hub

Browse files
Files changed (41) hide show
  1. arc_challenge/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-21-44.990894.json +102 -0
  2. config.json +94 -0
  3. generation_config.json +6 -0
  4. gsm8k/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T16-23-36.584670.json +138 -0
  5. hellaswag/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T11-27-16.957594.json +100 -0
  6. mmlu/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T16-13-28.474390.json +3154 -0
  7. model-00001-of-00029.safetensors +3 -0
  8. model-00002-of-00029.safetensors +3 -0
  9. model-00003-of-00029.safetensors +3 -0
  10. model-00004-of-00029.safetensors +3 -0
  11. model-00005-of-00029.safetensors +3 -0
  12. model-00006-of-00029.safetensors +3 -0
  13. model-00007-of-00029.safetensors +3 -0
  14. model-00008-of-00029.safetensors +3 -0
  15. model-00009-of-00029.safetensors +3 -0
  16. model-00010-of-00029.safetensors +3 -0
  17. model-00011-of-00029.safetensors +3 -0
  18. model-00012-of-00029.safetensors +3 -0
  19. model-00013-of-00029.safetensors +3 -0
  20. model-00014-of-00029.safetensors +3 -0
  21. model-00015-of-00029.safetensors +3 -0
  22. model-00016-of-00029.safetensors +3 -0
  23. model-00017-of-00029.safetensors +3 -0
  24. model-00018-of-00029.safetensors +3 -0
  25. model-00019-of-00029.safetensors +3 -0
  26. model-00020-of-00029.safetensors +3 -0
  27. model-00021-of-00029.safetensors +3 -0
  28. model-00022-of-00029.safetensors +3 -0
  29. model-00023-of-00029.safetensors +3 -0
  30. model-00024-of-00029.safetensors +3 -0
  31. model-00025-of-00029.safetensors +3 -0
  32. model-00026-of-00029.safetensors +3 -0
  33. model-00027-of-00029.safetensors +3 -0
  34. model-00028-of-00029.safetensors +3 -0
  35. model-00029-of-00029.safetensors +3 -0
  36. model.safetensors.index.json +0 -0
  37. special_tokens_map.json +23 -0
  38. tokenizer.json +0 -0
  39. tokenizer_config.json +108 -0
  40. truthfulqa_mc2/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-32-59.669961.json +93 -0
  41. winogrande/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-25-09.600505.json +90 -0
arc_challenge/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-21-44.990894.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.6476109215017065,
5
+ "acc_stderr,none": 0.013960142600598666,
6
+ "acc_norm,none": 0.6919795221843004,
7
+ "acc_norm_stderr,none": 0.013491429517292038,
8
+ "alias": "arc_challenge"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_challenge": []
13
+ },
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "group": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Challenge",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "doc_to_choice": "{{choices.text}}",
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 25,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48
+ "metadata": {
49
+ "version": 1.0
50
+ }
51
+ }
52
+ },
53
+ "versions": {
54
+ "arc_challenge": 1.0
55
+ },
56
+ "n-shot": {
57
+ "arc_challenge": 25
58
+ },
59
+ "higher_is_better": {
60
+ "arc_challenge": {
61
+ "acc": true,
62
+ "acc_norm": true
63
+ }
64
+ },
65
+ "n-samples": {
66
+ "arc_challenge": {
67
+ "original": 1172,
68
+ "effective": 1172
69
+ }
70
+ },
71
+ "config": {
72
+ "model": "vllm",
73
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
74
+ "batch_size": "auto",
75
+ "batch_sizes": [],
76
+ "device": "cuda",
77
+ "use_cache": null,
78
+ "limit": null,
79
+ "bootstrap_iters": 100000,
80
+ "gen_kwargs": null,
81
+ "random_seed": 0,
82
+ "numpy_seed": 1234,
83
+ "torch_seed": 1234,
84
+ "fewshot_seed": 1234
85
+ },
86
+ "git_hash": "f2843b2f",
87
+ "date": 1717743073.8326726,
88
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
89
+ "transformers_version": "4.41.2",
90
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
91
+ "task_hashes": {},
92
+ "model_source": "vllm",
93
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
94
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
95
+ "system_instruction": null,
96
+ "system_instruction_sha": null,
97
+ "chat_template": null,
98
+ "chat_template_sha": null,
99
+ "start_time": 813352.581331609,
100
+ "end_time": 815189.703240481,
101
+ "total_evaluation_time_seconds": "1837.1219088719226"
102
+ }
config.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mistralai/Mixtral-8x22B-Instruct-v0.1",
3
+ "architectures": [
4
+ "MixtralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 6144,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 16384,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "mixtral",
15
+ "num_attention_heads": 48,
16
+ "num_experts_per_tok": 2,
17
+ "num_hidden_layers": 56,
18
+ "num_key_value_heads": 8,
19
+ "num_local_experts": 8,
20
+ "output_router_logits": false,
21
+ "quantization_config": {
22
+ "activation_scheme": "static",
23
+ "ignored_layers": [
24
+ "model.layers.9.block_sparse_moe.gate",
25
+ "model.layers.21.block_sparse_moe.gate",
26
+ "model.layers.17.block_sparse_moe.gate",
27
+ "model.layers.34.block_sparse_moe.gate",
28
+ "lm_head",
29
+ "model.layers.7.block_sparse_moe.gate",
30
+ "model.layers.28.block_sparse_moe.gate",
31
+ "model.layers.40.block_sparse_moe.gate",
32
+ "model.layers.4.block_sparse_moe.gate",
33
+ "model.layers.12.block_sparse_moe.gate",
34
+ "model.layers.15.block_sparse_moe.gate",
35
+ "model.layers.44.block_sparse_moe.gate",
36
+ "model.layers.26.block_sparse_moe.gate",
37
+ "model.layers.38.block_sparse_moe.gate",
38
+ "model.layers.47.block_sparse_moe.gate",
39
+ "model.layers.27.block_sparse_moe.gate",
40
+ "model.layers.6.block_sparse_moe.gate",
41
+ "model.layers.5.block_sparse_moe.gate",
42
+ "model.layers.11.block_sparse_moe.gate",
43
+ "model.layers.10.block_sparse_moe.gate",
44
+ "model.layers.54.block_sparse_moe.gate",
45
+ "model.layers.25.block_sparse_moe.gate",
46
+ "model.layers.1.block_sparse_moe.gate",
47
+ "model.layers.41.block_sparse_moe.gate",
48
+ "model.layers.33.block_sparse_moe.gate",
49
+ "model.layers.45.block_sparse_moe.gate",
50
+ "model.layers.14.block_sparse_moe.gate",
51
+ "model.layers.2.block_sparse_moe.gate",
52
+ "model.layers.52.block_sparse_moe.gate",
53
+ "model.layers.24.block_sparse_moe.gate",
54
+ "model.layers.43.block_sparse_moe.gate",
55
+ "model.layers.48.block_sparse_moe.gate",
56
+ "model.layers.29.block_sparse_moe.gate",
57
+ "model.layers.35.block_sparse_moe.gate",
58
+ "model.layers.18.block_sparse_moe.gate",
59
+ "model.layers.50.block_sparse_moe.gate",
60
+ "model.layers.0.block_sparse_moe.gate",
61
+ "model.layers.8.block_sparse_moe.gate",
62
+ "model.layers.23.block_sparse_moe.gate",
63
+ "model.layers.49.block_sparse_moe.gate",
64
+ "model.layers.42.block_sparse_moe.gate",
65
+ "model.layers.22.block_sparse_moe.gate",
66
+ "model.layers.39.block_sparse_moe.gate",
67
+ "model.layers.51.block_sparse_moe.gate",
68
+ "model.layers.31.block_sparse_moe.gate",
69
+ "model.layers.36.block_sparse_moe.gate",
70
+ "model.layers.32.block_sparse_moe.gate",
71
+ "model.layers.37.block_sparse_moe.gate",
72
+ "model.layers.16.block_sparse_moe.gate",
73
+ "model.layers.46.block_sparse_moe.gate",
74
+ "model.layers.53.block_sparse_moe.gate",
75
+ "model.layers.19.block_sparse_moe.gate",
76
+ "model.layers.3.block_sparse_moe.gate",
77
+ "model.layers.30.block_sparse_moe.gate",
78
+ "model.layers.55.block_sparse_moe.gate",
79
+ "model.layers.20.block_sparse_moe.gate",
80
+ "model.layers.13.block_sparse_moe.gate"
81
+ ],
82
+ "quant_method": "fp8"
83
+ },
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_theta": 1000000.0,
86
+ "router_aux_loss_coef": 0.001,
87
+ "router_jitter_noise": 0.0,
88
+ "sliding_window": null,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "bfloat16",
91
+ "transformers_version": "4.41.2",
92
+ "use_cache": true,
93
+ "vocab_size": 32768
94
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.41.2"
6
+ }
gsm8k/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T16-23-36.584670.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "exact_match,strict-match": 0.7657316148597423,
5
+ "exact_match_stderr,strict-match": 0.01166641512763105,
6
+ "exact_match,flexible-extract": 0.7740712661106899,
7
+ "exact_match_stderr,flexible-extract": 0.01151909877727995,
8
+ "alias": "gsm8k"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k": []
13
+ },
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "group": [
18
+ "math_word_problems"
19
+ ],
20
+ "dataset_path": "gsm8k",
21
+ "dataset_name": "main",
22
+ "training_split": "train",
23
+ "test_split": "test",
24
+ "fewshot_split": "train",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{answer}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 5,
31
+ "metric_list": [
32
+ {
33
+ "metric": "exact_match",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true,
36
+ "ignore_case": true,
37
+ "ignore_punctuation": false,
38
+ "regexes_to_ignore": [
39
+ ",",
40
+ "\\$",
41
+ "(?s).*#### ",
42
+ "\\.$"
43
+ ]
44
+ }
45
+ ],
46
+ "output_type": "generate_until",
47
+ "generation_kwargs": {
48
+ "until": [
49
+ "Question:",
50
+ "</s>",
51
+ "<|im_end|>"
52
+ ],
53
+ "do_sample": false,
54
+ "temperature": 0.0
55
+ },
56
+ "repeats": 1,
57
+ "filter_list": [
58
+ {
59
+ "name": "strict-match",
60
+ "filter": [
61
+ {
62
+ "function": "regex",
63
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
64
+ },
65
+ {
66
+ "function": "take_first"
67
+ }
68
+ ]
69
+ },
70
+ {
71
+ "name": "flexible-extract",
72
+ "filter": [
73
+ {
74
+ "function": "regex",
75
+ "group_select": -1,
76
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
77
+ },
78
+ {
79
+ "function": "take_first"
80
+ }
81
+ ]
82
+ }
83
+ ],
84
+ "should_decontaminate": false,
85
+ "metadata": {
86
+ "version": 3.0
87
+ }
88
+ }
89
+ },
90
+ "versions": {
91
+ "gsm8k": 3.0
92
+ },
93
+ "n-shot": {
94
+ "gsm8k": 5
95
+ },
96
+ "higher_is_better": {
97
+ "gsm8k": {
98
+ "exact_match": true
99
+ }
100
+ },
101
+ "n-samples": {
102
+ "gsm8k": {
103
+ "original": 1319,
104
+ "effective": 1319
105
+ }
106
+ },
107
+ "config": {
108
+ "model": "vllm",
109
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
110
+ "batch_size": "auto",
111
+ "batch_sizes": [],
112
+ "device": "cuda",
113
+ "use_cache": null,
114
+ "limit": null,
115
+ "bootstrap_iters": 100000,
116
+ "gen_kwargs": null,
117
+ "random_seed": 0,
118
+ "numpy_seed": 1234,
119
+ "torch_seed": 1234,
120
+ "fewshot_seed": 1234
121
+ },
122
+ "git_hash": "f2843b2f",
123
+ "date": 1717776830.423203,
124
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
125
+ "transformers_version": "4.41.2",
126
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
127
+ "task_hashes": {},
128
+ "model_source": "vllm",
129
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
130
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
131
+ "system_instruction": null,
132
+ "system_instruction_sha": null,
133
+ "chat_template": null,
134
+ "chat_template_sha": null,
135
+ "start_time": 847108.840543343,
136
+ "end_time": 847701.29731874,
137
+ "total_evaluation_time_seconds": "592.4567753970623"
138
+ }
hellaswag/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T11-27-16.957594.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "acc,none": 0.5985859390559649,
5
+ "acc_stderr,none": 0.004891826692722808,
6
+ "acc_norm,none": 0.8249352718581956,
7
+ "acc_norm_stderr,none": 0.0037924580005235724,
8
+ "alias": "hellaswag"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "group": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
24
+ "doc_to_text": "{{query}}",
25
+ "doc_to_target": "{{label}}",
26
+ "doc_to_choice": "choices",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 10,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": false,
46
+ "metadata": {
47
+ "version": 1.0
48
+ }
49
+ }
50
+ },
51
+ "versions": {
52
+ "hellaswag": 1.0
53
+ },
54
+ "n-shot": {
55
+ "hellaswag": 10
56
+ },
57
+ "higher_is_better": {
58
+ "hellaswag": {
59
+ "acc": true,
60
+ "acc_norm": true
61
+ }
62
+ },
63
+ "n-samples": {
64
+ "hellaswag": {
65
+ "original": 10042,
66
+ "effective": 10042
67
+ }
68
+ },
69
+ "config": {
70
+ "model": "vllm",
71
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
72
+ "batch_size": "auto",
73
+ "batch_sizes": [],
74
+ "device": "cuda",
75
+ "use_cache": null,
76
+ "limit": null,
77
+ "bootstrap_iters": 100000,
78
+ "gen_kwargs": null,
79
+ "random_seed": 0,
80
+ "numpy_seed": 1234,
81
+ "torch_seed": 1234,
82
+ "fewshot_seed": 1234
83
+ },
84
+ "git_hash": "f2843b2f",
85
+ "date": 1717745599.4707556,
86
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
87
+ "transformers_version": "4.41.2",
88
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
89
+ "task_hashes": {},
90
+ "model_source": "vllm",
91
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
92
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
93
+ "system_instruction": null,
94
+ "system_instruction_sha": null,
95
+ "chat_template": null,
96
+ "chat_template_sha": null,
97
+ "start_time": 815877.93595267,
98
+ "end_time": 829921.670261319,
99
+ "total_evaluation_time_seconds": "14043.73430864897"
100
+ }
mmlu/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T16-13-28.474390.json ADDED
@@ -0,0 +1,3154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu": {
4
+ "acc,none": 0.7060959977211223,
5
+ "acc_stderr,none": 0.0036419117884613442,
6
+ "alias": "mmlu"
7
+ },
8
+ "mmlu_humanities": {
9
+ "alias": " - humanities",
10
+ "acc,none": 0.6561105207226355,
11
+ "acc_stderr,none": 0.006537667125056556
12
+ },
13
+ "mmlu_formal_logic": {
14
+ "alias": " - formal_logic",
15
+ "acc,none": 0.5793650793650794,
16
+ "acc_stderr,none": 0.04415438226743745
17
+ },
18
+ "mmlu_high_school_european_history": {
19
+ "alias": " - high_school_european_history",
20
+ "acc,none": 0.8181818181818182,
21
+ "acc_stderr,none": 0.030117688929503582
22
+ },
23
+ "mmlu_high_school_us_history": {
24
+ "alias": " - high_school_us_history",
25
+ "acc,none": 0.8627450980392157,
26
+ "acc_stderr,none": 0.024152225962801577
27
+ },
28
+ "mmlu_high_school_world_history": {
29
+ "alias": " - high_school_world_history",
30
+ "acc,none": 0.8565400843881856,
31
+ "acc_stderr,none": 0.022818291821017012
32
+ },
33
+ "mmlu_international_law": {
34
+ "alias": " - international_law",
35
+ "acc,none": 0.8512396694214877,
36
+ "acc_stderr,none": 0.03248470083807195
37
+ },
38
+ "mmlu_jurisprudence": {
39
+ "alias": " - jurisprudence",
40
+ "acc,none": 0.8518518518518519,
41
+ "acc_stderr,none": 0.03434300243631002
42
+ },
43
+ "mmlu_logical_fallacies": {
44
+ "alias": " - logical_fallacies",
45
+ "acc,none": 0.7975460122699386,
46
+ "acc_stderr,none": 0.031570650789119
47
+ },
48
+ "mmlu_moral_disputes": {
49
+ "alias": " - moral_disputes",
50
+ "acc,none": 0.7947976878612717,
51
+ "acc_stderr,none": 0.021742519835276274
52
+ },
53
+ "mmlu_moral_scenarios": {
54
+ "alias": " - moral_scenarios",
55
+ "acc,none": 0.4983240223463687,
56
+ "acc_stderr,none": 0.016722407608296398
57
+ },
58
+ "mmlu_philosophy": {
59
+ "alias": " - philosophy",
60
+ "acc,none": 0.7942122186495176,
61
+ "acc_stderr,none": 0.022961339906764234
62
+ },
63
+ "mmlu_prehistory": {
64
+ "alias": " - prehistory",
65
+ "acc,none": 0.7993827160493827,
66
+ "acc_stderr,none": 0.0222823139497749
67
+ },
68
+ "mmlu_professional_law": {
69
+ "alias": " - professional_law",
70
+ "acc,none": 0.516297262059974,
71
+ "acc_stderr,none": 0.012763450734699812
72
+ },
73
+ "mmlu_world_religions": {
74
+ "alias": " - world_religions",
75
+ "acc,none": 0.9122807017543859,
76
+ "acc_stderr,none": 0.021696383943889223
77
+ },
78
+ "mmlu_other": {
79
+ "alias": " - other",
80
+ "acc,none": 0.7608625683939492,
81
+ "acc_stderr,none": 0.007354391095553756
82
+ },
83
+ "mmlu_business_ethics": {
84
+ "alias": " - business_ethics",
85
+ "acc,none": 0.77,
86
+ "acc_stderr,none": 0.04229525846816506
87
+ },
88
+ "mmlu_clinical_knowledge": {
89
+ "alias": " - clinical_knowledge",
90
+ "acc,none": 0.7735849056603774,
91
+ "acc_stderr,none": 0.025757559893106758
92
+ },
93
+ "mmlu_college_medicine": {
94
+ "alias": " - college_medicine",
95
+ "acc,none": 0.7398843930635838,
96
+ "acc_stderr,none": 0.033450369167889904
97
+ },
98
+ "mmlu_global_facts": {
99
+ "alias": " - global_facts",
100
+ "acc,none": 0.45,
101
+ "acc_stderr,none": 0.05
102
+ },
103
+ "mmlu_human_aging": {
104
+ "alias": " - human_aging",
105
+ "acc,none": 0.7533632286995515,
106
+ "acc_stderr,none": 0.028930413120910874
107
+ },
108
+ "mmlu_management": {
109
+ "alias": " - management",
110
+ "acc,none": 0.8543689320388349,
111
+ "acc_stderr,none": 0.0349260647662379
112
+ },
113
+ "mmlu_marketing": {
114
+ "alias": " - marketing",
115
+ "acc,none": 0.9145299145299145,
116
+ "acc_stderr,none": 0.01831589168562584
117
+ },
118
+ "mmlu_medical_genetics": {
119
+ "alias": " - medical_genetics",
120
+ "acc,none": 0.77,
121
+ "acc_stderr,none": 0.042295258468165065
122
+ },
123
+ "mmlu_miscellaneous": {
124
+ "alias": " - miscellaneous",
125
+ "acc,none": 0.8518518518518519,
126
+ "acc_stderr,none": 0.012703598899445173
127
+ },
128
+ "mmlu_nutrition": {
129
+ "alias": " - nutrition",
130
+ "acc,none": 0.7908496732026143,
131
+ "acc_stderr,none": 0.02328768531233481
132
+ },
133
+ "mmlu_professional_accounting": {
134
+ "alias": " - professional_accounting",
135
+ "acc,none": 0.5319148936170213,
136
+ "acc_stderr,none": 0.029766675075873866
137
+ },
138
+ "mmlu_professional_medicine": {
139
+ "alias": " - professional_medicine",
140
+ "acc,none": 0.7794117647058824,
141
+ "acc_stderr,none": 0.02518778666022727
142
+ },
143
+ "mmlu_virology": {
144
+ "alias": " - virology",
145
+ "acc,none": 0.5481927710843374,
146
+ "acc_stderr,none": 0.038743715565879536
147
+ },
148
+ "mmlu_social_sciences": {
149
+ "alias": " - social_sciences",
150
+ "acc,none": 0.8059798505037374,
151
+ "acc_stderr,none": 0.007000549787458337
152
+ },
153
+ "mmlu_econometrics": {
154
+ "alias": " - econometrics",
155
+ "acc,none": 0.6052631578947368,
156
+ "acc_stderr,none": 0.04598188057816542
157
+ },
158
+ "mmlu_high_school_geography": {
159
+ "alias": " - high_school_geography",
160
+ "acc,none": 0.8636363636363636,
161
+ "acc_stderr,none": 0.024450155973189835
162
+ },
163
+ "mmlu_high_school_government_and_politics": {
164
+ "alias": " - high_school_government_and_politics",
165
+ "acc,none": 0.9378238341968912,
166
+ "acc_stderr,none": 0.017426974154240514
167
+ },
168
+ "mmlu_high_school_macroeconomics": {
169
+ "alias": " - high_school_macroeconomics",
170
+ "acc,none": 0.7230769230769231,
171
+ "acc_stderr,none": 0.022688042352424994
172
+ },
173
+ "mmlu_high_school_microeconomics": {
174
+ "alias": " - high_school_microeconomics",
175
+ "acc,none": 0.8109243697478992,
176
+ "acc_stderr,none": 0.02543511943810537
177
+ },
178
+ "mmlu_high_school_psychology": {
179
+ "alias": " - high_school_psychology",
180
+ "acc,none": 0.8844036697247707,
181
+ "acc_stderr,none": 0.013708749534172636
182
+ },
183
+ "mmlu_human_sexuality": {
184
+ "alias": " - human_sexuality",
185
+ "acc,none": 0.7709923664122137,
186
+ "acc_stderr,none": 0.036853466317118506
187
+ },
188
+ "mmlu_professional_psychology": {
189
+ "alias": " - professional_psychology",
190
+ "acc,none": 0.7532679738562091,
191
+ "acc_stderr,none": 0.0174408203674025
192
+ },
193
+ "mmlu_public_relations": {
194
+ "alias": " - public_relations",
195
+ "acc,none": 0.7363636363636363,
196
+ "acc_stderr,none": 0.04220224692971987
197
+ },
198
+ "mmlu_security_studies": {
199
+ "alias": " - security_studies",
200
+ "acc,none": 0.7877551020408163,
201
+ "acc_stderr,none": 0.026176967197866767
202
+ },
203
+ "mmlu_sociology": {
204
+ "alias": " - sociology",
205
+ "acc,none": 0.8756218905472637,
206
+ "acc_stderr,none": 0.023335401790166327
207
+ },
208
+ "mmlu_us_foreign_policy": {
209
+ "alias": " - us_foreign_policy",
210
+ "acc,none": 0.9,
211
+ "acc_stderr,none": 0.030151134457776348
212
+ },
213
+ "mmlu_stem": {
214
+ "alias": " - stem",
215
+ "acc,none": 0.6292419917538852,
216
+ "acc_stderr,none": 0.00828854335131971
217
+ },
218
+ "mmlu_abstract_algebra": {
219
+ "alias": " - abstract_algebra",
220
+ "acc,none": 0.4,
221
+ "acc_stderr,none": 0.049236596391733084
222
+ },
223
+ "mmlu_anatomy": {
224
+ "alias": " - anatomy",
225
+ "acc,none": 0.6518518518518519,
226
+ "acc_stderr,none": 0.041153246103369526
227
+ },
228
+ "mmlu_astronomy": {
229
+ "alias": " - astronomy",
230
+ "acc,none": 0.8026315789473685,
231
+ "acc_stderr,none": 0.03238981601699397
232
+ },
233
+ "mmlu_college_biology": {
234
+ "alias": " - college_biology",
235
+ "acc,none": 0.8402777777777778,
236
+ "acc_stderr,none": 0.030635578972093278
237
+ },
238
+ "mmlu_college_chemistry": {
239
+ "alias": " - college_chemistry",
240
+ "acc,none": 0.5,
241
+ "acc_stderr,none": 0.050251890762960605
242
+ },
243
+ "mmlu_college_computer_science": {
244
+ "alias": " - college_computer_science",
245
+ "acc,none": 0.64,
246
+ "acc_stderr,none": 0.04824181513244218
247
+ },
248
+ "mmlu_college_mathematics": {
249
+ "alias": " - college_mathematics",
250
+ "acc,none": 0.47,
251
+ "acc_stderr,none": 0.050161355804659205
252
+ },
253
+ "mmlu_college_physics": {
254
+ "alias": " - college_physics",
255
+ "acc,none": 0.49019607843137253,
256
+ "acc_stderr,none": 0.04974229460422817
257
+ },
258
+ "mmlu_computer_security": {
259
+ "alias": " - computer_security",
260
+ "acc,none": 0.8,
261
+ "acc_stderr,none": 0.040201512610368445
262
+ },
263
+ "mmlu_conceptual_physics": {
264
+ "alias": " - conceptual_physics",
265
+ "acc,none": 0.6978723404255319,
266
+ "acc_stderr,none": 0.030017554471880557
267
+ },
268
+ "mmlu_electrical_engineering": {
269
+ "alias": " - electrical_engineering",
270
+ "acc,none": 0.6827586206896552,
271
+ "acc_stderr,none": 0.03878352372138622
272
+ },
273
+ "mmlu_elementary_mathematics": {
274
+ "alias": " - elementary_mathematics",
275
+ "acc,none": 0.5582010582010583,
276
+ "acc_stderr,none": 0.025576257061253833
277
+ },
278
+ "mmlu_high_school_biology": {
279
+ "alias": " - high_school_biology",
280
+ "acc,none": 0.8064516129032258,
281
+ "acc_stderr,none": 0.02247525852553606
282
+ },
283
+ "mmlu_high_school_chemistry": {
284
+ "alias": " - high_school_chemistry",
285
+ "acc,none": 0.5566502463054187,
286
+ "acc_stderr,none": 0.03495334582162933
287
+ },
288
+ "mmlu_high_school_computer_science": {
289
+ "alias": " - high_school_computer_science",
290
+ "acc,none": 0.8,
291
+ "acc_stderr,none": 0.04020151261036846
292
+ },
293
+ "mmlu_high_school_mathematics": {
294
+ "alias": " - high_school_mathematics",
295
+ "acc,none": 0.44074074074074077,
296
+ "acc_stderr,none": 0.030270671157284074
297
+ },
298
+ "mmlu_high_school_physics": {
299
+ "alias": " - high_school_physics",
300
+ "acc,none": 0.4768211920529801,
301
+ "acc_stderr,none": 0.04078093859163084
302
+ },
303
+ "mmlu_high_school_statistics": {
304
+ "alias": " - high_school_statistics",
305
+ "acc,none": 0.6898148148148148,
306
+ "acc_stderr,none": 0.03154696285656629
307
+ },
308
+ "mmlu_machine_learning": {
309
+ "alias": " - machine_learning",
310
+ "acc,none": 0.5803571428571429,
311
+ "acc_stderr,none": 0.04684099321077106
312
+ }
313
+ },
314
+ "groups": {
315
+ "mmlu": {
316
+ "acc,none": 0.7060959977211223,
317
+ "acc_stderr,none": 0.0036419117884613442,
318
+ "alias": "mmlu"
319
+ },
320
+ "mmlu_humanities": {
321
+ "alias": " - humanities",
322
+ "acc,none": 0.6561105207226355,
323
+ "acc_stderr,none": 0.006537667125056556
324
+ },
325
+ "mmlu_other": {
326
+ "alias": " - other",
327
+ "acc,none": 0.7608625683939492,
328
+ "acc_stderr,none": 0.007354391095553756
329
+ },
330
+ "mmlu_social_sciences": {
331
+ "alias": " - social_sciences",
332
+ "acc,none": 0.8059798505037374,
333
+ "acc_stderr,none": 0.007000549787458337
334
+ },
335
+ "mmlu_stem": {
336
+ "alias": " - stem",
337
+ "acc,none": 0.6292419917538852,
338
+ "acc_stderr,none": 0.00828854335131971
339
+ }
340
+ },
341
+ "group_subtasks": {
342
+ "mmlu_stem": [
343
+ "mmlu_college_computer_science",
344
+ "mmlu_high_school_physics",
345
+ "mmlu_college_chemistry",
346
+ "mmlu_college_biology",
347
+ "mmlu_high_school_mathematics",
348
+ "mmlu_high_school_computer_science",
349
+ "mmlu_electrical_engineering",
350
+ "mmlu_college_physics",
351
+ "mmlu_anatomy",
352
+ "mmlu_college_mathematics",
353
+ "mmlu_elementary_mathematics",
354
+ "mmlu_high_school_chemistry",
355
+ "mmlu_machine_learning",
356
+ "mmlu_abstract_algebra",
357
+ "mmlu_astronomy",
358
+ "mmlu_computer_security",
359
+ "mmlu_high_school_biology",
360
+ "mmlu_high_school_statistics",
361
+ "mmlu_conceptual_physics"
362
+ ],
363
+ "mmlu_other": [
364
+ "mmlu_business_ethics",
365
+ "mmlu_virology",
366
+ "mmlu_nutrition",
367
+ "mmlu_management",
368
+ "mmlu_clinical_knowledge",
369
+ "mmlu_marketing",
370
+ "mmlu_college_medicine",
371
+ "mmlu_professional_medicine",
372
+ "mmlu_medical_genetics",
373
+ "mmlu_human_aging",
374
+ "mmlu_professional_accounting",
375
+ "mmlu_miscellaneous",
376
+ "mmlu_global_facts"
377
+ ],
378
+ "mmlu_social_sciences": [
379
+ "mmlu_high_school_government_and_politics",
380
+ "mmlu_human_sexuality",
381
+ "mmlu_high_school_microeconomics",
382
+ "mmlu_high_school_macroeconomics",
383
+ "mmlu_public_relations",
384
+ "mmlu_sociology",
385
+ "mmlu_professional_psychology",
386
+ "mmlu_high_school_psychology",
387
+ "mmlu_econometrics",
388
+ "mmlu_high_school_geography",
389
+ "mmlu_us_foreign_policy",
390
+ "mmlu_security_studies"
391
+ ],
392
+ "mmlu_humanities": [
393
+ "mmlu_high_school_european_history",
394
+ "mmlu_high_school_world_history",
395
+ "mmlu_professional_law",
396
+ "mmlu_logical_fallacies",
397
+ "mmlu_high_school_us_history",
398
+ "mmlu_world_religions",
399
+ "mmlu_prehistory",
400
+ "mmlu_jurisprudence",
401
+ "mmlu_moral_scenarios",
402
+ "mmlu_formal_logic",
403
+ "mmlu_philosophy",
404
+ "mmlu_international_law",
405
+ "mmlu_moral_disputes"
406
+ ],
407
+ "mmlu": [
408
+ "mmlu_humanities",
409
+ "mmlu_social_sciences",
410
+ "mmlu_other",
411
+ "mmlu_stem"
412
+ ]
413
+ },
414
+ "configs": {
415
+ "mmlu_abstract_algebra": {
416
+ "task": "mmlu_abstract_algebra",
417
+ "task_alias": "abstract_algebra",
418
+ "group": "mmlu_stem",
419
+ "group_alias": "stem",
420
+ "dataset_path": "hails/mmlu_no_train",
421
+ "dataset_name": "abstract_algebra",
422
+ "test_split": "test",
423
+ "fewshot_split": "dev",
424
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
425
+ "doc_to_target": "answer",
426
+ "doc_to_choice": [
427
+ "A",
428
+ "B",
429
+ "C",
430
+ "D"
431
+ ],
432
+ "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
433
+ "target_delimiter": " ",
434
+ "fewshot_delimiter": "\n\n",
435
+ "fewshot_config": {
436
+ "sampler": "first_n"
437
+ },
438
+ "num_fewshot": 5,
439
+ "metric_list": [
440
+ {
441
+ "metric": "acc",
442
+ "aggregation": "mean",
443
+ "higher_is_better": true
444
+ }
445
+ ],
446
+ "output_type": "multiple_choice",
447
+ "repeats": 1,
448
+ "should_decontaminate": false,
449
+ "metadata": {
450
+ "version": 0.0
451
+ }
452
+ },
453
+ "mmlu_anatomy": {
454
+ "task": "mmlu_anatomy",
455
+ "task_alias": "anatomy",
456
+ "group": "mmlu_stem",
457
+ "group_alias": "stem",
458
+ "dataset_path": "hails/mmlu_no_train",
459
+ "dataset_name": "anatomy",
460
+ "test_split": "test",
461
+ "fewshot_split": "dev",
462
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
463
+ "doc_to_target": "answer",
464
+ "doc_to_choice": [
465
+ "A",
466
+ "B",
467
+ "C",
468
+ "D"
469
+ ],
470
+ "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
471
+ "target_delimiter": " ",
472
+ "fewshot_delimiter": "\n\n",
473
+ "fewshot_config": {
474
+ "sampler": "first_n"
475
+ },
476
+ "num_fewshot": 5,
477
+ "metric_list": [
478
+ {
479
+ "metric": "acc",
480
+ "aggregation": "mean",
481
+ "higher_is_better": true
482
+ }
483
+ ],
484
+ "output_type": "multiple_choice",
485
+ "repeats": 1,
486
+ "should_decontaminate": false,
487
+ "metadata": {
488
+ "version": 0.0
489
+ }
490
+ },
491
+ "mmlu_astronomy": {
492
+ "task": "mmlu_astronomy",
493
+ "task_alias": "astronomy",
494
+ "group": "mmlu_stem",
495
+ "group_alias": "stem",
496
+ "dataset_path": "hails/mmlu_no_train",
497
+ "dataset_name": "astronomy",
498
+ "test_split": "test",
499
+ "fewshot_split": "dev",
500
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
501
+ "doc_to_target": "answer",
502
+ "doc_to_choice": [
503
+ "A",
504
+ "B",
505
+ "C",
506
+ "D"
507
+ ],
508
+ "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
509
+ "target_delimiter": " ",
510
+ "fewshot_delimiter": "\n\n",
511
+ "fewshot_config": {
512
+ "sampler": "first_n"
513
+ },
514
+ "num_fewshot": 5,
515
+ "metric_list": [
516
+ {
517
+ "metric": "acc",
518
+ "aggregation": "mean",
519
+ "higher_is_better": true
520
+ }
521
+ ],
522
+ "output_type": "multiple_choice",
523
+ "repeats": 1,
524
+ "should_decontaminate": false,
525
+ "metadata": {
526
+ "version": 0.0
527
+ }
528
+ },
529
+ "mmlu_business_ethics": {
530
+ "task": "mmlu_business_ethics",
531
+ "task_alias": "business_ethics",
532
+ "group": "mmlu_other",
533
+ "group_alias": "other",
534
+ "dataset_path": "hails/mmlu_no_train",
535
+ "dataset_name": "business_ethics",
536
+ "test_split": "test",
537
+ "fewshot_split": "dev",
538
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
539
+ "doc_to_target": "answer",
540
+ "doc_to_choice": [
541
+ "A",
542
+ "B",
543
+ "C",
544
+ "D"
545
+ ],
546
+ "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
547
+ "target_delimiter": " ",
548
+ "fewshot_delimiter": "\n\n",
549
+ "fewshot_config": {
550
+ "sampler": "first_n"
551
+ },
552
+ "num_fewshot": 5,
553
+ "metric_list": [
554
+ {
555
+ "metric": "acc",
556
+ "aggregation": "mean",
557
+ "higher_is_better": true
558
+ }
559
+ ],
560
+ "output_type": "multiple_choice",
561
+ "repeats": 1,
562
+ "should_decontaminate": false,
563
+ "metadata": {
564
+ "version": 0.0
565
+ }
566
+ },
567
+ "mmlu_clinical_knowledge": {
568
+ "task": "mmlu_clinical_knowledge",
569
+ "task_alias": "clinical_knowledge",
570
+ "group": "mmlu_other",
571
+ "group_alias": "other",
572
+ "dataset_path": "hails/mmlu_no_train",
573
+ "dataset_name": "clinical_knowledge",
574
+ "test_split": "test",
575
+ "fewshot_split": "dev",
576
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
577
+ "doc_to_target": "answer",
578
+ "doc_to_choice": [
579
+ "A",
580
+ "B",
581
+ "C",
582
+ "D"
583
+ ],
584
+ "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
585
+ "target_delimiter": " ",
586
+ "fewshot_delimiter": "\n\n",
587
+ "fewshot_config": {
588
+ "sampler": "first_n"
589
+ },
590
+ "num_fewshot": 5,
591
+ "metric_list": [
592
+ {
593
+ "metric": "acc",
594
+ "aggregation": "mean",
595
+ "higher_is_better": true
596
+ }
597
+ ],
598
+ "output_type": "multiple_choice",
599
+ "repeats": 1,
600
+ "should_decontaminate": false,
601
+ "metadata": {
602
+ "version": 0.0
603
+ }
604
+ },
605
+ "mmlu_college_biology": {
606
+ "task": "mmlu_college_biology",
607
+ "task_alias": "college_biology",
608
+ "group": "mmlu_stem",
609
+ "group_alias": "stem",
610
+ "dataset_path": "hails/mmlu_no_train",
611
+ "dataset_name": "college_biology",
612
+ "test_split": "test",
613
+ "fewshot_split": "dev",
614
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
615
+ "doc_to_target": "answer",
616
+ "doc_to_choice": [
617
+ "A",
618
+ "B",
619
+ "C",
620
+ "D"
621
+ ],
622
+ "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
623
+ "target_delimiter": " ",
624
+ "fewshot_delimiter": "\n\n",
625
+ "fewshot_config": {
626
+ "sampler": "first_n"
627
+ },
628
+ "num_fewshot": 5,
629
+ "metric_list": [
630
+ {
631
+ "metric": "acc",
632
+ "aggregation": "mean",
633
+ "higher_is_better": true
634
+ }
635
+ ],
636
+ "output_type": "multiple_choice",
637
+ "repeats": 1,
638
+ "should_decontaminate": false,
639
+ "metadata": {
640
+ "version": 0.0
641
+ }
642
+ },
643
+ "mmlu_college_chemistry": {
644
+ "task": "mmlu_college_chemistry",
645
+ "task_alias": "college_chemistry",
646
+ "group": "mmlu_stem",
647
+ "group_alias": "stem",
648
+ "dataset_path": "hails/mmlu_no_train",
649
+ "dataset_name": "college_chemistry",
650
+ "test_split": "test",
651
+ "fewshot_split": "dev",
652
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
653
+ "doc_to_target": "answer",
654
+ "doc_to_choice": [
655
+ "A",
656
+ "B",
657
+ "C",
658
+ "D"
659
+ ],
660
+ "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
661
+ "target_delimiter": " ",
662
+ "fewshot_delimiter": "\n\n",
663
+ "fewshot_config": {
664
+ "sampler": "first_n"
665
+ },
666
+ "num_fewshot": 5,
667
+ "metric_list": [
668
+ {
669
+ "metric": "acc",
670
+ "aggregation": "mean",
671
+ "higher_is_better": true
672
+ }
673
+ ],
674
+ "output_type": "multiple_choice",
675
+ "repeats": 1,
676
+ "should_decontaminate": false,
677
+ "metadata": {
678
+ "version": 0.0
679
+ }
680
+ },
681
+ "mmlu_college_computer_science": {
682
+ "task": "mmlu_college_computer_science",
683
+ "task_alias": "college_computer_science",
684
+ "group": "mmlu_stem",
685
+ "group_alias": "stem",
686
+ "dataset_path": "hails/mmlu_no_train",
687
+ "dataset_name": "college_computer_science",
688
+ "test_split": "test",
689
+ "fewshot_split": "dev",
690
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
691
+ "doc_to_target": "answer",
692
+ "doc_to_choice": [
693
+ "A",
694
+ "B",
695
+ "C",
696
+ "D"
697
+ ],
698
+ "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
699
+ "target_delimiter": " ",
700
+ "fewshot_delimiter": "\n\n",
701
+ "fewshot_config": {
702
+ "sampler": "first_n"
703
+ },
704
+ "num_fewshot": 5,
705
+ "metric_list": [
706
+ {
707
+ "metric": "acc",
708
+ "aggregation": "mean",
709
+ "higher_is_better": true
710
+ }
711
+ ],
712
+ "output_type": "multiple_choice",
713
+ "repeats": 1,
714
+ "should_decontaminate": false,
715
+ "metadata": {
716
+ "version": 0.0
717
+ }
718
+ },
719
+ "mmlu_college_mathematics": {
720
+ "task": "mmlu_college_mathematics",
721
+ "task_alias": "college_mathematics",
722
+ "group": "mmlu_stem",
723
+ "group_alias": "stem",
724
+ "dataset_path": "hails/mmlu_no_train",
725
+ "dataset_name": "college_mathematics",
726
+ "test_split": "test",
727
+ "fewshot_split": "dev",
728
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
729
+ "doc_to_target": "answer",
730
+ "doc_to_choice": [
731
+ "A",
732
+ "B",
733
+ "C",
734
+ "D"
735
+ ],
736
+ "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
737
+ "target_delimiter": " ",
738
+ "fewshot_delimiter": "\n\n",
739
+ "fewshot_config": {
740
+ "sampler": "first_n"
741
+ },
742
+ "num_fewshot": 5,
743
+ "metric_list": [
744
+ {
745
+ "metric": "acc",
746
+ "aggregation": "mean",
747
+ "higher_is_better": true
748
+ }
749
+ ],
750
+ "output_type": "multiple_choice",
751
+ "repeats": 1,
752
+ "should_decontaminate": false,
753
+ "metadata": {
754
+ "version": 0.0
755
+ }
756
+ },
757
+ "mmlu_college_medicine": {
758
+ "task": "mmlu_college_medicine",
759
+ "task_alias": "college_medicine",
760
+ "group": "mmlu_other",
761
+ "group_alias": "other",
762
+ "dataset_path": "hails/mmlu_no_train",
763
+ "dataset_name": "college_medicine",
764
+ "test_split": "test",
765
+ "fewshot_split": "dev",
766
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
767
+ "doc_to_target": "answer",
768
+ "doc_to_choice": [
769
+ "A",
770
+ "B",
771
+ "C",
772
+ "D"
773
+ ],
774
+ "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
775
+ "target_delimiter": " ",
776
+ "fewshot_delimiter": "\n\n",
777
+ "fewshot_config": {
778
+ "sampler": "first_n"
779
+ },
780
+ "num_fewshot": 5,
781
+ "metric_list": [
782
+ {
783
+ "metric": "acc",
784
+ "aggregation": "mean",
785
+ "higher_is_better": true
786
+ }
787
+ ],
788
+ "output_type": "multiple_choice",
789
+ "repeats": 1,
790
+ "should_decontaminate": false,
791
+ "metadata": {
792
+ "version": 0.0
793
+ }
794
+ },
795
+ "mmlu_college_physics": {
796
+ "task": "mmlu_college_physics",
797
+ "task_alias": "college_physics",
798
+ "group": "mmlu_stem",
799
+ "group_alias": "stem",
800
+ "dataset_path": "hails/mmlu_no_train",
801
+ "dataset_name": "college_physics",
802
+ "test_split": "test",
803
+ "fewshot_split": "dev",
804
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
805
+ "doc_to_target": "answer",
806
+ "doc_to_choice": [
807
+ "A",
808
+ "B",
809
+ "C",
810
+ "D"
811
+ ],
812
+ "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
813
+ "target_delimiter": " ",
814
+ "fewshot_delimiter": "\n\n",
815
+ "fewshot_config": {
816
+ "sampler": "first_n"
817
+ },
818
+ "num_fewshot": 5,
819
+ "metric_list": [
820
+ {
821
+ "metric": "acc",
822
+ "aggregation": "mean",
823
+ "higher_is_better": true
824
+ }
825
+ ],
826
+ "output_type": "multiple_choice",
827
+ "repeats": 1,
828
+ "should_decontaminate": false,
829
+ "metadata": {
830
+ "version": 0.0
831
+ }
832
+ },
833
+ "mmlu_computer_security": {
834
+ "task": "mmlu_computer_security",
835
+ "task_alias": "computer_security",
836
+ "group": "mmlu_stem",
837
+ "group_alias": "stem",
838
+ "dataset_path": "hails/mmlu_no_train",
839
+ "dataset_name": "computer_security",
840
+ "test_split": "test",
841
+ "fewshot_split": "dev",
842
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
843
+ "doc_to_target": "answer",
844
+ "doc_to_choice": [
845
+ "A",
846
+ "B",
847
+ "C",
848
+ "D"
849
+ ],
850
+ "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
851
+ "target_delimiter": " ",
852
+ "fewshot_delimiter": "\n\n",
853
+ "fewshot_config": {
854
+ "sampler": "first_n"
855
+ },
856
+ "num_fewshot": 5,
857
+ "metric_list": [
858
+ {
859
+ "metric": "acc",
860
+ "aggregation": "mean",
861
+ "higher_is_better": true
862
+ }
863
+ ],
864
+ "output_type": "multiple_choice",
865
+ "repeats": 1,
866
+ "should_decontaminate": false,
867
+ "metadata": {
868
+ "version": 0.0
869
+ }
870
+ },
871
+ "mmlu_conceptual_physics": {
872
+ "task": "mmlu_conceptual_physics",
873
+ "task_alias": "conceptual_physics",
874
+ "group": "mmlu_stem",
875
+ "group_alias": "stem",
876
+ "dataset_path": "hails/mmlu_no_train",
877
+ "dataset_name": "conceptual_physics",
878
+ "test_split": "test",
879
+ "fewshot_split": "dev",
880
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
881
+ "doc_to_target": "answer",
882
+ "doc_to_choice": [
883
+ "A",
884
+ "B",
885
+ "C",
886
+ "D"
887
+ ],
888
+ "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
889
+ "target_delimiter": " ",
890
+ "fewshot_delimiter": "\n\n",
891
+ "fewshot_config": {
892
+ "sampler": "first_n"
893
+ },
894
+ "num_fewshot": 5,
895
+ "metric_list": [
896
+ {
897
+ "metric": "acc",
898
+ "aggregation": "mean",
899
+ "higher_is_better": true
900
+ }
901
+ ],
902
+ "output_type": "multiple_choice",
903
+ "repeats": 1,
904
+ "should_decontaminate": false,
905
+ "metadata": {
906
+ "version": 0.0
907
+ }
908
+ },
909
+ "mmlu_econometrics": {
910
+ "task": "mmlu_econometrics",
911
+ "task_alias": "econometrics",
912
+ "group": "mmlu_social_sciences",
913
+ "group_alias": "social_sciences",
914
+ "dataset_path": "hails/mmlu_no_train",
915
+ "dataset_name": "econometrics",
916
+ "test_split": "test",
917
+ "fewshot_split": "dev",
918
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
919
+ "doc_to_target": "answer",
920
+ "doc_to_choice": [
921
+ "A",
922
+ "B",
923
+ "C",
924
+ "D"
925
+ ],
926
+ "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
927
+ "target_delimiter": " ",
928
+ "fewshot_delimiter": "\n\n",
929
+ "fewshot_config": {
930
+ "sampler": "first_n"
931
+ },
932
+ "num_fewshot": 5,
933
+ "metric_list": [
934
+ {
935
+ "metric": "acc",
936
+ "aggregation": "mean",
937
+ "higher_is_better": true
938
+ }
939
+ ],
940
+ "output_type": "multiple_choice",
941
+ "repeats": 1,
942
+ "should_decontaminate": false,
943
+ "metadata": {
944
+ "version": 0.0
945
+ }
946
+ },
947
+ "mmlu_electrical_engineering": {
948
+ "task": "mmlu_electrical_engineering",
949
+ "task_alias": "electrical_engineering",
950
+ "group": "mmlu_stem",
951
+ "group_alias": "stem",
952
+ "dataset_path": "hails/mmlu_no_train",
953
+ "dataset_name": "electrical_engineering",
954
+ "test_split": "test",
955
+ "fewshot_split": "dev",
956
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
957
+ "doc_to_target": "answer",
958
+ "doc_to_choice": [
959
+ "A",
960
+ "B",
961
+ "C",
962
+ "D"
963
+ ],
964
+ "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
965
+ "target_delimiter": " ",
966
+ "fewshot_delimiter": "\n\n",
967
+ "fewshot_config": {
968
+ "sampler": "first_n"
969
+ },
970
+ "num_fewshot": 5,
971
+ "metric_list": [
972
+ {
973
+ "metric": "acc",
974
+ "aggregation": "mean",
975
+ "higher_is_better": true
976
+ }
977
+ ],
978
+ "output_type": "multiple_choice",
979
+ "repeats": 1,
980
+ "should_decontaminate": false,
981
+ "metadata": {
982
+ "version": 0.0
983
+ }
984
+ },
985
+ "mmlu_elementary_mathematics": {
986
+ "task": "mmlu_elementary_mathematics",
987
+ "task_alias": "elementary_mathematics",
988
+ "group": "mmlu_stem",
989
+ "group_alias": "stem",
990
+ "dataset_path": "hails/mmlu_no_train",
991
+ "dataset_name": "elementary_mathematics",
992
+ "test_split": "test",
993
+ "fewshot_split": "dev",
994
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
995
+ "doc_to_target": "answer",
996
+ "doc_to_choice": [
997
+ "A",
998
+ "B",
999
+ "C",
1000
+ "D"
1001
+ ],
1002
+ "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
1003
+ "target_delimiter": " ",
1004
+ "fewshot_delimiter": "\n\n",
1005
+ "fewshot_config": {
1006
+ "sampler": "first_n"
1007
+ },
1008
+ "num_fewshot": 5,
1009
+ "metric_list": [
1010
+ {
1011
+ "metric": "acc",
1012
+ "aggregation": "mean",
1013
+ "higher_is_better": true
1014
+ }
1015
+ ],
1016
+ "output_type": "multiple_choice",
1017
+ "repeats": 1,
1018
+ "should_decontaminate": false,
1019
+ "metadata": {
1020
+ "version": 0.0
1021
+ }
1022
+ },
1023
+ "mmlu_formal_logic": {
1024
+ "task": "mmlu_formal_logic",
1025
+ "task_alias": "formal_logic",
1026
+ "group": "mmlu_humanities",
1027
+ "group_alias": "humanities",
1028
+ "dataset_path": "hails/mmlu_no_train",
1029
+ "dataset_name": "formal_logic",
1030
+ "test_split": "test",
1031
+ "fewshot_split": "dev",
1032
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1033
+ "doc_to_target": "answer",
1034
+ "doc_to_choice": [
1035
+ "A",
1036
+ "B",
1037
+ "C",
1038
+ "D"
1039
+ ],
1040
+ "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
1041
+ "target_delimiter": " ",
1042
+ "fewshot_delimiter": "\n\n",
1043
+ "fewshot_config": {
1044
+ "sampler": "first_n"
1045
+ },
1046
+ "num_fewshot": 5,
1047
+ "metric_list": [
1048
+ {
1049
+ "metric": "acc",
1050
+ "aggregation": "mean",
1051
+ "higher_is_better": true
1052
+ }
1053
+ ],
1054
+ "output_type": "multiple_choice",
1055
+ "repeats": 1,
1056
+ "should_decontaminate": false,
1057
+ "metadata": {
1058
+ "version": 0.0
1059
+ }
1060
+ },
1061
+ "mmlu_global_facts": {
1062
+ "task": "mmlu_global_facts",
1063
+ "task_alias": "global_facts",
1064
+ "group": "mmlu_other",
1065
+ "group_alias": "other",
1066
+ "dataset_path": "hails/mmlu_no_train",
1067
+ "dataset_name": "global_facts",
1068
+ "test_split": "test",
1069
+ "fewshot_split": "dev",
1070
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1071
+ "doc_to_target": "answer",
1072
+ "doc_to_choice": [
1073
+ "A",
1074
+ "B",
1075
+ "C",
1076
+ "D"
1077
+ ],
1078
+ "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
1079
+ "target_delimiter": " ",
1080
+ "fewshot_delimiter": "\n\n",
1081
+ "fewshot_config": {
1082
+ "sampler": "first_n"
1083
+ },
1084
+ "num_fewshot": 5,
1085
+ "metric_list": [
1086
+ {
1087
+ "metric": "acc",
1088
+ "aggregation": "mean",
1089
+ "higher_is_better": true
1090
+ }
1091
+ ],
1092
+ "output_type": "multiple_choice",
1093
+ "repeats": 1,
1094
+ "should_decontaminate": false,
1095
+ "metadata": {
1096
+ "version": 0.0
1097
+ }
1098
+ },
1099
+ "mmlu_high_school_biology": {
1100
+ "task": "mmlu_high_school_biology",
1101
+ "task_alias": "high_school_biology",
1102
+ "group": "mmlu_stem",
1103
+ "group_alias": "stem",
1104
+ "dataset_path": "hails/mmlu_no_train",
1105
+ "dataset_name": "high_school_biology",
1106
+ "test_split": "test",
1107
+ "fewshot_split": "dev",
1108
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1109
+ "doc_to_target": "answer",
1110
+ "doc_to_choice": [
1111
+ "A",
1112
+ "B",
1113
+ "C",
1114
+ "D"
1115
+ ],
1116
+ "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
1117
+ "target_delimiter": " ",
1118
+ "fewshot_delimiter": "\n\n",
1119
+ "fewshot_config": {
1120
+ "sampler": "first_n"
1121
+ },
1122
+ "num_fewshot": 5,
1123
+ "metric_list": [
1124
+ {
1125
+ "metric": "acc",
1126
+ "aggregation": "mean",
1127
+ "higher_is_better": true
1128
+ }
1129
+ ],
1130
+ "output_type": "multiple_choice",
1131
+ "repeats": 1,
1132
+ "should_decontaminate": false,
1133
+ "metadata": {
1134
+ "version": 0.0
1135
+ }
1136
+ },
1137
+ "mmlu_high_school_chemistry": {
1138
+ "task": "mmlu_high_school_chemistry",
1139
+ "task_alias": "high_school_chemistry",
1140
+ "group": "mmlu_stem",
1141
+ "group_alias": "stem",
1142
+ "dataset_path": "hails/mmlu_no_train",
1143
+ "dataset_name": "high_school_chemistry",
1144
+ "test_split": "test",
1145
+ "fewshot_split": "dev",
1146
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1147
+ "doc_to_target": "answer",
1148
+ "doc_to_choice": [
1149
+ "A",
1150
+ "B",
1151
+ "C",
1152
+ "D"
1153
+ ],
1154
+ "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
1155
+ "target_delimiter": " ",
1156
+ "fewshot_delimiter": "\n\n",
1157
+ "fewshot_config": {
1158
+ "sampler": "first_n"
1159
+ },
1160
+ "num_fewshot": 5,
1161
+ "metric_list": [
1162
+ {
1163
+ "metric": "acc",
1164
+ "aggregation": "mean",
1165
+ "higher_is_better": true
1166
+ }
1167
+ ],
1168
+ "output_type": "multiple_choice",
1169
+ "repeats": 1,
1170
+ "should_decontaminate": false,
1171
+ "metadata": {
1172
+ "version": 0.0
1173
+ }
1174
+ },
1175
+ "mmlu_high_school_computer_science": {
1176
+ "task": "mmlu_high_school_computer_science",
1177
+ "task_alias": "high_school_computer_science",
1178
+ "group": "mmlu_stem",
1179
+ "group_alias": "stem",
1180
+ "dataset_path": "hails/mmlu_no_train",
1181
+ "dataset_name": "high_school_computer_science",
1182
+ "test_split": "test",
1183
+ "fewshot_split": "dev",
1184
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1185
+ "doc_to_target": "answer",
1186
+ "doc_to_choice": [
1187
+ "A",
1188
+ "B",
1189
+ "C",
1190
+ "D"
1191
+ ],
1192
+ "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
1193
+ "target_delimiter": " ",
1194
+ "fewshot_delimiter": "\n\n",
1195
+ "fewshot_config": {
1196
+ "sampler": "first_n"
1197
+ },
1198
+ "num_fewshot": 5,
1199
+ "metric_list": [
1200
+ {
1201
+ "metric": "acc",
1202
+ "aggregation": "mean",
1203
+ "higher_is_better": true
1204
+ }
1205
+ ],
1206
+ "output_type": "multiple_choice",
1207
+ "repeats": 1,
1208
+ "should_decontaminate": false,
1209
+ "metadata": {
1210
+ "version": 0.0
1211
+ }
1212
+ },
1213
+ "mmlu_high_school_european_history": {
1214
+ "task": "mmlu_high_school_european_history",
1215
+ "task_alias": "high_school_european_history",
1216
+ "group": "mmlu_humanities",
1217
+ "group_alias": "humanities",
1218
+ "dataset_path": "hails/mmlu_no_train",
1219
+ "dataset_name": "high_school_european_history",
1220
+ "test_split": "test",
1221
+ "fewshot_split": "dev",
1222
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1223
+ "doc_to_target": "answer",
1224
+ "doc_to_choice": [
1225
+ "A",
1226
+ "B",
1227
+ "C",
1228
+ "D"
1229
+ ],
1230
+ "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
1231
+ "target_delimiter": " ",
1232
+ "fewshot_delimiter": "\n\n",
1233
+ "fewshot_config": {
1234
+ "sampler": "first_n"
1235
+ },
1236
+ "num_fewshot": 5,
1237
+ "metric_list": [
1238
+ {
1239
+ "metric": "acc",
1240
+ "aggregation": "mean",
1241
+ "higher_is_better": true
1242
+ }
1243
+ ],
1244
+ "output_type": "multiple_choice",
1245
+ "repeats": 1,
1246
+ "should_decontaminate": false,
1247
+ "metadata": {
1248
+ "version": 0.0
1249
+ }
1250
+ },
1251
+ "mmlu_high_school_geography": {
1252
+ "task": "mmlu_high_school_geography",
1253
+ "task_alias": "high_school_geography",
1254
+ "group": "mmlu_social_sciences",
1255
+ "group_alias": "social_sciences",
1256
+ "dataset_path": "hails/mmlu_no_train",
1257
+ "dataset_name": "high_school_geography",
1258
+ "test_split": "test",
1259
+ "fewshot_split": "dev",
1260
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1261
+ "doc_to_target": "answer",
1262
+ "doc_to_choice": [
1263
+ "A",
1264
+ "B",
1265
+ "C",
1266
+ "D"
1267
+ ],
1268
+ "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
1269
+ "target_delimiter": " ",
1270
+ "fewshot_delimiter": "\n\n",
1271
+ "fewshot_config": {
1272
+ "sampler": "first_n"
1273
+ },
1274
+ "num_fewshot": 5,
1275
+ "metric_list": [
1276
+ {
1277
+ "metric": "acc",
1278
+ "aggregation": "mean",
1279
+ "higher_is_better": true
1280
+ }
1281
+ ],
1282
+ "output_type": "multiple_choice",
1283
+ "repeats": 1,
1284
+ "should_decontaminate": false,
1285
+ "metadata": {
1286
+ "version": 0.0
1287
+ }
1288
+ },
1289
+ "mmlu_high_school_government_and_politics": {
1290
+ "task": "mmlu_high_school_government_and_politics",
1291
+ "task_alias": "high_school_government_and_politics",
1292
+ "group": "mmlu_social_sciences",
1293
+ "group_alias": "social_sciences",
1294
+ "dataset_path": "hails/mmlu_no_train",
1295
+ "dataset_name": "high_school_government_and_politics",
1296
+ "test_split": "test",
1297
+ "fewshot_split": "dev",
1298
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1299
+ "doc_to_target": "answer",
1300
+ "doc_to_choice": [
1301
+ "A",
1302
+ "B",
1303
+ "C",
1304
+ "D"
1305
+ ],
1306
+ "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
1307
+ "target_delimiter": " ",
1308
+ "fewshot_delimiter": "\n\n",
1309
+ "fewshot_config": {
1310
+ "sampler": "first_n"
1311
+ },
1312
+ "num_fewshot": 5,
1313
+ "metric_list": [
1314
+ {
1315
+ "metric": "acc",
1316
+ "aggregation": "mean",
1317
+ "higher_is_better": true
1318
+ }
1319
+ ],
1320
+ "output_type": "multiple_choice",
1321
+ "repeats": 1,
1322
+ "should_decontaminate": false,
1323
+ "metadata": {
1324
+ "version": 0.0
1325
+ }
1326
+ },
1327
+ "mmlu_high_school_macroeconomics": {
1328
+ "task": "mmlu_high_school_macroeconomics",
1329
+ "task_alias": "high_school_macroeconomics",
1330
+ "group": "mmlu_social_sciences",
1331
+ "group_alias": "social_sciences",
1332
+ "dataset_path": "hails/mmlu_no_train",
1333
+ "dataset_name": "high_school_macroeconomics",
1334
+ "test_split": "test",
1335
+ "fewshot_split": "dev",
1336
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1337
+ "doc_to_target": "answer",
1338
+ "doc_to_choice": [
1339
+ "A",
1340
+ "B",
1341
+ "C",
1342
+ "D"
1343
+ ],
1344
+ "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
1345
+ "target_delimiter": " ",
1346
+ "fewshot_delimiter": "\n\n",
1347
+ "fewshot_config": {
1348
+ "sampler": "first_n"
1349
+ },
1350
+ "num_fewshot": 5,
1351
+ "metric_list": [
1352
+ {
1353
+ "metric": "acc",
1354
+ "aggregation": "mean",
1355
+ "higher_is_better": true
1356
+ }
1357
+ ],
1358
+ "output_type": "multiple_choice",
1359
+ "repeats": 1,
1360
+ "should_decontaminate": false,
1361
+ "metadata": {
1362
+ "version": 0.0
1363
+ }
1364
+ },
1365
+ "mmlu_high_school_mathematics": {
1366
+ "task": "mmlu_high_school_mathematics",
1367
+ "task_alias": "high_school_mathematics",
1368
+ "group": "mmlu_stem",
1369
+ "group_alias": "stem",
1370
+ "dataset_path": "hails/mmlu_no_train",
1371
+ "dataset_name": "high_school_mathematics",
1372
+ "test_split": "test",
1373
+ "fewshot_split": "dev",
1374
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1375
+ "doc_to_target": "answer",
1376
+ "doc_to_choice": [
1377
+ "A",
1378
+ "B",
1379
+ "C",
1380
+ "D"
1381
+ ],
1382
+ "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
1383
+ "target_delimiter": " ",
1384
+ "fewshot_delimiter": "\n\n",
1385
+ "fewshot_config": {
1386
+ "sampler": "first_n"
1387
+ },
1388
+ "num_fewshot": 5,
1389
+ "metric_list": [
1390
+ {
1391
+ "metric": "acc",
1392
+ "aggregation": "mean",
1393
+ "higher_is_better": true
1394
+ }
1395
+ ],
1396
+ "output_type": "multiple_choice",
1397
+ "repeats": 1,
1398
+ "should_decontaminate": false,
1399
+ "metadata": {
1400
+ "version": 0.0
1401
+ }
1402
+ },
1403
+ "mmlu_high_school_microeconomics": {
1404
+ "task": "mmlu_high_school_microeconomics",
1405
+ "task_alias": "high_school_microeconomics",
1406
+ "group": "mmlu_social_sciences",
1407
+ "group_alias": "social_sciences",
1408
+ "dataset_path": "hails/mmlu_no_train",
1409
+ "dataset_name": "high_school_microeconomics",
1410
+ "test_split": "test",
1411
+ "fewshot_split": "dev",
1412
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1413
+ "doc_to_target": "answer",
1414
+ "doc_to_choice": [
1415
+ "A",
1416
+ "B",
1417
+ "C",
1418
+ "D"
1419
+ ],
1420
+ "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
1421
+ "target_delimiter": " ",
1422
+ "fewshot_delimiter": "\n\n",
1423
+ "fewshot_config": {
1424
+ "sampler": "first_n"
1425
+ },
1426
+ "num_fewshot": 5,
1427
+ "metric_list": [
1428
+ {
1429
+ "metric": "acc",
1430
+ "aggregation": "mean",
1431
+ "higher_is_better": true
1432
+ }
1433
+ ],
1434
+ "output_type": "multiple_choice",
1435
+ "repeats": 1,
1436
+ "should_decontaminate": false,
1437
+ "metadata": {
1438
+ "version": 0.0
1439
+ }
1440
+ },
1441
+ "mmlu_high_school_physics": {
1442
+ "task": "mmlu_high_school_physics",
1443
+ "task_alias": "high_school_physics",
1444
+ "group": "mmlu_stem",
1445
+ "group_alias": "stem",
1446
+ "dataset_path": "hails/mmlu_no_train",
1447
+ "dataset_name": "high_school_physics",
1448
+ "test_split": "test",
1449
+ "fewshot_split": "dev",
1450
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1451
+ "doc_to_target": "answer",
1452
+ "doc_to_choice": [
1453
+ "A",
1454
+ "B",
1455
+ "C",
1456
+ "D"
1457
+ ],
1458
+ "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
1459
+ "target_delimiter": " ",
1460
+ "fewshot_delimiter": "\n\n",
1461
+ "fewshot_config": {
1462
+ "sampler": "first_n"
1463
+ },
1464
+ "num_fewshot": 5,
1465
+ "metric_list": [
1466
+ {
1467
+ "metric": "acc",
1468
+ "aggregation": "mean",
1469
+ "higher_is_better": true
1470
+ }
1471
+ ],
1472
+ "output_type": "multiple_choice",
1473
+ "repeats": 1,
1474
+ "should_decontaminate": false,
1475
+ "metadata": {
1476
+ "version": 0.0
1477
+ }
1478
+ },
1479
+ "mmlu_high_school_psychology": {
1480
+ "task": "mmlu_high_school_psychology",
1481
+ "task_alias": "high_school_psychology",
1482
+ "group": "mmlu_social_sciences",
1483
+ "group_alias": "social_sciences",
1484
+ "dataset_path": "hails/mmlu_no_train",
1485
+ "dataset_name": "high_school_psychology",
1486
+ "test_split": "test",
1487
+ "fewshot_split": "dev",
1488
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1489
+ "doc_to_target": "answer",
1490
+ "doc_to_choice": [
1491
+ "A",
1492
+ "B",
1493
+ "C",
1494
+ "D"
1495
+ ],
1496
+ "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
1497
+ "target_delimiter": " ",
1498
+ "fewshot_delimiter": "\n\n",
1499
+ "fewshot_config": {
1500
+ "sampler": "first_n"
1501
+ },
1502
+ "num_fewshot": 5,
1503
+ "metric_list": [
1504
+ {
1505
+ "metric": "acc",
1506
+ "aggregation": "mean",
1507
+ "higher_is_better": true
1508
+ }
1509
+ ],
1510
+ "output_type": "multiple_choice",
1511
+ "repeats": 1,
1512
+ "should_decontaminate": false,
1513
+ "metadata": {
1514
+ "version": 0.0
1515
+ }
1516
+ },
1517
+ "mmlu_high_school_statistics": {
1518
+ "task": "mmlu_high_school_statistics",
1519
+ "task_alias": "high_school_statistics",
1520
+ "group": "mmlu_stem",
1521
+ "group_alias": "stem",
1522
+ "dataset_path": "hails/mmlu_no_train",
1523
+ "dataset_name": "high_school_statistics",
1524
+ "test_split": "test",
1525
+ "fewshot_split": "dev",
1526
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1527
+ "doc_to_target": "answer",
1528
+ "doc_to_choice": [
1529
+ "A",
1530
+ "B",
1531
+ "C",
1532
+ "D"
1533
+ ],
1534
+ "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
1535
+ "target_delimiter": " ",
1536
+ "fewshot_delimiter": "\n\n",
1537
+ "fewshot_config": {
1538
+ "sampler": "first_n"
1539
+ },
1540
+ "num_fewshot": 5,
1541
+ "metric_list": [
1542
+ {
1543
+ "metric": "acc",
1544
+ "aggregation": "mean",
1545
+ "higher_is_better": true
1546
+ }
1547
+ ],
1548
+ "output_type": "multiple_choice",
1549
+ "repeats": 1,
1550
+ "should_decontaminate": false,
1551
+ "metadata": {
1552
+ "version": 0.0
1553
+ }
1554
+ },
1555
+ "mmlu_high_school_us_history": {
1556
+ "task": "mmlu_high_school_us_history",
1557
+ "task_alias": "high_school_us_history",
1558
+ "group": "mmlu_humanities",
1559
+ "group_alias": "humanities",
1560
+ "dataset_path": "hails/mmlu_no_train",
1561
+ "dataset_name": "high_school_us_history",
1562
+ "test_split": "test",
1563
+ "fewshot_split": "dev",
1564
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1565
+ "doc_to_target": "answer",
1566
+ "doc_to_choice": [
1567
+ "A",
1568
+ "B",
1569
+ "C",
1570
+ "D"
1571
+ ],
1572
+ "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
1573
+ "target_delimiter": " ",
1574
+ "fewshot_delimiter": "\n\n",
1575
+ "fewshot_config": {
1576
+ "sampler": "first_n"
1577
+ },
1578
+ "num_fewshot": 5,
1579
+ "metric_list": [
1580
+ {
1581
+ "metric": "acc",
1582
+ "aggregation": "mean",
1583
+ "higher_is_better": true
1584
+ }
1585
+ ],
1586
+ "output_type": "multiple_choice",
1587
+ "repeats": 1,
1588
+ "should_decontaminate": false,
1589
+ "metadata": {
1590
+ "version": 0.0
1591
+ }
1592
+ },
1593
+ "mmlu_high_school_world_history": {
1594
+ "task": "mmlu_high_school_world_history",
1595
+ "task_alias": "high_school_world_history",
1596
+ "group": "mmlu_humanities",
1597
+ "group_alias": "humanities",
1598
+ "dataset_path": "hails/mmlu_no_train",
1599
+ "dataset_name": "high_school_world_history",
1600
+ "test_split": "test",
1601
+ "fewshot_split": "dev",
1602
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1603
+ "doc_to_target": "answer",
1604
+ "doc_to_choice": [
1605
+ "A",
1606
+ "B",
1607
+ "C",
1608
+ "D"
1609
+ ],
1610
+ "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
1611
+ "target_delimiter": " ",
1612
+ "fewshot_delimiter": "\n\n",
1613
+ "fewshot_config": {
1614
+ "sampler": "first_n"
1615
+ },
1616
+ "num_fewshot": 5,
1617
+ "metric_list": [
1618
+ {
1619
+ "metric": "acc",
1620
+ "aggregation": "mean",
1621
+ "higher_is_better": true
1622
+ }
1623
+ ],
1624
+ "output_type": "multiple_choice",
1625
+ "repeats": 1,
1626
+ "should_decontaminate": false,
1627
+ "metadata": {
1628
+ "version": 0.0
1629
+ }
1630
+ },
1631
+ "mmlu_human_aging": {
1632
+ "task": "mmlu_human_aging",
1633
+ "task_alias": "human_aging",
1634
+ "group": "mmlu_other",
1635
+ "group_alias": "other",
1636
+ "dataset_path": "hails/mmlu_no_train",
1637
+ "dataset_name": "human_aging",
1638
+ "test_split": "test",
1639
+ "fewshot_split": "dev",
1640
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1641
+ "doc_to_target": "answer",
1642
+ "doc_to_choice": [
1643
+ "A",
1644
+ "B",
1645
+ "C",
1646
+ "D"
1647
+ ],
1648
+ "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
1649
+ "target_delimiter": " ",
1650
+ "fewshot_delimiter": "\n\n",
1651
+ "fewshot_config": {
1652
+ "sampler": "first_n"
1653
+ },
1654
+ "num_fewshot": 5,
1655
+ "metric_list": [
1656
+ {
1657
+ "metric": "acc",
1658
+ "aggregation": "mean",
1659
+ "higher_is_better": true
1660
+ }
1661
+ ],
1662
+ "output_type": "multiple_choice",
1663
+ "repeats": 1,
1664
+ "should_decontaminate": false,
1665
+ "metadata": {
1666
+ "version": 0.0
1667
+ }
1668
+ },
1669
+ "mmlu_human_sexuality": {
1670
+ "task": "mmlu_human_sexuality",
1671
+ "task_alias": "human_sexuality",
1672
+ "group": "mmlu_social_sciences",
1673
+ "group_alias": "social_sciences",
1674
+ "dataset_path": "hails/mmlu_no_train",
1675
+ "dataset_name": "human_sexuality",
1676
+ "test_split": "test",
1677
+ "fewshot_split": "dev",
1678
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1679
+ "doc_to_target": "answer",
1680
+ "doc_to_choice": [
1681
+ "A",
1682
+ "B",
1683
+ "C",
1684
+ "D"
1685
+ ],
1686
+ "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
1687
+ "target_delimiter": " ",
1688
+ "fewshot_delimiter": "\n\n",
1689
+ "fewshot_config": {
1690
+ "sampler": "first_n"
1691
+ },
1692
+ "num_fewshot": 5,
1693
+ "metric_list": [
1694
+ {
1695
+ "metric": "acc",
1696
+ "aggregation": "mean",
1697
+ "higher_is_better": true
1698
+ }
1699
+ ],
1700
+ "output_type": "multiple_choice",
1701
+ "repeats": 1,
1702
+ "should_decontaminate": false,
1703
+ "metadata": {
1704
+ "version": 0.0
1705
+ }
1706
+ },
1707
+ "mmlu_international_law": {
1708
+ "task": "mmlu_international_law",
1709
+ "task_alias": "international_law",
1710
+ "group": "mmlu_humanities",
1711
+ "group_alias": "humanities",
1712
+ "dataset_path": "hails/mmlu_no_train",
1713
+ "dataset_name": "international_law",
1714
+ "test_split": "test",
1715
+ "fewshot_split": "dev",
1716
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1717
+ "doc_to_target": "answer",
1718
+ "doc_to_choice": [
1719
+ "A",
1720
+ "B",
1721
+ "C",
1722
+ "D"
1723
+ ],
1724
+ "description": "The following are multiple choice questions (with answers) about international law.\n\n",
1725
+ "target_delimiter": " ",
1726
+ "fewshot_delimiter": "\n\n",
1727
+ "fewshot_config": {
1728
+ "sampler": "first_n"
1729
+ },
1730
+ "num_fewshot": 5,
1731
+ "metric_list": [
1732
+ {
1733
+ "metric": "acc",
1734
+ "aggregation": "mean",
1735
+ "higher_is_better": true
1736
+ }
1737
+ ],
1738
+ "output_type": "multiple_choice",
1739
+ "repeats": 1,
1740
+ "should_decontaminate": false,
1741
+ "metadata": {
1742
+ "version": 0.0
1743
+ }
1744
+ },
1745
+ "mmlu_jurisprudence": {
1746
+ "task": "mmlu_jurisprudence",
1747
+ "task_alias": "jurisprudence",
1748
+ "group": "mmlu_humanities",
1749
+ "group_alias": "humanities",
1750
+ "dataset_path": "hails/mmlu_no_train",
1751
+ "dataset_name": "jurisprudence",
1752
+ "test_split": "test",
1753
+ "fewshot_split": "dev",
1754
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1755
+ "doc_to_target": "answer",
1756
+ "doc_to_choice": [
1757
+ "A",
1758
+ "B",
1759
+ "C",
1760
+ "D"
1761
+ ],
1762
+ "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
1763
+ "target_delimiter": " ",
1764
+ "fewshot_delimiter": "\n\n",
1765
+ "fewshot_config": {
1766
+ "sampler": "first_n"
1767
+ },
1768
+ "num_fewshot": 5,
1769
+ "metric_list": [
1770
+ {
1771
+ "metric": "acc",
1772
+ "aggregation": "mean",
1773
+ "higher_is_better": true
1774
+ }
1775
+ ],
1776
+ "output_type": "multiple_choice",
1777
+ "repeats": 1,
1778
+ "should_decontaminate": false,
1779
+ "metadata": {
1780
+ "version": 0.0
1781
+ }
1782
+ },
1783
+ "mmlu_logical_fallacies": {
1784
+ "task": "mmlu_logical_fallacies",
1785
+ "task_alias": "logical_fallacies",
1786
+ "group": "mmlu_humanities",
1787
+ "group_alias": "humanities",
1788
+ "dataset_path": "hails/mmlu_no_train",
1789
+ "dataset_name": "logical_fallacies",
1790
+ "test_split": "test",
1791
+ "fewshot_split": "dev",
1792
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1793
+ "doc_to_target": "answer",
1794
+ "doc_to_choice": [
1795
+ "A",
1796
+ "B",
1797
+ "C",
1798
+ "D"
1799
+ ],
1800
+ "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
1801
+ "target_delimiter": " ",
1802
+ "fewshot_delimiter": "\n\n",
1803
+ "fewshot_config": {
1804
+ "sampler": "first_n"
1805
+ },
1806
+ "num_fewshot": 5,
1807
+ "metric_list": [
1808
+ {
1809
+ "metric": "acc",
1810
+ "aggregation": "mean",
1811
+ "higher_is_better": true
1812
+ }
1813
+ ],
1814
+ "output_type": "multiple_choice",
1815
+ "repeats": 1,
1816
+ "should_decontaminate": false,
1817
+ "metadata": {
1818
+ "version": 0.0
1819
+ }
1820
+ },
1821
+ "mmlu_machine_learning": {
1822
+ "task": "mmlu_machine_learning",
1823
+ "task_alias": "machine_learning",
1824
+ "group": "mmlu_stem",
1825
+ "group_alias": "stem",
1826
+ "dataset_path": "hails/mmlu_no_train",
1827
+ "dataset_name": "machine_learning",
1828
+ "test_split": "test",
1829
+ "fewshot_split": "dev",
1830
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1831
+ "doc_to_target": "answer",
1832
+ "doc_to_choice": [
1833
+ "A",
1834
+ "B",
1835
+ "C",
1836
+ "D"
1837
+ ],
1838
+ "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
1839
+ "target_delimiter": " ",
1840
+ "fewshot_delimiter": "\n\n",
1841
+ "fewshot_config": {
1842
+ "sampler": "first_n"
1843
+ },
1844
+ "num_fewshot": 5,
1845
+ "metric_list": [
1846
+ {
1847
+ "metric": "acc",
1848
+ "aggregation": "mean",
1849
+ "higher_is_better": true
1850
+ }
1851
+ ],
1852
+ "output_type": "multiple_choice",
1853
+ "repeats": 1,
1854
+ "should_decontaminate": false,
1855
+ "metadata": {
1856
+ "version": 0.0
1857
+ }
1858
+ },
1859
+ "mmlu_management": {
1860
+ "task": "mmlu_management",
1861
+ "task_alias": "management",
1862
+ "group": "mmlu_other",
1863
+ "group_alias": "other",
1864
+ "dataset_path": "hails/mmlu_no_train",
1865
+ "dataset_name": "management",
1866
+ "test_split": "test",
1867
+ "fewshot_split": "dev",
1868
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1869
+ "doc_to_target": "answer",
1870
+ "doc_to_choice": [
1871
+ "A",
1872
+ "B",
1873
+ "C",
1874
+ "D"
1875
+ ],
1876
+ "description": "The following are multiple choice questions (with answers) about management.\n\n",
1877
+ "target_delimiter": " ",
1878
+ "fewshot_delimiter": "\n\n",
1879
+ "fewshot_config": {
1880
+ "sampler": "first_n"
1881
+ },
1882
+ "num_fewshot": 5,
1883
+ "metric_list": [
1884
+ {
1885
+ "metric": "acc",
1886
+ "aggregation": "mean",
1887
+ "higher_is_better": true
1888
+ }
1889
+ ],
1890
+ "output_type": "multiple_choice",
1891
+ "repeats": 1,
1892
+ "should_decontaminate": false,
1893
+ "metadata": {
1894
+ "version": 0.0
1895
+ }
1896
+ },
1897
+ "mmlu_marketing": {
1898
+ "task": "mmlu_marketing",
1899
+ "task_alias": "marketing",
1900
+ "group": "mmlu_other",
1901
+ "group_alias": "other",
1902
+ "dataset_path": "hails/mmlu_no_train",
1903
+ "dataset_name": "marketing",
1904
+ "test_split": "test",
1905
+ "fewshot_split": "dev",
1906
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1907
+ "doc_to_target": "answer",
1908
+ "doc_to_choice": [
1909
+ "A",
1910
+ "B",
1911
+ "C",
1912
+ "D"
1913
+ ],
1914
+ "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
1915
+ "target_delimiter": " ",
1916
+ "fewshot_delimiter": "\n\n",
1917
+ "fewshot_config": {
1918
+ "sampler": "first_n"
1919
+ },
1920
+ "num_fewshot": 5,
1921
+ "metric_list": [
1922
+ {
1923
+ "metric": "acc",
1924
+ "aggregation": "mean",
1925
+ "higher_is_better": true
1926
+ }
1927
+ ],
1928
+ "output_type": "multiple_choice",
1929
+ "repeats": 1,
1930
+ "should_decontaminate": false,
1931
+ "metadata": {
1932
+ "version": 0.0
1933
+ }
1934
+ },
1935
+ "mmlu_medical_genetics": {
1936
+ "task": "mmlu_medical_genetics",
1937
+ "task_alias": "medical_genetics",
1938
+ "group": "mmlu_other",
1939
+ "group_alias": "other",
1940
+ "dataset_path": "hails/mmlu_no_train",
1941
+ "dataset_name": "medical_genetics",
1942
+ "test_split": "test",
1943
+ "fewshot_split": "dev",
1944
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1945
+ "doc_to_target": "answer",
1946
+ "doc_to_choice": [
1947
+ "A",
1948
+ "B",
1949
+ "C",
1950
+ "D"
1951
+ ],
1952
+ "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
1953
+ "target_delimiter": " ",
1954
+ "fewshot_delimiter": "\n\n",
1955
+ "fewshot_config": {
1956
+ "sampler": "first_n"
1957
+ },
1958
+ "num_fewshot": 5,
1959
+ "metric_list": [
1960
+ {
1961
+ "metric": "acc",
1962
+ "aggregation": "mean",
1963
+ "higher_is_better": true
1964
+ }
1965
+ ],
1966
+ "output_type": "multiple_choice",
1967
+ "repeats": 1,
1968
+ "should_decontaminate": false,
1969
+ "metadata": {
1970
+ "version": 0.0
1971
+ }
1972
+ },
1973
+ "mmlu_miscellaneous": {
1974
+ "task": "mmlu_miscellaneous",
1975
+ "task_alias": "miscellaneous",
1976
+ "group": "mmlu_other",
1977
+ "group_alias": "other",
1978
+ "dataset_path": "hails/mmlu_no_train",
1979
+ "dataset_name": "miscellaneous",
1980
+ "test_split": "test",
1981
+ "fewshot_split": "dev",
1982
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1983
+ "doc_to_target": "answer",
1984
+ "doc_to_choice": [
1985
+ "A",
1986
+ "B",
1987
+ "C",
1988
+ "D"
1989
+ ],
1990
+ "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
1991
+ "target_delimiter": " ",
1992
+ "fewshot_delimiter": "\n\n",
1993
+ "fewshot_config": {
1994
+ "sampler": "first_n"
1995
+ },
1996
+ "num_fewshot": 5,
1997
+ "metric_list": [
1998
+ {
1999
+ "metric": "acc",
2000
+ "aggregation": "mean",
2001
+ "higher_is_better": true
2002
+ }
2003
+ ],
2004
+ "output_type": "multiple_choice",
2005
+ "repeats": 1,
2006
+ "should_decontaminate": false,
2007
+ "metadata": {
2008
+ "version": 0.0
2009
+ }
2010
+ },
2011
+ "mmlu_moral_disputes": {
2012
+ "task": "mmlu_moral_disputes",
2013
+ "task_alias": "moral_disputes",
2014
+ "group": "mmlu_humanities",
2015
+ "group_alias": "humanities",
2016
+ "dataset_path": "hails/mmlu_no_train",
2017
+ "dataset_name": "moral_disputes",
2018
+ "test_split": "test",
2019
+ "fewshot_split": "dev",
2020
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2021
+ "doc_to_target": "answer",
2022
+ "doc_to_choice": [
2023
+ "A",
2024
+ "B",
2025
+ "C",
2026
+ "D"
2027
+ ],
2028
+ "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
2029
+ "target_delimiter": " ",
2030
+ "fewshot_delimiter": "\n\n",
2031
+ "fewshot_config": {
2032
+ "sampler": "first_n"
2033
+ },
2034
+ "num_fewshot": 5,
2035
+ "metric_list": [
2036
+ {
2037
+ "metric": "acc",
2038
+ "aggregation": "mean",
2039
+ "higher_is_better": true
2040
+ }
2041
+ ],
2042
+ "output_type": "multiple_choice",
2043
+ "repeats": 1,
2044
+ "should_decontaminate": false,
2045
+ "metadata": {
2046
+ "version": 0.0
2047
+ }
2048
+ },
2049
+ "mmlu_moral_scenarios": {
2050
+ "task": "mmlu_moral_scenarios",
2051
+ "task_alias": "moral_scenarios",
2052
+ "group": "mmlu_humanities",
2053
+ "group_alias": "humanities",
2054
+ "dataset_path": "hails/mmlu_no_train",
2055
+ "dataset_name": "moral_scenarios",
2056
+ "test_split": "test",
2057
+ "fewshot_split": "dev",
2058
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2059
+ "doc_to_target": "answer",
2060
+ "doc_to_choice": [
2061
+ "A",
2062
+ "B",
2063
+ "C",
2064
+ "D"
2065
+ ],
2066
+ "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
2067
+ "target_delimiter": " ",
2068
+ "fewshot_delimiter": "\n\n",
2069
+ "fewshot_config": {
2070
+ "sampler": "first_n"
2071
+ },
2072
+ "num_fewshot": 5,
2073
+ "metric_list": [
2074
+ {
2075
+ "metric": "acc",
2076
+ "aggregation": "mean",
2077
+ "higher_is_better": true
2078
+ }
2079
+ ],
2080
+ "output_type": "multiple_choice",
2081
+ "repeats": 1,
2082
+ "should_decontaminate": false,
2083
+ "metadata": {
2084
+ "version": 0.0
2085
+ }
2086
+ },
2087
+ "mmlu_nutrition": {
2088
+ "task": "mmlu_nutrition",
2089
+ "task_alias": "nutrition",
2090
+ "group": "mmlu_other",
2091
+ "group_alias": "other",
2092
+ "dataset_path": "hails/mmlu_no_train",
2093
+ "dataset_name": "nutrition",
2094
+ "test_split": "test",
2095
+ "fewshot_split": "dev",
2096
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2097
+ "doc_to_target": "answer",
2098
+ "doc_to_choice": [
2099
+ "A",
2100
+ "B",
2101
+ "C",
2102
+ "D"
2103
+ ],
2104
+ "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
2105
+ "target_delimiter": " ",
2106
+ "fewshot_delimiter": "\n\n",
2107
+ "fewshot_config": {
2108
+ "sampler": "first_n"
2109
+ },
2110
+ "num_fewshot": 5,
2111
+ "metric_list": [
2112
+ {
2113
+ "metric": "acc",
2114
+ "aggregation": "mean",
2115
+ "higher_is_better": true
2116
+ }
2117
+ ],
2118
+ "output_type": "multiple_choice",
2119
+ "repeats": 1,
2120
+ "should_decontaminate": false,
2121
+ "metadata": {
2122
+ "version": 0.0
2123
+ }
2124
+ },
2125
+ "mmlu_philosophy": {
2126
+ "task": "mmlu_philosophy",
2127
+ "task_alias": "philosophy",
2128
+ "group": "mmlu_humanities",
2129
+ "group_alias": "humanities",
2130
+ "dataset_path": "hails/mmlu_no_train",
2131
+ "dataset_name": "philosophy",
2132
+ "test_split": "test",
2133
+ "fewshot_split": "dev",
2134
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2135
+ "doc_to_target": "answer",
2136
+ "doc_to_choice": [
2137
+ "A",
2138
+ "B",
2139
+ "C",
2140
+ "D"
2141
+ ],
2142
+ "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
2143
+ "target_delimiter": " ",
2144
+ "fewshot_delimiter": "\n\n",
2145
+ "fewshot_config": {
2146
+ "sampler": "first_n"
2147
+ },
2148
+ "num_fewshot": 5,
2149
+ "metric_list": [
2150
+ {
2151
+ "metric": "acc",
2152
+ "aggregation": "mean",
2153
+ "higher_is_better": true
2154
+ }
2155
+ ],
2156
+ "output_type": "multiple_choice",
2157
+ "repeats": 1,
2158
+ "should_decontaminate": false,
2159
+ "metadata": {
2160
+ "version": 0.0
2161
+ }
2162
+ },
2163
+ "mmlu_prehistory": {
2164
+ "task": "mmlu_prehistory",
2165
+ "task_alias": "prehistory",
2166
+ "group": "mmlu_humanities",
2167
+ "group_alias": "humanities",
2168
+ "dataset_path": "hails/mmlu_no_train",
2169
+ "dataset_name": "prehistory",
2170
+ "test_split": "test",
2171
+ "fewshot_split": "dev",
2172
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2173
+ "doc_to_target": "answer",
2174
+ "doc_to_choice": [
2175
+ "A",
2176
+ "B",
2177
+ "C",
2178
+ "D"
2179
+ ],
2180
+ "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
2181
+ "target_delimiter": " ",
2182
+ "fewshot_delimiter": "\n\n",
2183
+ "fewshot_config": {
2184
+ "sampler": "first_n"
2185
+ },
2186
+ "num_fewshot": 5,
2187
+ "metric_list": [
2188
+ {
2189
+ "metric": "acc",
2190
+ "aggregation": "mean",
2191
+ "higher_is_better": true
2192
+ }
2193
+ ],
2194
+ "output_type": "multiple_choice",
2195
+ "repeats": 1,
2196
+ "should_decontaminate": false,
2197
+ "metadata": {
2198
+ "version": 0.0
2199
+ }
2200
+ },
2201
+ "mmlu_professional_accounting": {
2202
+ "task": "mmlu_professional_accounting",
2203
+ "task_alias": "professional_accounting",
2204
+ "group": "mmlu_other",
2205
+ "group_alias": "other",
2206
+ "dataset_path": "hails/mmlu_no_train",
2207
+ "dataset_name": "professional_accounting",
2208
+ "test_split": "test",
2209
+ "fewshot_split": "dev",
2210
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2211
+ "doc_to_target": "answer",
2212
+ "doc_to_choice": [
2213
+ "A",
2214
+ "B",
2215
+ "C",
2216
+ "D"
2217
+ ],
2218
+ "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
2219
+ "target_delimiter": " ",
2220
+ "fewshot_delimiter": "\n\n",
2221
+ "fewshot_config": {
2222
+ "sampler": "first_n"
2223
+ },
2224
+ "num_fewshot": 5,
2225
+ "metric_list": [
2226
+ {
2227
+ "metric": "acc",
2228
+ "aggregation": "mean",
2229
+ "higher_is_better": true
2230
+ }
2231
+ ],
2232
+ "output_type": "multiple_choice",
2233
+ "repeats": 1,
2234
+ "should_decontaminate": false,
2235
+ "metadata": {
2236
+ "version": 0.0
2237
+ }
2238
+ },
2239
+ "mmlu_professional_law": {
2240
+ "task": "mmlu_professional_law",
2241
+ "task_alias": "professional_law",
2242
+ "group": "mmlu_humanities",
2243
+ "group_alias": "humanities",
2244
+ "dataset_path": "hails/mmlu_no_train",
2245
+ "dataset_name": "professional_law",
2246
+ "test_split": "test",
2247
+ "fewshot_split": "dev",
2248
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2249
+ "doc_to_target": "answer",
2250
+ "doc_to_choice": [
2251
+ "A",
2252
+ "B",
2253
+ "C",
2254
+ "D"
2255
+ ],
2256
+ "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
2257
+ "target_delimiter": " ",
2258
+ "fewshot_delimiter": "\n\n",
2259
+ "fewshot_config": {
2260
+ "sampler": "first_n"
2261
+ },
2262
+ "num_fewshot": 5,
2263
+ "metric_list": [
2264
+ {
2265
+ "metric": "acc",
2266
+ "aggregation": "mean",
2267
+ "higher_is_better": true
2268
+ }
2269
+ ],
2270
+ "output_type": "multiple_choice",
2271
+ "repeats": 1,
2272
+ "should_decontaminate": false,
2273
+ "metadata": {
2274
+ "version": 0.0
2275
+ }
2276
+ },
2277
+ "mmlu_professional_medicine": {
2278
+ "task": "mmlu_professional_medicine",
2279
+ "task_alias": "professional_medicine",
2280
+ "group": "mmlu_other",
2281
+ "group_alias": "other",
2282
+ "dataset_path": "hails/mmlu_no_train",
2283
+ "dataset_name": "professional_medicine",
2284
+ "test_split": "test",
2285
+ "fewshot_split": "dev",
2286
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2287
+ "doc_to_target": "answer",
2288
+ "doc_to_choice": [
2289
+ "A",
2290
+ "B",
2291
+ "C",
2292
+ "D"
2293
+ ],
2294
+ "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
2295
+ "target_delimiter": " ",
2296
+ "fewshot_delimiter": "\n\n",
2297
+ "fewshot_config": {
2298
+ "sampler": "first_n"
2299
+ },
2300
+ "num_fewshot": 5,
2301
+ "metric_list": [
2302
+ {
2303
+ "metric": "acc",
2304
+ "aggregation": "mean",
2305
+ "higher_is_better": true
2306
+ }
2307
+ ],
2308
+ "output_type": "multiple_choice",
2309
+ "repeats": 1,
2310
+ "should_decontaminate": false,
2311
+ "metadata": {
2312
+ "version": 0.0
2313
+ }
2314
+ },
2315
+ "mmlu_professional_psychology": {
2316
+ "task": "mmlu_professional_psychology",
2317
+ "task_alias": "professional_psychology",
2318
+ "group": "mmlu_social_sciences",
2319
+ "group_alias": "social_sciences",
2320
+ "dataset_path": "hails/mmlu_no_train",
2321
+ "dataset_name": "professional_psychology",
2322
+ "test_split": "test",
2323
+ "fewshot_split": "dev",
2324
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2325
+ "doc_to_target": "answer",
2326
+ "doc_to_choice": [
2327
+ "A",
2328
+ "B",
2329
+ "C",
2330
+ "D"
2331
+ ],
2332
+ "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
2333
+ "target_delimiter": " ",
2334
+ "fewshot_delimiter": "\n\n",
2335
+ "fewshot_config": {
2336
+ "sampler": "first_n"
2337
+ },
2338
+ "num_fewshot": 5,
2339
+ "metric_list": [
2340
+ {
2341
+ "metric": "acc",
2342
+ "aggregation": "mean",
2343
+ "higher_is_better": true
2344
+ }
2345
+ ],
2346
+ "output_type": "multiple_choice",
2347
+ "repeats": 1,
2348
+ "should_decontaminate": false,
2349
+ "metadata": {
2350
+ "version": 0.0
2351
+ }
2352
+ },
2353
+ "mmlu_public_relations": {
2354
+ "task": "mmlu_public_relations",
2355
+ "task_alias": "public_relations",
2356
+ "group": "mmlu_social_sciences",
2357
+ "group_alias": "social_sciences",
2358
+ "dataset_path": "hails/mmlu_no_train",
2359
+ "dataset_name": "public_relations",
2360
+ "test_split": "test",
2361
+ "fewshot_split": "dev",
2362
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2363
+ "doc_to_target": "answer",
2364
+ "doc_to_choice": [
2365
+ "A",
2366
+ "B",
2367
+ "C",
2368
+ "D"
2369
+ ],
2370
+ "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
2371
+ "target_delimiter": " ",
2372
+ "fewshot_delimiter": "\n\n",
2373
+ "fewshot_config": {
2374
+ "sampler": "first_n"
2375
+ },
2376
+ "num_fewshot": 5,
2377
+ "metric_list": [
2378
+ {
2379
+ "metric": "acc",
2380
+ "aggregation": "mean",
2381
+ "higher_is_better": true
2382
+ }
2383
+ ],
2384
+ "output_type": "multiple_choice",
2385
+ "repeats": 1,
2386
+ "should_decontaminate": false,
2387
+ "metadata": {
2388
+ "version": 0.0
2389
+ }
2390
+ },
2391
+ "mmlu_security_studies": {
2392
+ "task": "mmlu_security_studies",
2393
+ "task_alias": "security_studies",
2394
+ "group": "mmlu_social_sciences",
2395
+ "group_alias": "social_sciences",
2396
+ "dataset_path": "hails/mmlu_no_train",
2397
+ "dataset_name": "security_studies",
2398
+ "test_split": "test",
2399
+ "fewshot_split": "dev",
2400
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2401
+ "doc_to_target": "answer",
2402
+ "doc_to_choice": [
2403
+ "A",
2404
+ "B",
2405
+ "C",
2406
+ "D"
2407
+ ],
2408
+ "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
2409
+ "target_delimiter": " ",
2410
+ "fewshot_delimiter": "\n\n",
2411
+ "fewshot_config": {
2412
+ "sampler": "first_n"
2413
+ },
2414
+ "num_fewshot": 5,
2415
+ "metric_list": [
2416
+ {
2417
+ "metric": "acc",
2418
+ "aggregation": "mean",
2419
+ "higher_is_better": true
2420
+ }
2421
+ ],
2422
+ "output_type": "multiple_choice",
2423
+ "repeats": 1,
2424
+ "should_decontaminate": false,
2425
+ "metadata": {
2426
+ "version": 0.0
2427
+ }
2428
+ },
2429
+ "mmlu_sociology": {
2430
+ "task": "mmlu_sociology",
2431
+ "task_alias": "sociology",
2432
+ "group": "mmlu_social_sciences",
2433
+ "group_alias": "social_sciences",
2434
+ "dataset_path": "hails/mmlu_no_train",
2435
+ "dataset_name": "sociology",
2436
+ "test_split": "test",
2437
+ "fewshot_split": "dev",
2438
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2439
+ "doc_to_target": "answer",
2440
+ "doc_to_choice": [
2441
+ "A",
2442
+ "B",
2443
+ "C",
2444
+ "D"
2445
+ ],
2446
+ "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
2447
+ "target_delimiter": " ",
2448
+ "fewshot_delimiter": "\n\n",
2449
+ "fewshot_config": {
2450
+ "sampler": "first_n"
2451
+ },
2452
+ "num_fewshot": 5,
2453
+ "metric_list": [
2454
+ {
2455
+ "metric": "acc",
2456
+ "aggregation": "mean",
2457
+ "higher_is_better": true
2458
+ }
2459
+ ],
2460
+ "output_type": "multiple_choice",
2461
+ "repeats": 1,
2462
+ "should_decontaminate": false,
2463
+ "metadata": {
2464
+ "version": 0.0
2465
+ }
2466
+ },
2467
+ "mmlu_us_foreign_policy": {
2468
+ "task": "mmlu_us_foreign_policy",
2469
+ "task_alias": "us_foreign_policy",
2470
+ "group": "mmlu_social_sciences",
2471
+ "group_alias": "social_sciences",
2472
+ "dataset_path": "hails/mmlu_no_train",
2473
+ "dataset_name": "us_foreign_policy",
2474
+ "test_split": "test",
2475
+ "fewshot_split": "dev",
2476
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2477
+ "doc_to_target": "answer",
2478
+ "doc_to_choice": [
2479
+ "A",
2480
+ "B",
2481
+ "C",
2482
+ "D"
2483
+ ],
2484
+ "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
2485
+ "target_delimiter": " ",
2486
+ "fewshot_delimiter": "\n\n",
2487
+ "fewshot_config": {
2488
+ "sampler": "first_n"
2489
+ },
2490
+ "num_fewshot": 5,
2491
+ "metric_list": [
2492
+ {
2493
+ "metric": "acc",
2494
+ "aggregation": "mean",
2495
+ "higher_is_better": true
2496
+ }
2497
+ ],
2498
+ "output_type": "multiple_choice",
2499
+ "repeats": 1,
2500
+ "should_decontaminate": false,
2501
+ "metadata": {
2502
+ "version": 0.0
2503
+ }
2504
+ },
2505
+ "mmlu_virology": {
2506
+ "task": "mmlu_virology",
2507
+ "task_alias": "virology",
2508
+ "group": "mmlu_other",
2509
+ "group_alias": "other",
2510
+ "dataset_path": "hails/mmlu_no_train",
2511
+ "dataset_name": "virology",
2512
+ "test_split": "test",
2513
+ "fewshot_split": "dev",
2514
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2515
+ "doc_to_target": "answer",
2516
+ "doc_to_choice": [
2517
+ "A",
2518
+ "B",
2519
+ "C",
2520
+ "D"
2521
+ ],
2522
+ "description": "The following are multiple choice questions (with answers) about virology.\n\n",
2523
+ "target_delimiter": " ",
2524
+ "fewshot_delimiter": "\n\n",
2525
+ "fewshot_config": {
2526
+ "sampler": "first_n"
2527
+ },
2528
+ "num_fewshot": 5,
2529
+ "metric_list": [
2530
+ {
2531
+ "metric": "acc",
2532
+ "aggregation": "mean",
2533
+ "higher_is_better": true
2534
+ }
2535
+ ],
2536
+ "output_type": "multiple_choice",
2537
+ "repeats": 1,
2538
+ "should_decontaminate": false,
2539
+ "metadata": {
2540
+ "version": 0.0
2541
+ }
2542
+ },
2543
+ "mmlu_world_religions": {
2544
+ "task": "mmlu_world_religions",
2545
+ "task_alias": "world_religions",
2546
+ "group": "mmlu_humanities",
2547
+ "group_alias": "humanities",
2548
+ "dataset_path": "hails/mmlu_no_train",
2549
+ "dataset_name": "world_religions",
2550
+ "test_split": "test",
2551
+ "fewshot_split": "dev",
2552
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2553
+ "doc_to_target": "answer",
2554
+ "doc_to_choice": [
2555
+ "A",
2556
+ "B",
2557
+ "C",
2558
+ "D"
2559
+ ],
2560
+ "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
2561
+ "target_delimiter": " ",
2562
+ "fewshot_delimiter": "\n\n",
2563
+ "fewshot_config": {
2564
+ "sampler": "first_n"
2565
+ },
2566
+ "num_fewshot": 5,
2567
+ "metric_list": [
2568
+ {
2569
+ "metric": "acc",
2570
+ "aggregation": "mean",
2571
+ "higher_is_better": true
2572
+ }
2573
+ ],
2574
+ "output_type": "multiple_choice",
2575
+ "repeats": 1,
2576
+ "should_decontaminate": false,
2577
+ "metadata": {
2578
+ "version": 0.0
2579
+ }
2580
+ }
2581
+ },
2582
+ "versions": {
2583
+ "mmlu_abstract_algebra": 0.0,
2584
+ "mmlu_anatomy": 0.0,
2585
+ "mmlu_astronomy": 0.0,
2586
+ "mmlu_business_ethics": 0.0,
2587
+ "mmlu_clinical_knowledge": 0.0,
2588
+ "mmlu_college_biology": 0.0,
2589
+ "mmlu_college_chemistry": 0.0,
2590
+ "mmlu_college_computer_science": 0.0,
2591
+ "mmlu_college_mathematics": 0.0,
2592
+ "mmlu_college_medicine": 0.0,
2593
+ "mmlu_college_physics": 0.0,
2594
+ "mmlu_computer_security": 0.0,
2595
+ "mmlu_conceptual_physics": 0.0,
2596
+ "mmlu_econometrics": 0.0,
2597
+ "mmlu_electrical_engineering": 0.0,
2598
+ "mmlu_elementary_mathematics": 0.0,
2599
+ "mmlu_formal_logic": 0.0,
2600
+ "mmlu_global_facts": 0.0,
2601
+ "mmlu_high_school_biology": 0.0,
2602
+ "mmlu_high_school_chemistry": 0.0,
2603
+ "mmlu_high_school_computer_science": 0.0,
2604
+ "mmlu_high_school_european_history": 0.0,
2605
+ "mmlu_high_school_geography": 0.0,
2606
+ "mmlu_high_school_government_and_politics": 0.0,
2607
+ "mmlu_high_school_macroeconomics": 0.0,
2608
+ "mmlu_high_school_mathematics": 0.0,
2609
+ "mmlu_high_school_microeconomics": 0.0,
2610
+ "mmlu_high_school_physics": 0.0,
2611
+ "mmlu_high_school_psychology": 0.0,
2612
+ "mmlu_high_school_statistics": 0.0,
2613
+ "mmlu_high_school_us_history": 0.0,
2614
+ "mmlu_high_school_world_history": 0.0,
2615
+ "mmlu_human_aging": 0.0,
2616
+ "mmlu_human_sexuality": 0.0,
2617
+ "mmlu_international_law": 0.0,
2618
+ "mmlu_jurisprudence": 0.0,
2619
+ "mmlu_logical_fallacies": 0.0,
2620
+ "mmlu_machine_learning": 0.0,
2621
+ "mmlu_management": 0.0,
2622
+ "mmlu_marketing": 0.0,
2623
+ "mmlu_medical_genetics": 0.0,
2624
+ "mmlu_miscellaneous": 0.0,
2625
+ "mmlu_moral_disputes": 0.0,
2626
+ "mmlu_moral_scenarios": 0.0,
2627
+ "mmlu_nutrition": 0.0,
2628
+ "mmlu_philosophy": 0.0,
2629
+ "mmlu_prehistory": 0.0,
2630
+ "mmlu_professional_accounting": 0.0,
2631
+ "mmlu_professional_law": 0.0,
2632
+ "mmlu_professional_medicine": 0.0,
2633
+ "mmlu_professional_psychology": 0.0,
2634
+ "mmlu_public_relations": 0.0,
2635
+ "mmlu_security_studies": 0.0,
2636
+ "mmlu_sociology": 0.0,
2637
+ "mmlu_us_foreign_policy": 0.0,
2638
+ "mmlu_virology": 0.0,
2639
+ "mmlu_world_religions": 0.0
2640
+ },
2641
+ "n-shot": {
2642
+ "mmlu": 0,
2643
+ "mmlu_abstract_algebra": 5,
2644
+ "mmlu_anatomy": 5,
2645
+ "mmlu_astronomy": 5,
2646
+ "mmlu_business_ethics": 5,
2647
+ "mmlu_clinical_knowledge": 5,
2648
+ "mmlu_college_biology": 5,
2649
+ "mmlu_college_chemistry": 5,
2650
+ "mmlu_college_computer_science": 5,
2651
+ "mmlu_college_mathematics": 5,
2652
+ "mmlu_college_medicine": 5,
2653
+ "mmlu_college_physics": 5,
2654
+ "mmlu_computer_security": 5,
2655
+ "mmlu_conceptual_physics": 5,
2656
+ "mmlu_econometrics": 5,
2657
+ "mmlu_electrical_engineering": 5,
2658
+ "mmlu_elementary_mathematics": 5,
2659
+ "mmlu_formal_logic": 5,
2660
+ "mmlu_global_facts": 5,
2661
+ "mmlu_high_school_biology": 5,
2662
+ "mmlu_high_school_chemistry": 5,
2663
+ "mmlu_high_school_computer_science": 5,
2664
+ "mmlu_high_school_european_history": 5,
2665
+ "mmlu_high_school_geography": 5,
2666
+ "mmlu_high_school_government_and_politics": 5,
2667
+ "mmlu_high_school_macroeconomics": 5,
2668
+ "mmlu_high_school_mathematics": 5,
2669
+ "mmlu_high_school_microeconomics": 5,
2670
+ "mmlu_high_school_physics": 5,
2671
+ "mmlu_high_school_psychology": 5,
2672
+ "mmlu_high_school_statistics": 5,
2673
+ "mmlu_high_school_us_history": 5,
2674
+ "mmlu_high_school_world_history": 5,
2675
+ "mmlu_human_aging": 5,
2676
+ "mmlu_human_sexuality": 5,
2677
+ "mmlu_humanities": 5,
2678
+ "mmlu_international_law": 5,
2679
+ "mmlu_jurisprudence": 5,
2680
+ "mmlu_logical_fallacies": 5,
2681
+ "mmlu_machine_learning": 5,
2682
+ "mmlu_management": 5,
2683
+ "mmlu_marketing": 5,
2684
+ "mmlu_medical_genetics": 5,
2685
+ "mmlu_miscellaneous": 5,
2686
+ "mmlu_moral_disputes": 5,
2687
+ "mmlu_moral_scenarios": 5,
2688
+ "mmlu_nutrition": 5,
2689
+ "mmlu_other": 5,
2690
+ "mmlu_philosophy": 5,
2691
+ "mmlu_prehistory": 5,
2692
+ "mmlu_professional_accounting": 5,
2693
+ "mmlu_professional_law": 5,
2694
+ "mmlu_professional_medicine": 5,
2695
+ "mmlu_professional_psychology": 5,
2696
+ "mmlu_public_relations": 5,
2697
+ "mmlu_security_studies": 5,
2698
+ "mmlu_social_sciences": 5,
2699
+ "mmlu_sociology": 5,
2700
+ "mmlu_stem": 5,
2701
+ "mmlu_us_foreign_policy": 5,
2702
+ "mmlu_virology": 5,
2703
+ "mmlu_world_religions": 5
2704
+ },
2705
+ "higher_is_better": {
2706
+ "mmlu": {
2707
+ "acc": true
2708
+ },
2709
+ "mmlu_abstract_algebra": {
2710
+ "acc": true
2711
+ },
2712
+ "mmlu_anatomy": {
2713
+ "acc": true
2714
+ },
2715
+ "mmlu_astronomy": {
2716
+ "acc": true
2717
+ },
2718
+ "mmlu_business_ethics": {
2719
+ "acc": true
2720
+ },
2721
+ "mmlu_clinical_knowledge": {
2722
+ "acc": true
2723
+ },
2724
+ "mmlu_college_biology": {
2725
+ "acc": true
2726
+ },
2727
+ "mmlu_college_chemistry": {
2728
+ "acc": true
2729
+ },
2730
+ "mmlu_college_computer_science": {
2731
+ "acc": true
2732
+ },
2733
+ "mmlu_college_mathematics": {
2734
+ "acc": true
2735
+ },
2736
+ "mmlu_college_medicine": {
2737
+ "acc": true
2738
+ },
2739
+ "mmlu_college_physics": {
2740
+ "acc": true
2741
+ },
2742
+ "mmlu_computer_security": {
2743
+ "acc": true
2744
+ },
2745
+ "mmlu_conceptual_physics": {
2746
+ "acc": true
2747
+ },
2748
+ "mmlu_econometrics": {
2749
+ "acc": true
2750
+ },
2751
+ "mmlu_electrical_engineering": {
2752
+ "acc": true
2753
+ },
2754
+ "mmlu_elementary_mathematics": {
2755
+ "acc": true
2756
+ },
2757
+ "mmlu_formal_logic": {
2758
+ "acc": true
2759
+ },
2760
+ "mmlu_global_facts": {
2761
+ "acc": true
2762
+ },
2763
+ "mmlu_high_school_biology": {
2764
+ "acc": true
2765
+ },
2766
+ "mmlu_high_school_chemistry": {
2767
+ "acc": true
2768
+ },
2769
+ "mmlu_high_school_computer_science": {
2770
+ "acc": true
2771
+ },
2772
+ "mmlu_high_school_european_history": {
2773
+ "acc": true
2774
+ },
2775
+ "mmlu_high_school_geography": {
2776
+ "acc": true
2777
+ },
2778
+ "mmlu_high_school_government_and_politics": {
2779
+ "acc": true
2780
+ },
2781
+ "mmlu_high_school_macroeconomics": {
2782
+ "acc": true
2783
+ },
2784
+ "mmlu_high_school_mathematics": {
2785
+ "acc": true
2786
+ },
2787
+ "mmlu_high_school_microeconomics": {
2788
+ "acc": true
2789
+ },
2790
+ "mmlu_high_school_physics": {
2791
+ "acc": true
2792
+ },
2793
+ "mmlu_high_school_psychology": {
2794
+ "acc": true
2795
+ },
2796
+ "mmlu_high_school_statistics": {
2797
+ "acc": true
2798
+ },
2799
+ "mmlu_high_school_us_history": {
2800
+ "acc": true
2801
+ },
2802
+ "mmlu_high_school_world_history": {
2803
+ "acc": true
2804
+ },
2805
+ "mmlu_human_aging": {
2806
+ "acc": true
2807
+ },
2808
+ "mmlu_human_sexuality": {
2809
+ "acc": true
2810
+ },
2811
+ "mmlu_humanities": {
2812
+ "acc": true
2813
+ },
2814
+ "mmlu_international_law": {
2815
+ "acc": true
2816
+ },
2817
+ "mmlu_jurisprudence": {
2818
+ "acc": true
2819
+ },
2820
+ "mmlu_logical_fallacies": {
2821
+ "acc": true
2822
+ },
2823
+ "mmlu_machine_learning": {
2824
+ "acc": true
2825
+ },
2826
+ "mmlu_management": {
2827
+ "acc": true
2828
+ },
2829
+ "mmlu_marketing": {
2830
+ "acc": true
2831
+ },
2832
+ "mmlu_medical_genetics": {
2833
+ "acc": true
2834
+ },
2835
+ "mmlu_miscellaneous": {
2836
+ "acc": true
2837
+ },
2838
+ "mmlu_moral_disputes": {
2839
+ "acc": true
2840
+ },
2841
+ "mmlu_moral_scenarios": {
2842
+ "acc": true
2843
+ },
2844
+ "mmlu_nutrition": {
2845
+ "acc": true
2846
+ },
2847
+ "mmlu_other": {
2848
+ "acc": true
2849
+ },
2850
+ "mmlu_philosophy": {
2851
+ "acc": true
2852
+ },
2853
+ "mmlu_prehistory": {
2854
+ "acc": true
2855
+ },
2856
+ "mmlu_professional_accounting": {
2857
+ "acc": true
2858
+ },
2859
+ "mmlu_professional_law": {
2860
+ "acc": true
2861
+ },
2862
+ "mmlu_professional_medicine": {
2863
+ "acc": true
2864
+ },
2865
+ "mmlu_professional_psychology": {
2866
+ "acc": true
2867
+ },
2868
+ "mmlu_public_relations": {
2869
+ "acc": true
2870
+ },
2871
+ "mmlu_security_studies": {
2872
+ "acc": true
2873
+ },
2874
+ "mmlu_social_sciences": {
2875
+ "acc": true
2876
+ },
2877
+ "mmlu_sociology": {
2878
+ "acc": true
2879
+ },
2880
+ "mmlu_stem": {
2881
+ "acc": true
2882
+ },
2883
+ "mmlu_us_foreign_policy": {
2884
+ "acc": true
2885
+ },
2886
+ "mmlu_virology": {
2887
+ "acc": true
2888
+ },
2889
+ "mmlu_world_religions": {
2890
+ "acc": true
2891
+ }
2892
+ },
2893
+ "n-samples": {
2894
+ "mmlu_high_school_european_history": {
2895
+ "original": 165,
2896
+ "effective": 165
2897
+ },
2898
+ "mmlu_high_school_world_history": {
2899
+ "original": 237,
2900
+ "effective": 237
2901
+ },
2902
+ "mmlu_professional_law": {
2903
+ "original": 1534,
2904
+ "effective": 1534
2905
+ },
2906
+ "mmlu_logical_fallacies": {
2907
+ "original": 163,
2908
+ "effective": 163
2909
+ },
2910
+ "mmlu_high_school_us_history": {
2911
+ "original": 204,
2912
+ "effective": 204
2913
+ },
2914
+ "mmlu_world_religions": {
2915
+ "original": 171,
2916
+ "effective": 171
2917
+ },
2918
+ "mmlu_prehistory": {
2919
+ "original": 324,
2920
+ "effective": 324
2921
+ },
2922
+ "mmlu_jurisprudence": {
2923
+ "original": 108,
2924
+ "effective": 108
2925
+ },
2926
+ "mmlu_moral_scenarios": {
2927
+ "original": 895,
2928
+ "effective": 895
2929
+ },
2930
+ "mmlu_formal_logic": {
2931
+ "original": 126,
2932
+ "effective": 126
2933
+ },
2934
+ "mmlu_philosophy": {
2935
+ "original": 311,
2936
+ "effective": 311
2937
+ },
2938
+ "mmlu_international_law": {
2939
+ "original": 121,
2940
+ "effective": 121
2941
+ },
2942
+ "mmlu_moral_disputes": {
2943
+ "original": 346,
2944
+ "effective": 346
2945
+ },
2946
+ "mmlu_high_school_government_and_politics": {
2947
+ "original": 193,
2948
+ "effective": 193
2949
+ },
2950
+ "mmlu_human_sexuality": {
2951
+ "original": 131,
2952
+ "effective": 131
2953
+ },
2954
+ "mmlu_high_school_microeconomics": {
2955
+ "original": 238,
2956
+ "effective": 238
2957
+ },
2958
+ "mmlu_high_school_macroeconomics": {
2959
+ "original": 390,
2960
+ "effective": 390
2961
+ },
2962
+ "mmlu_public_relations": {
2963
+ "original": 110,
2964
+ "effective": 110
2965
+ },
2966
+ "mmlu_sociology": {
2967
+ "original": 201,
2968
+ "effective": 201
2969
+ },
2970
+ "mmlu_professional_psychology": {
2971
+ "original": 612,
2972
+ "effective": 612
2973
+ },
2974
+ "mmlu_high_school_psychology": {
2975
+ "original": 545,
2976
+ "effective": 545
2977
+ },
2978
+ "mmlu_econometrics": {
2979
+ "original": 114,
2980
+ "effective": 114
2981
+ },
2982
+ "mmlu_high_school_geography": {
2983
+ "original": 198,
2984
+ "effective": 198
2985
+ },
2986
+ "mmlu_us_foreign_policy": {
2987
+ "original": 100,
2988
+ "effective": 100
2989
+ },
2990
+ "mmlu_security_studies": {
2991
+ "original": 245,
2992
+ "effective": 245
2993
+ },
2994
+ "mmlu_business_ethics": {
2995
+ "original": 100,
2996
+ "effective": 100
2997
+ },
2998
+ "mmlu_virology": {
2999
+ "original": 166,
3000
+ "effective": 166
3001
+ },
3002
+ "mmlu_nutrition": {
3003
+ "original": 306,
3004
+ "effective": 306
3005
+ },
3006
+ "mmlu_management": {
3007
+ "original": 103,
3008
+ "effective": 103
3009
+ },
3010
+ "mmlu_clinical_knowledge": {
3011
+ "original": 265,
3012
+ "effective": 265
3013
+ },
3014
+ "mmlu_marketing": {
3015
+ "original": 234,
3016
+ "effective": 234
3017
+ },
3018
+ "mmlu_college_medicine": {
3019
+ "original": 173,
3020
+ "effective": 173
3021
+ },
3022
+ "mmlu_professional_medicine": {
3023
+ "original": 272,
3024
+ "effective": 272
3025
+ },
3026
+ "mmlu_medical_genetics": {
3027
+ "original": 100,
3028
+ "effective": 100
3029
+ },
3030
+ "mmlu_human_aging": {
3031
+ "original": 223,
3032
+ "effective": 223
3033
+ },
3034
+ "mmlu_professional_accounting": {
3035
+ "original": 282,
3036
+ "effective": 282
3037
+ },
3038
+ "mmlu_miscellaneous": {
3039
+ "original": 783,
3040
+ "effective": 783
3041
+ },
3042
+ "mmlu_global_facts": {
3043
+ "original": 100,
3044
+ "effective": 100
3045
+ },
3046
+ "mmlu_college_computer_science": {
3047
+ "original": 100,
3048
+ "effective": 100
3049
+ },
3050
+ "mmlu_high_school_physics": {
3051
+ "original": 151,
3052
+ "effective": 151
3053
+ },
3054
+ "mmlu_college_chemistry": {
3055
+ "original": 100,
3056
+ "effective": 100
3057
+ },
3058
+ "mmlu_college_biology": {
3059
+ "original": 144,
3060
+ "effective": 144
3061
+ },
3062
+ "mmlu_high_school_mathematics": {
3063
+ "original": 270,
3064
+ "effective": 270
3065
+ },
3066
+ "mmlu_high_school_computer_science": {
3067
+ "original": 100,
3068
+ "effective": 100
3069
+ },
3070
+ "mmlu_electrical_engineering": {
3071
+ "original": 145,
3072
+ "effective": 145
3073
+ },
3074
+ "mmlu_college_physics": {
3075
+ "original": 102,
3076
+ "effective": 102
3077
+ },
3078
+ "mmlu_anatomy": {
3079
+ "original": 135,
3080
+ "effective": 135
3081
+ },
3082
+ "mmlu_college_mathematics": {
3083
+ "original": 100,
3084
+ "effective": 100
3085
+ },
3086
+ "mmlu_elementary_mathematics": {
3087
+ "original": 378,
3088
+ "effective": 378
3089
+ },
3090
+ "mmlu_high_school_chemistry": {
3091
+ "original": 203,
3092
+ "effective": 203
3093
+ },
3094
+ "mmlu_machine_learning": {
3095
+ "original": 112,
3096
+ "effective": 112
3097
+ },
3098
+ "mmlu_abstract_algebra": {
3099
+ "original": 100,
3100
+ "effective": 100
3101
+ },
3102
+ "mmlu_astronomy": {
3103
+ "original": 152,
3104
+ "effective": 152
3105
+ },
3106
+ "mmlu_computer_security": {
3107
+ "original": 100,
3108
+ "effective": 100
3109
+ },
3110
+ "mmlu_high_school_biology": {
3111
+ "original": 310,
3112
+ "effective": 310
3113
+ },
3114
+ "mmlu_high_school_statistics": {
3115
+ "original": 216,
3116
+ "effective": 216
3117
+ },
3118
+ "mmlu_conceptual_physics": {
3119
+ "original": 235,
3120
+ "effective": 235
3121
+ }
3122
+ },
3123
+ "config": {
3124
+ "model": "vllm",
3125
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
3126
+ "batch_size": "auto",
3127
+ "batch_sizes": [],
3128
+ "device": "cuda",
3129
+ "use_cache": null,
3130
+ "limit": null,
3131
+ "bootstrap_iters": 100000,
3132
+ "gen_kwargs": null,
3133
+ "random_seed": 0,
3134
+ "numpy_seed": 1234,
3135
+ "torch_seed": 1234,
3136
+ "fewshot_seed": 1234
3137
+ },
3138
+ "git_hash": "f2843b2f",
3139
+ "date": 1717759668.7806425,
3140
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
3141
+ "transformers_version": "4.41.2",
3142
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
3143
+ "task_hashes": {},
3144
+ "model_source": "vllm",
3145
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
3146
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
3147
+ "system_instruction": null,
3148
+ "system_instruction_sha": null,
3149
+ "chat_template": null,
3150
+ "chat_template_sha": null,
3151
+ "start_time": 829948.992005701,
3152
+ "end_time": 847093.177875013,
3153
+ "total_evaluation_time_seconds": "17144.18586931203"
3154
+ }
model-00001-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6839dca978613fb1f71e04f8a1e1fc5276f785f96553ad401adc6b6e9ce1de
3
+ size 4907575664
model-00002-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1aa5f53050502784836c375b06acb5ba8b1ec590caab38ae15609938c216e53
3
+ size 4907601776
model-00003-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8e7be43e4d252398a894e85b5c6c34cb70904dd58f2ac23233e71990e1b2663
3
+ size 4907601776
model-00004-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51e2eeb8f54100450f4932e7bf6da41c76ca2a3ddddab7a36f2de6cfa1358372
3
+ size 4907601776
model-00005-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f3ac19f4bc32e03f50eab48c0b6231a3a68c127acc5ed0aa47d4a2af39ae47d
3
+ size 4907601776
model-00006-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5822e59664e8263b6bebcab8554e2799779db3f55358e1f746331ba0d103d154
3
+ size 4907601920
model-00007-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eba20fd06ef42fd5db554886ead700dd3fe3d25da39e8e0483180210dbd8ed0
3
+ size 4907601944
model-00008-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cbb189b8ea9f501c8b54fda421898626572619eba1fcfbc3516ae2e78e8b7f6
3
+ size 4907601944
model-00009-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:693f0167082cf3061ec78f473db2166e842ecd3032d2c8ebc00facfcbd9076ca
3
+ size 4907601944
model-00010-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3def56999aa6cc9b2c83fb2e5f291c8e422bed9964884111cc9d9de1472502b1
3
+ size 4907601944
model-00011-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93263bb445fff535352bc5b87403a9d7615bccd4a2cb1681948aac7bf07cd56b
3
+ size 4907601944
model-00012-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca248042a37e625718ff51d2e864a18d216ba7381248b9344a34b42253adcd9d
3
+ size 4907601944
model-00013-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aafd76541a14f31a88d673cd4b9b9bf84739abea72a1e21d1bfcc8ed7ecc1ed1
3
+ size 4907601944
model-00014-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a02e4327aef42bb0bf90baafc3b28f292a238b111307453669be7813a15c89b
3
+ size 4907601944
model-00015-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac3a24b909d4c61aea1b3f553f643daaef13d96ed7cfcb15219642da4e02b328
3
+ size 4907601944
model-00016-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6abfd72e6f11f526b02588fce71d598b5d562dd7551fe71246299a5de987f761
3
+ size 4907601944
model-00017-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52a131de3a977b6cfce3f6b07670e88d42f1cf34cf86a94b5f578e893aa2eacd
3
+ size 4907601944
model-00018-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3105f2bf9a02dd8eb70dae40688ae3c2f683a6c9eecc1ad1e08449806c274bb4
3
+ size 4907601944
model-00019-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5e3ad75a63414a39e5c4d4179ded0e7a56df3be9d8a8755e85fb7ccdd1228c2
3
+ size 4907601944
model-00020-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e50df8f92e2e1e8350244e6d7194040ae2a91f76493a5a86c359ed8e0f113e2
3
+ size 4907601944
model-00021-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df105cc17350040fa187e8af5bcb85ba2b33cdb38e7d4915e2e975d354a31e68
3
+ size 4970418128
model-00022-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1650386ee4277216b9fabb8f9c955fdc453a71b5d3f4c6b9970926a33dfe47bc
3
+ size 4995682048
model-00023-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9d79bf0e9b15af2d19700a2ff5a2ebee8a8d8e0b23c09fd26ea3843bd1d9b8
3
+ size 4970516552
model-00024-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:160786e8ff0ea635822dde189b32d99a0d530239e7d61fb39de0407af66d9a82
3
+ size 4907577160
model-00025-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ba4c19f664771f71e72eb8fc2154cb0604e10c0cfe70453fbfc237382a77c52
3
+ size 4907601944
model-00026-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a2c530281337e2943afac5b09834767a016d81a2c9317e7f69017582d85fac
3
+ size 4907601944
model-00027-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dbda58f7f2ffb2f754404877420ee52ffd9a64c20d5e556689a28cba43c8583
3
+ size 4907601944
model-00028-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96963db640a7ed3067c1cc302ac036d1e2463cc5c5e11c68927edb8972c550b3
3
+ size 4907601944
model-00029-of-00029.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbf9de7d52e3d43d4819c922000ce1992106b66391d289769f2dbe28ce641a0
3
+ size 3410141576
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "[INST]",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "[/INST]",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "[TOOL_CALLS]",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "[AVAILABLE_TOOLS]",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "[/AVAILABLE_TOOLS]",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "8": {
70
+ "content": "[TOOL_RESULTS]",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "9": {
78
+ "content": "[/TOOL_RESULTS]",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ }
85
+ },
86
+ "additional_special_tokens": [],
87
+ "bos_token": "<s>",
88
+ "chat_template": [
89
+ {
90
+ "name": "default",
91
+ "template": "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
92
+ },
93
+ {
94
+ "name": "tool_use",
95
+ "template": "{{bos_token}}{% set user_messages = messages | selectattr('role', 'equalto', 'user') | list %}{% for message in messages %}{% if message['role'] == 'user' %}{% if message == user_messages[-1] %}{% if tools %}{{'[AVAILABLE_TOOLS]'+ tools|string + '[/AVAILABLE_TOOLS]'}}{% endif %}{{ '[INST]' + message['content'] + '[/INST]' }}{% else %}{{ '[INST]' + message['content'] + '[/INST]' }}{% endif %}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% elif message['role'] == 'tool_results' %}{{'[TOOL_RESULTS]' + message['content']|string + '[/TOOL_RESULTS]'}}{% elif message['role'] == 'tool_calls' %}{{'[TOOL_CALLS]' + message['content']|string + eos_token}}{% endif %}{% endfor %}"
96
+ }
97
+ ],
98
+ "clean_up_tokenization_spaces": false,
99
+ "eos_token": "</s>",
100
+ "legacy": true,
101
+ "model_max_length": 1000000000000000019884624838656,
102
+ "pad_token": null,
103
+ "sp_model_kwargs": {},
104
+ "spaces_between_special_tokens": false,
105
+ "tokenizer_class": "LlamaTokenizer",
106
+ "unk_token": "<unk>",
107
+ "use_default_system_prompt": false
108
+ }
truthfulqa_mc2/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-32-59.669961.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "truthfulqa_mc2": {
4
+ "acc,none": 0.6573768835074595,
5
+ "acc_stderr,none": 0.014893303818525347,
6
+ "alias": "truthfulqa_mc2"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "truthfulqa_mc2": []
11
+ },
12
+ "configs": {
13
+ "truthfulqa_mc2": {
14
+ "task": "truthfulqa_mc2",
15
+ "group": [
16
+ "truthfulqa"
17
+ ],
18
+ "dataset_path": "truthful_qa",
19
+ "dataset_name": "multiple_choice",
20
+ "validation_split": "validation",
21
+ "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22
+ "doc_to_target": 0,
23
+ "doc_to_choice": "{{mc2_targets.choices}}",
24
+ "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n",
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "num_fewshot": 0,
29
+ "metric_list": [
30
+ {
31
+ "metric": "acc",
32
+ "aggregation": "mean",
33
+ "higher_is_better": true
34
+ }
35
+ ],
36
+ "output_type": "multiple_choice",
37
+ "repeats": 1,
38
+ "should_decontaminate": true,
39
+ "doc_to_decontamination_query": "question",
40
+ "metadata": {
41
+ "version": 2.0
42
+ }
43
+ }
44
+ },
45
+ "versions": {
46
+ "truthfulqa_mc2": 2.0
47
+ },
48
+ "n-shot": {
49
+ "truthfulqa_mc2": 0
50
+ },
51
+ "higher_is_better": {
52
+ "truthfulqa_mc2": {
53
+ "acc": true
54
+ }
55
+ },
56
+ "n-samples": {
57
+ "truthfulqa_mc2": {
58
+ "original": 817,
59
+ "effective": 817
60
+ }
61
+ },
62
+ "config": {
63
+ "model": "vllm",
64
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
65
+ "batch_size": "auto",
66
+ "batch_sizes": [],
67
+ "device": "cuda",
68
+ "use_cache": null,
69
+ "limit": null,
70
+ "bootstrap_iters": 100000,
71
+ "gen_kwargs": null,
72
+ "random_seed": 0,
73
+ "numpy_seed": 1234,
74
+ "torch_seed": 1234,
75
+ "fewshot_seed": 1234
76
+ },
77
+ "git_hash": "f2843b2f",
78
+ "date": 1717745129.2305892,
79
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
80
+ "transformers_version": "4.41.2",
81
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
82
+ "task_hashes": {},
83
+ "model_source": "vllm",
84
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
85
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
86
+ "system_instruction": null,
87
+ "system_instruction_sha": null,
88
+ "chat_template": null,
89
+ "chat_template_sha": null,
90
+ "start_time": 815407.709750546,
91
+ "end_time": 815864.382636194,
92
+ "total_evaluation_time_seconds": "456.6728856479749"
93
+ }
winogrande/__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8/results_2024-06-07T07-25-09.600505.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "winogrande": {
4
+ "acc,none": 0.8263614838200474,
5
+ "acc_stderr,none": 0.010646116480331012,
6
+ "alias": "winogrande"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "winogrande": []
11
+ },
12
+ "configs": {
13
+ "winogrande": {
14
+ "task": "winogrande",
15
+ "dataset_path": "winogrande",
16
+ "dataset_name": "winogrande_xl",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n",
20
+ "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n",
21
+ "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "acc",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ }
32
+ ],
33
+ "output_type": "multiple_choice",
34
+ "repeats": 1,
35
+ "should_decontaminate": true,
36
+ "doc_to_decontamination_query": "sentence",
37
+ "metadata": {
38
+ "version": 1.0
39
+ }
40
+ }
41
+ },
42
+ "versions": {
43
+ "winogrande": 1.0
44
+ },
45
+ "n-shot": {
46
+ "winogrande": 5
47
+ },
48
+ "higher_is_better": {
49
+ "winogrande": {
50
+ "acc": true
51
+ }
52
+ },
53
+ "n-samples": {
54
+ "winogrande": {
55
+ "original": 1267,
56
+ "effective": 1267
57
+ }
58
+ },
59
+ "config": {
60
+ "model": "vllm",
61
+ "model_args": "pretrained=/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8,tensor_parallel_size=4,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.8,data_parallel_size=1",
62
+ "batch_size": "auto",
63
+ "batch_sizes": [],
64
+ "device": "cuda",
65
+ "use_cache": null,
66
+ "limit": null,
67
+ "bootstrap_iters": 100000,
68
+ "gen_kwargs": null,
69
+ "random_seed": 0,
70
+ "numpy_seed": 1234,
71
+ "torch_seed": 1234,
72
+ "fewshot_seed": 1234
73
+ },
74
+ "git_hash": "f2843b2f",
75
+ "date": 1717744924.8628974,
76
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.3\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.19.0-1010-nvidia-lowlatency-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.5.40\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H100 NVL\nGPU 1: NVIDIA H100 NVL\nGPU 2: NVIDIA H100 NVL\nGPU 3: NVIDIA H100 NVL\nGPU 4: NVIDIA H100 NVL\nGPU 5: NVIDIA H100 NVL\nGPU 6: NVIDIA H100 NVL\nGPU 7: NVIDIA H100 NVL\n\nNvidia driver version: 555.42.02\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 144\nOn-line CPU(s) list: 0-143\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8452Y\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 36\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2001.0000\nCPU min MHz: 800.0000\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single intel_ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local split_lock_detect avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hfi avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear serialize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3.4 MiB (72 instances)\nL1i cache: 2.3 MiB (72 instances)\nL2 cache: 144 MiB (72 instances)\nL3 cache: 135 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-35,72-107\nNUMA node1 CPU(s): 36-71,108-143\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
77
+ "transformers_version": "4.41.2",
78
+ "upper_git_hash": "f2843b2fd64df799179808ce2428b7a8dbc403de",
79
+ "task_hashes": {},
80
+ "model_source": "vllm",
81
+ "model_name": "/home/mlr/models/Mixtral-8x22B-Instruct-v0.1-FP8",
82
+ "model_name_sanitized": "__home__mlr__models__Mixtral-8x22B-Instruct-v0.1-FP8",
83
+ "system_instruction": null,
84
+ "system_instruction_sha": null,
85
+ "chat_template": null,
86
+ "chat_template_sha": null,
87
+ "start_time": 815203.282574388,
88
+ "end_time": 815394.313211667,
89
+ "total_evaluation_time_seconds": "191.03063727903645"
90
+ }