abhinavnmagic committed on
Commit
4debe3d
1 Parent(s): 3565ac6

Upload folder using huggingface_hub

Files changed (21)
  1. added_tokens.json +5 -0
  2. arc_challenge-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-07-01T03-38-50.160777.json +103 -0
  3. config.json +41 -0
  4. gsm8k-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T02-29-25.547540.json +139 -0
  5. hellaswag-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T01-25-24.138070.json +101 -0
  6. merges.txt +0 -0
  7. mmlu-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T02-27-23.641279.json +3155 -0
  8. model.safetensors +3 -0
  9. quantize_config.json +13 -0
  10. special_tokens_map.json +20 -0
  11. tokenizer.json +0 -0
  12. tokenizer_config.json +43 -0
  13. truthfulqa_mc2-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T00-43-07.648332.json +94 -0
  14. vllm-clearml/results_2024-06-29T00-41-08.891349.json +91 -0
  15. vllm-clearml/results_2024-06-29T00-43-07.648332.json +94 -0
  16. vllm-clearml/results_2024-06-29T01-25-24.138070.json +101 -0
  17. vllm-clearml/results_2024-06-29T02-27-23.641279.json +3155 -0
  18. vllm-clearml/results_2024-06-29T02-29-25.547540.json +139 -0
  19. vllm-clearml/results_2024-07-01T03-38-50.160777.json +103 -0
  20. vocab.json +0 -0
  21. winogrande-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T00-41-08.891349.json +91 -0
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644
+ }
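
The three entries above are the Qwen2 chat-control tokens. A minimal sketch (assuming a local checkout of this repository and the `transformers` library) to confirm that the shipped tokenizer resolves them to the listed ids:

```python
# Hedged sketch: verify the ids in added_tokens.json against the tokenizer
# shipped in this commit. "." is an assumed path to a local checkout.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
for token, expected_id in {
    "<|endoftext|>": 151643,
    "<|im_end|>": 151645,
    "<|im_start|>": 151644,
}.items():
    assert tok.convert_tokens_to_ids(token) == expected_id, token
```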
arc_challenge-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-07-01T03-38-50.160777.json ADDED
@@ -0,0 +1,103 @@
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.3967576791808874,
5
+ "acc_stderr,none": 0.014296513020180639,
6
+ "acc_norm,none": 0.4180887372013652,
7
+ "acc_norm_stderr,none": 0.014413988396996074,
8
+ "alias": "arc_challenge"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_challenge": []
13
+ },
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "group": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Challenge",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "doc_to_choice": "{{choices.text}}",
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 25,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48
+ "metadata": {
49
+ "version": 1.0
50
+ }
51
+ }
52
+ },
53
+ "versions": {
54
+ "arc_challenge": 1.0
55
+ },
56
+ "n-shot": {
57
+ "arc_challenge": 25
58
+ },
59
+ "higher_is_better": {
60
+ "arc_challenge": {
61
+ "acc": true,
62
+ "acc_norm": true
63
+ }
64
+ },
65
+ "n-samples": {
66
+ "arc_challenge": {
67
+ "original": 1172,
68
+ "effective": 1172
69
+ }
70
+ },
71
+ "config": {
72
+ "model": "vllm",
73
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
74
+ "batch_size": "auto",
75
+ "batch_sizes": [],
76
+ "device": "cuda",
77
+ "use_cache": null,
78
+ "limit": null,
79
+ "bootstrap_iters": 100000,
80
+ "gen_kwargs": null,
81
+ "random_seed": 0,
82
+ "numpy_seed": 1234,
83
+ "torch_seed": 1234,
84
+ "fewshot_seed": 1234
85
+ },
86
+ "git_hash": null,
87
+ "date": 1719804732.5263278,
88
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
89
+ "transformers_version": "4.41.2",
90
+ "upper_git_hash": null,
91
+ "task_hashes": {},
92
+ "model_source": "vllm",
93
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
94
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
95
+ "system_instruction": null,
96
+ "system_instruction_sha": null,
97
+ "fewshot_as_multiturn": false,
98
+ "chat_template": null,
99
+ "chat_template_sha": null,
100
+ "start_time": 7818419.641077244,
101
+ "end_time": 7818822.513901013,
102
+ "total_evaluation_time_seconds": "402.87282376922667"
103
+ }
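
The `config` block above records how this score was produced: lm-evaluation-harness on the vLLM backend, 25-shot, `batch_size=auto`. A hedged sketch of an equivalent run through the harness's Python API; the `pretrained` path is copied verbatim from `model_args` and has to be replaced with a local copy of this model:

```python
# Hedged sketch: rerun the 25-shot ARC-Challenge evaluation recorded above
# with lm-evaluation-harness + vLLM. Adjust the pretrained path to your setup.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/"
        "Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,"
        "tensor_parallel_size=2,dtype=auto,add_bos_token=True,"
        "gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096"
    ),
    tasks=["arc_challenge"],
    num_fewshot=25,
    batch_size="auto",
)
print(results["results"]["arc_challenge"])  # acc / acc_norm as stored in this file
```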
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 1536,
+ "initializer_range": 0.02,
+ "intermediate_size": 8960,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 2,
+ "quantization_config": {
+ "bits": 4,
+ "checkpoint_format": "gptq",
+ "damp_percent": 0.1,
+ "desc_act": true,
+ "group_size": 128,
+ "model_file_base_name": "model",
+ "model_name_or_path": null,
+ "quant_method": "gptq",
+ "static_groups": false,
+ "sym": true,
+ "true_sequential": true
+ },
+ "rms_norm_eps": 1e-06,
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": true,
+ "torch_dtype": "float16",
+ "transformers_version": "4.41.2",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
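
The config above describes a 4-bit GPTQ checkpoint (group_size 128, desc_act, symmetric) of Qwen2-1.5B-Instruct. A minimal loading sketch, assuming a local checkout and an installed GPTQ backend (e.g. auto-gptq or gptqmodel via optimum) so `transformers` can handle the quantized weights:

```python
# Hedged sketch: load the GPTQ checkpoint described by config.json and run a
# short chat generation. "." is an assumed path to a local checkout.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
model = AutoModelForCausalLM.from_pretrained(
    ".", torch_dtype=torch.float16, device_map="auto"
)

prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Say hello."}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tok(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))
```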
gsm8k-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T02-29-25.547540.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "exact_match,strict-match": 0.5526914329037149,
5
+ "exact_match_stderr,strict-match": 0.013695795709089896,
6
+ "exact_match,flexible-extract": 0.5534495830174374,
7
+ "exact_match_stderr,flexible-extract": 0.013693566549743146,
8
+ "alias": "gsm8k"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k": []
13
+ },
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "group": [
18
+ "math_word_problems"
19
+ ],
20
+ "dataset_path": "gsm8k",
21
+ "dataset_name": "main",
22
+ "training_split": "train",
23
+ "test_split": "test",
24
+ "fewshot_split": "train",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{answer}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 5,
31
+ "metric_list": [
32
+ {
33
+ "metric": "exact_match",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true,
36
+ "ignore_case": true,
37
+ "ignore_punctuation": false,
38
+ "regexes_to_ignore": [
39
+ ",",
40
+ "\\$",
41
+ "(?s).*#### ",
42
+ "\\.$"
43
+ ]
44
+ }
45
+ ],
46
+ "output_type": "generate_until",
47
+ "generation_kwargs": {
48
+ "until": [
49
+ "Question:",
50
+ "</s>",
51
+ "<|im_end|>"
52
+ ],
53
+ "do_sample": false,
54
+ "temperature": 0.0
55
+ },
56
+ "repeats": 1,
57
+ "filter_list": [
58
+ {
59
+ "name": "strict-match",
60
+ "filter": [
61
+ {
62
+ "function": "regex",
63
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
64
+ },
65
+ {
66
+ "function": "take_first"
67
+ }
68
+ ]
69
+ },
70
+ {
71
+ "name": "flexible-extract",
72
+ "filter": [
73
+ {
74
+ "function": "regex",
75
+ "group_select": -1,
76
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
77
+ },
78
+ {
79
+ "function": "take_first"
80
+ }
81
+ ]
82
+ }
83
+ ],
84
+ "should_decontaminate": false,
85
+ "metadata": {
86
+ "version": 3.0
87
+ }
88
+ }
89
+ },
90
+ "versions": {
91
+ "gsm8k": 3.0
92
+ },
93
+ "n-shot": {
94
+ "gsm8k": 5
95
+ },
96
+ "higher_is_better": {
97
+ "gsm8k": {
98
+ "exact_match": true
99
+ }
100
+ },
101
+ "n-samples": {
102
+ "gsm8k": {
103
+ "original": 1319,
104
+ "effective": 1319
105
+ }
106
+ },
107
+ "config": {
108
+ "model": "vllm",
109
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
110
+ "batch_size": "auto",
111
+ "batch_sizes": [],
112
+ "device": "cuda",
113
+ "use_cache": null,
114
+ "limit": null,
115
+ "bootstrap_iters": 100000,
116
+ "gen_kwargs": null,
117
+ "random_seed": 0,
118
+ "numpy_seed": 1234,
119
+ "torch_seed": 1234,
120
+ "fewshot_seed": 1234
121
+ },
122
+ "git_hash": null,
123
+ "date": 1719628061.99107,
124
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
125
+ "transformers_version": "4.41.2",
126
+ "upper_git_hash": null,
127
+ "task_hashes": {},
128
+ "model_source": "vllm",
129
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
130
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
131
+ "system_instruction": null,
132
+ "system_instruction_sha": null,
133
+ "fewshot_as_multiturn": false,
134
+ "chat_template": null,
135
+ "chat_template_sha": null,
136
+ "start_time": 7641749.132861241,
137
+ "end_time": 7641857.900817806,
138
+ "total_evaluation_time_seconds": "108.76795656513423"
139
+ }
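
The `strict-match` filter in the GSM8K config above extracts the final answer with a regex followed by `take_first`. A small sketch of that extraction step, reusing `regex_pattern` verbatim; the sample generation is made up for illustration:

```python
# Hedged sketch of the "strict-match" GSM8K filter defined above: collect all
# "#### <number>" groups from a generation and keep the first one.
import re

strict_match = re.compile(r"#### (\-?[0-9\.\,]+)")

generation = "Natalia sold 48 + 24 = 72 clips in total.\n#### 72"  # made-up output
matches = strict_match.findall(generation)
print(matches[0] if matches else "[invalid]")  # -> "72"
```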
hellaswag-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T01-25-24.138070.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "acc,none": 0.47988448516231824,
5
+ "acc_stderr,none": 0.004985741706385721,
6
+ "acc_norm,none": 0.6526588329018124,
7
+ "acc_norm_stderr,none": 0.0047515221274184805,
8
+ "alias": "hellaswag"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "group": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
24
+ "doc_to_text": "{{query}}",
25
+ "doc_to_target": "{{label}}",
26
+ "doc_to_choice": "choices",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 10,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": false,
46
+ "metadata": {
47
+ "version": 1.0
48
+ }
49
+ }
50
+ },
51
+ "versions": {
52
+ "hellaswag": 1.0
53
+ },
54
+ "n-shot": {
55
+ "hellaswag": 10
56
+ },
57
+ "higher_is_better": {
58
+ "hellaswag": {
59
+ "acc": true,
60
+ "acc_norm": true
61
+ }
62
+ },
63
+ "n-samples": {
64
+ "hellaswag": {
65
+ "original": 10042,
66
+ "effective": 10042
67
+ }
68
+ },
69
+ "config": {
70
+ "model": "vllm",
71
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
72
+ "batch_size": "auto",
73
+ "batch_sizes": [],
74
+ "device": "cuda",
75
+ "use_cache": null,
76
+ "limit": null,
77
+ "bootstrap_iters": 100000,
78
+ "gen_kwargs": null,
79
+ "random_seed": 0,
80
+ "numpy_seed": 1234,
81
+ "torch_seed": 1234,
82
+ "fewshot_seed": 1234
83
+ },
84
+ "git_hash": null,
85
+ "date": 1719621807.2691455,
86
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
87
+ "transformers_version": "4.41.2",
88
+ "upper_git_hash": null,
89
+ "task_hashes": {},
90
+ "model_source": "vllm",
91
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
92
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
93
+ "system_instruction": null,
94
+ "system_instruction_sha": null,
95
+ "fewshot_as_multiturn": false,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 7635492.847221878,
99
+ "end_time": 7638016.491339342,
100
+ "total_evaluation_time_seconds": "2523.64411746338"
101
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mmlu-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T02-27-23.641279.json ADDED
@@ -0,0 +1,3155 @@
1
+ {
2
+ "results": {
3
+ "mmlu": {
4
+ "acc,none": 0.5438683948155534,
5
+ "acc_stderr,none": 0.004046605401658642,
6
+ "alias": "mmlu"
7
+ },
8
+ "mmlu_humanities": {
9
+ "alias": " - humanities",
10
+ "acc,none": 0.4973432518597237,
11
+ "acc_stderr,none": 0.0069921729107274965
12
+ },
13
+ "mmlu_formal_logic": {
14
+ "alias": " - formal_logic",
15
+ "acc,none": 0.36507936507936506,
16
+ "acc_stderr,none": 0.04306241259127153
17
+ },
18
+ "mmlu_high_school_european_history": {
19
+ "alias": " - high_school_european_history",
20
+ "acc,none": 0.6484848484848484,
21
+ "acc_stderr,none": 0.0372820699868265
22
+ },
23
+ "mmlu_high_school_us_history": {
24
+ "alias": " - high_school_us_history",
25
+ "acc,none": 0.6764705882352942,
26
+ "acc_stderr,none": 0.03283472056108561
27
+ },
28
+ "mmlu_high_school_world_history": {
29
+ "alias": " - high_school_world_history",
30
+ "acc,none": 0.7088607594936709,
31
+ "acc_stderr,none": 0.029571601065753374
32
+ },
33
+ "mmlu_international_law": {
34
+ "alias": " - international_law",
35
+ "acc,none": 0.6942148760330579,
36
+ "acc_stderr,none": 0.04205953933884124
37
+ },
38
+ "mmlu_jurisprudence": {
39
+ "alias": " - jurisprudence",
40
+ "acc,none": 0.6851851851851852,
41
+ "acc_stderr,none": 0.04489931073591312
42
+ },
43
+ "mmlu_logical_fallacies": {
44
+ "alias": " - logical_fallacies",
45
+ "acc,none": 0.7116564417177914,
46
+ "acc_stderr,none": 0.035590395316173425
47
+ },
48
+ "mmlu_moral_disputes": {
49
+ "alias": " - moral_disputes",
50
+ "acc,none": 0.630057803468208,
51
+ "acc_stderr,none": 0.025992472029306383
52
+ },
53
+ "mmlu_moral_scenarios": {
54
+ "alias": " - moral_scenarios",
55
+ "acc,none": 0.3195530726256983,
56
+ "acc_stderr,none": 0.015595520294147402
57
+ },
58
+ "mmlu_philosophy": {
59
+ "alias": " - philosophy",
60
+ "acc,none": 0.594855305466238,
61
+ "acc_stderr,none": 0.027882383791325953
62
+ },
63
+ "mmlu_prehistory": {
64
+ "alias": " - prehistory",
65
+ "acc,none": 0.5524691358024691,
66
+ "acc_stderr,none": 0.027667138569422715
67
+ },
68
+ "mmlu_professional_law": {
69
+ "alias": " - professional_law",
70
+ "acc,none": 0.4048239895697523,
71
+ "acc_stderr,none": 0.012536743830953994
72
+ },
73
+ "mmlu_world_religions": {
74
+ "alias": " - world_religions",
75
+ "acc,none": 0.6900584795321637,
76
+ "acc_stderr,none": 0.035469769593931624
77
+ },
78
+ "mmlu_other": {
79
+ "alias": " - other",
80
+ "acc,none": 0.5909237206308336,
81
+ "acc_stderr,none": 0.008549729756636936
82
+ },
83
+ "mmlu_business_ethics": {
84
+ "alias": " - business_ethics",
85
+ "acc,none": 0.61,
86
+ "acc_stderr,none": 0.04902071300001974
87
+ },
88
+ "mmlu_clinical_knowledge": {
89
+ "alias": " - clinical_knowledge",
90
+ "acc,none": 0.5735849056603773,
91
+ "acc_stderr,none": 0.030437794342983042
92
+ },
93
+ "mmlu_college_medicine": {
94
+ "alias": " - college_medicine",
95
+ "acc,none": 0.49710982658959535,
96
+ "acc_stderr,none": 0.038124005659748335
97
+ },
98
+ "mmlu_global_facts": {
99
+ "alias": " - global_facts",
100
+ "acc,none": 0.34,
101
+ "acc_stderr,none": 0.04760952285695235
102
+ },
103
+ "mmlu_human_aging": {
104
+ "alias": " - human_aging",
105
+ "acc,none": 0.5964125560538116,
106
+ "acc_stderr,none": 0.032928028193303135
107
+ },
108
+ "mmlu_management": {
109
+ "alias": " - management",
110
+ "acc,none": 0.7669902912621359,
111
+ "acc_stderr,none": 0.04185832598928315
112
+ },
113
+ "mmlu_marketing": {
114
+ "alias": " - marketing",
115
+ "acc,none": 0.8076923076923077,
116
+ "acc_stderr,none": 0.025819233256483727
117
+ },
118
+ "mmlu_medical_genetics": {
119
+ "alias": " - medical_genetics",
120
+ "acc,none": 0.56,
121
+ "acc_stderr,none": 0.04988876515698589
122
+ },
123
+ "mmlu_miscellaneous": {
124
+ "alias": " - miscellaneous",
125
+ "acc,none": 0.6845466155810983,
126
+ "acc_stderr,none": 0.016617501738763397
127
+ },
128
+ "mmlu_nutrition": {
129
+ "alias": " - nutrition",
130
+ "acc,none": 0.6503267973856209,
131
+ "acc_stderr,none": 0.0273053080762747
132
+ },
133
+ "mmlu_professional_accounting": {
134
+ "alias": " - professional_accounting",
135
+ "acc,none": 0.42907801418439717,
136
+ "acc_stderr,none": 0.02952591430255855
137
+ },
138
+ "mmlu_professional_medicine": {
139
+ "alias": " - professional_medicine",
140
+ "acc,none": 0.4338235294117647,
141
+ "acc_stderr,none": 0.03010563657001664
142
+ },
143
+ "mmlu_virology": {
144
+ "alias": " - virology",
145
+ "acc,none": 0.43373493975903615,
146
+ "acc_stderr,none": 0.03858158940685515
147
+ },
148
+ "mmlu_social_sciences": {
149
+ "alias": " - social_sciences",
150
+ "acc,none": 0.6360090997725056,
151
+ "acc_stderr,none": 0.008459352068826637
152
+ },
153
+ "mmlu_econometrics": {
154
+ "alias": " - econometrics",
155
+ "acc,none": 0.34210526315789475,
156
+ "acc_stderr,none": 0.04462917535336936
157
+ },
158
+ "mmlu_high_school_geography": {
159
+ "alias": " - high_school_geography",
160
+ "acc,none": 0.7272727272727273,
161
+ "acc_stderr,none": 0.03173071239071724
162
+ },
163
+ "mmlu_high_school_government_and_politics": {
164
+ "alias": " - high_school_government_and_politics",
165
+ "acc,none": 0.7979274611398963,
166
+ "acc_stderr,none": 0.02897908979429673
167
+ },
168
+ "mmlu_high_school_macroeconomics": {
169
+ "alias": " - high_school_macroeconomics",
170
+ "acc,none": 0.5384615384615384,
171
+ "acc_stderr,none": 0.025275892070240644
172
+ },
173
+ "mmlu_high_school_microeconomics": {
174
+ "alias": " - high_school_microeconomics",
175
+ "acc,none": 0.6176470588235294,
176
+ "acc_stderr,none": 0.03156663099215416
177
+ },
178
+ "mmlu_high_school_psychology": {
179
+ "alias": " - high_school_psychology",
180
+ "acc,none": 0.7321100917431193,
181
+ "acc_stderr,none": 0.018987462257978652
182
+ },
183
+ "mmlu_human_sexuality": {
184
+ "alias": " - human_sexuality",
185
+ "acc,none": 0.6412213740458015,
186
+ "acc_stderr,none": 0.04206739313864908
187
+ },
188
+ "mmlu_professional_psychology": {
189
+ "alias": " - professional_psychology",
190
+ "acc,none": 0.5294117647058824,
191
+ "acc_stderr,none": 0.020192808271433788
192
+ },
193
+ "mmlu_public_relations": {
194
+ "alias": " - public_relations",
195
+ "acc,none": 0.5636363636363636,
196
+ "acc_stderr,none": 0.04750185058907296
197
+ },
198
+ "mmlu_security_studies": {
199
+ "alias": " - security_studies",
200
+ "acc,none": 0.689795918367347,
201
+ "acc_stderr,none": 0.02961345987248438
202
+ },
203
+ "mmlu_sociology": {
204
+ "alias": " - sociology",
205
+ "acc,none": 0.736318407960199,
206
+ "acc_stderr,none": 0.031157150869355568
207
+ },
208
+ "mmlu_us_foreign_policy": {
209
+ "alias": " - us_foreign_policy",
210
+ "acc,none": 0.77,
211
+ "acc_stderr,none": 0.042295258468165044
212
+ },
213
+ "mmlu_stem": {
214
+ "alias": " - stem",
215
+ "acc,none": 0.4770060260069775,
216
+ "acc_stderr,none": 0.00876253277535237
217
+ },
218
+ "mmlu_abstract_algebra": {
219
+ "alias": " - abstract_algebra",
220
+ "acc,none": 0.38,
221
+ "acc_stderr,none": 0.04878317312145633
222
+ },
223
+ "mmlu_anatomy": {
224
+ "alias": " - anatomy",
225
+ "acc,none": 0.4666666666666667,
226
+ "acc_stderr,none": 0.043097329010363554
227
+ },
228
+ "mmlu_astronomy": {
229
+ "alias": " - astronomy",
230
+ "acc,none": 0.5526315789473685,
231
+ "acc_stderr,none": 0.04046336883978251
232
+ },
233
+ "mmlu_college_biology": {
234
+ "alias": " - college_biology",
235
+ "acc,none": 0.5486111111111112,
236
+ "acc_stderr,none": 0.04161402398403279
237
+ },
238
+ "mmlu_college_chemistry": {
239
+ "alias": " - college_chemistry",
240
+ "acc,none": 0.39,
241
+ "acc_stderr,none": 0.04902071300001975
242
+ },
243
+ "mmlu_college_computer_science": {
244
+ "alias": " - college_computer_science",
245
+ "acc,none": 0.53,
246
+ "acc_stderr,none": 0.05016135580465919
247
+ },
248
+ "mmlu_college_mathematics": {
249
+ "alias": " - college_mathematics",
250
+ "acc,none": 0.31,
251
+ "acc_stderr,none": 0.04648231987117316
252
+ },
253
+ "mmlu_college_physics": {
254
+ "alias": " - college_physics",
255
+ "acc,none": 0.37254901960784315,
256
+ "acc_stderr,none": 0.04810840148082633
257
+ },
258
+ "mmlu_computer_security": {
259
+ "alias": " - computer_security",
260
+ "acc,none": 0.71,
261
+ "acc_stderr,none": 0.045604802157206845
262
+ },
263
+ "mmlu_conceptual_physics": {
264
+ "alias": " - conceptual_physics",
265
+ "acc,none": 0.4765957446808511,
266
+ "acc_stderr,none": 0.03265019475033582
267
+ },
268
+ "mmlu_electrical_engineering": {
269
+ "alias": " - electrical_engineering",
270
+ "acc,none": 0.5793103448275863,
271
+ "acc_stderr,none": 0.0411391498118926
272
+ },
273
+ "mmlu_elementary_mathematics": {
274
+ "alias": " - elementary_mathematics",
275
+ "acc,none": 0.4417989417989418,
276
+ "acc_stderr,none": 0.025576257061253833
277
+ },
278
+ "mmlu_high_school_biology": {
279
+ "alias": " - high_school_biology",
280
+ "acc,none": 0.6161290322580645,
281
+ "acc_stderr,none": 0.027666182075539645
282
+ },
283
+ "mmlu_high_school_chemistry": {
284
+ "alias": " - high_school_chemistry",
285
+ "acc,none": 0.46798029556650245,
286
+ "acc_stderr,none": 0.035107665979592154
287
+ },
288
+ "mmlu_high_school_computer_science": {
289
+ "alias": " - high_school_computer_science",
290
+ "acc,none": 0.56,
291
+ "acc_stderr,none": 0.04988876515698589
292
+ },
293
+ "mmlu_high_school_mathematics": {
294
+ "alias": " - high_school_mathematics",
295
+ "acc,none": 0.37777777777777777,
296
+ "acc_stderr,none": 0.029560707392465718
297
+ },
298
+ "mmlu_high_school_physics": {
299
+ "alias": " - high_school_physics",
300
+ "acc,none": 0.33774834437086093,
301
+ "acc_stderr,none": 0.03861557546255169
302
+ },
303
+ "mmlu_high_school_statistics": {
304
+ "alias": " - high_school_statistics",
305
+ "acc,none": 0.4861111111111111,
306
+ "acc_stderr,none": 0.03408655867977749
307
+ },
308
+ "mmlu_machine_learning": {
309
+ "alias": " - machine_learning",
310
+ "acc,none": 0.4017857142857143,
311
+ "acc_stderr,none": 0.04653333146973647
312
+ }
313
+ },
314
+ "groups": {
315
+ "mmlu": {
316
+ "acc,none": 0.5438683948155534,
317
+ "acc_stderr,none": 0.004046605401658642,
318
+ "alias": "mmlu"
319
+ },
320
+ "mmlu_humanities": {
321
+ "alias": " - humanities",
322
+ "acc,none": 0.4973432518597237,
323
+ "acc_stderr,none": 0.0069921729107274965
324
+ },
325
+ "mmlu_other": {
326
+ "alias": " - other",
327
+ "acc,none": 0.5909237206308336,
328
+ "acc_stderr,none": 0.008549729756636936
329
+ },
330
+ "mmlu_social_sciences": {
331
+ "alias": " - social_sciences",
332
+ "acc,none": 0.6360090997725056,
333
+ "acc_stderr,none": 0.008459352068826637
334
+ },
335
+ "mmlu_stem": {
336
+ "alias": " - stem",
337
+ "acc,none": 0.4770060260069775,
338
+ "acc_stderr,none": 0.00876253277535237
339
+ }
340
+ },
341
+ "group_subtasks": {
342
+ "mmlu_stem": [
343
+ "mmlu_abstract_algebra",
344
+ "mmlu_college_biology",
345
+ "mmlu_high_school_biology",
346
+ "mmlu_conceptual_physics",
347
+ "mmlu_computer_security",
348
+ "mmlu_college_physics",
349
+ "mmlu_college_chemistry",
350
+ "mmlu_high_school_statistics",
351
+ "mmlu_anatomy",
352
+ "mmlu_high_school_mathematics",
353
+ "mmlu_machine_learning",
354
+ "mmlu_high_school_physics",
355
+ "mmlu_electrical_engineering",
356
+ "mmlu_college_computer_science",
357
+ "mmlu_high_school_chemistry",
358
+ "mmlu_astronomy",
359
+ "mmlu_high_school_computer_science",
360
+ "mmlu_elementary_mathematics",
361
+ "mmlu_college_mathematics"
362
+ ],
363
+ "mmlu_other": [
364
+ "mmlu_business_ethics",
365
+ "mmlu_marketing",
366
+ "mmlu_medical_genetics",
367
+ "mmlu_clinical_knowledge",
368
+ "mmlu_global_facts",
369
+ "mmlu_human_aging",
370
+ "mmlu_professional_medicine",
371
+ "mmlu_nutrition",
372
+ "mmlu_management",
373
+ "mmlu_college_medicine",
374
+ "mmlu_professional_accounting",
375
+ "mmlu_virology",
376
+ "mmlu_miscellaneous"
377
+ ],
378
+ "mmlu_social_sciences": [
379
+ "mmlu_public_relations",
380
+ "mmlu_high_school_macroeconomics",
381
+ "mmlu_human_sexuality",
382
+ "mmlu_high_school_geography",
383
+ "mmlu_high_school_psychology",
384
+ "mmlu_high_school_microeconomics",
385
+ "mmlu_high_school_government_and_politics",
386
+ "mmlu_us_foreign_policy",
387
+ "mmlu_sociology",
388
+ "mmlu_security_studies",
389
+ "mmlu_econometrics",
390
+ "mmlu_professional_psychology"
391
+ ],
392
+ "mmlu_humanities": [
393
+ "mmlu_philosophy",
394
+ "mmlu_logical_fallacies",
395
+ "mmlu_moral_disputes",
396
+ "mmlu_jurisprudence",
397
+ "mmlu_high_school_us_history",
398
+ "mmlu_high_school_world_history",
399
+ "mmlu_world_religions",
400
+ "mmlu_moral_scenarios",
401
+ "mmlu_prehistory",
402
+ "mmlu_formal_logic",
403
+ "mmlu_international_law",
404
+ "mmlu_professional_law",
405
+ "mmlu_high_school_european_history"
406
+ ],
407
+ "mmlu": [
408
+ "mmlu_humanities",
409
+ "mmlu_social_sciences",
410
+ "mmlu_other",
411
+ "mmlu_stem"
412
+ ]
413
+ },
414
+ "configs": {
415
+ "mmlu_abstract_algebra": {
416
+ "task": "mmlu_abstract_algebra",
417
+ "task_alias": "abstract_algebra",
418
+ "group": "mmlu_stem",
419
+ "group_alias": "stem",
420
+ "dataset_path": "hails/mmlu_no_train",
421
+ "dataset_name": "abstract_algebra",
422
+ "test_split": "test",
423
+ "fewshot_split": "dev",
424
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
425
+ "doc_to_target": "answer",
426
+ "doc_to_choice": [
427
+ "A",
428
+ "B",
429
+ "C",
430
+ "D"
431
+ ],
432
+ "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
433
+ "target_delimiter": " ",
434
+ "fewshot_delimiter": "\n\n",
435
+ "fewshot_config": {
436
+ "sampler": "first_n"
437
+ },
438
+ "num_fewshot": 5,
439
+ "metric_list": [
440
+ {
441
+ "metric": "acc",
442
+ "aggregation": "mean",
443
+ "higher_is_better": true
444
+ }
445
+ ],
446
+ "output_type": "multiple_choice",
447
+ "repeats": 1,
448
+ "should_decontaminate": false,
449
+ "metadata": {
450
+ "version": 0.0
451
+ }
452
+ },
453
+ "mmlu_anatomy": {
454
+ "task": "mmlu_anatomy",
455
+ "task_alias": "anatomy",
456
+ "group": "mmlu_stem",
457
+ "group_alias": "stem",
458
+ "dataset_path": "hails/mmlu_no_train",
459
+ "dataset_name": "anatomy",
460
+ "test_split": "test",
461
+ "fewshot_split": "dev",
462
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
463
+ "doc_to_target": "answer",
464
+ "doc_to_choice": [
465
+ "A",
466
+ "B",
467
+ "C",
468
+ "D"
469
+ ],
470
+ "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
471
+ "target_delimiter": " ",
472
+ "fewshot_delimiter": "\n\n",
473
+ "fewshot_config": {
474
+ "sampler": "first_n"
475
+ },
476
+ "num_fewshot": 5,
477
+ "metric_list": [
478
+ {
479
+ "metric": "acc",
480
+ "aggregation": "mean",
481
+ "higher_is_better": true
482
+ }
483
+ ],
484
+ "output_type": "multiple_choice",
485
+ "repeats": 1,
486
+ "should_decontaminate": false,
487
+ "metadata": {
488
+ "version": 0.0
489
+ }
490
+ },
491
+ "mmlu_astronomy": {
492
+ "task": "mmlu_astronomy",
493
+ "task_alias": "astronomy",
494
+ "group": "mmlu_stem",
495
+ "group_alias": "stem",
496
+ "dataset_path": "hails/mmlu_no_train",
497
+ "dataset_name": "astronomy",
498
+ "test_split": "test",
499
+ "fewshot_split": "dev",
500
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
501
+ "doc_to_target": "answer",
502
+ "doc_to_choice": [
503
+ "A",
504
+ "B",
505
+ "C",
506
+ "D"
507
+ ],
508
+ "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
509
+ "target_delimiter": " ",
510
+ "fewshot_delimiter": "\n\n",
511
+ "fewshot_config": {
512
+ "sampler": "first_n"
513
+ },
514
+ "num_fewshot": 5,
515
+ "metric_list": [
516
+ {
517
+ "metric": "acc",
518
+ "aggregation": "mean",
519
+ "higher_is_better": true
520
+ }
521
+ ],
522
+ "output_type": "multiple_choice",
523
+ "repeats": 1,
524
+ "should_decontaminate": false,
525
+ "metadata": {
526
+ "version": 0.0
527
+ }
528
+ },
529
+ "mmlu_business_ethics": {
530
+ "task": "mmlu_business_ethics",
531
+ "task_alias": "business_ethics",
532
+ "group": "mmlu_other",
533
+ "group_alias": "other",
534
+ "dataset_path": "hails/mmlu_no_train",
535
+ "dataset_name": "business_ethics",
536
+ "test_split": "test",
537
+ "fewshot_split": "dev",
538
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
539
+ "doc_to_target": "answer",
540
+ "doc_to_choice": [
541
+ "A",
542
+ "B",
543
+ "C",
544
+ "D"
545
+ ],
546
+ "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
547
+ "target_delimiter": " ",
548
+ "fewshot_delimiter": "\n\n",
549
+ "fewshot_config": {
550
+ "sampler": "first_n"
551
+ },
552
+ "num_fewshot": 5,
553
+ "metric_list": [
554
+ {
555
+ "metric": "acc",
556
+ "aggregation": "mean",
557
+ "higher_is_better": true
558
+ }
559
+ ],
560
+ "output_type": "multiple_choice",
561
+ "repeats": 1,
562
+ "should_decontaminate": false,
563
+ "metadata": {
564
+ "version": 0.0
565
+ }
566
+ },
567
+ "mmlu_clinical_knowledge": {
568
+ "task": "mmlu_clinical_knowledge",
569
+ "task_alias": "clinical_knowledge",
570
+ "group": "mmlu_other",
571
+ "group_alias": "other",
572
+ "dataset_path": "hails/mmlu_no_train",
573
+ "dataset_name": "clinical_knowledge",
574
+ "test_split": "test",
575
+ "fewshot_split": "dev",
576
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
577
+ "doc_to_target": "answer",
578
+ "doc_to_choice": [
579
+ "A",
580
+ "B",
581
+ "C",
582
+ "D"
583
+ ],
584
+ "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
585
+ "target_delimiter": " ",
586
+ "fewshot_delimiter": "\n\n",
587
+ "fewshot_config": {
588
+ "sampler": "first_n"
589
+ },
590
+ "num_fewshot": 5,
591
+ "metric_list": [
592
+ {
593
+ "metric": "acc",
594
+ "aggregation": "mean",
595
+ "higher_is_better": true
596
+ }
597
+ ],
598
+ "output_type": "multiple_choice",
599
+ "repeats": 1,
600
+ "should_decontaminate": false,
601
+ "metadata": {
602
+ "version": 0.0
603
+ }
604
+ },
605
+ "mmlu_college_biology": {
606
+ "task": "mmlu_college_biology",
607
+ "task_alias": "college_biology",
608
+ "group": "mmlu_stem",
609
+ "group_alias": "stem",
610
+ "dataset_path": "hails/mmlu_no_train",
611
+ "dataset_name": "college_biology",
612
+ "test_split": "test",
613
+ "fewshot_split": "dev",
614
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
615
+ "doc_to_target": "answer",
616
+ "doc_to_choice": [
617
+ "A",
618
+ "B",
619
+ "C",
620
+ "D"
621
+ ],
622
+ "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
623
+ "target_delimiter": " ",
624
+ "fewshot_delimiter": "\n\n",
625
+ "fewshot_config": {
626
+ "sampler": "first_n"
627
+ },
628
+ "num_fewshot": 5,
629
+ "metric_list": [
630
+ {
631
+ "metric": "acc",
632
+ "aggregation": "mean",
633
+ "higher_is_better": true
634
+ }
635
+ ],
636
+ "output_type": "multiple_choice",
637
+ "repeats": 1,
638
+ "should_decontaminate": false,
639
+ "metadata": {
640
+ "version": 0.0
641
+ }
642
+ },
643
+ "mmlu_college_chemistry": {
644
+ "task": "mmlu_college_chemistry",
645
+ "task_alias": "college_chemistry",
646
+ "group": "mmlu_stem",
647
+ "group_alias": "stem",
648
+ "dataset_path": "hails/mmlu_no_train",
649
+ "dataset_name": "college_chemistry",
650
+ "test_split": "test",
651
+ "fewshot_split": "dev",
652
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
653
+ "doc_to_target": "answer",
654
+ "doc_to_choice": [
655
+ "A",
656
+ "B",
657
+ "C",
658
+ "D"
659
+ ],
660
+ "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
661
+ "target_delimiter": " ",
662
+ "fewshot_delimiter": "\n\n",
663
+ "fewshot_config": {
664
+ "sampler": "first_n"
665
+ },
666
+ "num_fewshot": 5,
667
+ "metric_list": [
668
+ {
669
+ "metric": "acc",
670
+ "aggregation": "mean",
671
+ "higher_is_better": true
672
+ }
673
+ ],
674
+ "output_type": "multiple_choice",
675
+ "repeats": 1,
676
+ "should_decontaminate": false,
677
+ "metadata": {
678
+ "version": 0.0
679
+ }
680
+ },
681
+ "mmlu_college_computer_science": {
682
+ "task": "mmlu_college_computer_science",
683
+ "task_alias": "college_computer_science",
684
+ "group": "mmlu_stem",
685
+ "group_alias": "stem",
686
+ "dataset_path": "hails/mmlu_no_train",
687
+ "dataset_name": "college_computer_science",
688
+ "test_split": "test",
689
+ "fewshot_split": "dev",
690
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
691
+ "doc_to_target": "answer",
692
+ "doc_to_choice": [
693
+ "A",
694
+ "B",
695
+ "C",
696
+ "D"
697
+ ],
698
+ "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
699
+ "target_delimiter": " ",
700
+ "fewshot_delimiter": "\n\n",
701
+ "fewshot_config": {
702
+ "sampler": "first_n"
703
+ },
704
+ "num_fewshot": 5,
705
+ "metric_list": [
706
+ {
707
+ "metric": "acc",
708
+ "aggregation": "mean",
709
+ "higher_is_better": true
710
+ }
711
+ ],
712
+ "output_type": "multiple_choice",
713
+ "repeats": 1,
714
+ "should_decontaminate": false,
715
+ "metadata": {
716
+ "version": 0.0
717
+ }
718
+ },
719
+ "mmlu_college_mathematics": {
720
+ "task": "mmlu_college_mathematics",
721
+ "task_alias": "college_mathematics",
722
+ "group": "mmlu_stem",
723
+ "group_alias": "stem",
724
+ "dataset_path": "hails/mmlu_no_train",
725
+ "dataset_name": "college_mathematics",
726
+ "test_split": "test",
727
+ "fewshot_split": "dev",
728
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
729
+ "doc_to_target": "answer",
730
+ "doc_to_choice": [
731
+ "A",
732
+ "B",
733
+ "C",
734
+ "D"
735
+ ],
736
+ "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
737
+ "target_delimiter": " ",
738
+ "fewshot_delimiter": "\n\n",
739
+ "fewshot_config": {
740
+ "sampler": "first_n"
741
+ },
742
+ "num_fewshot": 5,
743
+ "metric_list": [
744
+ {
745
+ "metric": "acc",
746
+ "aggregation": "mean",
747
+ "higher_is_better": true
748
+ }
749
+ ],
750
+ "output_type": "multiple_choice",
751
+ "repeats": 1,
752
+ "should_decontaminate": false,
753
+ "metadata": {
754
+ "version": 0.0
755
+ }
756
+ },
757
+ "mmlu_college_medicine": {
758
+ "task": "mmlu_college_medicine",
759
+ "task_alias": "college_medicine",
760
+ "group": "mmlu_other",
761
+ "group_alias": "other",
762
+ "dataset_path": "hails/mmlu_no_train",
763
+ "dataset_name": "college_medicine",
764
+ "test_split": "test",
765
+ "fewshot_split": "dev",
766
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
767
+ "doc_to_target": "answer",
768
+ "doc_to_choice": [
769
+ "A",
770
+ "B",
771
+ "C",
772
+ "D"
773
+ ],
774
+ "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
775
+ "target_delimiter": " ",
776
+ "fewshot_delimiter": "\n\n",
777
+ "fewshot_config": {
778
+ "sampler": "first_n"
779
+ },
780
+ "num_fewshot": 5,
781
+ "metric_list": [
782
+ {
783
+ "metric": "acc",
784
+ "aggregation": "mean",
785
+ "higher_is_better": true
786
+ }
787
+ ],
788
+ "output_type": "multiple_choice",
789
+ "repeats": 1,
790
+ "should_decontaminate": false,
791
+ "metadata": {
792
+ "version": 0.0
793
+ }
794
+ },
795
+ "mmlu_college_physics": {
796
+ "task": "mmlu_college_physics",
797
+ "task_alias": "college_physics",
798
+ "group": "mmlu_stem",
799
+ "group_alias": "stem",
800
+ "dataset_path": "hails/mmlu_no_train",
801
+ "dataset_name": "college_physics",
802
+ "test_split": "test",
803
+ "fewshot_split": "dev",
804
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
805
+ "doc_to_target": "answer",
806
+ "doc_to_choice": [
807
+ "A",
808
+ "B",
809
+ "C",
810
+ "D"
811
+ ],
812
+ "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
813
+ "target_delimiter": " ",
814
+ "fewshot_delimiter": "\n\n",
815
+ "fewshot_config": {
816
+ "sampler": "first_n"
817
+ },
818
+ "num_fewshot": 5,
819
+ "metric_list": [
820
+ {
821
+ "metric": "acc",
822
+ "aggregation": "mean",
823
+ "higher_is_better": true
824
+ }
825
+ ],
826
+ "output_type": "multiple_choice",
827
+ "repeats": 1,
828
+ "should_decontaminate": false,
829
+ "metadata": {
830
+ "version": 0.0
831
+ }
832
+ },
833
+ "mmlu_computer_security": {
834
+ "task": "mmlu_computer_security",
835
+ "task_alias": "computer_security",
836
+ "group": "mmlu_stem",
837
+ "group_alias": "stem",
838
+ "dataset_path": "hails/mmlu_no_train",
839
+ "dataset_name": "computer_security",
840
+ "test_split": "test",
841
+ "fewshot_split": "dev",
842
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
843
+ "doc_to_target": "answer",
844
+ "doc_to_choice": [
845
+ "A",
846
+ "B",
847
+ "C",
848
+ "D"
849
+ ],
850
+ "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
851
+ "target_delimiter": " ",
852
+ "fewshot_delimiter": "\n\n",
853
+ "fewshot_config": {
854
+ "sampler": "first_n"
855
+ },
856
+ "num_fewshot": 5,
857
+ "metric_list": [
858
+ {
859
+ "metric": "acc",
860
+ "aggregation": "mean",
861
+ "higher_is_better": true
862
+ }
863
+ ],
864
+ "output_type": "multiple_choice",
865
+ "repeats": 1,
866
+ "should_decontaminate": false,
867
+ "metadata": {
868
+ "version": 0.0
869
+ }
870
+ },
871
+ "mmlu_conceptual_physics": {
872
+ "task": "mmlu_conceptual_physics",
873
+ "task_alias": "conceptual_physics",
874
+ "group": "mmlu_stem",
875
+ "group_alias": "stem",
876
+ "dataset_path": "hails/mmlu_no_train",
877
+ "dataset_name": "conceptual_physics",
878
+ "test_split": "test",
879
+ "fewshot_split": "dev",
880
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
881
+ "doc_to_target": "answer",
882
+ "doc_to_choice": [
883
+ "A",
884
+ "B",
885
+ "C",
886
+ "D"
887
+ ],
888
+ "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
889
+ "target_delimiter": " ",
890
+ "fewshot_delimiter": "\n\n",
891
+ "fewshot_config": {
892
+ "sampler": "first_n"
893
+ },
894
+ "num_fewshot": 5,
895
+ "metric_list": [
896
+ {
897
+ "metric": "acc",
898
+ "aggregation": "mean",
899
+ "higher_is_better": true
900
+ }
901
+ ],
902
+ "output_type": "multiple_choice",
903
+ "repeats": 1,
904
+ "should_decontaminate": false,
905
+ "metadata": {
906
+ "version": 0.0
907
+ }
908
+ },
909
+ "mmlu_econometrics": {
910
+ "task": "mmlu_econometrics",
911
+ "task_alias": "econometrics",
912
+ "group": "mmlu_social_sciences",
913
+ "group_alias": "social_sciences",
914
+ "dataset_path": "hails/mmlu_no_train",
915
+ "dataset_name": "econometrics",
916
+ "test_split": "test",
917
+ "fewshot_split": "dev",
918
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
919
+ "doc_to_target": "answer",
920
+ "doc_to_choice": [
921
+ "A",
922
+ "B",
923
+ "C",
924
+ "D"
925
+ ],
926
+ "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
927
+ "target_delimiter": " ",
928
+ "fewshot_delimiter": "\n\n",
929
+ "fewshot_config": {
930
+ "sampler": "first_n"
931
+ },
932
+ "num_fewshot": 5,
933
+ "metric_list": [
934
+ {
935
+ "metric": "acc",
936
+ "aggregation": "mean",
937
+ "higher_is_better": true
938
+ }
939
+ ],
940
+ "output_type": "multiple_choice",
941
+ "repeats": 1,
942
+ "should_decontaminate": false,
943
+ "metadata": {
944
+ "version": 0.0
945
+ }
946
+ },
947
+ "mmlu_electrical_engineering": {
948
+ "task": "mmlu_electrical_engineering",
949
+ "task_alias": "electrical_engineering",
950
+ "group": "mmlu_stem",
951
+ "group_alias": "stem",
952
+ "dataset_path": "hails/mmlu_no_train",
953
+ "dataset_name": "electrical_engineering",
954
+ "test_split": "test",
955
+ "fewshot_split": "dev",
956
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
957
+ "doc_to_target": "answer",
958
+ "doc_to_choice": [
959
+ "A",
960
+ "B",
961
+ "C",
962
+ "D"
963
+ ],
964
+ "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
965
+ "target_delimiter": " ",
966
+ "fewshot_delimiter": "\n\n",
967
+ "fewshot_config": {
968
+ "sampler": "first_n"
969
+ },
970
+ "num_fewshot": 5,
971
+ "metric_list": [
972
+ {
973
+ "metric": "acc",
974
+ "aggregation": "mean",
975
+ "higher_is_better": true
976
+ }
977
+ ],
978
+ "output_type": "multiple_choice",
979
+ "repeats": 1,
980
+ "should_decontaminate": false,
981
+ "metadata": {
982
+ "version": 0.0
983
+ }
984
+ },
985
+ "mmlu_elementary_mathematics": {
986
+ "task": "mmlu_elementary_mathematics",
987
+ "task_alias": "elementary_mathematics",
988
+ "group": "mmlu_stem",
989
+ "group_alias": "stem",
990
+ "dataset_path": "hails/mmlu_no_train",
991
+ "dataset_name": "elementary_mathematics",
992
+ "test_split": "test",
993
+ "fewshot_split": "dev",
994
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
995
+ "doc_to_target": "answer",
996
+ "doc_to_choice": [
997
+ "A",
998
+ "B",
999
+ "C",
1000
+ "D"
1001
+ ],
1002
+ "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
1003
+ "target_delimiter": " ",
1004
+ "fewshot_delimiter": "\n\n",
1005
+ "fewshot_config": {
1006
+ "sampler": "first_n"
1007
+ },
1008
+ "num_fewshot": 5,
1009
+ "metric_list": [
1010
+ {
1011
+ "metric": "acc",
1012
+ "aggregation": "mean",
1013
+ "higher_is_better": true
1014
+ }
1015
+ ],
1016
+ "output_type": "multiple_choice",
1017
+ "repeats": 1,
1018
+ "should_decontaminate": false,
1019
+ "metadata": {
1020
+ "version": 0.0
1021
+ }
1022
+ },
1023
+ "mmlu_formal_logic": {
1024
+ "task": "mmlu_formal_logic",
1025
+ "task_alias": "formal_logic",
1026
+ "group": "mmlu_humanities",
1027
+ "group_alias": "humanities",
1028
+ "dataset_path": "hails/mmlu_no_train",
1029
+ "dataset_name": "formal_logic",
1030
+ "test_split": "test",
1031
+ "fewshot_split": "dev",
1032
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1033
+ "doc_to_target": "answer",
1034
+ "doc_to_choice": [
1035
+ "A",
1036
+ "B",
1037
+ "C",
1038
+ "D"
1039
+ ],
1040
+ "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
1041
+ "target_delimiter": " ",
1042
+ "fewshot_delimiter": "\n\n",
1043
+ "fewshot_config": {
1044
+ "sampler": "first_n"
1045
+ },
1046
+ "num_fewshot": 5,
1047
+ "metric_list": [
1048
+ {
1049
+ "metric": "acc",
1050
+ "aggregation": "mean",
1051
+ "higher_is_better": true
1052
+ }
1053
+ ],
1054
+ "output_type": "multiple_choice",
1055
+ "repeats": 1,
1056
+ "should_decontaminate": false,
1057
+ "metadata": {
1058
+ "version": 0.0
1059
+ }
1060
+ },
1061
+ "mmlu_global_facts": {
1062
+ "task": "mmlu_global_facts",
1063
+ "task_alias": "global_facts",
1064
+ "group": "mmlu_other",
1065
+ "group_alias": "other",
1066
+ "dataset_path": "hails/mmlu_no_train",
1067
+ "dataset_name": "global_facts",
1068
+ "test_split": "test",
1069
+ "fewshot_split": "dev",
1070
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1071
+ "doc_to_target": "answer",
1072
+ "doc_to_choice": [
1073
+ "A",
1074
+ "B",
1075
+ "C",
1076
+ "D"
1077
+ ],
1078
+ "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
1079
+ "target_delimiter": " ",
1080
+ "fewshot_delimiter": "\n\n",
1081
+ "fewshot_config": {
1082
+ "sampler": "first_n"
1083
+ },
1084
+ "num_fewshot": 5,
1085
+ "metric_list": [
1086
+ {
1087
+ "metric": "acc",
1088
+ "aggregation": "mean",
1089
+ "higher_is_better": true
1090
+ }
1091
+ ],
1092
+ "output_type": "multiple_choice",
1093
+ "repeats": 1,
1094
+ "should_decontaminate": false,
1095
+ "metadata": {
1096
+ "version": 0.0
1097
+ }
1098
+ },
1099
+ "mmlu_high_school_biology": {
1100
+ "task": "mmlu_high_school_biology",
1101
+ "task_alias": "high_school_biology",
1102
+ "group": "mmlu_stem",
1103
+ "group_alias": "stem",
1104
+ "dataset_path": "hails/mmlu_no_train",
1105
+ "dataset_name": "high_school_biology",
1106
+ "test_split": "test",
1107
+ "fewshot_split": "dev",
1108
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1109
+ "doc_to_target": "answer",
1110
+ "doc_to_choice": [
1111
+ "A",
1112
+ "B",
1113
+ "C",
1114
+ "D"
1115
+ ],
1116
+ "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
1117
+ "target_delimiter": " ",
1118
+ "fewshot_delimiter": "\n\n",
1119
+ "fewshot_config": {
1120
+ "sampler": "first_n"
1121
+ },
1122
+ "num_fewshot": 5,
1123
+ "metric_list": [
1124
+ {
1125
+ "metric": "acc",
1126
+ "aggregation": "mean",
1127
+ "higher_is_better": true
1128
+ }
1129
+ ],
1130
+ "output_type": "multiple_choice",
1131
+ "repeats": 1,
1132
+ "should_decontaminate": false,
1133
+ "metadata": {
1134
+ "version": 0.0
1135
+ }
1136
+ },
1137
+ "mmlu_high_school_chemistry": {
1138
+ "task": "mmlu_high_school_chemistry",
1139
+ "task_alias": "high_school_chemistry",
1140
+ "group": "mmlu_stem",
1141
+ "group_alias": "stem",
1142
+ "dataset_path": "hails/mmlu_no_train",
1143
+ "dataset_name": "high_school_chemistry",
1144
+ "test_split": "test",
1145
+ "fewshot_split": "dev",
1146
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1147
+ "doc_to_target": "answer",
1148
+ "doc_to_choice": [
1149
+ "A",
1150
+ "B",
1151
+ "C",
1152
+ "D"
1153
+ ],
1154
+ "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
1155
+ "target_delimiter": " ",
1156
+ "fewshot_delimiter": "\n\n",
1157
+ "fewshot_config": {
1158
+ "sampler": "first_n"
1159
+ },
1160
+ "num_fewshot": 5,
1161
+ "metric_list": [
1162
+ {
1163
+ "metric": "acc",
1164
+ "aggregation": "mean",
1165
+ "higher_is_better": true
1166
+ }
1167
+ ],
1168
+ "output_type": "multiple_choice",
1169
+ "repeats": 1,
1170
+ "should_decontaminate": false,
1171
+ "metadata": {
1172
+ "version": 0.0
1173
+ }
1174
+ },
1175
+ "mmlu_high_school_computer_science": {
1176
+ "task": "mmlu_high_school_computer_science",
1177
+ "task_alias": "high_school_computer_science",
1178
+ "group": "mmlu_stem",
1179
+ "group_alias": "stem",
1180
+ "dataset_path": "hails/mmlu_no_train",
1181
+ "dataset_name": "high_school_computer_science",
1182
+ "test_split": "test",
1183
+ "fewshot_split": "dev",
1184
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1185
+ "doc_to_target": "answer",
1186
+ "doc_to_choice": [
1187
+ "A",
1188
+ "B",
1189
+ "C",
1190
+ "D"
1191
+ ],
1192
+ "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
1193
+ "target_delimiter": " ",
1194
+ "fewshot_delimiter": "\n\n",
1195
+ "fewshot_config": {
1196
+ "sampler": "first_n"
1197
+ },
1198
+ "num_fewshot": 5,
1199
+ "metric_list": [
1200
+ {
1201
+ "metric": "acc",
1202
+ "aggregation": "mean",
1203
+ "higher_is_better": true
1204
+ }
1205
+ ],
1206
+ "output_type": "multiple_choice",
1207
+ "repeats": 1,
1208
+ "should_decontaminate": false,
1209
+ "metadata": {
1210
+ "version": 0.0
1211
+ }
1212
+ },
1213
+ "mmlu_high_school_european_history": {
1214
+ "task": "mmlu_high_school_european_history",
1215
+ "task_alias": "high_school_european_history",
1216
+ "group": "mmlu_humanities",
1217
+ "group_alias": "humanities",
1218
+ "dataset_path": "hails/mmlu_no_train",
1219
+ "dataset_name": "high_school_european_history",
1220
+ "test_split": "test",
1221
+ "fewshot_split": "dev",
1222
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1223
+ "doc_to_target": "answer",
1224
+ "doc_to_choice": [
1225
+ "A",
1226
+ "B",
1227
+ "C",
1228
+ "D"
1229
+ ],
1230
+ "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
1231
+ "target_delimiter": " ",
1232
+ "fewshot_delimiter": "\n\n",
1233
+ "fewshot_config": {
1234
+ "sampler": "first_n"
1235
+ },
1236
+ "num_fewshot": 5,
1237
+ "metric_list": [
1238
+ {
1239
+ "metric": "acc",
1240
+ "aggregation": "mean",
1241
+ "higher_is_better": true
1242
+ }
1243
+ ],
1244
+ "output_type": "multiple_choice",
1245
+ "repeats": 1,
1246
+ "should_decontaminate": false,
1247
+ "metadata": {
1248
+ "version": 0.0
1249
+ }
1250
+ },
1251
+ "mmlu_high_school_geography": {
1252
+ "task": "mmlu_high_school_geography",
1253
+ "task_alias": "high_school_geography",
1254
+ "group": "mmlu_social_sciences",
1255
+ "group_alias": "social_sciences",
1256
+ "dataset_path": "hails/mmlu_no_train",
1257
+ "dataset_name": "high_school_geography",
1258
+ "test_split": "test",
1259
+ "fewshot_split": "dev",
1260
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1261
+ "doc_to_target": "answer",
1262
+ "doc_to_choice": [
1263
+ "A",
1264
+ "B",
1265
+ "C",
1266
+ "D"
1267
+ ],
1268
+ "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
1269
+ "target_delimiter": " ",
1270
+ "fewshot_delimiter": "\n\n",
1271
+ "fewshot_config": {
1272
+ "sampler": "first_n"
1273
+ },
1274
+ "num_fewshot": 5,
1275
+ "metric_list": [
1276
+ {
1277
+ "metric": "acc",
1278
+ "aggregation": "mean",
1279
+ "higher_is_better": true
1280
+ }
1281
+ ],
1282
+ "output_type": "multiple_choice",
1283
+ "repeats": 1,
1284
+ "should_decontaminate": false,
1285
+ "metadata": {
1286
+ "version": 0.0
1287
+ }
1288
+ },
1289
+ "mmlu_high_school_government_and_politics": {
1290
+ "task": "mmlu_high_school_government_and_politics",
1291
+ "task_alias": "high_school_government_and_politics",
1292
+ "group": "mmlu_social_sciences",
1293
+ "group_alias": "social_sciences",
1294
+ "dataset_path": "hails/mmlu_no_train",
1295
+ "dataset_name": "high_school_government_and_politics",
1296
+ "test_split": "test",
1297
+ "fewshot_split": "dev",
1298
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1299
+ "doc_to_target": "answer",
1300
+ "doc_to_choice": [
1301
+ "A",
1302
+ "B",
1303
+ "C",
1304
+ "D"
1305
+ ],
1306
+ "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
1307
+ "target_delimiter": " ",
1308
+ "fewshot_delimiter": "\n\n",
1309
+ "fewshot_config": {
1310
+ "sampler": "first_n"
1311
+ },
1312
+ "num_fewshot": 5,
1313
+ "metric_list": [
1314
+ {
1315
+ "metric": "acc",
1316
+ "aggregation": "mean",
1317
+ "higher_is_better": true
1318
+ }
1319
+ ],
1320
+ "output_type": "multiple_choice",
1321
+ "repeats": 1,
1322
+ "should_decontaminate": false,
1323
+ "metadata": {
1324
+ "version": 0.0
1325
+ }
1326
+ },
1327
+ "mmlu_high_school_macroeconomics": {
1328
+ "task": "mmlu_high_school_macroeconomics",
1329
+ "task_alias": "high_school_macroeconomics",
1330
+ "group": "mmlu_social_sciences",
1331
+ "group_alias": "social_sciences",
1332
+ "dataset_path": "hails/mmlu_no_train",
1333
+ "dataset_name": "high_school_macroeconomics",
1334
+ "test_split": "test",
1335
+ "fewshot_split": "dev",
1336
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1337
+ "doc_to_target": "answer",
1338
+ "doc_to_choice": [
1339
+ "A",
1340
+ "B",
1341
+ "C",
1342
+ "D"
1343
+ ],
1344
+ "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
1345
+ "target_delimiter": " ",
1346
+ "fewshot_delimiter": "\n\n",
1347
+ "fewshot_config": {
1348
+ "sampler": "first_n"
1349
+ },
1350
+ "num_fewshot": 5,
1351
+ "metric_list": [
1352
+ {
1353
+ "metric": "acc",
1354
+ "aggregation": "mean",
1355
+ "higher_is_better": true
1356
+ }
1357
+ ],
1358
+ "output_type": "multiple_choice",
1359
+ "repeats": 1,
1360
+ "should_decontaminate": false,
1361
+ "metadata": {
1362
+ "version": 0.0
1363
+ }
1364
+ },
1365
+ "mmlu_high_school_mathematics": {
1366
+ "task": "mmlu_high_school_mathematics",
1367
+ "task_alias": "high_school_mathematics",
1368
+ "group": "mmlu_stem",
1369
+ "group_alias": "stem",
1370
+ "dataset_path": "hails/mmlu_no_train",
1371
+ "dataset_name": "high_school_mathematics",
1372
+ "test_split": "test",
1373
+ "fewshot_split": "dev",
1374
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1375
+ "doc_to_target": "answer",
1376
+ "doc_to_choice": [
1377
+ "A",
1378
+ "B",
1379
+ "C",
1380
+ "D"
1381
+ ],
1382
+ "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
1383
+ "target_delimiter": " ",
1384
+ "fewshot_delimiter": "\n\n",
1385
+ "fewshot_config": {
1386
+ "sampler": "first_n"
1387
+ },
1388
+ "num_fewshot": 5,
1389
+ "metric_list": [
1390
+ {
1391
+ "metric": "acc",
1392
+ "aggregation": "mean",
1393
+ "higher_is_better": true
1394
+ }
1395
+ ],
1396
+ "output_type": "multiple_choice",
1397
+ "repeats": 1,
1398
+ "should_decontaminate": false,
1399
+ "metadata": {
1400
+ "version": 0.0
1401
+ }
1402
+ },
1403
+ "mmlu_high_school_microeconomics": {
1404
+ "task": "mmlu_high_school_microeconomics",
1405
+ "task_alias": "high_school_microeconomics",
1406
+ "group": "mmlu_social_sciences",
1407
+ "group_alias": "social_sciences",
1408
+ "dataset_path": "hails/mmlu_no_train",
1409
+ "dataset_name": "high_school_microeconomics",
1410
+ "test_split": "test",
1411
+ "fewshot_split": "dev",
1412
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1413
+ "doc_to_target": "answer",
1414
+ "doc_to_choice": [
1415
+ "A",
1416
+ "B",
1417
+ "C",
1418
+ "D"
1419
+ ],
1420
+ "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
1421
+ "target_delimiter": " ",
1422
+ "fewshot_delimiter": "\n\n",
1423
+ "fewshot_config": {
1424
+ "sampler": "first_n"
1425
+ },
1426
+ "num_fewshot": 5,
1427
+ "metric_list": [
1428
+ {
1429
+ "metric": "acc",
1430
+ "aggregation": "mean",
1431
+ "higher_is_better": true
1432
+ }
1433
+ ],
1434
+ "output_type": "multiple_choice",
1435
+ "repeats": 1,
1436
+ "should_decontaminate": false,
1437
+ "metadata": {
1438
+ "version": 0.0
1439
+ }
1440
+ },
1441
+ "mmlu_high_school_physics": {
1442
+ "task": "mmlu_high_school_physics",
1443
+ "task_alias": "high_school_physics",
1444
+ "group": "mmlu_stem",
1445
+ "group_alias": "stem",
1446
+ "dataset_path": "hails/mmlu_no_train",
1447
+ "dataset_name": "high_school_physics",
1448
+ "test_split": "test",
1449
+ "fewshot_split": "dev",
1450
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1451
+ "doc_to_target": "answer",
1452
+ "doc_to_choice": [
1453
+ "A",
1454
+ "B",
1455
+ "C",
1456
+ "D"
1457
+ ],
1458
+ "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
1459
+ "target_delimiter": " ",
1460
+ "fewshot_delimiter": "\n\n",
1461
+ "fewshot_config": {
1462
+ "sampler": "first_n"
1463
+ },
1464
+ "num_fewshot": 5,
1465
+ "metric_list": [
1466
+ {
1467
+ "metric": "acc",
1468
+ "aggregation": "mean",
1469
+ "higher_is_better": true
1470
+ }
1471
+ ],
1472
+ "output_type": "multiple_choice",
1473
+ "repeats": 1,
1474
+ "should_decontaminate": false,
1475
+ "metadata": {
1476
+ "version": 0.0
1477
+ }
1478
+ },
1479
+ "mmlu_high_school_psychology": {
1480
+ "task": "mmlu_high_school_psychology",
1481
+ "task_alias": "high_school_psychology",
1482
+ "group": "mmlu_social_sciences",
1483
+ "group_alias": "social_sciences",
1484
+ "dataset_path": "hails/mmlu_no_train",
1485
+ "dataset_name": "high_school_psychology",
1486
+ "test_split": "test",
1487
+ "fewshot_split": "dev",
1488
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1489
+ "doc_to_target": "answer",
1490
+ "doc_to_choice": [
1491
+ "A",
1492
+ "B",
1493
+ "C",
1494
+ "D"
1495
+ ],
1496
+ "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
1497
+ "target_delimiter": " ",
1498
+ "fewshot_delimiter": "\n\n",
1499
+ "fewshot_config": {
1500
+ "sampler": "first_n"
1501
+ },
1502
+ "num_fewshot": 5,
1503
+ "metric_list": [
1504
+ {
1505
+ "metric": "acc",
1506
+ "aggregation": "mean",
1507
+ "higher_is_better": true
1508
+ }
1509
+ ],
1510
+ "output_type": "multiple_choice",
1511
+ "repeats": 1,
1512
+ "should_decontaminate": false,
1513
+ "metadata": {
1514
+ "version": 0.0
1515
+ }
1516
+ },
1517
+ "mmlu_high_school_statistics": {
1518
+ "task": "mmlu_high_school_statistics",
1519
+ "task_alias": "high_school_statistics",
1520
+ "group": "mmlu_stem",
1521
+ "group_alias": "stem",
1522
+ "dataset_path": "hails/mmlu_no_train",
1523
+ "dataset_name": "high_school_statistics",
1524
+ "test_split": "test",
1525
+ "fewshot_split": "dev",
1526
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1527
+ "doc_to_target": "answer",
1528
+ "doc_to_choice": [
1529
+ "A",
1530
+ "B",
1531
+ "C",
1532
+ "D"
1533
+ ],
1534
+ "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
1535
+ "target_delimiter": " ",
1536
+ "fewshot_delimiter": "\n\n",
1537
+ "fewshot_config": {
1538
+ "sampler": "first_n"
1539
+ },
1540
+ "num_fewshot": 5,
1541
+ "metric_list": [
1542
+ {
1543
+ "metric": "acc",
1544
+ "aggregation": "mean",
1545
+ "higher_is_better": true
1546
+ }
1547
+ ],
1548
+ "output_type": "multiple_choice",
1549
+ "repeats": 1,
1550
+ "should_decontaminate": false,
1551
+ "metadata": {
1552
+ "version": 0.0
1553
+ }
1554
+ },
1555
+ "mmlu_high_school_us_history": {
1556
+ "task": "mmlu_high_school_us_history",
1557
+ "task_alias": "high_school_us_history",
1558
+ "group": "mmlu_humanities",
1559
+ "group_alias": "humanities",
1560
+ "dataset_path": "hails/mmlu_no_train",
1561
+ "dataset_name": "high_school_us_history",
1562
+ "test_split": "test",
1563
+ "fewshot_split": "dev",
1564
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1565
+ "doc_to_target": "answer",
1566
+ "doc_to_choice": [
1567
+ "A",
1568
+ "B",
1569
+ "C",
1570
+ "D"
1571
+ ],
1572
+ "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
1573
+ "target_delimiter": " ",
1574
+ "fewshot_delimiter": "\n\n",
1575
+ "fewshot_config": {
1576
+ "sampler": "first_n"
1577
+ },
1578
+ "num_fewshot": 5,
1579
+ "metric_list": [
1580
+ {
1581
+ "metric": "acc",
1582
+ "aggregation": "mean",
1583
+ "higher_is_better": true
1584
+ }
1585
+ ],
1586
+ "output_type": "multiple_choice",
1587
+ "repeats": 1,
1588
+ "should_decontaminate": false,
1589
+ "metadata": {
1590
+ "version": 0.0
1591
+ }
1592
+ },
1593
+ "mmlu_high_school_world_history": {
1594
+ "task": "mmlu_high_school_world_history",
1595
+ "task_alias": "high_school_world_history",
1596
+ "group": "mmlu_humanities",
1597
+ "group_alias": "humanities",
1598
+ "dataset_path": "hails/mmlu_no_train",
1599
+ "dataset_name": "high_school_world_history",
1600
+ "test_split": "test",
1601
+ "fewshot_split": "dev",
1602
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1603
+ "doc_to_target": "answer",
1604
+ "doc_to_choice": [
1605
+ "A",
1606
+ "B",
1607
+ "C",
1608
+ "D"
1609
+ ],
1610
+ "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
1611
+ "target_delimiter": " ",
1612
+ "fewshot_delimiter": "\n\n",
1613
+ "fewshot_config": {
1614
+ "sampler": "first_n"
1615
+ },
1616
+ "num_fewshot": 5,
1617
+ "metric_list": [
1618
+ {
1619
+ "metric": "acc",
1620
+ "aggregation": "mean",
1621
+ "higher_is_better": true
1622
+ }
1623
+ ],
1624
+ "output_type": "multiple_choice",
1625
+ "repeats": 1,
1626
+ "should_decontaminate": false,
1627
+ "metadata": {
1628
+ "version": 0.0
1629
+ }
1630
+ },
1631
+ "mmlu_human_aging": {
1632
+ "task": "mmlu_human_aging",
1633
+ "task_alias": "human_aging",
1634
+ "group": "mmlu_other",
1635
+ "group_alias": "other",
1636
+ "dataset_path": "hails/mmlu_no_train",
1637
+ "dataset_name": "human_aging",
1638
+ "test_split": "test",
1639
+ "fewshot_split": "dev",
1640
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1641
+ "doc_to_target": "answer",
1642
+ "doc_to_choice": [
1643
+ "A",
1644
+ "B",
1645
+ "C",
1646
+ "D"
1647
+ ],
1648
+ "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
1649
+ "target_delimiter": " ",
1650
+ "fewshot_delimiter": "\n\n",
1651
+ "fewshot_config": {
1652
+ "sampler": "first_n"
1653
+ },
1654
+ "num_fewshot": 5,
1655
+ "metric_list": [
1656
+ {
1657
+ "metric": "acc",
1658
+ "aggregation": "mean",
1659
+ "higher_is_better": true
1660
+ }
1661
+ ],
1662
+ "output_type": "multiple_choice",
1663
+ "repeats": 1,
1664
+ "should_decontaminate": false,
1665
+ "metadata": {
1666
+ "version": 0.0
1667
+ }
1668
+ },
1669
+ "mmlu_human_sexuality": {
1670
+ "task": "mmlu_human_sexuality",
1671
+ "task_alias": "human_sexuality",
1672
+ "group": "mmlu_social_sciences",
1673
+ "group_alias": "social_sciences",
1674
+ "dataset_path": "hails/mmlu_no_train",
1675
+ "dataset_name": "human_sexuality",
1676
+ "test_split": "test",
1677
+ "fewshot_split": "dev",
1678
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1679
+ "doc_to_target": "answer",
1680
+ "doc_to_choice": [
1681
+ "A",
1682
+ "B",
1683
+ "C",
1684
+ "D"
1685
+ ],
1686
+ "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
1687
+ "target_delimiter": " ",
1688
+ "fewshot_delimiter": "\n\n",
1689
+ "fewshot_config": {
1690
+ "sampler": "first_n"
1691
+ },
1692
+ "num_fewshot": 5,
1693
+ "metric_list": [
1694
+ {
1695
+ "metric": "acc",
1696
+ "aggregation": "mean",
1697
+ "higher_is_better": true
1698
+ }
1699
+ ],
1700
+ "output_type": "multiple_choice",
1701
+ "repeats": 1,
1702
+ "should_decontaminate": false,
1703
+ "metadata": {
1704
+ "version": 0.0
1705
+ }
1706
+ },
1707
+ "mmlu_international_law": {
1708
+ "task": "mmlu_international_law",
1709
+ "task_alias": "international_law",
1710
+ "group": "mmlu_humanities",
1711
+ "group_alias": "humanities",
1712
+ "dataset_path": "hails/mmlu_no_train",
1713
+ "dataset_name": "international_law",
1714
+ "test_split": "test",
1715
+ "fewshot_split": "dev",
1716
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1717
+ "doc_to_target": "answer",
1718
+ "doc_to_choice": [
1719
+ "A",
1720
+ "B",
1721
+ "C",
1722
+ "D"
1723
+ ],
1724
+ "description": "The following are multiple choice questions (with answers) about international law.\n\n",
1725
+ "target_delimiter": " ",
1726
+ "fewshot_delimiter": "\n\n",
1727
+ "fewshot_config": {
1728
+ "sampler": "first_n"
1729
+ },
1730
+ "num_fewshot": 5,
1731
+ "metric_list": [
1732
+ {
1733
+ "metric": "acc",
1734
+ "aggregation": "mean",
1735
+ "higher_is_better": true
1736
+ }
1737
+ ],
1738
+ "output_type": "multiple_choice",
1739
+ "repeats": 1,
1740
+ "should_decontaminate": false,
1741
+ "metadata": {
1742
+ "version": 0.0
1743
+ }
1744
+ },
1745
+ "mmlu_jurisprudence": {
1746
+ "task": "mmlu_jurisprudence",
1747
+ "task_alias": "jurisprudence",
1748
+ "group": "mmlu_humanities",
1749
+ "group_alias": "humanities",
1750
+ "dataset_path": "hails/mmlu_no_train",
1751
+ "dataset_name": "jurisprudence",
1752
+ "test_split": "test",
1753
+ "fewshot_split": "dev",
1754
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1755
+ "doc_to_target": "answer",
1756
+ "doc_to_choice": [
1757
+ "A",
1758
+ "B",
1759
+ "C",
1760
+ "D"
1761
+ ],
1762
+ "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
1763
+ "target_delimiter": " ",
1764
+ "fewshot_delimiter": "\n\n",
1765
+ "fewshot_config": {
1766
+ "sampler": "first_n"
1767
+ },
1768
+ "num_fewshot": 5,
1769
+ "metric_list": [
1770
+ {
1771
+ "metric": "acc",
1772
+ "aggregation": "mean",
1773
+ "higher_is_better": true
1774
+ }
1775
+ ],
1776
+ "output_type": "multiple_choice",
1777
+ "repeats": 1,
1778
+ "should_decontaminate": false,
1779
+ "metadata": {
1780
+ "version": 0.0
1781
+ }
1782
+ },
1783
+ "mmlu_logical_fallacies": {
1784
+ "task": "mmlu_logical_fallacies",
1785
+ "task_alias": "logical_fallacies",
1786
+ "group": "mmlu_humanities",
1787
+ "group_alias": "humanities",
1788
+ "dataset_path": "hails/mmlu_no_train",
1789
+ "dataset_name": "logical_fallacies",
1790
+ "test_split": "test",
1791
+ "fewshot_split": "dev",
1792
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1793
+ "doc_to_target": "answer",
1794
+ "doc_to_choice": [
1795
+ "A",
1796
+ "B",
1797
+ "C",
1798
+ "D"
1799
+ ],
1800
+ "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
1801
+ "target_delimiter": " ",
1802
+ "fewshot_delimiter": "\n\n",
1803
+ "fewshot_config": {
1804
+ "sampler": "first_n"
1805
+ },
1806
+ "num_fewshot": 5,
1807
+ "metric_list": [
1808
+ {
1809
+ "metric": "acc",
1810
+ "aggregation": "mean",
1811
+ "higher_is_better": true
1812
+ }
1813
+ ],
1814
+ "output_type": "multiple_choice",
1815
+ "repeats": 1,
1816
+ "should_decontaminate": false,
1817
+ "metadata": {
1818
+ "version": 0.0
1819
+ }
1820
+ },
1821
+ "mmlu_machine_learning": {
1822
+ "task": "mmlu_machine_learning",
1823
+ "task_alias": "machine_learning",
1824
+ "group": "mmlu_stem",
1825
+ "group_alias": "stem",
1826
+ "dataset_path": "hails/mmlu_no_train",
1827
+ "dataset_name": "machine_learning",
1828
+ "test_split": "test",
1829
+ "fewshot_split": "dev",
1830
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1831
+ "doc_to_target": "answer",
1832
+ "doc_to_choice": [
1833
+ "A",
1834
+ "B",
1835
+ "C",
1836
+ "D"
1837
+ ],
1838
+ "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
1839
+ "target_delimiter": " ",
1840
+ "fewshot_delimiter": "\n\n",
1841
+ "fewshot_config": {
1842
+ "sampler": "first_n"
1843
+ },
1844
+ "num_fewshot": 5,
1845
+ "metric_list": [
1846
+ {
1847
+ "metric": "acc",
1848
+ "aggregation": "mean",
1849
+ "higher_is_better": true
1850
+ }
1851
+ ],
1852
+ "output_type": "multiple_choice",
1853
+ "repeats": 1,
1854
+ "should_decontaminate": false,
1855
+ "metadata": {
1856
+ "version": 0.0
1857
+ }
1858
+ },
1859
+ "mmlu_management": {
1860
+ "task": "mmlu_management",
1861
+ "task_alias": "management",
1862
+ "group": "mmlu_other",
1863
+ "group_alias": "other",
1864
+ "dataset_path": "hails/mmlu_no_train",
1865
+ "dataset_name": "management",
1866
+ "test_split": "test",
1867
+ "fewshot_split": "dev",
1868
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1869
+ "doc_to_target": "answer",
1870
+ "doc_to_choice": [
1871
+ "A",
1872
+ "B",
1873
+ "C",
1874
+ "D"
1875
+ ],
1876
+ "description": "The following are multiple choice questions (with answers) about management.\n\n",
1877
+ "target_delimiter": " ",
1878
+ "fewshot_delimiter": "\n\n",
1879
+ "fewshot_config": {
1880
+ "sampler": "first_n"
1881
+ },
1882
+ "num_fewshot": 5,
1883
+ "metric_list": [
1884
+ {
1885
+ "metric": "acc",
1886
+ "aggregation": "mean",
1887
+ "higher_is_better": true
1888
+ }
1889
+ ],
1890
+ "output_type": "multiple_choice",
1891
+ "repeats": 1,
1892
+ "should_decontaminate": false,
1893
+ "metadata": {
1894
+ "version": 0.0
1895
+ }
1896
+ },
1897
+ "mmlu_marketing": {
1898
+ "task": "mmlu_marketing",
1899
+ "task_alias": "marketing",
1900
+ "group": "mmlu_other",
1901
+ "group_alias": "other",
1902
+ "dataset_path": "hails/mmlu_no_train",
1903
+ "dataset_name": "marketing",
1904
+ "test_split": "test",
1905
+ "fewshot_split": "dev",
1906
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1907
+ "doc_to_target": "answer",
1908
+ "doc_to_choice": [
1909
+ "A",
1910
+ "B",
1911
+ "C",
1912
+ "D"
1913
+ ],
1914
+ "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
1915
+ "target_delimiter": " ",
1916
+ "fewshot_delimiter": "\n\n",
1917
+ "fewshot_config": {
1918
+ "sampler": "first_n"
1919
+ },
1920
+ "num_fewshot": 5,
1921
+ "metric_list": [
1922
+ {
1923
+ "metric": "acc",
1924
+ "aggregation": "mean",
1925
+ "higher_is_better": true
1926
+ }
1927
+ ],
1928
+ "output_type": "multiple_choice",
1929
+ "repeats": 1,
1930
+ "should_decontaminate": false,
1931
+ "metadata": {
1932
+ "version": 0.0
1933
+ }
1934
+ },
1935
+ "mmlu_medical_genetics": {
1936
+ "task": "mmlu_medical_genetics",
1937
+ "task_alias": "medical_genetics",
1938
+ "group": "mmlu_other",
1939
+ "group_alias": "other",
1940
+ "dataset_path": "hails/mmlu_no_train",
1941
+ "dataset_name": "medical_genetics",
1942
+ "test_split": "test",
1943
+ "fewshot_split": "dev",
1944
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1945
+ "doc_to_target": "answer",
1946
+ "doc_to_choice": [
1947
+ "A",
1948
+ "B",
1949
+ "C",
1950
+ "D"
1951
+ ],
1952
+ "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
1953
+ "target_delimiter": " ",
1954
+ "fewshot_delimiter": "\n\n",
1955
+ "fewshot_config": {
1956
+ "sampler": "first_n"
1957
+ },
1958
+ "num_fewshot": 5,
1959
+ "metric_list": [
1960
+ {
1961
+ "metric": "acc",
1962
+ "aggregation": "mean",
1963
+ "higher_is_better": true
1964
+ }
1965
+ ],
1966
+ "output_type": "multiple_choice",
1967
+ "repeats": 1,
1968
+ "should_decontaminate": false,
1969
+ "metadata": {
1970
+ "version": 0.0
1971
+ }
1972
+ },
1973
+ "mmlu_miscellaneous": {
1974
+ "task": "mmlu_miscellaneous",
1975
+ "task_alias": "miscellaneous",
1976
+ "group": "mmlu_other",
1977
+ "group_alias": "other",
1978
+ "dataset_path": "hails/mmlu_no_train",
1979
+ "dataset_name": "miscellaneous",
1980
+ "test_split": "test",
1981
+ "fewshot_split": "dev",
1982
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1983
+ "doc_to_target": "answer",
1984
+ "doc_to_choice": [
1985
+ "A",
1986
+ "B",
1987
+ "C",
1988
+ "D"
1989
+ ],
1990
+ "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
1991
+ "target_delimiter": " ",
1992
+ "fewshot_delimiter": "\n\n",
1993
+ "fewshot_config": {
1994
+ "sampler": "first_n"
1995
+ },
1996
+ "num_fewshot": 5,
1997
+ "metric_list": [
1998
+ {
1999
+ "metric": "acc",
2000
+ "aggregation": "mean",
2001
+ "higher_is_better": true
2002
+ }
2003
+ ],
2004
+ "output_type": "multiple_choice",
2005
+ "repeats": 1,
2006
+ "should_decontaminate": false,
2007
+ "metadata": {
2008
+ "version": 0.0
2009
+ }
2010
+ },
2011
+ "mmlu_moral_disputes": {
2012
+ "task": "mmlu_moral_disputes",
2013
+ "task_alias": "moral_disputes",
2014
+ "group": "mmlu_humanities",
2015
+ "group_alias": "humanities",
2016
+ "dataset_path": "hails/mmlu_no_train",
2017
+ "dataset_name": "moral_disputes",
2018
+ "test_split": "test",
2019
+ "fewshot_split": "dev",
2020
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2021
+ "doc_to_target": "answer",
2022
+ "doc_to_choice": [
2023
+ "A",
2024
+ "B",
2025
+ "C",
2026
+ "D"
2027
+ ],
2028
+ "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
2029
+ "target_delimiter": " ",
2030
+ "fewshot_delimiter": "\n\n",
2031
+ "fewshot_config": {
2032
+ "sampler": "first_n"
2033
+ },
2034
+ "num_fewshot": 5,
2035
+ "metric_list": [
2036
+ {
2037
+ "metric": "acc",
2038
+ "aggregation": "mean",
2039
+ "higher_is_better": true
2040
+ }
2041
+ ],
2042
+ "output_type": "multiple_choice",
2043
+ "repeats": 1,
2044
+ "should_decontaminate": false,
2045
+ "metadata": {
2046
+ "version": 0.0
2047
+ }
2048
+ },
2049
+ "mmlu_moral_scenarios": {
2050
+ "task": "mmlu_moral_scenarios",
2051
+ "task_alias": "moral_scenarios",
2052
+ "group": "mmlu_humanities",
2053
+ "group_alias": "humanities",
2054
+ "dataset_path": "hails/mmlu_no_train",
2055
+ "dataset_name": "moral_scenarios",
2056
+ "test_split": "test",
2057
+ "fewshot_split": "dev",
2058
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2059
+ "doc_to_target": "answer",
2060
+ "doc_to_choice": [
2061
+ "A",
2062
+ "B",
2063
+ "C",
2064
+ "D"
2065
+ ],
2066
+ "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
2067
+ "target_delimiter": " ",
2068
+ "fewshot_delimiter": "\n\n",
2069
+ "fewshot_config": {
2070
+ "sampler": "first_n"
2071
+ },
2072
+ "num_fewshot": 5,
2073
+ "metric_list": [
2074
+ {
2075
+ "metric": "acc",
2076
+ "aggregation": "mean",
2077
+ "higher_is_better": true
2078
+ }
2079
+ ],
2080
+ "output_type": "multiple_choice",
2081
+ "repeats": 1,
2082
+ "should_decontaminate": false,
2083
+ "metadata": {
2084
+ "version": 0.0
2085
+ }
2086
+ },
2087
+ "mmlu_nutrition": {
2088
+ "task": "mmlu_nutrition",
2089
+ "task_alias": "nutrition",
2090
+ "group": "mmlu_other",
2091
+ "group_alias": "other",
2092
+ "dataset_path": "hails/mmlu_no_train",
2093
+ "dataset_name": "nutrition",
2094
+ "test_split": "test",
2095
+ "fewshot_split": "dev",
2096
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2097
+ "doc_to_target": "answer",
2098
+ "doc_to_choice": [
2099
+ "A",
2100
+ "B",
2101
+ "C",
2102
+ "D"
2103
+ ],
2104
+ "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
2105
+ "target_delimiter": " ",
2106
+ "fewshot_delimiter": "\n\n",
2107
+ "fewshot_config": {
2108
+ "sampler": "first_n"
2109
+ },
2110
+ "num_fewshot": 5,
2111
+ "metric_list": [
2112
+ {
2113
+ "metric": "acc",
2114
+ "aggregation": "mean",
2115
+ "higher_is_better": true
2116
+ }
2117
+ ],
2118
+ "output_type": "multiple_choice",
2119
+ "repeats": 1,
2120
+ "should_decontaminate": false,
2121
+ "metadata": {
2122
+ "version": 0.0
2123
+ }
2124
+ },
2125
+ "mmlu_philosophy": {
2126
+ "task": "mmlu_philosophy",
2127
+ "task_alias": "philosophy",
2128
+ "group": "mmlu_humanities",
2129
+ "group_alias": "humanities",
2130
+ "dataset_path": "hails/mmlu_no_train",
2131
+ "dataset_name": "philosophy",
2132
+ "test_split": "test",
2133
+ "fewshot_split": "dev",
2134
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2135
+ "doc_to_target": "answer",
2136
+ "doc_to_choice": [
2137
+ "A",
2138
+ "B",
2139
+ "C",
2140
+ "D"
2141
+ ],
2142
+ "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
2143
+ "target_delimiter": " ",
2144
+ "fewshot_delimiter": "\n\n",
2145
+ "fewshot_config": {
2146
+ "sampler": "first_n"
2147
+ },
2148
+ "num_fewshot": 5,
2149
+ "metric_list": [
2150
+ {
2151
+ "metric": "acc",
2152
+ "aggregation": "mean",
2153
+ "higher_is_better": true
2154
+ }
2155
+ ],
2156
+ "output_type": "multiple_choice",
2157
+ "repeats": 1,
2158
+ "should_decontaminate": false,
2159
+ "metadata": {
2160
+ "version": 0.0
2161
+ }
2162
+ },
2163
+ "mmlu_prehistory": {
2164
+ "task": "mmlu_prehistory",
2165
+ "task_alias": "prehistory",
2166
+ "group": "mmlu_humanities",
2167
+ "group_alias": "humanities",
2168
+ "dataset_path": "hails/mmlu_no_train",
2169
+ "dataset_name": "prehistory",
2170
+ "test_split": "test",
2171
+ "fewshot_split": "dev",
2172
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2173
+ "doc_to_target": "answer",
2174
+ "doc_to_choice": [
2175
+ "A",
2176
+ "B",
2177
+ "C",
2178
+ "D"
2179
+ ],
2180
+ "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
2181
+ "target_delimiter": " ",
2182
+ "fewshot_delimiter": "\n\n",
2183
+ "fewshot_config": {
2184
+ "sampler": "first_n"
2185
+ },
2186
+ "num_fewshot": 5,
2187
+ "metric_list": [
2188
+ {
2189
+ "metric": "acc",
2190
+ "aggregation": "mean",
2191
+ "higher_is_better": true
2192
+ }
2193
+ ],
2194
+ "output_type": "multiple_choice",
2195
+ "repeats": 1,
2196
+ "should_decontaminate": false,
2197
+ "metadata": {
2198
+ "version": 0.0
2199
+ }
2200
+ },
2201
+ "mmlu_professional_accounting": {
2202
+ "task": "mmlu_professional_accounting",
2203
+ "task_alias": "professional_accounting",
2204
+ "group": "mmlu_other",
2205
+ "group_alias": "other",
2206
+ "dataset_path": "hails/mmlu_no_train",
2207
+ "dataset_name": "professional_accounting",
2208
+ "test_split": "test",
2209
+ "fewshot_split": "dev",
2210
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2211
+ "doc_to_target": "answer",
2212
+ "doc_to_choice": [
2213
+ "A",
2214
+ "B",
2215
+ "C",
2216
+ "D"
2217
+ ],
2218
+ "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
2219
+ "target_delimiter": " ",
2220
+ "fewshot_delimiter": "\n\n",
2221
+ "fewshot_config": {
2222
+ "sampler": "first_n"
2223
+ },
2224
+ "num_fewshot": 5,
2225
+ "metric_list": [
2226
+ {
2227
+ "metric": "acc",
2228
+ "aggregation": "mean",
2229
+ "higher_is_better": true
2230
+ }
2231
+ ],
2232
+ "output_type": "multiple_choice",
2233
+ "repeats": 1,
2234
+ "should_decontaminate": false,
2235
+ "metadata": {
2236
+ "version": 0.0
2237
+ }
2238
+ },
2239
+ "mmlu_professional_law": {
2240
+ "task": "mmlu_professional_law",
2241
+ "task_alias": "professional_law",
2242
+ "group": "mmlu_humanities",
2243
+ "group_alias": "humanities",
2244
+ "dataset_path": "hails/mmlu_no_train",
2245
+ "dataset_name": "professional_law",
2246
+ "test_split": "test",
2247
+ "fewshot_split": "dev",
2248
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2249
+ "doc_to_target": "answer",
2250
+ "doc_to_choice": [
2251
+ "A",
2252
+ "B",
2253
+ "C",
2254
+ "D"
2255
+ ],
2256
+ "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
2257
+ "target_delimiter": " ",
2258
+ "fewshot_delimiter": "\n\n",
2259
+ "fewshot_config": {
2260
+ "sampler": "first_n"
2261
+ },
2262
+ "num_fewshot": 5,
2263
+ "metric_list": [
2264
+ {
2265
+ "metric": "acc",
2266
+ "aggregation": "mean",
2267
+ "higher_is_better": true
2268
+ }
2269
+ ],
2270
+ "output_type": "multiple_choice",
2271
+ "repeats": 1,
2272
+ "should_decontaminate": false,
2273
+ "metadata": {
2274
+ "version": 0.0
2275
+ }
2276
+ },
2277
+ "mmlu_professional_medicine": {
2278
+ "task": "mmlu_professional_medicine",
2279
+ "task_alias": "professional_medicine",
2280
+ "group": "mmlu_other",
2281
+ "group_alias": "other",
2282
+ "dataset_path": "hails/mmlu_no_train",
2283
+ "dataset_name": "professional_medicine",
2284
+ "test_split": "test",
2285
+ "fewshot_split": "dev",
2286
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2287
+ "doc_to_target": "answer",
2288
+ "doc_to_choice": [
2289
+ "A",
2290
+ "B",
2291
+ "C",
2292
+ "D"
2293
+ ],
2294
+ "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
2295
+ "target_delimiter": " ",
2296
+ "fewshot_delimiter": "\n\n",
2297
+ "fewshot_config": {
2298
+ "sampler": "first_n"
2299
+ },
2300
+ "num_fewshot": 5,
2301
+ "metric_list": [
2302
+ {
2303
+ "metric": "acc",
2304
+ "aggregation": "mean",
2305
+ "higher_is_better": true
2306
+ }
2307
+ ],
2308
+ "output_type": "multiple_choice",
2309
+ "repeats": 1,
2310
+ "should_decontaminate": false,
2311
+ "metadata": {
2312
+ "version": 0.0
2313
+ }
2314
+ },
2315
+ "mmlu_professional_psychology": {
2316
+ "task": "mmlu_professional_psychology",
2317
+ "task_alias": "professional_psychology",
2318
+ "group": "mmlu_social_sciences",
2319
+ "group_alias": "social_sciences",
2320
+ "dataset_path": "hails/mmlu_no_train",
2321
+ "dataset_name": "professional_psychology",
2322
+ "test_split": "test",
2323
+ "fewshot_split": "dev",
2324
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2325
+ "doc_to_target": "answer",
2326
+ "doc_to_choice": [
2327
+ "A",
2328
+ "B",
2329
+ "C",
2330
+ "D"
2331
+ ],
2332
+ "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
2333
+ "target_delimiter": " ",
2334
+ "fewshot_delimiter": "\n\n",
2335
+ "fewshot_config": {
2336
+ "sampler": "first_n"
2337
+ },
2338
+ "num_fewshot": 5,
2339
+ "metric_list": [
2340
+ {
2341
+ "metric": "acc",
2342
+ "aggregation": "mean",
2343
+ "higher_is_better": true
2344
+ }
2345
+ ],
2346
+ "output_type": "multiple_choice",
2347
+ "repeats": 1,
2348
+ "should_decontaminate": false,
2349
+ "metadata": {
2350
+ "version": 0.0
2351
+ }
2352
+ },
2353
+ "mmlu_public_relations": {
2354
+ "task": "mmlu_public_relations",
2355
+ "task_alias": "public_relations",
2356
+ "group": "mmlu_social_sciences",
2357
+ "group_alias": "social_sciences",
2358
+ "dataset_path": "hails/mmlu_no_train",
2359
+ "dataset_name": "public_relations",
2360
+ "test_split": "test",
2361
+ "fewshot_split": "dev",
2362
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2363
+ "doc_to_target": "answer",
2364
+ "doc_to_choice": [
2365
+ "A",
2366
+ "B",
2367
+ "C",
2368
+ "D"
2369
+ ],
2370
+ "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
2371
+ "target_delimiter": " ",
2372
+ "fewshot_delimiter": "\n\n",
2373
+ "fewshot_config": {
2374
+ "sampler": "first_n"
2375
+ },
2376
+ "num_fewshot": 5,
2377
+ "metric_list": [
2378
+ {
2379
+ "metric": "acc",
2380
+ "aggregation": "mean",
2381
+ "higher_is_better": true
2382
+ }
2383
+ ],
2384
+ "output_type": "multiple_choice",
2385
+ "repeats": 1,
2386
+ "should_decontaminate": false,
2387
+ "metadata": {
2388
+ "version": 0.0
2389
+ }
2390
+ },
2391
+ "mmlu_security_studies": {
2392
+ "task": "mmlu_security_studies",
2393
+ "task_alias": "security_studies",
2394
+ "group": "mmlu_social_sciences",
2395
+ "group_alias": "social_sciences",
2396
+ "dataset_path": "hails/mmlu_no_train",
2397
+ "dataset_name": "security_studies",
2398
+ "test_split": "test",
2399
+ "fewshot_split": "dev",
2400
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2401
+ "doc_to_target": "answer",
2402
+ "doc_to_choice": [
2403
+ "A",
2404
+ "B",
2405
+ "C",
2406
+ "D"
2407
+ ],
2408
+ "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
2409
+ "target_delimiter": " ",
2410
+ "fewshot_delimiter": "\n\n",
2411
+ "fewshot_config": {
2412
+ "sampler": "first_n"
2413
+ },
2414
+ "num_fewshot": 5,
2415
+ "metric_list": [
2416
+ {
2417
+ "metric": "acc",
2418
+ "aggregation": "mean",
2419
+ "higher_is_better": true
2420
+ }
2421
+ ],
2422
+ "output_type": "multiple_choice",
2423
+ "repeats": 1,
2424
+ "should_decontaminate": false,
2425
+ "metadata": {
2426
+ "version": 0.0
2427
+ }
2428
+ },
2429
+ "mmlu_sociology": {
2430
+ "task": "mmlu_sociology",
2431
+ "task_alias": "sociology",
2432
+ "group": "mmlu_social_sciences",
2433
+ "group_alias": "social_sciences",
2434
+ "dataset_path": "hails/mmlu_no_train",
2435
+ "dataset_name": "sociology",
2436
+ "test_split": "test",
2437
+ "fewshot_split": "dev",
2438
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2439
+ "doc_to_target": "answer",
2440
+ "doc_to_choice": [
2441
+ "A",
2442
+ "B",
2443
+ "C",
2444
+ "D"
2445
+ ],
2446
+ "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
2447
+ "target_delimiter": " ",
2448
+ "fewshot_delimiter": "\n\n",
2449
+ "fewshot_config": {
2450
+ "sampler": "first_n"
2451
+ },
2452
+ "num_fewshot": 5,
2453
+ "metric_list": [
2454
+ {
2455
+ "metric": "acc",
2456
+ "aggregation": "mean",
2457
+ "higher_is_better": true
2458
+ }
2459
+ ],
2460
+ "output_type": "multiple_choice",
2461
+ "repeats": 1,
2462
+ "should_decontaminate": false,
2463
+ "metadata": {
2464
+ "version": 0.0
2465
+ }
2466
+ },
2467
+ "mmlu_us_foreign_policy": {
2468
+ "task": "mmlu_us_foreign_policy",
2469
+ "task_alias": "us_foreign_policy",
2470
+ "group": "mmlu_social_sciences",
2471
+ "group_alias": "social_sciences",
2472
+ "dataset_path": "hails/mmlu_no_train",
2473
+ "dataset_name": "us_foreign_policy",
2474
+ "test_split": "test",
2475
+ "fewshot_split": "dev",
2476
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2477
+ "doc_to_target": "answer",
2478
+ "doc_to_choice": [
2479
+ "A",
2480
+ "B",
2481
+ "C",
2482
+ "D"
2483
+ ],
2484
+ "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
2485
+ "target_delimiter": " ",
2486
+ "fewshot_delimiter": "\n\n",
2487
+ "fewshot_config": {
2488
+ "sampler": "first_n"
2489
+ },
2490
+ "num_fewshot": 5,
2491
+ "metric_list": [
2492
+ {
2493
+ "metric": "acc",
2494
+ "aggregation": "mean",
2495
+ "higher_is_better": true
2496
+ }
2497
+ ],
2498
+ "output_type": "multiple_choice",
2499
+ "repeats": 1,
2500
+ "should_decontaminate": false,
2501
+ "metadata": {
2502
+ "version": 0.0
2503
+ }
2504
+ },
2505
+ "mmlu_virology": {
2506
+ "task": "mmlu_virology",
2507
+ "task_alias": "virology",
2508
+ "group": "mmlu_other",
2509
+ "group_alias": "other",
2510
+ "dataset_path": "hails/mmlu_no_train",
2511
+ "dataset_name": "virology",
2512
+ "test_split": "test",
2513
+ "fewshot_split": "dev",
2514
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2515
+ "doc_to_target": "answer",
2516
+ "doc_to_choice": [
2517
+ "A",
2518
+ "B",
2519
+ "C",
2520
+ "D"
2521
+ ],
2522
+ "description": "The following are multiple choice questions (with answers) about virology.\n\n",
2523
+ "target_delimiter": " ",
2524
+ "fewshot_delimiter": "\n\n",
2525
+ "fewshot_config": {
2526
+ "sampler": "first_n"
2527
+ },
2528
+ "num_fewshot": 5,
2529
+ "metric_list": [
2530
+ {
2531
+ "metric": "acc",
2532
+ "aggregation": "mean",
2533
+ "higher_is_better": true
2534
+ }
2535
+ ],
2536
+ "output_type": "multiple_choice",
2537
+ "repeats": 1,
2538
+ "should_decontaminate": false,
2539
+ "metadata": {
2540
+ "version": 0.0
2541
+ }
2542
+ },
2543
+ "mmlu_world_religions": {
2544
+ "task": "mmlu_world_religions",
2545
+ "task_alias": "world_religions",
2546
+ "group": "mmlu_humanities",
2547
+ "group_alias": "humanities",
2548
+ "dataset_path": "hails/mmlu_no_train",
2549
+ "dataset_name": "world_religions",
2550
+ "test_split": "test",
2551
+ "fewshot_split": "dev",
2552
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2553
+ "doc_to_target": "answer",
2554
+ "doc_to_choice": [
2555
+ "A",
2556
+ "B",
2557
+ "C",
2558
+ "D"
2559
+ ],
2560
+ "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
2561
+ "target_delimiter": " ",
2562
+ "fewshot_delimiter": "\n\n",
2563
+ "fewshot_config": {
2564
+ "sampler": "first_n"
2565
+ },
2566
+ "num_fewshot": 5,
2567
+ "metric_list": [
2568
+ {
2569
+ "metric": "acc",
2570
+ "aggregation": "mean",
2571
+ "higher_is_better": true
2572
+ }
2573
+ ],
2574
+ "output_type": "multiple_choice",
2575
+ "repeats": 1,
2576
+ "should_decontaminate": false,
2577
+ "metadata": {
2578
+ "version": 0.0
2579
+ }
2580
+ }
2581
+ },
2582
+ "versions": {
2583
+ "mmlu_abstract_algebra": 0.0,
2584
+ "mmlu_anatomy": 0.0,
2585
+ "mmlu_astronomy": 0.0,
2586
+ "mmlu_business_ethics": 0.0,
2587
+ "mmlu_clinical_knowledge": 0.0,
2588
+ "mmlu_college_biology": 0.0,
2589
+ "mmlu_college_chemistry": 0.0,
2590
+ "mmlu_college_computer_science": 0.0,
2591
+ "mmlu_college_mathematics": 0.0,
2592
+ "mmlu_college_medicine": 0.0,
2593
+ "mmlu_college_physics": 0.0,
2594
+ "mmlu_computer_security": 0.0,
2595
+ "mmlu_conceptual_physics": 0.0,
2596
+ "mmlu_econometrics": 0.0,
2597
+ "mmlu_electrical_engineering": 0.0,
2598
+ "mmlu_elementary_mathematics": 0.0,
2599
+ "mmlu_formal_logic": 0.0,
2600
+ "mmlu_global_facts": 0.0,
2601
+ "mmlu_high_school_biology": 0.0,
2602
+ "mmlu_high_school_chemistry": 0.0,
2603
+ "mmlu_high_school_computer_science": 0.0,
2604
+ "mmlu_high_school_european_history": 0.0,
2605
+ "mmlu_high_school_geography": 0.0,
2606
+ "mmlu_high_school_government_and_politics": 0.0,
2607
+ "mmlu_high_school_macroeconomics": 0.0,
2608
+ "mmlu_high_school_mathematics": 0.0,
2609
+ "mmlu_high_school_microeconomics": 0.0,
2610
+ "mmlu_high_school_physics": 0.0,
2611
+ "mmlu_high_school_psychology": 0.0,
2612
+ "mmlu_high_school_statistics": 0.0,
2613
+ "mmlu_high_school_us_history": 0.0,
2614
+ "mmlu_high_school_world_history": 0.0,
2615
+ "mmlu_human_aging": 0.0,
2616
+ "mmlu_human_sexuality": 0.0,
2617
+ "mmlu_international_law": 0.0,
2618
+ "mmlu_jurisprudence": 0.0,
2619
+ "mmlu_logical_fallacies": 0.0,
2620
+ "mmlu_machine_learning": 0.0,
2621
+ "mmlu_management": 0.0,
2622
+ "mmlu_marketing": 0.0,
2623
+ "mmlu_medical_genetics": 0.0,
2624
+ "mmlu_miscellaneous": 0.0,
2625
+ "mmlu_moral_disputes": 0.0,
2626
+ "mmlu_moral_scenarios": 0.0,
2627
+ "mmlu_nutrition": 0.0,
2628
+ "mmlu_philosophy": 0.0,
2629
+ "mmlu_prehistory": 0.0,
2630
+ "mmlu_professional_accounting": 0.0,
2631
+ "mmlu_professional_law": 0.0,
2632
+ "mmlu_professional_medicine": 0.0,
2633
+ "mmlu_professional_psychology": 0.0,
2634
+ "mmlu_public_relations": 0.0,
2635
+ "mmlu_security_studies": 0.0,
2636
+ "mmlu_sociology": 0.0,
2637
+ "mmlu_us_foreign_policy": 0.0,
2638
+ "mmlu_virology": 0.0,
2639
+ "mmlu_world_religions": 0.0
2640
+ },
2641
+ "n-shot": {
2642
+ "mmlu": 0,
2643
+ "mmlu_abstract_algebra": 5,
2644
+ "mmlu_anatomy": 5,
2645
+ "mmlu_astronomy": 5,
2646
+ "mmlu_business_ethics": 5,
2647
+ "mmlu_clinical_knowledge": 5,
2648
+ "mmlu_college_biology": 5,
2649
+ "mmlu_college_chemistry": 5,
2650
+ "mmlu_college_computer_science": 5,
2651
+ "mmlu_college_mathematics": 5,
2652
+ "mmlu_college_medicine": 5,
2653
+ "mmlu_college_physics": 5,
2654
+ "mmlu_computer_security": 5,
2655
+ "mmlu_conceptual_physics": 5,
2656
+ "mmlu_econometrics": 5,
2657
+ "mmlu_electrical_engineering": 5,
2658
+ "mmlu_elementary_mathematics": 5,
2659
+ "mmlu_formal_logic": 5,
2660
+ "mmlu_global_facts": 5,
2661
+ "mmlu_high_school_biology": 5,
2662
+ "mmlu_high_school_chemistry": 5,
2663
+ "mmlu_high_school_computer_science": 5,
2664
+ "mmlu_high_school_european_history": 5,
2665
+ "mmlu_high_school_geography": 5,
2666
+ "mmlu_high_school_government_and_politics": 5,
2667
+ "mmlu_high_school_macroeconomics": 5,
2668
+ "mmlu_high_school_mathematics": 5,
2669
+ "mmlu_high_school_microeconomics": 5,
2670
+ "mmlu_high_school_physics": 5,
2671
+ "mmlu_high_school_psychology": 5,
2672
+ "mmlu_high_school_statistics": 5,
2673
+ "mmlu_high_school_us_history": 5,
2674
+ "mmlu_high_school_world_history": 5,
2675
+ "mmlu_human_aging": 5,
2676
+ "mmlu_human_sexuality": 5,
2677
+ "mmlu_humanities": 5,
2678
+ "mmlu_international_law": 5,
2679
+ "mmlu_jurisprudence": 5,
2680
+ "mmlu_logical_fallacies": 5,
2681
+ "mmlu_machine_learning": 5,
2682
+ "mmlu_management": 5,
2683
+ "mmlu_marketing": 5,
2684
+ "mmlu_medical_genetics": 5,
2685
+ "mmlu_miscellaneous": 5,
2686
+ "mmlu_moral_disputes": 5,
2687
+ "mmlu_moral_scenarios": 5,
2688
+ "mmlu_nutrition": 5,
2689
+ "mmlu_other": 5,
2690
+ "mmlu_philosophy": 5,
2691
+ "mmlu_prehistory": 5,
2692
+ "mmlu_professional_accounting": 5,
2693
+ "mmlu_professional_law": 5,
2694
+ "mmlu_professional_medicine": 5,
2695
+ "mmlu_professional_psychology": 5,
2696
+ "mmlu_public_relations": 5,
2697
+ "mmlu_security_studies": 5,
2698
+ "mmlu_social_sciences": 5,
2699
+ "mmlu_sociology": 5,
2700
+ "mmlu_stem": 5,
2701
+ "mmlu_us_foreign_policy": 5,
2702
+ "mmlu_virology": 5,
2703
+ "mmlu_world_religions": 5
2704
+ },
2705
+ "higher_is_better": {
2706
+ "mmlu": {
2707
+ "acc": true
2708
+ },
2709
+ "mmlu_abstract_algebra": {
2710
+ "acc": true
2711
+ },
2712
+ "mmlu_anatomy": {
2713
+ "acc": true
2714
+ },
2715
+ "mmlu_astronomy": {
2716
+ "acc": true
2717
+ },
2718
+ "mmlu_business_ethics": {
2719
+ "acc": true
2720
+ },
2721
+ "mmlu_clinical_knowledge": {
2722
+ "acc": true
2723
+ },
2724
+ "mmlu_college_biology": {
2725
+ "acc": true
2726
+ },
2727
+ "mmlu_college_chemistry": {
2728
+ "acc": true
2729
+ },
2730
+ "mmlu_college_computer_science": {
2731
+ "acc": true
2732
+ },
2733
+ "mmlu_college_mathematics": {
2734
+ "acc": true
2735
+ },
2736
+ "mmlu_college_medicine": {
2737
+ "acc": true
2738
+ },
2739
+ "mmlu_college_physics": {
2740
+ "acc": true
2741
+ },
2742
+ "mmlu_computer_security": {
2743
+ "acc": true
2744
+ },
2745
+ "mmlu_conceptual_physics": {
2746
+ "acc": true
2747
+ },
2748
+ "mmlu_econometrics": {
2749
+ "acc": true
2750
+ },
2751
+ "mmlu_electrical_engineering": {
2752
+ "acc": true
2753
+ },
2754
+ "mmlu_elementary_mathematics": {
2755
+ "acc": true
2756
+ },
2757
+ "mmlu_formal_logic": {
2758
+ "acc": true
2759
+ },
2760
+ "mmlu_global_facts": {
2761
+ "acc": true
2762
+ },
2763
+ "mmlu_high_school_biology": {
2764
+ "acc": true
2765
+ },
2766
+ "mmlu_high_school_chemistry": {
2767
+ "acc": true
2768
+ },
2769
+ "mmlu_high_school_computer_science": {
2770
+ "acc": true
2771
+ },
2772
+ "mmlu_high_school_european_history": {
2773
+ "acc": true
2774
+ },
2775
+ "mmlu_high_school_geography": {
2776
+ "acc": true
2777
+ },
2778
+ "mmlu_high_school_government_and_politics": {
2779
+ "acc": true
2780
+ },
2781
+ "mmlu_high_school_macroeconomics": {
2782
+ "acc": true
2783
+ },
2784
+ "mmlu_high_school_mathematics": {
2785
+ "acc": true
2786
+ },
2787
+ "mmlu_high_school_microeconomics": {
2788
+ "acc": true
2789
+ },
2790
+ "mmlu_high_school_physics": {
2791
+ "acc": true
2792
+ },
2793
+ "mmlu_high_school_psychology": {
2794
+ "acc": true
2795
+ },
2796
+ "mmlu_high_school_statistics": {
2797
+ "acc": true
2798
+ },
2799
+ "mmlu_high_school_us_history": {
2800
+ "acc": true
2801
+ },
2802
+ "mmlu_high_school_world_history": {
2803
+ "acc": true
2804
+ },
2805
+ "mmlu_human_aging": {
2806
+ "acc": true
2807
+ },
2808
+ "mmlu_human_sexuality": {
2809
+ "acc": true
2810
+ },
2811
+ "mmlu_humanities": {
2812
+ "acc": true
2813
+ },
2814
+ "mmlu_international_law": {
2815
+ "acc": true
2816
+ },
2817
+ "mmlu_jurisprudence": {
2818
+ "acc": true
2819
+ },
2820
+ "mmlu_logical_fallacies": {
2821
+ "acc": true
2822
+ },
2823
+ "mmlu_machine_learning": {
2824
+ "acc": true
2825
+ },
2826
+ "mmlu_management": {
2827
+ "acc": true
2828
+ },
2829
+ "mmlu_marketing": {
2830
+ "acc": true
2831
+ },
2832
+ "mmlu_medical_genetics": {
2833
+ "acc": true
2834
+ },
2835
+ "mmlu_miscellaneous": {
2836
+ "acc": true
2837
+ },
2838
+ "mmlu_moral_disputes": {
2839
+ "acc": true
2840
+ },
2841
+ "mmlu_moral_scenarios": {
2842
+ "acc": true
2843
+ },
2844
+ "mmlu_nutrition": {
2845
+ "acc": true
2846
+ },
2847
+ "mmlu_other": {
2848
+ "acc": true
2849
+ },
2850
+ "mmlu_philosophy": {
2851
+ "acc": true
2852
+ },
2853
+ "mmlu_prehistory": {
2854
+ "acc": true
2855
+ },
2856
+ "mmlu_professional_accounting": {
2857
+ "acc": true
2858
+ },
2859
+ "mmlu_professional_law": {
2860
+ "acc": true
2861
+ },
2862
+ "mmlu_professional_medicine": {
2863
+ "acc": true
2864
+ },
2865
+ "mmlu_professional_psychology": {
2866
+ "acc": true
2867
+ },
2868
+ "mmlu_public_relations": {
2869
+ "acc": true
2870
+ },
2871
+ "mmlu_security_studies": {
2872
+ "acc": true
2873
+ },
2874
+ "mmlu_social_sciences": {
2875
+ "acc": true
2876
+ },
2877
+ "mmlu_sociology": {
2878
+ "acc": true
2879
+ },
2880
+ "mmlu_stem": {
2881
+ "acc": true
2882
+ },
2883
+ "mmlu_us_foreign_policy": {
2884
+ "acc": true
2885
+ },
2886
+ "mmlu_virology": {
2887
+ "acc": true
2888
+ },
2889
+ "mmlu_world_religions": {
2890
+ "acc": true
2891
+ }
2892
+ },
2893
+ "n-samples": {
2894
+ "mmlu_philosophy": {
2895
+ "original": 311,
2896
+ "effective": 311
2897
+ },
2898
+ "mmlu_logical_fallacies": {
2899
+ "original": 163,
2900
+ "effective": 163
2901
+ },
2902
+ "mmlu_moral_disputes": {
2903
+ "original": 346,
2904
+ "effective": 346
2905
+ },
2906
+ "mmlu_jurisprudence": {
2907
+ "original": 108,
2908
+ "effective": 108
2909
+ },
2910
+ "mmlu_high_school_us_history": {
2911
+ "original": 204,
2912
+ "effective": 204
2913
+ },
2914
+ "mmlu_high_school_world_history": {
2915
+ "original": 237,
2916
+ "effective": 237
2917
+ },
2918
+ "mmlu_world_religions": {
2919
+ "original": 171,
2920
+ "effective": 171
2921
+ },
2922
+ "mmlu_moral_scenarios": {
2923
+ "original": 895,
2924
+ "effective": 895
2925
+ },
2926
+ "mmlu_prehistory": {
2927
+ "original": 324,
2928
+ "effective": 324
2929
+ },
2930
+ "mmlu_formal_logic": {
2931
+ "original": 126,
2932
+ "effective": 126
2933
+ },
2934
+ "mmlu_international_law": {
2935
+ "original": 121,
2936
+ "effective": 121
2937
+ },
2938
+ "mmlu_professional_law": {
2939
+ "original": 1534,
2940
+ "effective": 1534
2941
+ },
2942
+ "mmlu_high_school_european_history": {
2943
+ "original": 165,
2944
+ "effective": 165
2945
+ },
2946
+ "mmlu_public_relations": {
2947
+ "original": 110,
2948
+ "effective": 110
2949
+ },
2950
+ "mmlu_high_school_macroeconomics": {
2951
+ "original": 390,
2952
+ "effective": 390
2953
+ },
2954
+ "mmlu_human_sexuality": {
2955
+ "original": 131,
2956
+ "effective": 131
2957
+ },
2958
+ "mmlu_high_school_geography": {
2959
+ "original": 198,
2960
+ "effective": 198
2961
+ },
2962
+ "mmlu_high_school_psychology": {
2963
+ "original": 545,
2964
+ "effective": 545
2965
+ },
2966
+ "mmlu_high_school_microeconomics": {
2967
+ "original": 238,
2968
+ "effective": 238
2969
+ },
2970
+ "mmlu_high_school_government_and_politics": {
2971
+ "original": 193,
2972
+ "effective": 193
2973
+ },
2974
+ "mmlu_us_foreign_policy": {
2975
+ "original": 100,
2976
+ "effective": 100
2977
+ },
2978
+ "mmlu_sociology": {
2979
+ "original": 201,
2980
+ "effective": 201
2981
+ },
2982
+ "mmlu_security_studies": {
2983
+ "original": 245,
2984
+ "effective": 245
2985
+ },
2986
+ "mmlu_econometrics": {
2987
+ "original": 114,
2988
+ "effective": 114
2989
+ },
2990
+ "mmlu_professional_psychology": {
2991
+ "original": 612,
2992
+ "effective": 612
2993
+ },
2994
+ "mmlu_business_ethics": {
2995
+ "original": 100,
2996
+ "effective": 100
2997
+ },
2998
+ "mmlu_marketing": {
2999
+ "original": 234,
3000
+ "effective": 234
3001
+ },
3002
+ "mmlu_medical_genetics": {
3003
+ "original": 100,
3004
+ "effective": 100
3005
+ },
3006
+ "mmlu_clinical_knowledge": {
3007
+ "original": 265,
3008
+ "effective": 265
3009
+ },
3010
+ "mmlu_global_facts": {
3011
+ "original": 100,
3012
+ "effective": 100
3013
+ },
3014
+ "mmlu_human_aging": {
3015
+ "original": 223,
3016
+ "effective": 223
3017
+ },
3018
+ "mmlu_professional_medicine": {
3019
+ "original": 272,
3020
+ "effective": 272
3021
+ },
3022
+ "mmlu_nutrition": {
3023
+ "original": 306,
3024
+ "effective": 306
3025
+ },
3026
+ "mmlu_management": {
3027
+ "original": 103,
3028
+ "effective": 103
3029
+ },
3030
+ "mmlu_college_medicine": {
3031
+ "original": 173,
3032
+ "effective": 173
3033
+ },
3034
+ "mmlu_professional_accounting": {
3035
+ "original": 282,
3036
+ "effective": 282
3037
+ },
3038
+ "mmlu_virology": {
3039
+ "original": 166,
3040
+ "effective": 166
3041
+ },
3042
+ "mmlu_miscellaneous": {
3043
+ "original": 783,
3044
+ "effective": 783
3045
+ },
3046
+ "mmlu_abstract_algebra": {
3047
+ "original": 100,
3048
+ "effective": 100
3049
+ },
3050
+ "mmlu_college_biology": {
3051
+ "original": 144,
3052
+ "effective": 144
3053
+ },
3054
+ "mmlu_high_school_biology": {
3055
+ "original": 310,
3056
+ "effective": 310
3057
+ },
3058
+ "mmlu_conceptual_physics": {
3059
+ "original": 235,
3060
+ "effective": 235
3061
+ },
3062
+ "mmlu_computer_security": {
3063
+ "original": 100,
3064
+ "effective": 100
3065
+ },
3066
+ "mmlu_college_physics": {
3067
+ "original": 102,
3068
+ "effective": 102
3069
+ },
3070
+ "mmlu_college_chemistry": {
3071
+ "original": 100,
3072
+ "effective": 100
3073
+ },
3074
+ "mmlu_high_school_statistics": {
3075
+ "original": 216,
3076
+ "effective": 216
3077
+ },
3078
+ "mmlu_anatomy": {
3079
+ "original": 135,
3080
+ "effective": 135
3081
+ },
3082
+ "mmlu_high_school_mathematics": {
3083
+ "original": 270,
3084
+ "effective": 270
3085
+ },
3086
+ "mmlu_machine_learning": {
3087
+ "original": 112,
3088
+ "effective": 112
3089
+ },
3090
+ "mmlu_high_school_physics": {
3091
+ "original": 151,
3092
+ "effective": 151
3093
+ },
3094
+ "mmlu_electrical_engineering": {
3095
+ "original": 145,
3096
+ "effective": 145
3097
+ },
3098
+ "mmlu_college_computer_science": {
3099
+ "original": 100,
3100
+ "effective": 100
3101
+ },
3102
+ "mmlu_high_school_chemistry": {
3103
+ "original": 203,
3104
+ "effective": 203
3105
+ },
3106
+ "mmlu_astronomy": {
3107
+ "original": 152,
3108
+ "effective": 152
3109
+ },
3110
+ "mmlu_high_school_computer_science": {
3111
+ "original": 100,
3112
+ "effective": 100
3113
+ },
3114
+ "mmlu_elementary_mathematics": {
3115
+ "original": 378,
3116
+ "effective": 378
3117
+ },
3118
+ "mmlu_college_mathematics": {
3119
+ "original": 100,
3120
+ "effective": 100
3121
+ }
3122
+ },
3123
+ "config": {
3124
+ "model": "vllm",
3125
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
3126
+ "batch_size": "auto",
3127
+ "batch_sizes": [],
3128
+ "device": "cuda",
3129
+ "use_cache": null,
3130
+ "limit": null,
3131
+ "bootstrap_iters": 100000,
3132
+ "gen_kwargs": null,
3133
+ "random_seed": 0,
3134
+ "numpy_seed": 1234,
3135
+ "torch_seed": 1234,
3136
+ "fewshot_seed": 1234
3137
+ },
3138
+ "git_hash": null,
3139
+ "date": 1719624343.9120553,
3140
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
3141
+ "transformers_version": "4.41.2",
3142
+ "upper_git_hash": null,
3143
+ "task_hashes": {},
3144
+ "model_source": "vllm",
3145
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
3146
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
3147
+ "system_instruction": null,
3148
+ "system_instruction_sha": null,
3149
+ "fewshot_as_multiturn": false,
3150
+ "chat_template": null,
3151
+ "chat_template_sha": null,
3152
+ "start_time": 7638031.049041288,
3153
+ "end_time": 7641735.991916678,
3154
+ "total_evaluation_time_seconds": "3704.942875389941"
3155
+ }
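
Note: the "config" block above records the exact lm-evaluation-harness invocation (vLLM backend, 5-shot MMLU). A minimal sketch of an equivalent run through the harness's Python API follows; it assumes the lm_eval package with vLLM support is installed, and the local model path is the one recorded in "model_args", which will differ on other machines.

# Hedged sketch: reproduce this MMLU run with lm-evaluation-harness (assumed installed).
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/"
        "Open-Platypus-mse-damp0.1-ns512-seqlen4K,"
        "tensor_parallel_size=2,dtype=auto,add_bos_token=True,"
        "gpu_memory_utilization=0.4,max_model_len=4096"
    ),
    tasks=["mmlu"],
    num_fewshot=5,
    batch_size="auto",
)
print(results["results"]["mmlu"]["acc,none"])  # aggregate accuracy, as in the JSON above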
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec6d968bf6c3cfee803ba5301e1b2a55ff0a0a109378b5e1b00cc5e8dd46e50
3
+ size 1616610632
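
Note: the three lines above are a Git LFS pointer, not the weights themselves; they record only the object hash and size (about 1.6 GB here). A small standard-library sketch for parsing such a pointer is shown below; the file path is hypothetical and should point at a checked-out pointer file.

# Parse a Git LFS pointer file into its version/oid/size fields (pure stdlib).
def parse_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                key, _, value = line.strip().partition(" ")
                fields[key] = value
    return fields

info = parse_lfs_pointer("model.safetensors")  # hypothetical local path
print(info.get("oid"), int(info.get("size", 0)) / 1e9, "GB")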
quantize_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "bits": 4,
3
+ "group_size": 128,
4
+ "damp_percent": 0.1,
5
+ "desc_act": true,
6
+ "static_groups": false,
7
+ "sym": true,
8
+ "true_sequential": true,
9
+ "model_name_or_path": null,
10
+ "model_file_base_name": "model",
11
+ "quant_method": "gptq",
12
+ "checkpoint_format": "gptq"
13
+ }
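
Note: quantize_config.json mirrors the arguments passed to the GPTQ quantizer. A hedged sketch of producing an equivalent configuration with the AutoGPTQ package follows; the calibration step is only indicated in comments, and the 512-sample Open-Platypus calibration set implied by the model name is not loaded here.

# Hedged sketch: a GPTQ quantization config matching the JSON above (assumes auto-gptq).
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(
    bits=4,               # "bits": 4
    group_size=128,       # "group_size": 128
    damp_percent=0.1,     # "damp_percent": 0.1
    desc_act=True,        # "desc_act": true (activation-order quantization)
    static_groups=False,  # "static_groups": false
    sym=True,             # "sym": true
    true_sequential=True, # "true_sequential": true
)

model = AutoGPTQForCausalLM.from_pretrained("Qwen/Qwen2-1.5B-Instruct", quantize_config)
# model.quantize(calibration_examples)   # placeholder: 512 Open-Platypus samples, seqlen 4K
# model.save_quantized("gptq-qwen2-1.5b-instruct")  # writes quantize_config.json alongside weights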
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 32768,
39
+ "pad_token": "<|endoftext|>",
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "Qwen2Tokenizer",
42
+ "unk_token": null
43
+ }
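
Note: the chat_template above is the standard ChatML-style Qwen2 template, which injects a default system turn when none is supplied. A minimal sketch of rendering a prompt with it via transformers follows; the repository id is illustrative, since this commit ships the same template locally.

# Hedged sketch: render a ChatML prompt with the chat_template shown above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")  # illustrative repo id
messages = [{"role": "user", "content": "What is GPTQ quantization?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape of the rendered prompt:
# <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
# <|im_start|>user\nWhat is GPTQ quantization?<|im_end|>\n<|im_start|>assistant\n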
truthfulqa_mc2-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T00-43-07.648332.json ADDED
@@ -0,0 +1,94 @@
1
+ {
2
+ "results": {
3
+ "truthfulqa_mc2": {
4
+ "acc,none": 0.4350736881769816,
5
+ "acc_stderr,none": 0.01439772134626664,
6
+ "alias": "truthfulqa_mc2"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "truthfulqa_mc2": []
11
+ },
12
+ "configs": {
13
+ "truthfulqa_mc2": {
14
+ "task": "truthfulqa_mc2",
15
+ "group": [
16
+ "truthfulqa"
17
+ ],
18
+ "dataset_path": "truthful_qa",
19
+ "dataset_name": "multiple_choice",
20
+ "validation_split": "validation",
21
+ "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22
+ "doc_to_target": 0,
23
+ "doc_to_choice": "{{mc2_targets.choices}}",
24
+ "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n",
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "num_fewshot": 0,
29
+ "metric_list": [
30
+ {
31
+ "metric": "acc",
32
+ "aggregation": "mean",
33
+ "higher_is_better": true
34
+ }
35
+ ],
36
+ "output_type": "multiple_choice",
37
+ "repeats": 1,
38
+ "should_decontaminate": true,
39
+ "doc_to_decontamination_query": "question",
40
+ "metadata": {
41
+ "version": 2.0
42
+ }
43
+ }
44
+ },
45
+ "versions": {
46
+ "truthfulqa_mc2": 2.0
47
+ },
48
+ "n-shot": {
49
+ "truthfulqa_mc2": 0
50
+ },
51
+ "higher_is_better": {
52
+ "truthfulqa_mc2": {
53
+ "acc": true
54
+ }
55
+ },
56
+ "n-samples": {
57
+ "truthfulqa_mc2": {
58
+ "original": 817,
59
+ "effective": 817
60
+ }
61
+ },
62
+ "config": {
63
+ "model": "vllm",
64
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
65
+ "batch_size": "auto",
66
+ "batch_sizes": [],
67
+ "device": "cuda",
68
+ "use_cache": null,
69
+ "limit": null,
70
+ "bootstrap_iters": 100000,
71
+ "gen_kwargs": null,
72
+ "random_seed": 0,
73
+ "numpy_seed": 1234,
74
+ "torch_seed": 1234,
75
+ "fewshot_seed": 1234
76
+ },
77
+ "git_hash": null,
78
+ "date": 1719621686.9441924,
79
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
80
+ "transformers_version": "4.41.2",
81
+ "upper_git_hash": null,
82
+ "task_hashes": {},
83
+ "model_source": "vllm",
84
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
85
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
86
+ "system_instruction": null,
87
+ "system_instruction_sha": null,
88
+ "fewshot_as_multiturn": false,
89
+ "chat_template": null,
90
+ "chat_template_sha": null,
91
+ "start_time": 7635373.964217621,
92
+ "end_time": 7635480.001345955,
93
+ "total_evaluation_time_seconds": "106.03712833393365"
94
+ }
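
Note: the process_results function embedded in the truthfulqa_mc2 config above scores each document as the normalized probability mass assigned to the true answers. A self-contained numpy sketch of that computation follows, using made-up log-likelihoods purely for illustration.

# Self-contained sketch of the truthfulqa_mc2 per-document score (made-up numbers).
import numpy as np

labels = [1, 1, 0, 0, 0]                         # 1 = true answer, 0 = false answer
lls = np.array([-2.1, -3.0, -1.8, -2.5, -4.0])   # per-choice log-likelihoods

split_idx = labels.index(0)                      # everything before the first 0 is true
p_true = np.exp(lls[:split_idx])
p_false = np.exp(lls[split_idx:])
acc = p_true.sum() / (p_true.sum() + p_false.sum())
print(round(float(acc), 4))                      # averaged over 817 docs to give acc,none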
vllm-clearml/results_2024-06-29T00-41-08.891349.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "results": {
3
+ "winogrande": {
4
+ "acc,none": 0.6369376479873717,
5
+ "acc_stderr,none": 0.013515191866479218,
6
+ "alias": "winogrande"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "winogrande": []
11
+ },
12
+ "configs": {
13
+ "winogrande": {
14
+ "task": "winogrande",
15
+ "dataset_path": "winogrande",
16
+ "dataset_name": "winogrande_xl",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n",
20
+ "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n",
21
+ "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "acc",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ }
32
+ ],
33
+ "output_type": "multiple_choice",
34
+ "repeats": 1,
35
+ "should_decontaminate": true,
36
+ "doc_to_decontamination_query": "sentence",
37
+ "metadata": {
38
+ "version": 1.0
39
+ }
40
+ }
41
+ },
42
+ "versions": {
43
+ "winogrande": 1.0
44
+ },
45
+ "n-shot": {
46
+ "winogrande": 5
47
+ },
48
+ "higher_is_better": {
49
+ "winogrande": {
50
+ "acc": true
51
+ }
52
+ },
53
+ "n-samples": {
54
+ "winogrande": {
55
+ "original": 1267,
56
+ "effective": 1267
57
+ }
58
+ },
59
+ "config": {
60
+ "model": "vllm",
61
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
62
+ "batch_size": "auto",
63
+ "batch_sizes": [],
64
+ "device": "cuda",
65
+ "use_cache": null,
66
+ "limit": null,
67
+ "bootstrap_iters": 100000,
68
+ "gen_kwargs": null,
69
+ "random_seed": 0,
70
+ "numpy_seed": 1234,
71
+ "torch_seed": 1234,
72
+ "fewshot_seed": 1234
73
+ },
74
+ "git_hash": null,
75
+ "date": 1719621613.7207468,
76
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
77
+ "transformers_version": "4.41.2",
78
+ "upper_git_hash": null,
79
+ "task_hashes": {},
80
+ "model_source": "vllm",
81
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
82
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
83
+ "system_instruction": null,
84
+ "system_instruction_sha": null,
85
+ "fewshot_as_multiturn": false,
86
+ "chat_template": null,
87
+ "chat_template_sha": null,
88
+ "start_time": 7635301.614735226,
89
+ "end_time": 7635361.244645656,
90
+ "total_evaluation_time_seconds": "59.62991042993963"
91
+ }
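
Note: the winogrande config above builds two candidate prefixes by substituting each option for the blank and scores the shared continuation after the blank. A minimal sketch of that document handling follows, on a single hypothetical example document.

# Minimal sketch of the winogrande doc_to_choice / doc_to_target logic (hypothetical doc).
doc = {
    "sentence": "The trophy does not fit into the suitcase because _ is too large.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}

idx = doc["sentence"].index("_")
choices = [doc["sentence"][:idx] + opt for opt in (doc["option1"], doc["option2"])]  # doc_to_choice
target = doc["sentence"][idx + 1:].strip()                                           # doc_to_target
gold = {"1": 0, "2": 1}[doc["answer"]]                                               # gold index

# The harness scores log P(target | choice) for each prefix and predicts the argmax.
print(choices[gold] + " " + target)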
vllm-clearml/results_2024-06-29T00-43-07.648332.json ADDED
@@ -0,0 +1,94 @@
1
+ {
2
+ "results": {
3
+ "truthfulqa_mc2": {
4
+ "acc,none": 0.4350736881769816,
5
+ "acc_stderr,none": 0.01439772134626664,
6
+ "alias": "truthfulqa_mc2"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "truthfulqa_mc2": []
11
+ },
12
+ "configs": {
13
+ "truthfulqa_mc2": {
14
+ "task": "truthfulqa_mc2",
15
+ "group": [
16
+ "truthfulqa"
17
+ ],
18
+ "dataset_path": "truthful_qa",
19
+ "dataset_name": "multiple_choice",
20
+ "validation_split": "validation",
21
+ "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22
+ "doc_to_target": 0,
23
+ "doc_to_choice": "{{mc2_targets.choices}}",
24
+ "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n",
25
+ "description": "",
26
+ "target_delimiter": " ",
27
+ "fewshot_delimiter": "\n\n",
28
+ "num_fewshot": 0,
29
+ "metric_list": [
30
+ {
31
+ "metric": "acc",
32
+ "aggregation": "mean",
33
+ "higher_is_better": true
34
+ }
35
+ ],
36
+ "output_type": "multiple_choice",
37
+ "repeats": 1,
38
+ "should_decontaminate": true,
39
+ "doc_to_decontamination_query": "question",
40
+ "metadata": {
41
+ "version": 2.0
42
+ }
43
+ }
44
+ },
45
+ "versions": {
46
+ "truthfulqa_mc2": 2.0
47
+ },
48
+ "n-shot": {
49
+ "truthfulqa_mc2": 0
50
+ },
51
+ "higher_is_better": {
52
+ "truthfulqa_mc2": {
53
+ "acc": true
54
+ }
55
+ },
56
+ "n-samples": {
57
+ "truthfulqa_mc2": {
58
+ "original": 817,
59
+ "effective": 817
60
+ }
61
+ },
62
+ "config": {
63
+ "model": "vllm",
64
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
65
+ "batch_size": "auto",
66
+ "batch_sizes": [],
67
+ "device": "cuda",
68
+ "use_cache": null,
69
+ "limit": null,
70
+ "bootstrap_iters": 100000,
71
+ "gen_kwargs": null,
72
+ "random_seed": 0,
73
+ "numpy_seed": 1234,
74
+ "torch_seed": 1234,
75
+ "fewshot_seed": 1234
76
+ },
77
+ "git_hash": null,
78
+ "date": 1719621686.9441924,
79
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
80
+ "transformers_version": "4.41.2",
81
+ "upper_git_hash": null,
82
+ "task_hashes": {},
83
+ "model_source": "vllm",
84
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
85
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
86
+ "system_instruction": null,
87
+ "system_instruction_sha": null,
88
+ "fewshot_as_multiturn": false,
89
+ "chat_template": null,
90
+ "chat_template_sha": null,
91
+ "start_time": 7635373.964217621,
92
+ "end_time": 7635480.001345955,
93
+ "total_evaluation_time_seconds": "106.03712833393365"
94
+ }
vllm-clearml/results_2024-06-29T01-25-24.138070.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "acc,none": 0.47988448516231824,
5
+ "acc_stderr,none": 0.004985741706385721,
6
+ "acc_norm,none": 0.6526588329018124,
7
+ "acc_norm_stderr,none": 0.0047515221274184805,
8
+ "alias": "hellaswag"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "group": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
24
+ "doc_to_text": "{{query}}",
25
+ "doc_to_target": "{{label}}",
26
+ "doc_to_choice": "choices",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 10,
31
+ "metric_list": [
32
+ {
33
+ "metric": "acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": false,
46
+ "metadata": {
47
+ "version": 1.0
48
+ }
49
+ }
50
+ },
51
+ "versions": {
52
+ "hellaswag": 1.0
53
+ },
54
+ "n-shot": {
55
+ "hellaswag": 10
56
+ },
57
+ "higher_is_better": {
58
+ "hellaswag": {
59
+ "acc": true,
60
+ "acc_norm": true
61
+ }
62
+ },
63
+ "n-samples": {
64
+ "hellaswag": {
65
+ "original": 10042,
66
+ "effective": 10042
67
+ }
68
+ },
69
+ "config": {
70
+ "model": "vllm",
71
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
72
+ "batch_size": "auto",
73
+ "batch_sizes": [],
74
+ "device": "cuda",
75
+ "use_cache": null,
76
+ "limit": null,
77
+ "bootstrap_iters": 100000,
78
+ "gen_kwargs": null,
79
+ "random_seed": 0,
80
+ "numpy_seed": 1234,
81
+ "torch_seed": 1234,
82
+ "fewshot_seed": 1234
83
+ },
84
+ "git_hash": null,
85
+ "date": 1719621807.2691455,
86
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
87
+ "transformers_version": "4.41.2",
88
+ "upper_git_hash": null,
89
+ "task_hashes": {},
90
+ "model_source": "vllm",
91
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
92
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
93
+ "system_instruction": null,
94
+ "system_instruction_sha": null,
95
+ "fewshot_as_multiturn": false,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 7635492.847221878,
99
+ "end_time": 7638016.491339342,
100
+ "total_evaluation_time_seconds": "2523.64411746338"
101
+ }
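
Note: hellaswag reports both acc and acc_norm, and acc_norm length-normalizes each ending's log-likelihood before taking the argmax, which is why the two numbers above differ (0.480 vs 0.653). A minimal sketch with made-up scores follows; normalizing by the ending's length follows the harness's acc_norm convention.

# Minimal sketch of acc vs acc_norm selection for one hellaswag document (made-up numbers).
endings = [
    "puts the guitar away.",
    "keeps strumming and sings along to the song.",
    "eats.",
    "flies.",
]
loglikelihoods = [-14.0, -18.0, -9.0, -12.0]

pred_acc = max(range(len(endings)), key=lambda i: loglikelihoods[i])
pred_acc_norm = max(range(len(endings)), key=lambda i: loglikelihoods[i] / len(endings[i]))
print(pred_acc, pred_acc_norm)  # the two metrics can pick different endings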
vllm-clearml/results_2024-06-29T02-27-23.641279.json ADDED
@@ -0,0 +1,3155 @@
1
+ {
2
+ "results": {
3
+ "mmlu": {
4
+ "acc,none": 0.5438683948155534,
5
+ "acc_stderr,none": 0.004046605401658642,
6
+ "alias": "mmlu"
7
+ },
8
+ "mmlu_humanities": {
9
+ "alias": " - humanities",
10
+ "acc,none": 0.4973432518597237,
11
+ "acc_stderr,none": 0.0069921729107274965
12
+ },
13
+ "mmlu_formal_logic": {
14
+ "alias": " - formal_logic",
15
+ "acc,none": 0.36507936507936506,
16
+ "acc_stderr,none": 0.04306241259127153
17
+ },
18
+ "mmlu_high_school_european_history": {
19
+ "alias": " - high_school_european_history",
20
+ "acc,none": 0.6484848484848484,
21
+ "acc_stderr,none": 0.0372820699868265
22
+ },
23
+ "mmlu_high_school_us_history": {
24
+ "alias": " - high_school_us_history",
25
+ "acc,none": 0.6764705882352942,
26
+ "acc_stderr,none": 0.03283472056108561
27
+ },
28
+ "mmlu_high_school_world_history": {
29
+ "alias": " - high_school_world_history",
30
+ "acc,none": 0.7088607594936709,
31
+ "acc_stderr,none": 0.029571601065753374
32
+ },
33
+ "mmlu_international_law": {
34
+ "alias": " - international_law",
35
+ "acc,none": 0.6942148760330579,
36
+ "acc_stderr,none": 0.04205953933884124
37
+ },
38
+ "mmlu_jurisprudence": {
39
+ "alias": " - jurisprudence",
40
+ "acc,none": 0.6851851851851852,
41
+ "acc_stderr,none": 0.04489931073591312
42
+ },
43
+ "mmlu_logical_fallacies": {
44
+ "alias": " - logical_fallacies",
45
+ "acc,none": 0.7116564417177914,
46
+ "acc_stderr,none": 0.035590395316173425
47
+ },
48
+ "mmlu_moral_disputes": {
49
+ "alias": " - moral_disputes",
50
+ "acc,none": 0.630057803468208,
51
+ "acc_stderr,none": 0.025992472029306383
52
+ },
53
+ "mmlu_moral_scenarios": {
54
+ "alias": " - moral_scenarios",
55
+ "acc,none": 0.3195530726256983,
56
+ "acc_stderr,none": 0.015595520294147402
57
+ },
58
+ "mmlu_philosophy": {
59
+ "alias": " - philosophy",
60
+ "acc,none": 0.594855305466238,
61
+ "acc_stderr,none": 0.027882383791325953
62
+ },
63
+ "mmlu_prehistory": {
64
+ "alias": " - prehistory",
65
+ "acc,none": 0.5524691358024691,
66
+ "acc_stderr,none": 0.027667138569422715
67
+ },
68
+ "mmlu_professional_law": {
69
+ "alias": " - professional_law",
70
+ "acc,none": 0.4048239895697523,
71
+ "acc_stderr,none": 0.012536743830953994
72
+ },
73
+ "mmlu_world_religions": {
74
+ "alias": " - world_religions",
75
+ "acc,none": 0.6900584795321637,
76
+ "acc_stderr,none": 0.035469769593931624
77
+ },
78
+ "mmlu_other": {
79
+ "alias": " - other",
80
+ "acc,none": 0.5909237206308336,
81
+ "acc_stderr,none": 0.008549729756636936
82
+ },
83
+ "mmlu_business_ethics": {
84
+ "alias": " - business_ethics",
85
+ "acc,none": 0.61,
86
+ "acc_stderr,none": 0.04902071300001974
87
+ },
88
+ "mmlu_clinical_knowledge": {
89
+ "alias": " - clinical_knowledge",
90
+ "acc,none": 0.5735849056603773,
91
+ "acc_stderr,none": 0.030437794342983042
92
+ },
93
+ "mmlu_college_medicine": {
94
+ "alias": " - college_medicine",
95
+ "acc,none": 0.49710982658959535,
96
+ "acc_stderr,none": 0.038124005659748335
97
+ },
98
+ "mmlu_global_facts": {
99
+ "alias": " - global_facts",
100
+ "acc,none": 0.34,
101
+ "acc_stderr,none": 0.04760952285695235
102
+ },
103
+ "mmlu_human_aging": {
104
+ "alias": " - human_aging",
105
+ "acc,none": 0.5964125560538116,
106
+ "acc_stderr,none": 0.032928028193303135
107
+ },
108
+ "mmlu_management": {
109
+ "alias": " - management",
110
+ "acc,none": 0.7669902912621359,
111
+ "acc_stderr,none": 0.04185832598928315
112
+ },
113
+ "mmlu_marketing": {
114
+ "alias": " - marketing",
115
+ "acc,none": 0.8076923076923077,
116
+ "acc_stderr,none": 0.025819233256483727
117
+ },
118
+ "mmlu_medical_genetics": {
119
+ "alias": " - medical_genetics",
120
+ "acc,none": 0.56,
121
+ "acc_stderr,none": 0.04988876515698589
122
+ },
123
+ "mmlu_miscellaneous": {
124
+ "alias": " - miscellaneous",
125
+ "acc,none": 0.6845466155810983,
126
+ "acc_stderr,none": 0.016617501738763397
127
+ },
128
+ "mmlu_nutrition": {
129
+ "alias": " - nutrition",
130
+ "acc,none": 0.6503267973856209,
131
+ "acc_stderr,none": 0.0273053080762747
132
+ },
133
+ "mmlu_professional_accounting": {
134
+ "alias": " - professional_accounting",
135
+ "acc,none": 0.42907801418439717,
136
+ "acc_stderr,none": 0.02952591430255855
137
+ },
138
+ "mmlu_professional_medicine": {
139
+ "alias": " - professional_medicine",
140
+ "acc,none": 0.4338235294117647,
141
+ "acc_stderr,none": 0.03010563657001664
142
+ },
143
+ "mmlu_virology": {
144
+ "alias": " - virology",
145
+ "acc,none": 0.43373493975903615,
146
+ "acc_stderr,none": 0.03858158940685515
147
+ },
148
+ "mmlu_social_sciences": {
149
+ "alias": " - social_sciences",
150
+ "acc,none": 0.6360090997725056,
151
+ "acc_stderr,none": 0.008459352068826637
152
+ },
153
+ "mmlu_econometrics": {
154
+ "alias": " - econometrics",
155
+ "acc,none": 0.34210526315789475,
156
+ "acc_stderr,none": 0.04462917535336936
157
+ },
158
+ "mmlu_high_school_geography": {
159
+ "alias": " - high_school_geography",
160
+ "acc,none": 0.7272727272727273,
161
+ "acc_stderr,none": 0.03173071239071724
162
+ },
163
+ "mmlu_high_school_government_and_politics": {
164
+ "alias": " - high_school_government_and_politics",
165
+ "acc,none": 0.7979274611398963,
166
+ "acc_stderr,none": 0.02897908979429673
167
+ },
168
+ "mmlu_high_school_macroeconomics": {
169
+ "alias": " - high_school_macroeconomics",
170
+ "acc,none": 0.5384615384615384,
171
+ "acc_stderr,none": 0.025275892070240644
172
+ },
173
+ "mmlu_high_school_microeconomics": {
174
+ "alias": " - high_school_microeconomics",
175
+ "acc,none": 0.6176470588235294,
176
+ "acc_stderr,none": 0.03156663099215416
177
+ },
178
+ "mmlu_high_school_psychology": {
179
+ "alias": " - high_school_psychology",
180
+ "acc,none": 0.7321100917431193,
181
+ "acc_stderr,none": 0.018987462257978652
182
+ },
183
+ "mmlu_human_sexuality": {
184
+ "alias": " - human_sexuality",
185
+ "acc,none": 0.6412213740458015,
186
+ "acc_stderr,none": 0.04206739313864908
187
+ },
188
+ "mmlu_professional_psychology": {
189
+ "alias": " - professional_psychology",
190
+ "acc,none": 0.5294117647058824,
191
+ "acc_stderr,none": 0.020192808271433788
192
+ },
193
+ "mmlu_public_relations": {
194
+ "alias": " - public_relations",
195
+ "acc,none": 0.5636363636363636,
196
+ "acc_stderr,none": 0.04750185058907296
197
+ },
198
+ "mmlu_security_studies": {
199
+ "alias": " - security_studies",
200
+ "acc,none": 0.689795918367347,
201
+ "acc_stderr,none": 0.02961345987248438
202
+ },
203
+ "mmlu_sociology": {
204
+ "alias": " - sociology",
205
+ "acc,none": 0.736318407960199,
206
+ "acc_stderr,none": 0.031157150869355568
207
+ },
208
+ "mmlu_us_foreign_policy": {
209
+ "alias": " - us_foreign_policy",
210
+ "acc,none": 0.77,
211
+ "acc_stderr,none": 0.042295258468165044
212
+ },
213
+ "mmlu_stem": {
214
+ "alias": " - stem",
215
+ "acc,none": 0.4770060260069775,
216
+ "acc_stderr,none": 0.00876253277535237
217
+ },
218
+ "mmlu_abstract_algebra": {
219
+ "alias": " - abstract_algebra",
220
+ "acc,none": 0.38,
221
+ "acc_stderr,none": 0.04878317312145633
222
+ },
223
+ "mmlu_anatomy": {
224
+ "alias": " - anatomy",
225
+ "acc,none": 0.4666666666666667,
226
+ "acc_stderr,none": 0.043097329010363554
227
+ },
228
+ "mmlu_astronomy": {
229
+ "alias": " - astronomy",
230
+ "acc,none": 0.5526315789473685,
231
+ "acc_stderr,none": 0.04046336883978251
232
+ },
233
+ "mmlu_college_biology": {
234
+ "alias": " - college_biology",
235
+ "acc,none": 0.5486111111111112,
236
+ "acc_stderr,none": 0.04161402398403279
237
+ },
238
+ "mmlu_college_chemistry": {
239
+ "alias": " - college_chemistry",
240
+ "acc,none": 0.39,
241
+ "acc_stderr,none": 0.04902071300001975
242
+ },
243
+ "mmlu_college_computer_science": {
244
+ "alias": " - college_computer_science",
245
+ "acc,none": 0.53,
246
+ "acc_stderr,none": 0.05016135580465919
247
+ },
248
+ "mmlu_college_mathematics": {
249
+ "alias": " - college_mathematics",
250
+ "acc,none": 0.31,
251
+ "acc_stderr,none": 0.04648231987117316
252
+ },
253
+ "mmlu_college_physics": {
254
+ "alias": " - college_physics",
255
+ "acc,none": 0.37254901960784315,
256
+ "acc_stderr,none": 0.04810840148082633
257
+ },
258
+ "mmlu_computer_security": {
259
+ "alias": " - computer_security",
260
+ "acc,none": 0.71,
261
+ "acc_stderr,none": 0.045604802157206845
262
+ },
263
+ "mmlu_conceptual_physics": {
264
+ "alias": " - conceptual_physics",
265
+ "acc,none": 0.4765957446808511,
266
+ "acc_stderr,none": 0.03265019475033582
267
+ },
268
+ "mmlu_electrical_engineering": {
269
+ "alias": " - electrical_engineering",
270
+ "acc,none": 0.5793103448275863,
271
+ "acc_stderr,none": 0.0411391498118926
272
+ },
273
+ "mmlu_elementary_mathematics": {
274
+ "alias": " - elementary_mathematics",
275
+ "acc,none": 0.4417989417989418,
276
+ "acc_stderr,none": 0.025576257061253833
277
+ },
278
+ "mmlu_high_school_biology": {
279
+ "alias": " - high_school_biology",
280
+ "acc,none": 0.6161290322580645,
281
+ "acc_stderr,none": 0.027666182075539645
282
+ },
283
+ "mmlu_high_school_chemistry": {
284
+ "alias": " - high_school_chemistry",
285
+ "acc,none": 0.46798029556650245,
286
+ "acc_stderr,none": 0.035107665979592154
287
+ },
288
+ "mmlu_high_school_computer_science": {
289
+ "alias": " - high_school_computer_science",
290
+ "acc,none": 0.56,
291
+ "acc_stderr,none": 0.04988876515698589
292
+ },
293
+ "mmlu_high_school_mathematics": {
294
+ "alias": " - high_school_mathematics",
295
+ "acc,none": 0.37777777777777777,
296
+ "acc_stderr,none": 0.029560707392465718
297
+ },
298
+ "mmlu_high_school_physics": {
299
+ "alias": " - high_school_physics",
300
+ "acc,none": 0.33774834437086093,
301
+ "acc_stderr,none": 0.03861557546255169
302
+ },
303
+ "mmlu_high_school_statistics": {
304
+ "alias": " - high_school_statistics",
305
+ "acc,none": 0.4861111111111111,
306
+ "acc_stderr,none": 0.03408655867977749
307
+ },
308
+ "mmlu_machine_learning": {
309
+ "alias": " - machine_learning",
310
+ "acc,none": 0.4017857142857143,
311
+ "acc_stderr,none": 0.04653333146973647
312
+ }
313
+ },
314
+ "groups": {
315
+ "mmlu": {
316
+ "acc,none": 0.5438683948155534,
317
+ "acc_stderr,none": 0.004046605401658642,
318
+ "alias": "mmlu"
319
+ },
320
+ "mmlu_humanities": {
321
+ "alias": " - humanities",
322
+ "acc,none": 0.4973432518597237,
323
+ "acc_stderr,none": 0.0069921729107274965
324
+ },
325
+ "mmlu_other": {
326
+ "alias": " - other",
327
+ "acc,none": 0.5909237206308336,
328
+ "acc_stderr,none": 0.008549729756636936
329
+ },
330
+ "mmlu_social_sciences": {
331
+ "alias": " - social_sciences",
332
+ "acc,none": 0.6360090997725056,
333
+ "acc_stderr,none": 0.008459352068826637
334
+ },
335
+ "mmlu_stem": {
336
+ "alias": " - stem",
337
+ "acc,none": 0.4770060260069775,
338
+ "acc_stderr,none": 0.00876253277535237
339
+ }
340
+ },
341
+ "group_subtasks": {
342
+ "mmlu_stem": [
343
+ "mmlu_abstract_algebra",
344
+ "mmlu_college_biology",
345
+ "mmlu_high_school_biology",
346
+ "mmlu_conceptual_physics",
347
+ "mmlu_computer_security",
348
+ "mmlu_college_physics",
349
+ "mmlu_college_chemistry",
350
+ "mmlu_high_school_statistics",
351
+ "mmlu_anatomy",
352
+ "mmlu_high_school_mathematics",
353
+ "mmlu_machine_learning",
354
+ "mmlu_high_school_physics",
355
+ "mmlu_electrical_engineering",
356
+ "mmlu_college_computer_science",
357
+ "mmlu_high_school_chemistry",
358
+ "mmlu_astronomy",
359
+ "mmlu_high_school_computer_science",
360
+ "mmlu_elementary_mathematics",
361
+ "mmlu_college_mathematics"
362
+ ],
363
+ "mmlu_other": [
364
+ "mmlu_business_ethics",
365
+ "mmlu_marketing",
366
+ "mmlu_medical_genetics",
367
+ "mmlu_clinical_knowledge",
368
+ "mmlu_global_facts",
369
+ "mmlu_human_aging",
370
+ "mmlu_professional_medicine",
371
+ "mmlu_nutrition",
372
+ "mmlu_management",
373
+ "mmlu_college_medicine",
374
+ "mmlu_professional_accounting",
375
+ "mmlu_virology",
376
+ "mmlu_miscellaneous"
377
+ ],
378
+ "mmlu_social_sciences": [
379
+ "mmlu_public_relations",
380
+ "mmlu_high_school_macroeconomics",
381
+ "mmlu_human_sexuality",
382
+ "mmlu_high_school_geography",
383
+ "mmlu_high_school_psychology",
384
+ "mmlu_high_school_microeconomics",
385
+ "mmlu_high_school_government_and_politics",
386
+ "mmlu_us_foreign_policy",
387
+ "mmlu_sociology",
388
+ "mmlu_security_studies",
389
+ "mmlu_econometrics",
390
+ "mmlu_professional_psychology"
391
+ ],
392
+ "mmlu_humanities": [
393
+ "mmlu_philosophy",
394
+ "mmlu_logical_fallacies",
395
+ "mmlu_moral_disputes",
396
+ "mmlu_jurisprudence",
397
+ "mmlu_high_school_us_history",
398
+ "mmlu_high_school_world_history",
399
+ "mmlu_world_religions",
400
+ "mmlu_moral_scenarios",
401
+ "mmlu_prehistory",
402
+ "mmlu_formal_logic",
403
+ "mmlu_international_law",
404
+ "mmlu_professional_law",
405
+ "mmlu_high_school_european_history"
406
+ ],
407
+ "mmlu": [
408
+ "mmlu_humanities",
409
+ "mmlu_social_sciences",
410
+ "mmlu_other",
411
+ "mmlu_stem"
412
+ ]
413
+ },
414
+ "configs": {
415
+ "mmlu_abstract_algebra": {
416
+ "task": "mmlu_abstract_algebra",
417
+ "task_alias": "abstract_algebra",
418
+ "group": "mmlu_stem",
419
+ "group_alias": "stem",
420
+ "dataset_path": "hails/mmlu_no_train",
421
+ "dataset_name": "abstract_algebra",
422
+ "test_split": "test",
423
+ "fewshot_split": "dev",
424
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
425
+ "doc_to_target": "answer",
426
+ "doc_to_choice": [
427
+ "A",
428
+ "B",
429
+ "C",
430
+ "D"
431
+ ],
432
+ "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
433
+ "target_delimiter": " ",
434
+ "fewshot_delimiter": "\n\n",
435
+ "fewshot_config": {
436
+ "sampler": "first_n"
437
+ },
438
+ "num_fewshot": 5,
439
+ "metric_list": [
440
+ {
441
+ "metric": "acc",
442
+ "aggregation": "mean",
443
+ "higher_is_better": true
444
+ }
445
+ ],
446
+ "output_type": "multiple_choice",
447
+ "repeats": 1,
448
+ "should_decontaminate": false,
449
+ "metadata": {
450
+ "version": 0.0
451
+ }
452
+ },
453
+ "mmlu_anatomy": {
454
+ "task": "mmlu_anatomy",
455
+ "task_alias": "anatomy",
456
+ "group": "mmlu_stem",
457
+ "group_alias": "stem",
458
+ "dataset_path": "hails/mmlu_no_train",
459
+ "dataset_name": "anatomy",
460
+ "test_split": "test",
461
+ "fewshot_split": "dev",
462
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
463
+ "doc_to_target": "answer",
464
+ "doc_to_choice": [
465
+ "A",
466
+ "B",
467
+ "C",
468
+ "D"
469
+ ],
470
+ "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
471
+ "target_delimiter": " ",
472
+ "fewshot_delimiter": "\n\n",
473
+ "fewshot_config": {
474
+ "sampler": "first_n"
475
+ },
476
+ "num_fewshot": 5,
477
+ "metric_list": [
478
+ {
479
+ "metric": "acc",
480
+ "aggregation": "mean",
481
+ "higher_is_better": true
482
+ }
483
+ ],
484
+ "output_type": "multiple_choice",
485
+ "repeats": 1,
486
+ "should_decontaminate": false,
487
+ "metadata": {
488
+ "version": 0.0
489
+ }
490
+ },
491
+ "mmlu_astronomy": {
492
+ "task": "mmlu_astronomy",
493
+ "task_alias": "astronomy",
494
+ "group": "mmlu_stem",
495
+ "group_alias": "stem",
496
+ "dataset_path": "hails/mmlu_no_train",
497
+ "dataset_name": "astronomy",
498
+ "test_split": "test",
499
+ "fewshot_split": "dev",
500
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
501
+ "doc_to_target": "answer",
502
+ "doc_to_choice": [
503
+ "A",
504
+ "B",
505
+ "C",
506
+ "D"
507
+ ],
508
+ "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
509
+ "target_delimiter": " ",
510
+ "fewshot_delimiter": "\n\n",
511
+ "fewshot_config": {
512
+ "sampler": "first_n"
513
+ },
514
+ "num_fewshot": 5,
515
+ "metric_list": [
516
+ {
517
+ "metric": "acc",
518
+ "aggregation": "mean",
519
+ "higher_is_better": true
520
+ }
521
+ ],
522
+ "output_type": "multiple_choice",
523
+ "repeats": 1,
524
+ "should_decontaminate": false,
525
+ "metadata": {
526
+ "version": 0.0
527
+ }
528
+ },
529
+ "mmlu_business_ethics": {
530
+ "task": "mmlu_business_ethics",
531
+ "task_alias": "business_ethics",
532
+ "group": "mmlu_other",
533
+ "group_alias": "other",
534
+ "dataset_path": "hails/mmlu_no_train",
535
+ "dataset_name": "business_ethics",
536
+ "test_split": "test",
537
+ "fewshot_split": "dev",
538
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
539
+ "doc_to_target": "answer",
540
+ "doc_to_choice": [
541
+ "A",
542
+ "B",
543
+ "C",
544
+ "D"
545
+ ],
546
+ "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
547
+ "target_delimiter": " ",
548
+ "fewshot_delimiter": "\n\n",
549
+ "fewshot_config": {
550
+ "sampler": "first_n"
551
+ },
552
+ "num_fewshot": 5,
553
+ "metric_list": [
554
+ {
555
+ "metric": "acc",
556
+ "aggregation": "mean",
557
+ "higher_is_better": true
558
+ }
559
+ ],
560
+ "output_type": "multiple_choice",
561
+ "repeats": 1,
562
+ "should_decontaminate": false,
563
+ "metadata": {
564
+ "version": 0.0
565
+ }
566
+ },
567
+ "mmlu_clinical_knowledge": {
568
+ "task": "mmlu_clinical_knowledge",
569
+ "task_alias": "clinical_knowledge",
570
+ "group": "mmlu_other",
571
+ "group_alias": "other",
572
+ "dataset_path": "hails/mmlu_no_train",
573
+ "dataset_name": "clinical_knowledge",
574
+ "test_split": "test",
575
+ "fewshot_split": "dev",
576
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
577
+ "doc_to_target": "answer",
578
+ "doc_to_choice": [
579
+ "A",
580
+ "B",
581
+ "C",
582
+ "D"
583
+ ],
584
+ "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
585
+ "target_delimiter": " ",
586
+ "fewshot_delimiter": "\n\n",
587
+ "fewshot_config": {
588
+ "sampler": "first_n"
589
+ },
590
+ "num_fewshot": 5,
591
+ "metric_list": [
592
+ {
593
+ "metric": "acc",
594
+ "aggregation": "mean",
595
+ "higher_is_better": true
596
+ }
597
+ ],
598
+ "output_type": "multiple_choice",
599
+ "repeats": 1,
600
+ "should_decontaminate": false,
601
+ "metadata": {
602
+ "version": 0.0
603
+ }
604
+ },
605
+ "mmlu_college_biology": {
606
+ "task": "mmlu_college_biology",
607
+ "task_alias": "college_biology",
608
+ "group": "mmlu_stem",
609
+ "group_alias": "stem",
610
+ "dataset_path": "hails/mmlu_no_train",
611
+ "dataset_name": "college_biology",
612
+ "test_split": "test",
613
+ "fewshot_split": "dev",
614
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
615
+ "doc_to_target": "answer",
616
+ "doc_to_choice": [
617
+ "A",
618
+ "B",
619
+ "C",
620
+ "D"
621
+ ],
622
+ "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
623
+ "target_delimiter": " ",
624
+ "fewshot_delimiter": "\n\n",
625
+ "fewshot_config": {
626
+ "sampler": "first_n"
627
+ },
628
+ "num_fewshot": 5,
629
+ "metric_list": [
630
+ {
631
+ "metric": "acc",
632
+ "aggregation": "mean",
633
+ "higher_is_better": true
634
+ }
635
+ ],
636
+ "output_type": "multiple_choice",
637
+ "repeats": 1,
638
+ "should_decontaminate": false,
639
+ "metadata": {
640
+ "version": 0.0
641
+ }
642
+ },
643
+ "mmlu_college_chemistry": {
644
+ "task": "mmlu_college_chemistry",
645
+ "task_alias": "college_chemistry",
646
+ "group": "mmlu_stem",
647
+ "group_alias": "stem",
648
+ "dataset_path": "hails/mmlu_no_train",
649
+ "dataset_name": "college_chemistry",
650
+ "test_split": "test",
651
+ "fewshot_split": "dev",
652
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
653
+ "doc_to_target": "answer",
654
+ "doc_to_choice": [
655
+ "A",
656
+ "B",
657
+ "C",
658
+ "D"
659
+ ],
660
+ "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
661
+ "target_delimiter": " ",
662
+ "fewshot_delimiter": "\n\n",
663
+ "fewshot_config": {
664
+ "sampler": "first_n"
665
+ },
666
+ "num_fewshot": 5,
667
+ "metric_list": [
668
+ {
669
+ "metric": "acc",
670
+ "aggregation": "mean",
671
+ "higher_is_better": true
672
+ }
673
+ ],
674
+ "output_type": "multiple_choice",
675
+ "repeats": 1,
676
+ "should_decontaminate": false,
677
+ "metadata": {
678
+ "version": 0.0
679
+ }
680
+ },
681
+ "mmlu_college_computer_science": {
682
+ "task": "mmlu_college_computer_science",
683
+ "task_alias": "college_computer_science",
684
+ "group": "mmlu_stem",
685
+ "group_alias": "stem",
686
+ "dataset_path": "hails/mmlu_no_train",
687
+ "dataset_name": "college_computer_science",
688
+ "test_split": "test",
689
+ "fewshot_split": "dev",
690
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
691
+ "doc_to_target": "answer",
692
+ "doc_to_choice": [
693
+ "A",
694
+ "B",
695
+ "C",
696
+ "D"
697
+ ],
698
+ "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
699
+ "target_delimiter": " ",
700
+ "fewshot_delimiter": "\n\n",
701
+ "fewshot_config": {
702
+ "sampler": "first_n"
703
+ },
704
+ "num_fewshot": 5,
705
+ "metric_list": [
706
+ {
707
+ "metric": "acc",
708
+ "aggregation": "mean",
709
+ "higher_is_better": true
710
+ }
711
+ ],
712
+ "output_type": "multiple_choice",
713
+ "repeats": 1,
714
+ "should_decontaminate": false,
715
+ "metadata": {
716
+ "version": 0.0
717
+ }
718
+ },
719
+ "mmlu_college_mathematics": {
720
+ "task": "mmlu_college_mathematics",
721
+ "task_alias": "college_mathematics",
722
+ "group": "mmlu_stem",
723
+ "group_alias": "stem",
724
+ "dataset_path": "hails/mmlu_no_train",
725
+ "dataset_name": "college_mathematics",
726
+ "test_split": "test",
727
+ "fewshot_split": "dev",
728
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
729
+ "doc_to_target": "answer",
730
+ "doc_to_choice": [
731
+ "A",
732
+ "B",
733
+ "C",
734
+ "D"
735
+ ],
736
+ "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
737
+ "target_delimiter": " ",
738
+ "fewshot_delimiter": "\n\n",
739
+ "fewshot_config": {
740
+ "sampler": "first_n"
741
+ },
742
+ "num_fewshot": 5,
743
+ "metric_list": [
744
+ {
745
+ "metric": "acc",
746
+ "aggregation": "mean",
747
+ "higher_is_better": true
748
+ }
749
+ ],
750
+ "output_type": "multiple_choice",
751
+ "repeats": 1,
752
+ "should_decontaminate": false,
753
+ "metadata": {
754
+ "version": 0.0
755
+ }
756
+ },
757
+ "mmlu_college_medicine": {
758
+ "task": "mmlu_college_medicine",
759
+ "task_alias": "college_medicine",
760
+ "group": "mmlu_other",
761
+ "group_alias": "other",
762
+ "dataset_path": "hails/mmlu_no_train",
763
+ "dataset_name": "college_medicine",
764
+ "test_split": "test",
765
+ "fewshot_split": "dev",
766
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
767
+ "doc_to_target": "answer",
768
+ "doc_to_choice": [
769
+ "A",
770
+ "B",
771
+ "C",
772
+ "D"
773
+ ],
774
+ "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
775
+ "target_delimiter": " ",
776
+ "fewshot_delimiter": "\n\n",
777
+ "fewshot_config": {
778
+ "sampler": "first_n"
779
+ },
780
+ "num_fewshot": 5,
781
+ "metric_list": [
782
+ {
783
+ "metric": "acc",
784
+ "aggregation": "mean",
785
+ "higher_is_better": true
786
+ }
787
+ ],
788
+ "output_type": "multiple_choice",
789
+ "repeats": 1,
790
+ "should_decontaminate": false,
791
+ "metadata": {
792
+ "version": 0.0
793
+ }
794
+ },
795
+ "mmlu_college_physics": {
796
+ "task": "mmlu_college_physics",
797
+ "task_alias": "college_physics",
798
+ "group": "mmlu_stem",
799
+ "group_alias": "stem",
800
+ "dataset_path": "hails/mmlu_no_train",
801
+ "dataset_name": "college_physics",
802
+ "test_split": "test",
803
+ "fewshot_split": "dev",
804
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
805
+ "doc_to_target": "answer",
806
+ "doc_to_choice": [
807
+ "A",
808
+ "B",
809
+ "C",
810
+ "D"
811
+ ],
812
+ "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
813
+ "target_delimiter": " ",
814
+ "fewshot_delimiter": "\n\n",
815
+ "fewshot_config": {
816
+ "sampler": "first_n"
817
+ },
818
+ "num_fewshot": 5,
819
+ "metric_list": [
820
+ {
821
+ "metric": "acc",
822
+ "aggregation": "mean",
823
+ "higher_is_better": true
824
+ }
825
+ ],
826
+ "output_type": "multiple_choice",
827
+ "repeats": 1,
828
+ "should_decontaminate": false,
829
+ "metadata": {
830
+ "version": 0.0
831
+ }
832
+ },
833
+ "mmlu_computer_security": {
834
+ "task": "mmlu_computer_security",
835
+ "task_alias": "computer_security",
836
+ "group": "mmlu_stem",
837
+ "group_alias": "stem",
838
+ "dataset_path": "hails/mmlu_no_train",
839
+ "dataset_name": "computer_security",
840
+ "test_split": "test",
841
+ "fewshot_split": "dev",
842
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
843
+ "doc_to_target": "answer",
844
+ "doc_to_choice": [
845
+ "A",
846
+ "B",
847
+ "C",
848
+ "D"
849
+ ],
850
+ "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
851
+ "target_delimiter": " ",
852
+ "fewshot_delimiter": "\n\n",
853
+ "fewshot_config": {
854
+ "sampler": "first_n"
855
+ },
856
+ "num_fewshot": 5,
857
+ "metric_list": [
858
+ {
859
+ "metric": "acc",
860
+ "aggregation": "mean",
861
+ "higher_is_better": true
862
+ }
863
+ ],
864
+ "output_type": "multiple_choice",
865
+ "repeats": 1,
866
+ "should_decontaminate": false,
867
+ "metadata": {
868
+ "version": 0.0
869
+ }
870
+ },
871
+ "mmlu_conceptual_physics": {
872
+ "task": "mmlu_conceptual_physics",
873
+ "task_alias": "conceptual_physics",
874
+ "group": "mmlu_stem",
875
+ "group_alias": "stem",
876
+ "dataset_path": "hails/mmlu_no_train",
877
+ "dataset_name": "conceptual_physics",
878
+ "test_split": "test",
879
+ "fewshot_split": "dev",
880
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
881
+ "doc_to_target": "answer",
882
+ "doc_to_choice": [
883
+ "A",
884
+ "B",
885
+ "C",
886
+ "D"
887
+ ],
888
+ "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
889
+ "target_delimiter": " ",
890
+ "fewshot_delimiter": "\n\n",
891
+ "fewshot_config": {
892
+ "sampler": "first_n"
893
+ },
894
+ "num_fewshot": 5,
895
+ "metric_list": [
896
+ {
897
+ "metric": "acc",
898
+ "aggregation": "mean",
899
+ "higher_is_better": true
900
+ }
901
+ ],
902
+ "output_type": "multiple_choice",
903
+ "repeats": 1,
904
+ "should_decontaminate": false,
905
+ "metadata": {
906
+ "version": 0.0
907
+ }
908
+ },
909
+ "mmlu_econometrics": {
910
+ "task": "mmlu_econometrics",
911
+ "task_alias": "econometrics",
912
+ "group": "mmlu_social_sciences",
913
+ "group_alias": "social_sciences",
914
+ "dataset_path": "hails/mmlu_no_train",
915
+ "dataset_name": "econometrics",
916
+ "test_split": "test",
917
+ "fewshot_split": "dev",
918
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
919
+ "doc_to_target": "answer",
920
+ "doc_to_choice": [
921
+ "A",
922
+ "B",
923
+ "C",
924
+ "D"
925
+ ],
926
+ "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
927
+ "target_delimiter": " ",
928
+ "fewshot_delimiter": "\n\n",
929
+ "fewshot_config": {
930
+ "sampler": "first_n"
931
+ },
932
+ "num_fewshot": 5,
933
+ "metric_list": [
934
+ {
935
+ "metric": "acc",
936
+ "aggregation": "mean",
937
+ "higher_is_better": true
938
+ }
939
+ ],
940
+ "output_type": "multiple_choice",
941
+ "repeats": 1,
942
+ "should_decontaminate": false,
943
+ "metadata": {
944
+ "version": 0.0
945
+ }
946
+ },
947
+ "mmlu_electrical_engineering": {
948
+ "task": "mmlu_electrical_engineering",
949
+ "task_alias": "electrical_engineering",
950
+ "group": "mmlu_stem",
951
+ "group_alias": "stem",
952
+ "dataset_path": "hails/mmlu_no_train",
953
+ "dataset_name": "electrical_engineering",
954
+ "test_split": "test",
955
+ "fewshot_split": "dev",
956
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
957
+ "doc_to_target": "answer",
958
+ "doc_to_choice": [
959
+ "A",
960
+ "B",
961
+ "C",
962
+ "D"
963
+ ],
964
+ "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
965
+ "target_delimiter": " ",
966
+ "fewshot_delimiter": "\n\n",
967
+ "fewshot_config": {
968
+ "sampler": "first_n"
969
+ },
970
+ "num_fewshot": 5,
971
+ "metric_list": [
972
+ {
973
+ "metric": "acc",
974
+ "aggregation": "mean",
975
+ "higher_is_better": true
976
+ }
977
+ ],
978
+ "output_type": "multiple_choice",
979
+ "repeats": 1,
980
+ "should_decontaminate": false,
981
+ "metadata": {
982
+ "version": 0.0
983
+ }
984
+ },
985
+ "mmlu_elementary_mathematics": {
986
+ "task": "mmlu_elementary_mathematics",
987
+ "task_alias": "elementary_mathematics",
988
+ "group": "mmlu_stem",
989
+ "group_alias": "stem",
990
+ "dataset_path": "hails/mmlu_no_train",
991
+ "dataset_name": "elementary_mathematics",
992
+ "test_split": "test",
993
+ "fewshot_split": "dev",
994
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
995
+ "doc_to_target": "answer",
996
+ "doc_to_choice": [
997
+ "A",
998
+ "B",
999
+ "C",
1000
+ "D"
1001
+ ],
1002
+ "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
1003
+ "target_delimiter": " ",
1004
+ "fewshot_delimiter": "\n\n",
1005
+ "fewshot_config": {
1006
+ "sampler": "first_n"
1007
+ },
1008
+ "num_fewshot": 5,
1009
+ "metric_list": [
1010
+ {
1011
+ "metric": "acc",
1012
+ "aggregation": "mean",
1013
+ "higher_is_better": true
1014
+ }
1015
+ ],
1016
+ "output_type": "multiple_choice",
1017
+ "repeats": 1,
1018
+ "should_decontaminate": false,
1019
+ "metadata": {
1020
+ "version": 0.0
1021
+ }
1022
+ },
1023
+ "mmlu_formal_logic": {
1024
+ "task": "mmlu_formal_logic",
1025
+ "task_alias": "formal_logic",
1026
+ "group": "mmlu_humanities",
1027
+ "group_alias": "humanities",
1028
+ "dataset_path": "hails/mmlu_no_train",
1029
+ "dataset_name": "formal_logic",
1030
+ "test_split": "test",
1031
+ "fewshot_split": "dev",
1032
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1033
+ "doc_to_target": "answer",
1034
+ "doc_to_choice": [
1035
+ "A",
1036
+ "B",
1037
+ "C",
1038
+ "D"
1039
+ ],
1040
+ "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
1041
+ "target_delimiter": " ",
1042
+ "fewshot_delimiter": "\n\n",
1043
+ "fewshot_config": {
1044
+ "sampler": "first_n"
1045
+ },
1046
+ "num_fewshot": 5,
1047
+ "metric_list": [
1048
+ {
1049
+ "metric": "acc",
1050
+ "aggregation": "mean",
1051
+ "higher_is_better": true
1052
+ }
1053
+ ],
1054
+ "output_type": "multiple_choice",
1055
+ "repeats": 1,
1056
+ "should_decontaminate": false,
1057
+ "metadata": {
1058
+ "version": 0.0
1059
+ }
1060
+ },
1061
+ "mmlu_global_facts": {
1062
+ "task": "mmlu_global_facts",
1063
+ "task_alias": "global_facts",
1064
+ "group": "mmlu_other",
1065
+ "group_alias": "other",
1066
+ "dataset_path": "hails/mmlu_no_train",
1067
+ "dataset_name": "global_facts",
1068
+ "test_split": "test",
1069
+ "fewshot_split": "dev",
1070
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1071
+ "doc_to_target": "answer",
1072
+ "doc_to_choice": [
1073
+ "A",
1074
+ "B",
1075
+ "C",
1076
+ "D"
1077
+ ],
1078
+ "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
1079
+ "target_delimiter": " ",
1080
+ "fewshot_delimiter": "\n\n",
1081
+ "fewshot_config": {
1082
+ "sampler": "first_n"
1083
+ },
1084
+ "num_fewshot": 5,
1085
+ "metric_list": [
1086
+ {
1087
+ "metric": "acc",
1088
+ "aggregation": "mean",
1089
+ "higher_is_better": true
1090
+ }
1091
+ ],
1092
+ "output_type": "multiple_choice",
1093
+ "repeats": 1,
1094
+ "should_decontaminate": false,
1095
+ "metadata": {
1096
+ "version": 0.0
1097
+ }
1098
+ },
1099
+ "mmlu_high_school_biology": {
1100
+ "task": "mmlu_high_school_biology",
1101
+ "task_alias": "high_school_biology",
1102
+ "group": "mmlu_stem",
1103
+ "group_alias": "stem",
1104
+ "dataset_path": "hails/mmlu_no_train",
1105
+ "dataset_name": "high_school_biology",
1106
+ "test_split": "test",
1107
+ "fewshot_split": "dev",
1108
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1109
+ "doc_to_target": "answer",
1110
+ "doc_to_choice": [
1111
+ "A",
1112
+ "B",
1113
+ "C",
1114
+ "D"
1115
+ ],
1116
+ "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
1117
+ "target_delimiter": " ",
1118
+ "fewshot_delimiter": "\n\n",
1119
+ "fewshot_config": {
1120
+ "sampler": "first_n"
1121
+ },
1122
+ "num_fewshot": 5,
1123
+ "metric_list": [
1124
+ {
1125
+ "metric": "acc",
1126
+ "aggregation": "mean",
1127
+ "higher_is_better": true
1128
+ }
1129
+ ],
1130
+ "output_type": "multiple_choice",
1131
+ "repeats": 1,
1132
+ "should_decontaminate": false,
1133
+ "metadata": {
1134
+ "version": 0.0
1135
+ }
1136
+ },
1137
+ "mmlu_high_school_chemistry": {
1138
+ "task": "mmlu_high_school_chemistry",
1139
+ "task_alias": "high_school_chemistry",
1140
+ "group": "mmlu_stem",
1141
+ "group_alias": "stem",
1142
+ "dataset_path": "hails/mmlu_no_train",
1143
+ "dataset_name": "high_school_chemistry",
1144
+ "test_split": "test",
1145
+ "fewshot_split": "dev",
1146
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1147
+ "doc_to_target": "answer",
1148
+ "doc_to_choice": [
1149
+ "A",
1150
+ "B",
1151
+ "C",
1152
+ "D"
1153
+ ],
1154
+ "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
1155
+ "target_delimiter": " ",
1156
+ "fewshot_delimiter": "\n\n",
1157
+ "fewshot_config": {
1158
+ "sampler": "first_n"
1159
+ },
1160
+ "num_fewshot": 5,
1161
+ "metric_list": [
1162
+ {
1163
+ "metric": "acc",
1164
+ "aggregation": "mean",
1165
+ "higher_is_better": true
1166
+ }
1167
+ ],
1168
+ "output_type": "multiple_choice",
1169
+ "repeats": 1,
1170
+ "should_decontaminate": false,
1171
+ "metadata": {
1172
+ "version": 0.0
1173
+ }
1174
+ },
1175
+ "mmlu_high_school_computer_science": {
1176
+ "task": "mmlu_high_school_computer_science",
1177
+ "task_alias": "high_school_computer_science",
1178
+ "group": "mmlu_stem",
1179
+ "group_alias": "stem",
1180
+ "dataset_path": "hails/mmlu_no_train",
1181
+ "dataset_name": "high_school_computer_science",
1182
+ "test_split": "test",
1183
+ "fewshot_split": "dev",
1184
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1185
+ "doc_to_target": "answer",
1186
+ "doc_to_choice": [
1187
+ "A",
1188
+ "B",
1189
+ "C",
1190
+ "D"
1191
+ ],
1192
+ "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
1193
+ "target_delimiter": " ",
1194
+ "fewshot_delimiter": "\n\n",
1195
+ "fewshot_config": {
1196
+ "sampler": "first_n"
1197
+ },
1198
+ "num_fewshot": 5,
1199
+ "metric_list": [
1200
+ {
1201
+ "metric": "acc",
1202
+ "aggregation": "mean",
1203
+ "higher_is_better": true
1204
+ }
1205
+ ],
1206
+ "output_type": "multiple_choice",
1207
+ "repeats": 1,
1208
+ "should_decontaminate": false,
1209
+ "metadata": {
1210
+ "version": 0.0
1211
+ }
1212
+ },
1213
+ "mmlu_high_school_european_history": {
1214
+ "task": "mmlu_high_school_european_history",
1215
+ "task_alias": "high_school_european_history",
1216
+ "group": "mmlu_humanities",
1217
+ "group_alias": "humanities",
1218
+ "dataset_path": "hails/mmlu_no_train",
1219
+ "dataset_name": "high_school_european_history",
1220
+ "test_split": "test",
1221
+ "fewshot_split": "dev",
1222
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1223
+ "doc_to_target": "answer",
1224
+ "doc_to_choice": [
1225
+ "A",
1226
+ "B",
1227
+ "C",
1228
+ "D"
1229
+ ],
1230
+ "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
1231
+ "target_delimiter": " ",
1232
+ "fewshot_delimiter": "\n\n",
1233
+ "fewshot_config": {
1234
+ "sampler": "first_n"
1235
+ },
1236
+ "num_fewshot": 5,
1237
+ "metric_list": [
1238
+ {
1239
+ "metric": "acc",
1240
+ "aggregation": "mean",
1241
+ "higher_is_better": true
1242
+ }
1243
+ ],
1244
+ "output_type": "multiple_choice",
1245
+ "repeats": 1,
1246
+ "should_decontaminate": false,
1247
+ "metadata": {
1248
+ "version": 0.0
1249
+ }
1250
+ },
1251
+ "mmlu_high_school_geography": {
1252
+ "task": "mmlu_high_school_geography",
1253
+ "task_alias": "high_school_geography",
1254
+ "group": "mmlu_social_sciences",
1255
+ "group_alias": "social_sciences",
1256
+ "dataset_path": "hails/mmlu_no_train",
1257
+ "dataset_name": "high_school_geography",
1258
+ "test_split": "test",
1259
+ "fewshot_split": "dev",
1260
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1261
+ "doc_to_target": "answer",
1262
+ "doc_to_choice": [
1263
+ "A",
1264
+ "B",
1265
+ "C",
1266
+ "D"
1267
+ ],
1268
+ "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
1269
+ "target_delimiter": " ",
1270
+ "fewshot_delimiter": "\n\n",
1271
+ "fewshot_config": {
1272
+ "sampler": "first_n"
1273
+ },
1274
+ "num_fewshot": 5,
1275
+ "metric_list": [
1276
+ {
1277
+ "metric": "acc",
1278
+ "aggregation": "mean",
1279
+ "higher_is_better": true
1280
+ }
1281
+ ],
1282
+ "output_type": "multiple_choice",
1283
+ "repeats": 1,
1284
+ "should_decontaminate": false,
1285
+ "metadata": {
1286
+ "version": 0.0
1287
+ }
1288
+ },
1289
+ "mmlu_high_school_government_and_politics": {
1290
+ "task": "mmlu_high_school_government_and_politics",
1291
+ "task_alias": "high_school_government_and_politics",
1292
+ "group": "mmlu_social_sciences",
1293
+ "group_alias": "social_sciences",
1294
+ "dataset_path": "hails/mmlu_no_train",
1295
+ "dataset_name": "high_school_government_and_politics",
1296
+ "test_split": "test",
1297
+ "fewshot_split": "dev",
1298
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1299
+ "doc_to_target": "answer",
1300
+ "doc_to_choice": [
1301
+ "A",
1302
+ "B",
1303
+ "C",
1304
+ "D"
1305
+ ],
1306
+ "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
1307
+ "target_delimiter": " ",
1308
+ "fewshot_delimiter": "\n\n",
1309
+ "fewshot_config": {
1310
+ "sampler": "first_n"
1311
+ },
1312
+ "num_fewshot": 5,
1313
+ "metric_list": [
1314
+ {
1315
+ "metric": "acc",
1316
+ "aggregation": "mean",
1317
+ "higher_is_better": true
1318
+ }
1319
+ ],
1320
+ "output_type": "multiple_choice",
1321
+ "repeats": 1,
1322
+ "should_decontaminate": false,
1323
+ "metadata": {
1324
+ "version": 0.0
1325
+ }
1326
+ },
1327
+ "mmlu_high_school_macroeconomics": {
1328
+ "task": "mmlu_high_school_macroeconomics",
1329
+ "task_alias": "high_school_macroeconomics",
1330
+ "group": "mmlu_social_sciences",
1331
+ "group_alias": "social_sciences",
1332
+ "dataset_path": "hails/mmlu_no_train",
1333
+ "dataset_name": "high_school_macroeconomics",
1334
+ "test_split": "test",
1335
+ "fewshot_split": "dev",
1336
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1337
+ "doc_to_target": "answer",
1338
+ "doc_to_choice": [
1339
+ "A",
1340
+ "B",
1341
+ "C",
1342
+ "D"
1343
+ ],
1344
+ "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
1345
+ "target_delimiter": " ",
1346
+ "fewshot_delimiter": "\n\n",
1347
+ "fewshot_config": {
1348
+ "sampler": "first_n"
1349
+ },
1350
+ "num_fewshot": 5,
1351
+ "metric_list": [
1352
+ {
1353
+ "metric": "acc",
1354
+ "aggregation": "mean",
1355
+ "higher_is_better": true
1356
+ }
1357
+ ],
1358
+ "output_type": "multiple_choice",
1359
+ "repeats": 1,
1360
+ "should_decontaminate": false,
1361
+ "metadata": {
1362
+ "version": 0.0
1363
+ }
1364
+ },
1365
+ "mmlu_high_school_mathematics": {
1366
+ "task": "mmlu_high_school_mathematics",
1367
+ "task_alias": "high_school_mathematics",
1368
+ "group": "mmlu_stem",
1369
+ "group_alias": "stem",
1370
+ "dataset_path": "hails/mmlu_no_train",
1371
+ "dataset_name": "high_school_mathematics",
1372
+ "test_split": "test",
1373
+ "fewshot_split": "dev",
1374
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1375
+ "doc_to_target": "answer",
1376
+ "doc_to_choice": [
1377
+ "A",
1378
+ "B",
1379
+ "C",
1380
+ "D"
1381
+ ],
1382
+ "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
1383
+ "target_delimiter": " ",
1384
+ "fewshot_delimiter": "\n\n",
1385
+ "fewshot_config": {
1386
+ "sampler": "first_n"
1387
+ },
1388
+ "num_fewshot": 5,
1389
+ "metric_list": [
1390
+ {
1391
+ "metric": "acc",
1392
+ "aggregation": "mean",
1393
+ "higher_is_better": true
1394
+ }
1395
+ ],
1396
+ "output_type": "multiple_choice",
1397
+ "repeats": 1,
1398
+ "should_decontaminate": false,
1399
+ "metadata": {
1400
+ "version": 0.0
1401
+ }
1402
+ },
1403
+ "mmlu_high_school_microeconomics": {
1404
+ "task": "mmlu_high_school_microeconomics",
1405
+ "task_alias": "high_school_microeconomics",
1406
+ "group": "mmlu_social_sciences",
1407
+ "group_alias": "social_sciences",
1408
+ "dataset_path": "hails/mmlu_no_train",
1409
+ "dataset_name": "high_school_microeconomics",
1410
+ "test_split": "test",
1411
+ "fewshot_split": "dev",
1412
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1413
+ "doc_to_target": "answer",
1414
+ "doc_to_choice": [
1415
+ "A",
1416
+ "B",
1417
+ "C",
1418
+ "D"
1419
+ ],
1420
+ "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
1421
+ "target_delimiter": " ",
1422
+ "fewshot_delimiter": "\n\n",
1423
+ "fewshot_config": {
1424
+ "sampler": "first_n"
1425
+ },
1426
+ "num_fewshot": 5,
1427
+ "metric_list": [
1428
+ {
1429
+ "metric": "acc",
1430
+ "aggregation": "mean",
1431
+ "higher_is_better": true
1432
+ }
1433
+ ],
1434
+ "output_type": "multiple_choice",
1435
+ "repeats": 1,
1436
+ "should_decontaminate": false,
1437
+ "metadata": {
1438
+ "version": 0.0
1439
+ }
1440
+ },
1441
+ "mmlu_high_school_physics": {
1442
+ "task": "mmlu_high_school_physics",
1443
+ "task_alias": "high_school_physics",
1444
+ "group": "mmlu_stem",
1445
+ "group_alias": "stem",
1446
+ "dataset_path": "hails/mmlu_no_train",
1447
+ "dataset_name": "high_school_physics",
1448
+ "test_split": "test",
1449
+ "fewshot_split": "dev",
1450
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1451
+ "doc_to_target": "answer",
1452
+ "doc_to_choice": [
1453
+ "A",
1454
+ "B",
1455
+ "C",
1456
+ "D"
1457
+ ],
1458
+ "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
1459
+ "target_delimiter": " ",
1460
+ "fewshot_delimiter": "\n\n",
1461
+ "fewshot_config": {
1462
+ "sampler": "first_n"
1463
+ },
1464
+ "num_fewshot": 5,
1465
+ "metric_list": [
1466
+ {
1467
+ "metric": "acc",
1468
+ "aggregation": "mean",
1469
+ "higher_is_better": true
1470
+ }
1471
+ ],
1472
+ "output_type": "multiple_choice",
1473
+ "repeats": 1,
1474
+ "should_decontaminate": false,
1475
+ "metadata": {
1476
+ "version": 0.0
1477
+ }
1478
+ },
1479
+ "mmlu_high_school_psychology": {
1480
+ "task": "mmlu_high_school_psychology",
1481
+ "task_alias": "high_school_psychology",
1482
+ "group": "mmlu_social_sciences",
1483
+ "group_alias": "social_sciences",
1484
+ "dataset_path": "hails/mmlu_no_train",
1485
+ "dataset_name": "high_school_psychology",
1486
+ "test_split": "test",
1487
+ "fewshot_split": "dev",
1488
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1489
+ "doc_to_target": "answer",
1490
+ "doc_to_choice": [
1491
+ "A",
1492
+ "B",
1493
+ "C",
1494
+ "D"
1495
+ ],
1496
+ "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
1497
+ "target_delimiter": " ",
1498
+ "fewshot_delimiter": "\n\n",
1499
+ "fewshot_config": {
1500
+ "sampler": "first_n"
1501
+ },
1502
+ "num_fewshot": 5,
1503
+ "metric_list": [
1504
+ {
1505
+ "metric": "acc",
1506
+ "aggregation": "mean",
1507
+ "higher_is_better": true
1508
+ }
1509
+ ],
1510
+ "output_type": "multiple_choice",
1511
+ "repeats": 1,
1512
+ "should_decontaminate": false,
1513
+ "metadata": {
1514
+ "version": 0.0
1515
+ }
1516
+ },
1517
+ "mmlu_high_school_statistics": {
1518
+ "task": "mmlu_high_school_statistics",
1519
+ "task_alias": "high_school_statistics",
1520
+ "group": "mmlu_stem",
1521
+ "group_alias": "stem",
1522
+ "dataset_path": "hails/mmlu_no_train",
1523
+ "dataset_name": "high_school_statistics",
1524
+ "test_split": "test",
1525
+ "fewshot_split": "dev",
1526
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1527
+ "doc_to_target": "answer",
1528
+ "doc_to_choice": [
1529
+ "A",
1530
+ "B",
1531
+ "C",
1532
+ "D"
1533
+ ],
1534
+ "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
1535
+ "target_delimiter": " ",
1536
+ "fewshot_delimiter": "\n\n",
1537
+ "fewshot_config": {
1538
+ "sampler": "first_n"
1539
+ },
1540
+ "num_fewshot": 5,
1541
+ "metric_list": [
1542
+ {
1543
+ "metric": "acc",
1544
+ "aggregation": "mean",
1545
+ "higher_is_better": true
1546
+ }
1547
+ ],
1548
+ "output_type": "multiple_choice",
1549
+ "repeats": 1,
1550
+ "should_decontaminate": false,
1551
+ "metadata": {
1552
+ "version": 0.0
1553
+ }
1554
+ },
1555
+ "mmlu_high_school_us_history": {
1556
+ "task": "mmlu_high_school_us_history",
1557
+ "task_alias": "high_school_us_history",
1558
+ "group": "mmlu_humanities",
1559
+ "group_alias": "humanities",
1560
+ "dataset_path": "hails/mmlu_no_train",
1561
+ "dataset_name": "high_school_us_history",
1562
+ "test_split": "test",
1563
+ "fewshot_split": "dev",
1564
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1565
+ "doc_to_target": "answer",
1566
+ "doc_to_choice": [
1567
+ "A",
1568
+ "B",
1569
+ "C",
1570
+ "D"
1571
+ ],
1572
+ "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
1573
+ "target_delimiter": " ",
1574
+ "fewshot_delimiter": "\n\n",
1575
+ "fewshot_config": {
1576
+ "sampler": "first_n"
1577
+ },
1578
+ "num_fewshot": 5,
1579
+ "metric_list": [
1580
+ {
1581
+ "metric": "acc",
1582
+ "aggregation": "mean",
1583
+ "higher_is_better": true
1584
+ }
1585
+ ],
1586
+ "output_type": "multiple_choice",
1587
+ "repeats": 1,
1588
+ "should_decontaminate": false,
1589
+ "metadata": {
1590
+ "version": 0.0
1591
+ }
1592
+ },
1593
+ "mmlu_high_school_world_history": {
1594
+ "task": "mmlu_high_school_world_history",
1595
+ "task_alias": "high_school_world_history",
1596
+ "group": "mmlu_humanities",
1597
+ "group_alias": "humanities",
1598
+ "dataset_path": "hails/mmlu_no_train",
1599
+ "dataset_name": "high_school_world_history",
1600
+ "test_split": "test",
1601
+ "fewshot_split": "dev",
1602
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1603
+ "doc_to_target": "answer",
1604
+ "doc_to_choice": [
1605
+ "A",
1606
+ "B",
1607
+ "C",
1608
+ "D"
1609
+ ],
1610
+ "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
1611
+ "target_delimiter": " ",
1612
+ "fewshot_delimiter": "\n\n",
1613
+ "fewshot_config": {
1614
+ "sampler": "first_n"
1615
+ },
1616
+ "num_fewshot": 5,
1617
+ "metric_list": [
1618
+ {
1619
+ "metric": "acc",
1620
+ "aggregation": "mean",
1621
+ "higher_is_better": true
1622
+ }
1623
+ ],
1624
+ "output_type": "multiple_choice",
1625
+ "repeats": 1,
1626
+ "should_decontaminate": false,
1627
+ "metadata": {
1628
+ "version": 0.0
1629
+ }
1630
+ },
1631
+ "mmlu_human_aging": {
1632
+ "task": "mmlu_human_aging",
1633
+ "task_alias": "human_aging",
1634
+ "group": "mmlu_other",
1635
+ "group_alias": "other",
1636
+ "dataset_path": "hails/mmlu_no_train",
1637
+ "dataset_name": "human_aging",
1638
+ "test_split": "test",
1639
+ "fewshot_split": "dev",
1640
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1641
+ "doc_to_target": "answer",
1642
+ "doc_to_choice": [
1643
+ "A",
1644
+ "B",
1645
+ "C",
1646
+ "D"
1647
+ ],
1648
+ "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
1649
+ "target_delimiter": " ",
1650
+ "fewshot_delimiter": "\n\n",
1651
+ "fewshot_config": {
1652
+ "sampler": "first_n"
1653
+ },
1654
+ "num_fewshot": 5,
1655
+ "metric_list": [
1656
+ {
1657
+ "metric": "acc",
1658
+ "aggregation": "mean",
1659
+ "higher_is_better": true
1660
+ }
1661
+ ],
1662
+ "output_type": "multiple_choice",
1663
+ "repeats": 1,
1664
+ "should_decontaminate": false,
1665
+ "metadata": {
1666
+ "version": 0.0
1667
+ }
1668
+ },
1669
+ "mmlu_human_sexuality": {
1670
+ "task": "mmlu_human_sexuality",
1671
+ "task_alias": "human_sexuality",
1672
+ "group": "mmlu_social_sciences",
1673
+ "group_alias": "social_sciences",
1674
+ "dataset_path": "hails/mmlu_no_train",
1675
+ "dataset_name": "human_sexuality",
1676
+ "test_split": "test",
1677
+ "fewshot_split": "dev",
1678
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1679
+ "doc_to_target": "answer",
1680
+ "doc_to_choice": [
1681
+ "A",
1682
+ "B",
1683
+ "C",
1684
+ "D"
1685
+ ],
1686
+ "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
1687
+ "target_delimiter": " ",
1688
+ "fewshot_delimiter": "\n\n",
1689
+ "fewshot_config": {
1690
+ "sampler": "first_n"
1691
+ },
1692
+ "num_fewshot": 5,
1693
+ "metric_list": [
1694
+ {
1695
+ "metric": "acc",
1696
+ "aggregation": "mean",
1697
+ "higher_is_better": true
1698
+ }
1699
+ ],
1700
+ "output_type": "multiple_choice",
1701
+ "repeats": 1,
1702
+ "should_decontaminate": false,
1703
+ "metadata": {
1704
+ "version": 0.0
1705
+ }
1706
+ },
1707
+ "mmlu_international_law": {
1708
+ "task": "mmlu_international_law",
1709
+ "task_alias": "international_law",
1710
+ "group": "mmlu_humanities",
1711
+ "group_alias": "humanities",
1712
+ "dataset_path": "hails/mmlu_no_train",
1713
+ "dataset_name": "international_law",
1714
+ "test_split": "test",
1715
+ "fewshot_split": "dev",
1716
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1717
+ "doc_to_target": "answer",
1718
+ "doc_to_choice": [
1719
+ "A",
1720
+ "B",
1721
+ "C",
1722
+ "D"
1723
+ ],
1724
+ "description": "The following are multiple choice questions (with answers) about international law.\n\n",
1725
+ "target_delimiter": " ",
1726
+ "fewshot_delimiter": "\n\n",
1727
+ "fewshot_config": {
1728
+ "sampler": "first_n"
1729
+ },
1730
+ "num_fewshot": 5,
1731
+ "metric_list": [
1732
+ {
1733
+ "metric": "acc",
1734
+ "aggregation": "mean",
1735
+ "higher_is_better": true
1736
+ }
1737
+ ],
1738
+ "output_type": "multiple_choice",
1739
+ "repeats": 1,
1740
+ "should_decontaminate": false,
1741
+ "metadata": {
1742
+ "version": 0.0
1743
+ }
1744
+ },
1745
+ "mmlu_jurisprudence": {
1746
+ "task": "mmlu_jurisprudence",
1747
+ "task_alias": "jurisprudence",
1748
+ "group": "mmlu_humanities",
1749
+ "group_alias": "humanities",
1750
+ "dataset_path": "hails/mmlu_no_train",
1751
+ "dataset_name": "jurisprudence",
1752
+ "test_split": "test",
1753
+ "fewshot_split": "dev",
1754
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1755
+ "doc_to_target": "answer",
1756
+ "doc_to_choice": [
1757
+ "A",
1758
+ "B",
1759
+ "C",
1760
+ "D"
1761
+ ],
1762
+ "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
1763
+ "target_delimiter": " ",
1764
+ "fewshot_delimiter": "\n\n",
1765
+ "fewshot_config": {
1766
+ "sampler": "first_n"
1767
+ },
1768
+ "num_fewshot": 5,
1769
+ "metric_list": [
1770
+ {
1771
+ "metric": "acc",
1772
+ "aggregation": "mean",
1773
+ "higher_is_better": true
1774
+ }
1775
+ ],
1776
+ "output_type": "multiple_choice",
1777
+ "repeats": 1,
1778
+ "should_decontaminate": false,
1779
+ "metadata": {
1780
+ "version": 0.0
1781
+ }
1782
+ },
1783
+ "mmlu_logical_fallacies": {
1784
+ "task": "mmlu_logical_fallacies",
1785
+ "task_alias": "logical_fallacies",
1786
+ "group": "mmlu_humanities",
1787
+ "group_alias": "humanities",
1788
+ "dataset_path": "hails/mmlu_no_train",
1789
+ "dataset_name": "logical_fallacies",
1790
+ "test_split": "test",
1791
+ "fewshot_split": "dev",
1792
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1793
+ "doc_to_target": "answer",
1794
+ "doc_to_choice": [
1795
+ "A",
1796
+ "B",
1797
+ "C",
1798
+ "D"
1799
+ ],
1800
+ "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
1801
+ "target_delimiter": " ",
1802
+ "fewshot_delimiter": "\n\n",
1803
+ "fewshot_config": {
1804
+ "sampler": "first_n"
1805
+ },
1806
+ "num_fewshot": 5,
1807
+ "metric_list": [
1808
+ {
1809
+ "metric": "acc",
1810
+ "aggregation": "mean",
1811
+ "higher_is_better": true
1812
+ }
1813
+ ],
1814
+ "output_type": "multiple_choice",
1815
+ "repeats": 1,
1816
+ "should_decontaminate": false,
1817
+ "metadata": {
1818
+ "version": 0.0
1819
+ }
1820
+ },
1821
+ "mmlu_machine_learning": {
1822
+ "task": "mmlu_machine_learning",
1823
+ "task_alias": "machine_learning",
1824
+ "group": "mmlu_stem",
1825
+ "group_alias": "stem",
1826
+ "dataset_path": "hails/mmlu_no_train",
1827
+ "dataset_name": "machine_learning",
1828
+ "test_split": "test",
1829
+ "fewshot_split": "dev",
1830
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1831
+ "doc_to_target": "answer",
1832
+ "doc_to_choice": [
1833
+ "A",
1834
+ "B",
1835
+ "C",
1836
+ "D"
1837
+ ],
1838
+ "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
1839
+ "target_delimiter": " ",
1840
+ "fewshot_delimiter": "\n\n",
1841
+ "fewshot_config": {
1842
+ "sampler": "first_n"
1843
+ },
1844
+ "num_fewshot": 5,
1845
+ "metric_list": [
1846
+ {
1847
+ "metric": "acc",
1848
+ "aggregation": "mean",
1849
+ "higher_is_better": true
1850
+ }
1851
+ ],
1852
+ "output_type": "multiple_choice",
1853
+ "repeats": 1,
1854
+ "should_decontaminate": false,
1855
+ "metadata": {
1856
+ "version": 0.0
1857
+ }
1858
+ },
1859
+ "mmlu_management": {
1860
+ "task": "mmlu_management",
1861
+ "task_alias": "management",
1862
+ "group": "mmlu_other",
1863
+ "group_alias": "other",
1864
+ "dataset_path": "hails/mmlu_no_train",
1865
+ "dataset_name": "management",
1866
+ "test_split": "test",
1867
+ "fewshot_split": "dev",
1868
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1869
+ "doc_to_target": "answer",
1870
+ "doc_to_choice": [
1871
+ "A",
1872
+ "B",
1873
+ "C",
1874
+ "D"
1875
+ ],
1876
+ "description": "The following are multiple choice questions (with answers) about management.\n\n",
1877
+ "target_delimiter": " ",
1878
+ "fewshot_delimiter": "\n\n",
1879
+ "fewshot_config": {
1880
+ "sampler": "first_n"
1881
+ },
1882
+ "num_fewshot": 5,
1883
+ "metric_list": [
1884
+ {
1885
+ "metric": "acc",
1886
+ "aggregation": "mean",
1887
+ "higher_is_better": true
1888
+ }
1889
+ ],
1890
+ "output_type": "multiple_choice",
1891
+ "repeats": 1,
1892
+ "should_decontaminate": false,
1893
+ "metadata": {
1894
+ "version": 0.0
1895
+ }
1896
+ },
1897
+ "mmlu_marketing": {
1898
+ "task": "mmlu_marketing",
1899
+ "task_alias": "marketing",
1900
+ "group": "mmlu_other",
1901
+ "group_alias": "other",
1902
+ "dataset_path": "hails/mmlu_no_train",
1903
+ "dataset_name": "marketing",
1904
+ "test_split": "test",
1905
+ "fewshot_split": "dev",
1906
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1907
+ "doc_to_target": "answer",
1908
+ "doc_to_choice": [
1909
+ "A",
1910
+ "B",
1911
+ "C",
1912
+ "D"
1913
+ ],
1914
+ "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
1915
+ "target_delimiter": " ",
1916
+ "fewshot_delimiter": "\n\n",
1917
+ "fewshot_config": {
1918
+ "sampler": "first_n"
1919
+ },
1920
+ "num_fewshot": 5,
1921
+ "metric_list": [
1922
+ {
1923
+ "metric": "acc",
1924
+ "aggregation": "mean",
1925
+ "higher_is_better": true
1926
+ }
1927
+ ],
1928
+ "output_type": "multiple_choice",
1929
+ "repeats": 1,
1930
+ "should_decontaminate": false,
1931
+ "metadata": {
1932
+ "version": 0.0
1933
+ }
1934
+ },
1935
+ "mmlu_medical_genetics": {
1936
+ "task": "mmlu_medical_genetics",
1937
+ "task_alias": "medical_genetics",
1938
+ "group": "mmlu_other",
1939
+ "group_alias": "other",
1940
+ "dataset_path": "hails/mmlu_no_train",
1941
+ "dataset_name": "medical_genetics",
1942
+ "test_split": "test",
1943
+ "fewshot_split": "dev",
1944
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1945
+ "doc_to_target": "answer",
1946
+ "doc_to_choice": [
1947
+ "A",
1948
+ "B",
1949
+ "C",
1950
+ "D"
1951
+ ],
1952
+ "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
1953
+ "target_delimiter": " ",
1954
+ "fewshot_delimiter": "\n\n",
1955
+ "fewshot_config": {
1956
+ "sampler": "first_n"
1957
+ },
1958
+ "num_fewshot": 5,
1959
+ "metric_list": [
1960
+ {
1961
+ "metric": "acc",
1962
+ "aggregation": "mean",
1963
+ "higher_is_better": true
1964
+ }
1965
+ ],
1966
+ "output_type": "multiple_choice",
1967
+ "repeats": 1,
1968
+ "should_decontaminate": false,
1969
+ "metadata": {
1970
+ "version": 0.0
1971
+ }
1972
+ },
1973
+ "mmlu_miscellaneous": {
1974
+ "task": "mmlu_miscellaneous",
1975
+ "task_alias": "miscellaneous",
1976
+ "group": "mmlu_other",
1977
+ "group_alias": "other",
1978
+ "dataset_path": "hails/mmlu_no_train",
1979
+ "dataset_name": "miscellaneous",
1980
+ "test_split": "test",
1981
+ "fewshot_split": "dev",
1982
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
1983
+ "doc_to_target": "answer",
1984
+ "doc_to_choice": [
1985
+ "A",
1986
+ "B",
1987
+ "C",
1988
+ "D"
1989
+ ],
1990
+ "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
1991
+ "target_delimiter": " ",
1992
+ "fewshot_delimiter": "\n\n",
1993
+ "fewshot_config": {
1994
+ "sampler": "first_n"
1995
+ },
1996
+ "num_fewshot": 5,
1997
+ "metric_list": [
1998
+ {
1999
+ "metric": "acc",
2000
+ "aggregation": "mean",
2001
+ "higher_is_better": true
2002
+ }
2003
+ ],
2004
+ "output_type": "multiple_choice",
2005
+ "repeats": 1,
2006
+ "should_decontaminate": false,
2007
+ "metadata": {
2008
+ "version": 0.0
2009
+ }
2010
+ },
2011
+ "mmlu_moral_disputes": {
2012
+ "task": "mmlu_moral_disputes",
2013
+ "task_alias": "moral_disputes",
2014
+ "group": "mmlu_humanities",
2015
+ "group_alias": "humanities",
2016
+ "dataset_path": "hails/mmlu_no_train",
2017
+ "dataset_name": "moral_disputes",
2018
+ "test_split": "test",
2019
+ "fewshot_split": "dev",
2020
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2021
+ "doc_to_target": "answer",
2022
+ "doc_to_choice": [
2023
+ "A",
2024
+ "B",
2025
+ "C",
2026
+ "D"
2027
+ ],
2028
+ "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
2029
+ "target_delimiter": " ",
2030
+ "fewshot_delimiter": "\n\n",
2031
+ "fewshot_config": {
2032
+ "sampler": "first_n"
2033
+ },
2034
+ "num_fewshot": 5,
2035
+ "metric_list": [
2036
+ {
2037
+ "metric": "acc",
2038
+ "aggregation": "mean",
2039
+ "higher_is_better": true
2040
+ }
2041
+ ],
2042
+ "output_type": "multiple_choice",
2043
+ "repeats": 1,
2044
+ "should_decontaminate": false,
2045
+ "metadata": {
2046
+ "version": 0.0
2047
+ }
2048
+ },
2049
+ "mmlu_moral_scenarios": {
2050
+ "task": "mmlu_moral_scenarios",
2051
+ "task_alias": "moral_scenarios",
2052
+ "group": "mmlu_humanities",
2053
+ "group_alias": "humanities",
2054
+ "dataset_path": "hails/mmlu_no_train",
2055
+ "dataset_name": "moral_scenarios",
2056
+ "test_split": "test",
2057
+ "fewshot_split": "dev",
2058
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2059
+ "doc_to_target": "answer",
2060
+ "doc_to_choice": [
2061
+ "A",
2062
+ "B",
2063
+ "C",
2064
+ "D"
2065
+ ],
2066
+ "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
2067
+ "target_delimiter": " ",
2068
+ "fewshot_delimiter": "\n\n",
2069
+ "fewshot_config": {
2070
+ "sampler": "first_n"
2071
+ },
2072
+ "num_fewshot": 5,
2073
+ "metric_list": [
2074
+ {
2075
+ "metric": "acc",
2076
+ "aggregation": "mean",
2077
+ "higher_is_better": true
2078
+ }
2079
+ ],
2080
+ "output_type": "multiple_choice",
2081
+ "repeats": 1,
2082
+ "should_decontaminate": false,
2083
+ "metadata": {
2084
+ "version": 0.0
2085
+ }
2086
+ },
2087
+ "mmlu_nutrition": {
2088
+ "task": "mmlu_nutrition",
2089
+ "task_alias": "nutrition",
2090
+ "group": "mmlu_other",
2091
+ "group_alias": "other",
2092
+ "dataset_path": "hails/mmlu_no_train",
2093
+ "dataset_name": "nutrition",
2094
+ "test_split": "test",
2095
+ "fewshot_split": "dev",
2096
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2097
+ "doc_to_target": "answer",
2098
+ "doc_to_choice": [
2099
+ "A",
2100
+ "B",
2101
+ "C",
2102
+ "D"
2103
+ ],
2104
+ "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
2105
+ "target_delimiter": " ",
2106
+ "fewshot_delimiter": "\n\n",
2107
+ "fewshot_config": {
2108
+ "sampler": "first_n"
2109
+ },
2110
+ "num_fewshot": 5,
2111
+ "metric_list": [
2112
+ {
2113
+ "metric": "acc",
2114
+ "aggregation": "mean",
2115
+ "higher_is_better": true
2116
+ }
2117
+ ],
2118
+ "output_type": "multiple_choice",
2119
+ "repeats": 1,
2120
+ "should_decontaminate": false,
2121
+ "metadata": {
2122
+ "version": 0.0
2123
+ }
2124
+ },
2125
+ "mmlu_philosophy": {
2126
+ "task": "mmlu_philosophy",
2127
+ "task_alias": "philosophy",
2128
+ "group": "mmlu_humanities",
2129
+ "group_alias": "humanities",
2130
+ "dataset_path": "hails/mmlu_no_train",
2131
+ "dataset_name": "philosophy",
2132
+ "test_split": "test",
2133
+ "fewshot_split": "dev",
2134
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2135
+ "doc_to_target": "answer",
2136
+ "doc_to_choice": [
2137
+ "A",
2138
+ "B",
2139
+ "C",
2140
+ "D"
2141
+ ],
2142
+ "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
2143
+ "target_delimiter": " ",
2144
+ "fewshot_delimiter": "\n\n",
2145
+ "fewshot_config": {
2146
+ "sampler": "first_n"
2147
+ },
2148
+ "num_fewshot": 5,
2149
+ "metric_list": [
2150
+ {
2151
+ "metric": "acc",
2152
+ "aggregation": "mean",
2153
+ "higher_is_better": true
2154
+ }
2155
+ ],
2156
+ "output_type": "multiple_choice",
2157
+ "repeats": 1,
2158
+ "should_decontaminate": false,
2159
+ "metadata": {
2160
+ "version": 0.0
2161
+ }
2162
+ },
2163
+ "mmlu_prehistory": {
2164
+ "task": "mmlu_prehistory",
2165
+ "task_alias": "prehistory",
2166
+ "group": "mmlu_humanities",
2167
+ "group_alias": "humanities",
2168
+ "dataset_path": "hails/mmlu_no_train",
2169
+ "dataset_name": "prehistory",
2170
+ "test_split": "test",
2171
+ "fewshot_split": "dev",
2172
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2173
+ "doc_to_target": "answer",
2174
+ "doc_to_choice": [
2175
+ "A",
2176
+ "B",
2177
+ "C",
2178
+ "D"
2179
+ ],
2180
+ "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
2181
+ "target_delimiter": " ",
2182
+ "fewshot_delimiter": "\n\n",
2183
+ "fewshot_config": {
2184
+ "sampler": "first_n"
2185
+ },
2186
+ "num_fewshot": 5,
2187
+ "metric_list": [
2188
+ {
2189
+ "metric": "acc",
2190
+ "aggregation": "mean",
2191
+ "higher_is_better": true
2192
+ }
2193
+ ],
2194
+ "output_type": "multiple_choice",
2195
+ "repeats": 1,
2196
+ "should_decontaminate": false,
2197
+ "metadata": {
2198
+ "version": 0.0
2199
+ }
2200
+ },
2201
+ "mmlu_professional_accounting": {
2202
+ "task": "mmlu_professional_accounting",
2203
+ "task_alias": "professional_accounting",
2204
+ "group": "mmlu_other",
2205
+ "group_alias": "other",
2206
+ "dataset_path": "hails/mmlu_no_train",
2207
+ "dataset_name": "professional_accounting",
2208
+ "test_split": "test",
2209
+ "fewshot_split": "dev",
2210
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2211
+ "doc_to_target": "answer",
2212
+ "doc_to_choice": [
2213
+ "A",
2214
+ "B",
2215
+ "C",
2216
+ "D"
2217
+ ],
2218
+ "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
2219
+ "target_delimiter": " ",
2220
+ "fewshot_delimiter": "\n\n",
2221
+ "fewshot_config": {
2222
+ "sampler": "first_n"
2223
+ },
2224
+ "num_fewshot": 5,
2225
+ "metric_list": [
2226
+ {
2227
+ "metric": "acc",
2228
+ "aggregation": "mean",
2229
+ "higher_is_better": true
2230
+ }
2231
+ ],
2232
+ "output_type": "multiple_choice",
2233
+ "repeats": 1,
2234
+ "should_decontaminate": false,
2235
+ "metadata": {
2236
+ "version": 0.0
2237
+ }
2238
+ },
2239
+ "mmlu_professional_law": {
2240
+ "task": "mmlu_professional_law",
2241
+ "task_alias": "professional_law",
2242
+ "group": "mmlu_humanities",
2243
+ "group_alias": "humanities",
2244
+ "dataset_path": "hails/mmlu_no_train",
2245
+ "dataset_name": "professional_law",
2246
+ "test_split": "test",
2247
+ "fewshot_split": "dev",
2248
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2249
+ "doc_to_target": "answer",
2250
+ "doc_to_choice": [
2251
+ "A",
2252
+ "B",
2253
+ "C",
2254
+ "D"
2255
+ ],
2256
+ "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
2257
+ "target_delimiter": " ",
2258
+ "fewshot_delimiter": "\n\n",
2259
+ "fewshot_config": {
2260
+ "sampler": "first_n"
2261
+ },
2262
+ "num_fewshot": 5,
2263
+ "metric_list": [
2264
+ {
2265
+ "metric": "acc",
2266
+ "aggregation": "mean",
2267
+ "higher_is_better": true
2268
+ }
2269
+ ],
2270
+ "output_type": "multiple_choice",
2271
+ "repeats": 1,
2272
+ "should_decontaminate": false,
2273
+ "metadata": {
2274
+ "version": 0.0
2275
+ }
2276
+ },
2277
+ "mmlu_professional_medicine": {
2278
+ "task": "mmlu_professional_medicine",
2279
+ "task_alias": "professional_medicine",
2280
+ "group": "mmlu_other",
2281
+ "group_alias": "other",
2282
+ "dataset_path": "hails/mmlu_no_train",
2283
+ "dataset_name": "professional_medicine",
2284
+ "test_split": "test",
2285
+ "fewshot_split": "dev",
2286
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2287
+ "doc_to_target": "answer",
2288
+ "doc_to_choice": [
2289
+ "A",
2290
+ "B",
2291
+ "C",
2292
+ "D"
2293
+ ],
2294
+ "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
2295
+ "target_delimiter": " ",
2296
+ "fewshot_delimiter": "\n\n",
2297
+ "fewshot_config": {
2298
+ "sampler": "first_n"
2299
+ },
2300
+ "num_fewshot": 5,
2301
+ "metric_list": [
2302
+ {
2303
+ "metric": "acc",
2304
+ "aggregation": "mean",
2305
+ "higher_is_better": true
2306
+ }
2307
+ ],
2308
+ "output_type": "multiple_choice",
2309
+ "repeats": 1,
2310
+ "should_decontaminate": false,
2311
+ "metadata": {
2312
+ "version": 0.0
2313
+ }
2314
+ },
2315
+ "mmlu_professional_psychology": {
2316
+ "task": "mmlu_professional_psychology",
2317
+ "task_alias": "professional_psychology",
2318
+ "group": "mmlu_social_sciences",
2319
+ "group_alias": "social_sciences",
2320
+ "dataset_path": "hails/mmlu_no_train",
2321
+ "dataset_name": "professional_psychology",
2322
+ "test_split": "test",
2323
+ "fewshot_split": "dev",
2324
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2325
+ "doc_to_target": "answer",
2326
+ "doc_to_choice": [
2327
+ "A",
2328
+ "B",
2329
+ "C",
2330
+ "D"
2331
+ ],
2332
+ "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
2333
+ "target_delimiter": " ",
2334
+ "fewshot_delimiter": "\n\n",
2335
+ "fewshot_config": {
2336
+ "sampler": "first_n"
2337
+ },
2338
+ "num_fewshot": 5,
2339
+ "metric_list": [
2340
+ {
2341
+ "metric": "acc",
2342
+ "aggregation": "mean",
2343
+ "higher_is_better": true
2344
+ }
2345
+ ],
2346
+ "output_type": "multiple_choice",
2347
+ "repeats": 1,
2348
+ "should_decontaminate": false,
2349
+ "metadata": {
2350
+ "version": 0.0
2351
+ }
2352
+ },
2353
+ "mmlu_public_relations": {
2354
+ "task": "mmlu_public_relations",
2355
+ "task_alias": "public_relations",
2356
+ "group": "mmlu_social_sciences",
2357
+ "group_alias": "social_sciences",
2358
+ "dataset_path": "hails/mmlu_no_train",
2359
+ "dataset_name": "public_relations",
2360
+ "test_split": "test",
2361
+ "fewshot_split": "dev",
2362
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2363
+ "doc_to_target": "answer",
2364
+ "doc_to_choice": [
2365
+ "A",
2366
+ "B",
2367
+ "C",
2368
+ "D"
2369
+ ],
2370
+ "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
2371
+ "target_delimiter": " ",
2372
+ "fewshot_delimiter": "\n\n",
2373
+ "fewshot_config": {
2374
+ "sampler": "first_n"
2375
+ },
2376
+ "num_fewshot": 5,
2377
+ "metric_list": [
2378
+ {
2379
+ "metric": "acc",
2380
+ "aggregation": "mean",
2381
+ "higher_is_better": true
2382
+ }
2383
+ ],
2384
+ "output_type": "multiple_choice",
2385
+ "repeats": 1,
2386
+ "should_decontaminate": false,
2387
+ "metadata": {
2388
+ "version": 0.0
2389
+ }
2390
+ },
2391
+ "mmlu_security_studies": {
2392
+ "task": "mmlu_security_studies",
2393
+ "task_alias": "security_studies",
2394
+ "group": "mmlu_social_sciences",
2395
+ "group_alias": "social_sciences",
2396
+ "dataset_path": "hails/mmlu_no_train",
2397
+ "dataset_name": "security_studies",
2398
+ "test_split": "test",
2399
+ "fewshot_split": "dev",
2400
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2401
+ "doc_to_target": "answer",
2402
+ "doc_to_choice": [
2403
+ "A",
2404
+ "B",
2405
+ "C",
2406
+ "D"
2407
+ ],
2408
+ "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
2409
+ "target_delimiter": " ",
2410
+ "fewshot_delimiter": "\n\n",
2411
+ "fewshot_config": {
2412
+ "sampler": "first_n"
2413
+ },
2414
+ "num_fewshot": 5,
2415
+ "metric_list": [
2416
+ {
2417
+ "metric": "acc",
2418
+ "aggregation": "mean",
2419
+ "higher_is_better": true
2420
+ }
2421
+ ],
2422
+ "output_type": "multiple_choice",
2423
+ "repeats": 1,
2424
+ "should_decontaminate": false,
2425
+ "metadata": {
2426
+ "version": 0.0
2427
+ }
2428
+ },
2429
+ "mmlu_sociology": {
2430
+ "task": "mmlu_sociology",
2431
+ "task_alias": "sociology",
2432
+ "group": "mmlu_social_sciences",
2433
+ "group_alias": "social_sciences",
2434
+ "dataset_path": "hails/mmlu_no_train",
2435
+ "dataset_name": "sociology",
2436
+ "test_split": "test",
2437
+ "fewshot_split": "dev",
2438
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2439
+ "doc_to_target": "answer",
2440
+ "doc_to_choice": [
2441
+ "A",
2442
+ "B",
2443
+ "C",
2444
+ "D"
2445
+ ],
2446
+ "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
2447
+ "target_delimiter": " ",
2448
+ "fewshot_delimiter": "\n\n",
2449
+ "fewshot_config": {
2450
+ "sampler": "first_n"
2451
+ },
2452
+ "num_fewshot": 5,
2453
+ "metric_list": [
2454
+ {
2455
+ "metric": "acc",
2456
+ "aggregation": "mean",
2457
+ "higher_is_better": true
2458
+ }
2459
+ ],
2460
+ "output_type": "multiple_choice",
2461
+ "repeats": 1,
2462
+ "should_decontaminate": false,
2463
+ "metadata": {
2464
+ "version": 0.0
2465
+ }
2466
+ },
2467
+ "mmlu_us_foreign_policy": {
2468
+ "task": "mmlu_us_foreign_policy",
2469
+ "task_alias": "us_foreign_policy",
2470
+ "group": "mmlu_social_sciences",
2471
+ "group_alias": "social_sciences",
2472
+ "dataset_path": "hails/mmlu_no_train",
2473
+ "dataset_name": "us_foreign_policy",
2474
+ "test_split": "test",
2475
+ "fewshot_split": "dev",
2476
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2477
+ "doc_to_target": "answer",
2478
+ "doc_to_choice": [
2479
+ "A",
2480
+ "B",
2481
+ "C",
2482
+ "D"
2483
+ ],
2484
+ "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
2485
+ "target_delimiter": " ",
2486
+ "fewshot_delimiter": "\n\n",
2487
+ "fewshot_config": {
2488
+ "sampler": "first_n"
2489
+ },
2490
+ "num_fewshot": 5,
2491
+ "metric_list": [
2492
+ {
2493
+ "metric": "acc",
2494
+ "aggregation": "mean",
2495
+ "higher_is_better": true
2496
+ }
2497
+ ],
2498
+ "output_type": "multiple_choice",
2499
+ "repeats": 1,
2500
+ "should_decontaminate": false,
2501
+ "metadata": {
2502
+ "version": 0.0
2503
+ }
2504
+ },
2505
+ "mmlu_virology": {
2506
+ "task": "mmlu_virology",
2507
+ "task_alias": "virology",
2508
+ "group": "mmlu_other",
2509
+ "group_alias": "other",
2510
+ "dataset_path": "hails/mmlu_no_train",
2511
+ "dataset_name": "virology",
2512
+ "test_split": "test",
2513
+ "fewshot_split": "dev",
2514
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2515
+ "doc_to_target": "answer",
2516
+ "doc_to_choice": [
2517
+ "A",
2518
+ "B",
2519
+ "C",
2520
+ "D"
2521
+ ],
2522
+ "description": "The following are multiple choice questions (with answers) about virology.\n\n",
2523
+ "target_delimiter": " ",
2524
+ "fewshot_delimiter": "\n\n",
2525
+ "fewshot_config": {
2526
+ "sampler": "first_n"
2527
+ },
2528
+ "num_fewshot": 5,
2529
+ "metric_list": [
2530
+ {
2531
+ "metric": "acc",
2532
+ "aggregation": "mean",
2533
+ "higher_is_better": true
2534
+ }
2535
+ ],
2536
+ "output_type": "multiple_choice",
2537
+ "repeats": 1,
2538
+ "should_decontaminate": false,
2539
+ "metadata": {
2540
+ "version": 0.0
2541
+ }
2542
+ },
2543
+ "mmlu_world_religions": {
2544
+ "task": "mmlu_world_religions",
2545
+ "task_alias": "world_religions",
2546
+ "group": "mmlu_humanities",
2547
+ "group_alias": "humanities",
2548
+ "dataset_path": "hails/mmlu_no_train",
2549
+ "dataset_name": "world_religions",
2550
+ "test_split": "test",
2551
+ "fewshot_split": "dev",
2552
+ "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
2553
+ "doc_to_target": "answer",
2554
+ "doc_to_choice": [
2555
+ "A",
2556
+ "B",
2557
+ "C",
2558
+ "D"
2559
+ ],
2560
+ "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
2561
+ "target_delimiter": " ",
2562
+ "fewshot_delimiter": "\n\n",
2563
+ "fewshot_config": {
2564
+ "sampler": "first_n"
2565
+ },
2566
+ "num_fewshot": 5,
2567
+ "metric_list": [
2568
+ {
2569
+ "metric": "acc",
2570
+ "aggregation": "mean",
2571
+ "higher_is_better": true
2572
+ }
2573
+ ],
2574
+ "output_type": "multiple_choice",
2575
+ "repeats": 1,
2576
+ "should_decontaminate": false,
2577
+ "metadata": {
2578
+ "version": 0.0
2579
+ }
2580
+ }
2581
+ },
2582
+ "versions": {
2583
+ "mmlu_abstract_algebra": 0.0,
2584
+ "mmlu_anatomy": 0.0,
2585
+ "mmlu_astronomy": 0.0,
2586
+ "mmlu_business_ethics": 0.0,
2587
+ "mmlu_clinical_knowledge": 0.0,
2588
+ "mmlu_college_biology": 0.0,
2589
+ "mmlu_college_chemistry": 0.0,
2590
+ "mmlu_college_computer_science": 0.0,
2591
+ "mmlu_college_mathematics": 0.0,
2592
+ "mmlu_college_medicine": 0.0,
2593
+ "mmlu_college_physics": 0.0,
2594
+ "mmlu_computer_security": 0.0,
2595
+ "mmlu_conceptual_physics": 0.0,
2596
+ "mmlu_econometrics": 0.0,
2597
+ "mmlu_electrical_engineering": 0.0,
2598
+ "mmlu_elementary_mathematics": 0.0,
2599
+ "mmlu_formal_logic": 0.0,
2600
+ "mmlu_global_facts": 0.0,
2601
+ "mmlu_high_school_biology": 0.0,
2602
+ "mmlu_high_school_chemistry": 0.0,
2603
+ "mmlu_high_school_computer_science": 0.0,
2604
+ "mmlu_high_school_european_history": 0.0,
2605
+ "mmlu_high_school_geography": 0.0,
2606
+ "mmlu_high_school_government_and_politics": 0.0,
2607
+ "mmlu_high_school_macroeconomics": 0.0,
2608
+ "mmlu_high_school_mathematics": 0.0,
2609
+ "mmlu_high_school_microeconomics": 0.0,
2610
+ "mmlu_high_school_physics": 0.0,
2611
+ "mmlu_high_school_psychology": 0.0,
2612
+ "mmlu_high_school_statistics": 0.0,
2613
+ "mmlu_high_school_us_history": 0.0,
2614
+ "mmlu_high_school_world_history": 0.0,
2615
+ "mmlu_human_aging": 0.0,
2616
+ "mmlu_human_sexuality": 0.0,
2617
+ "mmlu_international_law": 0.0,
2618
+ "mmlu_jurisprudence": 0.0,
2619
+ "mmlu_logical_fallacies": 0.0,
2620
+ "mmlu_machine_learning": 0.0,
2621
+ "mmlu_management": 0.0,
2622
+ "mmlu_marketing": 0.0,
2623
+ "mmlu_medical_genetics": 0.0,
2624
+ "mmlu_miscellaneous": 0.0,
2625
+ "mmlu_moral_disputes": 0.0,
2626
+ "mmlu_moral_scenarios": 0.0,
2627
+ "mmlu_nutrition": 0.0,
2628
+ "mmlu_philosophy": 0.0,
2629
+ "mmlu_prehistory": 0.0,
2630
+ "mmlu_professional_accounting": 0.0,
2631
+ "mmlu_professional_law": 0.0,
2632
+ "mmlu_professional_medicine": 0.0,
2633
+ "mmlu_professional_psychology": 0.0,
2634
+ "mmlu_public_relations": 0.0,
2635
+ "mmlu_security_studies": 0.0,
2636
+ "mmlu_sociology": 0.0,
2637
+ "mmlu_us_foreign_policy": 0.0,
2638
+ "mmlu_virology": 0.0,
2639
+ "mmlu_world_religions": 0.0
2640
+ },
2641
+ "n-shot": {
2642
+ "mmlu": 0,
2643
+ "mmlu_abstract_algebra": 5,
2644
+ "mmlu_anatomy": 5,
2645
+ "mmlu_astronomy": 5,
2646
+ "mmlu_business_ethics": 5,
2647
+ "mmlu_clinical_knowledge": 5,
2648
+ "mmlu_college_biology": 5,
2649
+ "mmlu_college_chemistry": 5,
2650
+ "mmlu_college_computer_science": 5,
2651
+ "mmlu_college_mathematics": 5,
2652
+ "mmlu_college_medicine": 5,
2653
+ "mmlu_college_physics": 5,
2654
+ "mmlu_computer_security": 5,
2655
+ "mmlu_conceptual_physics": 5,
2656
+ "mmlu_econometrics": 5,
2657
+ "mmlu_electrical_engineering": 5,
2658
+ "mmlu_elementary_mathematics": 5,
2659
+ "mmlu_formal_logic": 5,
2660
+ "mmlu_global_facts": 5,
2661
+ "mmlu_high_school_biology": 5,
2662
+ "mmlu_high_school_chemistry": 5,
2663
+ "mmlu_high_school_computer_science": 5,
2664
+ "mmlu_high_school_european_history": 5,
2665
+ "mmlu_high_school_geography": 5,
2666
+ "mmlu_high_school_government_and_politics": 5,
2667
+ "mmlu_high_school_macroeconomics": 5,
2668
+ "mmlu_high_school_mathematics": 5,
2669
+ "mmlu_high_school_microeconomics": 5,
2670
+ "mmlu_high_school_physics": 5,
2671
+ "mmlu_high_school_psychology": 5,
2672
+ "mmlu_high_school_statistics": 5,
2673
+ "mmlu_high_school_us_history": 5,
2674
+ "mmlu_high_school_world_history": 5,
2675
+ "mmlu_human_aging": 5,
2676
+ "mmlu_human_sexuality": 5,
2677
+ "mmlu_humanities": 5,
2678
+ "mmlu_international_law": 5,
2679
+ "mmlu_jurisprudence": 5,
2680
+ "mmlu_logical_fallacies": 5,
2681
+ "mmlu_machine_learning": 5,
2682
+ "mmlu_management": 5,
2683
+ "mmlu_marketing": 5,
2684
+ "mmlu_medical_genetics": 5,
2685
+ "mmlu_miscellaneous": 5,
2686
+ "mmlu_moral_disputes": 5,
2687
+ "mmlu_moral_scenarios": 5,
2688
+ "mmlu_nutrition": 5,
2689
+ "mmlu_other": 5,
2690
+ "mmlu_philosophy": 5,
2691
+ "mmlu_prehistory": 5,
2692
+ "mmlu_professional_accounting": 5,
2693
+ "mmlu_professional_law": 5,
2694
+ "mmlu_professional_medicine": 5,
2695
+ "mmlu_professional_psychology": 5,
2696
+ "mmlu_public_relations": 5,
2697
+ "mmlu_security_studies": 5,
2698
+ "mmlu_social_sciences": 5,
2699
+ "mmlu_sociology": 5,
2700
+ "mmlu_stem": 5,
2701
+ "mmlu_us_foreign_policy": 5,
2702
+ "mmlu_virology": 5,
2703
+ "mmlu_world_religions": 5
2704
+ },
2705
+ "higher_is_better": {
2706
+ "mmlu": {
2707
+ "acc": true
2708
+ },
2709
+ "mmlu_abstract_algebra": {
2710
+ "acc": true
2711
+ },
2712
+ "mmlu_anatomy": {
2713
+ "acc": true
2714
+ },
2715
+ "mmlu_astronomy": {
2716
+ "acc": true
2717
+ },
2718
+ "mmlu_business_ethics": {
2719
+ "acc": true
2720
+ },
2721
+ "mmlu_clinical_knowledge": {
2722
+ "acc": true
2723
+ },
2724
+ "mmlu_college_biology": {
2725
+ "acc": true
2726
+ },
2727
+ "mmlu_college_chemistry": {
2728
+ "acc": true
2729
+ },
2730
+ "mmlu_college_computer_science": {
2731
+ "acc": true
2732
+ },
2733
+ "mmlu_college_mathematics": {
2734
+ "acc": true
2735
+ },
2736
+ "mmlu_college_medicine": {
2737
+ "acc": true
2738
+ },
2739
+ "mmlu_college_physics": {
2740
+ "acc": true
2741
+ },
2742
+ "mmlu_computer_security": {
2743
+ "acc": true
2744
+ },
2745
+ "mmlu_conceptual_physics": {
2746
+ "acc": true
2747
+ },
2748
+ "mmlu_econometrics": {
2749
+ "acc": true
2750
+ },
2751
+ "mmlu_electrical_engineering": {
2752
+ "acc": true
2753
+ },
2754
+ "mmlu_elementary_mathematics": {
2755
+ "acc": true
2756
+ },
2757
+ "mmlu_formal_logic": {
2758
+ "acc": true
2759
+ },
2760
+ "mmlu_global_facts": {
2761
+ "acc": true
2762
+ },
2763
+ "mmlu_high_school_biology": {
2764
+ "acc": true
2765
+ },
2766
+ "mmlu_high_school_chemistry": {
2767
+ "acc": true
2768
+ },
2769
+ "mmlu_high_school_computer_science": {
2770
+ "acc": true
2771
+ },
2772
+ "mmlu_high_school_european_history": {
2773
+ "acc": true
2774
+ },
2775
+ "mmlu_high_school_geography": {
2776
+ "acc": true
2777
+ },
2778
+ "mmlu_high_school_government_and_politics": {
2779
+ "acc": true
2780
+ },
2781
+ "mmlu_high_school_macroeconomics": {
2782
+ "acc": true
2783
+ },
2784
+ "mmlu_high_school_mathematics": {
2785
+ "acc": true
2786
+ },
2787
+ "mmlu_high_school_microeconomics": {
2788
+ "acc": true
2789
+ },
2790
+ "mmlu_high_school_physics": {
2791
+ "acc": true
2792
+ },
2793
+ "mmlu_high_school_psychology": {
2794
+ "acc": true
2795
+ },
2796
+ "mmlu_high_school_statistics": {
2797
+ "acc": true
2798
+ },
2799
+ "mmlu_high_school_us_history": {
2800
+ "acc": true
2801
+ },
2802
+ "mmlu_high_school_world_history": {
2803
+ "acc": true
2804
+ },
2805
+ "mmlu_human_aging": {
2806
+ "acc": true
2807
+ },
2808
+ "mmlu_human_sexuality": {
2809
+ "acc": true
2810
+ },
2811
+ "mmlu_humanities": {
2812
+ "acc": true
2813
+ },
2814
+ "mmlu_international_law": {
2815
+ "acc": true
2816
+ },
2817
+ "mmlu_jurisprudence": {
2818
+ "acc": true
2819
+ },
2820
+ "mmlu_logical_fallacies": {
2821
+ "acc": true
2822
+ },
2823
+ "mmlu_machine_learning": {
2824
+ "acc": true
2825
+ },
2826
+ "mmlu_management": {
2827
+ "acc": true
2828
+ },
2829
+ "mmlu_marketing": {
2830
+ "acc": true
2831
+ },
2832
+ "mmlu_medical_genetics": {
2833
+ "acc": true
2834
+ },
2835
+ "mmlu_miscellaneous": {
2836
+ "acc": true
2837
+ },
2838
+ "mmlu_moral_disputes": {
2839
+ "acc": true
2840
+ },
2841
+ "mmlu_moral_scenarios": {
2842
+ "acc": true
2843
+ },
2844
+ "mmlu_nutrition": {
2845
+ "acc": true
2846
+ },
2847
+ "mmlu_other": {
2848
+ "acc": true
2849
+ },
2850
+ "mmlu_philosophy": {
2851
+ "acc": true
2852
+ },
2853
+ "mmlu_prehistory": {
2854
+ "acc": true
2855
+ },
2856
+ "mmlu_professional_accounting": {
2857
+ "acc": true
2858
+ },
2859
+ "mmlu_professional_law": {
2860
+ "acc": true
2861
+ },
2862
+ "mmlu_professional_medicine": {
2863
+ "acc": true
2864
+ },
2865
+ "mmlu_professional_psychology": {
2866
+ "acc": true
2867
+ },
2868
+ "mmlu_public_relations": {
2869
+ "acc": true
2870
+ },
2871
+ "mmlu_security_studies": {
2872
+ "acc": true
2873
+ },
2874
+ "mmlu_social_sciences": {
2875
+ "acc": true
2876
+ },
2877
+ "mmlu_sociology": {
2878
+ "acc": true
2879
+ },
2880
+ "mmlu_stem": {
2881
+ "acc": true
2882
+ },
2883
+ "mmlu_us_foreign_policy": {
2884
+ "acc": true
2885
+ },
2886
+ "mmlu_virology": {
2887
+ "acc": true
2888
+ },
2889
+ "mmlu_world_religions": {
2890
+ "acc": true
2891
+ }
2892
+ },
2893
+ "n-samples": {
2894
+ "mmlu_philosophy": {
2895
+ "original": 311,
2896
+ "effective": 311
2897
+ },
2898
+ "mmlu_logical_fallacies": {
2899
+ "original": 163,
2900
+ "effective": 163
2901
+ },
2902
+ "mmlu_moral_disputes": {
2903
+ "original": 346,
2904
+ "effective": 346
2905
+ },
2906
+ "mmlu_jurisprudence": {
2907
+ "original": 108,
2908
+ "effective": 108
2909
+ },
2910
+ "mmlu_high_school_us_history": {
2911
+ "original": 204,
2912
+ "effective": 204
2913
+ },
2914
+ "mmlu_high_school_world_history": {
2915
+ "original": 237,
2916
+ "effective": 237
2917
+ },
2918
+ "mmlu_world_religions": {
2919
+ "original": 171,
2920
+ "effective": 171
2921
+ },
2922
+ "mmlu_moral_scenarios": {
2923
+ "original": 895,
2924
+ "effective": 895
2925
+ },
2926
+ "mmlu_prehistory": {
2927
+ "original": 324,
2928
+ "effective": 324
2929
+ },
2930
+ "mmlu_formal_logic": {
2931
+ "original": 126,
2932
+ "effective": 126
2933
+ },
2934
+ "mmlu_international_law": {
2935
+ "original": 121,
2936
+ "effective": 121
2937
+ },
2938
+ "mmlu_professional_law": {
2939
+ "original": 1534,
2940
+ "effective": 1534
2941
+ },
2942
+ "mmlu_high_school_european_history": {
2943
+ "original": 165,
2944
+ "effective": 165
2945
+ },
2946
+ "mmlu_public_relations": {
2947
+ "original": 110,
2948
+ "effective": 110
2949
+ },
2950
+ "mmlu_high_school_macroeconomics": {
2951
+ "original": 390,
2952
+ "effective": 390
2953
+ },
2954
+ "mmlu_human_sexuality": {
2955
+ "original": 131,
2956
+ "effective": 131
2957
+ },
2958
+ "mmlu_high_school_geography": {
2959
+ "original": 198,
2960
+ "effective": 198
2961
+ },
2962
+ "mmlu_high_school_psychology": {
2963
+ "original": 545,
2964
+ "effective": 545
2965
+ },
2966
+ "mmlu_high_school_microeconomics": {
2967
+ "original": 238,
2968
+ "effective": 238
2969
+ },
2970
+ "mmlu_high_school_government_and_politics": {
2971
+ "original": 193,
2972
+ "effective": 193
2973
+ },
2974
+ "mmlu_us_foreign_policy": {
2975
+ "original": 100,
2976
+ "effective": 100
2977
+ },
2978
+ "mmlu_sociology": {
2979
+ "original": 201,
2980
+ "effective": 201
2981
+ },
2982
+ "mmlu_security_studies": {
2983
+ "original": 245,
2984
+ "effective": 245
2985
+ },
2986
+ "mmlu_econometrics": {
2987
+ "original": 114,
2988
+ "effective": 114
2989
+ },
2990
+ "mmlu_professional_psychology": {
2991
+ "original": 612,
2992
+ "effective": 612
2993
+ },
2994
+ "mmlu_business_ethics": {
2995
+ "original": 100,
2996
+ "effective": 100
2997
+ },
2998
+ "mmlu_marketing": {
2999
+ "original": 234,
3000
+ "effective": 234
3001
+ },
3002
+ "mmlu_medical_genetics": {
3003
+ "original": 100,
3004
+ "effective": 100
3005
+ },
3006
+ "mmlu_clinical_knowledge": {
3007
+ "original": 265,
3008
+ "effective": 265
3009
+ },
3010
+ "mmlu_global_facts": {
3011
+ "original": 100,
3012
+ "effective": 100
3013
+ },
3014
+ "mmlu_human_aging": {
3015
+ "original": 223,
3016
+ "effective": 223
3017
+ },
3018
+ "mmlu_professional_medicine": {
3019
+ "original": 272,
3020
+ "effective": 272
3021
+ },
3022
+ "mmlu_nutrition": {
3023
+ "original": 306,
3024
+ "effective": 306
3025
+ },
3026
+ "mmlu_management": {
3027
+ "original": 103,
3028
+ "effective": 103
3029
+ },
3030
+ "mmlu_college_medicine": {
3031
+ "original": 173,
3032
+ "effective": 173
3033
+ },
3034
+ "mmlu_professional_accounting": {
3035
+ "original": 282,
3036
+ "effective": 282
3037
+ },
3038
+ "mmlu_virology": {
3039
+ "original": 166,
3040
+ "effective": 166
3041
+ },
3042
+ "mmlu_miscellaneous": {
3043
+ "original": 783,
3044
+ "effective": 783
3045
+ },
3046
+ "mmlu_abstract_algebra": {
3047
+ "original": 100,
3048
+ "effective": 100
3049
+ },
3050
+ "mmlu_college_biology": {
3051
+ "original": 144,
3052
+ "effective": 144
3053
+ },
3054
+ "mmlu_high_school_biology": {
3055
+ "original": 310,
3056
+ "effective": 310
3057
+ },
3058
+ "mmlu_conceptual_physics": {
3059
+ "original": 235,
3060
+ "effective": 235
3061
+ },
3062
+ "mmlu_computer_security": {
3063
+ "original": 100,
3064
+ "effective": 100
3065
+ },
3066
+ "mmlu_college_physics": {
3067
+ "original": 102,
3068
+ "effective": 102
3069
+ },
3070
+ "mmlu_college_chemistry": {
3071
+ "original": 100,
3072
+ "effective": 100
3073
+ },
3074
+ "mmlu_high_school_statistics": {
3075
+ "original": 216,
3076
+ "effective": 216
3077
+ },
3078
+ "mmlu_anatomy": {
3079
+ "original": 135,
3080
+ "effective": 135
3081
+ },
3082
+ "mmlu_high_school_mathematics": {
3083
+ "original": 270,
3084
+ "effective": 270
3085
+ },
3086
+ "mmlu_machine_learning": {
3087
+ "original": 112,
3088
+ "effective": 112
3089
+ },
3090
+ "mmlu_high_school_physics": {
3091
+ "original": 151,
3092
+ "effective": 151
3093
+ },
3094
+ "mmlu_electrical_engineering": {
3095
+ "original": 145,
3096
+ "effective": 145
3097
+ },
3098
+ "mmlu_college_computer_science": {
3099
+ "original": 100,
3100
+ "effective": 100
3101
+ },
3102
+ "mmlu_high_school_chemistry": {
3103
+ "original": 203,
3104
+ "effective": 203
3105
+ },
3106
+ "mmlu_astronomy": {
3107
+ "original": 152,
3108
+ "effective": 152
3109
+ },
3110
+ "mmlu_high_school_computer_science": {
3111
+ "original": 100,
3112
+ "effective": 100
3113
+ },
3114
+ "mmlu_elementary_mathematics": {
3115
+ "original": 378,
3116
+ "effective": 378
3117
+ },
3118
+ "mmlu_college_mathematics": {
3119
+ "original": 100,
3120
+ "effective": 100
3121
+ }
3122
+ },
3123
+ "config": {
3124
+ "model": "vllm",
3125
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
3126
+ "batch_size": "auto",
3127
+ "batch_sizes": [],
3128
+ "device": "cuda",
3129
+ "use_cache": null,
3130
+ "limit": null,
3131
+ "bootstrap_iters": 100000,
3132
+ "gen_kwargs": null,
3133
+ "random_seed": 0,
3134
+ "numpy_seed": 1234,
3135
+ "torch_seed": 1234,
3136
+ "fewshot_seed": 1234
3137
+ },
3138
+ "git_hash": null,
3139
+ "date": 1719624343.9120553,
3140
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
3141
+ "transformers_version": "4.41.2",
3142
+ "upper_git_hash": null,
3143
+ "task_hashes": {},
3144
+ "model_source": "vllm",
3145
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
3146
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
3147
+ "system_instruction": null,
3148
+ "system_instruction_sha": null,
3149
+ "fewshot_as_multiturn": false,
3150
+ "chat_template": null,
3151
+ "chat_template_sha": null,
3152
+ "start_time": 7638031.049041288,
3153
+ "end_time": 7641735.991916678,
3154
+ "total_evaluation_time_seconds": "3704.942875389941"
3155
+ }
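The "config" block above records how this MMLU run was launched: the vLLM backend with tensor_parallel_size=2, gpu_memory_utilization=0.4, max_model_len=4096, and automatic batch sizing, against the locally stored GPTQ checkpoint. A minimal sketch of an equivalent invocation through lm-evaluation-harness's Python API follows; the entry point and argument names assume the v0.4.x harness and may differ in other versions, so treat it as illustrative rather than the exact command that produced these files.

import lm_eval  # lm-evaluation-harness; v0.4.x-style API assumed

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/"
        "Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,"
        "tensor_parallel_size=2,dtype=auto,add_bos_token=True,"
        "gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096"
    ),
    tasks=["mmlu"],        # each MMLU subject config above already pins 5-shot prompting
    batch_size="auto",
)
print(results["results"])  # per-subject and aggregate accuracies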
vllm-clearml/results_2024-06-29T02-29-25.547540.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "exact_match,strict-match": 0.5526914329037149,
5
+ "exact_match_stderr,strict-match": 0.013695795709089896,
6
+ "exact_match,flexible-extract": 0.5534495830174374,
7
+ "exact_match_stderr,flexible-extract": 0.013693566549743146,
8
+ "alias": "gsm8k"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k": []
13
+ },
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "group": [
18
+ "math_word_problems"
19
+ ],
20
+ "dataset_path": "gsm8k",
21
+ "dataset_name": "main",
22
+ "training_split": "train",
23
+ "test_split": "test",
24
+ "fewshot_split": "train",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{answer}}",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 5,
31
+ "metric_list": [
32
+ {
33
+ "metric": "exact_match",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true,
36
+ "ignore_case": true,
37
+ "ignore_punctuation": false,
38
+ "regexes_to_ignore": [
39
+ ",",
40
+ "\\$",
41
+ "(?s).*#### ",
42
+ "\\.$"
43
+ ]
44
+ }
45
+ ],
46
+ "output_type": "generate_until",
47
+ "generation_kwargs": {
48
+ "until": [
49
+ "Question:",
50
+ "</s>",
51
+ "<|im_end|>"
52
+ ],
53
+ "do_sample": false,
54
+ "temperature": 0.0
55
+ },
56
+ "repeats": 1,
57
+ "filter_list": [
58
+ {
59
+ "name": "strict-match",
60
+ "filter": [
61
+ {
62
+ "function": "regex",
63
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
64
+ },
65
+ {
66
+ "function": "take_first"
67
+ }
68
+ ]
69
+ },
70
+ {
71
+ "name": "flexible-extract",
72
+ "filter": [
73
+ {
74
+ "function": "regex",
75
+ "group_select": -1,
76
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
77
+ },
78
+ {
79
+ "function": "take_first"
80
+ }
81
+ ]
82
+ }
83
+ ],
84
+ "should_decontaminate": false,
85
+ "metadata": {
86
+ "version": 3.0
87
+ }
88
+ }
89
+ },
90
+ "versions": {
91
+ "gsm8k": 3.0
92
+ },
93
+ "n-shot": {
94
+ "gsm8k": 5
95
+ },
96
+ "higher_is_better": {
97
+ "gsm8k": {
98
+ "exact_match": true
99
+ }
100
+ },
101
+ "n-samples": {
102
+ "gsm8k": {
103
+ "original": 1319,
104
+ "effective": 1319
105
+ }
106
+ },
107
+ "config": {
108
+ "model": "vllm",
109
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
110
+ "batch_size": "auto",
111
+ "batch_sizes": [],
112
+ "device": "cuda",
113
+ "use_cache": null,
114
+ "limit": null,
115
+ "bootstrap_iters": 100000,
116
+ "gen_kwargs": null,
117
+ "random_seed": 0,
118
+ "numpy_seed": 1234,
119
+ "torch_seed": 1234,
120
+ "fewshot_seed": 1234
121
+ },
122
+ "git_hash": null,
123
+ "date": 1719628061.99107,
124
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
125
+ "transformers_version": "4.41.2",
126
+ "upper_git_hash": null,
127
+ "task_hashes": {},
128
+ "model_source": "vllm",
129
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
130
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
131
+ "system_instruction": null,
132
+ "system_instruction_sha": null,
133
+ "fewshot_as_multiturn": false,
134
+ "chat_template": null,
135
+ "chat_template_sha": null,
136
+ "start_time": 7641749.132861241,
137
+ "end_time": 7641857.900817806,
138
+ "total_evaluation_time_seconds": "108.76795656513423"
139
+ }
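The gsm8k entry above defines two answer-extraction pipelines: "strict-match" pulls the number that follows the "#### " marker, while "flexible-extract" (with group_select -1) falls back to, roughly, the last number-like span anywhere in the completion. A small Python illustration using the two regexes quoted in the config follows; it approximates the harness's RegexFilter/take_first chain rather than reproducing it exactly.

import re

STRICT = re.compile(r"#### (\-?[0-9\.\,]+)")
FLEXIBLE = re.compile(r"(-?[$0-9.,]{2,})|(-?[0-9]+)")

def extract_strict(completion: str) -> str:
    # strict-match: the number following "#### ", else treat as unanswered
    m = STRICT.search(completion)
    return m.group(1) if m else "[invalid]"

def extract_flexible(completion: str) -> str:
    # flexible-extract: roughly, the last number-like span in the completion
    spans = [g for groups in FLEXIBLE.findall(completion) for g in groups if g]
    return spans[-1] if spans else "[invalid]"

sample = "She buys 3 notebooks at $2 each, so she pays 6 dollars.\n#### 6"
print(extract_strict(sample), extract_flexible(sample))  # 6 6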
vllm-clearml/results_2024-07-01T03-38-50.160777.json ADDED
@@ -0,0 +1,103 @@
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.3967576791808874,
5
+ "acc_stderr,none": 0.014296513020180639,
6
+ "acc_norm,none": 0.4180887372013652,
7
+ "acc_norm_stderr,none": 0.014413988396996074,
8
+ "alias": "arc_challenge"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_challenge": []
13
+ },
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "group": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Challenge",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "doc_to_choice": "{{choices.text}}",
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 25,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48
+ "metadata": {
49
+ "version": 1.0
50
+ }
51
+ }
52
+ },
53
+ "versions": {
54
+ "arc_challenge": 1.0
55
+ },
56
+ "n-shot": {
57
+ "arc_challenge": 25
58
+ },
59
+ "higher_is_better": {
60
+ "arc_challenge": {
61
+ "acc": true,
62
+ "acc_norm": true
63
+ }
64
+ },
65
+ "n-samples": {
66
+ "arc_challenge": {
67
+ "original": 1172,
68
+ "effective": 1172
69
+ }
70
+ },
71
+ "config": {
72
+ "model": "vllm",
73
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
74
+ "batch_size": "auto",
75
+ "batch_sizes": [],
76
+ "device": "cuda",
77
+ "use_cache": null,
78
+ "limit": null,
79
+ "bootstrap_iters": 100000,
80
+ "gen_kwargs": null,
81
+ "random_seed": 0,
82
+ "numpy_seed": 1234,
83
+ "torch_seed": 1234,
84
+ "fewshot_seed": 1234
85
+ },
86
+ "git_hash": null,
87
+ "date": 1719804732.5263278,
88
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
89
+ "transformers_version": "4.41.2",
90
+ "upper_git_hash": null,
91
+ "task_hashes": {},
92
+ "model_source": "vllm",
93
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
94
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
95
+ "system_instruction": null,
96
+ "system_instruction_sha": null,
97
+ "fewshot_as_multiturn": false,
98
+ "chat_template": null,
99
+ "chat_template_sha": null,
100
+ "start_time": 7818419.641077244,
101
+ "end_time": 7818822.513901013,
102
+ "total_evaluation_time_seconds": "402.87282376922667"
103
+ }
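Both acc and acc_norm are reported for ARC-Challenge above. In the harness's multiple_choice setup the model scores each answer option by log-likelihood; acc takes the argmax of the raw log-likelihoods, while acc_norm first divides each log-likelihood by the option's length (byte length, by the harness's usual convention) so longer answers are not penalized simply for being longer. A short sketch with hypothetical per-choice scores:

import numpy as np

def score_multiple_choice(logliks, choice_texts, gold_idx):
    # acc: argmax over raw log-likelihoods
    acc = int(int(np.argmax(logliks)) == gold_idx)
    # acc_norm: normalize each log-likelihood by the option's byte length first
    normed = [ll / len(t.encode("utf-8")) for ll, t in zip(logliks, choice_texts)]
    acc_norm = int(int(np.argmax(normed)) == gold_idx)
    return acc, acc_norm

# Hypothetical item: the correct, longer option loses on raw score but wins after normalization.
print(score_multiple_choice(
    logliks=[-14.2, -13.9, -20.1, -16.0],
    choice_texts=["the Sun", "gravity", "a chemical reaction in the mantle", "wind"],
    gold_idx=2,
))  # (0, 1)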
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
winogrande-vllm/__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K/results_2024-06-29T00-41-08.891349.json ADDED
@@ -0,0 +1,91 @@
1
+ {
2
+ "results": {
3
+ "winogrande": {
4
+ "acc,none": 0.6369376479873717,
5
+ "acc_stderr,none": 0.013515191866479218,
6
+ "alias": "winogrande"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "winogrande": []
11
+ },
12
+ "configs": {
13
+ "winogrande": {
14
+ "task": "winogrande",
15
+ "dataset_path": "winogrande",
16
+ "dataset_name": "winogrande_xl",
17
+ "training_split": "train",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n",
20
+ "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n",
21
+ "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "acc",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ }
32
+ ],
33
+ "output_type": "multiple_choice",
34
+ "repeats": 1,
35
+ "should_decontaminate": true,
36
+ "doc_to_decontamination_query": "sentence",
37
+ "metadata": {
38
+ "version": 1.0
39
+ }
40
+ }
41
+ },
42
+ "versions": {
43
+ "winogrande": 1.0
44
+ },
45
+ "n-shot": {
46
+ "winogrande": 5
47
+ },
48
+ "higher_is_better": {
49
+ "winogrande": {
50
+ "acc": true
51
+ }
52
+ },
53
+ "n-samples": {
54
+ "winogrande": {
55
+ "original": 1267,
56
+ "effective": 1267
57
+ }
58
+ },
59
+ "config": {
60
+ "model": "vllm",
61
+ "model_args": "pretrained=/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K,tensor_parallel_size=2,dtype=auto,add_bos_token=True,gpu_memory_utilization=0.4,data_parallel_size=1,max_model_len=4096",
62
+ "batch_size": "auto",
63
+ "batch_sizes": [],
64
+ "device": "cuda",
65
+ "use_cache": null,
66
+ "limit": null,
67
+ "bootstrap_iters": 100000,
68
+ "gen_kwargs": null,
69
+ "random_seed": 0,
70
+ "numpy_seed": 1234,
71
+ "torch_seed": 1234,
72
+ "fewshot_seed": 1234
73
+ },
74
+ "git_hash": null,
75
+ "date": 1719621613.7207468,
76
+ "pretty_env_info": "PyTorch version: 2.3.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.6\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-101-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 545.23.08\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7763 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 2\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 1\nFrequency boost: enabled\nCPU max MHz: 3529.0520\nCPU min MHz: 1500.0000\nBogoMIPS: 4900.22\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (16 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-63,128-191\nNUMA node1 CPU(s): 64-127,192-255\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.3.0\n[pip3] triton==2.3.0\n[conda] Could not collect",
77
+ "transformers_version": "4.41.2",
78
+ "upper_git_hash": null,
79
+ "task_hashes": {},
80
+ "model_source": "vllm",
81
+ "model_name": "/cache/abhinav/models/Phase1/gptq-Qwen/Qwen2-1.5B-Instruct-garage-bAInd/Open-Platypus-mse-damp0.1-ns512-seqlen4K",
82
+ "model_name_sanitized": "__cache__abhinav__models__Phase1__gptq-Qwen__Qwen2-1.5B-Instruct-garage-bAInd__Open-Platypus-mse-damp0.1-ns512-seqlen4K",
83
+ "system_instruction": null,
84
+ "system_instruction_sha": null,
85
+ "fewshot_as_multiturn": false,
86
+ "chat_template": null,
87
+ "chat_template_sha": null,
88
+ "start_time": 7635301.614735226,
89
+ "end_time": 7635361.244645656,
90
+ "total_evaluation_time_seconds": "59.62991042993963"
91
+ }
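The winogrande config quotes its preprocessing functions directly: doc_to_choice builds two full-sentence candidates by splicing each option into the blank ("_"), doc_to_target returns the shared continuation after the blank, and doc_to_text maps the gold answer ("1"/"2") to an index; the harness then scores the continuation under each candidate context and keeps the more likely one. A self-contained restatement of those functions on a made-up item (not taken from the dataset):

def doc_to_choice(doc):
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]

def doc_to_target(doc):
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()

def doc_to_text(doc):
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]

# Hypothetical item for illustration only
doc = {
    "sentence": "The trophy didn't fit in the suitcase because the _ was too small.",
    "option1": "trophy",
    "option2": "suitcase",
    "answer": "2",
}
print(doc_to_choice(doc))  # two candidate contexts, one per option
print(doc_to_target(doc))  # "was too small." -- scored under each context
print(doc_to_text(doc))    # 1 (index of the gold option)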