CarrotAI committed on
Commit
1582e05
·
verified ·
1 Parent(s): 7e44aad

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/data/private/models/Rabbit-15B/checkpoint-2344",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "/data/private/models/Rabbit-Ko-15B-Instruct/checkpoint-3789",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
eval.txt CHANGED
@@ -1,265 +1,75 @@
1
  |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
2
  |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
3
- |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7854|± |0.0113|
4
- | | |strict-match | 5|exact_match|↑ |0.7597|± |0.0118|
5
 
6
- | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
7
- |---------|------:|------|-----:|--------|---|-----:|---|-----:|
8
- |hellaswag| 1|none | 5|acc |↑ |0.5579|± |0.0050|
9
- | | |none | 5|acc_norm|↑ |0.7380|± |0.0044|
10
 
11
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
12
  |----------------|------:|------|-----:|--------|---|-----:|---|------|
13
- |kobest_boolq | 1|none | 0|acc |↑ |0.9195|± |0.0073|
14
- | | |none | 0|f1 |↑ |0.9195|± | N/A|
15
- |kobest_copa | 1|none | 0|acc |↑ |0.6670|± |0.0149|
16
- | | |none | 0|f1 |↑ |0.6664|± | N/A|
17
- |kobest_hellaswag| 1|none | 0|acc |↑ |0.4380|± |0.0222|
18
- | | |none | 0|acc_norm|↑ |0.5600|± |0.0222|
19
- | | |none | 0|f1 |↑ |0.4349|± | N/A|
20
- |kobest_sentineg | 1|none | 0|acc |↑ |0.6650|± |0.0237|
21
- | | |none | 0|f1 |↑ |0.6233|± | N/A|
22
- |kobest_wic | 1|none | 0|acc |↑ |0.6500|± |0.0134|
23
- | | |none | 0|f1 |↑ |0.6479|± | N/A|
24
-
25
- | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
26
- |----------------|------:|------|-----:|--------|---|-----:|---|------|
27
- |kobest_boolq | 1|none | 5|acc |↑ |0.9145|± |0.0075|
28
- | | |none | 5|f1 |↑ |0.9145|± | N/A|
29
- |kobest_copa | 1|none | 5|acc |↑ |0.7050|± |0.0144|
30
- | | |none | 5|f1 |↑ |0.7045|± | N/A|
31
- |kobest_hellaswag| 1|none | 5|acc |↑ |0.4600|± |0.0223|
32
- | | |none | 5|acc_norm|↑ |0.5540|± |0.0223|
33
- | | |none | 5|f1 |↑ |0.4564|± | N/A|
34
  |kobest_sentineg | 1|none | 5|acc |↑ |0.9496|± |0.0110|
35
  | | |none | 5|f1 |↑ |0.9496|± | N/A|
36
- |kobest_wic | 1|none | 5|acc |↑ |0.7294|± |0.0125|
37
- | | |none | 5|f1 |↑ |0.7258|± | N/A|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- | Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
40
- |------------------------------------------------------------|------:|----------|-----:|-----------|---|-----:|---|-----:|
41
- |kmmlu_direct_accounting | 2|none | 5|exact_match|↑ |0.5200|± |0.0502|
42
- |kmmlu_direct_agricultural_sciences | 2|none | 5|exact_match|↑ |0.3700|± |0.0153|
43
- |kmmlu_direct_aviation_engineering_and_maintenance | 2|none | 5|exact_match|↑ |0.4630|± |0.0158|
44
- |kmmlu_direct_biology | 2|none | 5|exact_match|↑ |0.3680|± |0.0153|
45
- |kmmlu_direct_chemical_engineering | 2|none | 5|exact_match|↑ |0.4700|± |0.0158|
46
- |kmmlu_direct_chemistry | 2|none | 5|exact_match|↑ |0.4733|± |0.0204|
47
- |kmmlu_direct_civil_engineering | 2|none | 5|exact_match|↑ |0.3630|± |0.0152|
48
- |kmmlu_direct_computer_science | 2|none | 5|exact_match|↑ |0.7320|± |0.0140|
49
- |kmmlu_direct_construction | 2|none | 5|exact_match|↑ |0.3620|± |0.0152|
50
- |kmmlu_direct_criminal_law | 2|none | 5|exact_match|↑ |0.4050|± |0.0348|
51
- |kmmlu_direct_ecology | 2|none | 5|exact_match|↑ |0.4860|± |0.0158|
52
- |kmmlu_direct_economics | 2|none | 5|exact_match|↑ |0.6077|± |0.0430|
53
- |kmmlu_direct_education | 2|none | 5|exact_match|↑ |0.7100|± |0.0456|
54
- |kmmlu_direct_electrical_engineering | 2|none | 5|exact_match|↑ |0.3210|± |0.0148|
55
- |kmmlu_direct_electronics_engineering | 2|none | 5|exact_match|↑ |0.5450|± |0.0158|
56
- |kmmlu_direct_energy_management | 2|none | 5|exact_match|↑ |0.3920|± |0.0154|
57
- |kmmlu_direct_environmental_science | 2|none | 5|exact_match|↑ |0.3090|± |0.0146|
58
- |kmmlu_direct_fashion | 2|none | 5|exact_match|↑ |0.4800|± |0.0158|
59
- |kmmlu_direct_food_processing | 2|none | 5|exact_match|↑ |0.4400|± |0.0157|
60
- |kmmlu_direct_gas_technology_and_engineering | 2|none | 5|exact_match|↑ |0.3590|± |0.0152|
61
- |kmmlu_direct_geomatics | 2|none | 5|exact_match|↑ |0.3890|± |0.0154|
62
- |kmmlu_direct_health | 2|none | 5|exact_match|↑ |0.6200|± |0.0488|
63
- |kmmlu_direct_industrial_engineer | 2|none | 5|exact_match|↑ |0.4780|± |0.0158|
64
- |kmmlu_direct_information_technology | 2|none | 5|exact_match|↑ |0.7130|± |0.0143|
65
- |kmmlu_direct_interior_architecture_and_design | 2|none | 5|exact_match|↑ |0.5970|± |0.0155|
66
- |kmmlu_direct_korean_history | 2|none | 5|exact_match|↑ |0.3500|± |0.0479|
67
- |kmmlu_direct_law | 2|none | 5|exact_match|↑ |0.4640|± |0.0158|
68
- |kmmlu_direct_machine_design_and_manufacturing | 2|none | 5|exact_match|↑ |0.4800|± |0.0158|
69
- |kmmlu_direct_management | 2|none | 5|exact_match|↑ |0.6190|± |0.0154|
70
- |kmmlu_direct_maritime_engineering | 2|none | 5|exact_match|↑ |0.4750|± |0.0204|
71
- |kmmlu_direct_marketing | 2|none | 5|exact_match|↑ |0.8010|± |0.0126|
72
- |kmmlu_direct_materials_engineering | 2|none | 5|exact_match|↑ |0.4960|± |0.0158|
73
- |kmmlu_direct_math | 2|none | 5|exact_match|↑ |0.3600|± |0.0278|
74
- |kmmlu_direct_mechanical_engineering | 2|none | 5|exact_match|↑ |0.3960|± |0.0155|
75
- |kmmlu_direct_nondestructive_testing | 2|none | 5|exact_match|↑ |0.4500|± |0.0157|
76
- |kmmlu_direct_patent | 2|none | 5|exact_match|↑ |0.4000|± |0.0492|
77
- |kmmlu_direct_political_science_and_sociology | 2|none | 5|exact_match|↑ |0.5600|± |0.0287|
78
- |kmmlu_direct_psychology | 2|none | 5|exact_match|↑ |0.4790|± |0.0158|
79
- |kmmlu_direct_public_safety | 2|none | 5|exact_match|↑ |0.3710|± |0.0153|
80
- |kmmlu_direct_railway_and_automotive_engineering | 2|none | 5|exact_match|↑ |0.3520|± |0.0151|
81
- |kmmlu_direct_real_estate | 2|none | 5|exact_match|↑ |0.4750|± |0.0354|
82
- |kmmlu_direct_refrigerating_machinery | 2|none | 5|exact_match|↑ |0.3710|± |0.0153|
83
- |kmmlu_direct_social_welfare | 2|none | 5|exact_match|↑ |0.6170|± |0.0154|
84
- |kmmlu_direct_taxation | 2|none | 5|exact_match|↑ |0.3950|± |0.0347|
85
- |kmmlu_direct_telecommunications_and_wireless_technology | 2|none | 5|exact_match|↑ |0.6150|± |0.0154|
86
- |kmmlu_hard_accounting | 2|none | 5|acc |↑ |0.2826|± |0.0671|
87
- | | |none | 5|acc_norm |↑ |0.2826|± |0.0671|
88
- |kmmlu_hard_agricultural_sciences | 2|none | 5|acc |↑ |0.1400|± |0.0349|
89
- | | |none | 5|acc_norm |↑ |0.1400|± |0.0349|
90
- |kmmlu_hard_aviation_engineering_and_maintenance | 2|none | 5|acc |↑ |0.2200|± |0.0416|
91
- | | |none | 5|acc_norm |↑ |0.2200|± |0.0416|
92
- |kmmlu_hard_biology | 2|none | 5|acc |↑ |0.1800|± |0.0386|
93
- | | |none | 5|acc_norm |↑ |0.1800|± |0.0386|
94
- |kmmlu_hard_chemical_engineering | 2|none | 5|acc |↑ |0.1500|± |0.0359|
95
- | | |none | 5|acc_norm |↑ |0.1500|± |0.0359|
96
- |kmmlu_hard_chemistry | 2|none | 5|acc |↑ |0.3000|± |0.0461|
97
- | | |none | 5|acc_norm |↑ |0.3000|± |0.0461|
98
- |kmmlu_hard_civil_engineering | 2|none | 5|acc |↑ |0.1800|± |0.0386|
99
- | | |none | 5|acc_norm |↑ |0.1800|± |0.0386|
100
- |kmmlu_hard_computer_science | 2|none | 5|acc |↑ |0.2400|± |0.0429|
101
- | | |none | 5|acc_norm |↑ |0.2400|± |0.0429|
102
- |kmmlu_hard_construction | 2|none | 5|acc |↑ |0.1800|± |0.0386|
103
- | | |none | 5|acc_norm |↑ |0.1800|± |0.0386|
104
- |kmmlu_hard_cot_accounting | 2|get-answer| 5|exact_match|↑ |0.2391|± |0.0636|
105
- |kmmlu_hard_cot_agricultural_sciences | 2|get-answer| 5|exact_match|↑ |0.0900|± |0.0288|
106
- |kmmlu_hard_cot_aviation_engineering_and_maintenance | 2|get-answer| 5|exact_match|↑ |0.2800|± |0.0451|
107
- |kmmlu_hard_cot_biology | 2|get-answer| 5|exact_match|↑ |0.1200|± |0.0327|
108
- |kmmlu_hard_cot_chemical_engineering | 2|get-answer| 5|exact_match|↑ |0.2100|± |0.0409|
109
- |kmmlu_hard_cot_chemistry | 2|get-answer| 5|exact_match|↑ |0.3100|± |0.0465|
110
- |kmmlu_hard_cot_civil_engineering | 2|get-answer| 5|exact_match|↑ |0.2400|± |0.0429|
111
- |kmmlu_hard_cot_computer_science | 2|get-answer| 5|exact_match|↑ |0.2300|± |0.0423|
112
- |kmmlu_hard_cot_construction | 2|get-answer| 5|exact_match|↑ |0.2100|± |0.0409|
113
- |kmmlu_hard_cot_criminal_law | 2|get-answer| 5|exact_match|↑ |0.1100|± |0.0314|
114
- |kmmlu_hard_cot_ecology | 2|get-answer| 5|exact_match|↑ |0.1300|± |0.0338|
115
- |kmmlu_hard_cot_economics | 2|get-answer| 5|exact_match|↑ |0.1429|± |0.0546|
116
- |kmmlu_hard_cot_education | 2|get-answer| 5|exact_match|↑ |0.0435|± |0.0435|
117
- |kmmlu_hard_cot_electrical_engineering | 2|get-answer| 5|exact_match|↑ |0.2200|± |0.0416|
118
- |kmmlu_hard_cot_electronics_engineering | 2|get-answer| 5|exact_match|↑ |0.3500|± |0.0479|
119
- |kmmlu_hard_cot_energy_management | 2|get-answer| 5|exact_match|↑ |0.3300|± |0.0473|
120
- |kmmlu_hard_cot_environmental_science | 2|get-answer| 5|exact_match|↑ |0.1500|± |0.0359|
121
- |kmmlu_hard_cot_fashion | 2|get-answer| 5|exact_match|↑ |0.2200|± |0.0416|
122
- |kmmlu_hard_cot_food_processing | 2|get-answer| 5|exact_match|↑ |0.2100|± |0.0409|
123
- |kmmlu_hard_cot_gas_technology_and_engineering | 2|get-answer| 5|exact_match|↑ |0.3200|± |0.0469|
124
- |kmmlu_hard_cot_geomatics | 2|get-answer| 5|exact_match|↑ |0.1900|± |0.0394|
125
- |kmmlu_hard_cot_health | 2|get-answer| 5|exact_match|↑ |0.2609|± |0.0936|
126
- |kmmlu_hard_cot_industrial_engineer | 2|get-answer| 5|exact_match|↑ |0.1300|± |0.0338|
127
- |kmmlu_hard_cot_information_technology | 2|get-answer| 5|exact_match|↑ |0.3600|± |0.0482|
128
- |kmmlu_hard_cot_interior_architecture_and_design | 2|get-answer| 5|exact_match|↑ |0.1600|± |0.0368|
129
- |kmmlu_hard_cot_korean_history | 2|get-answer| 5|exact_match|↑ |0.1364|± |0.0523|
130
- |kmmlu_hard_cot_law | 2|get-answer| 5|exact_match|↑ |0.1500|± |0.0359|
131
- |kmmlu_hard_cot_machine_design_and_manufacturing | 2|get-answer| 5|exact_match|↑ |0.3000|± |0.0461|
132
- |kmmlu_hard_cot_management | 2|get-answer| 5|exact_match|↑ |0.1900|± |0.0394|
133
- |kmmlu_hard_cot_maritime_engineering | 2|get-answer| 5|exact_match|↑ |0.2000|± |0.0402|
134
- |kmmlu_hard_cot_marketing | 2|get-answer| 5|exact_match|↑ |0.2000|± |0.0402|
135
- |kmmlu_hard_cot_materials_engineering | 2|get-answer| 5|exact_match|↑ |0.1500|± |0.0359|
136
- |kmmlu_hard_cot_math | 2|get-answer| 5|exact_match|↑ |0.3600|± |0.0482|
137
- |kmmlu_hard_cot_mechanical_engineering | 2|get-answer| 5|exact_match|↑ |0.2600|± |0.0441|
138
- |kmmlu_hard_cot_nondestructive_testing | 2|get-answer| 5|exact_match|↑ |0.3000|± |0.0461|
139
- |kmmlu_hard_cot_patent | 2|get-answer| 5|exact_match|↑ |0.2549|± |0.0616|
140
- |kmmlu_hard_cot_political_science_and_sociology | 2|get-answer| 5|exact_match|↑ |0.0889|± |0.0302|
141
- |kmmlu_hard_cot_psychology | 2|get-answer| 5|exact_match|↑ |0.1300|± |0.0338|
142
- |kmmlu_hard_cot_public_safety | 2|get-answer| 5|exact_match|↑ |0.1700|± |0.0378|
143
- |kmmlu_hard_cot_railway_and_automotive_engineering | 2|get-answer| 5|exact_match|↑ |0.1700|± |0.0378|
144
- |kmmlu_hard_cot_real_estate | 2|get-answer| 5|exact_match|↑ |0.2472|± |0.0460|
145
- |kmmlu_hard_cot_refrigerating_machinery | 2|get-answer| 5|exact_match|↑ |0.3000|± |0.0461|
146
- |kmmlu_hard_cot_social_welfare | 2|get-answer| 5|exact_match|↑ |0.3700|± |0.0485|
147
- |kmmlu_hard_cot_taxation | 2|get-answer| 5|exact_match|↑ |0.1979|± |0.0409|
148
- |kmmlu_hard_cot_telecommunications_and_wireless_technology | 2|get-answer| 5|exact_match|↑ |0.2900|± |0.0456|
149
- |kmmlu_hard_criminal_law | 2|none | 5|acc |↑ |0.2400|± |0.0429|
150
- | | |none | 5|acc_norm |↑ |0.2400|± |0.0429|
151
- |kmmlu_hard_direct_accounting | 2|none | 5|exact_match|↑ |0.3043|± |0.0686|
152
- |kmmlu_hard_direct_agricultural_sciences | 2|none | 5|exact_match|↑ |0.1400|± |0.0349|
153
- |kmmlu_hard_direct_aviation_engineering_and_maintenance | 2|none | 5|exact_match|↑ |0.2200|± |0.0416|
154
- |kmmlu_hard_direct_biology | 2|none | 5|exact_match|↑ |0.1900|± |0.0394|
155
- |kmmlu_hard_direct_chemical_engineering | 2|none | 5|exact_match|↑ |0.1600|± |0.0368|
156
- |kmmlu_hard_direct_chemistry | 2|none | 5|exact_match|↑ |0.2900|± |0.0456|
157
- |kmmlu_hard_direct_civil_engineering | 2|none | 5|exact_match|↑ |0.1800|± |0.0386|
158
- |kmmlu_hard_direct_computer_science | 2|none | 5|exact_match|↑ |0.2400|± |0.0429|
159
- |kmmlu_hard_direct_construction | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
160
- |kmmlu_hard_direct_criminal_law | 2|none | 5|exact_match|↑ |0.2400|± |0.0429|
161
- |kmmlu_hard_direct_ecology | 2|none | 5|exact_match|↑ |0.1500|± |0.0359|
162
- |kmmlu_hard_direct_economics | 2|none | 5|exact_match|↑ |0.2857|± |0.0706|
163
- |kmmlu_hard_direct_education | 2|none | 5|exact_match|↑ |0.3478|± |0.1015|
164
- |kmmlu_hard_direct_electrical_engineering | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
165
- |kmmlu_hard_direct_electronics_engineering | 2|none | 5|exact_match|↑ |0.2500|± |0.0435|
166
- |kmmlu_hard_direct_energy_management | 2|none | 5|exact_match|↑ |0.2100|± |0.0409|
167
- |kmmlu_hard_direct_environmental_science | 2|none | 5|exact_match|↑ |0.2000|± |0.0402|
168
- |kmmlu_hard_direct_fashion | 2|none | 5|exact_match|↑ |0.2300|± |0.0423|
169
- |kmmlu_hard_direct_food_processing | 2|none | 5|exact_match|↑ |0.1800|± |0.0386|
170
- |kmmlu_hard_direct_gas_technology_and_engineering | 2|none | 5|exact_match|↑ |0.1100|± |0.0314|
171
- |kmmlu_hard_direct_geomatics | 2|none | 5|exact_match|↑ |0.2200|± |0.0416|
172
- |kmmlu_hard_direct_health | 2|none | 5|exact_match|↑ |0.1739|± |0.0808|
173
- |kmmlu_hard_direct_industrial_engineer | 2|none | 5|exact_match|↑ |0.1900|± |0.0394|
174
- |kmmlu_hard_direct_information_technology | 2|none | 5|exact_match|↑ |0.3200|± |0.0469|
175
- |kmmlu_hard_direct_interior_architecture_and_design | 2|none | 5|exact_match|↑ |0.2100|± |0.0409|
176
- |kmmlu_hard_direct_korean_history | 2|none | 5|exact_match|↑ |0.1591|± |0.0558|
177
- |kmmlu_hard_direct_law | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
178
- |kmmlu_hard_direct_machine_design_and_manufacturing | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
179
- |kmmlu_hard_direct_management | 2|none | 5|exact_match|↑ |0.3100|± |0.0465|
180
- |kmmlu_hard_direct_maritime_engineering | 2|none | 5|exact_match|↑ |0.2300|± |0.0423|
181
- |kmmlu_hard_direct_marketing | 2|none | 5|exact_match|↑ |0.3300|± |0.0473|
182
- |kmmlu_hard_direct_materials_engineering | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
183
- |kmmlu_hard_direct_math | 2|none | 5|exact_match|↑ |0.3100|± |0.0465|
184
- |kmmlu_hard_direct_mechanical_engineering | 2|none | 5|exact_match|↑ |0.1700|± |0.0378|
185
- |kmmlu_hard_direct_nondestructive_testing | 2|none | 5|exact_match|↑ |0.1300|± |0.0338|
186
- |kmmlu_hard_direct_patent | 2|none | 5|exact_match|↑ |0.3137|± |0.0656|
187
- |kmmlu_hard_direct_political_science_and_sociology | 2|none | 5|exact_match|↑ |0.2444|± |0.0456|
188
- |kmmlu_hard_direct_psychology | 2|none | 5|exact_match|↑ |0.2600|± |0.0441|
189
- |kmmlu_hard_direct_public_safety | 2|none | 5|exact_match|↑ |0.1500|± |0.0359|
190
- |kmmlu_hard_direct_railway_and_automotive_engineering | 2|none | 5|exact_match|↑ |0.1800|± |0.0386|
191
- |kmmlu_hard_direct_real_estate | 2|none | 5|exact_match|↑ |0.2247|± |0.0445|
192
- |kmmlu_hard_direct_refrigerating_machinery | 2|none | 5|exact_match|↑ |0.1800|± |0.0386|
193
- |kmmlu_hard_direct_social_welfare | 2|none | 5|exact_match|↑ |0.3100|± |0.0465|
194
- |kmmlu_hard_direct_taxation | 2|none | 5|exact_match|↑ |0.1979|± |0.0409|
195
- |kmmlu_hard_direct_telecommunications_and_wireless_technology| 2|none | 5|exact_match|↑ |0.2500|± |0.0435|
196
- |kmmlu_hard_ecology | 2|none | 5|acc |↑ |0.1400|± |0.0349|
197
- | | |none | 5|acc_norm |↑ |0.1400|± |0.0349|
198
- |kmmlu_hard_economics | 2|none | 5|acc |↑ |0.2857|± |0.0706|
199
- | | |none | 5|acc_norm |↑ |0.2857|± |0.0706|
200
- |kmmlu_hard_education | 2|none | 5|acc |↑ |0.3478|± |0.1015|
201
- | | |none | 5|acc_norm |↑ |0.3478|± |0.1015|
202
- |kmmlu_hard_electrical_engineering | 2|none | 5|acc |↑ |0.1600|± |0.0368|
203
- | | |none | 5|acc_norm |↑ |0.1600|± |0.0368|
204
- |kmmlu_hard_electronics_engineering | 2|none | 5|acc |↑ |0.2500|± |0.0435|
205
- | | |none | 5|acc_norm |↑ |0.2500|± |0.0435|
206
- |kmmlu_hard_energy_management | 2|none | 5|acc |↑ |0.2100|± |0.0409|
207
- | | |none | 5|acc_norm |↑ |0.2100|± |0.0409|
208
- |kmmlu_hard_environmental_science | 2|none | 5|acc |↑ |0.2000|± |0.0402|
209
- | | |none | 5|acc_norm |↑ |0.2000|± |0.0402|
210
- |kmmlu_hard_fashion | 2|none | 5|acc |↑ |0.2100|± |0.0409|
211
- | | |none | 5|acc_norm |↑ |0.2100|± |0.0409|
212
- |kmmlu_hard_food_processing | 2|none | 5|acc |↑ |0.1800|± |0.0386|
213
- | | |none | 5|acc_norm |↑ |0.1800|± |0.0386|
214
- |kmmlu_hard_gas_technology_and_engineering | 2|none | 5|acc |↑ |0.1100|± |0.0314|
215
- | | |none | 5|acc_norm |↑ |0.1100|± |0.0314|
216
- |kmmlu_hard_geomatics | 2|none | 5|acc |↑ |0.2200|± |0.0416|
217
- | | |none | 5|acc_norm |↑ |0.2200|± |0.0416|
218
- |kmmlu_hard_health | 2|none | 5|acc |↑ |0.2174|± |0.0879|
219
- | | |none | 5|acc_norm |↑ |0.2174|± |0.0879|
220
- |kmmlu_hard_industrial_engineer | 2|none | 5|acc |↑ |0.1900|± |0.0394|
221
- | | |none | 5|acc_norm |↑ |0.1900|± |0.0394|
222
- |kmmlu_hard_information_technology | 2|none | 5|acc |↑ |0.3200|± |0.0469|
223
- | | |none | 5|acc_norm |↑ |0.3200|± |0.0469|
224
- |kmmlu_hard_interior_architecture_and_design | 2|none | 5|acc |↑ |0.2100|± |0.0409|
225
- | | |none | 5|acc_norm |↑ |0.2100|± |0.0409|
226
- |kmmlu_hard_korean_history | 2|none | 5|acc |↑ |0.1591|± |0.0558|
227
- | | |none | 5|acc_norm |↑ |0.1591|± |0.0558|
228
- |kmmlu_hard_law | 2|none | 5|acc |↑ |0.1700|± |0.0378|
229
- | | |none | 5|acc_norm |↑ |0.1700|± |0.0378|
230
- |kmmlu_hard_machine_design_and_manufacturing | 2|none | 5|acc |↑ |0.1700|± |0.0378|
231
- | | |none | 5|acc_norm |↑ |0.1700|± |0.0378|
232
- |kmmlu_hard_management | 2|none | 5|acc |↑ |0.3100|± |0.0465|
233
- | | |none | 5|acc_norm |↑ |0.3100|± |0.0465|
234
- |kmmlu_hard_maritime_engineering | 2|none | 5|acc |↑ |0.2300|± |0.0423|
235
- | | |none | 5|acc_norm |↑ |0.2300|± |0.0423|
236
- |kmmlu_hard_marketing | 2|none | 5|acc |↑ |0.3200|± |0.0469|
237
- | | |none | 5|acc_norm |↑ |0.3200|± |0.0469|
238
- |kmmlu_hard_materials_engineering | 2|none | 5|acc |↑ |0.1900|± |0.0394|
239
- | | |none | 5|acc_norm |↑ |0.1900|± |0.0394|
240
- |kmmlu_hard_math | 2|none | 5|acc |↑ |0.3100|± |0.0465|
241
- | | |none | 5|acc_norm |↑ |0.3100|± |0.0465|
242
- |kmmlu_hard_mechanical_engineering | 2|none | 5|acc |↑ |0.1700|± |0.0378|
243
- | | |none | 5|acc_norm |↑ |0.1700|± |0.0378|
244
- |kmmlu_hard_nondestructive_testing | 2|none | 5|acc |↑ |0.1300|± |0.0338|
245
- | | |none | 5|acc_norm |↑ |0.1300|± |0.0338|
246
- |kmmlu_hard_patent | 2|none | 5|acc |↑ |0.2941|± |0.0644|
247
- | | |none | 5|acc_norm |↑ |0.2941|± |0.0644|
248
- |kmmlu_hard_political_science_and_sociology | 2|none | 5|acc |↑ |0.2667|± |0.0469|
249
- | | |none | 5|acc_norm |↑ |0.2667|± |0.0469|
250
- |kmmlu_hard_psychology | 2|none | 5|acc |↑ |0.2700|± |0.0446|
251
- | | |none | 5|acc_norm |↑ |0.2700|± |0.0446|
252
- |kmmlu_hard_public_safety | 2|none | 5|acc |↑ |0.1400|± |0.0349|
253
- | | |none | 5|acc_norm |↑ |0.1400|± |0.0349|
254
- |kmmlu_hard_railway_and_automotive_engineering | 2|none | 5|acc |↑ |0.1700|± |0.0378|
255
- | | |none | 5|acc_norm |↑ |0.1700|± |0.0378|
256
- |kmmlu_hard_real_estate | 2|none | 5|acc |↑ |0.2135|± |0.0437|
257
- | | |none | 5|acc_norm |↑ |0.2135|± |0.0437|
258
- |kmmlu_hard_refrigerating_machinery | 2|none | 5|acc |↑ |0.2000|± |0.0402|
259
- | | |none | 5|acc_norm |↑ |0.2000|± |0.0402|
260
- |kmmlu_hard_social_welfare | 2|none | 5|acc |↑ |0.3200|± |0.0469|
261
- | | |none | 5|acc_norm |↑ |0.3200|± |0.0469|
262
- |kmmlu_hard_taxation | 2|none | 5|acc |↑ |0.2083|± |0.0417|
263
- | | |none | 5|acc_norm |↑ |0.2083|± |0.0417|
264
- |kmmlu_hard_telecommunications_and_wireless_technology | 2|none | 5|acc |↑ |0.2600|± |0.0441|
265
- | | |none | 5|acc_norm |↑ |0.2600|± |0.0441|
 
1
  |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
2
  |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
3
+ |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8029|± |0.0110|
4
+ | | |strict-match | 5|exact_match|↑ |0.7961|± |0.0111|
5
 
 
 
 
 
6
 
7
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
8
  |----------------|------:|------|-----:|--------|---|-----:|---|------|
9
+ |kobest_boolq | 1|none | 5|acc |↑ |0.9167|± |0.0074|
10
+ | | |none | 5|f1 |↑ |0.9167|± | N/A|
11
+ |kobest_copa | 1|none | 5|acc |↑ |0.7130|± |0.0143|
12
+ | | |none | 5|f1 |↑ |0.7125|± | N/A|
13
+ |kobest_hellaswag| 1|none | 5|acc |↑ |0.4540|± |0.0223|
14
+ | | |none | 5|acc_norm|↑ |0.5700|± |0.0222|
15
+ | | |none | 5|f1 |↑ |0.4505|± | N/A|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  |kobest_sentineg | 1|none | 5|acc |↑ |0.9496|± |0.0110|
17
  | | |none | 5|f1 |↑ |0.9496|± | N/A|
18
+ |kobest_wic | 1|none | 5|acc |↑ |0.7111|± |0.0128|
19
+ | | |none | 5|f1 |↑ |0.7025|± | N/A|
20
+
21
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
22
+ |-------------------------------------------------------|------:|------|-----:|-----------|---|-----:|---|-----:|
23
+ |kmmlu_direct_accounting | 2|none | 5|exact_match|↑ |0.5500|± |0.0500|
24
+ |kmmlu_direct_agricultural_sciences | 2|none | 5|exact_match|↑ |0.3680|± |0.0153|
25
+ |kmmlu_direct_aviation_engineering_and_maintenance | 2|none | 5|exact_match|↑ |0.4670|± |0.0158|
26
+ |kmmlu_direct_biology | 2|none | 5|exact_match|↑ |0.3740|± |0.0153|
27
+ |kmmlu_direct_chemical_engineering | 2|none | 5|exact_match|↑ |0.4650|± |0.0158|
28
+ |kmmlu_direct_chemistry | 2|none | 5|exact_match|↑ |0.4900|± |0.0204|
29
+ |kmmlu_direct_civil_engineering | 2|none | 5|exact_match|↑ |0.3540|± |0.0151|
30
+ |kmmlu_direct_computer_science | 2|none | 5|exact_match|↑ |0.7320|± |0.0140|
31
+ |kmmlu_direct_construction | 2|none | 5|exact_match|↑ |0.3590|± |0.0152|
32
+ |kmmlu_direct_criminal_law | 2|none | 5|exact_match|↑ |0.4250|± |0.0350|
33
+ |kmmlu_direct_ecology | 2|none | 5|exact_match|↑ |0.4900|± |0.0158|
34
+ |kmmlu_direct_economics | 2|none | 5|exact_match|↑ |0.6154|± |0.0428|
35
+ |kmmlu_direct_education | 2|none | 5|exact_match|↑ |0.6900|± |0.0465|
36
+ |kmmlu_direct_electrical_engineering | 2|none | 5|exact_match|↑ |0.3170|± |0.0147|
37
+ |kmmlu_direct_electronics_engineering | 2|none | 5|exact_match|↑ |0.5440|± |0.0158|
38
+ |kmmlu_direct_energy_management | 2|none | 5|exact_match|↑ |0.3960|± |0.0155|
39
+ |kmmlu_direct_environmental_science | 2|none | 5|exact_match|↑ |0.2950|± |0.0144|
40
+ |kmmlu_direct_fashion | 2|none | 5|exact_match|↑ |0.4660|± |0.0158|
41
+ |kmmlu_direct_food_processing | 2|none | 5|exact_match|↑ |0.4370|± |0.0157|
42
+ |kmmlu_direct_gas_technology_and_engineering | 2|none | 5|exact_match|↑ |0.3650|± |0.0152|
43
+ |kmmlu_direct_geomatics | 2|none | 5|exact_match|↑ |0.3770|± |0.0153|
44
+ |kmmlu_direct_health | 2|none | 5|exact_match|↑ |0.6200|± |0.0488|
45
+ |kmmlu_direct_industrial_engineer | 2|none | 5|exact_match|↑ |0.4730|± |0.0158|
46
+ |kmmlu_direct_information_technology | 2|none | 5|exact_match|↑ |0.7080|± |0.0144|
47
+ |kmmlu_direct_interior_architecture_and_design | 2|none | 5|exact_match|↑ |0.6080|± |0.0154|
48
+ |kmmlu_direct_korean_history | 2|none | 5|exact_match|↑ |0.3200|± |0.0469|
49
+ |kmmlu_direct_law | 2|none | 5|exact_match|↑ |0.4730|± |0.0158|
50
+ |kmmlu_direct_machine_design_and_manufacturing | 2|none | 5|exact_match|↑ |0.4750|± |0.0158|
51
+ |kmmlu_direct_management | 2|none | 5|exact_match|↑ |0.6160|± |0.0154|
52
+ |kmmlu_direct_maritime_engineering | 2|none | 5|exact_match|↑ |0.4817|± |0.0204|
53
+ |kmmlu_direct_marketing | 2|none | 5|exact_match|↑ |0.8010|± |0.0126|
54
+ |kmmlu_direct_materials_engineering | 2|none | 5|exact_match|↑ |0.4970|± |0.0158|
55
+ |kmmlu_direct_math | 2|none | 5|exact_match|↑ |0.3500|± |0.0276|
56
+ |kmmlu_direct_mechanical_engineering | 2|none | 5|exact_match|↑ |0.4040|± |0.0155|
57
+ |kmmlu_direct_nondestructive_testing | 2|none | 5|exact_match|↑ |0.4580|± |0.0158|
58
+ |kmmlu_direct_patent | 2|none | 5|exact_match|↑ |0.4100|± |0.0494|
59
+ |kmmlu_direct_political_science_and_sociology | 2|none | 5|exact_match|↑ |0.5500|± |0.0288|
60
+ |kmmlu_direct_psychology | 2|none | 5|exact_match|↑ |0.4700|± |0.0158|
61
+ |kmmlu_direct_public_safety | 2|none | 5|exact_match|↑ |0.3680|± |0.0153|
62
+ |kmmlu_direct_railway_and_automotive_engineering | 2|none | 5|exact_match|↑ |0.3550|± |0.0151|
63
+ |kmmlu_direct_real_estate | 2|none | 5|exact_match|↑ |0.4650|± |0.0354|
64
+ |kmmlu_direct_refrigerating_machinery | 2|none | 5|exact_match|↑ |0.3730|± |0.0153|
65
+ |kmmlu_direct_social_welfare | 2|none | 5|exact_match|↑ |0.6140|± |0.0154|
66
+ |kmmlu_direct_taxation | 2|none | 5|exact_match|↑ |0.4050|± |0.0348|
67
+ |kmmlu_direct_telecommunications_and_wireless_technology| 2|none | 5|exact_match|↑ |0.6080|± |0.0154|
68
 
69
+ | Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
70
+ |------------------|------:|------|------|------|---|-----:|---|-----:|
71
+ |mmlu | 2|none | |acc |↑ |0.6755|± |0.0038|
72
+ | - humanities | 2|none | |acc |↑ |0.6140|± |0.0067|
73
+ | - other | 2|none | |acc |↑ |0.7271|± |0.0077|
74
+ | - social sciences| 2|none | |acc |↑ |0.7793|± |0.0073|
75
+ | - stem | 2|none | |acc |↑ |0.6153|± |0.0084|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step5787
model-00001-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75efef28bbffbf631709659a8d0b3cf1e2f77e169dc93c8e6d63d72d9d7095af
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b2a38f72b3eb61d20c5b6f7fa395258dccb070ed648d7f52f0e3d8f37f18b06
3
  size 4877660776
model-00002-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:922caa4ea6c1d9b759338b8fd660655feac066efc89e3b3ed1b9703e1ef48d87
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:060e41523ab402378344b983424cafa6c3544e0b2fa766d90b71bbb90da6d05c
3
  size 4932751008
model-00003-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3319912206086ca4a97ce31464152c9be37d688f40950064f92f53b34dc584d2
3
  size 4991495888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:505b70f17b8c21167321d7b0a8788655bb0c7061a8ef8da567be2a9818a48248
3
  size 4991495888
model-00004-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e949a2be58e0966bfb76da4995a7f03e09e4141871c3a16702bc45a3ddc10d7f
3
  size 4991495888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c285f90377ff7d08670d6983d442d273701c02435b2f205d713e38ee452c90d
3
  size 4991495888
model-00005-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1df52907001c545ae2f4d49a1888f0e26f4bf034880eaf0d4a5ec24c514b6bf2
3
  size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3b96fcb224818ae73111a12ca322e5445ebf1089bd20c20c03f6bd33c1062b6
3
  size 4932751040
model-00006-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8bc5fae0f95564d05dcb266a681fcf3075e755ac226eef8b23db421bc63b09b
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca82f03df283b171bcf8c7cef7ff2eba3bcc890d8fb19bfd320477f9decf344
3
  size 4330865200
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bac53e858731549045ae820fbecc57ee0b3164efbf5498b91f6cc3159d985d62
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70462c3bcedf84efd32b927b411ff11f254cc372d4d3806a847cdd4c6ac4c011
3
  size 1089994880