Training in progress, step 4000, checkpoint
Browse files- last-checkpoint/adapter_model.bin +1 -1
- last-checkpoint/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step4000/zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/trainer_state.json +303 -3
last-checkpoint/adapter_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 19744138
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b074dc89146bd028c0664b132fc46bfc59201185bda06775c72139e7c815a15
|
3 |
size 19744138
|
last-checkpoint/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce6b4c9dcedd64467ad302888d9d03990b2a496988b7b6c2740204ce7313f44e
|
3 |
+
size 6508458036
|
last-checkpoint/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c299557772557013b972c2a06c9a6bb6602373482f42a769c8f7a6aa04cefccb
|
3 |
+
size 29495149
|
last-checkpoint/global_step4000/zero_pp_rank_1_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e86fd6ae2c479448da228eb07296019768c4f0e4dc30347b9037e01cb86c994b
|
3 |
+
size 6508458036
|
last-checkpoint/global_step4000/zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ea5e6e6c511616aee9a69a7f0f2412db28efbab49b0ec207d9d03fdd503ee72
|
3 |
+
size 29495149
|
last-checkpoint/global_step4000/zero_pp_rank_2_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4596995abffaa68f30aeed25446c64c5dfc8f5723d5c9cf9cbcc4afb30924de4
|
3 |
+
size 6508458036
|
last-checkpoint/global_step4000/zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:acd30c5a87bdb041aa371932bc4b3e79aadcfcde787a1c043bd483e5ab5c52d1
|
3 |
+
size 29495149
|
last-checkpoint/global_step4000/zero_pp_rank_3_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91826afa1d75a4d9fbb2622cdf64c876ef725685a3429693964d9c91d12f04ef
|
3 |
+
size 6508458036
|
last-checkpoint/global_step4000/zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f70fbe73734097b278f8f7aa50fc9d5f1fe73a7ba0143a9e58741a5757e5594
|
3 |
+
size 29495149
|
last-checkpoint/latest
CHANGED
@@ -1 +1 @@
|
|
1 |
-
|
|
|
1 |
+
global_step4000
|
last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15024
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cda7acc40557204710df648d13c6c64dd3bee9e11d98ca8ec6bf9765f6fd55b
|
3 |
size 15024
|
last-checkpoint/rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15024
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df659af25f38deebdca58158c68718a93017141cfb7b33e8079633d427d6debf
|
3 |
size 15024
|
last-checkpoint/rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15024
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f89a12d795a7d9929749bfcc711935eda3c929f167285a61a0defe0e6815157d
|
3 |
size 15024
|
last-checkpoint/rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 15024
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:595bcf6bb327a594b78f46fc654a60578f254399a5961cafc50cf97fe1934fba
|
3 |
size 15024
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 1.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -2107,13 +2107,313 @@
|
|
2107 |
"learning_rate": 1e-05,
|
2108 |
"loss": 0.6091,
|
2109 |
"step": 3500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2110 |
}
|
2111 |
],
|
2112 |
"logging_steps": 10,
|
2113 |
"max_steps": 5000,
|
2114 |
"num_train_epochs": 3,
|
2115 |
"save_steps": 500,
|
2116 |
-
"total_flos":
|
2117 |
"trial_name": null,
|
2118 |
"trial_params": null
|
2119 |
}
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.6863406408094435,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 4000,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
2107 |
"learning_rate": 1e-05,
|
2108 |
"loss": 0.6091,
|
2109 |
"step": 3500
|
2110 |
+
},
|
2111 |
+
{
|
2112 |
+
"epoch": 1.48,
|
2113 |
+
"learning_rate": 1e-05,
|
2114 |
+
"loss": 0.6603,
|
2115 |
+
"step": 3510
|
2116 |
+
},
|
2117 |
+
{
|
2118 |
+
"epoch": 1.48,
|
2119 |
+
"learning_rate": 1e-05,
|
2120 |
+
"loss": 0.6306,
|
2121 |
+
"step": 3520
|
2122 |
+
},
|
2123 |
+
{
|
2124 |
+
"epoch": 1.49,
|
2125 |
+
"learning_rate": 1e-05,
|
2126 |
+
"loss": 0.6345,
|
2127 |
+
"step": 3530
|
2128 |
+
},
|
2129 |
+
{
|
2130 |
+
"epoch": 1.49,
|
2131 |
+
"learning_rate": 1e-05,
|
2132 |
+
"loss": 0.636,
|
2133 |
+
"step": 3540
|
2134 |
+
},
|
2135 |
+
{
|
2136 |
+
"epoch": 1.5,
|
2137 |
+
"learning_rate": 1e-05,
|
2138 |
+
"loss": 0.692,
|
2139 |
+
"step": 3550
|
2140 |
+
},
|
2141 |
+
{
|
2142 |
+
"epoch": 1.5,
|
2143 |
+
"learning_rate": 1e-05,
|
2144 |
+
"loss": 0.6703,
|
2145 |
+
"step": 3560
|
2146 |
+
},
|
2147 |
+
{
|
2148 |
+
"epoch": 1.51,
|
2149 |
+
"learning_rate": 1e-05,
|
2150 |
+
"loss": 0.672,
|
2151 |
+
"step": 3570
|
2152 |
+
},
|
2153 |
+
{
|
2154 |
+
"epoch": 1.51,
|
2155 |
+
"learning_rate": 1e-05,
|
2156 |
+
"loss": 0.6706,
|
2157 |
+
"step": 3580
|
2158 |
+
},
|
2159 |
+
{
|
2160 |
+
"epoch": 1.51,
|
2161 |
+
"learning_rate": 1e-05,
|
2162 |
+
"loss": 0.6546,
|
2163 |
+
"step": 3590
|
2164 |
+
},
|
2165 |
+
{
|
2166 |
+
"epoch": 1.52,
|
2167 |
+
"learning_rate": 1e-05,
|
2168 |
+
"loss": 0.6503,
|
2169 |
+
"step": 3600
|
2170 |
+
},
|
2171 |
+
{
|
2172 |
+
"epoch": 1.52,
|
2173 |
+
"learning_rate": 1e-05,
|
2174 |
+
"loss": 0.6934,
|
2175 |
+
"step": 3610
|
2176 |
+
},
|
2177 |
+
{
|
2178 |
+
"epoch": 1.53,
|
2179 |
+
"learning_rate": 1e-05,
|
2180 |
+
"loss": 0.6327,
|
2181 |
+
"step": 3620
|
2182 |
+
},
|
2183 |
+
{
|
2184 |
+
"epoch": 1.53,
|
2185 |
+
"learning_rate": 1e-05,
|
2186 |
+
"loss": 0.6649,
|
2187 |
+
"step": 3630
|
2188 |
+
},
|
2189 |
+
{
|
2190 |
+
"epoch": 1.53,
|
2191 |
+
"learning_rate": 1e-05,
|
2192 |
+
"loss": 0.6225,
|
2193 |
+
"step": 3640
|
2194 |
+
},
|
2195 |
+
{
|
2196 |
+
"epoch": 1.54,
|
2197 |
+
"learning_rate": 1e-05,
|
2198 |
+
"loss": 0.6425,
|
2199 |
+
"step": 3650
|
2200 |
+
},
|
2201 |
+
{
|
2202 |
+
"epoch": 1.54,
|
2203 |
+
"learning_rate": 1e-05,
|
2204 |
+
"loss": 0.6409,
|
2205 |
+
"step": 3660
|
2206 |
+
},
|
2207 |
+
{
|
2208 |
+
"epoch": 1.55,
|
2209 |
+
"learning_rate": 1e-05,
|
2210 |
+
"loss": 0.6685,
|
2211 |
+
"step": 3670
|
2212 |
+
},
|
2213 |
+
{
|
2214 |
+
"epoch": 1.55,
|
2215 |
+
"learning_rate": 1e-05,
|
2216 |
+
"loss": 0.7274,
|
2217 |
+
"step": 3680
|
2218 |
+
},
|
2219 |
+
{
|
2220 |
+
"epoch": 1.56,
|
2221 |
+
"learning_rate": 1e-05,
|
2222 |
+
"loss": 0.7256,
|
2223 |
+
"step": 3690
|
2224 |
+
},
|
2225 |
+
{
|
2226 |
+
"epoch": 1.56,
|
2227 |
+
"learning_rate": 1e-05,
|
2228 |
+
"loss": 0.6972,
|
2229 |
+
"step": 3700
|
2230 |
+
},
|
2231 |
+
{
|
2232 |
+
"epoch": 1.56,
|
2233 |
+
"learning_rate": 1e-05,
|
2234 |
+
"loss": 0.6425,
|
2235 |
+
"step": 3710
|
2236 |
+
},
|
2237 |
+
{
|
2238 |
+
"epoch": 1.57,
|
2239 |
+
"learning_rate": 1e-05,
|
2240 |
+
"loss": 0.6627,
|
2241 |
+
"step": 3720
|
2242 |
+
},
|
2243 |
+
{
|
2244 |
+
"epoch": 1.57,
|
2245 |
+
"learning_rate": 1e-05,
|
2246 |
+
"loss": 0.7265,
|
2247 |
+
"step": 3730
|
2248 |
+
},
|
2249 |
+
{
|
2250 |
+
"epoch": 1.58,
|
2251 |
+
"learning_rate": 1e-05,
|
2252 |
+
"loss": 0.6436,
|
2253 |
+
"step": 3740
|
2254 |
+
},
|
2255 |
+
{
|
2256 |
+
"epoch": 1.58,
|
2257 |
+
"learning_rate": 1e-05,
|
2258 |
+
"loss": 0.6597,
|
2259 |
+
"step": 3750
|
2260 |
+
},
|
2261 |
+
{
|
2262 |
+
"epoch": 1.59,
|
2263 |
+
"learning_rate": 1e-05,
|
2264 |
+
"loss": 0.6968,
|
2265 |
+
"step": 3760
|
2266 |
+
},
|
2267 |
+
{
|
2268 |
+
"epoch": 1.59,
|
2269 |
+
"learning_rate": 1e-05,
|
2270 |
+
"loss": 0.6513,
|
2271 |
+
"step": 3770
|
2272 |
+
},
|
2273 |
+
{
|
2274 |
+
"epoch": 1.59,
|
2275 |
+
"learning_rate": 1e-05,
|
2276 |
+
"loss": 0.6204,
|
2277 |
+
"step": 3780
|
2278 |
+
},
|
2279 |
+
{
|
2280 |
+
"epoch": 1.6,
|
2281 |
+
"learning_rate": 1e-05,
|
2282 |
+
"loss": 0.7159,
|
2283 |
+
"step": 3790
|
2284 |
+
},
|
2285 |
+
{
|
2286 |
+
"epoch": 1.6,
|
2287 |
+
"learning_rate": 1e-05,
|
2288 |
+
"loss": 0.6509,
|
2289 |
+
"step": 3800
|
2290 |
+
},
|
2291 |
+
{
|
2292 |
+
"epoch": 1.61,
|
2293 |
+
"learning_rate": 1e-05,
|
2294 |
+
"loss": 0.6105,
|
2295 |
+
"step": 3810
|
2296 |
+
},
|
2297 |
+
{
|
2298 |
+
"epoch": 1.61,
|
2299 |
+
"learning_rate": 1e-05,
|
2300 |
+
"loss": 0.6581,
|
2301 |
+
"step": 3820
|
2302 |
+
},
|
2303 |
+
{
|
2304 |
+
"epoch": 1.61,
|
2305 |
+
"learning_rate": 1e-05,
|
2306 |
+
"loss": 0.6775,
|
2307 |
+
"step": 3830
|
2308 |
+
},
|
2309 |
+
{
|
2310 |
+
"epoch": 1.62,
|
2311 |
+
"learning_rate": 1e-05,
|
2312 |
+
"loss": 0.6243,
|
2313 |
+
"step": 3840
|
2314 |
+
},
|
2315 |
+
{
|
2316 |
+
"epoch": 1.62,
|
2317 |
+
"learning_rate": 1e-05,
|
2318 |
+
"loss": 0.644,
|
2319 |
+
"step": 3850
|
2320 |
+
},
|
2321 |
+
{
|
2322 |
+
"epoch": 1.63,
|
2323 |
+
"learning_rate": 1e-05,
|
2324 |
+
"loss": 0.6713,
|
2325 |
+
"step": 3860
|
2326 |
+
},
|
2327 |
+
{
|
2328 |
+
"epoch": 1.63,
|
2329 |
+
"learning_rate": 1e-05,
|
2330 |
+
"loss": 0.6633,
|
2331 |
+
"step": 3870
|
2332 |
+
},
|
2333 |
+
{
|
2334 |
+
"epoch": 1.64,
|
2335 |
+
"learning_rate": 1e-05,
|
2336 |
+
"loss": 0.7093,
|
2337 |
+
"step": 3880
|
2338 |
+
},
|
2339 |
+
{
|
2340 |
+
"epoch": 1.64,
|
2341 |
+
"learning_rate": 1e-05,
|
2342 |
+
"loss": 0.685,
|
2343 |
+
"step": 3890
|
2344 |
+
},
|
2345 |
+
{
|
2346 |
+
"epoch": 1.64,
|
2347 |
+
"learning_rate": 1e-05,
|
2348 |
+
"loss": 0.6923,
|
2349 |
+
"step": 3900
|
2350 |
+
},
|
2351 |
+
{
|
2352 |
+
"epoch": 1.65,
|
2353 |
+
"learning_rate": 1e-05,
|
2354 |
+
"loss": 0.5949,
|
2355 |
+
"step": 3910
|
2356 |
+
},
|
2357 |
+
{
|
2358 |
+
"epoch": 1.65,
|
2359 |
+
"learning_rate": 1e-05,
|
2360 |
+
"loss": 0.738,
|
2361 |
+
"step": 3920
|
2362 |
+
},
|
2363 |
+
{
|
2364 |
+
"epoch": 1.66,
|
2365 |
+
"learning_rate": 1e-05,
|
2366 |
+
"loss": 0.6444,
|
2367 |
+
"step": 3930
|
2368 |
+
},
|
2369 |
+
{
|
2370 |
+
"epoch": 1.66,
|
2371 |
+
"learning_rate": 1e-05,
|
2372 |
+
"loss": 0.6608,
|
2373 |
+
"step": 3940
|
2374 |
+
},
|
2375 |
+
{
|
2376 |
+
"epoch": 1.67,
|
2377 |
+
"learning_rate": 1e-05,
|
2378 |
+
"loss": 0.6469,
|
2379 |
+
"step": 3950
|
2380 |
+
},
|
2381 |
+
{
|
2382 |
+
"epoch": 1.67,
|
2383 |
+
"learning_rate": 1e-05,
|
2384 |
+
"loss": 0.6802,
|
2385 |
+
"step": 3960
|
2386 |
+
},
|
2387 |
+
{
|
2388 |
+
"epoch": 1.67,
|
2389 |
+
"learning_rate": 1e-05,
|
2390 |
+
"loss": 0.6474,
|
2391 |
+
"step": 3970
|
2392 |
+
},
|
2393 |
+
{
|
2394 |
+
"epoch": 1.68,
|
2395 |
+
"learning_rate": 1e-05,
|
2396 |
+
"loss": 0.69,
|
2397 |
+
"step": 3980
|
2398 |
+
},
|
2399 |
+
{
|
2400 |
+
"epoch": 1.68,
|
2401 |
+
"learning_rate": 1e-05,
|
2402 |
+
"loss": 0.658,
|
2403 |
+
"step": 3990
|
2404 |
+
},
|
2405 |
+
{
|
2406 |
+
"epoch": 1.69,
|
2407 |
+
"learning_rate": 1e-05,
|
2408 |
+
"loss": 0.6745,
|
2409 |
+
"step": 4000
|
2410 |
}
|
2411 |
],
|
2412 |
"logging_steps": 10,
|
2413 |
"max_steps": 5000,
|
2414 |
"num_train_epochs": 3,
|
2415 |
"save_steps": 500,
|
2416 |
+
"total_flos": 1005625572065280.0,
|
2417 |
"trial_name": null,
|
2418 |
"trial_params": null
|
2419 |
}
|