Muhammad Khalifa commited on
Commit
9e05026
1 Parent(s): 5a9e909

add instrucode llama-2 13b

Browse files
with_input/decomp_code_with_intermediates/llama-2-13b/best_model/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:438a8f13780e0bf581a7315fe172992cd9548569eae3204bdf11650ae651dc92
3
  size 104973389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4671c2d6162e00bd0679f6a17788fe6ca996a105b9eec0605c113d55d51046e
3
  size 104973389
with_input/decomp_code_with_intermediates/llama-2-13b/best_model/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ad67738e5eeff9ed2c07a2c509a4e942262b90cf5330c2ba60b95835f27a392
3
  size 209984517
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0b33193e2eaa387f1f2594dcd74e6f6c7f8d0bb6ccad541fdc73f56d8806f54
3
  size 209984517
with_input/decomp_code_with_intermediates/llama-2-13b/best_model/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acc1b8cfb3283aad217e327978b022e39fb2943a9cdf82bd8264775163348b83
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7a3940c13988eef2d142987af977c8946a726c9931362bfe39e6700c5381106
3
  size 14575
with_input/decomp_code_with_intermediates/llama-2-13b/best_model/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f0f43fb5ff454f55d12d06f14ece3c45b332039982175877610f7571980eab6
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a28d59beefca59c5677feb27231298b8d4e00afdda714d30a1a974e09b38f41
3
  size 627
with_input/decomp_code_with_intermediates/llama-2-13b/best_model/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.2938584089279175,
3
- "best_model_checkpoint": "checkpoints/instrucode/with_input/decomp_code_with_intermediates/llama-2-13b/checkpoint-3000",
4
- "epoch": 1.9292604501607717,
5
  "eval_steps": 200,
6
- "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1927,13 +1927,653 @@
1927
  "eval_samples_per_second": 2.461,
1928
  "eval_steps_per_second": 0.308,
1929
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1930
  }
1931
  ],
1932
  "logging_steps": 10,
1933
  "max_steps": 7775,
1934
  "num_train_epochs": 5,
1935
  "save_steps": 200,
1936
- "total_flos": 5.284238267487928e+18,
1937
  "trial_name": null,
1938
  "trial_params": null
1939
  }
 
1
  {
2
+ "best_metric": 0.2936367690563202,
3
+ "best_model_checkpoint": "checkpoints/instrucode/with_input/decomp_code_with_intermediates/llama-2-13b/checkpoint-4000",
4
+ "epoch": 2.572347266881029,
5
  "eval_steps": 200,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1927
  "eval_samples_per_second": 2.461,
1928
  "eval_steps_per_second": 0.308,
1929
  "step": 3000
1930
+ },
1931
+ {
1932
+ "epoch": 1.94,
1933
+ "learning_rate": 0.00018633224755700326,
1934
+ "loss": 0.2842,
1935
+ "step": 3010
1936
+ },
1937
+ {
1938
+ "epoch": 1.94,
1939
+ "learning_rate": 0.00018594136807817586,
1940
+ "loss": 0.2872,
1941
+ "step": 3020
1942
+ },
1943
+ {
1944
+ "epoch": 1.95,
1945
+ "learning_rate": 0.00018555048859934852,
1946
+ "loss": 0.2872,
1947
+ "step": 3030
1948
+ },
1949
+ {
1950
+ "epoch": 1.95,
1951
+ "learning_rate": 0.00018515960912052115,
1952
+ "loss": 0.2781,
1953
+ "step": 3040
1954
+ },
1955
+ {
1956
+ "epoch": 1.96,
1957
+ "learning_rate": 0.0001847687296416938,
1958
+ "loss": 0.2933,
1959
+ "step": 3050
1960
+ },
1961
+ {
1962
+ "epoch": 1.97,
1963
+ "learning_rate": 0.00018437785016286646,
1964
+ "loss": 0.2964,
1965
+ "step": 3060
1966
+ },
1967
+ {
1968
+ "epoch": 1.97,
1969
+ "learning_rate": 0.00018398697068403906,
1970
+ "loss": 0.2787,
1971
+ "step": 3070
1972
+ },
1973
+ {
1974
+ "epoch": 1.98,
1975
+ "learning_rate": 0.0001835960912052117,
1976
+ "loss": 0.2891,
1977
+ "step": 3080
1978
+ },
1979
+ {
1980
+ "epoch": 1.99,
1981
+ "learning_rate": 0.00018320521172638434,
1982
+ "loss": 0.2875,
1983
+ "step": 3090
1984
+ },
1985
+ {
1986
+ "epoch": 1.99,
1987
+ "learning_rate": 0.000182814332247557,
1988
+ "loss": 0.294,
1989
+ "step": 3100
1990
+ },
1991
+ {
1992
+ "epoch": 2.0,
1993
+ "learning_rate": 0.00018242345276872965,
1994
+ "loss": 0.2793,
1995
+ "step": 3110
1996
+ },
1997
+ {
1998
+ "epoch": 2.01,
1999
+ "learning_rate": 0.00018203257328990225,
2000
+ "loss": 0.2884,
2001
+ "step": 3120
2002
+ },
2003
+ {
2004
+ "epoch": 2.01,
2005
+ "learning_rate": 0.0001816416938110749,
2006
+ "loss": 0.2795,
2007
+ "step": 3130
2008
+ },
2009
+ {
2010
+ "epoch": 2.02,
2011
+ "learning_rate": 0.00018125081433224754,
2012
+ "loss": 0.2722,
2013
+ "step": 3140
2014
+ },
2015
+ {
2016
+ "epoch": 2.03,
2017
+ "learning_rate": 0.0001808599348534202,
2018
+ "loss": 0.2653,
2019
+ "step": 3150
2020
+ },
2021
+ {
2022
+ "epoch": 2.03,
2023
+ "learning_rate": 0.00018046905537459282,
2024
+ "loss": 0.2664,
2025
+ "step": 3160
2026
+ },
2027
+ {
2028
+ "epoch": 2.04,
2029
+ "learning_rate": 0.00018007817589576545,
2030
+ "loss": 0.2785,
2031
+ "step": 3170
2032
+ },
2033
+ {
2034
+ "epoch": 2.05,
2035
+ "learning_rate": 0.00017968729641693808,
2036
+ "loss": 0.2728,
2037
+ "step": 3180
2038
+ },
2039
+ {
2040
+ "epoch": 2.05,
2041
+ "learning_rate": 0.00017929641693811073,
2042
+ "loss": 0.2675,
2043
+ "step": 3190
2044
+ },
2045
+ {
2046
+ "epoch": 2.06,
2047
+ "learning_rate": 0.00017890553745928339,
2048
+ "loss": 0.2607,
2049
+ "step": 3200
2050
+ },
2051
+ {
2052
+ "epoch": 2.06,
2053
+ "eval_loss": 0.2949482798576355,
2054
+ "eval_runtime": 812.7173,
2055
+ "eval_samples_per_second": 2.461,
2056
+ "eval_steps_per_second": 0.308,
2057
+ "step": 3200
2058
+ },
2059
+ {
2060
+ "epoch": 2.06,
2061
+ "learning_rate": 0.00017851465798045601,
2062
+ "loss": 0.2773,
2063
+ "step": 3210
2064
+ },
2065
+ {
2066
+ "epoch": 2.07,
2067
+ "learning_rate": 0.00017812377850162864,
2068
+ "loss": 0.2684,
2069
+ "step": 3220
2070
+ },
2071
+ {
2072
+ "epoch": 2.08,
2073
+ "learning_rate": 0.00017773289902280127,
2074
+ "loss": 0.2656,
2075
+ "step": 3230
2076
+ },
2077
+ {
2078
+ "epoch": 2.08,
2079
+ "learning_rate": 0.00017734201954397393,
2080
+ "loss": 0.287,
2081
+ "step": 3240
2082
+ },
2083
+ {
2084
+ "epoch": 2.09,
2085
+ "learning_rate": 0.00017695114006514658,
2086
+ "loss": 0.2694,
2087
+ "step": 3250
2088
+ },
2089
+ {
2090
+ "epoch": 2.1,
2091
+ "learning_rate": 0.0001765602605863192,
2092
+ "loss": 0.2667,
2093
+ "step": 3260
2094
+ },
2095
+ {
2096
+ "epoch": 2.1,
2097
+ "learning_rate": 0.00017616938110749184,
2098
+ "loss": 0.2744,
2099
+ "step": 3270
2100
+ },
2101
+ {
2102
+ "epoch": 2.11,
2103
+ "learning_rate": 0.00017577850162866447,
2104
+ "loss": 0.2744,
2105
+ "step": 3280
2106
+ },
2107
+ {
2108
+ "epoch": 2.12,
2109
+ "learning_rate": 0.00017538762214983712,
2110
+ "loss": 0.2615,
2111
+ "step": 3290
2112
+ },
2113
+ {
2114
+ "epoch": 2.12,
2115
+ "learning_rate": 0.00017499674267100975,
2116
+ "loss": 0.2694,
2117
+ "step": 3300
2118
+ },
2119
+ {
2120
+ "epoch": 2.13,
2121
+ "learning_rate": 0.0001746058631921824,
2122
+ "loss": 0.281,
2123
+ "step": 3310
2124
+ },
2125
+ {
2126
+ "epoch": 2.14,
2127
+ "learning_rate": 0.00017421498371335506,
2128
+ "loss": 0.2744,
2129
+ "step": 3320
2130
+ },
2131
+ {
2132
+ "epoch": 2.14,
2133
+ "learning_rate": 0.00017382410423452766,
2134
+ "loss": 0.268,
2135
+ "step": 3330
2136
+ },
2137
+ {
2138
+ "epoch": 2.15,
2139
+ "learning_rate": 0.00017343322475570032,
2140
+ "loss": 0.2859,
2141
+ "step": 3340
2142
+ },
2143
+ {
2144
+ "epoch": 2.15,
2145
+ "learning_rate": 0.00017304234527687294,
2146
+ "loss": 0.2631,
2147
+ "step": 3350
2148
+ },
2149
+ {
2150
+ "epoch": 2.16,
2151
+ "learning_rate": 0.0001726514657980456,
2152
+ "loss": 0.2791,
2153
+ "step": 3360
2154
+ },
2155
+ {
2156
+ "epoch": 2.17,
2157
+ "learning_rate": 0.00017226058631921825,
2158
+ "loss": 0.2873,
2159
+ "step": 3370
2160
+ },
2161
+ {
2162
+ "epoch": 2.17,
2163
+ "learning_rate": 0.00017186970684039086,
2164
+ "loss": 0.2712,
2165
+ "step": 3380
2166
+ },
2167
+ {
2168
+ "epoch": 2.18,
2169
+ "learning_rate": 0.0001714788273615635,
2170
+ "loss": 0.2843,
2171
+ "step": 3390
2172
+ },
2173
+ {
2174
+ "epoch": 2.19,
2175
+ "learning_rate": 0.00017108794788273614,
2176
+ "loss": 0.267,
2177
+ "step": 3400
2178
+ },
2179
+ {
2180
+ "epoch": 2.19,
2181
+ "eval_loss": 0.29497140645980835,
2182
+ "eval_runtime": 812.7448,
2183
+ "eval_samples_per_second": 2.461,
2184
+ "eval_steps_per_second": 0.308,
2185
+ "step": 3400
2186
+ },
2187
+ {
2188
+ "epoch": 2.19,
2189
+ "learning_rate": 0.0001706970684039088,
2190
+ "loss": 0.2697,
2191
+ "step": 3410
2192
+ },
2193
+ {
2194
+ "epoch": 2.2,
2195
+ "learning_rate": 0.00017030618892508142,
2196
+ "loss": 0.2788,
2197
+ "step": 3420
2198
+ },
2199
+ {
2200
+ "epoch": 2.21,
2201
+ "learning_rate": 0.00016991530944625405,
2202
+ "loss": 0.2775,
2203
+ "step": 3430
2204
+ },
2205
+ {
2206
+ "epoch": 2.21,
2207
+ "learning_rate": 0.00016952442996742668,
2208
+ "loss": 0.274,
2209
+ "step": 3440
2210
+ },
2211
+ {
2212
+ "epoch": 2.22,
2213
+ "learning_rate": 0.00016913355048859933,
2214
+ "loss": 0.2733,
2215
+ "step": 3450
2216
+ },
2217
+ {
2218
+ "epoch": 2.23,
2219
+ "learning_rate": 0.000168742671009772,
2220
+ "loss": 0.2675,
2221
+ "step": 3460
2222
+ },
2223
+ {
2224
+ "epoch": 2.23,
2225
+ "learning_rate": 0.00016835179153094462,
2226
+ "loss": 0.2715,
2227
+ "step": 3470
2228
+ },
2229
+ {
2230
+ "epoch": 2.24,
2231
+ "learning_rate": 0.00016796091205211725,
2232
+ "loss": 0.2658,
2233
+ "step": 3480
2234
+ },
2235
+ {
2236
+ "epoch": 2.24,
2237
+ "learning_rate": 0.00016757003257328987,
2238
+ "loss": 0.2797,
2239
+ "step": 3490
2240
+ },
2241
+ {
2242
+ "epoch": 2.25,
2243
+ "learning_rate": 0.00016717915309446253,
2244
+ "loss": 0.2671,
2245
+ "step": 3500
2246
+ },
2247
+ {
2248
+ "epoch": 2.26,
2249
+ "learning_rate": 0.00016678827361563518,
2250
+ "loss": 0.279,
2251
+ "step": 3510
2252
+ },
2253
+ {
2254
+ "epoch": 2.26,
2255
+ "learning_rate": 0.0001663973941368078,
2256
+ "loss": 0.2697,
2257
+ "step": 3520
2258
+ },
2259
+ {
2260
+ "epoch": 2.27,
2261
+ "learning_rate": 0.00016600651465798044,
2262
+ "loss": 0.2689,
2263
+ "step": 3530
2264
+ },
2265
+ {
2266
+ "epoch": 2.28,
2267
+ "learning_rate": 0.00016561563517915307,
2268
+ "loss": 0.2613,
2269
+ "step": 3540
2270
+ },
2271
+ {
2272
+ "epoch": 2.28,
2273
+ "learning_rate": 0.00016522475570032572,
2274
+ "loss": 0.2719,
2275
+ "step": 3550
2276
+ },
2277
+ {
2278
+ "epoch": 2.29,
2279
+ "learning_rate": 0.00016483387622149835,
2280
+ "loss": 0.2722,
2281
+ "step": 3560
2282
+ },
2283
+ {
2284
+ "epoch": 2.3,
2285
+ "learning_rate": 0.000164442996742671,
2286
+ "loss": 0.2714,
2287
+ "step": 3570
2288
+ },
2289
+ {
2290
+ "epoch": 2.3,
2291
+ "learning_rate": 0.0001640521172638436,
2292
+ "loss": 0.2677,
2293
+ "step": 3580
2294
+ },
2295
+ {
2296
+ "epoch": 2.31,
2297
+ "learning_rate": 0.00016366123778501626,
2298
+ "loss": 0.2599,
2299
+ "step": 3590
2300
+ },
2301
+ {
2302
+ "epoch": 2.32,
2303
+ "learning_rate": 0.00016327035830618892,
2304
+ "loss": 0.2765,
2305
+ "step": 3600
2306
+ },
2307
+ {
2308
+ "epoch": 2.32,
2309
+ "eval_loss": 0.294181764125824,
2310
+ "eval_runtime": 814.8768,
2311
+ "eval_samples_per_second": 2.454,
2312
+ "eval_steps_per_second": 0.307,
2313
+ "step": 3600
2314
+ },
2315
+ {
2316
+ "epoch": 2.32,
2317
+ "learning_rate": 0.00016287947882736155,
2318
+ "loss": 0.2608,
2319
+ "step": 3610
2320
+ },
2321
+ {
2322
+ "epoch": 2.33,
2323
+ "learning_rate": 0.0001624885993485342,
2324
+ "loss": 0.2644,
2325
+ "step": 3620
2326
+ },
2327
+ {
2328
+ "epoch": 2.33,
2329
+ "learning_rate": 0.0001620977198697068,
2330
+ "loss": 0.2905,
2331
+ "step": 3630
2332
+ },
2333
+ {
2334
+ "epoch": 2.34,
2335
+ "learning_rate": 0.00016170684039087946,
2336
+ "loss": 0.2722,
2337
+ "step": 3640
2338
+ },
2339
+ {
2340
+ "epoch": 2.35,
2341
+ "learning_rate": 0.0001613159609120521,
2342
+ "loss": 0.2679,
2343
+ "step": 3650
2344
+ },
2345
+ {
2346
+ "epoch": 2.35,
2347
+ "learning_rate": 0.00016092508143322474,
2348
+ "loss": 0.274,
2349
+ "step": 3660
2350
+ },
2351
+ {
2352
+ "epoch": 2.36,
2353
+ "learning_rate": 0.0001605342019543974,
2354
+ "loss": 0.2692,
2355
+ "step": 3670
2356
+ },
2357
+ {
2358
+ "epoch": 2.37,
2359
+ "learning_rate": 0.00016014332247557003,
2360
+ "loss": 0.2663,
2361
+ "step": 3680
2362
+ },
2363
+ {
2364
+ "epoch": 2.37,
2365
+ "learning_rate": 0.00015975244299674265,
2366
+ "loss": 0.2598,
2367
+ "step": 3690
2368
+ },
2369
+ {
2370
+ "epoch": 2.38,
2371
+ "learning_rate": 0.00015936156351791528,
2372
+ "loss": 0.2756,
2373
+ "step": 3700
2374
+ },
2375
+ {
2376
+ "epoch": 2.39,
2377
+ "learning_rate": 0.00015897068403908794,
2378
+ "loss": 0.2715,
2379
+ "step": 3710
2380
+ },
2381
+ {
2382
+ "epoch": 2.39,
2383
+ "learning_rate": 0.0001585798045602606,
2384
+ "loss": 0.2741,
2385
+ "step": 3720
2386
+ },
2387
+ {
2388
+ "epoch": 2.4,
2389
+ "learning_rate": 0.00015818892508143322,
2390
+ "loss": 0.2751,
2391
+ "step": 3730
2392
+ },
2393
+ {
2394
+ "epoch": 2.41,
2395
+ "learning_rate": 0.00015779804560260585,
2396
+ "loss": 0.2727,
2397
+ "step": 3740
2398
+ },
2399
+ {
2400
+ "epoch": 2.41,
2401
+ "learning_rate": 0.00015740716612377848,
2402
+ "loss": 0.262,
2403
+ "step": 3750
2404
+ },
2405
+ {
2406
+ "epoch": 2.42,
2407
+ "learning_rate": 0.00015701628664495113,
2408
+ "loss": 0.2677,
2409
+ "step": 3760
2410
+ },
2411
+ {
2412
+ "epoch": 2.42,
2413
+ "learning_rate": 0.00015662540716612376,
2414
+ "loss": 0.2723,
2415
+ "step": 3770
2416
+ },
2417
+ {
2418
+ "epoch": 2.43,
2419
+ "learning_rate": 0.00015623452768729641,
2420
+ "loss": 0.2682,
2421
+ "step": 3780
2422
+ },
2423
+ {
2424
+ "epoch": 2.44,
2425
+ "learning_rate": 0.00015584364820846904,
2426
+ "loss": 0.2685,
2427
+ "step": 3790
2428
+ },
2429
+ {
2430
+ "epoch": 2.44,
2431
+ "learning_rate": 0.00015545276872964167,
2432
+ "loss": 0.2654,
2433
+ "step": 3800
2434
+ },
2435
+ {
2436
+ "epoch": 2.44,
2437
+ "eval_loss": 0.2940288186073303,
2438
+ "eval_runtime": 812.6786,
2439
+ "eval_samples_per_second": 2.461,
2440
+ "eval_steps_per_second": 0.308,
2441
+ "step": 3800
2442
+ },
2443
+ {
2444
+ "epoch": 2.45,
2445
+ "learning_rate": 0.00015506188925081433,
2446
+ "loss": 0.2652,
2447
+ "step": 3810
2448
+ },
2449
+ {
2450
+ "epoch": 2.46,
2451
+ "learning_rate": 0.00015467100977198695,
2452
+ "loss": 0.2711,
2453
+ "step": 3820
2454
+ },
2455
+ {
2456
+ "epoch": 2.46,
2457
+ "learning_rate": 0.0001542801302931596,
2458
+ "loss": 0.2719,
2459
+ "step": 3830
2460
+ },
2461
+ {
2462
+ "epoch": 2.47,
2463
+ "learning_rate": 0.0001538892508143322,
2464
+ "loss": 0.2548,
2465
+ "step": 3840
2466
+ },
2467
+ {
2468
+ "epoch": 2.48,
2469
+ "learning_rate": 0.00015349837133550487,
2470
+ "loss": 0.2743,
2471
+ "step": 3850
2472
+ },
2473
+ {
2474
+ "epoch": 2.48,
2475
+ "learning_rate": 0.00015310749185667752,
2476
+ "loss": 0.2862,
2477
+ "step": 3860
2478
+ },
2479
+ {
2480
+ "epoch": 2.49,
2481
+ "learning_rate": 0.00015271661237785015,
2482
+ "loss": 0.281,
2483
+ "step": 3870
2484
+ },
2485
+ {
2486
+ "epoch": 2.5,
2487
+ "learning_rate": 0.0001523257328990228,
2488
+ "loss": 0.271,
2489
+ "step": 3880
2490
+ },
2491
+ {
2492
+ "epoch": 2.5,
2493
+ "learning_rate": 0.0001519348534201954,
2494
+ "loss": 0.2677,
2495
+ "step": 3890
2496
+ },
2497
+ {
2498
+ "epoch": 2.51,
2499
+ "learning_rate": 0.00015154397394136806,
2500
+ "loss": 0.2614,
2501
+ "step": 3900
2502
+ },
2503
+ {
2504
+ "epoch": 2.51,
2505
+ "learning_rate": 0.0001511530944625407,
2506
+ "loss": 0.2719,
2507
+ "step": 3910
2508
+ },
2509
+ {
2510
+ "epoch": 2.52,
2511
+ "learning_rate": 0.00015076221498371334,
2512
+ "loss": 0.2678,
2513
+ "step": 3920
2514
+ },
2515
+ {
2516
+ "epoch": 2.53,
2517
+ "learning_rate": 0.000150371335504886,
2518
+ "loss": 0.262,
2519
+ "step": 3930
2520
+ },
2521
+ {
2522
+ "epoch": 2.53,
2523
+ "learning_rate": 0.00014998045602605863,
2524
+ "loss": 0.2853,
2525
+ "step": 3940
2526
+ },
2527
+ {
2528
+ "epoch": 2.54,
2529
+ "learning_rate": 0.00014958957654723126,
2530
+ "loss": 0.2776,
2531
+ "step": 3950
2532
+ },
2533
+ {
2534
+ "epoch": 2.55,
2535
+ "learning_rate": 0.00014919869706840388,
2536
+ "loss": 0.2768,
2537
+ "step": 3960
2538
+ },
2539
+ {
2540
+ "epoch": 2.55,
2541
+ "learning_rate": 0.00014880781758957654,
2542
+ "loss": 0.2716,
2543
+ "step": 3970
2544
+ },
2545
+ {
2546
+ "epoch": 2.56,
2547
+ "learning_rate": 0.00014841693811074917,
2548
+ "loss": 0.2617,
2549
+ "step": 3980
2550
+ },
2551
+ {
2552
+ "epoch": 2.57,
2553
+ "learning_rate": 0.00014802605863192182,
2554
+ "loss": 0.2742,
2555
+ "step": 3990
2556
+ },
2557
+ {
2558
+ "epoch": 2.57,
2559
+ "learning_rate": 0.00014763517915309445,
2560
+ "loss": 0.2673,
2561
+ "step": 4000
2562
+ },
2563
+ {
2564
+ "epoch": 2.57,
2565
+ "eval_loss": 0.2936367690563202,
2566
+ "eval_runtime": 812.6744,
2567
+ "eval_samples_per_second": 2.461,
2568
+ "eval_steps_per_second": 0.308,
2569
+ "step": 4000
2570
  }
2571
  ],
2572
  "logging_steps": 10,
2573
  "max_steps": 7775,
2574
  "num_train_epochs": 5,
2575
  "save_steps": 200,
2576
+ "total_flos": 7.045037400998707e+18,
2577
  "trial_name": null,
2578
  "trial_params": null
2579
  }