{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.994334277620396,
  "eval_steps": 500,
  "global_step": 3174,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 1.3795244693756104,
      "learning_rate": 2.358490566037736e-06,
      "loss": 2.6953,
      "step": 25
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2704118490219116,
      "learning_rate": 4.716981132075472e-06,
      "loss": 2.7089,
      "step": 50
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.1683423519134521,
      "learning_rate": 7.0754716981132075e-06,
      "loss": 2.6511,
      "step": 75
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.2082363367080688,
      "learning_rate": 9.433962264150944e-06,
      "loss": 2.5193,
      "step": 100
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.1215194463729858,
      "learning_rate": 1.179245283018868e-05,
      "loss": 2.306,
      "step": 125
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.6498327255249023,
      "learning_rate": 1.4150943396226415e-05,
      "loss": 2.1279,
      "step": 150
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.5128926038742065,
      "learning_rate": 1.650943396226415e-05,
      "loss": 1.979,
      "step": 175
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4196425676345825,
      "learning_rate": 1.8867924528301888e-05,
      "loss": 1.857,
      "step": 200
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.39977598190307617,
      "learning_rate": 2.122641509433962e-05,
      "loss": 1.7905,
      "step": 225
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.3468642830848694,
      "learning_rate": 2.358490566037736e-05,
      "loss": 1.7396,
      "step": 250
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.3703348934650421,
      "learning_rate": 2.5943396226415097e-05,
      "loss": 1.6826,
      "step": 275
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.3471335470676422,
      "learning_rate": 2.830188679245283e-05,
      "loss": 1.6333,
      "step": 300
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.30928292870521545,
      "learning_rate": 2.9926470588235295e-05,
      "loss": 1.5934,
      "step": 325
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.33998557925224304,
      "learning_rate": 2.966386554621849e-05,
      "loss": 1.5503,
      "step": 350
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3642776906490326,
      "learning_rate": 2.940126050420168e-05,
      "loss": 1.5243,
      "step": 375
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.31012433767318726,
      "learning_rate": 2.9138655462184876e-05,
      "loss": 1.4618,
      "step": 400
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.4258916974067688,
      "learning_rate": 2.8876050420168067e-05,
      "loss": 1.4161,
      "step": 425
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.3025980591773987,
      "learning_rate": 2.8613445378151262e-05,
      "loss": 1.419,
      "step": 450
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.3354116678237915,
      "learning_rate": 2.8350840336134453e-05,
      "loss": 1.3576,
      "step": 475
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.3400489091873169,
      "learning_rate": 2.8088235294117648e-05,
      "loss": 1.3323,
      "step": 500
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.315164715051651,
      "learning_rate": 2.7825630252100843e-05,
      "loss": 1.344,
      "step": 525
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.3593141734600067,
      "learning_rate": 2.7563025210084034e-05,
      "loss": 1.3023,
      "step": 550
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.4317137598991394,
      "learning_rate": 2.730042016806723e-05,
      "loss": 1.3028,
      "step": 575
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.3506380617618561,
      "learning_rate": 2.703781512605042e-05,
      "loss": 1.3026,
      "step": 600
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.33726122975349426,
      "learning_rate": 2.6775210084033615e-05,
      "loss": 1.319,
      "step": 625
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.37094271183013916,
      "learning_rate": 2.6512605042016806e-05,
      "loss": 1.2583,
      "step": 650
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.41374334692955017,
      "learning_rate": 2.625e-05,
      "loss": 1.2486,
      "step": 675
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.46000728011131287,
      "learning_rate": 2.5987394957983196e-05,
      "loss": 1.2463,
      "step": 700
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.3955087661743164,
      "learning_rate": 2.5724789915966387e-05,
      "loss": 1.2397,
      "step": 725
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.4096736013889313,
      "learning_rate": 2.546218487394958e-05,
      "loss": 1.229,
      "step": 750
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.3845139741897583,
      "learning_rate": 2.5199579831932773e-05,
      "loss": 1.2314,
      "step": 775
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.4077882170677185,
      "learning_rate": 2.4936974789915968e-05,
      "loss": 1.2219,
      "step": 800
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.36021721363067627,
      "learning_rate": 2.467436974789916e-05,
      "loss": 1.234,
      "step": 825
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.36913222074508667,
      "learning_rate": 2.4411764705882354e-05,
      "loss": 1.1998,
      "step": 850
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.35471582412719727,
      "learning_rate": 2.414915966386555e-05,
      "loss": 1.1988,
      "step": 875
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.3558790683746338,
      "learning_rate": 2.3886554621848737e-05,
      "loss": 1.2106,
      "step": 900
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.36467084288597107,
      "learning_rate": 2.362394957983193e-05,
      "loss": 1.1717,
      "step": 925
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.381874680519104,
      "learning_rate": 2.3361344537815126e-05,
      "loss": 1.1896,
      "step": 950
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.3758748769760132,
      "learning_rate": 2.309873949579832e-05,
      "loss": 1.1712,
      "step": 975
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.35793235898017883,
      "learning_rate": 2.2836134453781513e-05,
      "loss": 1.1389,
      "step": 1000
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.44111478328704834,
      "learning_rate": 2.2573529411764707e-05,
      "loss": 1.1726,
      "step": 1025
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.3741939663887024,
      "learning_rate": 2.2310924369747902e-05,
      "loss": 1.1607,
      "step": 1050
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.3894720673561096,
      "learning_rate": 2.2048319327731093e-05,
      "loss": 1.2186,
      "step": 1075
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.3636987805366516,
      "learning_rate": 2.1785714285714285e-05,
      "loss": 1.1376,
      "step": 1100
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.42893752455711365,
      "learning_rate": 2.152310924369748e-05,
      "loss": 1.158,
      "step": 1125
    },
    {
      "epoch": 2.17,
      "grad_norm": 0.3795158863067627,
      "learning_rate": 2.1260504201680674e-05,
      "loss": 1.1574,
      "step": 1150
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.36902275681495667,
      "learning_rate": 2.0997899159663866e-05,
      "loss": 1.1523,
      "step": 1175
    },
    {
      "epoch": 2.27,
      "grad_norm": 0.431219220161438,
      "learning_rate": 2.073529411764706e-05,
      "loss": 1.1433,
      "step": 1200
    },
    {
      "epoch": 2.31,
      "grad_norm": 0.4199659824371338,
      "learning_rate": 2.0472689075630252e-05,
      "loss": 1.1481,
      "step": 1225
    },
    {
      "epoch": 2.36,
      "grad_norm": 0.6324878334999084,
      "learning_rate": 2.0210084033613447e-05,
      "loss": 1.1526,
      "step": 1250
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.523536205291748,
      "learning_rate": 1.9947478991596638e-05,
      "loss": 1.1216,
      "step": 1275
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.5140235424041748,
      "learning_rate": 1.9684873949579833e-05,
      "loss": 1.1539,
      "step": 1300
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.3695720136165619,
      "learning_rate": 1.9422268907563027e-05,
      "loss": 1.1666,
      "step": 1325
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.4080689251422882,
      "learning_rate": 1.915966386554622e-05,
      "loss": 1.1037,
      "step": 1350
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.35790908336639404,
      "learning_rate": 1.889705882352941e-05,
      "loss": 1.136,
      "step": 1375
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.42846861481666565,
      "learning_rate": 1.8634453781512605e-05,
      "loss": 1.1325,
      "step": 1400
    },
    {
      "epoch": 2.69,
      "grad_norm": 0.37662366032600403,
      "learning_rate": 1.83718487394958e-05,
      "loss": 1.1439,
      "step": 1425
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.4963545501232147,
      "learning_rate": 1.810924369747899e-05,
      "loss": 1.1701,
      "step": 1450
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.4511197507381439,
      "learning_rate": 1.7846638655462186e-05,
      "loss": 1.1338,
      "step": 1475
    },
    {
      "epoch": 2.83,
      "grad_norm": 0.44771987199783325,
      "learning_rate": 1.758403361344538e-05,
      "loss": 1.1021,
      "step": 1500
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.4158724248409271,
      "learning_rate": 1.7321428571428572e-05,
      "loss": 1.094,
      "step": 1525
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.43490564823150635,
      "learning_rate": 1.7058823529411763e-05,
      "loss": 1.1154,
      "step": 1550
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.4746383726596832,
      "learning_rate": 1.6796218487394958e-05,
      "loss": 1.1311,
      "step": 1575
    },
    {
      "epoch": 3.02,
      "grad_norm": 0.4157463312149048,
      "learning_rate": 1.6533613445378153e-05,
      "loss": 1.1202,
      "step": 1600
    },
    {
      "epoch": 3.07,
      "grad_norm": 0.38272300362586975,
      "learning_rate": 1.6271008403361344e-05,
      "loss": 1.1173,
      "step": 1625
    },
    {
      "epoch": 3.12,
      "grad_norm": 0.5032052397727966,
      "learning_rate": 1.600840336134454e-05,
      "loss": 1.1313,
      "step": 1650
    },
    {
      "epoch": 3.16,
      "grad_norm": 0.3842039704322815,
      "learning_rate": 1.5745798319327734e-05,
      "loss": 1.0984,
      "step": 1675
    },
    {
      "epoch": 3.21,
      "grad_norm": 0.43160513043403625,
      "learning_rate": 1.5483193277310925e-05,
      "loss": 1.1108,
      "step": 1700
    },
    {
      "epoch": 3.26,
      "grad_norm": 0.420173704624176,
      "learning_rate": 1.5220588235294118e-05,
      "loss": 1.144,
      "step": 1725
    },
    {
      "epoch": 3.31,
      "grad_norm": 0.43490853905677795,
      "learning_rate": 1.4957983193277311e-05,
      "loss": 1.0752,
      "step": 1750
    },
    {
      "epoch": 3.35,
      "grad_norm": 0.45708540081977844,
      "learning_rate": 1.4695378151260504e-05,
      "loss": 1.1447,
      "step": 1775
    },
    {
      "epoch": 3.4,
      "grad_norm": 0.417322039604187,
      "learning_rate": 1.4432773109243699e-05,
      "loss": 1.102,
      "step": 1800
    },
    {
      "epoch": 3.45,
      "grad_norm": 0.4371644854545593,
      "learning_rate": 1.417016806722689e-05,
      "loss": 1.1473,
      "step": 1825
    },
    {
      "epoch": 3.49,
      "grad_norm": 0.4273310899734497,
      "learning_rate": 1.3907563025210085e-05,
      "loss": 1.0967,
      "step": 1850
    },
    {
      "epoch": 3.54,
      "grad_norm": 0.5089781880378723,
      "learning_rate": 1.3644957983193278e-05,
      "loss": 1.1297,
      "step": 1875
    },
    {
      "epoch": 3.59,
      "grad_norm": 0.48617228865623474,
      "learning_rate": 1.3382352941176471e-05,
      "loss": 1.0955,
      "step": 1900
    },
    {
      "epoch": 3.64,
      "grad_norm": 0.4370473623275757,
      "learning_rate": 1.3119747899159664e-05,
      "loss": 1.0791,
      "step": 1925
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.4495941400527954,
      "learning_rate": 1.2857142857142857e-05,
      "loss": 1.0648,
      "step": 1950
    },
    {
      "epoch": 3.73,
      "grad_norm": 0.4138700067996979,
      "learning_rate": 1.259453781512605e-05,
      "loss": 1.0948,
      "step": 1975
    },
    {
      "epoch": 3.78,
      "grad_norm": 0.4161551296710968,
      "learning_rate": 1.2331932773109243e-05,
      "loss": 1.0947,
      "step": 2000
    },
    {
      "epoch": 3.82,
      "grad_norm": 0.3938988745212555,
      "learning_rate": 1.2069327731092438e-05,
      "loss": 1.0863,
      "step": 2025
    },
    {
      "epoch": 3.87,
      "grad_norm": 0.44733569025993347,
      "learning_rate": 1.180672268907563e-05,
      "loss": 1.1015,
      "step": 2050
    },
    {
      "epoch": 3.92,
      "grad_norm": 0.4151917099952698,
      "learning_rate": 1.1544117647058824e-05,
      "loss": 1.0817,
      "step": 2075
    },
    {
      "epoch": 3.97,
      "grad_norm": 0.45207536220550537,
      "learning_rate": 1.1281512605042017e-05,
      "loss": 1.0935,
      "step": 2100
    },
    {
      "epoch": 4.01,
      "grad_norm": 0.43334582448005676,
      "learning_rate": 1.1018907563025212e-05,
      "loss": 1.0843,
      "step": 2125
    },
    {
      "epoch": 4.06,
      "grad_norm": 0.44301116466522217,
      "learning_rate": 1.0756302521008403e-05,
      "loss": 1.0617,
      "step": 2150
    },
    {
      "epoch": 4.11,
      "grad_norm": 0.42584851384162903,
      "learning_rate": 1.0493697478991596e-05,
      "loss": 1.102,
      "step": 2175
    },
    {
      "epoch": 4.15,
      "grad_norm": 0.46070751547813416,
      "learning_rate": 1.0231092436974791e-05,
      "loss": 1.0943,
      "step": 2200
    },
    {
      "epoch": 4.2,
      "grad_norm": 0.43757393956184387,
      "learning_rate": 9.968487394957983e-06,
      "loss": 1.082,
      "step": 2225
    },
    {
      "epoch": 4.25,
      "grad_norm": 0.43552663922309875,
      "learning_rate": 9.705882352941177e-06,
      "loss": 1.1033,
      "step": 2250
    },
    {
      "epoch": 4.3,
      "grad_norm": 0.44868725538253784,
      "learning_rate": 9.44327731092437e-06,
      "loss": 1.0912,
      "step": 2275
    },
    {
      "epoch": 4.34,
      "grad_norm": 0.43542513251304626,
      "learning_rate": 9.180672268907563e-06,
      "loss": 1.1113,
      "step": 2300
    },
    {
      "epoch": 4.39,
      "grad_norm": 0.47481635212898254,
      "learning_rate": 8.918067226890756e-06,
      "loss": 1.0455,
      "step": 2325
    },
    {
      "epoch": 4.44,
      "grad_norm": 0.46137455105781555,
      "learning_rate": 8.65546218487395e-06,
      "loss": 1.0898,
      "step": 2350
    },
    {
      "epoch": 4.49,
      "grad_norm": 0.4473894536495209,
      "learning_rate": 8.392857142857143e-06,
      "loss": 1.0836,
      "step": 2375
    },
    {
      "epoch": 4.53,
      "grad_norm": 0.39784467220306396,
      "learning_rate": 8.130252100840336e-06,
      "loss": 1.0629,
      "step": 2400
    },
    {
      "epoch": 4.58,
      "grad_norm": 0.48481184244155884,
      "learning_rate": 7.86764705882353e-06,
      "loss": 1.1173,
      "step": 2425
    },
    {
      "epoch": 4.63,
      "grad_norm": 0.485196590423584,
      "learning_rate": 7.605042016806723e-06,
      "loss": 1.0673,
      "step": 2450
    },
    {
      "epoch": 4.67,
      "grad_norm": 0.5114961266517639,
      "learning_rate": 7.342436974789916e-06,
      "loss": 1.0877,
      "step": 2475
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.4506637752056122,
      "learning_rate": 7.07983193277311e-06,
      "loss": 1.0995,
      "step": 2500
    },
    {
      "epoch": 4.77,
      "grad_norm": 0.45109784603118896,
      "learning_rate": 6.817226890756303e-06,
      "loss": 1.0819,
      "step": 2525
    },
    {
      "epoch": 4.82,
      "grad_norm": 0.4272564947605133,
      "learning_rate": 6.554621848739496e-06,
      "loss": 1.1109,
      "step": 2550
    },
    {
      "epoch": 4.86,
      "grad_norm": 0.4301404058933258,
      "learning_rate": 6.29201680672269e-06,
      "loss": 1.0738,
      "step": 2575
    },
    {
      "epoch": 4.91,
      "grad_norm": 0.49940961599349976,
      "learning_rate": 6.029411764705883e-06,
      "loss": 1.0865,
      "step": 2600
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.41319113969802856,
      "learning_rate": 5.7773109243697485e-06,
      "loss": 1.0535,
      "step": 2625
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.4326096773147583,
      "learning_rate": 5.5147058823529415e-06,
      "loss": 1.0745,
      "step": 2650
    },
    {
      "epoch": 5.05,
      "grad_norm": 0.4360290765762329,
      "learning_rate": 5.252100840336135e-06,
      "loss": 1.0745,
      "step": 2675
    },
    {
      "epoch": 5.1,
      "grad_norm": 0.42354682087898254,
      "learning_rate": 4.989495798319328e-06,
      "loss": 1.0685,
      "step": 2700
    },
    {
      "epoch": 5.15,
      "grad_norm": 0.49250248074531555,
      "learning_rate": 4.726890756302521e-06,
      "loss": 1.0841,
      "step": 2725
    },
    {
      "epoch": 5.19,
      "grad_norm": 0.4505230784416199,
      "learning_rate": 4.464285714285715e-06,
      "loss": 1.0935,
      "step": 2750
    },
    {
      "epoch": 5.24,
      "grad_norm": 0.41872066259384155,
      "learning_rate": 4.201680672268908e-06,
      "loss": 1.0827,
      "step": 2775
    },
    {
      "epoch": 5.29,
      "grad_norm": 0.45635831356048584,
      "learning_rate": 3.939075630252101e-06,
      "loss": 1.0973,
      "step": 2800
    },
    {
      "epoch": 5.34,
      "grad_norm": 0.49893826246261597,
      "learning_rate": 3.6764705882352942e-06,
      "loss": 1.0859,
      "step": 2825
    },
    {
      "epoch": 5.38,
      "grad_norm": 0.5377572774887085,
      "learning_rate": 3.4138655462184873e-06,
      "loss": 1.088,
      "step": 2850
    },
    {
      "epoch": 5.43,
      "grad_norm": 0.45102909207344055,
      "learning_rate": 3.1512605042016808e-06,
      "loss": 1.0875,
      "step": 2875
    },
    {
      "epoch": 5.48,
      "grad_norm": 0.3922051191329956,
      "learning_rate": 2.8886554621848742e-06,
      "loss": 1.0708,
      "step": 2900
    },
    {
      "epoch": 5.52,
      "grad_norm": 0.4416084289550781,
      "learning_rate": 2.6260504201680673e-06,
      "loss": 1.0816,
      "step": 2925
    },
    {
      "epoch": 5.57,
      "grad_norm": 0.5171985626220703,
      "learning_rate": 2.3634453781512604e-06,
      "loss": 1.0859,
      "step": 2950
    },
    {
      "epoch": 5.62,
      "grad_norm": 0.4239521920681,
      "learning_rate": 2.100840336134454e-06,
      "loss": 1.0387,
      "step": 2975
    },
    {
      "epoch": 5.67,
      "grad_norm": 0.5627429485321045,
      "learning_rate": 1.8382352941176471e-06,
      "loss": 1.0818,
      "step": 3000
    },
    {
      "epoch": 5.71,
      "grad_norm": 0.4605351686477661,
      "learning_rate": 1.5756302521008404e-06,
      "loss": 1.0637,
      "step": 3025
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.40121838450431824,
      "learning_rate": 1.3130252100840336e-06,
      "loss": 1.039,
      "step": 3050
    },
    {
      "epoch": 5.81,
      "grad_norm": 0.45940887928009033,
      "learning_rate": 1.050420168067227e-06,
      "loss": 1.0434,
      "step": 3075
    },
    {
      "epoch": 5.85,
      "grad_norm": 0.4496408998966217,
      "learning_rate": 7.878151260504202e-07,
      "loss": 1.1024,
      "step": 3100
    },
    {
      "epoch": 5.9,
      "grad_norm": 0.4458378553390503,
      "learning_rate": 5.252100840336135e-07,
      "loss": 1.0948,
      "step": 3125
    },
    {
      "epoch": 5.95,
      "grad_norm": 0.49208617210388184,
      "learning_rate": 2.6260504201680673e-07,
      "loss": 1.0673,
      "step": 3150
    }
  ],
  "logging_steps": 25,
  "max_steps": 3174,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "total_flos": 1.3634839262527488e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}