{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9963285024154587, "eval_steps": 500, "global_step": 1938, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015458937198067632, "grad_norm": 16.53294779967864, "learning_rate": 5e-06, "loss": 1.0792, "step": 10 }, { "epoch": 0.030917874396135265, "grad_norm": 2.649325227711683, "learning_rate": 5e-06, "loss": 0.9837, "step": 20 }, { "epoch": 0.0463768115942029, "grad_norm": 8.284136228811613, "learning_rate": 5e-06, "loss": 0.9474, "step": 30 }, { "epoch": 0.06183574879227053, "grad_norm": 2.524509268818302, "learning_rate": 5e-06, "loss": 0.9301, "step": 40 }, { "epoch": 0.07729468599033816, "grad_norm": 1.5326850741391629, "learning_rate": 5e-06, "loss": 0.9155, "step": 50 }, { "epoch": 0.0927536231884058, "grad_norm": 1.0781869595697948, "learning_rate": 5e-06, "loss": 0.9037, "step": 60 }, { "epoch": 0.10821256038647344, "grad_norm": 0.8943372564864077, "learning_rate": 5e-06, "loss": 0.8967, "step": 70 }, { "epoch": 0.12367149758454106, "grad_norm": 0.8868808075595438, "learning_rate": 5e-06, "loss": 0.8867, "step": 80 }, { "epoch": 0.1391304347826087, "grad_norm": 0.6755190824568067, "learning_rate": 5e-06, "loss": 0.8792, "step": 90 }, { "epoch": 0.15458937198067632, "grad_norm": 0.6359486667796315, "learning_rate": 5e-06, "loss": 0.8769, "step": 100 }, { "epoch": 0.17004830917874397, "grad_norm": 0.6299873717513695, "learning_rate": 5e-06, "loss": 0.8717, "step": 110 }, { "epoch": 0.1855072463768116, "grad_norm": 0.5832655638295026, "learning_rate": 5e-06, "loss": 0.8671, "step": 120 }, { "epoch": 0.20096618357487922, "grad_norm": 0.5586732401198556, "learning_rate": 5e-06, "loss": 0.8642, "step": 130 }, { "epoch": 0.21642512077294687, "grad_norm": 0.6759403203697444, "learning_rate": 5e-06, "loss": 0.863, "step": 140 }, { "epoch": 0.2318840579710145, "grad_norm": 0.5234393167054423, "learning_rate": 5e-06, "loss": 0.861, "step": 150 }, { "epoch": 0.24734299516908212, "grad_norm": 0.5405697501688478, "learning_rate": 5e-06, "loss": 0.864, "step": 160 }, { "epoch": 0.26280193236714977, "grad_norm": 0.5609477995758988, "learning_rate": 5e-06, "loss": 0.8528, "step": 170 }, { "epoch": 0.2782608695652174, "grad_norm": 0.648827500892738, "learning_rate": 5e-06, "loss": 0.857, "step": 180 }, { "epoch": 0.293719806763285, "grad_norm": 0.6627079853918527, "learning_rate": 5e-06, "loss": 0.8511, "step": 190 }, { "epoch": 0.30917874396135264, "grad_norm": 0.6915034637639949, "learning_rate": 5e-06, "loss": 0.8485, "step": 200 }, { "epoch": 0.32463768115942027, "grad_norm": 0.6366893171242987, "learning_rate": 5e-06, "loss": 0.8462, "step": 210 }, { "epoch": 0.34009661835748795, "grad_norm": 0.5070708715215638, "learning_rate": 5e-06, "loss": 0.8446, "step": 220 }, { "epoch": 0.35555555555555557, "grad_norm": 0.5871716320705027, "learning_rate": 5e-06, "loss": 0.8445, "step": 230 }, { "epoch": 0.3710144927536232, "grad_norm": 0.7730510671832644, "learning_rate": 5e-06, "loss": 0.8497, "step": 240 }, { "epoch": 0.3864734299516908, "grad_norm": 0.7142598054734569, "learning_rate": 5e-06, "loss": 0.8438, "step": 250 }, { "epoch": 0.40193236714975844, "grad_norm": 0.5417070693738956, "learning_rate": 5e-06, "loss": 0.8476, "step": 260 }, { "epoch": 0.41739130434782606, "grad_norm": 0.6163533652451042, "learning_rate": 5e-06, "loss": 0.8442, "step": 270 }, { "epoch": 0.43285024154589374, "grad_norm": 0.5863440757370957, "learning_rate": 5e-06, "loss": 0.8384, "step": 280 }, { "epoch": 0.44830917874396137, "grad_norm": 0.6841438022567938, "learning_rate": 5e-06, "loss": 0.8416, "step": 290 }, { "epoch": 0.463768115942029, "grad_norm": 0.5830887688966261, "learning_rate": 5e-06, "loss": 0.8363, "step": 300 }, { "epoch": 0.4792270531400966, "grad_norm": 0.7565147804535631, "learning_rate": 5e-06, "loss": 0.8391, "step": 310 }, { "epoch": 0.49468599033816424, "grad_norm": 0.5377296055296723, "learning_rate": 5e-06, "loss": 0.8384, "step": 320 }, { "epoch": 0.5101449275362319, "grad_norm": 0.6313225350990711, "learning_rate": 5e-06, "loss": 0.8344, "step": 330 }, { "epoch": 0.5256038647342995, "grad_norm": 0.6011085474374273, "learning_rate": 5e-06, "loss": 0.8352, "step": 340 }, { "epoch": 0.5410628019323671, "grad_norm": 0.6313718464958992, "learning_rate": 5e-06, "loss": 0.8319, "step": 350 }, { "epoch": 0.5565217391304348, "grad_norm": 0.5602186929251594, "learning_rate": 5e-06, "loss": 0.8305, "step": 360 }, { "epoch": 0.5719806763285025, "grad_norm": 0.5682159814727703, "learning_rate": 5e-06, "loss": 0.8283, "step": 370 }, { "epoch": 0.58743961352657, "grad_norm": 0.5718556832144389, "learning_rate": 5e-06, "loss": 0.8341, "step": 380 }, { "epoch": 0.6028985507246377, "grad_norm": 0.546768373760242, "learning_rate": 5e-06, "loss": 0.8291, "step": 390 }, { "epoch": 0.6183574879227053, "grad_norm": 0.5993423126379529, "learning_rate": 5e-06, "loss": 0.8361, "step": 400 }, { "epoch": 0.633816425120773, "grad_norm": 0.5491459229199431, "learning_rate": 5e-06, "loss": 0.8291, "step": 410 }, { "epoch": 0.6492753623188405, "grad_norm": 0.5168339143544802, "learning_rate": 5e-06, "loss": 0.83, "step": 420 }, { "epoch": 0.6647342995169082, "grad_norm": 0.5210184948415354, "learning_rate": 5e-06, "loss": 0.8237, "step": 430 }, { "epoch": 0.6801932367149759, "grad_norm": 0.5424122263787127, "learning_rate": 5e-06, "loss": 0.8228, "step": 440 }, { "epoch": 0.6956521739130435, "grad_norm": 0.5637417843194678, "learning_rate": 5e-06, "loss": 0.829, "step": 450 }, { "epoch": 0.7111111111111111, "grad_norm": 0.4743888435043112, "learning_rate": 5e-06, "loss": 0.8221, "step": 460 }, { "epoch": 0.7265700483091787, "grad_norm": 0.5224626427829437, "learning_rate": 5e-06, "loss": 0.8254, "step": 470 }, { "epoch": 0.7420289855072464, "grad_norm": 0.6508696496425336, "learning_rate": 5e-06, "loss": 0.8264, "step": 480 }, { "epoch": 0.7574879227053141, "grad_norm": 0.5904629481154552, "learning_rate": 5e-06, "loss": 0.8295, "step": 490 }, { "epoch": 0.7729468599033816, "grad_norm": 0.574388340778744, "learning_rate": 5e-06, "loss": 0.8223, "step": 500 }, { "epoch": 0.7884057971014493, "grad_norm": 0.5353158279457252, "learning_rate": 5e-06, "loss": 0.8257, "step": 510 }, { "epoch": 0.8038647342995169, "grad_norm": 0.5821234180105461, "learning_rate": 5e-06, "loss": 0.8265, "step": 520 }, { "epoch": 0.8193236714975846, "grad_norm": 0.5068894400573091, "learning_rate": 5e-06, "loss": 0.8224, "step": 530 }, { "epoch": 0.8347826086956521, "grad_norm": 0.5375828449207095, "learning_rate": 5e-06, "loss": 0.821, "step": 540 }, { "epoch": 0.8502415458937198, "grad_norm": 0.5410980089439529, "learning_rate": 5e-06, "loss": 0.8271, "step": 550 }, { "epoch": 0.8657004830917875, "grad_norm": 0.5953566167479901, "learning_rate": 5e-06, "loss": 0.8168, "step": 560 }, { "epoch": 0.881159420289855, "grad_norm": 0.6599677690788177, "learning_rate": 5e-06, "loss": 0.8234, "step": 570 }, { "epoch": 0.8966183574879227, "grad_norm": 0.6548750016255851, "learning_rate": 5e-06, "loss": 0.8198, "step": 580 }, { "epoch": 0.9120772946859903, "grad_norm": 0.6314610392042965, "learning_rate": 5e-06, "loss": 0.8169, "step": 590 }, { "epoch": 0.927536231884058, "grad_norm": 0.4920135279723095, "learning_rate": 5e-06, "loss": 0.8225, "step": 600 }, { "epoch": 0.9429951690821256, "grad_norm": 0.5279136309564921, "learning_rate": 5e-06, "loss": 0.8188, "step": 610 }, { "epoch": 0.9584541062801932, "grad_norm": 0.5531816471285108, "learning_rate": 5e-06, "loss": 0.8217, "step": 620 }, { "epoch": 0.9739130434782609, "grad_norm": 0.5924801838934433, "learning_rate": 5e-06, "loss": 0.8221, "step": 630 }, { "epoch": 0.9893719806763285, "grad_norm": 0.5117042192273262, "learning_rate": 5e-06, "loss": 0.8188, "step": 640 }, { "epoch": 0.9986473429951691, "eval_loss": 0.8183467984199524, "eval_runtime": 686.568, "eval_samples_per_second": 25.389, "eval_steps_per_second": 0.398, "step": 646 }, { "epoch": 1.0050241545893719, "grad_norm": 0.6925909025947767, "learning_rate": 5e-06, "loss": 0.8235, "step": 650 }, { "epoch": 1.0204830917874397, "grad_norm": 0.6132973767615294, "learning_rate": 5e-06, "loss": 0.7771, "step": 660 }, { "epoch": 1.0359420289855072, "grad_norm": 0.6797848846411009, "learning_rate": 5e-06, "loss": 0.7802, "step": 670 }, { "epoch": 1.0514009661835748, "grad_norm": 0.5295735808202817, "learning_rate": 5e-06, "loss": 0.7777, "step": 680 }, { "epoch": 1.0668599033816426, "grad_norm": 0.5271721002677758, "learning_rate": 5e-06, "loss": 0.7751, "step": 690 }, { "epoch": 1.0823188405797102, "grad_norm": 0.47521281338293253, "learning_rate": 5e-06, "loss": 0.7808, "step": 700 }, { "epoch": 1.0977777777777777, "grad_norm": 0.5201403409762577, "learning_rate": 5e-06, "loss": 0.7769, "step": 710 }, { "epoch": 1.1132367149758453, "grad_norm": 0.5374055398678584, "learning_rate": 5e-06, "loss": 0.7775, "step": 720 }, { "epoch": 1.128695652173913, "grad_norm": 0.520683864449963, "learning_rate": 5e-06, "loss": 0.7787, "step": 730 }, { "epoch": 1.1441545893719807, "grad_norm": 0.5406489528118505, "learning_rate": 5e-06, "loss": 0.7816, "step": 740 }, { "epoch": 1.1596135265700482, "grad_norm": 0.585881797178412, "learning_rate": 5e-06, "loss": 0.7811, "step": 750 }, { "epoch": 1.175072463768116, "grad_norm": 0.5490222258224376, "learning_rate": 5e-06, "loss": 0.7763, "step": 760 }, { "epoch": 1.1905314009661836, "grad_norm": 0.6049557272461074, "learning_rate": 5e-06, "loss": 0.7821, "step": 770 }, { "epoch": 1.2059903381642512, "grad_norm": 0.6287813068938076, "learning_rate": 5e-06, "loss": 0.7771, "step": 780 }, { "epoch": 1.221449275362319, "grad_norm": 0.5791771698431348, "learning_rate": 5e-06, "loss": 0.7832, "step": 790 }, { "epoch": 1.2369082125603865, "grad_norm": 0.552647068072239, "learning_rate": 5e-06, "loss": 0.7795, "step": 800 }, { "epoch": 1.252367149758454, "grad_norm": 0.48953182542010515, "learning_rate": 5e-06, "loss": 0.7767, "step": 810 }, { "epoch": 1.2678260869565217, "grad_norm": 0.5809037976182655, "learning_rate": 5e-06, "loss": 0.7784, "step": 820 }, { "epoch": 1.2832850241545894, "grad_norm": 0.49664609280994976, "learning_rate": 5e-06, "loss": 0.7765, "step": 830 }, { "epoch": 1.298743961352657, "grad_norm": 0.5514267021897065, "learning_rate": 5e-06, "loss": 0.7791, "step": 840 }, { "epoch": 1.3142028985507246, "grad_norm": 0.6174163379347436, "learning_rate": 5e-06, "loss": 0.7775, "step": 850 }, { "epoch": 1.3296618357487922, "grad_norm": 0.5893029009867757, "learning_rate": 5e-06, "loss": 0.7743, "step": 860 }, { "epoch": 1.34512077294686, "grad_norm": 0.5884521899466931, "learning_rate": 5e-06, "loss": 0.7768, "step": 870 }, { "epoch": 1.3605797101449275, "grad_norm": 0.526781782612563, "learning_rate": 5e-06, "loss": 0.773, "step": 880 }, { "epoch": 1.376038647342995, "grad_norm": 0.5133541303046, "learning_rate": 5e-06, "loss": 0.774, "step": 890 }, { "epoch": 1.3914975845410629, "grad_norm": 0.5514217537787884, "learning_rate": 5e-06, "loss": 0.7802, "step": 900 }, { "epoch": 1.4069565217391304, "grad_norm": 0.5829849669974898, "learning_rate": 5e-06, "loss": 0.7787, "step": 910 }, { "epoch": 1.422415458937198, "grad_norm": 0.6099035981973764, "learning_rate": 5e-06, "loss": 0.7738, "step": 920 }, { "epoch": 1.4378743961352658, "grad_norm": 0.4767884324426242, "learning_rate": 5e-06, "loss": 0.7773, "step": 930 }, { "epoch": 1.4533333333333334, "grad_norm": 0.5611337081061908, "learning_rate": 5e-06, "loss": 0.7767, "step": 940 }, { "epoch": 1.468792270531401, "grad_norm": 0.47930773858272085, "learning_rate": 5e-06, "loss": 0.7765, "step": 950 }, { "epoch": 1.4842512077294687, "grad_norm": 0.498168257215718, "learning_rate": 5e-06, "loss": 0.7728, "step": 960 }, { "epoch": 1.4997101449275363, "grad_norm": 0.5576989172567428, "learning_rate": 5e-06, "loss": 0.7777, "step": 970 }, { "epoch": 1.5151690821256039, "grad_norm": 0.5873903650866855, "learning_rate": 5e-06, "loss": 0.7747, "step": 980 }, { "epoch": 1.5306280193236717, "grad_norm": 0.5564865473674926, "learning_rate": 5e-06, "loss": 0.7786, "step": 990 }, { "epoch": 1.546086956521739, "grad_norm": 0.6746662280932265, "learning_rate": 5e-06, "loss": 0.7823, "step": 1000 }, { "epoch": 1.5615458937198068, "grad_norm": 0.550553366091711, "learning_rate": 5e-06, "loss": 0.7704, "step": 1010 }, { "epoch": 1.5770048309178744, "grad_norm": 0.555996816403915, "learning_rate": 5e-06, "loss": 0.7758, "step": 1020 }, { "epoch": 1.592463768115942, "grad_norm": 0.5621088990135378, "learning_rate": 5e-06, "loss": 0.7751, "step": 1030 }, { "epoch": 1.6079227053140097, "grad_norm": 0.4672348676970037, "learning_rate": 5e-06, "loss": 0.7742, "step": 1040 }, { "epoch": 1.6233816425120773, "grad_norm": 0.49112359521062937, "learning_rate": 5e-06, "loss": 0.777, "step": 1050 }, { "epoch": 1.6388405797101449, "grad_norm": 0.5517626252028611, "learning_rate": 5e-06, "loss": 0.7757, "step": 1060 }, { "epoch": 1.6542995169082126, "grad_norm": 0.5518129870744243, "learning_rate": 5e-06, "loss": 0.7744, "step": 1070 }, { "epoch": 1.6697584541062802, "grad_norm": 0.685405898117341, "learning_rate": 5e-06, "loss": 0.7753, "step": 1080 }, { "epoch": 1.6852173913043478, "grad_norm": 0.5720673042328214, "learning_rate": 5e-06, "loss": 0.7753, "step": 1090 }, { "epoch": 1.7006763285024156, "grad_norm": 0.4690028175072265, "learning_rate": 5e-06, "loss": 0.774, "step": 1100 }, { "epoch": 1.7161352657004831, "grad_norm": 0.55568178811657, "learning_rate": 5e-06, "loss": 0.7772, "step": 1110 }, { "epoch": 1.7315942028985507, "grad_norm": 0.5185570493500619, "learning_rate": 5e-06, "loss": 0.781, "step": 1120 }, { "epoch": 1.7470531400966185, "grad_norm": 0.5292299708932318, "learning_rate": 5e-06, "loss": 0.7749, "step": 1130 }, { "epoch": 1.7625120772946858, "grad_norm": 0.5884058161213621, "learning_rate": 5e-06, "loss": 0.7719, "step": 1140 }, { "epoch": 1.7779710144927536, "grad_norm": 0.5072506431239099, "learning_rate": 5e-06, "loss": 0.7753, "step": 1150 }, { "epoch": 1.7934299516908214, "grad_norm": 0.5551938392960334, "learning_rate": 5e-06, "loss": 0.7777, "step": 1160 }, { "epoch": 1.8088888888888888, "grad_norm": 0.5566487078925049, "learning_rate": 5e-06, "loss": 0.7774, "step": 1170 }, { "epoch": 1.8243478260869566, "grad_norm": 0.4749917546235466, "learning_rate": 5e-06, "loss": 0.7734, "step": 1180 }, { "epoch": 1.8398067632850241, "grad_norm": 0.5022635709311233, "learning_rate": 5e-06, "loss": 0.7743, "step": 1190 }, { "epoch": 1.8552657004830917, "grad_norm": 0.5442982810099344, "learning_rate": 5e-06, "loss": 0.7728, "step": 1200 }, { "epoch": 1.8707246376811595, "grad_norm": 0.5155014433123901, "learning_rate": 5e-06, "loss": 0.774, "step": 1210 }, { "epoch": 1.886183574879227, "grad_norm": 0.5934285413681538, "learning_rate": 5e-06, "loss": 0.7746, "step": 1220 }, { "epoch": 1.9016425120772946, "grad_norm": 0.5260175972638601, "learning_rate": 5e-06, "loss": 0.7693, "step": 1230 }, { "epoch": 1.9171014492753624, "grad_norm": 0.515080447493818, "learning_rate": 5e-06, "loss": 0.7717, "step": 1240 }, { "epoch": 1.93256038647343, "grad_norm": 0.6011160845737209, "learning_rate": 5e-06, "loss": 0.7754, "step": 1250 }, { "epoch": 1.9480193236714975, "grad_norm": 0.46061302659355685, "learning_rate": 5e-06, "loss": 0.7755, "step": 1260 }, { "epoch": 1.9634782608695653, "grad_norm": 0.46039484020056154, "learning_rate": 5e-06, "loss": 0.7722, "step": 1270 }, { "epoch": 1.9789371980676327, "grad_norm": 0.5658493454639554, "learning_rate": 5e-06, "loss": 0.7755, "step": 1280 }, { "epoch": 1.9943961352657005, "grad_norm": 0.5908199178180503, "learning_rate": 5e-06, "loss": 0.7709, "step": 1290 }, { "epoch": 1.9990338164251207, "eval_loss": 0.8051349520683289, "eval_runtime": 690.0891, "eval_samples_per_second": 25.259, "eval_steps_per_second": 0.396, "step": 1293 }, { "epoch": 2.0100483091787438, "grad_norm": 0.6943407863572525, "learning_rate": 5e-06, "loss": 0.7604, "step": 1300 }, { "epoch": 2.0255072463768116, "grad_norm": 0.5840764739328596, "learning_rate": 5e-06, "loss": 0.7299, "step": 1310 }, { "epoch": 2.0409661835748794, "grad_norm": 0.645835170219903, "learning_rate": 5e-06, "loss": 0.7254, "step": 1320 }, { "epoch": 2.0564251207729467, "grad_norm": 0.6967100498978297, "learning_rate": 5e-06, "loss": 0.7312, "step": 1330 }, { "epoch": 2.0718840579710145, "grad_norm": 0.5424795093750152, "learning_rate": 5e-06, "loss": 0.7283, "step": 1340 }, { "epoch": 2.0873429951690823, "grad_norm": 0.5651081335517218, "learning_rate": 5e-06, "loss": 0.7322, "step": 1350 }, { "epoch": 2.1028019323671496, "grad_norm": 0.5793019251125064, "learning_rate": 5e-06, "loss": 0.7317, "step": 1360 }, { "epoch": 2.1182608695652174, "grad_norm": 0.5653295937261641, "learning_rate": 5e-06, "loss": 0.7331, "step": 1370 }, { "epoch": 2.133719806763285, "grad_norm": 0.6945092784765529, "learning_rate": 5e-06, "loss": 0.7346, "step": 1380 }, { "epoch": 2.1491787439613526, "grad_norm": 0.5795163218543443, "learning_rate": 5e-06, "loss": 0.7336, "step": 1390 }, { "epoch": 2.1646376811594203, "grad_norm": 0.5922357321216497, "learning_rate": 5e-06, "loss": 0.7299, "step": 1400 }, { "epoch": 2.1800966183574877, "grad_norm": 0.5570557796263025, "learning_rate": 5e-06, "loss": 0.7333, "step": 1410 }, { "epoch": 2.1955555555555555, "grad_norm": 0.5392312450784695, "learning_rate": 5e-06, "loss": 0.7371, "step": 1420 }, { "epoch": 2.2110144927536233, "grad_norm": 0.569063560563541, "learning_rate": 5e-06, "loss": 0.7314, "step": 1430 }, { "epoch": 2.2264734299516906, "grad_norm": 0.6107660118171969, "learning_rate": 5e-06, "loss": 0.7322, "step": 1440 }, { "epoch": 2.2419323671497584, "grad_norm": 0.6566517138097786, "learning_rate": 5e-06, "loss": 0.7356, "step": 1450 }, { "epoch": 2.257391304347826, "grad_norm": 0.5806353609910259, "learning_rate": 5e-06, "loss": 0.7418, "step": 1460 }, { "epoch": 2.2728502415458935, "grad_norm": 0.544246667709765, "learning_rate": 5e-06, "loss": 0.7319, "step": 1470 }, { "epoch": 2.2883091787439613, "grad_norm": 0.5424208252581, "learning_rate": 5e-06, "loss": 0.7332, "step": 1480 }, { "epoch": 2.303768115942029, "grad_norm": 0.5380434193955503, "learning_rate": 5e-06, "loss": 0.7342, "step": 1490 }, { "epoch": 2.3192270531400965, "grad_norm": 0.5919093406358342, "learning_rate": 5e-06, "loss": 0.7345, "step": 1500 }, { "epoch": 2.3346859903381643, "grad_norm": 0.5815232359700448, "learning_rate": 5e-06, "loss": 0.7357, "step": 1510 }, { "epoch": 2.350144927536232, "grad_norm": 0.6561512544812266, "learning_rate": 5e-06, "loss": 0.7339, "step": 1520 }, { "epoch": 2.3656038647342994, "grad_norm": 0.5328952220385875, "learning_rate": 5e-06, "loss": 0.7297, "step": 1530 }, { "epoch": 2.381062801932367, "grad_norm": 0.5216733576185124, "learning_rate": 5e-06, "loss": 0.7298, "step": 1540 }, { "epoch": 2.396521739130435, "grad_norm": 0.6063067814678768, "learning_rate": 5e-06, "loss": 0.7368, "step": 1550 }, { "epoch": 2.4119806763285023, "grad_norm": 0.5818602690123681, "learning_rate": 5e-06, "loss": 0.7353, "step": 1560 }, { "epoch": 2.42743961352657, "grad_norm": 0.5913577701518534, "learning_rate": 5e-06, "loss": 0.7338, "step": 1570 }, { "epoch": 2.442898550724638, "grad_norm": 0.5527497540961946, "learning_rate": 5e-06, "loss": 0.7329, "step": 1580 }, { "epoch": 2.4583574879227053, "grad_norm": 0.6737570445790982, "learning_rate": 5e-06, "loss": 0.7367, "step": 1590 }, { "epoch": 2.473816425120773, "grad_norm": 0.6619470586787684, "learning_rate": 5e-06, "loss": 0.733, "step": 1600 }, { "epoch": 2.4892753623188404, "grad_norm": 0.4750068577638755, "learning_rate": 5e-06, "loss": 0.7375, "step": 1610 }, { "epoch": 2.504734299516908, "grad_norm": 0.6847743909506772, "learning_rate": 5e-06, "loss": 0.7374, "step": 1620 }, { "epoch": 2.520193236714976, "grad_norm": 0.5239840846624293, "learning_rate": 5e-06, "loss": 0.7311, "step": 1630 }, { "epoch": 2.5356521739130433, "grad_norm": 0.4721718835375596, "learning_rate": 5e-06, "loss": 0.7308, "step": 1640 }, { "epoch": 2.551111111111111, "grad_norm": 0.51093602092176, "learning_rate": 5e-06, "loss": 0.7337, "step": 1650 }, { "epoch": 2.566570048309179, "grad_norm": 0.5517386015611798, "learning_rate": 5e-06, "loss": 0.7318, "step": 1660 }, { "epoch": 2.5820289855072462, "grad_norm": 0.6326674619813268, "learning_rate": 5e-06, "loss": 0.736, "step": 1670 }, { "epoch": 2.597487922705314, "grad_norm": 0.5232840712675151, "learning_rate": 5e-06, "loss": 0.7325, "step": 1680 }, { "epoch": 2.6129468599033814, "grad_norm": 0.4969751533645812, "learning_rate": 5e-06, "loss": 0.7375, "step": 1690 }, { "epoch": 2.628405797101449, "grad_norm": 0.49538430512331766, "learning_rate": 5e-06, "loss": 0.7366, "step": 1700 }, { "epoch": 2.643864734299517, "grad_norm": 0.6208865012276192, "learning_rate": 5e-06, "loss": 0.7372, "step": 1710 }, { "epoch": 2.6593236714975843, "grad_norm": 0.5276942120485377, "learning_rate": 5e-06, "loss": 0.7351, "step": 1720 }, { "epoch": 2.674782608695652, "grad_norm": 0.570808842039396, "learning_rate": 5e-06, "loss": 0.7384, "step": 1730 }, { "epoch": 2.69024154589372, "grad_norm": 0.5214638213365278, "learning_rate": 5e-06, "loss": 0.7361, "step": 1740 }, { "epoch": 2.7057004830917872, "grad_norm": 0.5190586781651014, "learning_rate": 5e-06, "loss": 0.7309, "step": 1750 }, { "epoch": 2.721159420289855, "grad_norm": 0.5317230869170978, "learning_rate": 5e-06, "loss": 0.7296, "step": 1760 }, { "epoch": 2.736618357487923, "grad_norm": 0.5917255596181432, "learning_rate": 5e-06, "loss": 0.7406, "step": 1770 }, { "epoch": 2.75207729468599, "grad_norm": 0.49202576322983876, "learning_rate": 5e-06, "loss": 0.7324, "step": 1780 }, { "epoch": 2.767536231884058, "grad_norm": 0.5594574654106287, "learning_rate": 5e-06, "loss": 0.7331, "step": 1790 }, { "epoch": 2.7829951690821257, "grad_norm": 0.6198580773466541, "learning_rate": 5e-06, "loss": 0.7372, "step": 1800 }, { "epoch": 2.798454106280193, "grad_norm": 0.5740394274550438, "learning_rate": 5e-06, "loss": 0.7359, "step": 1810 }, { "epoch": 2.813913043478261, "grad_norm": 0.5501912428656768, "learning_rate": 5e-06, "loss": 0.7384, "step": 1820 }, { "epoch": 2.8293719806763287, "grad_norm": 0.5104778986757859, "learning_rate": 5e-06, "loss": 0.7324, "step": 1830 }, { "epoch": 2.844830917874396, "grad_norm": 0.5395220598812313, "learning_rate": 5e-06, "loss": 0.736, "step": 1840 }, { "epoch": 2.860289855072464, "grad_norm": 0.6030104859258091, "learning_rate": 5e-06, "loss": 0.7327, "step": 1850 }, { "epoch": 2.8757487922705316, "grad_norm": 0.556906171705928, "learning_rate": 5e-06, "loss": 0.7374, "step": 1860 }, { "epoch": 2.891207729468599, "grad_norm": 0.6174821846225631, "learning_rate": 5e-06, "loss": 0.7351, "step": 1870 }, { "epoch": 2.9066666666666667, "grad_norm": 0.5078906232420815, "learning_rate": 5e-06, "loss": 0.7326, "step": 1880 }, { "epoch": 2.9221256038647345, "grad_norm": 0.6177111487230912, "learning_rate": 5e-06, "loss": 0.7321, "step": 1890 }, { "epoch": 2.937584541062802, "grad_norm": 0.5520929737500946, "learning_rate": 5e-06, "loss": 0.7395, "step": 1900 }, { "epoch": 2.9530434782608697, "grad_norm": 0.5185834378400617, "learning_rate": 5e-06, "loss": 0.7368, "step": 1910 }, { "epoch": 2.9685024154589374, "grad_norm": 0.5204851978024219, "learning_rate": 5e-06, "loss": 0.7339, "step": 1920 }, { "epoch": 2.983961352657005, "grad_norm": 0.5807949438616106, "learning_rate": 5e-06, "loss": 0.7386, "step": 1930 }, { "epoch": 2.9963285024154587, "eval_loss": 0.8025317192077637, "eval_runtime": 693.9451, "eval_samples_per_second": 25.119, "eval_steps_per_second": 0.393, "step": 1938 }, { "epoch": 2.9963285024154587, "step": 1938, "total_flos": 3246012802007040.0, "train_loss": 0.7871413270263357, "train_runtime": 114590.6996, "train_samples_per_second": 8.67, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 1938, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3246012802007040.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }