{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1039, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009624639076034649, "grad_norm": 166.23460388183594, "learning_rate": 2.8846153846153845e-06, "loss": 32.9338, "step": 1 }, { "epoch": 0.004812319538017324, "grad_norm": 157.81483459472656, "learning_rate": 1.4423076923076923e-05, "loss": 30.5874, "step": 5 }, { "epoch": 0.009624639076034648, "grad_norm": 69.58987426757812, "learning_rate": 2.8846153846153845e-05, "loss": 26.697, "step": 10 }, { "epoch": 0.014436958614051972, "grad_norm": 44.16779708862305, "learning_rate": 4.326923076923076e-05, "loss": 23.1646, "step": 15 }, { "epoch": 0.019249278152069296, "grad_norm": 11.099946022033691, "learning_rate": 5.769230769230769e-05, "loss": 18.9989, "step": 20 }, { "epoch": 0.02406159769008662, "grad_norm": 9.369112014770508, "learning_rate": 7.211538461538461e-05, "loss": 17.3748, "step": 25 }, { "epoch": 0.028873917228103944, "grad_norm": 6.860820770263672, "learning_rate": 8.653846153846152e-05, "loss": 15.7684, "step": 30 }, { "epoch": 0.03368623676612127, "grad_norm": 3.230618715286255, "learning_rate": 0.00010096153846153846, "loss": 15.0712, "step": 35 }, { "epoch": 0.03849855630413859, "grad_norm": 2.7574245929718018, "learning_rate": 0.00011538461538461538, "loss": 14.191, "step": 40 }, { "epoch": 0.04331087584215592, "grad_norm": 4.076800346374512, "learning_rate": 0.0001298076923076923, "loss": 14.0805, "step": 45 }, { "epoch": 0.04812319538017324, "grad_norm": 5.263434886932373, "learning_rate": 0.00014423076923076922, "loss": 13.0166, "step": 50 }, { "epoch": 0.05293551491819057, "grad_norm": 9.342857360839844, "learning_rate": 0.00015865384615384616, "loss": 12.1825, "step": 55 }, { "epoch": 0.05774783445620789, "grad_norm": 16.83011245727539, "learning_rate": 0.00017307692307692304, "loss": 10.6871, "step": 60 }, { "epoch": 0.06256015399422522, "grad_norm": 19.575435638427734, "learning_rate": 0.00018749999999999998, "loss": 7.6687, "step": 65 }, { "epoch": 0.06737247353224254, "grad_norm": 13.751044273376465, "learning_rate": 0.00020192307692307691, "loss": 4.3583, "step": 70 }, { "epoch": 0.07218479307025986, "grad_norm": 6.957976818084717, "learning_rate": 0.00021634615384615383, "loss": 2.5049, "step": 75 }, { "epoch": 0.07699711260827719, "grad_norm": 4.541615962982178, "learning_rate": 0.00023076923076923076, "loss": 2.1064, "step": 80 }, { "epoch": 0.08180943214629452, "grad_norm": 2.260869264602661, "learning_rate": 0.00024519230769230765, "loss": 1.774, "step": 85 }, { "epoch": 0.08662175168431184, "grad_norm": 1.0187702178955078, "learning_rate": 0.0002596153846153846, "loss": 1.6404, "step": 90 }, { "epoch": 0.09143407122232916, "grad_norm": 1.9154465198516846, "learning_rate": 0.0002740384615384615, "loss": 1.4761, "step": 95 }, { "epoch": 0.09624639076034648, "grad_norm": 0.8144974708557129, "learning_rate": 0.00028846153846153843, "loss": 1.4555, "step": 100 }, { "epoch": 0.10105871029836382, "grad_norm": 1.0940709114074707, "learning_rate": 0.00029999915328475654, "loss": 1.3766, "step": 105 }, { "epoch": 0.10587102983638114, "grad_norm": 0.9726133346557617, "learning_rate": 0.00029996951925492186, "loss": 1.3313, "step": 110 }, { "epoch": 0.11068334937439846, "grad_norm": 2.162132740020752, "learning_rate": 0.0002998975590214534, "loss": 1.2798, "step": 115 }, { "epoch": 0.11549566891241578, "grad_norm": 2.067453622817993, "learning_rate": 0.0002997832928938348, "loss": 1.3008, "step": 120 }, { "epoch": 0.12030798845043311, "grad_norm": 0.8700665831565857, "learning_rate": 0.0002996267531216286, "loss": 1.2811, "step": 125 }, { "epoch": 0.12512030798845045, "grad_norm": 0.6688537001609802, "learning_rate": 0.0002994279838853743, "loss": 1.234, "step": 130 }, { "epoch": 0.12993262752646775, "grad_norm": 0.9057418704032898, "learning_rate": 0.0002991870412841192, "loss": 1.2589, "step": 135 }, { "epoch": 0.1347449470644851, "grad_norm": 0.9883818030357361, "learning_rate": 0.00029890399331958587, "loss": 1.2351, "step": 140 }, { "epoch": 0.1395572666025024, "grad_norm": 1.169600009918213, "learning_rate": 0.0002985789198769791, "loss": 1.2336, "step": 145 }, { "epoch": 0.14436958614051973, "grad_norm": 1.972508192062378, "learning_rate": 0.0002982119127024403, "loss": 1.2234, "step": 150 }, { "epoch": 0.14918190567853706, "grad_norm": 1.5447394847869873, "learning_rate": 0.0002978030753771539, "loss": 1.193, "step": 155 }, { "epoch": 0.15399422521655437, "grad_norm": 1.4402408599853516, "learning_rate": 0.0002973525232881129, "loss": 1.1862, "step": 160 }, { "epoch": 0.1588065447545717, "grad_norm": 1.1992448568344116, "learning_rate": 0.0002968603835955533, "loss": 1.1864, "step": 165 }, { "epoch": 0.16361886429258904, "grad_norm": 1.3996031284332275, "learning_rate": 0.0002963267951970655, "loss": 1.1869, "step": 170 }, { "epoch": 0.16843118383060635, "grad_norm": 1.2955073118209839, "learning_rate": 0.00029575190868839257, "loss": 1.1416, "step": 175 }, { "epoch": 0.17324350336862368, "grad_norm": 0.9568496346473694, "learning_rate": 0.00029513588632092786, "loss": 1.1603, "step": 180 }, { "epoch": 0.17805582290664101, "grad_norm": 3.254401683807373, "learning_rate": 0.00029447890195592177, "loss": 1.1713, "step": 185 }, { "epoch": 0.18286814244465832, "grad_norm": 0.9820644855499268, "learning_rate": 0.0002937811410154133, "loss": 1.1647, "step": 190 }, { "epoch": 0.18768046198267566, "grad_norm": 0.8743553757667542, "learning_rate": 0.00029304280042989725, "loss": 1.1465, "step": 195 }, { "epoch": 0.19249278152069296, "grad_norm": 2.3073689937591553, "learning_rate": 0.00029226408858274474, "loss": 1.1412, "step": 200 }, { "epoch": 0.1973051010587103, "grad_norm": 1.8668166399002075, "learning_rate": 0.0002914452252513903, "loss": 1.1601, "step": 205 }, { "epoch": 0.20211742059672763, "grad_norm": 1.0222842693328857, "learning_rate": 0.000290586441545304, "loss": 1.1425, "step": 210 }, { "epoch": 0.20692974013474494, "grad_norm": 0.9697795510292053, "learning_rate": 0.00028968797984076474, "loss": 1.1247, "step": 215 }, { "epoch": 0.21174205967276227, "grad_norm": 0.8263999223709106, "learning_rate": 0.00028875009371245387, "loss": 1.1138, "step": 220 }, { "epoch": 0.2165543792107796, "grad_norm": 0.7808287143707275, "learning_rate": 0.00028777304786188845, "loss": 1.106, "step": 225 }, { "epoch": 0.22136669874879691, "grad_norm": 1.7108441591262817, "learning_rate": 0.00028675711804271374, "loss": 1.0889, "step": 230 }, { "epoch": 0.22617901828681425, "grad_norm": 0.6597785353660583, "learning_rate": 0.0002857025909828771, "loss": 1.1057, "step": 235 }, { "epoch": 0.23099133782483156, "grad_norm": 0.7856463193893433, "learning_rate": 0.0002846097643037037, "loss": 1.1373, "step": 240 }, { "epoch": 0.2358036573628489, "grad_norm": 1.5672566890716553, "learning_rate": 0.0002834789464358987, "loss": 1.1383, "step": 245 }, { "epoch": 0.24061597690086622, "grad_norm": 1.137799620628357, "learning_rate": 0.00028231045653249787, "loss": 1.0927, "step": 250 }, { "epoch": 0.24542829643888353, "grad_norm": 0.7409512996673584, "learning_rate": 0.00028110462437879244, "loss": 1.092, "step": 255 }, { "epoch": 0.2502406159769009, "grad_norm": 0.7915446162223816, "learning_rate": 0.0002798617902992533, "loss": 1.1099, "step": 260 }, { "epoch": 0.2550529355149182, "grad_norm": 1.610798716545105, "learning_rate": 0.0002785823050614804, "loss": 1.0959, "step": 265 }, { "epoch": 0.2598652550529355, "grad_norm": 1.3092223405838013, "learning_rate": 0.000277266529777205, "loss": 1.0997, "step": 270 }, { "epoch": 0.2646775745909528, "grad_norm": 0.9148079752922058, "learning_rate": 0.000275914835800372, "loss": 1.1055, "step": 275 }, { "epoch": 0.2694898941289702, "grad_norm": 1.2024147510528564, "learning_rate": 0.00027452760462233256, "loss": 1.0864, "step": 280 }, { "epoch": 0.2743022136669875, "grad_norm": 1.1148972511291504, "learning_rate": 0.00027310522776417454, "loss": 1.1042, "step": 285 }, { "epoch": 0.2791145332050048, "grad_norm": 1.1532288789749146, "learning_rate": 0.00027164810666622293, "loss": 1.0599, "step": 290 }, { "epoch": 0.28392685274302215, "grad_norm": 0.7921152114868164, "learning_rate": 0.00027015665257474036, "loss": 1.103, "step": 295 }, { "epoch": 0.28873917228103946, "grad_norm": 0.853850781917572, "learning_rate": 0.0002686312864258605, "loss": 1.1032, "step": 300 }, { "epoch": 0.29355149181905676, "grad_norm": 2.03155517578125, "learning_rate": 0.0002670724387267859, "loss": 1.056, "step": 305 }, { "epoch": 0.2983638113570741, "grad_norm": 0.9392085075378418, "learning_rate": 0.00026548054943428573, "loss": 1.0881, "step": 310 }, { "epoch": 0.30317613089509143, "grad_norm": 1.8212369680404663, "learning_rate": 0.0002638560678305254, "loss": 1.0693, "step": 315 }, { "epoch": 0.30798845043310874, "grad_norm": 2.271634101867676, "learning_rate": 0.0002621994523962649, "loss": 1.0546, "step": 320 }, { "epoch": 0.3128007699711261, "grad_norm": 2.8677988052368164, "learning_rate": 0.0002605111706814607, "loss": 1.0844, "step": 325 }, { "epoch": 0.3176130895091434, "grad_norm": 1.7386771440505981, "learning_rate": 0.00025879169917330847, "loss": 1.0778, "step": 330 }, { "epoch": 0.3224254090471607, "grad_norm": 1.8729993104934692, "learning_rate": 0.00025704152316176283, "loss": 1.0393, "step": 335 }, { "epoch": 0.3272377285851781, "grad_norm": 2.2115373611450195, "learning_rate": 0.00025526113660257307, "loss": 1.0491, "step": 340 }, { "epoch": 0.3320500481231954, "grad_norm": 1.0617059469223022, "learning_rate": 0.0002534510419778731, "loss": 1.0691, "step": 345 }, { "epoch": 0.3368623676612127, "grad_norm": 1.0045102834701538, "learning_rate": 0.00025161175015436473, "loss": 1.0147, "step": 350 }, { "epoch": 0.34167468719923005, "grad_norm": 1.8575717210769653, "learning_rate": 0.0002497437802391349, "loss": 1.0567, "step": 355 }, { "epoch": 0.34648700673724736, "grad_norm": 1.1691515445709229, "learning_rate": 0.0002478476594331469, "loss": 1.0472, "step": 360 }, { "epoch": 0.35129932627526467, "grad_norm": 0.8885318040847778, "learning_rate": 0.0002459239228824474, "loss": 1.0426, "step": 365 }, { "epoch": 0.35611164581328203, "grad_norm": 1.19156014919281, "learning_rate": 0.000243973113527131, "loss": 1.0431, "step": 370 }, { "epoch": 0.36092396535129934, "grad_norm": 1.0888525247573853, "learning_rate": 0.00024199578194810534, "loss": 1.0385, "step": 375 }, { "epoch": 0.36573628488931664, "grad_norm": 0.8975598216056824, "learning_rate": 0.00023999248621169936, "loss": 1.0238, "step": 380 }, { "epoch": 0.37054860442733395, "grad_norm": 1.4952484369277954, "learning_rate": 0.0002379637917121592, "loss": 1.0666, "step": 385 }, { "epoch": 0.3753609239653513, "grad_norm": 0.6977307200431824, "learning_rate": 0.00023591027101207575, "loss": 1.0631, "step": 390 }, { "epoch": 0.3801732435033686, "grad_norm": 1.0105865001678467, "learning_rate": 0.00023383250368078917, "loss": 1.037, "step": 395 }, { "epoch": 0.3849855630413859, "grad_norm": 0.8000687956809998, "learning_rate": 0.00023173107613081576, "loss": 1.0358, "step": 400 }, { "epoch": 0.3897978825794033, "grad_norm": 0.9405982494354248, "learning_rate": 0.00022960658145234378, "loss": 1.0131, "step": 405 }, { "epoch": 0.3946102021174206, "grad_norm": 2.1845569610595703, "learning_rate": 0.00022745961924584428, "loss": 1.0179, "step": 410 }, { "epoch": 0.3994225216554379, "grad_norm": 1.5082358121871948, "learning_rate": 0.0002252907954528445, "loss": 1.0354, "step": 415 }, { "epoch": 0.40423484119345526, "grad_norm": 0.8987241983413696, "learning_rate": 0.000223100722184912, "loss": 1.0087, "step": 420 }, { "epoch": 0.40904716073147257, "grad_norm": 1.57774019241333, "learning_rate": 0.00022089001755089686, "loss": 1.0253, "step": 425 }, { "epoch": 0.4138594802694899, "grad_norm": 1.6441487073898315, "learning_rate": 0.00021865930548248198, "loss": 1.0353, "step": 430 }, { "epoch": 0.41867179980750724, "grad_norm": 1.5525552034378052, "learning_rate": 0.00021640921555808913, "loss": 1.0555, "step": 435 }, { "epoch": 0.42348411934552455, "grad_norm": 0.9230406284332275, "learning_rate": 0.00021414038282519207, "loss": 1.0175, "step": 440 }, { "epoch": 0.42829643888354185, "grad_norm": 1.0450150966644287, "learning_rate": 0.0002118534476210855, "loss": 1.0243, "step": 445 }, { "epoch": 0.4331087584215592, "grad_norm": 1.575501799583435, "learning_rate": 0.00020954905539216173, "loss": 0.9861, "step": 450 }, { "epoch": 0.4379210779595765, "grad_norm": 1.0209721326828003, "learning_rate": 0.0002072278565117447, "loss": 1.0094, "step": 455 }, { "epoch": 0.44273339749759383, "grad_norm": 0.7234334349632263, "learning_rate": 0.000204890506096534, "loss": 1.0599, "step": 460 }, { "epoch": 0.4475457170356112, "grad_norm": 0.8980212807655334, "learning_rate": 0.00020253766382170982, "loss": 1.0268, "step": 465 }, { "epoch": 0.4523580365736285, "grad_norm": 0.9909809827804565, "learning_rate": 0.00020016999373475146, "loss": 1.0123, "step": 470 }, { "epoch": 0.4571703561116458, "grad_norm": 0.6521352529525757, "learning_rate": 0.00019778816406802198, "loss": 0.9937, "step": 475 }, { "epoch": 0.4619826756496631, "grad_norm": 0.6760934591293335, "learning_rate": 0.0001953928470501716, "loss": 1.0513, "step": 480 }, { "epoch": 0.4667949951876805, "grad_norm": 1.8086323738098145, "learning_rate": 0.00019298471871641312, "loss": 1.0259, "step": 485 }, { "epoch": 0.4716073147256978, "grad_norm": 1.66387140750885, "learning_rate": 0.0001905644587177232, "loss": 0.9961, "step": 490 }, { "epoch": 0.4764196342637151, "grad_norm": 1.7097184658050537, "learning_rate": 0.00018813275012902306, "loss": 1.0118, "step": 495 }, { "epoch": 0.48123195380173245, "grad_norm": 1.2963345050811768, "learning_rate": 0.0001856902792563928, "loss": 1.0362, "step": 500 }, { "epoch": 0.48604427333974976, "grad_norm": 1.906731367111206, "learning_rate": 0.00018323773544337403, "loss": 1.0121, "step": 505 }, { "epoch": 0.49085659287776706, "grad_norm": 1.2913812398910522, "learning_rate": 0.00018077581087641483, "loss": 1.001, "step": 510 }, { "epoch": 0.4956689124157844, "grad_norm": 0.8876517415046692, "learning_rate": 0.00017830520038951251, "loss": 1.0014, "step": 515 }, { "epoch": 0.5004812319538018, "grad_norm": 1.1110894680023193, "learning_rate": 0.00017582660126810946, "loss": 1.0054, "step": 520 }, { "epoch": 0.5052935514918191, "grad_norm": 0.577628493309021, "learning_rate": 0.00017334071305229661, "loss": 1.01, "step": 525 }, { "epoch": 0.5101058710298364, "grad_norm": 0.7405449748039246, "learning_rate": 0.0001708482373393809, "loss": 0.9978, "step": 530 }, { "epoch": 0.5149181905678537, "grad_norm": 1.3642311096191406, "learning_rate": 0.00016834987758587204, "loss": 0.9943, "step": 535 }, { "epoch": 0.519730510105871, "grad_norm": 1.058975100517273, "learning_rate": 0.00016584633890894448, "loss": 0.9829, "step": 540 }, { "epoch": 0.5245428296438883, "grad_norm": 0.8703084588050842, "learning_rate": 0.0001633383278874309, "loss": 1.0105, "step": 545 }, { "epoch": 0.5293551491819056, "grad_norm": 0.8028627634048462, "learning_rate": 0.0001608265523624029, "loss": 1.0129, "step": 550 }, { "epoch": 0.534167468719923, "grad_norm": 0.9001644849777222, "learning_rate": 0.00015831172123739547, "loss": 0.9943, "step": 555 }, { "epoch": 0.5389797882579404, "grad_norm": 0.6863451600074768, "learning_rate": 0.00015579454427833205, "loss": 0.9979, "step": 560 }, { "epoch": 0.5437921077959577, "grad_norm": 1.088158130645752, "learning_rate": 0.0001532757319132053, "loss": 0.9964, "step": 565 }, { "epoch": 0.548604427333975, "grad_norm": 0.9019696116447449, "learning_rate": 0.00015075599503157217, "loss": 1.002, "step": 570 }, { "epoch": 0.5534167468719923, "grad_norm": 0.7873600125312805, "learning_rate": 0.0001482360447839177, "loss": 0.9794, "step": 575 }, { "epoch": 0.5582290664100096, "grad_norm": 0.7380373477935791, "learning_rate": 0.00014571659238094556, "loss": 0.998, "step": 580 }, { "epoch": 0.563041385948027, "grad_norm": 0.7692922353744507, "learning_rate": 0.00014319834889285182, "loss": 0.9923, "step": 585 }, { "epoch": 0.5678537054860443, "grad_norm": 2.178382158279419, "learning_rate": 0.00014068202504863793, "loss": 0.9951, "step": 590 }, { "epoch": 0.5726660250240616, "grad_norm": 0.6920925378799438, "learning_rate": 0.0001381683310355204, "loss": 0.9984, "step": 595 }, { "epoch": 0.5774783445620789, "grad_norm": 1.601839542388916, "learning_rate": 0.00013565797629849319, "loss": 0.9833, "step": 600 }, { "epoch": 0.5822906641000962, "grad_norm": 1.4429692029953003, "learning_rate": 0.00013315166934009943, "loss": 0.9971, "step": 605 }, { "epoch": 0.5871029836381135, "grad_norm": 1.3127554655075073, "learning_rate": 0.00013065011752046952, "loss": 0.9796, "step": 610 }, { "epoch": 0.591915303176131, "grad_norm": 0.933466911315918, "learning_rate": 0.0001281540268576812, "loss": 0.9878, "step": 615 }, { "epoch": 0.5967276227141483, "grad_norm": 1.4065930843353271, "learning_rate": 0.00012566410182849894, "loss": 1.0078, "step": 620 }, { "epoch": 0.6015399422521656, "grad_norm": 0.9120624661445618, "learning_rate": 0.00012318104516954775, "loss": 0.9845, "step": 625 }, { "epoch": 0.6063522617901829, "grad_norm": 0.888067364692688, "learning_rate": 0.00012070555767897848, "loss": 1.0093, "step": 630 }, { "epoch": 0.6111645813282002, "grad_norm": 0.6477493047714233, "learning_rate": 0.00011823833801868024, "loss": 1.0082, "step": 635 }, { "epoch": 0.6159769008662175, "grad_norm": 0.656038224697113, "learning_rate": 0.0001157800825170953, "loss": 1.0008, "step": 640 }, { "epoch": 0.6207892204042348, "grad_norm": 0.8053154945373535, "learning_rate": 0.00011333148497269334, "loss": 0.9899, "step": 645 }, { "epoch": 0.6256015399422522, "grad_norm": 1.2297455072402954, "learning_rate": 0.0001108932364581589, "loss": 0.9938, "step": 650 }, { "epoch": 0.6304138594802695, "grad_norm": 1.7194623947143555, "learning_rate": 0.00010846602512534882, "loss": 1.0019, "step": 655 }, { "epoch": 0.6352261790182868, "grad_norm": 1.0724910497665405, "learning_rate": 0.0001060505360110736, "loss": 0.963, "step": 660 }, { "epoch": 0.6400384985563041, "grad_norm": 0.8016871809959412, "learning_rate": 0.0001036474508437579, "loss": 1.0036, "step": 665 }, { "epoch": 0.6448508180943214, "grad_norm": 0.8981702923774719, "learning_rate": 0.00010125744785103527, "loss": 0.9935, "step": 670 }, { "epoch": 0.6496631376323387, "grad_norm": 1.4376094341278076, "learning_rate": 9.888120156833034e-05, "loss": 0.9864, "step": 675 }, { "epoch": 0.6544754571703562, "grad_norm": 1.0005443096160889, "learning_rate": 9.651938264848343e-05, "loss": 0.959, "step": 680 }, { "epoch": 0.6592877767083735, "grad_norm": 2.983701705932617, "learning_rate": 9.417265767247081e-05, "loss": 0.9944, "step": 685 }, { "epoch": 0.6641000962463908, "grad_norm": 0.6275534629821777, "learning_rate": 9.184168896127397e-05, "loss": 0.9669, "step": 690 }, { "epoch": 0.6689124157844081, "grad_norm": 0.6808886528015137, "learning_rate": 8.952713438895186e-05, "loss": 0.9584, "step": 695 }, { "epoch": 0.6737247353224254, "grad_norm": 0.6760061979293823, "learning_rate": 8.722964719696728e-05, "loss": 0.9618, "step": 700 }, { "epoch": 0.6785370548604427, "grad_norm": 0.695017397403717, "learning_rate": 8.494987580982167e-05, "loss": 0.9666, "step": 705 }, { "epoch": 0.6833493743984601, "grad_norm": 0.8269957900047302, "learning_rate": 8.268846365204855e-05, "loss": 0.9747, "step": 710 }, { "epoch": 0.6881616939364774, "grad_norm": 0.7746295928955078, "learning_rate": 8.044604896661853e-05, "loss": 0.9923, "step": 715 }, { "epoch": 0.6929740134744947, "grad_norm": 1.039602518081665, "learning_rate": 7.822326463480703e-05, "loss": 0.9902, "step": 720 }, { "epoch": 0.697786333012512, "grad_norm": 1.0319743156433105, "learning_rate": 7.60207379975746e-05, "loss": 0.9839, "step": 725 }, { "epoch": 0.7025986525505293, "grad_norm": 2.147736072540283, "learning_rate": 7.383909067851123e-05, "loss": 0.9544, "step": 730 }, { "epoch": 0.7074109720885466, "grad_norm": 0.7728191018104553, "learning_rate": 7.167893840839445e-05, "loss": 0.953, "step": 735 }, { "epoch": 0.7122232916265641, "grad_norm": 1.7299913167953491, "learning_rate": 6.95408908514101e-05, "loss": 0.9943, "step": 740 }, { "epoch": 0.7170356111645814, "grad_norm": 0.9071583151817322, "learning_rate": 6.742555143308576e-05, "loss": 0.9562, "step": 745 }, { "epoch": 0.7218479307025987, "grad_norm": 1.0585880279541016, "learning_rate": 6.533351716998465e-05, "loss": 1.0041, "step": 750 }, { "epoch": 0.726660250240616, "grad_norm": 1.0146371126174927, "learning_rate": 6.32653785012084e-05, "loss": 0.967, "step": 755 }, { "epoch": 0.7314725697786333, "grad_norm": 0.9219699501991272, "learning_rate": 6.122171912175641e-05, "loss": 0.9802, "step": 760 }, { "epoch": 0.7362848893166506, "grad_norm": 1.033147931098938, "learning_rate": 5.920311581778817e-05, "loss": 0.9978, "step": 765 }, { "epoch": 0.7410972088546679, "grad_norm": 0.9214019179344177, "learning_rate": 5.7210138303835774e-05, "loss": 0.9588, "step": 770 }, { "epoch": 0.7459095283926853, "grad_norm": 1.1637225151062012, "learning_rate": 5.524334906201239e-05, "loss": 0.97, "step": 775 }, { "epoch": 0.7507218479307026, "grad_norm": 1.5100277662277222, "learning_rate": 5.330330318326152e-05, "loss": 0.9574, "step": 780 }, { "epoch": 0.7555341674687199, "grad_norm": 1.6493581533432007, "learning_rate": 5.1390548210692907e-05, "loss": 0.965, "step": 785 }, { "epoch": 0.7603464870067372, "grad_norm": 0.6085478663444519, "learning_rate": 4.9505623985047986e-05, "loss": 0.9715, "step": 790 }, { "epoch": 0.7651588065447545, "grad_norm": 0.8465254306793213, "learning_rate": 4.764906249233964e-05, "loss": 0.9712, "step": 795 }, { "epoch": 0.7699711260827719, "grad_norm": 0.8800078630447388, "learning_rate": 4.582138771370881e-05, "loss": 0.9876, "step": 800 }, { "epoch": 0.7747834456207893, "grad_norm": 0.7603727579116821, "learning_rate": 4.402311547754003e-05, "loss": 0.9601, "step": 805 }, { "epoch": 0.7795957651588066, "grad_norm": 1.0161685943603516, "learning_rate": 4.225475331387838e-05, "loss": 0.9983, "step": 810 }, { "epoch": 0.7844080846968239, "grad_norm": 0.7719089984893799, "learning_rate": 4.051680031118801e-05, "loss": 0.9661, "step": 815 }, { "epoch": 0.7892204042348412, "grad_norm": 0.8942157626152039, "learning_rate": 3.880974697549349e-05, "loss": 0.9341, "step": 820 }, { "epoch": 0.7940327237728585, "grad_norm": 0.729216992855072, "learning_rate": 3.713407509194329e-05, "loss": 0.9788, "step": 825 }, { "epoch": 0.7988450433108758, "grad_norm": 1.0603898763656616, "learning_rate": 3.549025758883455e-05, "loss": 0.955, "step": 830 }, { "epoch": 0.8036573628488932, "grad_norm": 0.7697267532348633, "learning_rate": 3.3878758404137624e-05, "loss": 0.982, "step": 835 }, { "epoch": 0.8084696823869105, "grad_norm": 0.6257608532905579, "learning_rate": 3.2300032354557834e-05, "loss": 0.9389, "step": 840 }, { "epoch": 0.8132820019249278, "grad_norm": 0.6368157863616943, "learning_rate": 3.075452500717167e-05, "loss": 0.9759, "step": 845 }, { "epoch": 0.8180943214629451, "grad_norm": 0.8453686237335205, "learning_rate": 2.9242672553673458e-05, "loss": 0.937, "step": 850 }, { "epoch": 0.8229066410009624, "grad_norm": 0.7471240162849426, "learning_rate": 2.7764901687268065e-05, "loss": 0.9789, "step": 855 }, { "epoch": 0.8277189605389798, "grad_norm": 0.9190341830253601, "learning_rate": 2.63216294822446e-05, "loss": 0.944, "step": 860 }, { "epoch": 0.8325312800769971, "grad_norm": 0.9102936387062073, "learning_rate": 2.4913263276264363e-05, "loss": 0.9477, "step": 865 }, { "epoch": 0.8373435996150145, "grad_norm": 1.130305528640747, "learning_rate": 2.354020055539736e-05, "loss": 0.9495, "step": 870 }, { "epoch": 0.8421559191530318, "grad_norm": 0.7928130626678467, "learning_rate": 2.2202828841938847e-05, "loss": 0.9484, "step": 875 }, { "epoch": 0.8469682386910491, "grad_norm": 0.9081096649169922, "learning_rate": 2.0901525585038194e-05, "loss": 0.9635, "step": 880 }, { "epoch": 0.8517805582290664, "grad_norm": 0.659153163433075, "learning_rate": 1.9636658054170745e-05, "loss": 0.945, "step": 885 }, { "epoch": 0.8565928777670837, "grad_norm": 0.5528463125228882, "learning_rate": 1.8408583235482348e-05, "loss": 0.9506, "step": 890 }, { "epoch": 0.861405197305101, "grad_norm": 0.5605229139328003, "learning_rate": 1.721764773103662e-05, "loss": 0.9689, "step": 895 }, { "epoch": 0.8662175168431184, "grad_norm": 0.8554848432540894, "learning_rate": 1.60641876609927e-05, "loss": 0.9483, "step": 900 }, { "epoch": 0.8710298363811357, "grad_norm": 0.5970616340637207, "learning_rate": 1.4948528568741297e-05, "loss": 0.9619, "step": 905 }, { "epoch": 0.875842155919153, "grad_norm": 0.5926843881607056, "learning_rate": 1.3870985329026075e-05, "loss": 0.9346, "step": 910 }, { "epoch": 0.8806544754571703, "grad_norm": 0.7669967412948608, "learning_rate": 1.2831862059075731e-05, "loss": 0.9588, "step": 915 }, { "epoch": 0.8854667949951877, "grad_norm": 0.8727617859840393, "learning_rate": 1.1831452032772498e-05, "loss": 0.965, "step": 920 }, { "epoch": 0.890279114533205, "grad_norm": 1.1539345979690552, "learning_rate": 1.0870037597880727e-05, "loss": 0.9725, "step": 925 }, { "epoch": 0.8950914340712224, "grad_norm": 0.6160501837730408, "learning_rate": 9.947890096359484e-06, "loss": 0.9537, "step": 930 }, { "epoch": 0.8999037536092397, "grad_norm": 0.6065817475318909, "learning_rate": 9.065269787780966e-06, "loss": 0.9517, "step": 935 }, { "epoch": 0.904716073147257, "grad_norm": 0.6423489451408386, "learning_rate": 8.222425775877078e-06, "loss": 0.9565, "step": 940 }, { "epoch": 0.9095283926852743, "grad_norm": 0.7299497127532959, "learning_rate": 7.4195959382342355e-06, "loss": 0.9555, "step": 945 }, { "epoch": 0.9143407122232916, "grad_norm": 0.6036633849143982, "learning_rate": 6.657006859156772e-06, "loss": 0.9744, "step": 950 }, { "epoch": 0.9191530317613089, "grad_norm": 0.5731170773506165, "learning_rate": 5.934873765717551e-06, "loss": 0.9528, "step": 955 }, { "epoch": 0.9239653512993262, "grad_norm": 0.9859450459480286, "learning_rate": 5.253400467014024e-06, "loss": 0.9379, "step": 960 }, { "epoch": 0.9287776708373436, "grad_norm": 0.6689264178276062, "learning_rate": 4.612779296646768e-06, "loss": 0.9522, "step": 965 }, { "epoch": 0.933589990375361, "grad_norm": 0.5114886164665222, "learning_rate": 4.013191058436732e-06, "loss": 0.9438, "step": 970 }, { "epoch": 0.9384023099133783, "grad_norm": 0.5692219734191895, "learning_rate": 3.4548049753966523e-06, "loss": 0.9708, "step": 975 }, { "epoch": 0.9432146294513956, "grad_norm": 0.6835126280784607, "learning_rate": 2.9377786419708725e-06, "loss": 0.9715, "step": 980 }, { "epoch": 0.9480269489894129, "grad_norm": 0.6431335210800171, "learning_rate": 2.4622579795571586e-06, "loss": 0.9684, "step": 985 }, { "epoch": 0.9528392685274302, "grad_norm": 0.7213614583015442, "learning_rate": 2.0283771953230223e-06, "loss": 0.9591, "step": 990 }, { "epoch": 0.9576515880654476, "grad_norm": 0.6287111043930054, "learning_rate": 1.636258744328106e-06, "loss": 0.9503, "step": 995 }, { "epoch": 0.9624639076034649, "grad_norm": 0.5980503559112549, "learning_rate": 1.2860132949634782e-06, "loss": 0.9578, "step": 1000 }, { "epoch": 0.9672762271414822, "grad_norm": 0.6850931644439697, "learning_rate": 9.777396977174666e-07, "loss": 0.9285, "step": 1005 }, { "epoch": 0.9720885466794995, "grad_norm": 0.5950331091880798, "learning_rate": 7.1152495727686e-07, "loss": 0.9541, "step": 1010 }, { "epoch": 0.9769008662175168, "grad_norm": 0.6369845271110535, "learning_rate": 4.874442079714325e-07, "loss": 0.9496, "step": 1015 }, { "epoch": 0.9817131857555341, "grad_norm": 0.557193398475647, "learning_rate": 3.055606925685905e-07, "loss": 0.9769, "step": 1020 }, { "epoch": 0.9865255052935515, "grad_norm": 0.6320910453796387, "learning_rate": 1.6592574442426676e-07, "loss": 0.9403, "step": 1025 }, { "epoch": 0.9913378248315688, "grad_norm": 0.6016994714736938, "learning_rate": 6.857877299497605e-08, "loss": 0.9561, "step": 1030 }, { "epoch": 0.9961501443695862, "grad_norm": 0.6348958015441895, "learning_rate": 1.3547252715262468e-08, "loss": 0.9481, "step": 1035 }, { "epoch": 1.0, "eval_loss": 2.1865932941436768, "eval_runtime": 0.8424, "eval_samples_per_second": 7.123, "eval_steps_per_second": 1.187, "step": 1039 }, { "epoch": 1.0, "step": 1039, "total_flos": 7.92023955614466e+17, "train_loss": 2.0617070422938975, "train_runtime": 5660.1106, "train_samples_per_second": 2.937, "train_steps_per_second": 0.184 } ], "logging_steps": 5, "max_steps": 1039, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.92023955614466e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }