{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.996101364522417, "eval_steps": 500, "global_step": 384, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005198180636777128, "grad_norm": 46.50929641723633, "learning_rate": 8.333333333333333e-08, "loss": 1.4958, "step": 1 }, { "epoch": 0.010396361273554255, "grad_norm": 36.618709564208984, "learning_rate": 1.6666666666666665e-07, "loss": 1.2222, "step": 2 }, { "epoch": 0.015594541910331383, "grad_norm": 42.09670639038086, "learning_rate": 2.5e-07, "loss": 1.2627, "step": 3 }, { "epoch": 0.02079272254710851, "grad_norm": 38.13116455078125, "learning_rate": 3.333333333333333e-07, "loss": 1.2579, "step": 4 }, { "epoch": 0.02599090318388564, "grad_norm": 40.15380859375, "learning_rate": 4.1666666666666667e-07, "loss": 1.2421, "step": 5 }, { "epoch": 0.031189083820662766, "grad_norm": 43.696563720703125, "learning_rate": 5e-07, "loss": 1.264, "step": 6 }, { "epoch": 0.036387264457439894, "grad_norm": 44.71561813354492, "learning_rate": 5.833333333333334e-07, "loss": 1.3881, "step": 7 }, { "epoch": 0.04158544509421702, "grad_norm": 41.07535171508789, "learning_rate": 6.666666666666666e-07, "loss": 1.1705, "step": 8 }, { "epoch": 0.04678362573099415, "grad_norm": 37.13037109375, "learning_rate": 7.5e-07, "loss": 1.1108, "step": 9 }, { "epoch": 0.05198180636777128, "grad_norm": 39.47488021850586, "learning_rate": 8.333333333333333e-07, "loss": 1.1024, "step": 10 }, { "epoch": 0.057179987004548405, "grad_norm": 35.2398681640625, "learning_rate": 9.166666666666665e-07, "loss": 1.0848, "step": 11 }, { "epoch": 0.06237816764132553, "grad_norm": 26.39617347717285, "learning_rate": 1e-06, "loss": 0.7678, "step": 12 }, { "epoch": 0.06757634827810266, "grad_norm": 23.569713592529297, "learning_rate": 9.999821700020548e-07, "loss": 0.7753, "step": 13 }, { "epoch": 0.07277452891487979, "grad_norm": 20.85965919494629, "learning_rate": 9.99928681279855e-07, "loss": 0.6663, "step": 14 }, { "epoch": 0.07797270955165692, "grad_norm": 19.964326858520508, "learning_rate": 9.998395376482152e-07, "loss": 0.5468, "step": 15 }, { "epoch": 0.08317089018843404, "grad_norm": 10.887548446655273, "learning_rate": 9.997147454648588e-07, "loss": 0.4754, "step": 16 }, { "epoch": 0.08836907082521117, "grad_norm": 10.726633071899414, "learning_rate": 9.995543136299635e-07, "loss": 0.4547, "step": 17 }, { "epoch": 0.0935672514619883, "grad_norm": 9.673012733459473, "learning_rate": 9.993582535855263e-07, "loss": 0.4634, "step": 18 }, { "epoch": 0.09876543209876543, "grad_norm": 7.258286476135254, "learning_rate": 9.991265793145479e-07, "loss": 0.3635, "step": 19 }, { "epoch": 0.10396361273554255, "grad_norm": 5.402667999267578, "learning_rate": 9.988593073400354e-07, "loss": 0.356, "step": 20 }, { "epoch": 0.10916179337231968, "grad_norm": 4.859364032745361, "learning_rate": 9.985564567238236e-07, "loss": 0.3692, "step": 21 }, { "epoch": 0.11435997400909681, "grad_norm": 3.8276686668395996, "learning_rate": 9.982180490652164e-07, "loss": 0.2976, "step": 22 }, { "epoch": 0.11955815464587394, "grad_norm": 3.0185964107513428, "learning_rate": 9.97844108499445e-07, "loss": 0.2635, "step": 23 }, { "epoch": 0.12475633528265107, "grad_norm": 2.6632726192474365, "learning_rate": 9.974346616959475e-07, "loss": 0.3086, "step": 24 }, { "epoch": 0.1299545159194282, "grad_norm": 2.5440852642059326, "learning_rate": 9.969897378564667e-07, "loss": 0.2746, "step": 25 }, { "epoch": 0.13515269655620532, "grad_norm": 2.642413377761841, "learning_rate": 9.965093687129667e-07, "loss": 0.2889, "step": 26 }, { "epoch": 0.14035087719298245, "grad_norm": 2.513338565826416, "learning_rate": 9.959935885253715e-07, "loss": 0.2778, "step": 27 }, { "epoch": 0.14554905782975958, "grad_norm": 2.2585127353668213, "learning_rate": 9.954424340791195e-07, "loss": 0.2311, "step": 28 }, { "epoch": 0.1507472384665367, "grad_norm": 2.021958351135254, "learning_rate": 9.948559446825411e-07, "loss": 0.2403, "step": 29 }, { "epoch": 0.15594541910331383, "grad_norm": 2.938659429550171, "learning_rate": 9.942341621640557e-07, "loss": 0.2984, "step": 30 }, { "epoch": 0.16114359974009096, "grad_norm": 1.9811211824417114, "learning_rate": 9.93577130869187e-07, "loss": 0.2607, "step": 31 }, { "epoch": 0.1663417803768681, "grad_norm": 1.8804433345794678, "learning_rate": 9.928848976574018e-07, "loss": 0.2236, "step": 32 }, { "epoch": 0.17153996101364521, "grad_norm": 2.2095425128936768, "learning_rate": 9.921575118987671e-07, "loss": 0.247, "step": 33 }, { "epoch": 0.17673814165042234, "grad_norm": 2.0361135005950928, "learning_rate": 9.91395025470429e-07, "loss": 0.2519, "step": 34 }, { "epoch": 0.18193632228719947, "grad_norm": 1.9882704019546509, "learning_rate": 9.905974927529133e-07, "loss": 0.2387, "step": 35 }, { "epoch": 0.1871345029239766, "grad_norm": 2.1970348358154297, "learning_rate": 9.897649706262473e-07, "loss": 0.2506, "step": 36 }, { "epoch": 0.19233268356075373, "grad_norm": 1.9535129070281982, "learning_rate": 9.888975184659016e-07, "loss": 0.2491, "step": 37 }, { "epoch": 0.19753086419753085, "grad_norm": 1.7368297576904297, "learning_rate": 9.879951981385577e-07, "loss": 0.2002, "step": 38 }, { "epoch": 0.20272904483430798, "grad_norm": 1.8394443988800049, "learning_rate": 9.870580739976935e-07, "loss": 0.2107, "step": 39 }, { "epoch": 0.2079272254710851, "grad_norm": 2.0104002952575684, "learning_rate": 9.860862128789952e-07, "loss": 0.2373, "step": 40 }, { "epoch": 0.21312540610786224, "grad_norm": 1.7731590270996094, "learning_rate": 9.850796840955899e-07, "loss": 0.1881, "step": 41 }, { "epoch": 0.21832358674463936, "grad_norm": 1.937873363494873, "learning_rate": 9.840385594331022e-07, "loss": 0.2238, "step": 42 }, { "epoch": 0.2235217673814165, "grad_norm": 1.9435638189315796, "learning_rate": 9.82962913144534e-07, "loss": 0.2237, "step": 43 }, { "epoch": 0.22871994801819362, "grad_norm": 1.786537766456604, "learning_rate": 9.818528219449704e-07, "loss": 0.1951, "step": 44 }, { "epoch": 0.23391812865497075, "grad_norm": 1.7915631532669067, "learning_rate": 9.807083650061062e-07, "loss": 0.2257, "step": 45 }, { "epoch": 0.23911630929174787, "grad_norm": 1.798353910446167, "learning_rate": 9.79529623950601e-07, "loss": 0.236, "step": 46 }, { "epoch": 0.244314489928525, "grad_norm": 1.872049331665039, "learning_rate": 9.783166828462572e-07, "loss": 0.2354, "step": 47 }, { "epoch": 0.24951267056530213, "grad_norm": 1.879210114479065, "learning_rate": 9.770696282000244e-07, "loss": 0.2229, "step": 48 }, { "epoch": 0.25471085120207926, "grad_norm": 1.9663130044937134, "learning_rate": 9.757885489518296e-07, "loss": 0.2461, "step": 49 }, { "epoch": 0.2599090318388564, "grad_norm": 1.6957286596298218, "learning_rate": 9.744735364682344e-07, "loss": 0.2065, "step": 50 }, { "epoch": 0.2651072124756335, "grad_norm": 1.7848544120788574, "learning_rate": 9.731246845359184e-07, "loss": 0.1949, "step": 51 }, { "epoch": 0.27030539311241064, "grad_norm": 1.8262349367141724, "learning_rate": 9.7174208935499e-07, "loss": 0.2135, "step": 52 }, { "epoch": 0.27550357374918777, "grad_norm": 1.6100451946258545, "learning_rate": 9.703258495321265e-07, "loss": 0.1643, "step": 53 }, { "epoch": 0.2807017543859649, "grad_norm": 1.6476277112960815, "learning_rate": 9.688760660735402e-07, "loss": 0.1796, "step": 54 }, { "epoch": 0.285899935022742, "grad_norm": 1.6926974058151245, "learning_rate": 9.673928423777756e-07, "loss": 0.2048, "step": 55 }, { "epoch": 0.29109811565951915, "grad_norm": 1.7797563076019287, "learning_rate": 9.658762842283341e-07, "loss": 0.1953, "step": 56 }, { "epoch": 0.2962962962962963, "grad_norm": 1.7844983339309692, "learning_rate": 9.643264997861312e-07, "loss": 0.2103, "step": 57 }, { "epoch": 0.3014944769330734, "grad_norm": 1.8552502393722534, "learning_rate": 9.627435995817797e-07, "loss": 0.1854, "step": 58 }, { "epoch": 0.30669265756985054, "grad_norm": 1.8308689594268799, "learning_rate": 9.611276965077097e-07, "loss": 0.1892, "step": 59 }, { "epoch": 0.31189083820662766, "grad_norm": 1.8636093139648438, "learning_rate": 9.594789058101153e-07, "loss": 0.216, "step": 60 }, { "epoch": 0.3170890188434048, "grad_norm": 1.5857099294662476, "learning_rate": 9.577973450807351e-07, "loss": 0.1924, "step": 61 }, { "epoch": 0.3222871994801819, "grad_norm": 1.670000433921814, "learning_rate": 9.560831342484666e-07, "loss": 0.2088, "step": 62 }, { "epoch": 0.32748538011695905, "grad_norm": 1.8758388757705688, "learning_rate": 9.543363955708124e-07, "loss": 0.1697, "step": 63 }, { "epoch": 0.3326835607537362, "grad_norm": 2.020310401916504, "learning_rate": 9.525572536251605e-07, "loss": 0.2249, "step": 64 }, { "epoch": 0.3378817413905133, "grad_norm": 1.8294882774353027, "learning_rate": 9.507458352999001e-07, "loss": 0.1884, "step": 65 }, { "epoch": 0.34307992202729043, "grad_norm": 1.606002926826477, "learning_rate": 9.489022697853708e-07, "loss": 0.1761, "step": 66 }, { "epoch": 0.34827810266406756, "grad_norm": 1.6073530912399292, "learning_rate": 9.470266885646503e-07, "loss": 0.1871, "step": 67 }, { "epoch": 0.3534762833008447, "grad_norm": 1.7087726593017578, "learning_rate": 9.451192254041758e-07, "loss": 0.173, "step": 68 }, { "epoch": 0.3586744639376218, "grad_norm": 1.7764538526535034, "learning_rate": 9.431800163442041e-07, "loss": 0.1957, "step": 69 }, { "epoch": 0.36387264457439894, "grad_norm": 1.8759775161743164, "learning_rate": 9.412091996891095e-07, "loss": 0.2154, "step": 70 }, { "epoch": 0.36907082521117607, "grad_norm": 1.8281443119049072, "learning_rate": 9.392069159975198e-07, "loss": 0.1679, "step": 71 }, { "epoch": 0.3742690058479532, "grad_norm": 1.7894129753112793, "learning_rate": 9.37173308072291e-07, "loss": 0.1679, "step": 72 }, { "epoch": 0.3794671864847303, "grad_norm": 1.6492183208465576, "learning_rate": 9.35108520950324e-07, "loss": 0.1833, "step": 73 }, { "epoch": 0.38466536712150745, "grad_norm": 1.6076239347457886, "learning_rate": 9.330127018922193e-07, "loss": 0.1507, "step": 74 }, { "epoch": 0.3898635477582846, "grad_norm": 1.8182544708251953, "learning_rate": 9.308860003717748e-07, "loss": 0.1759, "step": 75 }, { "epoch": 0.3950617283950617, "grad_norm": 2.183497667312622, "learning_rate": 9.287285680653254e-07, "loss": 0.2069, "step": 76 }, { "epoch": 0.40025990903183883, "grad_norm": 1.9281930923461914, "learning_rate": 9.265405588409256e-07, "loss": 0.1813, "step": 77 }, { "epoch": 0.40545808966861596, "grad_norm": 1.7534650564193726, "learning_rate": 9.243221287473755e-07, "loss": 0.1764, "step": 78 }, { "epoch": 0.4106562703053931, "grad_norm": 1.7174078226089478, "learning_rate": 9.220734360030906e-07, "loss": 0.1863, "step": 79 }, { "epoch": 0.4158544509421702, "grad_norm": 1.7550305128097534, "learning_rate": 9.197946409848194e-07, "loss": 0.1718, "step": 80 }, { "epoch": 0.42105263157894735, "grad_norm": 1.4776816368103027, "learning_rate": 9.174859062162037e-07, "loss": 0.156, "step": 81 }, { "epoch": 0.4262508122157245, "grad_norm": 1.7932229042053223, "learning_rate": 9.151473963561882e-07, "loss": 0.1821, "step": 82 }, { "epoch": 0.4314489928525016, "grad_norm": 1.6103583574295044, "learning_rate": 9.127792781872768e-07, "loss": 0.1749, "step": 83 }, { "epoch": 0.43664717348927873, "grad_norm": 1.8216729164123535, "learning_rate": 9.103817206036382e-07, "loss": 0.177, "step": 84 }, { "epoch": 0.44184535412605586, "grad_norm": 1.7169886827468872, "learning_rate": 9.079548945990592e-07, "loss": 0.1845, "step": 85 }, { "epoch": 0.447043534762833, "grad_norm": 1.4935150146484375, "learning_rate": 9.054989732547506e-07, "loss": 0.1518, "step": 86 }, { "epoch": 0.4522417153996101, "grad_norm": 1.7215607166290283, "learning_rate": 9.030141317270025e-07, "loss": 0.1651, "step": 87 }, { "epoch": 0.45743989603638724, "grad_norm": 1.885299801826477, "learning_rate": 9.005005472346923e-07, "loss": 0.1862, "step": 88 }, { "epoch": 0.46263807667316437, "grad_norm": 1.6924781799316406, "learning_rate": 8.979583990466452e-07, "loss": 0.1834, "step": 89 }, { "epoch": 0.4678362573099415, "grad_norm": 1.6620601415634155, "learning_rate": 8.953878684688492e-07, "loss": 0.1736, "step": 90 }, { "epoch": 0.4730344379467186, "grad_norm": 1.7256325483322144, "learning_rate": 8.92789138831524e-07, "loss": 0.1792, "step": 91 }, { "epoch": 0.47823261858349575, "grad_norm": 1.6039340496063232, "learning_rate": 8.901623954760459e-07, "loss": 0.1704, "step": 92 }, { "epoch": 0.4834307992202729, "grad_norm": 1.6422524452209473, "learning_rate": 8.875078257417294e-07, "loss": 0.1621, "step": 93 }, { "epoch": 0.48862897985705, "grad_norm": 1.6837060451507568, "learning_rate": 8.84825618952466e-07, "loss": 0.183, "step": 94 }, { "epoch": 0.49382716049382713, "grad_norm": 1.750653862953186, "learning_rate": 8.821159664032223e-07, "loss": 0.1689, "step": 95 }, { "epoch": 0.49902534113060426, "grad_norm": 1.6462229490280151, "learning_rate": 8.793790613463954e-07, "loss": 0.1394, "step": 96 }, { "epoch": 0.5042235217673814, "grad_norm": 1.7336857318878174, "learning_rate": 8.766150989780317e-07, "loss": 0.1581, "step": 97 }, { "epoch": 0.5094217024041585, "grad_norm": 1.8384933471679688, "learning_rate": 8.738242764239046e-07, "loss": 0.1918, "step": 98 }, { "epoch": 0.5146198830409356, "grad_norm": 1.723486065864563, "learning_rate": 8.710067927254554e-07, "loss": 0.1737, "step": 99 }, { "epoch": 0.5198180636777128, "grad_norm": 1.9092669486999512, "learning_rate": 8.681628488255986e-07, "loss": 0.1728, "step": 100 }, { "epoch": 0.5250162443144899, "grad_norm": 1.729762315750122, "learning_rate": 8.652926475543898e-07, "loss": 0.162, "step": 101 }, { "epoch": 0.530214424951267, "grad_norm": 1.7867392301559448, "learning_rate": 8.623963936145599e-07, "loss": 0.1658, "step": 102 }, { "epoch": 0.5354126055880442, "grad_norm": 2.0217678546905518, "learning_rate": 8.594742935669164e-07, "loss": 0.1865, "step": 103 }, { "epoch": 0.5406107862248213, "grad_norm": 1.7473349571228027, "learning_rate": 8.565265558156101e-07, "loss": 0.1535, "step": 104 }, { "epoch": 0.5458089668615984, "grad_norm": 1.5292036533355713, "learning_rate": 8.535533905932737e-07, "loss": 0.1559, "step": 105 }, { "epoch": 0.5510071474983755, "grad_norm": 1.5472049713134766, "learning_rate": 8.505550099460263e-07, "loss": 0.1423, "step": 106 }, { "epoch": 0.5562053281351527, "grad_norm": 1.636443853378296, "learning_rate": 8.475316277183508e-07, "loss": 0.1747, "step": 107 }, { "epoch": 0.5614035087719298, "grad_norm": 1.5992189645767212, "learning_rate": 8.444834595378433e-07, "loss": 0.1766, "step": 108 }, { "epoch": 0.5666016894087069, "grad_norm": 1.6766347885131836, "learning_rate": 8.414107227998328e-07, "loss": 0.1421, "step": 109 }, { "epoch": 0.571799870045484, "grad_norm": 1.7345399856567383, "learning_rate": 8.383136366518787e-07, "loss": 0.1752, "step": 110 }, { "epoch": 0.5769980506822612, "grad_norm": 1.669264793395996, "learning_rate": 8.351924219781392e-07, "loss": 0.1661, "step": 111 }, { "epoch": 0.5821962313190383, "grad_norm": 1.7636350393295288, "learning_rate": 8.320473013836195e-07, "loss": 0.1892, "step": 112 }, { "epoch": 0.5873944119558154, "grad_norm": 1.8429635763168335, "learning_rate": 8.288784991782945e-07, "loss": 0.1883, "step": 113 }, { "epoch": 0.5925925925925926, "grad_norm": 1.5329152345657349, "learning_rate": 8.256862413611112e-07, "loss": 0.1472, "step": 114 }, { "epoch": 0.5977907732293697, "grad_norm": 1.9208284616470337, "learning_rate": 8.22470755603871e-07, "loss": 0.1714, "step": 115 }, { "epoch": 0.6029889538661468, "grad_norm": 1.6381752490997314, "learning_rate": 8.192322712349917e-07, "loss": 0.1806, "step": 116 }, { "epoch": 0.6081871345029239, "grad_norm": 1.5502922534942627, "learning_rate": 8.159710192231519e-07, "loss": 0.1653, "step": 117 }, { "epoch": 0.6133853151397011, "grad_norm": 1.604650616645813, "learning_rate": 8.126872321608183e-07, "loss": 0.1478, "step": 118 }, { "epoch": 0.6185834957764782, "grad_norm": 1.6860443353652954, "learning_rate": 8.093811442476572e-07, "loss": 0.1639, "step": 119 }, { "epoch": 0.6237816764132553, "grad_norm": 1.5915076732635498, "learning_rate": 8.060529912738314e-07, "loss": 0.1511, "step": 120 }, { "epoch": 0.6289798570500325, "grad_norm": 1.7241225242614746, "learning_rate": 8.027030106031835e-07, "loss": 0.1848, "step": 121 }, { "epoch": 0.6341780376868096, "grad_norm": 1.7747095823287964, "learning_rate": 7.993314411563075e-07, "loss": 0.1816, "step": 122 }, { "epoch": 0.6393762183235867, "grad_norm": 1.6497771739959717, "learning_rate": 7.959385233935085e-07, "loss": 0.1696, "step": 123 }, { "epoch": 0.6445743989603638, "grad_norm": 1.4712307453155518, "learning_rate": 7.925244992976537e-07, "loss": 0.1297, "step": 124 }, { "epoch": 0.649772579597141, "grad_norm": 1.618713140487671, "learning_rate": 7.890896123569135e-07, "loss": 0.1708, "step": 125 }, { "epoch": 0.6549707602339181, "grad_norm": 1.8550593852996826, "learning_rate": 7.856341075473961e-07, "loss": 0.1646, "step": 126 }, { "epoch": 0.6601689408706952, "grad_norm": 1.7929205894470215, "learning_rate": 7.821582313156763e-07, "loss": 0.1555, "step": 127 }, { "epoch": 0.6653671215074723, "grad_norm": 1.8011633157730103, "learning_rate": 7.786622315612181e-07, "loss": 0.1882, "step": 128 }, { "epoch": 0.6705653021442495, "grad_norm": 1.642986536026001, "learning_rate": 7.751463576186957e-07, "loss": 0.1659, "step": 129 }, { "epoch": 0.6757634827810266, "grad_norm": 1.547602653503418, "learning_rate": 7.716108602402094e-07, "loss": 0.1479, "step": 130 }, { "epoch": 0.6809616634178037, "grad_norm": 1.6602659225463867, "learning_rate": 7.680559915774033e-07, "loss": 0.1627, "step": 131 }, { "epoch": 0.6861598440545809, "grad_norm": 1.8091386556625366, "learning_rate": 7.644820051634812e-07, "loss": 0.1637, "step": 132 }, { "epoch": 0.691358024691358, "grad_norm": 1.669487476348877, "learning_rate": 7.608891558951248e-07, "loss": 0.1599, "step": 133 }, { "epoch": 0.6965562053281351, "grad_norm": 1.9016000032424927, "learning_rate": 7.572777000143145e-07, "loss": 0.1654, "step": 134 }, { "epoch": 0.7017543859649122, "grad_norm": 1.4672502279281616, "learning_rate": 7.536478950900536e-07, "loss": 0.1482, "step": 135 }, { "epoch": 0.7069525666016894, "grad_norm": 1.4602234363555908, "learning_rate": 7.5e-07, "loss": 0.1214, "step": 136 }, { "epoch": 0.7121507472384665, "grad_norm": 1.725661277770996, "learning_rate": 7.463342749120013e-07, "loss": 0.1406, "step": 137 }, { "epoch": 0.7173489278752436, "grad_norm": 1.6164398193359375, "learning_rate": 7.426509812655405e-07, "loss": 0.1492, "step": 138 }, { "epoch": 0.7225471085120208, "grad_norm": 1.609312891960144, "learning_rate": 7.389503817530905e-07, "loss": 0.1669, "step": 139 }, { "epoch": 0.7277452891487979, "grad_norm": 1.512629508972168, "learning_rate": 7.352327403013779e-07, "loss": 0.1318, "step": 140 }, { "epoch": 0.732943469785575, "grad_norm": 1.7129087448120117, "learning_rate": 7.314983220525604e-07, "loss": 0.1762, "step": 141 }, { "epoch": 0.7381416504223521, "grad_norm": 1.6480506658554077, "learning_rate": 7.277473933453169e-07, "loss": 0.1738, "step": 142 }, { "epoch": 0.7433398310591293, "grad_norm": 1.5904552936553955, "learning_rate": 7.239802216958522e-07, "loss": 0.1558, "step": 143 }, { "epoch": 0.7485380116959064, "grad_norm": 1.6988767385482788, "learning_rate": 7.201970757788171e-07, "loss": 0.1661, "step": 144 }, { "epoch": 0.7537361923326835, "grad_norm": 1.5458639860153198, "learning_rate": 7.163982254081474e-07, "loss": 0.1338, "step": 145 }, { "epoch": 0.7589343729694606, "grad_norm": 1.5240118503570557, "learning_rate": 7.125839415178203e-07, "loss": 0.1405, "step": 146 }, { "epoch": 0.7641325536062378, "grad_norm": 1.7464412450790405, "learning_rate": 7.087544961425316e-07, "loss": 0.1682, "step": 147 }, { "epoch": 0.7693307342430149, "grad_norm": 1.7425211668014526, "learning_rate": 7.049101623982937e-07, "loss": 0.1839, "step": 148 }, { "epoch": 0.774528914879792, "grad_norm": 1.4918522834777832, "learning_rate": 7.010512144629579e-07, "loss": 0.1124, "step": 149 }, { "epoch": 0.7797270955165692, "grad_norm": 1.6756539344787598, "learning_rate": 6.971779275566593e-07, "loss": 0.1546, "step": 150 }, { "epoch": 0.7849252761533463, "grad_norm": 1.5222876071929932, "learning_rate": 6.93290577922188e-07, "loss": 0.1313, "step": 151 }, { "epoch": 0.7901234567901234, "grad_norm": 1.548453688621521, "learning_rate": 6.89389442805288e-07, "loss": 0.1349, "step": 152 }, { "epoch": 0.7953216374269005, "grad_norm": 1.6898419857025146, "learning_rate": 6.85474800434884e-07, "loss": 0.1568, "step": 153 }, { "epoch": 0.8005198180636777, "grad_norm": 1.8794304132461548, "learning_rate": 6.815469300032373e-07, "loss": 0.161, "step": 154 }, { "epoch": 0.8057179987004548, "grad_norm": 1.6816418170928955, "learning_rate": 6.776061116460352e-07, "loss": 0.1615, "step": 155 }, { "epoch": 0.8109161793372319, "grad_norm": 1.960444688796997, "learning_rate": 6.7365262642241e-07, "loss": 0.1948, "step": 156 }, { "epoch": 0.816114359974009, "grad_norm": 1.6450730562210083, "learning_rate": 6.696867562948962e-07, "loss": 0.161, "step": 157 }, { "epoch": 0.8213125406107862, "grad_norm": 1.4993230104446411, "learning_rate": 6.657087841093179e-07, "loss": 0.1476, "step": 158 }, { "epoch": 0.8265107212475633, "grad_norm": 1.856066346168518, "learning_rate": 6.61718993574619e-07, "loss": 0.1599, "step": 159 }, { "epoch": 0.8317089018843404, "grad_norm": 1.6243445873260498, "learning_rate": 6.577176692426278e-07, "loss": 0.1548, "step": 160 }, { "epoch": 0.8369070825211176, "grad_norm": 1.538219928741455, "learning_rate": 6.537050964877625e-07, "loss": 0.1428, "step": 161 }, { "epoch": 0.8421052631578947, "grad_norm": 1.429417371749878, "learning_rate": 6.496815614866791e-07, "loss": 0.1205, "step": 162 }, { "epoch": 0.8473034437946718, "grad_norm": 1.7732073068618774, "learning_rate": 6.456473511978606e-07, "loss": 0.1903, "step": 163 }, { "epoch": 0.852501624431449, "grad_norm": 1.575061321258545, "learning_rate": 6.416027533411519e-07, "loss": 0.1571, "step": 164 }, { "epoch": 0.8576998050682261, "grad_norm": 1.6352499723434448, "learning_rate": 6.375480563772389e-07, "loss": 0.1484, "step": 165 }, { "epoch": 0.8628979857050032, "grad_norm": 1.7170888185501099, "learning_rate": 6.334835494870758e-07, "loss": 0.1735, "step": 166 }, { "epoch": 0.8680961663417803, "grad_norm": 1.5450496673583984, "learning_rate": 6.294095225512604e-07, "loss": 0.1339, "step": 167 }, { "epoch": 0.8732943469785575, "grad_norm": 1.5989458560943604, "learning_rate": 6.253262661293602e-07, "loss": 0.1393, "step": 168 }, { "epoch": 0.8784925276153346, "grad_norm": 1.464534878730774, "learning_rate": 6.2123407143919e-07, "loss": 0.1421, "step": 169 }, { "epoch": 0.8836907082521117, "grad_norm": 1.6165345907211304, "learning_rate": 6.17133230336041e-07, "loss": 0.154, "step": 170 }, { "epoch": 0.8888888888888888, "grad_norm": 1.5105384588241577, "learning_rate": 6.130240352918674e-07, "loss": 0.1614, "step": 171 }, { "epoch": 0.894087069525666, "grad_norm": 1.6538264751434326, "learning_rate": 6.089067793744257e-07, "loss": 0.1213, "step": 172 }, { "epoch": 0.8992852501624431, "grad_norm": 1.5659717321395874, "learning_rate": 6.047817562263743e-07, "loss": 0.1349, "step": 173 }, { "epoch": 0.9044834307992202, "grad_norm": 1.6108099222183228, "learning_rate": 6.0064926004433e-07, "loss": 0.1572, "step": 174 }, { "epoch": 0.9096816114359974, "grad_norm": 1.7230148315429688, "learning_rate": 5.965095855578868e-07, "loss": 0.1376, "step": 175 }, { "epoch": 0.9148797920727745, "grad_norm": 1.7344483137130737, "learning_rate": 5.923630280085947e-07, "loss": 0.1572, "step": 176 }, { "epoch": 0.9200779727095516, "grad_norm": 1.6481879949569702, "learning_rate": 5.882098831289043e-07, "loss": 0.1626, "step": 177 }, { "epoch": 0.9252761533463287, "grad_norm": 1.7318065166473389, "learning_rate": 5.840504471210741e-07, "loss": 0.1756, "step": 178 }, { "epoch": 0.9304743339831059, "grad_norm": 1.676165223121643, "learning_rate": 5.79885016636046e-07, "loss": 0.147, "step": 179 }, { "epoch": 0.935672514619883, "grad_norm": 1.4620646238327026, "learning_rate": 5.757138887522883e-07, "loss": 0.1249, "step": 180 }, { "epoch": 0.9408706952566601, "grad_norm": 1.65927255153656, "learning_rate": 5.71537360954607e-07, "loss": 0.163, "step": 181 }, { "epoch": 0.9460688758934372, "grad_norm": 1.5536587238311768, "learning_rate": 5.673557311129306e-07, "loss": 0.1351, "step": 182 }, { "epoch": 0.9512670565302144, "grad_norm": 1.7076836824417114, "learning_rate": 5.631692974610647e-07, "loss": 0.1771, "step": 183 }, { "epoch": 0.9564652371669915, "grad_norm": 1.4979828596115112, "learning_rate": 5.589783585754231e-07, "loss": 0.121, "step": 184 }, { "epoch": 0.9616634178037686, "grad_norm": 1.5839756727218628, "learning_rate": 5.547832133537327e-07, "loss": 0.1458, "step": 185 }, { "epoch": 0.9668615984405458, "grad_norm": 1.7546137571334839, "learning_rate": 5.505841609937161e-07, "loss": 0.1671, "step": 186 }, { "epoch": 0.9720597790773229, "grad_norm": 1.7105190753936768, "learning_rate": 5.463815009717532e-07, "loss": 0.1314, "step": 187 }, { "epoch": 0.9772579597141, "grad_norm": 1.8557852506637573, "learning_rate": 5.421755330215223e-07, "loss": 0.1794, "step": 188 }, { "epoch": 0.9824561403508771, "grad_norm": 1.569214105606079, "learning_rate": 5.379665571126231e-07, "loss": 0.1307, "step": 189 }, { "epoch": 0.9876543209876543, "grad_norm": 1.6137492656707764, "learning_rate": 5.337548734291826e-07, "loss": 0.1412, "step": 190 }, { "epoch": 0.9928525016244314, "grad_norm": 1.6707996129989624, "learning_rate": 5.295407823484467e-07, "loss": 0.1627, "step": 191 }, { "epoch": 0.9980506822612085, "grad_norm": 2.3229496479034424, "learning_rate": 5.253245844193564e-07, "loss": 0.1872, "step": 192 }, { "epoch": 1.0032488628979856, "grad_norm": 1.4374881982803345, "learning_rate": 5.211065803411134e-07, "loss": 0.1118, "step": 193 }, { "epoch": 1.0084470435347628, "grad_norm": 1.7097264528274536, "learning_rate": 5.168870709417341e-07, "loss": 0.1603, "step": 194 }, { "epoch": 1.01364522417154, "grad_norm": 1.5904935598373413, "learning_rate": 5.126663571565939e-07, "loss": 0.128, "step": 195 }, { "epoch": 1.018843404808317, "grad_norm": 1.5433835983276367, "learning_rate": 5.084447400069654e-07, "loss": 0.1192, "step": 196 }, { "epoch": 1.0240415854450942, "grad_norm": 1.40073561668396, "learning_rate": 5.042225205785492e-07, "loss": 0.1188, "step": 197 }, { "epoch": 1.0292397660818713, "grad_norm": 1.6374619007110596, "learning_rate": 5e-07, "loss": 0.1486, "step": 198 }, { "epoch": 1.0344379467186484, "grad_norm": 1.4800790548324585, "learning_rate": 4.957774794214508e-07, "loss": 0.1297, "step": 199 }, { "epoch": 1.0396361273554255, "grad_norm": 1.5941686630249023, "learning_rate": 4.915552599930345e-07, "loss": 0.1466, "step": 200 }, { "epoch": 1.0448343079922027, "grad_norm": 1.5258111953735352, "learning_rate": 4.873336428434061e-07, "loss": 0.1264, "step": 201 }, { "epoch": 1.0500324886289798, "grad_norm": 1.725998878479004, "learning_rate": 4.831129290582659e-07, "loss": 0.1334, "step": 202 }, { "epoch": 1.055230669265757, "grad_norm": 1.8626656532287598, "learning_rate": 4.788934196588865e-07, "loss": 0.1503, "step": 203 }, { "epoch": 1.060428849902534, "grad_norm": 1.4593480825424194, "learning_rate": 4.746754155806437e-07, "loss": 0.1225, "step": 204 }, { "epoch": 1.0656270305393112, "grad_norm": 1.6284171342849731, "learning_rate": 4.7045921765155337e-07, "loss": 0.1397, "step": 205 }, { "epoch": 1.0708252111760883, "grad_norm": 1.4469428062438965, "learning_rate": 4.662451265708174e-07, "loss": 0.1082, "step": 206 }, { "epoch": 1.0760233918128654, "grad_norm": 1.3986436128616333, "learning_rate": 4.620334428873769e-07, "loss": 0.1025, "step": 207 }, { "epoch": 1.0812215724496426, "grad_norm": 1.6322413682937622, "learning_rate": 4.5782446697847764e-07, "loss": 0.126, "step": 208 }, { "epoch": 1.0864197530864197, "grad_norm": 1.4022878408432007, "learning_rate": 4.536184990282467e-07, "loss": 0.0932, "step": 209 }, { "epoch": 1.0916179337231968, "grad_norm": 1.8021215200424194, "learning_rate": 4.4941583900628393e-07, "loss": 0.1662, "step": 210 }, { "epoch": 1.096816114359974, "grad_norm": 1.806060552597046, "learning_rate": 4.4521678664626745e-07, "loss": 0.1574, "step": 211 }, { "epoch": 1.102014294996751, "grad_norm": 1.7470866441726685, "learning_rate": 4.4102164142457705e-07, "loss": 0.1467, "step": 212 }, { "epoch": 1.1072124756335282, "grad_norm": 1.5723119974136353, "learning_rate": 4.368307025389355e-07, "loss": 0.1084, "step": 213 }, { "epoch": 1.1124106562703053, "grad_norm": 1.5377353429794312, "learning_rate": 4.326442688870696e-07, "loss": 0.116, "step": 214 }, { "epoch": 1.1176088369070825, "grad_norm": 1.7077683210372925, "learning_rate": 4.2846263904539303e-07, "loss": 0.1483, "step": 215 }, { "epoch": 1.1228070175438596, "grad_norm": 1.8333083391189575, "learning_rate": 4.242861112477118e-07, "loss": 0.1527, "step": 216 }, { "epoch": 1.1280051981806367, "grad_norm": 1.5769239664077759, "learning_rate": 4.201149833639539e-07, "loss": 0.121, "step": 217 }, { "epoch": 1.1332033788174138, "grad_norm": 1.919921636581421, "learning_rate": 4.15949552878926e-07, "loss": 0.173, "step": 218 }, { "epoch": 1.138401559454191, "grad_norm": 1.5571967363357544, "learning_rate": 4.117901168710959e-07, "loss": 0.1227, "step": 219 }, { "epoch": 1.143599740090968, "grad_norm": 1.6683669090270996, "learning_rate": 4.0763697199140546e-07, "loss": 0.1422, "step": 220 }, { "epoch": 1.1487979207277452, "grad_norm": 1.6401057243347168, "learning_rate": 4.034904144421134e-07, "loss": 0.1256, "step": 221 }, { "epoch": 1.1539961013645224, "grad_norm": 1.5419056415557861, "learning_rate": 3.9935073995566987e-07, "loss": 0.1302, "step": 222 }, { "epoch": 1.1591942820012995, "grad_norm": 1.650795817375183, "learning_rate": 3.952182437736256e-07, "loss": 0.1471, "step": 223 }, { "epoch": 1.1643924626380766, "grad_norm": 1.672743797302246, "learning_rate": 3.910932206255742e-07, "loss": 0.1298, "step": 224 }, { "epoch": 1.1695906432748537, "grad_norm": 1.6902425289154053, "learning_rate": 3.869759647081325e-07, "loss": 0.1414, "step": 225 }, { "epoch": 1.1747888239116309, "grad_norm": 1.607485055923462, "learning_rate": 3.828667696639589e-07, "loss": 0.1032, "step": 226 }, { "epoch": 1.179987004548408, "grad_norm": 1.5336533784866333, "learning_rate": 3.7876592856081e-07, "loss": 0.1116, "step": 227 }, { "epoch": 1.1851851851851851, "grad_norm": 1.5528396368026733, "learning_rate": 3.7467373387063964e-07, "loss": 0.1243, "step": 228 }, { "epoch": 1.1903833658219622, "grad_norm": 1.743318796157837, "learning_rate": 3.7059047744873955e-07, "loss": 0.1437, "step": 229 }, { "epoch": 1.1955815464587394, "grad_norm": 1.6484525203704834, "learning_rate": 3.665164505129241e-07, "loss": 0.131, "step": 230 }, { "epoch": 1.2007797270955165, "grad_norm": 1.6531956195831299, "learning_rate": 3.6245194362276094e-07, "loss": 0.1268, "step": 231 }, { "epoch": 1.2059779077322936, "grad_norm": 1.491297960281372, "learning_rate": 3.5839724665884795e-07, "loss": 0.1261, "step": 232 }, { "epoch": 1.2111760883690708, "grad_norm": 1.5535728931427002, "learning_rate": 3.5435264880213937e-07, "loss": 0.1233, "step": 233 }, { "epoch": 1.2163742690058479, "grad_norm": 1.6621695756912231, "learning_rate": 3.50318438513321e-07, "loss": 0.1331, "step": 234 }, { "epoch": 1.221572449642625, "grad_norm": 1.5829371213912964, "learning_rate": 3.462949035122376e-07, "loss": 0.1229, "step": 235 }, { "epoch": 1.2267706302794021, "grad_norm": 1.7693301439285278, "learning_rate": 3.4228233075737223e-07, "loss": 0.1434, "step": 236 }, { "epoch": 1.2319688109161793, "grad_norm": 1.6113789081573486, "learning_rate": 3.3828100642538093e-07, "loss": 0.1213, "step": 237 }, { "epoch": 1.2371669915529564, "grad_norm": 1.5799354314804077, "learning_rate": 3.342912158906821e-07, "loss": 0.1191, "step": 238 }, { "epoch": 1.2423651721897335, "grad_norm": 1.5467253923416138, "learning_rate": 3.3031324370510396e-07, "loss": 0.1133, "step": 239 }, { "epoch": 1.2475633528265107, "grad_norm": 1.8147982358932495, "learning_rate": 3.263473735775899e-07, "loss": 0.1391, "step": 240 }, { "epoch": 1.2527615334632878, "grad_norm": 1.702359676361084, "learning_rate": 3.2239388835396484e-07, "loss": 0.1339, "step": 241 }, { "epoch": 1.257959714100065, "grad_norm": 1.7504138946533203, "learning_rate": 3.184530699967627e-07, "loss": 0.1565, "step": 242 }, { "epoch": 1.263157894736842, "grad_norm": 1.7226463556289673, "learning_rate": 3.1452519956511614e-07, "loss": 0.1266, "step": 243 }, { "epoch": 1.2683560753736192, "grad_norm": 1.8186461925506592, "learning_rate": 3.1061055719471197e-07, "loss": 0.1347, "step": 244 }, { "epoch": 1.2735542560103963, "grad_norm": 1.6384614706039429, "learning_rate": 3.0670942207781204e-07, "loss": 0.1115, "step": 245 }, { "epoch": 1.2787524366471734, "grad_norm": 1.7369823455810547, "learning_rate": 3.028220724433408e-07, "loss": 0.129, "step": 246 }, { "epoch": 1.2839506172839505, "grad_norm": 1.6780822277069092, "learning_rate": 2.989487855370421e-07, "loss": 0.1385, "step": 247 }, { "epoch": 1.2891487979207277, "grad_norm": 1.87428879737854, "learning_rate": 2.9508983760170634e-07, "loss": 0.1435, "step": 248 }, { "epoch": 1.2943469785575048, "grad_norm": 1.7940468788146973, "learning_rate": 2.9124550385746856e-07, "loss": 0.1491, "step": 249 }, { "epoch": 1.299545159194282, "grad_norm": 1.712099552154541, "learning_rate": 2.8741605848217976e-07, "loss": 0.131, "step": 250 }, { "epoch": 1.304743339831059, "grad_norm": 1.6154824495315552, "learning_rate": 2.8360177459185263e-07, "loss": 0.1145, "step": 251 }, { "epoch": 1.3099415204678362, "grad_norm": 1.6371185779571533, "learning_rate": 2.7980292422118277e-07, "loss": 0.1232, "step": 252 }, { "epoch": 1.3151397011046133, "grad_norm": 1.8156847953796387, "learning_rate": 2.7601977830414766e-07, "loss": 0.1274, "step": 253 }, { "epoch": 1.3203378817413904, "grad_norm": 1.6596229076385498, "learning_rate": 2.72252606654683e-07, "loss": 0.1168, "step": 254 }, { "epoch": 1.3255360623781676, "grad_norm": 1.6106423139572144, "learning_rate": 2.685016779474396e-07, "loss": 0.1139, "step": 255 }, { "epoch": 1.3307342430149447, "grad_norm": 1.6363728046417236, "learning_rate": 2.6476725969862226e-07, "loss": 0.1297, "step": 256 }, { "epoch": 1.3359324236517218, "grad_norm": 1.4978957176208496, "learning_rate": 2.6104961824690964e-07, "loss": 0.1191, "step": 257 }, { "epoch": 1.341130604288499, "grad_norm": 1.5889379978179932, "learning_rate": 2.5734901873445956e-07, "loss": 0.1236, "step": 258 }, { "epoch": 1.346328784925276, "grad_norm": 1.534178376197815, "learning_rate": 2.536657250879988e-07, "loss": 0.1053, "step": 259 }, { "epoch": 1.3515269655620532, "grad_norm": 1.8409833908081055, "learning_rate": 2.500000000000001e-07, "loss": 0.1427, "step": 260 }, { "epoch": 1.3567251461988303, "grad_norm": 1.7446589469909668, "learning_rate": 2.4635210490994647e-07, "loss": 0.1194, "step": 261 }, { "epoch": 1.3619233268356075, "grad_norm": 1.7886688709259033, "learning_rate": 2.427222999856857e-07, "loss": 0.1351, "step": 262 }, { "epoch": 1.3671215074723846, "grad_norm": 1.6462031602859497, "learning_rate": 2.391108441048753e-07, "loss": 0.1249, "step": 263 }, { "epoch": 1.3723196881091617, "grad_norm": 1.8700019121170044, "learning_rate": 2.355179948365189e-07, "loss": 0.1482, "step": 264 }, { "epoch": 1.3775178687459388, "grad_norm": 1.8244132995605469, "learning_rate": 2.3194400842259687e-07, "loss": 0.134, "step": 265 }, { "epoch": 1.382716049382716, "grad_norm": 1.8189918994903564, "learning_rate": 2.283891397597908e-07, "loss": 0.1258, "step": 266 }, { "epoch": 1.387914230019493, "grad_norm": 1.5552480220794678, "learning_rate": 2.2485364238130433e-07, "loss": 0.1131, "step": 267 }, { "epoch": 1.3931124106562702, "grad_norm": 1.659328579902649, "learning_rate": 2.2133776843878183e-07, "loss": 0.1119, "step": 268 }, { "epoch": 1.3983105912930474, "grad_norm": 1.3867595195770264, "learning_rate": 2.1784176868432375e-07, "loss": 0.0851, "step": 269 }, { "epoch": 1.4035087719298245, "grad_norm": 1.7647099494934082, "learning_rate": 2.1436589245260372e-07, "loss": 0.1158, "step": 270 }, { "epoch": 1.4087069525666016, "grad_norm": 1.8926544189453125, "learning_rate": 2.109103876430864e-07, "loss": 0.1527, "step": 271 }, { "epoch": 1.4139051332033787, "grad_norm": 1.839390516281128, "learning_rate": 2.074755007023461e-07, "loss": 0.1108, "step": 272 }, { "epoch": 1.4191033138401559, "grad_norm": 1.797500729560852, "learning_rate": 2.040614766064913e-07, "loss": 0.1508, "step": 273 }, { "epoch": 1.424301494476933, "grad_norm": 1.7864497900009155, "learning_rate": 2.0066855884369243e-07, "loss": 0.1242, "step": 274 }, { "epoch": 1.4294996751137101, "grad_norm": 1.853615641593933, "learning_rate": 1.9729698939681644e-07, "loss": 0.122, "step": 275 }, { "epoch": 1.4346978557504872, "grad_norm": 1.6054631471633911, "learning_rate": 1.9394700872616853e-07, "loss": 0.1212, "step": 276 }, { "epoch": 1.4398960363872644, "grad_norm": 1.632055640220642, "learning_rate": 1.906188557523427e-07, "loss": 0.1101, "step": 277 }, { "epoch": 1.4450942170240415, "grad_norm": 1.6560664176940918, "learning_rate": 1.873127678391816e-07, "loss": 0.1217, "step": 278 }, { "epoch": 1.4502923976608186, "grad_norm": 1.4159197807312012, "learning_rate": 1.8402898077684803e-07, "loss": 0.0988, "step": 279 }, { "epoch": 1.4554905782975958, "grad_norm": 1.6314151287078857, "learning_rate": 1.8076772876500828e-07, "loss": 0.1293, "step": 280 }, { "epoch": 1.4606887589343729, "grad_norm": 1.7430942058563232, "learning_rate": 1.775292443961291e-07, "loss": 0.1401, "step": 281 }, { "epoch": 1.46588693957115, "grad_norm": 1.7857812643051147, "learning_rate": 1.7431375863888898e-07, "loss": 0.1275, "step": 282 }, { "epoch": 1.4710851202079271, "grad_norm": 1.5308443307876587, "learning_rate": 1.7112150082170568e-07, "loss": 0.1061, "step": 283 }, { "epoch": 1.4762833008447043, "grad_norm": 1.6288737058639526, "learning_rate": 1.679526986163804e-07, "loss": 0.1119, "step": 284 }, { "epoch": 1.4814814814814814, "grad_norm": 1.704690933227539, "learning_rate": 1.6480757802186068e-07, "loss": 0.1166, "step": 285 }, { "epoch": 1.4866796621182585, "grad_norm": 1.5033763647079468, "learning_rate": 1.6168636334812125e-07, "loss": 0.1045, "step": 286 }, { "epoch": 1.4918778427550357, "grad_norm": 1.4401872158050537, "learning_rate": 1.5858927720016706e-07, "loss": 0.0959, "step": 287 }, { "epoch": 1.4970760233918128, "grad_norm": 1.6706205606460571, "learning_rate": 1.555165404621567e-07, "loss": 0.1124, "step": 288 }, { "epoch": 1.50227420402859, "grad_norm": 1.8483508825302124, "learning_rate": 1.5246837228164905e-07, "loss": 0.1146, "step": 289 }, { "epoch": 1.507472384665367, "grad_norm": 1.8255398273468018, "learning_rate": 1.494449900539737e-07, "loss": 0.1413, "step": 290 }, { "epoch": 1.5126705653021442, "grad_norm": 1.8132373094558716, "learning_rate": 1.4644660940672627e-07, "loss": 0.1316, "step": 291 }, { "epoch": 1.5178687459389213, "grad_norm": 1.7580801248550415, "learning_rate": 1.434734441843899e-07, "loss": 0.1252, "step": 292 }, { "epoch": 1.5230669265756984, "grad_norm": 1.4802451133728027, "learning_rate": 1.4052570643308375e-07, "loss": 0.1087, "step": 293 }, { "epoch": 1.5282651072124755, "grad_norm": 1.595434308052063, "learning_rate": 1.376036063854401e-07, "loss": 0.1063, "step": 294 }, { "epoch": 1.5334632878492527, "grad_norm": 1.5052251815795898, "learning_rate": 1.3470735244561027e-07, "loss": 0.1071, "step": 295 }, { "epoch": 1.5386614684860298, "grad_norm": 1.4966932535171509, "learning_rate": 1.3183715117440142e-07, "loss": 0.1003, "step": 296 }, { "epoch": 1.543859649122807, "grad_norm": 1.5577093362808228, "learning_rate": 1.2899320727454472e-07, "loss": 0.1147, "step": 297 }, { "epoch": 1.549057829759584, "grad_norm": 2.081566572189331, "learning_rate": 1.2617572357609562e-07, "loss": 0.1479, "step": 298 }, { "epoch": 1.5542560103963612, "grad_norm": 1.5348504781723022, "learning_rate": 1.2338490102196825e-07, "loss": 0.1061, "step": 299 }, { "epoch": 1.5594541910331383, "grad_norm": 1.7641793489456177, "learning_rate": 1.2062093865360457e-07, "loss": 0.1359, "step": 300 }, { "epoch": 1.5646523716699154, "grad_norm": 1.5747112035751343, "learning_rate": 1.1788403359677767e-07, "loss": 0.1069, "step": 301 }, { "epoch": 1.5698505523066926, "grad_norm": 1.6333017349243164, "learning_rate": 1.1517438104753385e-07, "loss": 0.1077, "step": 302 }, { "epoch": 1.5750487329434697, "grad_norm": 1.5666186809539795, "learning_rate": 1.1249217425827062e-07, "loss": 0.1118, "step": 303 }, { "epoch": 1.5802469135802468, "grad_norm": 1.5051664113998413, "learning_rate": 1.0983760452395413e-07, "loss": 0.1043, "step": 304 }, { "epoch": 1.585445094217024, "grad_norm": 1.7494423389434814, "learning_rate": 1.07210861168476e-07, "loss": 0.1327, "step": 305 }, { "epoch": 1.590643274853801, "grad_norm": 1.5114164352416992, "learning_rate": 1.0461213153115079e-07, "loss": 0.0938, "step": 306 }, { "epoch": 1.5958414554905782, "grad_norm": 1.562703251838684, "learning_rate": 1.0204160095335479e-07, "loss": 0.1056, "step": 307 }, { "epoch": 1.6010396361273553, "grad_norm": 1.6739459037780762, "learning_rate": 9.94994527653078e-08, "loss": 0.1223, "step": 308 }, { "epoch": 1.6062378167641325, "grad_norm": 1.6659190654754639, "learning_rate": 9.69858682729976e-08, "loss": 0.1201, "step": 309 }, { "epoch": 1.6114359974009096, "grad_norm": 1.6198205947875977, "learning_rate": 9.45010267452495e-08, "loss": 0.1041, "step": 310 }, { "epoch": 1.6166341780376867, "grad_norm": 1.6569786071777344, "learning_rate": 9.204510540094095e-08, "loss": 0.1153, "step": 311 }, { "epoch": 1.6218323586744638, "grad_norm": 1.6825376749038696, "learning_rate": 8.961827939636196e-08, "loss": 0.1195, "step": 312 }, { "epoch": 1.627030539311241, "grad_norm": 1.6692954301834106, "learning_rate": 8.722072181272311e-08, "loss": 0.14, "step": 313 }, { "epoch": 1.632228719948018, "grad_norm": 1.86336350440979, "learning_rate": 8.485260364381186e-08, "loss": 0.1154, "step": 314 }, { "epoch": 1.6374269005847952, "grad_norm": 1.6375104188919067, "learning_rate": 8.251409378379637e-08, "loss": 0.1087, "step": 315 }, { "epoch": 1.6426250812215724, "grad_norm": 1.6260446310043335, "learning_rate": 8.02053590151805e-08, "loss": 0.1099, "step": 316 }, { "epoch": 1.6478232618583495, "grad_norm": 1.7146011590957642, "learning_rate": 7.792656399690922e-08, "loss": 0.1167, "step": 317 }, { "epoch": 1.6530214424951266, "grad_norm": 1.6423556804656982, "learning_rate": 7.567787125262449e-08, "loss": 0.1171, "step": 318 }, { "epoch": 1.6582196231319037, "grad_norm": 1.5510940551757812, "learning_rate": 7.345944115907421e-08, "loss": 0.1013, "step": 319 }, { "epoch": 1.6634178037686809, "grad_norm": 1.863503336906433, "learning_rate": 7.127143193467445e-08, "loss": 0.1423, "step": 320 }, { "epoch": 1.668615984405458, "grad_norm": 1.7284936904907227, "learning_rate": 6.911399962822518e-08, "loss": 0.1112, "step": 321 }, { "epoch": 1.6738141650422351, "grad_norm": 1.7205549478530884, "learning_rate": 6.698729810778064e-08, "loss": 0.136, "step": 322 }, { "epoch": 1.6790123456790123, "grad_norm": 1.4762628078460693, "learning_rate": 6.48914790496759e-08, "loss": 0.1144, "step": 323 }, { "epoch": 1.6842105263157894, "grad_norm": 1.8362925052642822, "learning_rate": 6.282669192770895e-08, "loss": 0.1328, "step": 324 }, { "epoch": 1.6894087069525665, "grad_norm": 1.4527249336242676, "learning_rate": 6.079308400248029e-08, "loss": 0.1055, "step": 325 }, { "epoch": 1.6946068875893436, "grad_norm": 1.7803164720535278, "learning_rate": 5.8790800310890456e-08, "loss": 0.1451, "step": 326 }, { "epoch": 1.6998050682261208, "grad_norm": 2.059589147567749, "learning_rate": 5.6819983655795936e-08, "loss": 0.1458, "step": 327 }, { "epoch": 1.705003248862898, "grad_norm": 1.7053784132003784, "learning_rate": 5.4880774595824245e-08, "loss": 0.1257, "step": 328 }, { "epoch": 1.710201429499675, "grad_norm": 1.97287118434906, "learning_rate": 5.297331143534972e-08, "loss": 0.1381, "step": 329 }, { "epoch": 1.7153996101364521, "grad_norm": 1.6465667486190796, "learning_rate": 5.109773021462921e-08, "loss": 0.1155, "step": 330 }, { "epoch": 1.7205977907732293, "grad_norm": 1.6381739377975464, "learning_rate": 4.925416470009991e-08, "loss": 0.1224, "step": 331 }, { "epoch": 1.7257959714100064, "grad_norm": 1.5278061628341675, "learning_rate": 4.744274637483936e-08, "loss": 0.1203, "step": 332 }, { "epoch": 1.7309941520467835, "grad_norm": 1.7712794542312622, "learning_rate": 4.566360442918754e-08, "loss": 0.1334, "step": 333 }, { "epoch": 1.7361923326835607, "grad_norm": 1.912316083908081, "learning_rate": 4.3916865751533306e-08, "loss": 0.1475, "step": 334 }, { "epoch": 1.7413905133203378, "grad_norm": 1.7500931024551392, "learning_rate": 4.220265491926489e-08, "loss": 0.1165, "step": 335 }, { "epoch": 1.746588693957115, "grad_norm": 1.7582532167434692, "learning_rate": 4.0521094189884696e-08, "loss": 0.119, "step": 336 }, { "epoch": 1.751786874593892, "grad_norm": 1.6794097423553467, "learning_rate": 3.887230349229015e-08, "loss": 0.1093, "step": 337 }, { "epoch": 1.7569850552306692, "grad_norm": 1.797913908958435, "learning_rate": 3.7256400418220256e-08, "loss": 0.1235, "step": 338 }, { "epoch": 1.7621832358674463, "grad_norm": 1.6042097806930542, "learning_rate": 3.567350021386895e-08, "loss": 0.096, "step": 339 }, { "epoch": 1.7673814165042234, "grad_norm": 1.7623355388641357, "learning_rate": 3.412371577166578e-08, "loss": 0.1153, "step": 340 }, { "epoch": 1.7725795971410006, "grad_norm": 1.737052321434021, "learning_rate": 3.260715762222449e-08, "loss": 0.1327, "step": 341 }, { "epoch": 1.7777777777777777, "grad_norm": 1.61103355884552, "learning_rate": 3.1123933926459845e-08, "loss": 0.1213, "step": 342 }, { "epoch": 1.7829759584145548, "grad_norm": 1.78020441532135, "learning_rate": 2.9674150467873527e-08, "loss": 0.1262, "step": 343 }, { "epoch": 1.788174139051332, "grad_norm": 1.7625352144241333, "learning_rate": 2.825791064500993e-08, "loss": 0.1465, "step": 344 }, { "epoch": 1.793372319688109, "grad_norm": 1.7474111318588257, "learning_rate": 2.6875315464081562e-08, "loss": 0.1289, "step": 345 }, { "epoch": 1.7985705003248862, "grad_norm": 1.7776579856872559, "learning_rate": 2.5526463531765463e-08, "loss": 0.1219, "step": 346 }, { "epoch": 1.8037686809616633, "grad_norm": 1.7461159229278564, "learning_rate": 2.4211451048170296e-08, "loss": 0.1361, "step": 347 }, { "epoch": 1.8089668615984404, "grad_norm": 1.701060175895691, "learning_rate": 2.293037179997559e-08, "loss": 0.1463, "step": 348 }, { "epoch": 1.8141650422352176, "grad_norm": 1.7974237203598022, "learning_rate": 2.1683317153742775e-08, "loss": 0.14, "step": 349 }, { "epoch": 1.8193632228719947, "grad_norm": 1.5575218200683594, "learning_rate": 2.047037604939894e-08, "loss": 0.1234, "step": 350 }, { "epoch": 1.8245614035087718, "grad_norm": 1.6900442838668823, "learning_rate": 1.92916349938938e-08, "loss": 0.1119, "step": 351 }, { "epoch": 1.829759584145549, "grad_norm": 1.7725763320922852, "learning_rate": 1.8147178055029577e-08, "loss": 0.1456, "step": 352 }, { "epoch": 1.834957764782326, "grad_norm": 1.677262544631958, "learning_rate": 1.7037086855465898e-08, "loss": 0.1041, "step": 353 }, { "epoch": 1.8401559454191032, "grad_norm": 1.5990703105926514, "learning_rate": 1.596144056689791e-08, "loss": 0.1149, "step": 354 }, { "epoch": 1.8453541260558803, "grad_norm": 2.083341598510742, "learning_rate": 1.4920315904410064e-08, "loss": 0.1375, "step": 355 }, { "epoch": 1.8505523066926575, "grad_norm": 1.7379205226898193, "learning_rate": 1.3913787121004716e-08, "loss": 0.1365, "step": 356 }, { "epoch": 1.8557504873294346, "grad_norm": 1.6175756454467773, "learning_rate": 1.2941926002306536e-08, "loss": 0.1348, "step": 357 }, { "epoch": 1.8609486679662117, "grad_norm": 1.7680426836013794, "learning_rate": 1.200480186144237e-08, "loss": 0.1232, "step": 358 }, { "epoch": 1.8661468486029889, "grad_norm": 1.3798372745513916, "learning_rate": 1.1102481534098374e-08, "loss": 0.0966, "step": 359 }, { "epoch": 1.871345029239766, "grad_norm": 1.584373950958252, "learning_rate": 1.0235029373752757e-08, "loss": 0.1082, "step": 360 }, { "epoch": 1.876543209876543, "grad_norm": 1.7235833406448364, "learning_rate": 9.402507247086578e-09, "loss": 0.1348, "step": 361 }, { "epoch": 1.8817413905133202, "grad_norm": 1.6077182292938232, "learning_rate": 8.60497452957104e-09, "loss": 0.1234, "step": 362 }, { "epoch": 1.8869395711500974, "grad_norm": 1.5776941776275635, "learning_rate": 7.842488101232891e-09, "loss": 0.1177, "step": 363 }, { "epoch": 1.8921377517868745, "grad_norm": 1.5909714698791504, "learning_rate": 7.115102342598101e-09, "loss": 0.1133, "step": 364 }, { "epoch": 1.8973359324236516, "grad_norm": 1.7370619773864746, "learning_rate": 6.422869130812913e-09, "loss": 0.1269, "step": 365 }, { "epoch": 1.9025341130604287, "grad_norm": 1.7405683994293213, "learning_rate": 5.765837835944309e-09, "loss": 0.1307, "step": 366 }, { "epoch": 1.9077322936972059, "grad_norm": 1.9631272554397583, "learning_rate": 5.144055317458817e-09, "loss": 0.1546, "step": 367 }, { "epoch": 1.912930474333983, "grad_norm": 1.6291249990463257, "learning_rate": 4.55756592088058e-09, "loss": 0.1249, "step": 368 }, { "epoch": 1.9181286549707601, "grad_norm": 1.528748869895935, "learning_rate": 4.0064114746284905e-09, "loss": 0.1027, "step": 369 }, { "epoch": 1.9233268356075373, "grad_norm": 1.8311703205108643, "learning_rate": 3.4906312870331965e-09, "loss": 0.1128, "step": 370 }, { "epoch": 1.9285250162443144, "grad_norm": 1.8863190412521362, "learning_rate": 3.010262143533393e-09, "loss": 0.1441, "step": 371 }, { "epoch": 1.9337231968810915, "grad_norm": 1.5785506963729858, "learning_rate": 2.5653383040524224e-09, "loss": 0.1198, "step": 372 }, { "epoch": 1.9389213775178686, "grad_norm": 1.6531542539596558, "learning_rate": 2.155891500554896e-09, "loss": 0.1224, "step": 373 }, { "epoch": 1.9441195581546458, "grad_norm": 1.5542500019073486, "learning_rate": 1.7819509347835049e-09, "loss": 0.1149, "step": 374 }, { "epoch": 1.949317738791423, "grad_norm": 1.5082885026931763, "learning_rate": 1.4435432761762955e-09, "loss": 0.1061, "step": 375 }, { "epoch": 1.9545159194282, "grad_norm": 1.5856765508651733, "learning_rate": 1.1406926599646372e-09, "loss": 0.1327, "step": 376 }, { "epoch": 1.9597141000649771, "grad_norm": 1.6566141843795776, "learning_rate": 8.73420685452042e-10, "loss": 0.1105, "step": 377 }, { "epoch": 1.9649122807017543, "grad_norm": 1.7047754526138306, "learning_rate": 6.417464144736207e-10, "loss": 0.1301, "step": 378 }, { "epoch": 1.9701104613385314, "grad_norm": 1.7699244022369385, "learning_rate": 4.4568637003633556e-10, "loss": 0.1412, "step": 379 }, { "epoch": 1.9753086419753085, "grad_norm": 1.8177380561828613, "learning_rate": 2.852545351409996e-10, "loss": 0.1571, "step": 380 }, { "epoch": 1.9805068226120857, "grad_norm": 1.5334748029708862, "learning_rate": 1.6046235178474033e-10, "loss": 0.104, "step": 381 }, { "epoch": 1.9857050032488628, "grad_norm": 1.7260781526565552, "learning_rate": 7.13187201450971e-11, "loss": 0.1263, "step": 382 }, { "epoch": 1.99090318388564, "grad_norm": 1.6607623100280762, "learning_rate": 1.7829997945084662e-11, "loss": 0.1135, "step": 383 }, { "epoch": 1.996101364522417, "grad_norm": 1.7004939317703247, "learning_rate": 0.0, "loss": 0.1315, "step": 384 }, { "epoch": 1.996101364522417, "step": 384, "total_flos": 3.0550374682743276e+18, "train_loss": 0.19105235013800362, "train_runtime": 6828.3197, "train_samples_per_second": 7.211, "train_steps_per_second": 0.056 } ], "logging_steps": 1.0, "max_steps": 384, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0550374682743276e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }