{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.40431266846361186, "eval_steps": 38, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026954177897574125, "grad_norm": 0.4888671256680066, "learning_rate": 2.5e-06, "loss": 2.6499, "step": 1 }, { "epoch": 0.0026954177897574125, "eval_loss": 2.7584831714630127, "eval_runtime": 108.9254, "eval_samples_per_second": 0.734, "eval_steps_per_second": 0.184, "step": 1 }, { "epoch": 0.005390835579514825, "grad_norm": 0.4983074714711494, "learning_rate": 5e-06, "loss": 2.5992, "step": 2 }, { "epoch": 0.008086253369272238, "grad_norm": 0.7241191448700953, "learning_rate": 7.5e-06, "loss": 2.8753, "step": 3 }, { "epoch": 0.01078167115902965, "grad_norm": 0.5260744264207279, "learning_rate": 1e-05, "loss": 2.6477, "step": 4 }, { "epoch": 0.013477088948787063, "grad_norm": 0.36639797120190076, "learning_rate": 1.25e-05, "loss": 2.5086, "step": 5 }, { "epoch": 0.016172506738544475, "grad_norm": 0.7809079947872807, "learning_rate": 1.5e-05, "loss": 2.8346, "step": 6 }, { "epoch": 0.018867924528301886, "grad_norm": 0.5980534435082484, "learning_rate": 1.75e-05, "loss": 2.9624, "step": 7 }, { "epoch": 0.0215633423180593, "grad_norm": 0.5798622117264719, "learning_rate": 2e-05, "loss": 2.8441, "step": 8 }, { "epoch": 0.02425876010781671, "grad_norm": 0.7528222488078259, "learning_rate": 2.25e-05, "loss": 2.6867, "step": 9 }, { "epoch": 0.026954177897574125, "grad_norm": 0.612852819847163, "learning_rate": 2.5e-05, "loss": 2.7927, "step": 10 }, { "epoch": 0.029649595687331536, "grad_norm": 0.6397459055001692, "learning_rate": 2.7500000000000004e-05, "loss": 2.6072, "step": 11 }, { "epoch": 0.03234501347708895, "grad_norm": 0.6598580504868401, "learning_rate": 3e-05, "loss": 2.5274, "step": 12 }, { "epoch": 0.03504043126684636, "grad_norm": 0.30737376780003156, "learning_rate": 3.2500000000000004e-05, "loss": 2.5507, "step": 13 }, { "epoch": 0.03773584905660377, "grad_norm": 0.14992908808228841, "learning_rate": 3.5e-05, "loss": 2.4811, "step": 14 }, { "epoch": 0.04043126684636118, "grad_norm": 0.22421462173093706, "learning_rate": 3.7500000000000003e-05, "loss": 2.6159, "step": 15 }, { "epoch": 0.0431266846361186, "grad_norm": 0.16819168085460345, "learning_rate": 4e-05, "loss": 2.6193, "step": 16 }, { "epoch": 0.04582210242587601, "grad_norm": 0.12218717106853527, "learning_rate": 4.25e-05, "loss": 2.7094, "step": 17 }, { "epoch": 0.04851752021563342, "grad_norm": 0.1145865395458764, "learning_rate": 4.5e-05, "loss": 2.6314, "step": 18 }, { "epoch": 0.05121293800539083, "grad_norm": 0.08821545171076035, "learning_rate": 4.75e-05, "loss": 2.4876, "step": 19 }, { "epoch": 0.05390835579514825, "grad_norm": 0.10203760270867333, "learning_rate": 5e-05, "loss": 2.622, "step": 20 }, { "epoch": 0.05660377358490566, "grad_norm": 0.07748067981259345, "learning_rate": 4.9999098771046674e-05, "loss": 2.4191, "step": 21 }, { "epoch": 0.05929919137466307, "grad_norm": 0.13225530856023926, "learning_rate": 4.999639515638348e-05, "loss": 2.642, "step": 22 }, { "epoch": 0.06199460916442048, "grad_norm": 0.07588900512747042, "learning_rate": 4.999188937259492e-05, "loss": 2.2706, "step": 23 }, { "epoch": 0.0646900269541779, "grad_norm": 0.12017105244364508, "learning_rate": 4.998558178063592e-05, "loss": 2.6609, "step": 24 }, { "epoch": 0.0673854447439353, "grad_norm": 0.13885153935053832, "learning_rate": 4.9977472885802876e-05, "loss": 2.3222, "step": 25 }, { "epoch": 0.07008086253369272, "grad_norm": 0.08273500402204699, "learning_rate": 4.996756333769319e-05, "loss": 2.2653, "step": 26 }, { "epoch": 0.07277628032345014, "grad_norm": 0.08128820734533522, "learning_rate": 4.995585393015324e-05, "loss": 2.468, "step": 27 }, { "epoch": 0.07547169811320754, "grad_norm": 0.11425098408554267, "learning_rate": 4.9942345601214764e-05, "loss": 2.0993, "step": 28 }, { "epoch": 0.07816711590296496, "grad_norm": 0.10026403239718774, "learning_rate": 4.992703943301973e-05, "loss": 2.4194, "step": 29 }, { "epoch": 0.08086253369272237, "grad_norm": 0.0892349090753594, "learning_rate": 4.990993665173364e-05, "loss": 2.5426, "step": 30 }, { "epoch": 0.08355795148247978, "grad_norm": 0.08905031867375157, "learning_rate": 4.989103862744732e-05, "loss": 2.4458, "step": 31 }, { "epoch": 0.0862533692722372, "grad_norm": 0.08382151700226004, "learning_rate": 4.987034687406713e-05, "loss": 2.4005, "step": 32 }, { "epoch": 0.0889487870619946, "grad_norm": 0.07624420840782331, "learning_rate": 4.984786304919372e-05, "loss": 2.3028, "step": 33 }, { "epoch": 0.09164420485175202, "grad_norm": 0.08195304711308972, "learning_rate": 4.9823588953989216e-05, "loss": 2.2564, "step": 34 }, { "epoch": 0.09433962264150944, "grad_norm": 0.08199393883574592, "learning_rate": 4.9797526533032945e-05, "loss": 2.325, "step": 35 }, { "epoch": 0.09703504043126684, "grad_norm": 0.08525060108567321, "learning_rate": 4.976967787416565e-05, "loss": 2.08, "step": 36 }, { "epoch": 0.09973045822102426, "grad_norm": 0.08829832243395126, "learning_rate": 4.974004520832223e-05, "loss": 2.3272, "step": 37 }, { "epoch": 0.10242587601078167, "grad_norm": 0.0805993575215856, "learning_rate": 4.970863090935304e-05, "loss": 2.4375, "step": 38 }, { "epoch": 0.10242587601078167, "eval_loss": 2.4220809936523438, "eval_runtime": 109.7313, "eval_samples_per_second": 0.729, "eval_steps_per_second": 0.182, "step": 38 }, { "epoch": 0.10512129380053908, "grad_norm": 0.07362914307313682, "learning_rate": 4.967543749383371e-05, "loss": 2.5666, "step": 39 }, { "epoch": 0.1078167115902965, "grad_norm": 0.07781163135663244, "learning_rate": 4.9640467620863526e-05, "loss": 2.3806, "step": 40 }, { "epoch": 0.1105121293800539, "grad_norm": 0.07965171973675265, "learning_rate": 4.9603724091852456e-05, "loss": 2.3914, "step": 41 }, { "epoch": 0.11320754716981132, "grad_norm": 0.0769658349876996, "learning_rate": 4.95652098502967e-05, "loss": 2.3622, "step": 42 }, { "epoch": 0.11590296495956873, "grad_norm": 0.07123531103603928, "learning_rate": 4.952492798154287e-05, "loss": 2.4497, "step": 43 }, { "epoch": 0.11859838274932614, "grad_norm": 0.08376895847589426, "learning_rate": 4.948288171254089e-05, "loss": 2.2966, "step": 44 }, { "epoch": 0.12129380053908356, "grad_norm": 0.06991588614283951, "learning_rate": 4.9439074411585406e-05, "loss": 2.6397, "step": 45 }, { "epoch": 0.12398921832884097, "grad_norm": 0.07115181837614266, "learning_rate": 4.9393509588046036e-05, "loss": 2.3112, "step": 46 }, { "epoch": 0.12668463611859837, "grad_norm": 0.12069809526368983, "learning_rate": 4.9346190892086174e-05, "loss": 2.6053, "step": 47 }, { "epoch": 0.1293800539083558, "grad_norm": 0.08358966882474354, "learning_rate": 4.9297122114370596e-05, "loss": 2.3594, "step": 48 }, { "epoch": 0.1320754716981132, "grad_norm": 0.07439368303284428, "learning_rate": 4.9246307185761815e-05, "loss": 2.3608, "step": 49 }, { "epoch": 0.1347708894878706, "grad_norm": 0.07077366564222114, "learning_rate": 4.919375017700515e-05, "loss": 2.1389, "step": 50 }, { "epoch": 0.13746630727762804, "grad_norm": 0.0786367422272597, "learning_rate": 4.9139455298402656e-05, "loss": 2.3726, "step": 51 }, { "epoch": 0.14016172506738545, "grad_norm": 0.07772287420711031, "learning_rate": 4.908342689947581e-05, "loss": 2.6495, "step": 52 }, { "epoch": 0.14285714285714285, "grad_norm": 0.09095569172141003, "learning_rate": 4.902566946861708e-05, "loss": 2.3952, "step": 53 }, { "epoch": 0.14555256064690028, "grad_norm": 0.08022908718012228, "learning_rate": 4.8966187632730356e-05, "loss": 2.2133, "step": 54 }, { "epoch": 0.14824797843665768, "grad_norm": 0.08853617116138855, "learning_rate": 4.8904986156860346e-05, "loss": 2.2795, "step": 55 }, { "epoch": 0.1509433962264151, "grad_norm": 0.08155234218395907, "learning_rate": 4.884206994381078e-05, "loss": 2.6322, "step": 56 }, { "epoch": 0.15363881401617252, "grad_norm": 0.07017826282717579, "learning_rate": 4.877744403375168e-05, "loss": 2.36, "step": 57 }, { "epoch": 0.15633423180592992, "grad_norm": 0.07787943384594204, "learning_rate": 4.871111360381562e-05, "loss": 2.4609, "step": 58 }, { "epoch": 0.15902964959568733, "grad_norm": 0.074730895279447, "learning_rate": 4.864308396768294e-05, "loss": 2.3737, "step": 59 }, { "epoch": 0.16172506738544473, "grad_norm": 0.0754808721924062, "learning_rate": 4.857336057515611e-05, "loss": 2.4353, "step": 60 }, { "epoch": 0.16442048517520216, "grad_norm": 0.17074429587919648, "learning_rate": 4.8501949011723144e-05, "loss": 2.3747, "step": 61 }, { "epoch": 0.16711590296495957, "grad_norm": 0.12258858526490496, "learning_rate": 4.842885499811012e-05, "loss": 2.3253, "step": 62 }, { "epoch": 0.16981132075471697, "grad_norm": 0.08223383099932692, "learning_rate": 4.835408438982294e-05, "loss": 2.5312, "step": 63 }, { "epoch": 0.1725067385444744, "grad_norm": 0.11131907859034289, "learning_rate": 4.827764317667825e-05, "loss": 2.6191, "step": 64 }, { "epoch": 0.1752021563342318, "grad_norm": 0.06793561987591092, "learning_rate": 4.8199537482323545e-05, "loss": 2.4681, "step": 65 }, { "epoch": 0.1778975741239892, "grad_norm": 0.07850381171278048, "learning_rate": 4.811977356374667e-05, "loss": 2.4868, "step": 66 }, { "epoch": 0.18059299191374664, "grad_norm": 0.08512994202655803, "learning_rate": 4.803835781077455e-05, "loss": 2.6481, "step": 67 }, { "epoch": 0.18328840970350405, "grad_norm": 0.0846518039690973, "learning_rate": 4.795529674556129e-05, "loss": 2.4295, "step": 68 }, { "epoch": 0.18598382749326145, "grad_norm": 0.08031185133795576, "learning_rate": 4.7870597022065724e-05, "loss": 2.4132, "step": 69 }, { "epoch": 0.18867924528301888, "grad_norm": 0.11316780608440431, "learning_rate": 4.7784265425518365e-05, "loss": 2.4341, "step": 70 }, { "epoch": 0.19137466307277629, "grad_norm": 0.0781705228710772, "learning_rate": 4.769630887187782e-05, "loss": 2.4258, "step": 71 }, { "epoch": 0.1940700808625337, "grad_norm": 0.09475565364886825, "learning_rate": 4.760673440727678e-05, "loss": 2.6535, "step": 72 }, { "epoch": 0.1967654986522911, "grad_norm": 0.0936468075339049, "learning_rate": 4.751554920745755e-05, "loss": 2.4902, "step": 73 }, { "epoch": 0.19946091644204852, "grad_norm": 0.07028228522733242, "learning_rate": 4.7422760577197226e-05, "loss": 2.3132, "step": 74 }, { "epoch": 0.20215633423180593, "grad_norm": 0.087359632341199, "learning_rate": 4.7328375949722476e-05, "loss": 2.3678, "step": 75 }, { "epoch": 0.20485175202156333, "grad_norm": 0.07567735863088498, "learning_rate": 4.72324028861141e-05, "loss": 2.5361, "step": 76 }, { "epoch": 0.20485175202156333, "eval_loss": 2.409113645553589, "eval_runtime": 109.4094, "eval_samples_per_second": 0.731, "eval_steps_per_second": 0.183, "step": 76 }, { "epoch": 0.20754716981132076, "grad_norm": 0.0806053756655779, "learning_rate": 4.713484907470133e-05, "loss": 2.4361, "step": 77 }, { "epoch": 0.21024258760107817, "grad_norm": 0.08462062433959809, "learning_rate": 4.7035722330445857e-05, "loss": 2.5358, "step": 78 }, { "epoch": 0.21293800539083557, "grad_norm": 0.08211420166838714, "learning_rate": 4.693503059431588e-05, "loss": 2.3083, "step": 79 }, { "epoch": 0.215633423180593, "grad_norm": 0.07226082893047459, "learning_rate": 4.6832781932649884e-05, "loss": 2.468, "step": 80 }, { "epoch": 0.2183288409703504, "grad_norm": 0.08491650207069548, "learning_rate": 4.6728984536510454e-05, "loss": 2.436, "step": 81 }, { "epoch": 0.2210242587601078, "grad_norm": 0.07893280409193093, "learning_rate": 4.662364672102817e-05, "loss": 2.3986, "step": 82 }, { "epoch": 0.22371967654986524, "grad_norm": 0.08530475722783892, "learning_rate": 4.651677692473538e-05, "loss": 2.5171, "step": 83 }, { "epoch": 0.22641509433962265, "grad_norm": 0.0782081304914248, "learning_rate": 4.640838370889029e-05, "loss": 2.4167, "step": 84 }, { "epoch": 0.22911051212938005, "grad_norm": 0.09010839515008441, "learning_rate": 4.629847575679107e-05, "loss": 2.3621, "step": 85 }, { "epoch": 0.23180592991913745, "grad_norm": 0.08429756465698064, "learning_rate": 4.6187061873080264e-05, "loss": 2.5409, "step": 86 }, { "epoch": 0.23450134770889489, "grad_norm": 0.08174597825553892, "learning_rate": 4.607415098303945e-05, "loss": 2.3467, "step": 87 }, { "epoch": 0.2371967654986523, "grad_norm": 0.08048024256086499, "learning_rate": 4.5959752131874263e-05, "loss": 2.5402, "step": 88 }, { "epoch": 0.2398921832884097, "grad_norm": 0.07839092789606332, "learning_rate": 4.5843874483989746e-05, "loss": 2.2092, "step": 89 }, { "epoch": 0.24258760107816713, "grad_norm": 0.08207450024743052, "learning_rate": 4.572652732225625e-05, "loss": 2.4616, "step": 90 }, { "epoch": 0.24528301886792453, "grad_norm": 0.08180483750519887, "learning_rate": 4.560772004726575e-05, "loss": 2.4805, "step": 91 }, { "epoch": 0.24797843665768193, "grad_norm": 0.09232170138763231, "learning_rate": 4.548746217657878e-05, "loss": 2.4803, "step": 92 }, { "epoch": 0.25067385444743934, "grad_norm": 0.08251617231525912, "learning_rate": 4.5365763343962005e-05, "loss": 2.3035, "step": 93 }, { "epoch": 0.25336927223719674, "grad_norm": 0.08840722457592919, "learning_rate": 4.524263329861644e-05, "loss": 2.4107, "step": 94 }, { "epoch": 0.2560646900269542, "grad_norm": 0.08098547429953781, "learning_rate": 4.5118081904396504e-05, "loss": 2.5351, "step": 95 }, { "epoch": 0.2587601078167116, "grad_norm": 0.09582643101903592, "learning_rate": 4.499211913901974e-05, "loss": 2.3998, "step": 96 }, { "epoch": 0.261455525606469, "grad_norm": 0.09325977474372174, "learning_rate": 4.486475509326759e-05, "loss": 2.6805, "step": 97 }, { "epoch": 0.2641509433962264, "grad_norm": 0.08869290989815509, "learning_rate": 4.473599997017701e-05, "loss": 2.384, "step": 98 }, { "epoch": 0.2668463611859838, "grad_norm": 0.07911227872845064, "learning_rate": 4.460586408422308e-05, "loss": 2.4893, "step": 99 }, { "epoch": 0.2695417789757412, "grad_norm": 0.08301640398701123, "learning_rate": 4.447435786049278e-05, "loss": 2.4671, "step": 100 }, { "epoch": 0.2722371967654987, "grad_norm": 0.0959773217992077, "learning_rate": 4.434149183384977e-05, "loss": 2.5863, "step": 101 }, { "epoch": 0.2749326145552561, "grad_norm": 0.07951557525414293, "learning_rate": 4.420727664809053e-05, "loss": 2.552, "step": 102 }, { "epoch": 0.2776280323450135, "grad_norm": 0.07489545345156162, "learning_rate": 4.407172305509158e-05, "loss": 2.3026, "step": 103 }, { "epoch": 0.2803234501347709, "grad_norm": 0.08339425757380002, "learning_rate": 4.3934841913948323e-05, "loss": 2.3962, "step": 104 }, { "epoch": 0.2830188679245283, "grad_norm": 0.09366852913157536, "learning_rate": 4.379664419010496e-05, "loss": 2.364, "step": 105 }, { "epoch": 0.2857142857142857, "grad_norm": 0.07982165605990695, "learning_rate": 4.3657140954476165e-05, "loss": 2.4098, "step": 106 }, { "epoch": 0.2884097035040431, "grad_norm": 0.10046638975690705, "learning_rate": 4.351634338256017e-05, "loss": 2.3549, "step": 107 }, { "epoch": 0.29110512129380056, "grad_norm": 0.08495800159851925, "learning_rate": 4.337426275354348e-05, "loss": 2.3466, "step": 108 }, { "epoch": 0.29380053908355797, "grad_norm": 0.08176244120225214, "learning_rate": 4.323091044939736e-05, "loss": 2.4374, "step": 109 }, { "epoch": 0.29649595687331537, "grad_norm": 0.0766986612060137, "learning_rate": 4.308629795396599e-05, "loss": 2.3918, "step": 110 }, { "epoch": 0.2991913746630728, "grad_norm": 0.08318997009881038, "learning_rate": 4.294043685204651e-05, "loss": 2.1499, "step": 111 }, { "epoch": 0.3018867924528302, "grad_norm": 0.09128600009973836, "learning_rate": 4.2793338828460984e-05, "loss": 2.3912, "step": 112 }, { "epoch": 0.3045822102425876, "grad_norm": 0.07755181417221228, "learning_rate": 4.264501566712033e-05, "loss": 2.5428, "step": 113 }, { "epoch": 0.30727762803234504, "grad_norm": 0.07626744119606738, "learning_rate": 4.249547925008033e-05, "loss": 2.3882, "step": 114 }, { "epoch": 0.30727762803234504, "eval_loss": 2.4033055305480957, "eval_runtime": 108.7319, "eval_samples_per_second": 0.736, "eval_steps_per_second": 0.184, "step": 114 }, { "epoch": 0.30997304582210244, "grad_norm": 0.10404280937654081, "learning_rate": 4.234474155658974e-05, "loss": 2.4894, "step": 115 }, { "epoch": 0.31266846361185985, "grad_norm": 0.08349137443478906, "learning_rate": 4.219281466213066e-05, "loss": 2.3636, "step": 116 }, { "epoch": 0.31536388140161725, "grad_norm": 0.09717978095378631, "learning_rate": 4.203971073745122e-05, "loss": 2.4826, "step": 117 }, { "epoch": 0.31805929919137466, "grad_norm": 0.08810095937170669, "learning_rate": 4.188544204759048e-05, "loss": 2.5296, "step": 118 }, { "epoch": 0.32075471698113206, "grad_norm": 0.09443980051741153, "learning_rate": 4.173002095089599e-05, "loss": 2.2018, "step": 119 }, { "epoch": 0.32345013477088946, "grad_norm": 0.09464431143756959, "learning_rate": 4.157345989803375e-05, "loss": 2.4329, "step": 120 }, { "epoch": 0.3261455525606469, "grad_norm": 0.0921167133236036, "learning_rate": 4.141577143099074e-05, "loss": 2.411, "step": 121 }, { "epoch": 0.3288409703504043, "grad_norm": 0.09102913953050058, "learning_rate": 4.1256968182070275e-05, "loss": 2.6301, "step": 122 }, { "epoch": 0.33153638814016173, "grad_norm": 0.09442024128701812, "learning_rate": 4.109706287287998e-05, "loss": 2.2963, "step": 123 }, { "epoch": 0.33423180592991913, "grad_norm": 0.09048360971081038, "learning_rate": 4.093606831331269e-05, "loss": 2.3533, "step": 124 }, { "epoch": 0.33692722371967654, "grad_norm": 0.09958454863765234, "learning_rate": 4.077399740052027e-05, "loss": 2.5592, "step": 125 }, { "epoch": 0.33962264150943394, "grad_norm": 0.10352388511166186, "learning_rate": 4.06108631178804e-05, "loss": 2.5035, "step": 126 }, { "epoch": 0.3423180592991914, "grad_norm": 0.09119254602507475, "learning_rate": 4.044667853395655e-05, "loss": 2.3612, "step": 127 }, { "epoch": 0.3450134770889488, "grad_norm": 0.11817603428015877, "learning_rate": 4.0281456801451004e-05, "loss": 2.3564, "step": 128 }, { "epoch": 0.3477088948787062, "grad_norm": 0.08282970624160388, "learning_rate": 4.011521115615123e-05, "loss": 2.3187, "step": 129 }, { "epoch": 0.3504043126684636, "grad_norm": 0.09138997506736304, "learning_rate": 3.9947954915869565e-05, "loss": 2.4211, "step": 130 }, { "epoch": 0.353099730458221, "grad_norm": 0.10315874858803445, "learning_rate": 3.977970147937635e-05, "loss": 2.4255, "step": 131 }, { "epoch": 0.3557951482479784, "grad_norm": 0.12324355580259778, "learning_rate": 3.961046432532659e-05, "loss": 2.4149, "step": 132 }, { "epoch": 0.3584905660377358, "grad_norm": 0.07993016245985274, "learning_rate": 3.944025701118009e-05, "loss": 2.1721, "step": 133 }, { "epoch": 0.3611859838274933, "grad_norm": 0.08795688215451386, "learning_rate": 3.9269093172115496e-05, "loss": 2.3163, "step": 134 }, { "epoch": 0.3638814016172507, "grad_norm": 0.08804534236891878, "learning_rate": 3.9096986519937924e-05, "loss": 2.3146, "step": 135 }, { "epoch": 0.3665768194070081, "grad_norm": 0.08147499851736127, "learning_rate": 3.892395084198053e-05, "loss": 2.4056, "step": 136 }, { "epoch": 0.3692722371967655, "grad_norm": 0.08190511447175075, "learning_rate": 3.875e-05, "loss": 2.3873, "step": 137 }, { "epoch": 0.3719676549865229, "grad_norm": 0.10702441021155629, "learning_rate": 3.857514792906616e-05, "loss": 2.643, "step": 138 }, { "epoch": 0.3746630727762803, "grad_norm": 0.0785708035522026, "learning_rate": 3.839940863644556e-05, "loss": 2.5635, "step": 139 }, { "epoch": 0.37735849056603776, "grad_norm": 0.09362896847049527, "learning_rate": 3.822279620047943e-05, "loss": 2.377, "step": 140 }, { "epoch": 0.38005390835579517, "grad_norm": 0.09794943329180347, "learning_rate": 3.8045324769455834e-05, "loss": 2.5292, "step": 141 }, { "epoch": 0.38274932614555257, "grad_norm": 0.0906842624325085, "learning_rate": 3.78670085604763e-05, "loss": 2.3576, "step": 142 }, { "epoch": 0.38544474393531, "grad_norm": 0.09335279764944357, "learning_rate": 3.7687861858316844e-05, "loss": 2.1434, "step": 143 }, { "epoch": 0.3881401617250674, "grad_norm": 0.11106908056412888, "learning_rate": 3.7507899014283684e-05, "loss": 2.2124, "step": 144 }, { "epoch": 0.3908355795148248, "grad_norm": 0.10748379262846679, "learning_rate": 3.732713444506354e-05, "loss": 2.4934, "step": 145 }, { "epoch": 0.3935309973045822, "grad_norm": 0.08611757548383629, "learning_rate": 3.714558263156872e-05, "loss": 2.2559, "step": 146 }, { "epoch": 0.39622641509433965, "grad_norm": 0.08579591203817094, "learning_rate": 3.696325811777709e-05, "loss": 2.3151, "step": 147 }, { "epoch": 0.39892183288409705, "grad_norm": 0.08307902702869267, "learning_rate": 3.678017550956692e-05, "loss": 2.3902, "step": 148 }, { "epoch": 0.40161725067385445, "grad_norm": 0.0915570030354968, "learning_rate": 3.659634947354686e-05, "loss": 2.223, "step": 149 }, { "epoch": 0.40431266846361186, "grad_norm": 0.07897467847137554, "learning_rate": 3.6411794735881035e-05, "loss": 2.3241, "step": 150 } ], "logging_steps": 1, "max_steps": 371, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 75, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 24793369804800.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }