{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9990995047276003, "eval_steps": 500, "global_step": 1110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018009905447996397, "grad_norm": 3.8927660727308404, "learning_rate": 1.801801801801802e-06, "loss": 1.3686, "step": 10 }, { "epoch": 0.03601981089599279, "grad_norm": 1.2914340039825571, "learning_rate": 3.603603603603604e-06, "loss": 1.2618, "step": 20 }, { "epoch": 0.0540297163439892, "grad_norm": 0.5117572348414451, "learning_rate": 5.405405405405406e-06, "loss": 1.1941, "step": 30 }, { "epoch": 0.07203962179198559, "grad_norm": 0.3415375067191979, "learning_rate": 7.207207207207208e-06, "loss": 1.157, "step": 40 }, { "epoch": 0.090049527239982, "grad_norm": 0.3050099720948823, "learning_rate": 9.00900900900901e-06, "loss": 1.1486, "step": 50 }, { "epoch": 0.1080594326879784, "grad_norm": 0.2628693083106922, "learning_rate": 1.0810810810810812e-05, "loss": 1.12, "step": 60 }, { "epoch": 0.1260693381359748, "grad_norm": 0.23606513506788576, "learning_rate": 1.2612612612612613e-05, "loss": 1.1014, "step": 70 }, { "epoch": 0.14407924358397117, "grad_norm": 0.24000278532244093, "learning_rate": 1.4414414414414416e-05, "loss": 1.1087, "step": 80 }, { "epoch": 0.16208914903196758, "grad_norm": 0.21377676080662794, "learning_rate": 1.6216216216216218e-05, "loss": 1.0764, "step": 90 }, { "epoch": 0.180099054479964, "grad_norm": 0.1794631392027073, "learning_rate": 1.801801801801802e-05, "loss": 1.0757, "step": 100 }, { "epoch": 0.19810895992796038, "grad_norm": 0.19497374657829, "learning_rate": 1.981981981981982e-05, "loss": 1.0904, "step": 110 }, { "epoch": 0.2161188653759568, "grad_norm": 0.21533071811896856, "learning_rate": 1.999599507118322e-05, "loss": 1.0765, "step": 120 }, { "epoch": 0.23412877082395317, "grad_norm": 0.19307788030973838, "learning_rate": 1.998215499120161e-05, "loss": 1.0702, "step": 130 }, { "epoch": 0.2521386762719496, "grad_norm": 0.18836481163407426, "learning_rate": 1.99584439990734e-05, "loss": 1.0766, "step": 140 }, { "epoch": 0.270148581719946, "grad_norm": 0.21511735322364797, "learning_rate": 1.992488554155135e-05, "loss": 1.0811, "step": 150 }, { "epoch": 0.28815848716794235, "grad_norm": 0.195309434953766, "learning_rate": 1.9881512803111797e-05, "loss": 1.0658, "step": 160 }, { "epoch": 0.30616839261593876, "grad_norm": 0.2421941539327756, "learning_rate": 1.9828368673139947e-05, "loss": 1.0672, "step": 170 }, { "epoch": 0.32417829806393517, "grad_norm": 0.1860112417998846, "learning_rate": 1.9765505703518494e-05, "loss": 1.0611, "step": 180 }, { "epoch": 0.3421882035119316, "grad_norm": 0.17818304810152363, "learning_rate": 1.9692986056661355e-05, "loss": 1.066, "step": 190 }, { "epoch": 0.360198108959928, "grad_norm": 0.2397415845301638, "learning_rate": 1.961088144404403e-05, "loss": 1.0634, "step": 200 }, { "epoch": 0.37820801440792434, "grad_norm": 0.18967589505610974, "learning_rate": 1.9519273055291266e-05, "loss": 1.0613, "step": 210 }, { "epoch": 0.39621791985592075, "grad_norm": 0.2130198103194759, "learning_rate": 1.941825147789225e-05, "loss": 1.0668, "step": 220 }, { "epoch": 0.41422782530391716, "grad_norm": 0.18395677854198192, "learning_rate": 1.930791660762262e-05, "loss": 1.0619, "step": 230 }, { "epoch": 0.4322377307519136, "grad_norm": 0.1865388893667488, "learning_rate": 1.9188377549761962e-05, "loss": 1.0636, "step": 240 }, { "epoch": 0.45024763619990993, "grad_norm": 0.18776091331352887, "learning_rate": 1.90597525112044e-05, "loss": 1.0514, "step": 250 }, { "epoch": 0.46825754164790634, "grad_norm": 0.18243911298468143, "learning_rate": 1.8922168683569038e-05, "loss": 1.0519, "step": 260 }, { "epoch": 0.48626744709590275, "grad_norm": 0.18797424109818742, "learning_rate": 1.8775762117425777e-05, "loss": 1.0544, "step": 270 }, { "epoch": 0.5042773525438992, "grad_norm": 0.18906571186079896, "learning_rate": 1.8620677587760916e-05, "loss": 1.0567, "step": 280 }, { "epoch": 0.5222872579918956, "grad_norm": 0.18393014193254023, "learning_rate": 1.8457068450815562e-05, "loss": 1.0567, "step": 290 }, { "epoch": 0.540297163439892, "grad_norm": 0.19328266852176415, "learning_rate": 1.8285096492438424e-05, "loss": 1.055, "step": 300 }, { "epoch": 0.5583070688878884, "grad_norm": 0.18341259531456078, "learning_rate": 1.810493176810292e-05, "loss": 1.0537, "step": 310 }, { "epoch": 0.5763169743358847, "grad_norm": 0.19070534973528358, "learning_rate": 1.7916752434746856e-05, "loss": 1.0388, "step": 320 }, { "epoch": 0.5943268797838811, "grad_norm": 0.18919026633766578, "learning_rate": 1.7720744574600865e-05, "loss": 1.054, "step": 330 }, { "epoch": 0.6123367852318775, "grad_norm": 0.2324366480100927, "learning_rate": 1.7517102011179935e-05, "loss": 1.0521, "step": 340 }, { "epoch": 0.6303466906798739, "grad_norm": 0.18954585112855193, "learning_rate": 1.730602611761989e-05, "loss": 1.0442, "step": 350 }, { "epoch": 0.6483565961278703, "grad_norm": 0.19841527530500794, "learning_rate": 1.7087725617548385e-05, "loss": 1.0461, "step": 360 }, { "epoch": 0.6663665015758667, "grad_norm": 0.18229995357270518, "learning_rate": 1.686241637868734e-05, "loss": 1.0416, "step": 370 }, { "epoch": 0.6843764070238632, "grad_norm": 0.18303392694652232, "learning_rate": 1.6630321199390868e-05, "loss": 1.0444, "step": 380 }, { "epoch": 0.7023863124718596, "grad_norm": 0.18252570663629492, "learning_rate": 1.639166958832985e-05, "loss": 1.0382, "step": 390 }, { "epoch": 0.720396217919856, "grad_norm": 0.1920090778916924, "learning_rate": 1.6146697537540926e-05, "loss": 1.0451, "step": 400 }, { "epoch": 0.7384061233678523, "grad_norm": 0.182014948065573, "learning_rate": 1.5895647289064395e-05, "loss": 1.0459, "step": 410 }, { "epoch": 0.7564160288158487, "grad_norm": 0.18069534485817443, "learning_rate": 1.5638767095401778e-05, "loss": 1.0414, "step": 420 }, { "epoch": 0.7744259342638451, "grad_norm": 0.1970419139073729, "learning_rate": 1.5376310974029872e-05, "loss": 1.0379, "step": 430 }, { "epoch": 0.7924358397118415, "grad_norm": 0.1831244876657377, "learning_rate": 1.5108538456214088e-05, "loss": 1.038, "step": 440 }, { "epoch": 0.8104457451598379, "grad_norm": 0.18766104093935895, "learning_rate": 1.4835714330369445e-05, "loss": 1.0396, "step": 450 }, { "epoch": 0.8284556506078343, "grad_norm": 0.20205755605640713, "learning_rate": 1.4558108380223013e-05, "loss": 1.0312, "step": 460 }, { "epoch": 0.8464655560558307, "grad_norm": 0.1958644336406297, "learning_rate": 1.4275995118036694e-05, "loss": 1.0333, "step": 470 }, { "epoch": 0.8644754615038271, "grad_norm": 0.17729339680552028, "learning_rate": 1.3989653513154165e-05, "loss": 1.0429, "step": 480 }, { "epoch": 0.8824853669518236, "grad_norm": 0.17561113142823978, "learning_rate": 1.3699366716140434e-05, "loss": 1.0419, "step": 490 }, { "epoch": 0.9004952723998199, "grad_norm": 0.18385449880212262, "learning_rate": 1.3405421778786738e-05, "loss": 1.0482, "step": 500 }, { "epoch": 0.9185051778478163, "grad_norm": 0.18110333089233704, "learning_rate": 1.3108109370257714e-05, "loss": 1.0539, "step": 510 }, { "epoch": 0.9365150832958127, "grad_norm": 0.18232534077561582, "learning_rate": 1.2807723489661497e-05, "loss": 1.0371, "step": 520 }, { "epoch": 0.9545249887438091, "grad_norm": 0.17118485590371502, "learning_rate": 1.2504561175326986e-05, "loss": 1.034, "step": 530 }, { "epoch": 0.9725348941918055, "grad_norm": 0.17973311103034112, "learning_rate": 1.2198922211075779e-05, "loss": 1.0374, "step": 540 }, { "epoch": 0.9905447996398019, "grad_norm": 0.18714581161352067, "learning_rate": 1.1891108829779166e-05, "loss": 1.0317, "step": 550 }, { "epoch": 0.9995497523638001, "eval_loss": 1.039684534072876, "eval_runtime": 662.247, "eval_samples_per_second": 11.876, "eval_steps_per_second": 0.743, "step": 555 }, { "epoch": 1.0085547050877983, "grad_norm": 0.20548749168254346, "learning_rate": 1.158142541449341e-05, "loss": 1.0109, "step": 560 }, { "epoch": 1.0265646105357946, "grad_norm": 0.201629063500619, "learning_rate": 1.1270178197468788e-05, "loss": 0.9851, "step": 570 }, { "epoch": 1.0445745159837911, "grad_norm": 0.19089253248727517, "learning_rate": 1.0957674957330043e-05, "loss": 0.9736, "step": 580 }, { "epoch": 1.0625844214317874, "grad_norm": 0.18420339461964708, "learning_rate": 1.0644224714727683e-05, "loss": 0.9918, "step": 590 }, { "epoch": 1.080594326879784, "grad_norm": 0.18488819323563785, "learning_rate": 1.0330137426761136e-05, "loss": 0.9832, "step": 600 }, { "epoch": 1.0986042323277803, "grad_norm": 0.18816750176239094, "learning_rate": 1.0015723680475847e-05, "loss": 0.9894, "step": 610 }, { "epoch": 1.1166141377757768, "grad_norm": 0.17955677324222288, "learning_rate": 9.701294385737471e-06, "loss": 0.9915, "step": 620 }, { "epoch": 1.134624043223773, "grad_norm": 0.1860265308958814, "learning_rate": 9.38716046778684e-06, "loss": 0.9702, "step": 630 }, { "epoch": 1.1526339486717694, "grad_norm": 0.18388832465015095, "learning_rate": 9.073632559779731e-06, "loss": 0.981, "step": 640 }, { "epoch": 1.170643854119766, "grad_norm": 0.1788808037903788, "learning_rate": 8.76102069561545e-06, "loss": 0.9812, "step": 650 }, { "epoch": 1.1886537595677622, "grad_norm": 0.178574258193573, "learning_rate": 8.449634003358022e-06, "loss": 0.9836, "step": 660 }, { "epoch": 1.2066636650157587, "grad_norm": 0.18306275208917874, "learning_rate": 8.13978039955308e-06, "loss": 0.9831, "step": 670 }, { "epoch": 1.224673570463755, "grad_norm": 0.18918449761657866, "learning_rate": 7.831766284742807e-06, "loss": 0.9846, "step": 680 }, { "epoch": 1.2426834759117515, "grad_norm": 0.1855294097858111, "learning_rate": 7.525896240479977e-06, "loss": 0.9744, "step": 690 }, { "epoch": 1.2606933813597478, "grad_norm": 0.20185722671737671, "learning_rate": 7.222472728140695e-06, "loss": 0.9859, "step": 700 }, { "epoch": 1.2787032868077444, "grad_norm": 0.192482680701874, "learning_rate": 6.921795789833723e-06, "loss": 0.9886, "step": 710 }, { "epoch": 1.2967131922557407, "grad_norm": 0.1795264410436132, "learning_rate": 6.624162751702077e-06, "loss": 0.9854, "step": 720 }, { "epoch": 1.314723097703737, "grad_norm": 0.18175034374465715, "learning_rate": 6.329867929910347e-06, "loss": 0.9836, "step": 730 }, { "epoch": 1.3327330031517335, "grad_norm": 0.18874834848612046, "learning_rate": 6.039202339608432e-06, "loss": 0.9792, "step": 740 }, { "epoch": 1.3507429085997298, "grad_norm": 0.1843992268180646, "learning_rate": 5.752453407159521e-06, "loss": 0.9794, "step": 750 }, { "epoch": 1.3687528140477263, "grad_norm": 0.17887298529702766, "learning_rate": 5.469904685916861e-06, "loss": 0.985, "step": 760 }, { "epoch": 1.3867627194957226, "grad_norm": 0.17428526152643262, "learning_rate": 5.1918355758303515e-06, "loss": 0.9793, "step": 770 }, { "epoch": 1.4047726249437191, "grad_norm": 0.17579200186681246, "learning_rate": 4.918521047160309e-06, "loss": 0.9797, "step": 780 }, { "epoch": 1.4227825303917154, "grad_norm": 0.17482239840906302, "learning_rate": 4.650231368571486e-06, "loss": 0.967, "step": 790 }, { "epoch": 1.440792435839712, "grad_norm": 0.1846232978343707, "learning_rate": 4.387231839876349e-06, "loss": 0.9795, "step": 800 }, { "epoch": 1.4588023412877082, "grad_norm": 0.17722992573590463, "learning_rate": 4.1297825296918145e-06, "loss": 0.9749, "step": 810 }, { "epoch": 1.4768122467357045, "grad_norm": 0.18740341744136668, "learning_rate": 3.878138018268867e-06, "loss": 0.9846, "step": 820 }, { "epoch": 1.494822152183701, "grad_norm": 0.18222853458564284, "learning_rate": 3.6325471457493956e-06, "loss": 0.9869, "step": 830 }, { "epoch": 1.5128320576316976, "grad_norm": 0.17604705966690576, "learning_rate": 3.3932527660991877e-06, "loss": 0.9791, "step": 840 }, { "epoch": 1.530841963079694, "grad_norm": 0.17129760493794519, "learning_rate": 3.160491506960344e-06, "loss": 0.9803, "step": 850 }, { "epoch": 1.5488518685276902, "grad_norm": 0.17743331668980605, "learning_rate": 2.934493535660677e-06, "loss": 0.9849, "step": 860 }, { "epoch": 1.5668617739756865, "grad_norm": 0.17206764149200166, "learning_rate": 2.715482331611393e-06, "loss": 0.9799, "step": 870 }, { "epoch": 1.584871679423683, "grad_norm": 0.17401212500860314, "learning_rate": 2.5036744653181755e-06, "loss": 0.977, "step": 880 }, { "epoch": 1.6028815848716795, "grad_norm": 0.1864847419394007, "learning_rate": 2.29927938422419e-06, "loss": 0.9824, "step": 890 }, { "epoch": 1.6208914903196758, "grad_norm": 0.1794444567972596, "learning_rate": 2.102499205596743e-06, "loss": 0.9711, "step": 900 }, { "epoch": 1.6389013957676721, "grad_norm": 0.17867218012908836, "learning_rate": 1.913528516662452e-06, "loss": 0.9904, "step": 910 }, { "epoch": 1.6569113012156687, "grad_norm": 0.17632077595030837, "learning_rate": 1.7325541821885383e-06, "loss": 0.9846, "step": 920 }, { "epoch": 1.6749212066636652, "grad_norm": 0.1749399044308501, "learning_rate": 1.5597551597004968e-06, "loss": 0.9759, "step": 930 }, { "epoch": 1.6929311121116615, "grad_norm": 0.1765753424915534, "learning_rate": 1.3953023225189243e-06, "loss": 0.9947, "step": 940 }, { "epoch": 1.7109410175596578, "grad_norm": 0.17249203862535814, "learning_rate": 1.23935829079042e-06, "loss": 0.987, "step": 950 }, { "epoch": 1.728950923007654, "grad_norm": 0.17083864764712672, "learning_rate": 1.0920772706797166e-06, "loss": 0.9655, "step": 960 }, { "epoch": 1.7469608284556506, "grad_norm": 0.17684409732386205, "learning_rate": 9.536049018820193e-07, "loss": 0.9741, "step": 970 }, { "epoch": 1.7649707339036471, "grad_norm": 0.17089785327384419, "learning_rate": 8.240781136063348e-07, "loss": 0.9699, "step": 980 }, { "epoch": 1.7829806393516434, "grad_norm": 0.17526364109022735, "learning_rate": 7.03624989172228e-07, "loss": 0.9884, "step": 990 }, { "epoch": 1.8009905447996397, "grad_norm": 0.17477084621817432, "learning_rate": 5.923646393538907e-07, "loss": 0.9762, "step": 1000 }, { "epoch": 1.8190004502476362, "grad_norm": 0.17586999782551316, "learning_rate": 4.904070845967467e-07, "loss": 0.9763, "step": 1010 }, { "epoch": 1.8370103556956328, "grad_norm": 0.17351730726317385, "learning_rate": 3.97853146223105e-07, "loss": 0.9759, "step": 1020 }, { "epoch": 1.855020261143629, "grad_norm": 0.17245529759955128, "learning_rate": 3.1479434673440167e-07, "loss": 0.9762, "step": 1030 }, { "epoch": 1.8730301665916254, "grad_norm": 0.17074305784570018, "learning_rate": 2.4131281930864006e-07, "loss": 0.9778, "step": 1040 }, { "epoch": 1.8910400720396217, "grad_norm": 0.1699923858478813, "learning_rate": 1.7748122658251877e-07, "loss": 0.9898, "step": 1050 }, { "epoch": 1.9090499774876182, "grad_norm": 0.17560912690299982, "learning_rate": 1.2336268879856728e-07, "loss": 0.9764, "step": 1060 }, { "epoch": 1.9270598829356147, "grad_norm": 0.17172727854066605, "learning_rate": 7.901072138831512e-08, "loss": 0.9908, "step": 1070 }, { "epoch": 1.945069788383611, "grad_norm": 0.17416282443666103, "learning_rate": 4.44691820532539e-08, "loss": 0.9765, "step": 1080 }, { "epoch": 1.9630796938316073, "grad_norm": 0.1719348511857924, "learning_rate": 1.977222739588891e-08, "loss": 0.9784, "step": 1090 }, { "epoch": 1.9810895992796038, "grad_norm": 0.1744398791007269, "learning_rate": 4.944279143784814e-09, "loss": 0.9841, "step": 1100 }, { "epoch": 1.9990995047276003, "grad_norm": 0.17567978804671192, "learning_rate": 0.0, "loss": 0.9698, "step": 1110 }, { "epoch": 1.9990995047276003, "eval_loss": 1.0319925546646118, "eval_runtime": 659.9931, "eval_samples_per_second": 11.917, "eval_steps_per_second": 0.745, "step": 1110 }, { "epoch": 1.9990995047276003, "step": 1110, "total_flos": 929619498762240.0, "train_loss": 1.0262996888375497, "train_runtime": 54210.2289, "train_samples_per_second": 2.622, "train_steps_per_second": 0.02 } ], "logging_steps": 10, "max_steps": 1110, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 929619498762240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }