{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.40431266846361186,
  "eval_steps": 38,
  "global_step": 150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0026954177897574125,
      "grad_norm": 0.4888671256680066,
      "learning_rate": 2.5e-06,
      "loss": 2.6499,
      "step": 1
    },
    {
      "epoch": 0.0026954177897574125,
      "eval_loss": 2.7584831714630127,
      "eval_runtime": 108.9254,
      "eval_samples_per_second": 0.734,
      "eval_steps_per_second": 0.184,
      "step": 1
    },
    {
      "epoch": 0.005390835579514825,
      "grad_norm": 0.4983074714711494,
      "learning_rate": 5e-06,
      "loss": 2.5992,
      "step": 2
    },
    {
      "epoch": 0.008086253369272238,
      "grad_norm": 0.7241191448700953,
      "learning_rate": 7.5e-06,
      "loss": 2.8753,
      "step": 3
    },
    {
      "epoch": 0.01078167115902965,
      "grad_norm": 0.5260744264207279,
      "learning_rate": 1e-05,
      "loss": 2.6477,
      "step": 4
    },
    {
      "epoch": 0.013477088948787063,
      "grad_norm": 0.36639797120190076,
      "learning_rate": 1.25e-05,
      "loss": 2.5086,
      "step": 5
    },
    {
      "epoch": 0.016172506738544475,
      "grad_norm": 0.7809079947872807,
      "learning_rate": 1.5e-05,
      "loss": 2.8346,
      "step": 6
    },
    {
      "epoch": 0.018867924528301886,
      "grad_norm": 0.5980534435082484,
      "learning_rate": 1.75e-05,
      "loss": 2.9624,
      "step": 7
    },
    {
      "epoch": 0.0215633423180593,
      "grad_norm": 0.5798622117264719,
      "learning_rate": 2e-05,
      "loss": 2.8441,
      "step": 8
    },
    {
      "epoch": 0.02425876010781671,
      "grad_norm": 0.7528222488078259,
      "learning_rate": 2.25e-05,
      "loss": 2.6867,
      "step": 9
    },
    {
      "epoch": 0.026954177897574125,
      "grad_norm": 0.612852819847163,
      "learning_rate": 2.5e-05,
      "loss": 2.7927,
      "step": 10
    },
    {
      "epoch": 0.029649595687331536,
      "grad_norm": 0.6397459055001692,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 2.6072,
      "step": 11
    },
    {
      "epoch": 0.03234501347708895,
      "grad_norm": 0.6598580504868401,
      "learning_rate": 3e-05,
      "loss": 2.5274,
      "step": 12
    },
    {
      "epoch": 0.03504043126684636,
      "grad_norm": 0.30737376780003156,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 2.5507,
      "step": 13
    },
    {
      "epoch": 0.03773584905660377,
      "grad_norm": 0.14992908808228841,
      "learning_rate": 3.5e-05,
      "loss": 2.4811,
      "step": 14
    },
    {
      "epoch": 0.04043126684636118,
      "grad_norm": 0.22421462173093706,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 2.6159,
      "step": 15
    },
    {
      "epoch": 0.0431266846361186,
      "grad_norm": 0.16819168085460345,
      "learning_rate": 4e-05,
      "loss": 2.6193,
      "step": 16
    },
    {
      "epoch": 0.04582210242587601,
      "grad_norm": 0.12218717106853527,
      "learning_rate": 4.25e-05,
      "loss": 2.7094,
      "step": 17
    },
    {
      "epoch": 0.04851752021563342,
      "grad_norm": 0.1145865395458764,
      "learning_rate": 4.5e-05,
      "loss": 2.6314,
      "step": 18
    },
    {
      "epoch": 0.05121293800539083,
      "grad_norm": 0.08821545171076035,
      "learning_rate": 4.75e-05,
      "loss": 2.4876,
      "step": 19
    },
    {
      "epoch": 0.05390835579514825,
      "grad_norm": 0.10203760270867333,
      "learning_rate": 5e-05,
      "loss": 2.622,
      "step": 20
    },
    {
      "epoch": 0.05660377358490566,
      "grad_norm": 0.07748067981259345,
      "learning_rate": 4.9999098771046674e-05,
      "loss": 2.4191,
      "step": 21
    },
    {
      "epoch": 0.05929919137466307,
      "grad_norm": 0.13225530856023926,
      "learning_rate": 4.999639515638348e-05,
      "loss": 2.642,
      "step": 22
    },
    {
      "epoch": 0.06199460916442048,
      "grad_norm": 0.07588900512747042,
      "learning_rate": 4.999188937259492e-05,
      "loss": 2.2706,
      "step": 23
    },
    {
      "epoch": 0.0646900269541779,
      "grad_norm": 0.12017105244364508,
      "learning_rate": 4.998558178063592e-05,
      "loss": 2.6609,
      "step": 24
    },
    {
      "epoch": 0.0673854447439353,
      "grad_norm": 0.13885153935053832,
      "learning_rate": 4.9977472885802876e-05,
      "loss": 2.3222,
      "step": 25
    },
    {
      "epoch": 0.07008086253369272,
      "grad_norm": 0.08273500402204699,
      "learning_rate": 4.996756333769319e-05,
      "loss": 2.2653,
      "step": 26
    },
    {
      "epoch": 0.07277628032345014,
      "grad_norm": 0.08128820734533522,
      "learning_rate": 4.995585393015324e-05,
      "loss": 2.468,
      "step": 27
    },
    {
      "epoch": 0.07547169811320754,
      "grad_norm": 0.11425098408554267,
      "learning_rate": 4.9942345601214764e-05,
      "loss": 2.0993,
      "step": 28
    },
    {
      "epoch": 0.07816711590296496,
      "grad_norm": 0.10026403239718774,
      "learning_rate": 4.992703943301973e-05,
      "loss": 2.4194,
      "step": 29
    },
    {
      "epoch": 0.08086253369272237,
      "grad_norm": 0.0892349090753594,
      "learning_rate": 4.990993665173364e-05,
      "loss": 2.5426,
      "step": 30
    },
    {
      "epoch": 0.08355795148247978,
      "grad_norm": 0.08905031867375157,
      "learning_rate": 4.989103862744732e-05,
      "loss": 2.4458,
      "step": 31
    },
    {
      "epoch": 0.0862533692722372,
      "grad_norm": 0.08382151700226004,
      "learning_rate": 4.987034687406713e-05,
      "loss": 2.4005,
      "step": 32
    },
    {
      "epoch": 0.0889487870619946,
      "grad_norm": 0.07624420840782331,
      "learning_rate": 4.984786304919372e-05,
      "loss": 2.3028,
      "step": 33
    },
    {
      "epoch": 0.09164420485175202,
      "grad_norm": 0.08195304711308972,
      "learning_rate": 4.9823588953989216e-05,
      "loss": 2.2564,
      "step": 34
    },
    {
      "epoch": 0.09433962264150944,
      "grad_norm": 0.08199393883574592,
      "learning_rate": 4.9797526533032945e-05,
      "loss": 2.325,
      "step": 35
    },
    {
      "epoch": 0.09703504043126684,
      "grad_norm": 0.08525060108567321,
      "learning_rate": 4.976967787416565e-05,
      "loss": 2.08,
      "step": 36
    },
    {
      "epoch": 0.09973045822102426,
      "grad_norm": 0.08829832243395126,
      "learning_rate": 4.974004520832223e-05,
      "loss": 2.3272,
      "step": 37
    },
    {
      "epoch": 0.10242587601078167,
      "grad_norm": 0.0805993575215856,
      "learning_rate": 4.970863090935304e-05,
      "loss": 2.4375,
      "step": 38
    },
    {
      "epoch": 0.10242587601078167,
      "eval_loss": 2.4220809936523438,
      "eval_runtime": 109.7313,
      "eval_samples_per_second": 0.729,
      "eval_steps_per_second": 0.182,
      "step": 38
    },
    {
      "epoch": 0.10512129380053908,
      "grad_norm": 0.07362914307313682,
      "learning_rate": 4.967543749383371e-05,
      "loss": 2.5666,
      "step": 39
    },
    {
      "epoch": 0.1078167115902965,
      "grad_norm": 0.07781163135663244,
      "learning_rate": 4.9640467620863526e-05,
      "loss": 2.3806,
      "step": 40
    },
    {
      "epoch": 0.1105121293800539,
      "grad_norm": 0.07965171973675265,
      "learning_rate": 4.9603724091852456e-05,
      "loss": 2.3914,
      "step": 41
    },
    {
      "epoch": 0.11320754716981132,
      "grad_norm": 0.0769658349876996,
      "learning_rate": 4.95652098502967e-05,
      "loss": 2.3622,
      "step": 42
    },
    {
      "epoch": 0.11590296495956873,
      "grad_norm": 0.07123531103603928,
      "learning_rate": 4.952492798154287e-05,
      "loss": 2.4497,
      "step": 43
    },
    {
      "epoch": 0.11859838274932614,
      "grad_norm": 0.08376895847589426,
      "learning_rate": 4.948288171254089e-05,
      "loss": 2.2966,
      "step": 44
    },
    {
      "epoch": 0.12129380053908356,
      "grad_norm": 0.06991588614283951,
      "learning_rate": 4.9439074411585406e-05,
      "loss": 2.6397,
      "step": 45
    },
    {
      "epoch": 0.12398921832884097,
      "grad_norm": 0.07115181837614266,
      "learning_rate": 4.9393509588046036e-05,
      "loss": 2.3112,
      "step": 46
    },
    {
      "epoch": 0.12668463611859837,
      "grad_norm": 0.12069809526368983,
      "learning_rate": 4.9346190892086174e-05,
      "loss": 2.6053,
      "step": 47
    },
    {
      "epoch": 0.1293800539083558,
      "grad_norm": 0.08358966882474354,
      "learning_rate": 4.9297122114370596e-05,
      "loss": 2.3594,
      "step": 48
    },
    {
      "epoch": 0.1320754716981132,
      "grad_norm": 0.07439368303284428,
      "learning_rate": 4.9246307185761815e-05,
      "loss": 2.3608,
      "step": 49
    },
    {
      "epoch": 0.1347708894878706,
      "grad_norm": 0.07077366564222114,
      "learning_rate": 4.919375017700515e-05,
      "loss": 2.1389,
      "step": 50
    },
    {
      "epoch": 0.13746630727762804,
      "grad_norm": 0.0786367422272597,
      "learning_rate": 4.9139455298402656e-05,
      "loss": 2.3726,
      "step": 51
    },
    {
      "epoch": 0.14016172506738545,
      "grad_norm": 0.07772287420711031,
      "learning_rate": 4.908342689947581e-05,
      "loss": 2.6495,
      "step": 52
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.09095569172141003,
      "learning_rate": 4.902566946861708e-05,
      "loss": 2.3952,
      "step": 53
    },
    {
      "epoch": 0.14555256064690028,
      "grad_norm": 0.08022908718012228,
      "learning_rate": 4.8966187632730356e-05,
      "loss": 2.2133,
      "step": 54
    },
    {
      "epoch": 0.14824797843665768,
      "grad_norm": 0.08853617116138855,
      "learning_rate": 4.8904986156860346e-05,
      "loss": 2.2795,
      "step": 55
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 0.08155234218395907,
      "learning_rate": 4.884206994381078e-05,
      "loss": 2.6322,
      "step": 56
    },
    {
      "epoch": 0.15363881401617252,
      "grad_norm": 0.07017826282717579,
      "learning_rate": 4.877744403375168e-05,
      "loss": 2.36,
      "step": 57
    },
    {
      "epoch": 0.15633423180592992,
      "grad_norm": 0.07787943384594204,
      "learning_rate": 4.871111360381562e-05,
      "loss": 2.4609,
      "step": 58
    },
    {
      "epoch": 0.15902964959568733,
      "grad_norm": 0.074730895279447,
      "learning_rate": 4.864308396768294e-05,
      "loss": 2.3737,
      "step": 59
    },
    {
      "epoch": 0.16172506738544473,
      "grad_norm": 0.0754808721924062,
      "learning_rate": 4.857336057515611e-05,
      "loss": 2.4353,
      "step": 60
    },
    {
      "epoch": 0.16442048517520216,
      "grad_norm": 0.17074429587919648,
      "learning_rate": 4.8501949011723144e-05,
      "loss": 2.3747,
      "step": 61
    },
    {
      "epoch": 0.16711590296495957,
      "grad_norm": 0.12258858526490496,
      "learning_rate": 4.842885499811012e-05,
      "loss": 2.3253,
      "step": 62
    },
    {
      "epoch": 0.16981132075471697,
      "grad_norm": 0.08223383099932692,
      "learning_rate": 4.835408438982294e-05,
      "loss": 2.5312,
      "step": 63
    },
    {
      "epoch": 0.1725067385444744,
      "grad_norm": 0.11131907859034289,
      "learning_rate": 4.827764317667825e-05,
      "loss": 2.6191,
      "step": 64
    },
    {
      "epoch": 0.1752021563342318,
      "grad_norm": 0.06793561987591092,
      "learning_rate": 4.8199537482323545e-05,
      "loss": 2.4681,
      "step": 65
    },
    {
      "epoch": 0.1778975741239892,
      "grad_norm": 0.07850381171278048,
      "learning_rate": 4.811977356374667e-05,
      "loss": 2.4868,
      "step": 66
    },
    {
      "epoch": 0.18059299191374664,
      "grad_norm": 0.08512994202655803,
      "learning_rate": 4.803835781077455e-05,
      "loss": 2.6481,
      "step": 67
    },
    {
      "epoch": 0.18328840970350405,
      "grad_norm": 0.0846518039690973,
      "learning_rate": 4.795529674556129e-05,
      "loss": 2.4295,
      "step": 68
    },
    {
      "epoch": 0.18598382749326145,
      "grad_norm": 0.08031185133795576,
      "learning_rate": 4.7870597022065724e-05,
      "loss": 2.4132,
      "step": 69
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 0.11316780608440431,
      "learning_rate": 4.7784265425518365e-05,
      "loss": 2.4341,
      "step": 70
    },
    {
      "epoch": 0.19137466307277629,
      "grad_norm": 0.0781705228710772,
      "learning_rate": 4.769630887187782e-05,
      "loss": 2.4258,
      "step": 71
    },
    {
      "epoch": 0.1940700808625337,
      "grad_norm": 0.09475565364886825,
      "learning_rate": 4.760673440727678e-05,
      "loss": 2.6535,
      "step": 72
    },
    {
      "epoch": 0.1967654986522911,
      "grad_norm": 0.0936468075339049,
      "learning_rate": 4.751554920745755e-05,
      "loss": 2.4902,
      "step": 73
    },
    {
      "epoch": 0.19946091644204852,
      "grad_norm": 0.07028228522733242,
      "learning_rate": 4.7422760577197226e-05,
      "loss": 2.3132,
      "step": 74
    },
    {
      "epoch": 0.20215633423180593,
      "grad_norm": 0.087359632341199,
      "learning_rate": 4.7328375949722476e-05,
      "loss": 2.3678,
      "step": 75
    },
    {
      "epoch": 0.20485175202156333,
      "grad_norm": 0.07567735863088498,
      "learning_rate": 4.72324028861141e-05,
      "loss": 2.5361,
      "step": 76
    },
    {
      "epoch": 0.20485175202156333,
      "eval_loss": 2.409113645553589,
      "eval_runtime": 109.4094,
      "eval_samples_per_second": 0.731,
      "eval_steps_per_second": 0.183,
      "step": 76
    },
    {
      "epoch": 0.20754716981132076,
      "grad_norm": 0.0806053756655779,
      "learning_rate": 4.713484907470133e-05,
      "loss": 2.4361,
      "step": 77
    },
    {
      "epoch": 0.21024258760107817,
      "grad_norm": 0.08462062433959809,
      "learning_rate": 4.7035722330445857e-05,
      "loss": 2.5358,
      "step": 78
    },
    {
      "epoch": 0.21293800539083557,
      "grad_norm": 0.08211420166838714,
      "learning_rate": 4.693503059431588e-05,
      "loss": 2.3083,
      "step": 79
    },
    {
      "epoch": 0.215633423180593,
      "grad_norm": 0.07226082893047459,
      "learning_rate": 4.6832781932649884e-05,
      "loss": 2.468,
      "step": 80
    },
    {
      "epoch": 0.2183288409703504,
      "grad_norm": 0.08491650207069548,
      "learning_rate": 4.6728984536510454e-05,
      "loss": 2.436,
      "step": 81
    },
    {
      "epoch": 0.2210242587601078,
      "grad_norm": 0.07893280409193093,
      "learning_rate": 4.662364672102817e-05,
      "loss": 2.3986,
      "step": 82
    },
    {
      "epoch": 0.22371967654986524,
      "grad_norm": 0.08530475722783892,
      "learning_rate": 4.651677692473538e-05,
      "loss": 2.5171,
      "step": 83
    },
    {
      "epoch": 0.22641509433962265,
      "grad_norm": 0.0782081304914248,
      "learning_rate": 4.640838370889029e-05,
      "loss": 2.4167,
      "step": 84
    },
    {
      "epoch": 0.22911051212938005,
      "grad_norm": 0.09010839515008441,
      "learning_rate": 4.629847575679107e-05,
      "loss": 2.3621,
      "step": 85
    },
    {
      "epoch": 0.23180592991913745,
      "grad_norm": 0.08429756465698064,
      "learning_rate": 4.6187061873080264e-05,
      "loss": 2.5409,
      "step": 86
    },
    {
      "epoch": 0.23450134770889489,
      "grad_norm": 0.08174597825553892,
      "learning_rate": 4.607415098303945e-05,
      "loss": 2.3467,
      "step": 87
    },
    {
      "epoch": 0.2371967654986523,
      "grad_norm": 0.08048024256086499,
      "learning_rate": 4.5959752131874263e-05,
      "loss": 2.5402,
      "step": 88
    },
    {
      "epoch": 0.2398921832884097,
      "grad_norm": 0.07839092789606332,
      "learning_rate": 4.5843874483989746e-05,
      "loss": 2.2092,
      "step": 89
    },
    {
      "epoch": 0.24258760107816713,
      "grad_norm": 0.08207450024743052,
      "learning_rate": 4.572652732225625e-05,
      "loss": 2.4616,
      "step": 90
    },
    {
      "epoch": 0.24528301886792453,
      "grad_norm": 0.08180483750519887,
      "learning_rate": 4.560772004726575e-05,
      "loss": 2.4805,
      "step": 91
    },
    {
      "epoch": 0.24797843665768193,
      "grad_norm": 0.09232170138763231,
      "learning_rate": 4.548746217657878e-05,
      "loss": 2.4803,
      "step": 92
    },
    {
      "epoch": 0.25067385444743934,
      "grad_norm": 0.08251617231525912,
      "learning_rate": 4.5365763343962005e-05,
      "loss": 2.3035,
      "step": 93
    },
    {
      "epoch": 0.25336927223719674,
      "grad_norm": 0.08840722457592919,
      "learning_rate": 4.524263329861644e-05,
      "loss": 2.4107,
      "step": 94
    },
    {
      "epoch": 0.2560646900269542,
      "grad_norm": 0.08098547429953781,
      "learning_rate": 4.5118081904396504e-05,
      "loss": 2.5351,
      "step": 95
    },
    {
      "epoch": 0.2587601078167116,
      "grad_norm": 0.09582643101903592,
      "learning_rate": 4.499211913901974e-05,
      "loss": 2.3998,
      "step": 96
    },
    {
      "epoch": 0.261455525606469,
      "grad_norm": 0.09325977474372174,
      "learning_rate": 4.486475509326759e-05,
      "loss": 2.6805,
      "step": 97
    },
    {
      "epoch": 0.2641509433962264,
      "grad_norm": 0.08869290989815509,
      "learning_rate": 4.473599997017701e-05,
      "loss": 2.384,
      "step": 98
    },
    {
      "epoch": 0.2668463611859838,
      "grad_norm": 0.07911227872845064,
      "learning_rate": 4.460586408422308e-05,
      "loss": 2.4893,
      "step": 99
    },
    {
      "epoch": 0.2695417789757412,
      "grad_norm": 0.08301640398701123,
      "learning_rate": 4.447435786049278e-05,
      "loss": 2.4671,
      "step": 100
    },
    {
      "epoch": 0.2722371967654987,
      "grad_norm": 0.0959773217992077,
      "learning_rate": 4.434149183384977e-05,
      "loss": 2.5863,
      "step": 101
    },
    {
      "epoch": 0.2749326145552561,
      "grad_norm": 0.07951557525414293,
      "learning_rate": 4.420727664809053e-05,
      "loss": 2.552,
      "step": 102
    },
    {
      "epoch": 0.2776280323450135,
      "grad_norm": 0.07489545345156162,
      "learning_rate": 4.407172305509158e-05,
      "loss": 2.3026,
      "step": 103
    },
    {
      "epoch": 0.2803234501347709,
      "grad_norm": 0.08339425757380002,
      "learning_rate": 4.3934841913948323e-05,
      "loss": 2.3962,
      "step": 104
    },
    {
      "epoch": 0.2830188679245283,
      "grad_norm": 0.09366852913157536,
      "learning_rate": 4.379664419010496e-05,
      "loss": 2.364,
      "step": 105
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.07982165605990695,
      "learning_rate": 4.3657140954476165e-05,
      "loss": 2.4098,
      "step": 106
    },
    {
      "epoch": 0.2884097035040431,
      "grad_norm": 0.10046638975690705,
      "learning_rate": 4.351634338256017e-05,
      "loss": 2.3549,
      "step": 107
    },
    {
      "epoch": 0.29110512129380056,
      "grad_norm": 0.08495800159851925,
      "learning_rate": 4.337426275354348e-05,
      "loss": 2.3466,
      "step": 108
    },
    {
      "epoch": 0.29380053908355797,
      "grad_norm": 0.08176244120225214,
      "learning_rate": 4.323091044939736e-05,
      "loss": 2.4374,
      "step": 109
    },
    {
      "epoch": 0.29649595687331537,
      "grad_norm": 0.0766986612060137,
      "learning_rate": 4.308629795396599e-05,
      "loss": 2.3918,
      "step": 110
    },
    {
      "epoch": 0.2991913746630728,
      "grad_norm": 0.08318997009881038,
      "learning_rate": 4.294043685204651e-05,
      "loss": 2.1499,
      "step": 111
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 0.09128600009973836,
      "learning_rate": 4.2793338828460984e-05,
      "loss": 2.3912,
      "step": 112
    },
    {
      "epoch": 0.3045822102425876,
      "grad_norm": 0.07755181417221228,
      "learning_rate": 4.264501566712033e-05,
      "loss": 2.5428,
      "step": 113
    },
    {
      "epoch": 0.30727762803234504,
      "grad_norm": 0.07626744119606738,
      "learning_rate": 4.249547925008033e-05,
      "loss": 2.3882,
      "step": 114
    },
    {
      "epoch": 0.30727762803234504,
      "eval_loss": 2.4033055305480957,
      "eval_runtime": 108.7319,
      "eval_samples_per_second": 0.736,
      "eval_steps_per_second": 0.184,
      "step": 114
    },
    {
      "epoch": 0.30997304582210244,
      "grad_norm": 0.10404280937654081,
      "learning_rate": 4.234474155658974e-05,
      "loss": 2.4894,
      "step": 115
    },
    {
      "epoch": 0.31266846361185985,
      "grad_norm": 0.08349137443478906,
      "learning_rate": 4.219281466213066e-05,
      "loss": 2.3636,
      "step": 116
    },
    {
      "epoch": 0.31536388140161725,
      "grad_norm": 0.09717978095378631,
      "learning_rate": 4.203971073745122e-05,
      "loss": 2.4826,
      "step": 117
    },
    {
      "epoch": 0.31805929919137466,
      "grad_norm": 0.08810095937170669,
      "learning_rate": 4.188544204759048e-05,
      "loss": 2.5296,
      "step": 118
    },
    {
      "epoch": 0.32075471698113206,
      "grad_norm": 0.09443980051741153,
      "learning_rate": 4.173002095089599e-05,
      "loss": 2.2018,
      "step": 119
    },
    {
      "epoch": 0.32345013477088946,
      "grad_norm": 0.09464431143756959,
      "learning_rate": 4.157345989803375e-05,
      "loss": 2.4329,
      "step": 120
    },
    {
      "epoch": 0.3261455525606469,
      "grad_norm": 0.0921167133236036,
      "learning_rate": 4.141577143099074e-05,
      "loss": 2.411,
      "step": 121
    },
    {
      "epoch": 0.3288409703504043,
      "grad_norm": 0.09102913953050058,
      "learning_rate": 4.1256968182070275e-05,
      "loss": 2.6301,
      "step": 122
    },
    {
      "epoch": 0.33153638814016173,
      "grad_norm": 0.09442024128701812,
      "learning_rate": 4.109706287287998e-05,
      "loss": 2.2963,
      "step": 123
    },
    {
      "epoch": 0.33423180592991913,
      "grad_norm": 0.09048360971081038,
      "learning_rate": 4.093606831331269e-05,
      "loss": 2.3533,
      "step": 124
    },
    {
      "epoch": 0.33692722371967654,
      "grad_norm": 0.09958454863765234,
      "learning_rate": 4.077399740052027e-05,
      "loss": 2.5592,
      "step": 125
    },
    {
      "epoch": 0.33962264150943394,
      "grad_norm": 0.10352388511166186,
      "learning_rate": 4.06108631178804e-05,
      "loss": 2.5035,
      "step": 126
    },
    {
      "epoch": 0.3423180592991914,
      "grad_norm": 0.09119254602507475,
      "learning_rate": 4.044667853395655e-05,
      "loss": 2.3612,
      "step": 127
    },
    {
      "epoch": 0.3450134770889488,
      "grad_norm": 0.11817603428015877,
      "learning_rate": 4.0281456801451004e-05,
      "loss": 2.3564,
      "step": 128
    },
    {
      "epoch": 0.3477088948787062,
      "grad_norm": 0.08282970624160388,
      "learning_rate": 4.011521115615123e-05,
      "loss": 2.3187,
      "step": 129
    },
    {
      "epoch": 0.3504043126684636,
      "grad_norm": 0.09138997506736304,
      "learning_rate": 3.9947954915869565e-05,
      "loss": 2.4211,
      "step": 130
    },
    {
      "epoch": 0.353099730458221,
      "grad_norm": 0.10315874858803445,
      "learning_rate": 3.977970147937635e-05,
      "loss": 2.4255,
      "step": 131
    },
    {
      "epoch": 0.3557951482479784,
      "grad_norm": 0.12324355580259778,
      "learning_rate": 3.961046432532659e-05,
      "loss": 2.4149,
      "step": 132
    },
    {
      "epoch": 0.3584905660377358,
      "grad_norm": 0.07993016245985274,
      "learning_rate": 3.944025701118009e-05,
      "loss": 2.1721,
      "step": 133
    },
    {
      "epoch": 0.3611859838274933,
      "grad_norm": 0.08795688215451386,
      "learning_rate": 3.9269093172115496e-05,
      "loss": 2.3163,
      "step": 134
    },
    {
      "epoch": 0.3638814016172507,
      "grad_norm": 0.08804534236891878,
      "learning_rate": 3.9096986519937924e-05,
      "loss": 2.3146,
      "step": 135
    },
    {
      "epoch": 0.3665768194070081,
      "grad_norm": 0.08147499851736127,
      "learning_rate": 3.892395084198053e-05,
      "loss": 2.4056,
      "step": 136
    },
    {
      "epoch": 0.3692722371967655,
      "grad_norm": 0.08190511447175075,
      "learning_rate": 3.875e-05,
      "loss": 2.3873,
      "step": 137
    },
    {
      "epoch": 0.3719676549865229,
      "grad_norm": 0.10702441021155629,
      "learning_rate": 3.857514792906616e-05,
      "loss": 2.643,
      "step": 138
    },
    {
      "epoch": 0.3746630727762803,
      "grad_norm": 0.0785708035522026,
      "learning_rate": 3.839940863644556e-05,
      "loss": 2.5635,
      "step": 139
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 0.09362896847049527,
      "learning_rate": 3.822279620047943e-05,
      "loss": 2.377,
      "step": 140
    },
    {
      "epoch": 0.38005390835579517,
      "grad_norm": 0.09794943329180347,
      "learning_rate": 3.8045324769455834e-05,
      "loss": 2.5292,
      "step": 141
    },
    {
      "epoch": 0.38274932614555257,
      "grad_norm": 0.0906842624325085,
      "learning_rate": 3.78670085604763e-05,
      "loss": 2.3576,
      "step": 142
    },
    {
      "epoch": 0.38544474393531,
      "grad_norm": 0.09335279764944357,
      "learning_rate": 3.7687861858316844e-05,
      "loss": 2.1434,
      "step": 143
    },
    {
      "epoch": 0.3881401617250674,
      "grad_norm": 0.11106908056412888,
      "learning_rate": 3.7507899014283684e-05,
      "loss": 2.2124,
      "step": 144
    },
    {
      "epoch": 0.3908355795148248,
      "grad_norm": 0.10748379262846679,
      "learning_rate": 3.732713444506354e-05,
      "loss": 2.4934,
      "step": 145
    },
    {
      "epoch": 0.3935309973045822,
      "grad_norm": 0.08611757548383629,
      "learning_rate": 3.714558263156872e-05,
      "loss": 2.2559,
      "step": 146
    },
    {
      "epoch": 0.39622641509433965,
      "grad_norm": 0.08579591203817094,
      "learning_rate": 3.696325811777709e-05,
      "loss": 2.3151,
      "step": 147
    },
    {
      "epoch": 0.39892183288409705,
      "grad_norm": 0.08307902702869267,
      "learning_rate": 3.678017550956692e-05,
      "loss": 2.3902,
      "step": 148
    },
    {
      "epoch": 0.40161725067385445,
      "grad_norm": 0.0915570030354968,
      "learning_rate": 3.659634947354686e-05,
      "loss": 2.223,
      "step": 149
    },
    {
      "epoch": 0.40431266846361186,
      "grad_norm": 0.07897467847137554,
      "learning_rate": 3.6411794735881035e-05,
      "loss": 2.3241,
      "step": 150
    }
  ],
  "logging_steps": 1,
  "max_steps": 371,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 75,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 24793369804800.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}