{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 685, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0072992700729927005, "grad_norm": 708.0, "learning_rate": 2.898550724637681e-06, "loss": 56.8346, "step": 1 }, { "epoch": 0.0364963503649635, "grad_norm": 604.0, "learning_rate": 1.4492753623188407e-05, "loss": 52.9742, "step": 5 }, { "epoch": 0.072992700729927, "grad_norm": 340.0, "learning_rate": 2.8985507246376814e-05, "loss": 39.0746, "step": 10 }, { "epoch": 0.10948905109489052, "grad_norm": 40.25, "learning_rate": 4.347826086956522e-05, "loss": 20.8099, "step": 15 }, { "epoch": 0.145985401459854, "grad_norm": 25.5, "learning_rate": 5.797101449275363e-05, "loss": 17.6144, "step": 20 }, { "epoch": 0.18248175182481752, "grad_norm": 7.78125, "learning_rate": 7.246376811594203e-05, "loss": 15.3803, "step": 25 }, { "epoch": 0.21897810218978103, "grad_norm": 6.40625, "learning_rate": 8.695652173913044e-05, "loss": 14.0798, "step": 30 }, { "epoch": 0.25547445255474455, "grad_norm": 13.4375, "learning_rate": 0.00010144927536231885, "loss": 13.4032, "step": 35 }, { "epoch": 0.291970802919708, "grad_norm": 41.0, "learning_rate": 0.00011594202898550725, "loss": 10.8827, "step": 40 }, { "epoch": 0.3284671532846715, "grad_norm": 13.1875, "learning_rate": 0.00013043478260869567, "loss": 4.5915, "step": 45 }, { "epoch": 0.36496350364963503, "grad_norm": 4.09375, "learning_rate": 0.00014492753623188405, "loss": 1.9, "step": 50 }, { "epoch": 0.40145985401459855, "grad_norm": 2.28125, "learning_rate": 0.00015942028985507247, "loss": 1.6474, "step": 55 }, { "epoch": 0.43795620437956206, "grad_norm": 3.5, "learning_rate": 0.00017391304347826088, "loss": 1.477, "step": 60 }, { "epoch": 0.4744525547445255, "grad_norm": 2.28125, "learning_rate": 0.00018840579710144927, "loss": 1.3309, "step": 65 }, { "epoch": 0.5109489051094891, "grad_norm": 1.6171875, "learning_rate": 0.00019999869950890106, "loss": 1.2538, "step": 70 }, { "epoch": 0.5474452554744526, "grad_norm": 5.9375, "learning_rate": 0.0001999531858720213, "loss": 1.224, "step": 75 }, { "epoch": 0.583941605839416, "grad_norm": 2.25, "learning_rate": 0.00019984268150178167, "loss": 1.1823, "step": 80 }, { "epoch": 0.6204379562043796, "grad_norm": 2.078125, "learning_rate": 0.00019966725824941932, "loss": 1.1279, "step": 85 }, { "epoch": 0.656934306569343, "grad_norm": 3.0625, "learning_rate": 0.00019942703017718975, "loss": 1.127, "step": 90 }, { "epoch": 0.6934306569343066, "grad_norm": 1.75, "learning_rate": 0.000199122153484202, "loss": 1.1284, "step": 95 }, { "epoch": 0.7299270072992701, "grad_norm": 1.5625, "learning_rate": 0.00019875282640485645, "loss": 1.0566, "step": 100 }, { "epoch": 0.7664233576642335, "grad_norm": 4.53125, "learning_rate": 0.0001983192890799503, "loss": 1.0361, "step": 105 }, { "epoch": 0.8029197080291971, "grad_norm": 2.5, "learning_rate": 0.0001978218234005352, "loss": 1.0371, "step": 110 }, { "epoch": 0.8394160583941606, "grad_norm": 1.890625, "learning_rate": 0.00019726075282462845, "loss": 1.0235, "step": 115 }, { "epoch": 0.8759124087591241, "grad_norm": 0.67578125, "learning_rate": 0.00019663644216689683, "loss": 0.996, "step": 120 }, { "epoch": 0.9124087591240876, "grad_norm": 1.2421875, "learning_rate": 0.00019594929736144976, "loss": 0.9734, "step": 125 }, { "epoch": 0.948905109489051, "grad_norm": 1.5625, "learning_rate": 0.00019519976519789616, "loss": 0.978, "step": 130 }, { "epoch": 0.9854014598540146, "grad_norm": 0.95703125, "learning_rate": 0.00019438833303083678, "loss": 0.9712, "step": 135 }, { "epoch": 1.0, "eval_loss": 2.307734489440918, "eval_runtime": 0.9962, "eval_samples_per_second": 5.019, "eval_steps_per_second": 2.008, "step": 137 }, { "epoch": 1.0218978102189782, "grad_norm": 2.125, "learning_rate": 0.00019351552846298025, "loss": 0.9374, "step": 140 }, { "epoch": 1.0583941605839415, "grad_norm": 2.265625, "learning_rate": 0.0001925819190020898, "loss": 0.9173, "step": 145 }, { "epoch": 1.094890510948905, "grad_norm": 0.828125, "learning_rate": 0.00019158811169198313, "loss": 0.8916, "step": 150 }, { "epoch": 1.1313868613138687, "grad_norm": 1.0703125, "learning_rate": 0.0001905347527178252, "loss": 0.9418, "step": 155 }, { "epoch": 1.167883211678832, "grad_norm": 0.9140625, "learning_rate": 0.00018942252698597113, "loss": 0.9054, "step": 160 }, { "epoch": 1.2043795620437956, "grad_norm": 2.0625, "learning_rate": 0.00018825215767863214, "loss": 0.9039, "step": 165 }, { "epoch": 1.2408759124087592, "grad_norm": 1.5859375, "learning_rate": 0.00018702440578365387, "loss": 0.9146, "step": 170 }, { "epoch": 1.2773722627737225, "grad_norm": 1.3515625, "learning_rate": 0.00018574006959971333, "loss": 0.8896, "step": 175 }, { "epoch": 1.313868613138686, "grad_norm": 2.09375, "learning_rate": 0.00018439998421725554, "loss": 0.8947, "step": 180 }, { "epoch": 1.3503649635036497, "grad_norm": 0.80078125, "learning_rate": 0.00018300502097550806, "loss": 0.881, "step": 185 }, { "epoch": 1.3868613138686132, "grad_norm": 0.80078125, "learning_rate": 0.00018155608689592604, "loss": 0.8906, "step": 190 }, { "epoch": 1.4233576642335766, "grad_norm": 0.80859375, "learning_rate": 0.00018005412409243606, "loss": 0.8939, "step": 195 }, { "epoch": 1.4598540145985401, "grad_norm": 1.0234375, "learning_rate": 0.0001785001091588628, "loss": 0.9016, "step": 200 }, { "epoch": 1.4963503649635037, "grad_norm": 0.70703125, "learning_rate": 0.0001768950525339362, "loss": 0.8943, "step": 205 }, { "epoch": 1.5328467153284673, "grad_norm": 1.2109375, "learning_rate": 0.00017523999784429238, "loss": 0.8614, "step": 210 }, { "epoch": 1.5693430656934306, "grad_norm": 0.7734375, "learning_rate": 0.00017353602122589527, "loss": 0.8788, "step": 215 }, { "epoch": 1.6058394160583942, "grad_norm": 0.82421875, "learning_rate": 0.0001717842306243205, "loss": 0.8833, "step": 220 }, { "epoch": 1.6423357664233578, "grad_norm": 0.84765625, "learning_rate": 0.00016998576507435618, "loss": 0.8713, "step": 225 }, { "epoch": 1.6788321167883211, "grad_norm": 1.234375, "learning_rate": 0.00016814179395938913, "loss": 0.8661, "step": 230 }, { "epoch": 1.7153284671532847, "grad_norm": 0.91015625, "learning_rate": 0.00016625351625105796, "loss": 0.8413, "step": 235 }, { "epoch": 1.7518248175182483, "grad_norm": 0.63671875, "learning_rate": 0.0001643221597296679, "loss": 0.8741, "step": 240 }, { "epoch": 1.7883211678832116, "grad_norm": 0.73046875, "learning_rate": 0.00016234898018587337, "loss": 0.8744, "step": 245 }, { "epoch": 1.8248175182481752, "grad_norm": 0.671875, "learning_rate": 0.00016033526060414842, "loss": 0.8517, "step": 250 }, { "epoch": 1.8613138686131387, "grad_norm": 1.0234375, "learning_rate": 0.00015828231032857503, "loss": 0.8899, "step": 255 }, { "epoch": 1.897810218978102, "grad_norm": 0.66796875, "learning_rate": 0.00015619146421149232, "loss": 0.8537, "step": 260 }, { "epoch": 1.9343065693430657, "grad_norm": 0.7109375, "learning_rate": 0.00015406408174555976, "loss": 0.8329, "step": 265 }, { "epoch": 1.9708029197080292, "grad_norm": 0.71875, "learning_rate": 0.00015190154617979938, "loss": 0.8675, "step": 270 }, { "epoch": 2.0, "eval_loss": 2.247941017150879, "eval_runtime": 0.9979, "eval_samples_per_second": 5.01, "eval_steps_per_second": 2.004, "step": 274 }, { "epoch": 2.0072992700729926, "grad_norm": 0.80859375, "learning_rate": 0.00014970526362019079, "loss": 0.8435, "step": 275 }, { "epoch": 2.0437956204379564, "grad_norm": 1.515625, "learning_rate": 0.00014747666211540459, "loss": 0.7774, "step": 280 }, { "epoch": 2.0802919708029197, "grad_norm": 1.0859375, "learning_rate": 0.00014521719072826858, "loss": 0.79, "step": 285 }, { "epoch": 2.116788321167883, "grad_norm": 0.498046875, "learning_rate": 0.00014292831859356997, "loss": 0.7929, "step": 290 }, { "epoch": 2.153284671532847, "grad_norm": 1.59375, "learning_rate": 0.00014061153396280674, "loss": 0.8032, "step": 295 }, { "epoch": 2.18978102189781, "grad_norm": 0.83203125, "learning_rate": 0.000138268343236509, "loss": 0.7932, "step": 300 }, { "epoch": 2.2262773722627736, "grad_norm": 0.734375, "learning_rate": 0.00013590026998475986, "loss": 0.7657, "step": 305 }, { "epoch": 2.2627737226277373, "grad_norm": 0.609375, "learning_rate": 0.0001335088539565523, "loss": 0.783, "step": 310 }, { "epoch": 2.2992700729927007, "grad_norm": 0.71484375, "learning_rate": 0.00013109565007862596, "loss": 0.7755, "step": 315 }, { "epoch": 2.335766423357664, "grad_norm": 0.609375, "learning_rate": 0.0001286622274444361, "loss": 0.7723, "step": 320 }, { "epoch": 2.372262773722628, "grad_norm": 1.3359375, "learning_rate": 0.00012621016829391022, "loss": 0.7739, "step": 325 }, { "epoch": 2.408759124087591, "grad_norm": 1.1328125, "learning_rate": 0.00012374106698465732, "loss": 0.7821, "step": 330 }, { "epoch": 2.445255474452555, "grad_norm": 0.91015625, "learning_rate": 0.00012125652895529766, "loss": 0.7852, "step": 335 }, { "epoch": 2.4817518248175183, "grad_norm": 0.74609375, "learning_rate": 0.00011875816968158815, "loss": 0.7792, "step": 340 }, { "epoch": 2.5182481751824817, "grad_norm": 0.625, "learning_rate": 0.00011624761362602061, "loss": 0.7799, "step": 345 }, { "epoch": 2.554744525547445, "grad_norm": 0.81640625, "learning_rate": 0.00011372649318157749, "loss": 0.7914, "step": 350 }, { "epoch": 2.591240875912409, "grad_norm": 0.80078125, "learning_rate": 0.00011119644761033078, "loss": 0.7847, "step": 355 }, { "epoch": 2.627737226277372, "grad_norm": 0.984375, "learning_rate": 0.0001086591219775746, "loss": 0.8049, "step": 360 }, { "epoch": 2.664233576642336, "grad_norm": 0.81640625, "learning_rate": 0.00010611616608218429, "loss": 0.7865, "step": 365 }, { "epoch": 2.7007299270072993, "grad_norm": 0.51953125, "learning_rate": 0.00010356923338389806, "loss": 0.7908, "step": 370 }, { "epoch": 2.7372262773722627, "grad_norm": 0.53125, "learning_rate": 0.00010101997992821797, "loss": 0.7925, "step": 375 }, { "epoch": 2.7737226277372264, "grad_norm": 0.49609375, "learning_rate": 9.847006326962974e-05, "loss": 0.799, "step": 380 }, { "epoch": 2.81021897810219, "grad_norm": 0.51171875, "learning_rate": 9.592114139384145e-05, "loss": 0.7832, "step": 385 }, { "epoch": 2.846715328467153, "grad_norm": 0.7109375, "learning_rate": 9.337487163974164e-05, "loss": 0.7796, "step": 390 }, { "epoch": 2.883211678832117, "grad_norm": 0.6328125, "learning_rate": 9.083290962177828e-05, "loss": 0.7839, "step": 395 }, { "epoch": 2.9197080291970803, "grad_norm": 0.59765625, "learning_rate": 8.829690815345886e-05, "loss": 0.7781, "step": 400 }, { "epoch": 2.9562043795620436, "grad_norm": 0.58203125, "learning_rate": 8.57685161726715e-05, "loss": 0.7457, "step": 405 }, { "epoch": 2.9927007299270074, "grad_norm": 0.6171875, "learning_rate": 8.324937766952638e-05, "loss": 0.7623, "step": 410 }, { "epoch": 3.0, "eval_loss": 2.275648355484009, "eval_runtime": 0.9945, "eval_samples_per_second": 5.028, "eval_steps_per_second": 2.011, "step": 411 }, { "epoch": 3.0291970802919708, "grad_norm": 0.8359375, "learning_rate": 8.074113061741397e-05, "loss": 0.7329, "step": 415 }, { "epoch": 3.065693430656934, "grad_norm": 0.50390625, "learning_rate": 7.824540590797568e-05, "loss": 0.7052, "step": 420 }, { "epoch": 3.102189781021898, "grad_norm": 0.5703125, "learning_rate": 7.576382629067877e-05, "loss": 0.7015, "step": 425 }, { "epoch": 3.1386861313868613, "grad_norm": 0.6015625, "learning_rate": 7.329800531768584e-05, "loss": 0.696, "step": 430 }, { "epoch": 3.1751824817518246, "grad_norm": 0.55078125, "learning_rate": 7.084954629470417e-05, "loss": 0.7154, "step": 435 }, { "epoch": 3.2116788321167884, "grad_norm": 0.59765625, "learning_rate": 6.842004123849752e-05, "loss": 0.7113, "step": 440 }, { "epoch": 3.2481751824817517, "grad_norm": 0.5625, "learning_rate": 6.601106984173835e-05, "loss": 0.7139, "step": 445 }, { "epoch": 3.2846715328467155, "grad_norm": 0.59765625, "learning_rate": 6.362419844587287e-05, "loss": 0.6967, "step": 450 }, { "epoch": 3.321167883211679, "grad_norm": 0.52734375, "learning_rate": 6.126097902266772e-05, "loss": 0.7073, "step": 455 }, { "epoch": 3.3576642335766422, "grad_norm": 0.5625, "learning_rate": 5.8922948165099524e-05, "loss": 0.6857, "step": 460 }, { "epoch": 3.394160583941606, "grad_norm": 0.55859375, "learning_rate": 5.6611626088244194e-05, "loss": 0.7199, "step": 465 }, { "epoch": 3.4306569343065694, "grad_norm": 0.58203125, "learning_rate": 5.432851564081534e-05, "loss": 0.7075, "step": 470 }, { "epoch": 3.4671532846715327, "grad_norm": 0.52734375, "learning_rate": 5.207510132799436e-05, "loss": 0.7006, "step": 475 }, { "epoch": 3.5036496350364965, "grad_norm": 0.53515625, "learning_rate": 4.9852848346187566e-05, "loss": 0.7151, "step": 480 }, { "epoch": 3.54014598540146, "grad_norm": 0.546875, "learning_rate": 4.7663201630338816e-05, "loss": 0.7129, "step": 485 }, { "epoch": 3.576642335766423, "grad_norm": 0.5859375, "learning_rate": 4.550758491441526e-05, "loss": 0.7139, "step": 490 }, { "epoch": 3.613138686131387, "grad_norm": 0.51953125, "learning_rate": 4.3387399805679255e-05, "loss": 0.7162, "step": 495 }, { "epoch": 3.6496350364963503, "grad_norm": 0.55859375, "learning_rate": 4.1304024873346705e-05, "loss": 0.7132, "step": 500 }, { "epoch": 3.686131386861314, "grad_norm": 0.57421875, "learning_rate": 3.9258814752225284e-05, "loss": 0.7007, "step": 505 }, { "epoch": 3.7226277372262775, "grad_norm": 0.546875, "learning_rate": 3.725309926191479e-05, "loss": 0.7037, "step": 510 }, { "epoch": 3.759124087591241, "grad_norm": 0.73828125, "learning_rate": 3.528818254214329e-05, "loss": 0.7255, "step": 515 }, { "epoch": 3.795620437956204, "grad_norm": 0.52734375, "learning_rate": 3.336534220479961e-05, "loss": 0.6966, "step": 520 }, { "epoch": 3.832116788321168, "grad_norm": 0.5078125, "learning_rate": 3.1485828503215585e-05, "loss": 0.7143, "step": 525 }, { "epoch": 3.8686131386861313, "grad_norm": 0.6328125, "learning_rate": 2.9650863519236418e-05, "loss": 0.7005, "step": 530 }, { "epoch": 3.905109489051095, "grad_norm": 0.5703125, "learning_rate": 2.7861640368608844e-05, "loss": 0.7005, "step": 535 }, { "epoch": 3.9416058394160585, "grad_norm": 0.53125, "learning_rate": 2.6119322425203197e-05, "loss": 0.7139, "step": 540 }, { "epoch": 3.978102189781022, "grad_norm": 0.51953125, "learning_rate": 2.4425042564574184e-05, "loss": 0.709, "step": 545 }, { "epoch": 4.0, "eval_loss": 2.341665267944336, "eval_runtime": 0.9977, "eval_samples_per_second": 5.012, "eval_steps_per_second": 2.005, "step": 548 }, { "epoch": 4.014598540145985, "grad_norm": 0.53515625, "learning_rate": 2.277990242735185e-05, "loss": 0.6801, "step": 550 }, { "epoch": 4.0510948905109485, "grad_norm": 0.52734375, "learning_rate": 2.118497170294195e-05, "loss": 0.6495, "step": 555 }, { "epoch": 4.087591240875913, "grad_norm": 0.5625, "learning_rate": 1.9641287434001355e-05, "loss": 0.672, "step": 560 }, { "epoch": 4.124087591240876, "grad_norm": 0.55078125, "learning_rate": 1.8149853342140645e-05, "loss": 0.6611, "step": 565 }, { "epoch": 4.160583941605839, "grad_norm": 0.59375, "learning_rate": 1.671163917529285e-05, "loss": 0.662, "step": 570 }, { "epoch": 4.197080291970803, "grad_norm": 0.51171875, "learning_rate": 1.5327580077171587e-05, "loss": 0.6635, "step": 575 }, { "epoch": 4.233576642335766, "grad_norm": 0.54296875, "learning_rate": 1.3998575979229944e-05, "loss": 0.6624, "step": 580 }, { "epoch": 4.2700729927007295, "grad_norm": 0.50390625, "learning_rate": 1.272549101551438e-05, "loss": 0.6523, "step": 585 }, { "epoch": 4.306569343065694, "grad_norm": 0.51171875, "learning_rate": 1.1509152960794666e-05, "loss": 0.6607, "step": 590 }, { "epoch": 4.343065693430657, "grad_norm": 0.546875, "learning_rate": 1.035035269233493e-05, "loss": 0.6626, "step": 595 }, { "epoch": 4.37956204379562, "grad_norm": 0.54296875, "learning_rate": 9.249843675656212e-06, "loss": 0.678, "step": 600 }, { "epoch": 4.416058394160584, "grad_norm": 0.5234375, "learning_rate": 8.208341474624071e-06, "loss": 0.6783, "step": 605 }, { "epoch": 4.452554744525547, "grad_norm": 0.53515625, "learning_rate": 7.226523286180776e-06, "loss": 0.6699, "step": 610 }, { "epoch": 4.489051094890511, "grad_norm": 0.5703125, "learning_rate": 6.3050275000238414e-06, "loss": 0.6607, "step": 615 }, { "epoch": 4.525547445255475, "grad_norm": 0.5234375, "learning_rate": 5.4444532835175144e-06, "loss": 0.6702, "step": 620 }, { "epoch": 4.562043795620438, "grad_norm": 0.5234375, "learning_rate": 4.6453601921072395e-06, "loss": 0.6793, "step": 625 }, { "epoch": 4.598540145985401, "grad_norm": 0.5234375, "learning_rate": 3.908267805490051e-06, "loss": 0.6622, "step": 630 }, { "epoch": 4.635036496350365, "grad_norm": 0.54296875, "learning_rate": 3.233655389777801e-06, "loss": 0.677, "step": 635 }, { "epoch": 4.671532846715328, "grad_norm": 0.5234375, "learning_rate": 2.62196158587269e-06, "loss": 0.6588, "step": 640 }, { "epoch": 4.708029197080292, "grad_norm": 0.5234375, "learning_rate": 2.073584124257899e-06, "loss": 0.6621, "step": 645 }, { "epoch": 4.744525547445256, "grad_norm": 0.53515625, "learning_rate": 1.5888795663883904e-06, "loss": 0.6655, "step": 650 }, { "epoch": 4.781021897810219, "grad_norm": 0.515625, "learning_rate": 1.1681630728506699e-06, "loss": 0.6653, "step": 655 }, { "epoch": 4.817518248175182, "grad_norm": 0.52734375, "learning_rate": 8.117081984415298e-07, "loss": 0.6734, "step": 660 }, { "epoch": 4.854014598540146, "grad_norm": 0.5390625, "learning_rate": 5.19746714299596e-07, "loss": 0.6541, "step": 665 }, { "epoch": 4.89051094890511, "grad_norm": 0.5390625, "learning_rate": 2.9246845720496407e-07, "loss": 0.6722, "step": 670 }, { "epoch": 4.927007299270073, "grad_norm": 0.55859375, "learning_rate": 1.300212061451367e-07, "loss": 0.6472, "step": 675 }, { "epoch": 4.963503649635037, "grad_norm": 0.51953125, "learning_rate": 3.251058622737446e-08, "loss": 0.667, "step": 680 }, { "epoch": 5.0, "grad_norm": 0.52734375, "learning_rate": 0.0, "loss": 0.6601, "step": 685 }, { "epoch": 5.0, "eval_loss": 2.3811252117156982, "eval_runtime": 0.9953, "eval_samples_per_second": 5.024, "eval_steps_per_second": 2.01, "step": 685 }, { "epoch": 5.0, "step": 685, "total_flos": 1.0472781231601746e+18, "train_loss": 2.151051264783762, "train_runtime": 5341.9856, "train_samples_per_second": 2.052, "train_steps_per_second": 0.128 } ], "logging_steps": 5, "max_steps": 685, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0472781231601746e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }