{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 76.32773843110532, "learning_rate": 1.5873015873015874e-07, "loss": 4.4795, "step": 1 }, { "epoch": 0.04, "grad_norm": 71.81150350946773, "learning_rate": 7.936507936507937e-07, "loss": 4.307, "step": 5 }, { "epoch": 0.08, "grad_norm": 42.999152941442446, "learning_rate": 1.5873015873015873e-06, "loss": 3.4849, "step": 10 }, { "epoch": 0.12, "grad_norm": 46.44864615552445, "learning_rate": 2.380952380952381e-06, "loss": 1.5332, "step": 15 }, { "epoch": 0.16, "grad_norm": 5.770504948547854, "learning_rate": 3.1746031746031746e-06, "loss": 0.3264, "step": 20 }, { "epoch": 0.2, "grad_norm": 3.383097613641565, "learning_rate": 3.968253968253968e-06, "loss": 0.195, "step": 25 }, { "epoch": 0.24, "grad_norm": 2.543807204293022, "learning_rate": 4.761904761904762e-06, "loss": 0.152, "step": 30 }, { "epoch": 0.28, "grad_norm": 2.071517230467083, "learning_rate": 5.555555555555557e-06, "loss": 0.1464, "step": 35 }, { "epoch": 0.32, "grad_norm": 3.1880730134020405, "learning_rate": 6.349206349206349e-06, "loss": 0.1532, "step": 40 }, { "epoch": 0.36, "grad_norm": 2.273945566819242, "learning_rate": 7.1428571428571436e-06, "loss": 0.1477, "step": 45 }, { "epoch": 0.4, "grad_norm": 2.8908492033026, "learning_rate": 7.936507936507936e-06, "loss": 0.1655, "step": 50 }, { "epoch": 0.44, "grad_norm": 1.408828965452355, "learning_rate": 8.730158730158731e-06, "loss": 0.163, "step": 55 }, { "epoch": 0.48, "grad_norm": 1.7147558780209895, "learning_rate": 9.523809523809525e-06, "loss": 0.1581, "step": 60 }, { "epoch": 0.52, "grad_norm": 2.3248153460703316, "learning_rate": 9.999687519737639e-06, "loss": 0.1587, "step": 65 }, { "epoch": 0.56, "grad_norm": 3.19958218992742, "learning_rate": 9.996172565322375e-06, "loss": 0.1782, "step": 70 }, { "epoch": 0.6, "grad_norm": 2.3598525259461014, "learning_rate": 9.988754811052616e-06, "loss": 0.1708, "step": 75 }, { "epoch": 0.64, "grad_norm": 2.350384753086168, "learning_rate": 9.97744005136599e-06, "loss": 0.1676, "step": 80 }, { "epoch": 0.68, "grad_norm": 1.8952757818976034, "learning_rate": 9.962237124876828e-06, "loss": 0.1709, "step": 85 }, { "epoch": 0.72, "grad_norm": 1.7959529824065594, "learning_rate": 9.943157907471825e-06, "loss": 0.1818, "step": 90 }, { "epoch": 0.76, "grad_norm": 3.148673436638757, "learning_rate": 9.920217303033091e-06, "loss": 0.1868, "step": 95 }, { "epoch": 0.8, "grad_norm": 2.6291492953956035, "learning_rate": 9.893433231795864e-06, "loss": 0.1802, "step": 100 }, { "epoch": 0.84, "grad_norm": 2.116086213452932, "learning_rate": 9.862826616349981e-06, "loss": 0.1963, "step": 105 }, { "epoch": 0.88, "grad_norm": 2.590351057974938, "learning_rate": 9.828421365296023e-06, "loss": 0.1884, "step": 110 }, { "epoch": 0.92, "grad_norm": 2.2826587299690297, "learning_rate": 9.79024435456893e-06, "loss": 0.194, "step": 115 }, { "epoch": 0.96, "grad_norm": 1.7617227651937795, "learning_rate": 9.748325406443647e-06, "loss": 0.1814, "step": 120 }, { "epoch": 1.0, "grad_norm": 2.7640313184450886, "learning_rate": 9.702697266239211e-06, "loss": 0.18, "step": 125 }, { "epoch": 1.04, "grad_norm": 13.383134400696948, "learning_rate": 9.653395576739504e-06, "loss": 0.1596, "step": 130 }, { "epoch": 1.08, "grad_norm": 1.960084286202522, "learning_rate": 9.600458850350588e-06, "loss": 0.163, "step": 135 }, { "epoch": 1.12, "grad_norm": 1.6888079694486844, "learning_rate": 9.543928439016445e-06, "loss": 0.1712, "step": 140 }, { "epoch": 1.16, "grad_norm": 2.5101725396253265, "learning_rate": 9.483848501916578e-06, "loss": 0.1667, "step": 145 }, { "epoch": 1.2, "grad_norm": 1.7853092667274246, "learning_rate": 9.42026597097071e-06, "loss": 0.17, "step": 150 }, { "epoch": 1.24, "grad_norm": 1.9741842768861118, "learning_rate": 9.353230514177553e-06, "loss": 0.1679, "step": 155 }, { "epoch": 1.28, "grad_norm": 2.4658963041094615, "learning_rate": 9.282794496816244e-06, "loss": 0.1731, "step": 160 }, { "epoch": 1.32, "grad_norm": 2.923478442896321, "learning_rate": 9.209012940540806e-06, "loss": 0.1708, "step": 165 }, { "epoch": 1.3599999999999999, "grad_norm": 3.19217673963713, "learning_rate": 9.131943480399531e-06, "loss": 0.1752, "step": 170 }, { "epoch": 1.4, "grad_norm": 2.2076768543351575, "learning_rate": 9.05164631981292e-06, "loss": 0.1646, "step": 175 }, { "epoch": 1.44, "grad_norm": 1.5911362670253353, "learning_rate": 8.968184183545285e-06, "loss": 0.1558, "step": 180 }, { "epoch": 1.48, "grad_norm": 2.438333228986101, "learning_rate": 8.881622268706825e-06, "loss": 0.1639, "step": 185 }, { "epoch": 1.52, "grad_norm": 1.8709425955134034, "learning_rate": 8.792028193824364e-06, "loss": 0.1566, "step": 190 }, { "epoch": 1.56, "grad_norm": 2.273561130366255, "learning_rate": 8.699471946020612e-06, "loss": 0.1743, "step": 195 }, { "epoch": 1.6, "grad_norm": 2.2532334985776963, "learning_rate": 8.604025826343167e-06, "loss": 0.1614, "step": 200 }, { "epoch": 1.6400000000000001, "grad_norm": 2.8000291867854368, "learning_rate": 8.505764393285985e-06, "loss": 0.1688, "step": 205 }, { "epoch": 1.6800000000000002, "grad_norm": 3.5386862233237553, "learning_rate": 8.404764404547404e-06, "loss": 0.1736, "step": 210 }, { "epoch": 1.72, "grad_norm": 2.1915281083973475, "learning_rate": 8.301104757070276e-06, "loss": 0.1655, "step": 215 }, { "epoch": 1.76, "grad_norm": 1.735218330228595, "learning_rate": 8.194866425410984e-06, "loss": 0.1511, "step": 220 }, { "epoch": 1.8, "grad_norm": 2.0251426912636514, "learning_rate": 8.086132398485525e-06, "loss": 0.1499, "step": 225 }, { "epoch": 1.8399999999999999, "grad_norm": 2.595962962052259, "learning_rate": 7.974987614742066e-06, "loss": 0.1696, "step": 230 }, { "epoch": 1.88, "grad_norm": 1.8465107375877872, "learning_rate": 7.861518895810597e-06, "loss": 0.1645, "step": 235 }, { "epoch": 1.92, "grad_norm": 1.7181169308567998, "learning_rate": 7.745814878681516e-06, "loss": 0.1628, "step": 240 }, { "epoch": 1.96, "grad_norm": 1.914333923678904, "learning_rate": 7.627965946466167e-06, "loss": 0.1453, "step": 245 }, { "epoch": 2.0, "grad_norm": 1.6246422019481361, "learning_rate": 7.50806415779332e-06, "loss": 0.1489, "step": 250 }, { "epoch": 2.04, "grad_norm": 1.5381806636260356, "learning_rate": 7.386203174896872e-06, "loss": 0.1361, "step": 255 }, { "epoch": 2.08, "grad_norm": 1.6845512041994002, "learning_rate": 7.262478190450834e-06, "loss": 0.1332, "step": 260 }, { "epoch": 2.12, "grad_norm": 2.367964043590009, "learning_rate": 7.136985853208824e-06, "loss": 0.1426, "step": 265 }, { "epoch": 2.16, "grad_norm": 1.7353039468487315, "learning_rate": 7.0098241925061215e-06, "loss": 0.1348, "step": 270 }, { "epoch": 2.2, "grad_norm": 1.8745427410009357, "learning_rate": 6.881092541683279e-06, "loss": 0.1367, "step": 275 }, { "epoch": 2.24, "grad_norm": 1.7486606607428097, "learning_rate": 6.750891460491093e-06, "loss": 0.1432, "step": 280 }, { "epoch": 2.2800000000000002, "grad_norm": 2.7256153449292957, "learning_rate": 6.619322656537552e-06, "loss": 0.1327, "step": 285 }, { "epoch": 2.32, "grad_norm": 2.011781932169801, "learning_rate": 6.486488905838143e-06, "loss": 0.1471, "step": 290 }, { "epoch": 2.36, "grad_norm": 1.958955631206453, "learning_rate": 6.352493972531535e-06, "loss": 0.146, "step": 295 }, { "epoch": 2.4, "grad_norm": 2.125212212501986, "learning_rate": 6.2174425278234115e-06, "loss": 0.1439, "step": 300 }, { "epoch": 2.44, "grad_norm": 1.7405119151293693, "learning_rate": 6.0814400682217236e-06, "loss": 0.1345, "step": 305 }, { "epoch": 2.48, "grad_norm": 1.307904513904714, "learning_rate": 5.944592833127253e-06, "loss": 0.1412, "step": 310 }, { "epoch": 2.52, "grad_norm": 1.5618909410769424, "learning_rate": 5.807007721843862e-06, "loss": 0.1396, "step": 315 }, { "epoch": 2.56, "grad_norm": 1.6001114294946008, "learning_rate": 5.668792210073255e-06, "loss": 0.1381, "step": 320 }, { "epoch": 2.6, "grad_norm": 2.0825376870245433, "learning_rate": 5.530054265959486e-06, "loss": 0.1279, "step": 325 }, { "epoch": 2.64, "grad_norm": 1.4899012611449984, "learning_rate": 5.39090226574877e-06, "loss": 0.1273, "step": 330 }, { "epoch": 2.68, "grad_norm": 1.3587693779949943, "learning_rate": 5.2514449091305375e-06, "loss": 0.139, "step": 335 }, { "epoch": 2.7199999999999998, "grad_norm": 1.3905248184940833, "learning_rate": 5.111791134325793e-06, "loss": 0.1337, "step": 340 }, { "epoch": 2.76, "grad_norm": 1.7261472057985348, "learning_rate": 4.9720500329891755e-06, "loss": 0.1419, "step": 345 }, { "epoch": 2.8, "grad_norm": 1.5882220461692291, "learning_rate": 4.832330764991131e-06, "loss": 0.129, "step": 350 }, { "epoch": 2.84, "grad_norm": 33.74153723265867, "learning_rate": 4.692742473146818e-06, "loss": 0.182, "step": 355 }, { "epoch": 2.88, "grad_norm": 1.2964424054104968, "learning_rate": 4.553394197958339e-06, "loss": 0.1353, "step": 360 }, { "epoch": 2.92, "grad_norm": 1.203808822414652, "learning_rate": 4.414394792436877e-06, "loss": 0.1394, "step": 365 }, { "epoch": 2.96, "grad_norm": 1.2806702384421942, "learning_rate": 4.275852837071309e-06, "loss": 0.1251, "step": 370 }, { "epoch": 3.0, "grad_norm": 1.337023723464791, "learning_rate": 4.137876555009684e-06, "loss": 0.1376, "step": 375 }, { "epoch": 3.04, "grad_norm": 1.12357649083518, "learning_rate": 4.000573727519868e-06, "loss": 0.12, "step": 380 }, { "epoch": 3.08, "grad_norm": 1.1261752050608114, "learning_rate": 3.86405160979534e-06, "loss": 0.12, "step": 385 }, { "epoch": 3.12, "grad_norm": 1.31280326818583, "learning_rate": 3.7284168471719527e-06, "loss": 0.1153, "step": 390 }, { "epoch": 3.16, "grad_norm": 1.3340360591201343, "learning_rate": 3.5937753918210705e-06, "loss": 0.125, "step": 395 }, { "epoch": 3.2, "grad_norm": 1.074093734063287, "learning_rate": 3.4602324199842026e-06, "loss": 0.1201, "step": 400 }, { "epoch": 3.24, "grad_norm": 1.3868574560188593, "learning_rate": 3.3278922498137455e-06, "loss": 0.1165, "step": 405 }, { "epoch": 3.2800000000000002, "grad_norm": 1.5476889671334009, "learning_rate": 3.1968582598840234e-06, "loss": 0.1348, "step": 410 }, { "epoch": 3.32, "grad_norm": 2.2098782619120945, "learning_rate": 3.067232808436299e-06, "loss": 0.1264, "step": 415 }, { "epoch": 3.36, "grad_norm": 1.4042367103273385, "learning_rate": 2.9391171534208185e-06, "loss": 0.1254, "step": 420 }, { "epoch": 3.4, "grad_norm": 0.9517570079120965, "learning_rate": 2.812611373398365e-06, "loss": 0.1196, "step": 425 }, { "epoch": 3.44, "grad_norm": 1.411957760122345, "learning_rate": 2.6878142893630904e-06, "loss": 0.1237, "step": 430 }, { "epoch": 3.48, "grad_norm": 1.0469263122557906, "learning_rate": 2.564823387547716e-06, "loss": 0.1234, "step": 435 }, { "epoch": 3.52, "grad_norm": 1.0880117862903047, "learning_rate": 2.4437347432713838e-06, "loss": 0.1281, "step": 440 }, { "epoch": 3.56, "grad_norm": 1.4980302601520634, "learning_rate": 2.3246429458896637e-06, "loss": 0.1196, "step": 445 }, { "epoch": 3.6, "grad_norm": 1.0217971404002908, "learning_rate": 2.207641024905322e-06, "loss": 0.1224, "step": 450 }, { "epoch": 3.64, "grad_norm": 1.1671521628697192, "learning_rate": 2.0928203772975917e-06, "loss": 0.119, "step": 455 }, { "epoch": 3.68, "grad_norm": 1.107501819947676, "learning_rate": 1.9802706961266936e-06, "loss": 0.1201, "step": 460 }, { "epoch": 3.7199999999999998, "grad_norm": 0.9124789903191979, "learning_rate": 1.870079900469392e-06, "loss": 0.1204, "step": 465 }, { "epoch": 3.76, "grad_norm": 1.4405450223921223, "learning_rate": 1.7623340667403089e-06, "loss": 0.1173, "step": 470 }, { "epoch": 3.8, "grad_norm": 1.2383539174028362, "learning_rate": 1.657117361452651e-06, "loss": 0.123, "step": 475 }, { "epoch": 3.84, "grad_norm": 1.5492361002489776, "learning_rate": 1.5545119754708682e-06, "loss": 0.1213, "step": 480 }, { "epoch": 3.88, "grad_norm": 0.9688910602285367, "learning_rate": 1.454598059806609e-06, "loss": 0.1244, "step": 485 }, { "epoch": 3.92, "grad_norm": 1.2497049971642828, "learning_rate": 1.3574536630081208e-06, "loss": 0.1266, "step": 490 }, { "epoch": 3.96, "grad_norm": 1.4359900705563882, "learning_rate": 1.2631546701920073e-06, "loss": 0.1196, "step": 495 }, { "epoch": 4.0, "grad_norm": 1.013179470712264, "learning_rate": 1.1717747437649657e-06, "loss": 0.1186, "step": 500 }, { "epoch": 4.04, "grad_norm": 1.128971555703373, "learning_rate": 1.0833852658818167e-06, "loss": 0.1077, "step": 505 }, { "epoch": 4.08, "grad_norm": 0.8064418108816587, "learning_rate": 9.980552826847635e-07, "loss": 0.1099, "step": 510 }, { "epoch": 4.12, "grad_norm": 0.8053995942309401, "learning_rate": 9.158514503674543e-07, "loss": 0.1122, "step": 515 }, { "epoch": 4.16, "grad_norm": 1.0137589959248894, "learning_rate": 8.368379831059592e-07, "loss": 0.1108, "step": 520 }, { "epoch": 4.2, "grad_norm": 1.179886916033213, "learning_rate": 7.61076602897371e-07, "loss": 0.1168, "step": 525 }, { "epoch": 4.24, "grad_norm": 1.1023307485760356, "learning_rate": 6.886264913451635e-07, "loss": 0.111, "step": 530 }, { "epoch": 4.28, "grad_norm": 0.8911248731367788, "learning_rate": 6.1954424342902e-07, "loss": 0.114, "step": 535 }, { "epoch": 4.32, "grad_norm": 0.9031682796275837, "learning_rate": 5.538838232952104e-07, "loss": 0.1099, "step": 540 }, { "epoch": 4.36, "grad_norm": 0.9777576873311943, "learning_rate": 4.916965221020753e-07, "loss": 0.1149, "step": 545 }, { "epoch": 4.4, "grad_norm": 0.794466996633541, "learning_rate": 4.3303091795353024e-07, "loss": 0.109, "step": 550 }, { "epoch": 4.44, "grad_norm": 1.1715360106229433, "learning_rate": 3.779328379518898e-07, "loss": 0.119, "step": 555 }, { "epoch": 4.48, "grad_norm": 0.9196816880764788, "learning_rate": 3.2644532239966444e-07, "loss": 0.1107, "step": 560 }, { "epoch": 4.52, "grad_norm": 0.8511671131233757, "learning_rate": 2.7860859117828985e-07, "loss": 0.1106, "step": 565 }, { "epoch": 4.5600000000000005, "grad_norm": 1.0226582223355896, "learning_rate": 2.3446001233004333e-07, "loss": 0.1176, "step": 570 }, { "epoch": 4.6, "grad_norm": 0.8883730009317985, "learning_rate": 1.9403407286770592e-07, "loss": 0.1122, "step": 575 }, { "epoch": 4.64, "grad_norm": 0.9879760794050276, "learning_rate": 1.573623518347517e-07, "loss": 0.1122, "step": 580 }, { "epoch": 4.68, "grad_norm": 0.8638827655369075, "learning_rate": 1.2447349563713186e-07, "loss": 0.1144, "step": 585 }, { "epoch": 4.72, "grad_norm": 0.8402422894300282, "learning_rate": 9.539319566590766e-08, "loss": 0.1108, "step": 590 }, { "epoch": 4.76, "grad_norm": 1.173156707092929, "learning_rate": 7.014416822821557e-08, "loss": 0.1186, "step": 595 }, { "epoch": 4.8, "grad_norm": 1.0955817536201944, "learning_rate": 4.8746136802240716e-08, "loss": 0.1115, "step": 600 }, { "epoch": 4.84, "grad_norm": 0.9330940788218187, "learning_rate": 3.121581663007134e-08, "loss": 0.1157, "step": 605 }, { "epoch": 4.88, "grad_norm": 1.1560897368741774, "learning_rate": 1.75669016604485e-08, "loss": 0.1035, "step": 610 }, { "epoch": 4.92, "grad_norm": 1.0534577947172947, "learning_rate": 7.81005385163458e-09, "loss": 0.1134, "step": 615 }, { "epoch": 4.96, "grad_norm": 0.893194596809323, "learning_rate": 1.952894842735531e-09, "loss": 0.1145, "step": 620 }, { "epoch": 5.0, "grad_norm": 0.84002664295751, "learning_rate": 0.0, "loss": 0.1091, "step": 625 }, { "epoch": 5.0, "step": 625, "total_flos": 1839624683520.0, "train_loss": 0.2137949773788452, "train_runtime": 9889.1554, "train_samples_per_second": 1.011, "train_steps_per_second": 0.063 } ], "logging_steps": 5, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1839624683520.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }