{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99695843190267, "eval_steps": 500, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027036160865157147, "grad_norm": 3.523985384534832, "learning_rate": 5e-06, "loss": 0.8602, "step": 10 }, { "epoch": 0.054072321730314295, "grad_norm": 2.330230157876199, "learning_rate": 5e-06, "loss": 0.6873, "step": 20 }, { "epoch": 0.08110848259547145, "grad_norm": 2.692866742411895, "learning_rate": 5e-06, "loss": 0.6489, "step": 30 }, { "epoch": 0.10814464346062859, "grad_norm": 1.742013451785388, "learning_rate": 5e-06, "loss": 0.6363, "step": 40 }, { "epoch": 0.13518080432578575, "grad_norm": 1.9825861616522564, "learning_rate": 5e-06, "loss": 0.6212, "step": 50 }, { "epoch": 0.1622169651909429, "grad_norm": 1.963028689906959, "learning_rate": 5e-06, "loss": 0.6158, "step": 60 }, { "epoch": 0.18925312605610004, "grad_norm": 1.5400134410581494, "learning_rate": 5e-06, "loss": 0.6126, "step": 70 }, { "epoch": 0.21628928692125718, "grad_norm": 1.7957880103549582, "learning_rate": 5e-06, "loss": 0.6046, "step": 80 }, { "epoch": 0.24332544778641432, "grad_norm": 2.0825972162611626, "learning_rate": 5e-06, "loss": 0.6015, "step": 90 }, { "epoch": 0.2703616086515715, "grad_norm": 1.5684743614874819, "learning_rate": 5e-06, "loss": 0.6044, "step": 100 }, { "epoch": 0.29739776951672864, "grad_norm": 1.5536576895608207, "learning_rate": 5e-06, "loss": 0.6009, "step": 110 }, { "epoch": 0.3244339303818858, "grad_norm": 1.9211273900718062, "learning_rate": 5e-06, "loss": 0.5959, "step": 120 }, { "epoch": 0.3514700912470429, "grad_norm": 1.857918426411918, "learning_rate": 5e-06, "loss": 0.5981, "step": 130 }, { "epoch": 0.37850625211220007, "grad_norm": 2.480111887309551, "learning_rate": 5e-06, "loss": 0.5972, "step": 140 }, { "epoch": 0.4055424129773572, "grad_norm": 1.9435865546624187, "learning_rate": 5e-06, "loss": 0.589, "step": 150 }, { "epoch": 0.43257857384251436, "grad_norm": 2.256806266648931, "learning_rate": 5e-06, "loss": 0.5905, "step": 160 }, { "epoch": 0.4596147347076715, "grad_norm": 1.9395141957859183, "learning_rate": 5e-06, "loss": 0.5907, "step": 170 }, { "epoch": 0.48665089557282865, "grad_norm": 1.70470436825217, "learning_rate": 5e-06, "loss": 0.5891, "step": 180 }, { "epoch": 0.5136870564379858, "grad_norm": 1.2803454947922437, "learning_rate": 5e-06, "loss": 0.5864, "step": 190 }, { "epoch": 0.540723217303143, "grad_norm": 1.6583430834053514, "learning_rate": 5e-06, "loss": 0.5845, "step": 200 }, { "epoch": 0.5677593781683001, "grad_norm": 1.6236160787848246, "learning_rate": 5e-06, "loss": 0.5873, "step": 210 }, { "epoch": 0.5947955390334573, "grad_norm": 1.4448675180275712, "learning_rate": 5e-06, "loss": 0.5829, "step": 220 }, { "epoch": 0.6218316998986144, "grad_norm": 1.5221126449990794, "learning_rate": 5e-06, "loss": 0.5844, "step": 230 }, { "epoch": 0.6488678607637716, "grad_norm": 1.3401517373172736, "learning_rate": 5e-06, "loss": 0.581, "step": 240 }, { "epoch": 0.6759040216289287, "grad_norm": 3.0974221570456697, "learning_rate": 5e-06, "loss": 0.5751, "step": 250 }, { "epoch": 0.7029401824940859, "grad_norm": 1.7078786998431879, "learning_rate": 5e-06, "loss": 0.5786, "step": 260 }, { "epoch": 0.729976343359243, "grad_norm": 1.3452565169314334, "learning_rate": 5e-06, "loss": 0.5802, "step": 270 }, { "epoch": 0.7570125042244001, "grad_norm": 1.2754169661691006, "learning_rate": 5e-06, "loss": 0.5753, "step": 280 }, { "epoch": 0.7840486650895573, "grad_norm": 1.561813747345879, "learning_rate": 5e-06, "loss": 0.5756, "step": 290 }, { "epoch": 0.8110848259547144, "grad_norm": 1.4562454979417123, "learning_rate": 5e-06, "loss": 0.5754, "step": 300 }, { "epoch": 0.8381209868198716, "grad_norm": 1.3655414659589415, "learning_rate": 5e-06, "loss": 0.5731, "step": 310 }, { "epoch": 0.8651571476850287, "grad_norm": 1.57353294016275, "learning_rate": 5e-06, "loss": 0.5721, "step": 320 }, { "epoch": 0.8921933085501859, "grad_norm": 1.9418137004471465, "learning_rate": 5e-06, "loss": 0.5736, "step": 330 }, { "epoch": 0.919229469415343, "grad_norm": 1.6960343893725316, "learning_rate": 5e-06, "loss": 0.5806, "step": 340 }, { "epoch": 0.9462656302805001, "grad_norm": 2.406507142621058, "learning_rate": 5e-06, "loss": 0.5714, "step": 350 }, { "epoch": 0.9733017911456573, "grad_norm": 1.9230224466359063, "learning_rate": 5e-06, "loss": 0.5704, "step": 360 }, { "epoch": 0.9976343359242987, "eval_loss": 0.07099956274032593, "eval_runtime": 383.3884, "eval_samples_per_second": 25.992, "eval_steps_per_second": 0.407, "step": 369 }, { "epoch": 1.0023656640757013, "grad_norm": 3.4559216993428326, "learning_rate": 5e-06, "loss": 0.5655, "step": 370 }, { "epoch": 1.0294018249408583, "grad_norm": 2.6945339527227783, "learning_rate": 5e-06, "loss": 0.4803, "step": 380 }, { "epoch": 1.0564379858060156, "grad_norm": 2.274860352040799, "learning_rate": 5e-06, "loss": 0.4751, "step": 390 }, { "epoch": 1.0834741466711728, "grad_norm": 1.9418080331159586, "learning_rate": 5e-06, "loss": 0.4775, "step": 400 }, { "epoch": 1.1105103075363298, "grad_norm": 1.3221563752390588, "learning_rate": 5e-06, "loss": 0.4751, "step": 410 }, { "epoch": 1.1375464684014869, "grad_norm": 1.6270959849174909, "learning_rate": 5e-06, "loss": 0.4773, "step": 420 }, { "epoch": 1.1645826292666441, "grad_norm": 1.723481598695817, "learning_rate": 5e-06, "loss": 0.4861, "step": 430 }, { "epoch": 1.1916187901318014, "grad_norm": 1.6689122667194243, "learning_rate": 5e-06, "loss": 0.4771, "step": 440 }, { "epoch": 1.2186549509969584, "grad_norm": 1.8852129756960698, "learning_rate": 5e-06, "loss": 0.4817, "step": 450 }, { "epoch": 1.2456911118621157, "grad_norm": 1.5781803973122046, "learning_rate": 5e-06, "loss": 0.4832, "step": 460 }, { "epoch": 1.2727272727272727, "grad_norm": 1.8932565449503365, "learning_rate": 5e-06, "loss": 0.4855, "step": 470 }, { "epoch": 1.29976343359243, "grad_norm": 1.5040934591134398, "learning_rate": 5e-06, "loss": 0.4816, "step": 480 }, { "epoch": 1.326799594457587, "grad_norm": 1.415624345433887, "learning_rate": 5e-06, "loss": 0.4817, "step": 490 }, { "epoch": 1.3538357553227442, "grad_norm": 1.4726186128545236, "learning_rate": 5e-06, "loss": 0.4859, "step": 500 }, { "epoch": 1.3808719161879013, "grad_norm": 1.371837855586058, "learning_rate": 5e-06, "loss": 0.4862, "step": 510 }, { "epoch": 1.4079080770530585, "grad_norm": 1.625255953470612, "learning_rate": 5e-06, "loss": 0.4899, "step": 520 }, { "epoch": 1.4349442379182156, "grad_norm": 1.4470657655644708, "learning_rate": 5e-06, "loss": 0.484, "step": 530 }, { "epoch": 1.4619803987833728, "grad_norm": 1.4168708504506906, "learning_rate": 5e-06, "loss": 0.489, "step": 540 }, { "epoch": 1.4890165596485299, "grad_norm": 1.2953354115079219, "learning_rate": 5e-06, "loss": 0.4876, "step": 550 }, { "epoch": 1.5160527205136871, "grad_norm": 1.2905587316106748, "learning_rate": 5e-06, "loss": 0.4898, "step": 560 }, { "epoch": 1.5430888813788441, "grad_norm": 1.874461693755812, "learning_rate": 5e-06, "loss": 0.4852, "step": 570 }, { "epoch": 1.5701250422440014, "grad_norm": 1.537393515627057, "learning_rate": 5e-06, "loss": 0.4874, "step": 580 }, { "epoch": 1.5971612031091587, "grad_norm": 1.7234212392856714, "learning_rate": 5e-06, "loss": 0.4911, "step": 590 }, { "epoch": 1.6241973639743157, "grad_norm": 1.4569000028167551, "learning_rate": 5e-06, "loss": 0.487, "step": 600 }, { "epoch": 1.6512335248394727, "grad_norm": 1.4876997193606485, "learning_rate": 5e-06, "loss": 0.4854, "step": 610 }, { "epoch": 1.67826968570463, "grad_norm": 1.4853131850089583, "learning_rate": 5e-06, "loss": 0.4902, "step": 620 }, { "epoch": 1.7053058465697872, "grad_norm": 1.4854501350323384, "learning_rate": 5e-06, "loss": 0.4952, "step": 630 }, { "epoch": 1.7323420074349443, "grad_norm": 1.613201730070182, "learning_rate": 5e-06, "loss": 0.493, "step": 640 }, { "epoch": 1.7593781683001013, "grad_norm": 1.3411867074544503, "learning_rate": 5e-06, "loss": 0.4904, "step": 650 }, { "epoch": 1.7864143291652586, "grad_norm": 1.3453881021060534, "learning_rate": 5e-06, "loss": 0.4879, "step": 660 }, { "epoch": 1.8134504900304158, "grad_norm": 1.4275860747925428, "learning_rate": 5e-06, "loss": 0.4904, "step": 670 }, { "epoch": 1.8404866508955728, "grad_norm": 1.3712075307477265, "learning_rate": 5e-06, "loss": 0.4935, "step": 680 }, { "epoch": 1.8675228117607299, "grad_norm": 1.2986482215538881, "learning_rate": 5e-06, "loss": 0.4917, "step": 690 }, { "epoch": 1.8945589726258871, "grad_norm": 1.2770662158812232, "learning_rate": 5e-06, "loss": 0.4943, "step": 700 }, { "epoch": 1.9215951334910444, "grad_norm": 1.2971456378708284, "learning_rate": 5e-06, "loss": 0.4994, "step": 710 }, { "epoch": 1.9486312943562014, "grad_norm": 1.2953045522832038, "learning_rate": 5e-06, "loss": 0.4927, "step": 720 }, { "epoch": 1.9756674552213584, "grad_norm": 1.2187798944947157, "learning_rate": 5e-06, "loss": 0.4886, "step": 730 }, { "epoch": 1.9972963839134843, "eval_loss": 0.07174264639616013, "eval_runtime": 383.5066, "eval_samples_per_second": 25.984, "eval_steps_per_second": 0.407, "step": 738 }, { "epoch": 2.0047313281514025, "grad_norm": 3.2606644157882583, "learning_rate": 5e-06, "loss": 0.477, "step": 740 }, { "epoch": 2.0317674890165596, "grad_norm": 1.997812835463083, "learning_rate": 5e-06, "loss": 0.3854, "step": 750 }, { "epoch": 2.0588036498817166, "grad_norm": 1.9996540417166961, "learning_rate": 5e-06, "loss": 0.3864, "step": 760 }, { "epoch": 2.085839810746874, "grad_norm": 1.8618681796717789, "learning_rate": 5e-06, "loss": 0.3812, "step": 770 }, { "epoch": 2.112875971612031, "grad_norm": 1.4915524529602713, "learning_rate": 5e-06, "loss": 0.3811, "step": 780 }, { "epoch": 2.139912132477188, "grad_norm": 1.5723106961760522, "learning_rate": 5e-06, "loss": 0.3808, "step": 790 }, { "epoch": 2.1669482933423456, "grad_norm": 1.714466030885258, "learning_rate": 5e-06, "loss": 0.3876, "step": 800 }, { "epoch": 2.1939844542075027, "grad_norm": 1.7511244660613634, "learning_rate": 5e-06, "loss": 0.3862, "step": 810 }, { "epoch": 2.2210206150726597, "grad_norm": 1.5027273200313567, "learning_rate": 5e-06, "loss": 0.3873, "step": 820 }, { "epoch": 2.2480567759378167, "grad_norm": 1.6128980496405356, "learning_rate": 5e-06, "loss": 0.3896, "step": 830 }, { "epoch": 2.2750929368029738, "grad_norm": 1.561276115932866, "learning_rate": 5e-06, "loss": 0.3909, "step": 840 }, { "epoch": 2.3021290976681312, "grad_norm": 1.7787495204510098, "learning_rate": 5e-06, "loss": 0.3866, "step": 850 }, { "epoch": 2.3291652585332883, "grad_norm": 1.5802735443144562, "learning_rate": 5e-06, "loss": 0.3896, "step": 860 }, { "epoch": 2.3562014193984453, "grad_norm": 1.5469387511116455, "learning_rate": 5e-06, "loss": 0.3948, "step": 870 }, { "epoch": 2.3832375802636028, "grad_norm": 1.6780934080456225, "learning_rate": 5e-06, "loss": 0.3937, "step": 880 }, { "epoch": 2.41027374112876, "grad_norm": 1.5538724349535749, "learning_rate": 5e-06, "loss": 0.3933, "step": 890 }, { "epoch": 2.437309901993917, "grad_norm": 1.6919213854745063, "learning_rate": 5e-06, "loss": 0.3927, "step": 900 }, { "epoch": 2.464346062859074, "grad_norm": 1.6467399942324181, "learning_rate": 5e-06, "loss": 0.3916, "step": 910 }, { "epoch": 2.4913822237242313, "grad_norm": 1.5494538660407549, "learning_rate": 5e-06, "loss": 0.393, "step": 920 }, { "epoch": 2.5184183845893884, "grad_norm": 1.6061582119048823, "learning_rate": 5e-06, "loss": 0.392, "step": 930 }, { "epoch": 2.5454545454545454, "grad_norm": 1.5497867717979459, "learning_rate": 5e-06, "loss": 0.3996, "step": 940 }, { "epoch": 2.5724907063197024, "grad_norm": 1.8696325727031804, "learning_rate": 5e-06, "loss": 0.3949, "step": 950 }, { "epoch": 2.59952686718486, "grad_norm": 1.6545052949984496, "learning_rate": 5e-06, "loss": 0.4002, "step": 960 }, { "epoch": 2.626563028050017, "grad_norm": 1.5124642558655546, "learning_rate": 5e-06, "loss": 0.3988, "step": 970 }, { "epoch": 2.653599188915174, "grad_norm": 1.559510310440385, "learning_rate": 5e-06, "loss": 0.4024, "step": 980 }, { "epoch": 2.6806353497803315, "grad_norm": 1.676052966396514, "learning_rate": 5e-06, "loss": 0.4007, "step": 990 }, { "epoch": 2.7076715106454885, "grad_norm": 2.1046446839691, "learning_rate": 5e-06, "loss": 0.3952, "step": 1000 }, { "epoch": 2.7347076715106455, "grad_norm": 2.0451177849286464, "learning_rate": 5e-06, "loss": 0.4021, "step": 1010 }, { "epoch": 2.7617438323758026, "grad_norm": 1.6691707308120913, "learning_rate": 5e-06, "loss": 0.4046, "step": 1020 }, { "epoch": 2.7887799932409596, "grad_norm": 1.4465197945931527, "learning_rate": 5e-06, "loss": 0.4022, "step": 1030 }, { "epoch": 2.815816154106117, "grad_norm": 1.4351190888202614, "learning_rate": 5e-06, "loss": 0.4014, "step": 1040 }, { "epoch": 2.842852314971274, "grad_norm": 1.485706611946695, "learning_rate": 5e-06, "loss": 0.4034, "step": 1050 }, { "epoch": 2.869888475836431, "grad_norm": 1.4860832182563435, "learning_rate": 5e-06, "loss": 0.3996, "step": 1060 }, { "epoch": 2.8969246367015886, "grad_norm": 1.5630718102725172, "learning_rate": 5e-06, "loss": 0.4077, "step": 1070 }, { "epoch": 2.9239607975667457, "grad_norm": 1.4737818975824717, "learning_rate": 5e-06, "loss": 0.4027, "step": 1080 }, { "epoch": 2.9509969584319027, "grad_norm": 1.5487795543993597, "learning_rate": 5e-06, "loss": 0.406, "step": 1090 }, { "epoch": 2.9780331192970597, "grad_norm": 1.3628828414709748, "learning_rate": 5e-06, "loss": 0.4047, "step": 1100 }, { "epoch": 2.99695843190267, "eval_loss": 0.07649385929107666, "eval_runtime": 385.8472, "eval_samples_per_second": 25.826, "eval_steps_per_second": 0.404, "step": 1107 }, { "epoch": 2.99695843190267, "step": 1107, "total_flos": 1854056851046400.0, "train_loss": 0.49424083467636865, "train_runtime": 64065.8676, "train_samples_per_second": 8.866, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 1107, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1854056851046400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }