{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.5842293906810037, "eval_steps": 1000, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005973715651135006, "grad_norm": 8.9375, "learning_rate": 2e-06, "loss": 1.3168, "step": 1 }, { "epoch": 0.05973715651135006, "grad_norm": 0.625, "learning_rate": 0.0002, "loss": 0.5498, "step": 100 }, { "epoch": 0.11947431302270012, "grad_norm": 0.322265625, "learning_rate": 0.0004, "loss": 0.3884, "step": 200 }, { "epoch": 0.17921146953405018, "grad_norm": 0.302734375, "learning_rate": 0.0006, "loss": 0.376, "step": 300 }, { "epoch": 0.23894862604540024, "grad_norm": 0.12353515625, "learning_rate": 0.0008, "loss": 0.3635, "step": 400 }, { "epoch": 0.2986857825567503, "grad_norm": 0.1826171875, "learning_rate": 0.001, "loss": 0.3559, "step": 500 }, { "epoch": 0.35842293906810035, "grad_norm": 0.10888671875, "learning_rate": 0.0012, "loss": 0.3438, "step": 600 }, { "epoch": 0.41816009557945044, "grad_norm": 0.11572265625, "learning_rate": 0.0014, "loss": 0.3412, "step": 700 }, { "epoch": 0.4778972520908005, "grad_norm": 0.08984375, "learning_rate": 0.0016, "loss": 0.3392, "step": 800 }, { "epoch": 0.5376344086021505, "grad_norm": 0.11376953125, "learning_rate": 0.0018000000000000002, "loss": 0.346, "step": 900 }, { "epoch": 0.5973715651135006, "grad_norm": 0.08251953125, "learning_rate": 0.002, "loss": 0.3174, "step": 1000 }, { "epoch": 0.6571087216248507, "grad_norm": 0.0703125, "learning_rate": 0.0019998292504580526, "loss": 0.3309, "step": 1100 }, { "epoch": 0.7168458781362007, "grad_norm": 0.1533203125, "learning_rate": 0.001999317060143023, "loss": 0.323, "step": 1200 }, { "epoch": 0.7765830346475507, "grad_norm": 0.09228515625, "learning_rate": 0.001998463603967434, "loss": 0.3157, "step": 1300 }, { "epoch": 0.8363201911589009, "grad_norm": 0.072265625, "learning_rate": 0.0019972691733857882, "loss": 0.3155, "step": 1400 }, { "epoch": 0.8960573476702509, "grad_norm": 0.12890625, "learning_rate": 0.0019957341762950344, "loss": 0.3148, "step": 1500 }, { "epoch": 0.955794504181601, "grad_norm": 0.10205078125, "learning_rate": 0.001993859136895274, "loss": 0.3013, "step": 1600 }, { "epoch": 1.015531660692951, "grad_norm": 0.11474609375, "learning_rate": 0.0019916446955107426, "loss": 0.3009, "step": 1700 }, { "epoch": 1.075268817204301, "grad_norm": 0.12890625, "learning_rate": 0.001989091608371146, "loss": 0.3005, "step": 1800 }, { "epoch": 1.135005973715651, "grad_norm": 0.087890625, "learning_rate": 0.0019862007473534027, "loss": 0.3028, "step": 1900 }, { "epoch": 1.1947431302270013, "grad_norm": 0.10546875, "learning_rate": 0.001982973099683902, "loss": 0.3006, "step": 2000 }, { "epoch": 1.2544802867383513, "grad_norm": 0.1044921875, "learning_rate": 0.001979409767601366, "loss": 0.2981, "step": 2100 }, { "epoch": 1.3142174432497014, "grad_norm": 0.10888671875, "learning_rate": 0.001975511967980437, "loss": 0.2917, "step": 2200 }, { "epoch": 1.3739545997610514, "grad_norm": 0.10107421875, "learning_rate": 0.001971281031916114, "loss": 0.2816, "step": 2300 }, { "epoch": 1.4336917562724014, "grad_norm": 0.1337890625, "learning_rate": 0.0019667184042691877, "loss": 0.2754, "step": 2400 }, { "epoch": 1.4934289127837514, "grad_norm": 0.08203125, "learning_rate": 0.001961825643172819, "loss": 0.2775, "step": 2500 }, { "epoch": 1.5531660692951015, "grad_norm": 0.0634765625, "learning_rate": 0.0019566044195004407, "loss": 0.2756, "step": 2600 }, { "epoch": 1.6129032258064515, "grad_norm": 0.1142578125, "learning_rate": 0.0019510565162951536, "loss": 0.27, "step": 2700 }, { "epoch": 1.6726403823178018, "grad_norm": 0.09423828125, "learning_rate": 0.0019451838281608197, "loss": 0.2616, "step": 2800 }, { "epoch": 1.7323775388291518, "grad_norm": 0.095703125, "learning_rate": 0.0019389883606150567, "loss": 0.2589, "step": 2900 }, { "epoch": 1.7921146953405018, "grad_norm": 0.12353515625, "learning_rate": 0.0019324722294043557, "loss": 0.2567, "step": 3000 }, { "epoch": 1.8518518518518519, "grad_norm": 0.138671875, "learning_rate": 0.0019256376597815564, "loss": 0.2462, "step": 3100 }, { "epoch": 1.911589008363202, "grad_norm": 0.10205078125, "learning_rate": 0.001918486985745923, "loss": 0.246, "step": 3200 }, { "epoch": 1.971326164874552, "grad_norm": 0.07861328125, "learning_rate": 0.0019110226492460884, "loss": 0.2378, "step": 3300 }, { "epoch": 2.031063321385902, "grad_norm": 0.091796875, "learning_rate": 0.0019032471993461289, "loss": 0.2322, "step": 3400 }, { "epoch": 2.090800477897252, "grad_norm": 0.09814453125, "learning_rate": 0.0018951632913550625, "loss": 0.2341, "step": 3500 }, { "epoch": 2.150537634408602, "grad_norm": 0.0859375, "learning_rate": 0.0018867736859200619, "loss": 0.2314, "step": 3600 }, { "epoch": 2.2102747909199523, "grad_norm": 0.09765625, "learning_rate": 0.0018780812480836979, "loss": 0.2297, "step": 3700 }, { "epoch": 2.270011947431302, "grad_norm": 0.087890625, "learning_rate": 0.0018690889463055284, "loss": 0.224, "step": 3800 }, { "epoch": 2.3297491039426523, "grad_norm": 0.0693359375, "learning_rate": 0.0018597998514483724, "loss": 0.2193, "step": 3900 }, { "epoch": 2.3894862604540026, "grad_norm": 0.11669921875, "learning_rate": 0.0018502171357296143, "loss": 0.2158, "step": 4000 }, { "epoch": 2.4492234169653524, "grad_norm": 0.07568359375, "learning_rate": 0.0018403440716378927, "loss": 0.2081, "step": 4100 }, { "epoch": 2.5089605734767026, "grad_norm": 0.10498046875, "learning_rate": 0.0018301840308155505, "loss": 0.2092, "step": 4200 }, { "epoch": 2.5686977299880525, "grad_norm": 0.10107421875, "learning_rate": 0.0018197404829072212, "loss": 0.2038, "step": 4300 }, { "epoch": 2.6284348864994027, "grad_norm": 0.103515625, "learning_rate": 0.0018090169943749475, "loss": 0.199, "step": 4400 }, { "epoch": 2.6881720430107525, "grad_norm": 0.087890625, "learning_rate": 0.0017980172272802398, "loss": 0.1955, "step": 4500 }, { "epoch": 2.7479091995221028, "grad_norm": 0.0986328125, "learning_rate": 0.0017867449380334832, "loss": 0.1937, "step": 4600 }, { "epoch": 2.807646356033453, "grad_norm": 0.1044921875, "learning_rate": 0.0017752039761111298, "loss": 0.1914, "step": 4700 }, { "epoch": 2.867383512544803, "grad_norm": 0.0947265625, "learning_rate": 0.001763398282741103, "loss": 0.1875, "step": 4800 }, { "epoch": 2.927120669056153, "grad_norm": 0.09765625, "learning_rate": 0.0017513318895568735, "loss": 0.1855, "step": 4900 }, { "epoch": 2.986857825567503, "grad_norm": 0.08056640625, "learning_rate": 0.001739008917220659, "loss": 0.1795, "step": 5000 }, { "epoch": 3.046594982078853, "grad_norm": 0.1259765625, "learning_rate": 0.0017264335740162242, "loss": 0.1742, "step": 5100 }, { "epoch": 3.106332138590203, "grad_norm": 0.09228515625, "learning_rate": 0.0017136101544117524, "loss": 0.1759, "step": 5200 }, { "epoch": 3.166069295101553, "grad_norm": 0.1259765625, "learning_rate": 0.0017005430375932908, "loss": 0.1756, "step": 5300 }, { "epoch": 3.225806451612903, "grad_norm": 0.07470703125, "learning_rate": 0.0016872366859692627, "loss": 0.1766, "step": 5400 }, { "epoch": 3.2855436081242533, "grad_norm": 0.087890625, "learning_rate": 0.0016736956436465573, "loss": 0.1666, "step": 5500 }, { "epoch": 3.3452807646356035, "grad_norm": 0.0869140625, "learning_rate": 0.0016599245348787228, "loss": 0.169, "step": 5600 }, { "epoch": 3.4050179211469533, "grad_norm": 0.08740234375, "learning_rate": 0.0016459280624867873, "loss": 0.1616, "step": 5700 }, { "epoch": 3.4647550776583036, "grad_norm": 0.158203125, "learning_rate": 0.001631711006253251, "loss": 0.1595, "step": 5800 }, { "epoch": 3.5244922341696534, "grad_norm": 0.078125, "learning_rate": 0.001617278221289793, "loss": 0.1626, "step": 5900 }, { "epoch": 3.5842293906810037, "grad_norm": 0.0908203125, "learning_rate": 0.0016026346363792565, "loss": 0.1546, "step": 6000 } ], "logging_steps": 100, "max_steps": 18000, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1559951680731546e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }