|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.5842293906810037, |
|
"eval_steps": 1000, |
|
"global_step": 6000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005973715651135006, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 2e-06, |
|
"loss": 1.3168, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05973715651135006, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5498, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11947431302270012, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3884, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17921146953405018, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0006, |
|
"loss": 0.376, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23894862604540024, |
|
"grad_norm": 0.12353515625, |
|
"learning_rate": 0.0008, |
|
"loss": 0.3635, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2986857825567503, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.3559, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.35842293906810035, |
|
"grad_norm": 0.10888671875, |
|
"learning_rate": 0.0012, |
|
"loss": 0.3438, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.41816009557945044, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.0014, |
|
"loss": 0.3412, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4778972520908005, |
|
"grad_norm": 0.08984375, |
|
"learning_rate": 0.0016, |
|
"loss": 0.3392, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5376344086021505, |
|
"grad_norm": 0.11376953125, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 0.346, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5973715651135006, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 0.002, |
|
"loss": 0.3174, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6571087216248507, |
|
"grad_norm": 0.0703125, |
|
"learning_rate": 0.0019998292504580526, |
|
"loss": 0.3309, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7168458781362007, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.001999317060143023, |
|
"loss": 0.323, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7765830346475507, |
|
"grad_norm": 0.09228515625, |
|
"learning_rate": 0.001998463603967434, |
|
"loss": 0.3157, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8363201911589009, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 0.0019972691733857882, |
|
"loss": 0.3155, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8960573476702509, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 0.0019957341762950344, |
|
"loss": 0.3148, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.955794504181601, |
|
"grad_norm": 0.10205078125, |
|
"learning_rate": 0.001993859136895274, |
|
"loss": 0.3013, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.015531660692951, |
|
"grad_norm": 0.11474609375, |
|
"learning_rate": 0.0019916446955107426, |
|
"loss": 0.3009, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.075268817204301, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 0.001989091608371146, |
|
"loss": 0.3005, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.135005973715651, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 0.0019862007473534027, |
|
"loss": 0.3028, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.1947431302270013, |
|
"grad_norm": 0.10546875, |
|
"learning_rate": 0.001982973099683902, |
|
"loss": 0.3006, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.2544802867383513, |
|
"grad_norm": 0.1044921875, |
|
"learning_rate": 0.001979409767601366, |
|
"loss": 0.2981, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.3142174432497014, |
|
"grad_norm": 0.10888671875, |
|
"learning_rate": 0.001975511967980437, |
|
"loss": 0.2917, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.3739545997610514, |
|
"grad_norm": 0.10107421875, |
|
"learning_rate": 0.001971281031916114, |
|
"loss": 0.2816, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.4336917562724014, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 0.0019667184042691877, |
|
"loss": 0.2754, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.4934289127837514, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.001961825643172819, |
|
"loss": 0.2775, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.5531660692951015, |
|
"grad_norm": 0.0634765625, |
|
"learning_rate": 0.0019566044195004407, |
|
"loss": 0.2756, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 0.1142578125, |
|
"learning_rate": 0.0019510565162951536, |
|
"loss": 0.27, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.6726403823178018, |
|
"grad_norm": 0.09423828125, |
|
"learning_rate": 0.0019451838281608197, |
|
"loss": 0.2616, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.7323775388291518, |
|
"grad_norm": 0.095703125, |
|
"learning_rate": 0.0019389883606150567, |
|
"loss": 0.2589, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.7921146953405018, |
|
"grad_norm": 0.12353515625, |
|
"learning_rate": 0.0019324722294043557, |
|
"loss": 0.2567, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0019256376597815564, |
|
"loss": 0.2462, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.911589008363202, |
|
"grad_norm": 0.10205078125, |
|
"learning_rate": 0.001918486985745923, |
|
"loss": 0.246, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.971326164874552, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.0019110226492460884, |
|
"loss": 0.2378, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.031063321385902, |
|
"grad_norm": 0.091796875, |
|
"learning_rate": 0.0019032471993461289, |
|
"loss": 0.2322, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.090800477897252, |
|
"grad_norm": 0.09814453125, |
|
"learning_rate": 0.0018951632913550625, |
|
"loss": 0.2341, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.150537634408602, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 0.0018867736859200619, |
|
"loss": 0.2314, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.2102747909199523, |
|
"grad_norm": 0.09765625, |
|
"learning_rate": 0.0018780812480836979, |
|
"loss": 0.2297, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.270011947431302, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 0.0018690889463055284, |
|
"loss": 0.224, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.3297491039426523, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 0.0018597998514483724, |
|
"loss": 0.2193, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.3894862604540026, |
|
"grad_norm": 0.11669921875, |
|
"learning_rate": 0.0018502171357296143, |
|
"loss": 0.2158, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.4492234169653524, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 0.0018403440716378927, |
|
"loss": 0.2081, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.5089605734767026, |
|
"grad_norm": 0.10498046875, |
|
"learning_rate": 0.0018301840308155505, |
|
"loss": 0.2092, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.5686977299880525, |
|
"grad_norm": 0.10107421875, |
|
"learning_rate": 0.0018197404829072212, |
|
"loss": 0.2038, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.6284348864994027, |
|
"grad_norm": 0.103515625, |
|
"learning_rate": 0.0018090169943749475, |
|
"loss": 0.199, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.6881720430107525, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 0.0017980172272802398, |
|
"loss": 0.1955, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.7479091995221028, |
|
"grad_norm": 0.0986328125, |
|
"learning_rate": 0.0017867449380334832, |
|
"loss": 0.1937, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.807646356033453, |
|
"grad_norm": 0.1044921875, |
|
"learning_rate": 0.0017752039761111298, |
|
"loss": 0.1914, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.867383512544803, |
|
"grad_norm": 0.0947265625, |
|
"learning_rate": 0.001763398282741103, |
|
"loss": 0.1875, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.927120669056153, |
|
"grad_norm": 0.09765625, |
|
"learning_rate": 0.0017513318895568735, |
|
"loss": 0.1855, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.986857825567503, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.001739008917220659, |
|
"loss": 0.1795, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.046594982078853, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.0017264335740162242, |
|
"loss": 0.1742, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.106332138590203, |
|
"grad_norm": 0.09228515625, |
|
"learning_rate": 0.0017136101544117524, |
|
"loss": 0.1759, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.166069295101553, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.0017005430375932908, |
|
"loss": 0.1756, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.0016872366859692627, |
|
"loss": 0.1766, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.2855436081242533, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 0.0016736956436465573, |
|
"loss": 0.1666, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.3452807646356035, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.0016599245348787228, |
|
"loss": 0.169, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.4050179211469533, |
|
"grad_norm": 0.08740234375, |
|
"learning_rate": 0.0016459280624867873, |
|
"loss": 0.1616, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.4647550776583036, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.001631711006253251, |
|
"loss": 0.1595, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.5244922341696534, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.001617278221289793, |
|
"loss": 0.1626, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.5842293906810037, |
|
"grad_norm": 0.0908203125, |
|
"learning_rate": 0.0016026346363792565, |
|
"loss": 0.1546, |
|
"step": 6000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 18000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 11, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": true, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1559951680731546e+17, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|