|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.00485074853113271, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.4253742655663552e-05, |
|
"grad_norm": 2.9237160682678223, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.4192, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 4.8507485311327104e-05, |
|
"grad_norm": 8.286487579345703, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.7523, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 7.276122796699065e-05, |
|
"grad_norm": 1.957672119140625, |
|
"learning_rate": 6e-06, |
|
"loss": 1.9875, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 9.701497062265421e-05, |
|
"grad_norm": 0.5736268758773804, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.427, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00012126871327831776, |
|
"grad_norm": 0.7024741172790527, |
|
"learning_rate": 1e-05, |
|
"loss": 1.4541, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0001455224559339813, |
|
"grad_norm": 6.171888828277588, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.9617, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00016977619858964487, |
|
"grad_norm": 1.4335829019546509, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.6549, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00019402994124530841, |
|
"grad_norm": 0.8052822947502136, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.681, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00021828368390097196, |
|
"grad_norm": 1.0054032802581787, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.7113, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00024253742655663553, |
|
"grad_norm": 2.378899097442627, |
|
"learning_rate": 2e-05, |
|
"loss": 2.0225, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00026679116921229907, |
|
"grad_norm": 12.526110649108887, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 2.3219, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0002910449118679626, |
|
"grad_norm": 1.658133864402771, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.5336, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00031529865452362615, |
|
"grad_norm": 18.127710342407227, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 2.1609, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.00033955239717928975, |
|
"grad_norm": 0.7923838496208191, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.7354, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0003638061398349533, |
|
"grad_norm": 1.6248362064361572, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5774, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00038805988249061683, |
|
"grad_norm": 1.6371029615402222, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 2.2978, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00041231362514628037, |
|
"grad_norm": 1.409232497215271, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.1136, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0004365673678019439, |
|
"grad_norm": 5.593843936920166, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.3497, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0004608211104576075, |
|
"grad_norm": 1.9881709814071655, |
|
"learning_rate": 3.8e-05, |
|
"loss": 2.0849, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.00048507485311327105, |
|
"grad_norm": 1.7124191522598267, |
|
"learning_rate": 4e-05, |
|
"loss": 1.565, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0005093285957689346, |
|
"grad_norm": 0.6472250819206238, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.0869, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0005335823384245981, |
|
"grad_norm": 4.816655158996582, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 2.2343, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0005578360810802617, |
|
"grad_norm": 1.3076063394546509, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.8767, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0005820898237359252, |
|
"grad_norm": 1.0358009338378906, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.7001, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0006063435663915888, |
|
"grad_norm": 0.8611243367195129, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0703, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0006305973090472523, |
|
"grad_norm": 1.8082849979400635, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 2.409, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.000654851051702916, |
|
"grad_norm": 2.4376935958862305, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.6188, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0006791047943585795, |
|
"grad_norm": 1.7561759948730469, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.8782, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.000703358537014243, |
|
"grad_norm": 1.4189746379852295, |
|
"learning_rate": 5.8e-05, |
|
"loss": 2.1414, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0007276122796699066, |
|
"grad_norm": 1.186546802520752, |
|
"learning_rate": 6e-05, |
|
"loss": 1.5557, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0007518660223255701, |
|
"grad_norm": 1.365446925163269, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.9806, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0007761197649812337, |
|
"grad_norm": 5.6013641357421875, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 2.836, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0008003735076368972, |
|
"grad_norm": 1.5393822193145752, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.6833, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0008246272502925607, |
|
"grad_norm": 1.7934705018997192, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.6762, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0008488809929482243, |
|
"grad_norm": 1.1468597650527954, |
|
"learning_rate": 7e-05, |
|
"loss": 1.2819, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0008731347356038878, |
|
"grad_norm": 4.899059772491455, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.7406, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0008973884782595515, |
|
"grad_norm": 1.6008192300796509, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.5028, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.000921642220915215, |
|
"grad_norm": 3.4235761165618896, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.6816, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0009458959635708786, |
|
"grad_norm": 1.1148329973220825, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.4805, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0009701497062265421, |
|
"grad_norm": 4.683089733123779, |
|
"learning_rate": 8e-05, |
|
"loss": 1.9679, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0009944034488822055, |
|
"grad_norm": 0.75764000415802, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.3406, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0010186571915378692, |
|
"grad_norm": 1.3041810989379883, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.4009, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0010429109341935328, |
|
"grad_norm": 1.4765511751174927, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.3446, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0010671646768491963, |
|
"grad_norm": 3.716846227645874, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 2.1417, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.00109141841950486, |
|
"grad_norm": 1.0331724882125854, |
|
"learning_rate": 9e-05, |
|
"loss": 1.396, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0011156721621605234, |
|
"grad_norm": 2.426039934158325, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.466, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.001139925904816187, |
|
"grad_norm": 1.3340977430343628, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.7891, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0011641796474718504, |
|
"grad_norm": 3.5191874504089355, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.6633, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.001188433390127514, |
|
"grad_norm": 1.6775161027908325, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.4875, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0012126871327831775, |
|
"grad_norm": 8.406580924987793, |
|
"learning_rate": 0.0001, |
|
"loss": 2.1048, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0012369408754388412, |
|
"grad_norm": 6.012252330780029, |
|
"learning_rate": 0.00010200000000000001, |
|
"loss": 2.0742, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0012611946180945046, |
|
"grad_norm": 12.059433937072754, |
|
"learning_rate": 0.00010400000000000001, |
|
"loss": 1.843, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0012854483607501683, |
|
"grad_norm": 1.3695584535598755, |
|
"learning_rate": 0.00010600000000000002, |
|
"loss": 1.8865, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.001309702103405832, |
|
"grad_norm": 2.331925630569458, |
|
"learning_rate": 0.00010800000000000001, |
|
"loss": 1.8662, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0013339558460614953, |
|
"grad_norm": 2.2023472785949707, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.3941, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.001358209588717159, |
|
"grad_norm": 1.971096396446228, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 1.9362, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0013824633313728224, |
|
"grad_norm": 1.254345417022705, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.5581, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.001406717074028486, |
|
"grad_norm": null,
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.5454, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0014309708166841495, |
|
"grad_norm": 1.5801585912704468, |
|
"learning_rate": 0.000116, |
|
"loss": 1.553, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0014552245593398132, |
|
"grad_norm": 0.9962314963340759, |
|
"learning_rate": 0.000118, |
|
"loss": 1.4924, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0014794783019954766, |
|
"grad_norm": 5.291327476501465, |
|
"learning_rate": 0.00012, |
|
"loss": 1.8365, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0015037320446511402, |
|
"grad_norm": 1.6945866346359253, |
|
"learning_rate": 0.000122, |
|
"loss": 1.4683, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0015279857873068039, |
|
"grad_norm": 1.0400323867797852, |
|
"learning_rate": 0.000124, |
|
"loss": 0.8587, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0015522395299624673, |
|
"grad_norm": 3.5120689868927, |
|
"learning_rate": 0.000126, |
|
"loss": 1.2127, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.001576493272618131, |
|
"grad_norm": 1.0469675064086914, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 1.654, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0016007470152737944, |
|
"grad_norm": 1.5673848390579224, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.801, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.001625000757929458, |
|
"grad_norm": 2.0134799480438232, |
|
"learning_rate": 0.000132, |
|
"loss": 2.0432, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0016492545005851215, |
|
"grad_norm": 7.747060775756836, |
|
"learning_rate": 0.000134, |
|
"loss": 1.4823, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0016735082432407851, |
|
"grad_norm": 3.2787840366363525, |
|
"learning_rate": 0.00013600000000000003, |
|
"loss": 1.4865, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0016977619858964486, |
|
"grad_norm": 4.277163028717041, |
|
"learning_rate": 0.000138, |
|
"loss": 1.4665, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0017220157285521122, |
|
"grad_norm": 11.562691688537598, |
|
"learning_rate": 0.00014, |
|
"loss": 2.0221, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0017462694712077757, |
|
"grad_norm": 2.5653882026672363, |
|
"learning_rate": 0.000142, |
|
"loss": 1.4631, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0017705232138634393, |
|
"grad_norm": 2.392688751220703, |
|
"learning_rate": 0.000144, |
|
"loss": 1.3514, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.001794776956519103, |
|
"grad_norm": 3.874115467071533, |
|
"learning_rate": 0.000146, |
|
"loss": 1.5983, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0018190306991747664, |
|
"grad_norm": 3.085160732269287, |
|
"learning_rate": 0.000148, |
|
"loss": 1.321, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.00184328444183043, |
|
"grad_norm": 1.7499727010726929, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.385, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.0018675381844860935, |
|
"grad_norm": 14.051904678344727, |
|
"learning_rate": 0.000152, |
|
"loss": 1.5748, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0018917919271417571, |
|
"grad_norm": 4.689906597137451, |
|
"learning_rate": 0.000154, |
|
"loss": 1.2126, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.0019160456697974206, |
|
"grad_norm": 1.0368577241897583, |
|
"learning_rate": 0.00015600000000000002, |
|
"loss": 1.0542, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0019402994124530842, |
|
"grad_norm": 3.577094078063965, |
|
"learning_rate": 0.00015800000000000002, |
|
"loss": 1.9481, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.001964553155108748, |
|
"grad_norm": null,
|
"learning_rate": 0.00015800000000000002, |
|
"loss": 1.2913, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.001988806897764411, |
|
"grad_norm": 4.969227313995361, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5551, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0020130606404200747, |
|
"grad_norm": 71.41371154785156, |
|
"learning_rate": 0.000162, |
|
"loss": 1.5847, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0020373143830757384, |
|
"grad_norm": 0.9679685831069946, |
|
"learning_rate": 0.000164, |
|
"loss": 1.1516, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.002061568125731402, |
|
"grad_norm": 7.050138473510742, |
|
"learning_rate": 0.000166, |
|
"loss": 1.6904, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0020858218683870657, |
|
"grad_norm": 1.4181028604507446, |
|
"learning_rate": 0.000168, |
|
"loss": 1.5385, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.002110075611042729, |
|
"grad_norm": 1.122887134552002, |
|
"learning_rate": 0.00017, |
|
"loss": 1.282, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.0021343293536983925, |
|
"grad_norm": 3.0532054901123047, |
|
"learning_rate": 0.000172, |
|
"loss": 1.7126, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.002158583096354056, |
|
"grad_norm": 1.1368091106414795, |
|
"learning_rate": 0.000174, |
|
"loss": 1.3792, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.00218283683900972, |
|
"grad_norm": 1.5556614398956299, |
|
"learning_rate": 0.00017600000000000002, |
|
"loss": 1.1266, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.002207090581665383, |
|
"grad_norm": 2.5969090461730957, |
|
"learning_rate": 0.00017800000000000002, |
|
"loss": 1.1916, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.0022313443243210467, |
|
"grad_norm": 7.9968719482421875, |
|
"learning_rate": 0.00018, |
|
"loss": 2.3005, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0022555980669767104, |
|
"grad_norm": 1.0678150653839111, |
|
"learning_rate": 0.000182, |
|
"loss": 1.1991, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.002279851809632374, |
|
"grad_norm": 6.678997993469238, |
|
"learning_rate": 0.00018400000000000003, |
|
"loss": 1.9749, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0023041055522880377, |
|
"grad_norm": 2.6500961780548096, |
|
"learning_rate": 0.00018600000000000002, |
|
"loss": 1.7569, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.002328359294943701, |
|
"grad_norm": 1.547878384590149, |
|
"learning_rate": 0.000188, |
|
"loss": 1.7286, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0023526130375993645, |
|
"grad_norm": 1.910861849784851, |
|
"learning_rate": 0.00019, |
|
"loss": 2.0815, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.002376866780255028, |
|
"grad_norm": 3.678144693374634, |
|
"learning_rate": 0.000192, |
|
"loss": 1.7011, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.002401120522910692, |
|
"grad_norm": 1.6496407985687256, |
|
"learning_rate": 0.000194, |
|
"loss": 1.8013, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.002425374265566355, |
|
"grad_norm": 1.3245445489883423, |
|
"learning_rate": 0.000196, |
|
"loss": 1.5302, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0024496280082220187, |
|
"grad_norm": 1.609408974647522, |
|
"learning_rate": 0.00019800000000000002, |
|
"loss": 1.6082, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.0024738817508776823, |
|
"grad_norm": 2.7757482528686523, |
|
"learning_rate": 0.0002, |
|
"loss": 1.6157, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.002498135493533346, |
|
"grad_norm": 2.0618674755096436, |
|
"learning_rate": 0.00019800000000000002, |
|
"loss": 1.7212, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.002522389236189009, |
|
"grad_norm": 1.7433160543441772, |
|
"learning_rate": 0.000196, |
|
"loss": 1.9133, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.002546642978844673, |
|
"grad_norm": 1.4754176139831543, |
|
"learning_rate": 0.000194, |
|
"loss": 1.3311, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0025708967215003365, |
|
"grad_norm": 1.4527512788772583, |
|
"learning_rate": 0.000192, |
|
"loss": 1.6432, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.002595150464156, |
|
"grad_norm": 5.3099045753479, |
|
"learning_rate": 0.00019, |
|
"loss": 1.3233, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.002619404206811664, |
|
"grad_norm": 1.016135573387146, |
|
"learning_rate": 0.000188, |
|
"loss": 1.3291, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.002643657949467327, |
|
"grad_norm": 1.0468915700912476, |
|
"learning_rate": 0.00018600000000000002, |
|
"loss": 1.4454, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.0026679116921229907, |
|
"grad_norm": 3.245159387588501, |
|
"learning_rate": 0.00018400000000000003, |
|
"loss": 1.6501, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0026921654347786543, |
|
"grad_norm": 1.1628005504608154, |
|
"learning_rate": 0.000182, |
|
"loss": 1.399, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.002716419177434318, |
|
"grad_norm": 3.3215761184692383, |
|
"learning_rate": 0.00018, |
|
"loss": 1.4829, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.002740672920089981, |
|
"grad_norm": 1.424054503440857, |
|
"learning_rate": 0.00017800000000000002, |
|
"loss": 1.7692, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.002764926662745645, |
|
"grad_norm": 1.2700444459915161, |
|
"learning_rate": 0.00017600000000000002, |
|
"loss": 1.6959, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0027891804054013085, |
|
"grad_norm": 1.262736201286316, |
|
"learning_rate": 0.000174, |
|
"loss": 1.5445, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.002813434148056972, |
|
"grad_norm": 4.976276397705078, |
|
"learning_rate": 0.000172, |
|
"loss": 1.4571, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.002837687890712636, |
|
"grad_norm": 1.2716983556747437, |
|
"learning_rate": 0.00017, |
|
"loss": 1.217, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.002861941633368299, |
|
"grad_norm": 1.5143672227859497, |
|
"learning_rate": 0.000168, |
|
"loss": 1.3059, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.0028861953760239627, |
|
"grad_norm": 0.9907928705215454, |
|
"learning_rate": 0.000166, |
|
"loss": 1.8035, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0029104491186796263, |
|
"grad_norm": 1.7163773775100708, |
|
"learning_rate": 0.000164, |
|
"loss": 2.1635, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00293470286133529, |
|
"grad_norm": 7.744179725646973, |
|
"learning_rate": 0.000162, |
|
"loss": 2.2964, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.002958956603990953, |
|
"grad_norm": 1.101208209991455, |
|
"learning_rate": 0.00016, |
|
"loss": 1.46, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.002983210346646617, |
|
"grad_norm": 0.7589418888092041, |
|
"learning_rate": 0.00015800000000000002, |
|
"loss": 1.1036, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.0030074640893022805, |
|
"grad_norm": 4.626345634460449, |
|
"learning_rate": 0.00015600000000000002, |
|
"loss": 1.5257, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.003031717831957944, |
|
"grad_norm": 1.5994218587875366, |
|
"learning_rate": 0.000154, |
|
"loss": 1.2856, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0030559715746136078, |
|
"grad_norm": 1.3893097639083862, |
|
"learning_rate": 0.000152, |
|
"loss": 1.3359, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.003080225317269271, |
|
"grad_norm": 1.0536174774169922, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.1718, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0031044790599249346, |
|
"grad_norm": 1.2898694276809692, |
|
"learning_rate": 0.000148, |
|
"loss": 1.2908, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0031287328025805983, |
|
"grad_norm": 1.6943238973617554, |
|
"learning_rate": 0.000146, |
|
"loss": 1.7723, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.003152986545236262, |
|
"grad_norm": 1.2562038898468018, |
|
"learning_rate": 0.000144, |
|
"loss": 1.4476, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003177240287891925, |
|
"grad_norm": 1.0170366764068604, |
|
"learning_rate": 0.000142, |
|
"loss": 1.2967, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.003201494030547589, |
|
"grad_norm": 5.646616458892822, |
|
"learning_rate": 0.00014, |
|
"loss": 1.4366, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0032257477732032525, |
|
"grad_norm": 1.29356849193573, |
|
"learning_rate": 0.000138, |
|
"loss": 1.4713, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.003250001515858916, |
|
"grad_norm": 1.3575730323791504, |
|
"learning_rate": 0.00013600000000000003, |
|
"loss": 1.2233, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0032742552585145793, |
|
"grad_norm": 1.158686876296997, |
|
"learning_rate": 0.000134, |
|
"loss": 1.3622, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.003298509001170243, |
|
"grad_norm": 1.4680081605911255, |
|
"learning_rate": 0.000132, |
|
"loss": 1.7191, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0033227627438259066, |
|
"grad_norm": 1.2448982000350952, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.9376, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0033470164864815703, |
|
"grad_norm": 1.2404478788375854, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 1.5334, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.003371270229137234, |
|
"grad_norm": 0.9862974882125854, |
|
"learning_rate": 0.000126, |
|
"loss": 1.2073, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.003395523971792897, |
|
"grad_norm": 7.36776876449585, |
|
"learning_rate": 0.000124, |
|
"loss": 1.5745, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.003419777714448561, |
|
"grad_norm": 1.1356145143508911, |
|
"learning_rate": 0.000122, |
|
"loss": 1.563, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.0034440314571042244, |
|
"grad_norm": 2.073420286178589, |
|
"learning_rate": 0.00012, |
|
"loss": 1.7438, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.003468285199759888, |
|
"grad_norm": 3.029547691345215, |
|
"learning_rate": 0.000118, |
|
"loss": 1.6755, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0034925389424155513, |
|
"grad_norm": 14.66256046295166, |
|
"learning_rate": 0.000116, |
|
"loss": 1.8375, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.003516792685071215, |
|
"grad_norm": 0.8765383958816528, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 0.6782, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0035410464277268786, |
|
"grad_norm": 3.167731761932373, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 1.4619, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.0035653001703825423, |
|
"grad_norm": 1.1016606092453003, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.5817, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.003589553913038206, |
|
"grad_norm": 2.5535902976989746, |
|
"learning_rate": 0.00010800000000000001, |
|
"loss": 1.3491, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.003613807655693869, |
|
"grad_norm": 1.1920926570892334, |
|
"learning_rate": 0.00010600000000000002, |
|
"loss": 1.5675, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0036380613983495328, |
|
"grad_norm": 1.1023300886154175, |
|
"learning_rate": 0.00010400000000000001, |
|
"loss": 1.2821, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0036623151410051964, |
|
"grad_norm": 2.1146063804626465, |
|
"learning_rate": 0.00010200000000000001, |
|
"loss": 1.5396, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.00368656888366086, |
|
"grad_norm": 1.1838195323944092, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2992, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.0037108226263165233, |
|
"grad_norm": 1.414258360862732, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.6888, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.003735076368972187, |
|
"grad_norm": 0.8630995154380798, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.141, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0037593301116278506, |
|
"grad_norm": 2.4763197898864746, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.5743, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0037835838542835142, |
|
"grad_norm": 0.9457703828811646, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.3879, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.003807837596939178, |
|
"grad_norm": 1.308862328529358, |
|
"learning_rate": 9e-05, |
|
"loss": 1.1564, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.003832091339594841, |
|
"grad_norm": 1.208833932876587, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.2059, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.0038563450822505048, |
|
"grad_norm": 1.9145225286483765, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.4744, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.0038805988249061684, |
|
"grad_norm": 1.0121599435806274, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.8676, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.003904852567561832, |
|
"grad_norm": 1.9128226041793823, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.6948, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.003929106310217496, |
|
"grad_norm": 1.7783511877059937, |
|
"learning_rate": 8e-05, |
|
"loss": 1.4198, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.003953360052873159, |
|
"grad_norm": 0.9799928665161133, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.185, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.003977613795528822, |
|
"grad_norm": 1.5025819540023804, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.0638, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.004001867538184486, |
|
"grad_norm": 1.2552540302276611, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.3463, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0040261212808401494, |
|
"grad_norm": 1.0616928339004517, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.9275, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.004050375023495813, |
|
"grad_norm": 2.8277781009674072, |
|
"learning_rate": 7e-05, |
|
"loss": 1.9653, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.004074628766151477, |
|
"grad_norm": 1.7752915620803833, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.2783, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.00409888250880714, |
|
"grad_norm": 2.4863102436065674, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.2803, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.004123136251462804, |
|
"grad_norm": 0.9784297943115234, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.0347, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004147389994118468, |
|
"grad_norm": 2.0077602863311768, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.5035, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.004171643736774131, |
|
"grad_norm": 1.5943875312805176, |
|
"learning_rate": 6e-05, |
|
"loss": 1.7834, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.004195897479429794, |
|
"grad_norm": 1.0898661613464355, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.5043, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.004220151222085458, |
|
"grad_norm": 1.8181873559951782, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.5026, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.004244404964741121, |
|
"grad_norm": 0.9227128028869629, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.3945, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.004268658707396785, |
|
"grad_norm": 1.3131438493728638, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.2629, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.004292912450052449, |
|
"grad_norm": 1.388433814048767, |
|
"learning_rate": 5e-05, |
|
"loss": 1.607, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.004317166192708112, |
|
"grad_norm": 1.5145083665847778, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.8671, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.004341419935363776, |
|
"grad_norm": 2.0021004676818848, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 2.0679, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.00436567367801944, |
|
"grad_norm": 1.8512483835220337, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.3095, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.004389927420675103, |
|
"grad_norm": 1.0164090394973755, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.3749, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.004414181163330766, |
|
"grad_norm": 30.730253219604492, |
|
"learning_rate": 4e-05, |
|
"loss": 2.3106, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.00443843490598643, |
|
"grad_norm": 1.2311471700668335, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.1078, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.004462688648642093, |
|
"grad_norm": 0.9555554389953613, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.4175, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.004486942391297757, |
|
"grad_norm": 1.4511969089508057, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.4099, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.004511196133953421, |
|
"grad_norm": 5.319721221923828, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.2749, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.004535449876609084, |
|
"grad_norm": 1.8668482303619385, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4658, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.004559703619264748, |
|
"grad_norm": 1.737425684928894, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.8142, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.004583957361920412, |
|
"grad_norm": 12.860699653625488, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.6338, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.004608211104576075, |
|
"grad_norm": 0.6315305233001709, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.0426, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.004632464847231738, |
|
"grad_norm": 1.0747138261795044, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.4042, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.004656718589887402, |
|
"grad_norm": 1.1410670280456543, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2148, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.004680972332543065, |
|
"grad_norm": 1.4486732482910156, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.3208, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.004705226075198729, |
|
"grad_norm": 2.5336716175079346, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.7136, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.004729479817854393, |
|
"grad_norm": 3.4056637287139893, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.5443, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.004753733560510056, |
|
"grad_norm": 1.1721996068954468, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.4518, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.00477798730316572, |
|
"grad_norm": 1.9326874017715454, |
|
"learning_rate": 1e-05, |
|
"loss": 1.6381, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.004802241045821384, |
|
"grad_norm": 1.0234136581420898, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.4116, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.004826494788477047, |
|
"grad_norm": 2.1815526485443115, |
|
"learning_rate": 6e-06, |
|
"loss": 1.7442, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.00485074853113271, |
|
"grad_norm": 1.9557406902313232, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.9568, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6034692870365184.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|