{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.012587324564163886, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012587324564163887, "grad_norm": 4.972455978393555, "learning_rate": 2e-05, "loss": 11.8587, "step": 1 }, { "epoch": 0.00012587324564163887, "eval_loss": NaN, "eval_runtime": 3606.4326, "eval_samples_per_second": 0.928, "eval_steps_per_second": 0.464, "step": 1 }, { "epoch": 0.00025174649128327774, "grad_norm": 12.494519233703613, "learning_rate": 4e-05, "loss": 5.71, "step": 2 }, { "epoch": 0.0003776197369249166, "grad_norm": 18.601806640625, "learning_rate": 6e-05, "loss": 11.1117, "step": 3 }, { "epoch": 0.0005034929825665555, "grad_norm": 8.032801628112793, "learning_rate": 8e-05, "loss": 3.9358, "step": 4 }, { "epoch": 0.0006293662282081943, "grad_norm": 12.177727699279785, "learning_rate": 0.0001, "loss": 16.7821, "step": 5 }, { "epoch": 0.0007552394738498332, "grad_norm": 6.706995964050293, "learning_rate": 0.00012, "loss": 10.3418, "step": 6 }, { "epoch": 0.0008811127194914721, "grad_norm": 6.680157661437988, "learning_rate": 0.00014, "loss": 4.845, "step": 7 }, { "epoch": 0.001006985965133111, "grad_norm": 9.944180488586426, "learning_rate": 0.00016, "loss": 2.6742, "step": 8 }, { "epoch": 0.0011328592107747498, "grad_norm": 14.13370418548584, "learning_rate": 0.00018, "loss": 3.8844, "step": 9 }, { "epoch": 0.0012587324564163887, "grad_norm": 17.886985778808594, "learning_rate": 0.0002, "loss": 3.3169, "step": 10 }, { "epoch": 0.0013846057020580275, "grad_norm": 21.377635955810547, "learning_rate": 0.0001999390827019096, "loss": 1.0058, "step": 11 }, { "epoch": 0.0015104789476996663, "grad_norm": 11.985687255859375, "learning_rate": 0.00019975640502598244, "loss": 0.8173, "step": 12 }, { "epoch": 0.0016363521933413054, "grad_norm": 17.24648666381836, "learning_rate": 0.00019945218953682734, "loss": 0.3288, "step": 13 }, { "epoch": 0.0017622254389829443, "grad_norm": 38.9212532043457, "learning_rate": 0.00019902680687415705, "loss": 1.7752, "step": 14 }, { "epoch": 0.001888098684624583, "grad_norm": 12.58133316040039, "learning_rate": 0.00019848077530122083, "loss": 1.7308, "step": 15 }, { "epoch": 0.002013971930266222, "grad_norm": 5.521592617034912, "learning_rate": 0.00019781476007338058, "loss": 0.1023, "step": 16 }, { "epoch": 0.002139845175907861, "grad_norm": 1.7692532539367676, "learning_rate": 0.00019702957262759965, "loss": 0.0166, "step": 17 }, { "epoch": 0.0022657184215494996, "grad_norm": 9.756790161132812, "learning_rate": 0.0001961261695938319, "loss": 1.1932, "step": 18 }, { "epoch": 0.0023915916671911385, "grad_norm": 8.156234741210938, "learning_rate": 0.00019510565162951537, "loss": 0.3879, "step": 19 }, { "epoch": 0.0025174649128327773, "grad_norm": 6.355808258056641, "learning_rate": 0.00019396926207859084, "loss": 0.4888, "step": 20 }, { "epoch": 0.002643338158474416, "grad_norm": 6.595178127288818, "learning_rate": 0.00019271838545667876, "loss": 0.6385, "step": 21 }, { "epoch": 0.002769211404116055, "grad_norm": 3.2132580280303955, "learning_rate": 0.0001913545457642601, "loss": 0.518, "step": 22 }, { "epoch": 0.002895084649757694, "grad_norm": 16.919002532958984, "learning_rate": 0.0001898794046299167, "loss": 0.3769, "step": 23 }, { "epoch": 0.0030209578953993327, "grad_norm": 0.020209377631545067, "learning_rate": 0.00018829475928589271, "loss": 0.0002, "step": 24 }, { "epoch": 0.0031468311410409715, "grad_norm": 0.8074254989624023, "learning_rate": 0.00018660254037844388, "loss": 0.0241, "step": 25 }, { "epoch": 0.0031468311410409715, "eval_loss": NaN, "eval_runtime": 3493.5914, "eval_samples_per_second": 0.957, "eval_steps_per_second": 0.479, "step": 25 }, { "epoch": 0.003272704386682611, "grad_norm": 19.329341888427734, "learning_rate": 0.0001848048096156426, "loss": 0.7896, "step": 26 }, { "epoch": 0.0033985776323242497, "grad_norm": 8.426005363464355, "learning_rate": 0.00018290375725550417, "loss": 0.4921, "step": 27 }, { "epoch": 0.0035244508779658885, "grad_norm": 6.269211292266846, "learning_rate": 0.00018090169943749476, "loss": 0.5567, "step": 28 }, { "epoch": 0.0036503241236075274, "grad_norm": 4.1337480545043945, "learning_rate": 0.00017880107536067218, "loss": 0.0482, "step": 29 }, { "epoch": 0.003776197369249166, "grad_norm": 0.4672463834285736, "learning_rate": 0.0001766044443118978, "loss": 0.0424, "step": 30 }, { "epoch": 0.003902070614890805, "grad_norm": 0.22993697226047516, "learning_rate": 0.00017431448254773944, "loss": 0.0041, "step": 31 }, { "epoch": 0.004027943860532444, "grad_norm": 1.7465827465057373, "learning_rate": 0.0001719339800338651, "loss": 0.0228, "step": 32 }, { "epoch": 0.004153817106174082, "grad_norm": 0.28992122411727905, "learning_rate": 0.00016946583704589973, "loss": 0.0067, "step": 33 }, { "epoch": 0.004279690351815722, "grad_norm": 0.9798206090927124, "learning_rate": 0.00016691306063588583, "loss": 0.0262, "step": 34 }, { "epoch": 0.004405563597457361, "grad_norm": 0.009784302674233913, "learning_rate": 0.00016427876096865394, "loss": 0.0007, "step": 35 }, { "epoch": 0.004531436843098999, "grad_norm": 2.536167860031128, "learning_rate": 0.0001615661475325658, "loss": 1.0487, "step": 36 }, { "epoch": 0.0046573100887406385, "grad_norm": 0.8000788688659668, "learning_rate": 0.00015877852522924732, "loss": 0.0149, "step": 37 }, { "epoch": 0.004783183334382277, "grad_norm": 0.0, "learning_rate": 0.0001559192903470747, "loss": 0.0, "step": 38 }, { "epoch": 0.004909056580023916, "grad_norm": 0.015558776445686817, "learning_rate": 0.0001529919264233205, "loss": 0.0014, "step": 39 }, { "epoch": 0.005034929825665555, "grad_norm": 3.2701315879821777, "learning_rate": 0.00015000000000000001, "loss": 0.2162, "step": 40 }, { "epoch": 0.005160803071307194, "grad_norm": 4.584814071655273, "learning_rate": 0.00014694715627858908, "loss": 0.8873, "step": 41 }, { "epoch": 0.005286676316948832, "grad_norm": 0.006552003789693117, "learning_rate": 0.00014383711467890774, "loss": 0.0004, "step": 42 }, { "epoch": 0.005412549562590472, "grad_norm": 3.2025034427642822, "learning_rate": 0.00014067366430758004, "loss": 0.2046, "step": 43 }, { "epoch": 0.00553842280823211, "grad_norm": 0.2174777388572693, "learning_rate": 0.00013746065934159123, "loss": 0.0041, "step": 44 }, { "epoch": 0.005664296053873749, "grad_norm": 0.0, "learning_rate": 0.00013420201433256689, "loss": 0.0, "step": 45 }, { "epoch": 0.005790169299515388, "grad_norm": 2.011909246444702, "learning_rate": 0.00013090169943749476, "loss": 0.052, "step": 46 }, { "epoch": 0.005916042545157027, "grad_norm": 0.04959569871425629, "learning_rate": 0.0001275637355816999, "loss": 0.0016, "step": 47 }, { "epoch": 0.006041915790798665, "grad_norm": 9.175588607788086, "learning_rate": 0.00012419218955996676, "loss": 0.4144, "step": 48 }, { "epoch": 0.006167789036440305, "grad_norm": 1.922889232635498, "learning_rate": 0.00012079116908177593, "loss": 0.0746, "step": 49 }, { "epoch": 0.006293662282081943, "grad_norm": 3.5199947357177734, "learning_rate": 0.00011736481776669306, "loss": 0.092, "step": 50 }, { "epoch": 0.006293662282081943, "eval_loss": NaN, "eval_runtime": 3519.1151, "eval_samples_per_second": 0.951, "eval_steps_per_second": 0.475, "step": 50 }, { "epoch": 0.006419535527723582, "grad_norm": 0.006420983001589775, "learning_rate": 0.00011391731009600654, "loss": 0.0011, "step": 51 }, { "epoch": 0.006545408773365222, "grad_norm": 0.5650655627250671, "learning_rate": 0.00011045284632676536, "loss": 0.0442, "step": 52 }, { "epoch": 0.00667128201900686, "grad_norm": 0.09890392422676086, "learning_rate": 0.00010697564737441252, "loss": 0.0048, "step": 53 }, { "epoch": 0.006797155264648499, "grad_norm": 0.05633799359202385, "learning_rate": 0.00010348994967025012, "loss": 0.002, "step": 54 }, { "epoch": 0.006923028510290138, "grad_norm": 3.1540989875793457, "learning_rate": 0.0001, "loss": 1.6562, "step": 55 }, { "epoch": 0.007048901755931777, "grad_norm": 5.062186241149902, "learning_rate": 9.651005032974994e-05, "loss": 0.2802, "step": 56 }, { "epoch": 0.007174775001573415, "grad_norm": 1.7618149518966675, "learning_rate": 9.302435262558747e-05, "loss": 0.1865, "step": 57 }, { "epoch": 0.007300648247215055, "grad_norm": 0.37688034772872925, "learning_rate": 8.954715367323468e-05, "loss": 0.0144, "step": 58 }, { "epoch": 0.007426521492856693, "grad_norm": 0.11028943210840225, "learning_rate": 8.608268990399349e-05, "loss": 0.0063, "step": 59 }, { "epoch": 0.007552394738498332, "grad_norm": 0.5467414855957031, "learning_rate": 8.263518223330697e-05, "loss": 0.0118, "step": 60 }, { "epoch": 0.007678267984139971, "grad_norm": 0.029303928837180138, "learning_rate": 7.920883091822408e-05, "loss": 0.0006, "step": 61 }, { "epoch": 0.00780414122978161, "grad_norm": 0.16770148277282715, "learning_rate": 7.580781044003324e-05, "loss": 0.0068, "step": 62 }, { "epoch": 0.007930014475423248, "grad_norm": 0.1437792032957077, "learning_rate": 7.243626441830009e-05, "loss": 0.0024, "step": 63 }, { "epoch": 0.008055887721064888, "grad_norm": 0.11593683063983917, "learning_rate": 6.909830056250527e-05, "loss": 0.0146, "step": 64 }, { "epoch": 0.008181760966706527, "grad_norm": 1.9799082279205322, "learning_rate": 6.579798566743314e-05, "loss": 0.0358, "step": 65 }, { "epoch": 0.008307634212348165, "grad_norm": 0.15605556964874268, "learning_rate": 6.25393406584088e-05, "loss": 0.0031, "step": 66 }, { "epoch": 0.008433507457989804, "grad_norm": 0.03782504051923752, "learning_rate": 5.9326335692419995e-05, "loss": 0.0008, "step": 67 }, { "epoch": 0.008559380703631443, "grad_norm": 0.013293488882482052, "learning_rate": 5.616288532109225e-05, "loss": 0.0006, "step": 68 }, { "epoch": 0.008685253949273082, "grad_norm": 2.1244702339172363, "learning_rate": 5.305284372141095e-05, "loss": 0.3516, "step": 69 }, { "epoch": 0.008811127194914722, "grad_norm": 0.1244107261300087, "learning_rate": 5.000000000000002e-05, "loss": 0.0042, "step": 70 }, { "epoch": 0.00893700044055636, "grad_norm": 0.05394396558403969, "learning_rate": 4.700807357667952e-05, "loss": 0.0018, "step": 71 }, { "epoch": 0.009062873686197999, "grad_norm": 2.811530113220215, "learning_rate": 4.4080709652925336e-05, "loss": 0.2694, "step": 72 }, { "epoch": 0.009188746931839638, "grad_norm": 0.06051735207438469, "learning_rate": 4.12214747707527e-05, "loss": 0.0016, "step": 73 }, { "epoch": 0.009314620177481277, "grad_norm": 0.596136748790741, "learning_rate": 3.843385246743417e-05, "loss": 0.0087, "step": 74 }, { "epoch": 0.009440493423122915, "grad_norm": 0.11233700811862946, "learning_rate": 3.5721239031346066e-05, "loss": 0.0045, "step": 75 }, { "epoch": 0.009440493423122915, "eval_loss": NaN, "eval_runtime": 3551.4208, "eval_samples_per_second": 0.942, "eval_steps_per_second": 0.471, "step": 75 }, { "epoch": 0.009566366668764554, "grad_norm": 0.09344177693128586, "learning_rate": 3.308693936411421e-05, "loss": 0.0053, "step": 76 }, { "epoch": 0.009692239914406193, "grad_norm": 0.017914390191435814, "learning_rate": 3.053416295410026e-05, "loss": 0.0009, "step": 77 }, { "epoch": 0.009818113160047832, "grad_norm": 0.14340530335903168, "learning_rate": 2.8066019966134904e-05, "loss": 0.007, "step": 78 }, { "epoch": 0.00994398640568947, "grad_norm": 2.4637436866760254, "learning_rate": 2.5685517452260567e-05, "loss": 0.2807, "step": 79 }, { "epoch": 0.01006985965133111, "grad_norm": 0.016049662604928017, "learning_rate": 2.339555568810221e-05, "loss": 0.0003, "step": 80 }, { "epoch": 0.010195732896972749, "grad_norm": 5.916388511657715, "learning_rate": 2.119892463932781e-05, "loss": 1.0992, "step": 81 }, { "epoch": 0.010321606142614388, "grad_norm": 0.010422502644360065, "learning_rate": 1.9098300562505266e-05, "loss": 0.0005, "step": 82 }, { "epoch": 0.010447479388256025, "grad_norm": 3.361562728881836, "learning_rate": 1.7096242744495837e-05, "loss": 0.1324, "step": 83 }, { "epoch": 0.010573352633897665, "grad_norm": 0.04859397932887077, "learning_rate": 1.5195190384357404e-05, "loss": 0.0013, "step": 84 }, { "epoch": 0.010699225879539304, "grad_norm": 0.023048996925354004, "learning_rate": 1.339745962155613e-05, "loss": 0.0011, "step": 85 }, { "epoch": 0.010825099125180943, "grad_norm": 0.02878495492041111, "learning_rate": 1.1705240714107302e-05, "loss": 0.0012, "step": 86 }, { "epoch": 0.010950972370822582, "grad_norm": 0.03888264298439026, "learning_rate": 1.0120595370083318e-05, "loss": 0.0033, "step": 87 }, { "epoch": 0.01107684561646422, "grad_norm": 3.1203830242156982, "learning_rate": 8.645454235739903e-06, "loss": 0.1787, "step": 88 }, { "epoch": 0.01120271886210586, "grad_norm": 3.7907369136810303, "learning_rate": 7.281614543321269e-06, "loss": 0.3365, "step": 89 }, { "epoch": 0.011328592107747499, "grad_norm": 2.6835732460021973, "learning_rate": 6.030737921409169e-06, "loss": 0.0975, "step": 90 }, { "epoch": 0.011454465353389138, "grad_norm": 0.0066988165490329266, "learning_rate": 4.8943483704846475e-06, "loss": 0.0004, "step": 91 }, { "epoch": 0.011580338599030775, "grad_norm": 0.0022131705190986395, "learning_rate": 3.873830406168111e-06, "loss": 0.0004, "step": 92 }, { "epoch": 0.011706211844672415, "grad_norm": 0.01956171914935112, "learning_rate": 2.970427372400353e-06, "loss": 0.0015, "step": 93 }, { "epoch": 0.011832085090314054, "grad_norm": 0.0475313700735569, "learning_rate": 2.1852399266194314e-06, "loss": 0.0016, "step": 94 }, { "epoch": 0.011957958335955693, "grad_norm": 3.4122633934020996, "learning_rate": 1.5192246987791981e-06, "loss": 0.7071, "step": 95 }, { "epoch": 0.01208383158159733, "grad_norm": 0.13446319103240967, "learning_rate": 9.731931258429638e-07, "loss": 0.0256, "step": 96 }, { "epoch": 0.01220970482723897, "grad_norm": 0.10620174556970596, "learning_rate": 5.478104631726711e-07, "loss": 0.0039, "step": 97 }, { "epoch": 0.01233557807288061, "grad_norm": 0.5097644329071045, "learning_rate": 2.4359497401758024e-07, "loss": 0.0671, "step": 98 }, { "epoch": 0.012461451318522249, "grad_norm": 1.6247103214263916, "learning_rate": 6.09172980904238e-08, "loss": 0.0878, "step": 99 }, { "epoch": 0.012587324564163886, "grad_norm": 0.002280471846461296, "learning_rate": 0.0, "loss": 0.0001, "step": 100 }, { "epoch": 0.012587324564163886, "eval_loss": NaN, "eval_runtime": 3565.9397, "eval_samples_per_second": 0.938, "eval_steps_per_second": 0.469, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.5853840162816e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }