diff --git "a/checkpoint-450/trainer_state.json" "b/checkpoint-450/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-450/trainer_state.json" @@ -0,0 +1,3903 @@ +{ + "best_metric": 0.01355398166924715, + "best_model_checkpoint": "/home/paperspace/Data/models/dbischof_premise_aea/llm3br256/checkpoint-450", + "epoch": 3.484995159728945, + "eval_steps": 5, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007744433688286544, + "grad_norm": 0.3086823523044586, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0814, + "step": 1 + }, + { + "epoch": 0.015488867376573089, + "grad_norm": 0.3209303617477417, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0926, + "step": 2 + }, + { + "epoch": 0.023233301064859633, + "grad_norm": 0.30226805806159973, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0981, + "step": 3 + }, + { + "epoch": 0.030977734753146177, + "grad_norm": 0.3128693699836731, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0918, + "step": 4 + }, + { + "epoch": 0.03872216844143272, + "grad_norm": 0.2983686327934265, + "learning_rate": 7.692307692307694e-06, + "loss": 0.0896, + "step": 5 + }, + { + "epoch": 0.03872216844143272, + "eval_loss": 0.0767456516623497, + "eval_runtime": 6.3413, + "eval_samples_per_second": 7.885, + "eval_steps_per_second": 2.05, + "step": 5 + }, + { + "epoch": 0.046466602129719266, + "grad_norm": 0.22847148776054382, + "learning_rate": 9.230769230769232e-06, + "loss": 0.1014, + "step": 6 + }, + { + "epoch": 0.05421103581800581, + "grad_norm": 0.2187601923942566, + "learning_rate": 1.0769230769230771e-05, + "loss": 0.0791, + "step": 7 + }, + { + "epoch": 0.061955469506292354, + "grad_norm": 0.17710556089878082, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0764, + "step": 8 + }, + { + "epoch": 0.0696999031945789, + "grad_norm": 0.1523497849702835, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.0548, + "step": 9 + }, + { + "epoch": 0.07744433688286544, + "grad_norm": 0.11329648643732071, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.057, + "step": 10 + }, + { + "epoch": 0.07744433688286544, + "eval_loss": 0.039704836905002594, + "eval_runtime": 4.3898, + "eval_samples_per_second": 11.39, + "eval_steps_per_second": 2.961, + "step": 10 + }, + { + "epoch": 0.08518877057115198, + "grad_norm": 0.09438801556825638, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.0398, + "step": 11 + }, + { + "epoch": 0.09293320425943853, + "grad_norm": 0.09298978000879288, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.0423, + "step": 12 + }, + { + "epoch": 0.10067763794772508, + "grad_norm": 0.10603779554367065, + "learning_rate": 2e-05, + "loss": 0.0455, + "step": 13 + }, + { + "epoch": 0.10842207163601161, + "grad_norm": 0.1061321273446083, + "learning_rate": 2.1538461538461542e-05, + "loss": 0.0582, + "step": 14 + }, + { + "epoch": 0.11616650532429816, + "grad_norm": 0.08672691136598587, + "learning_rate": 2.307692307692308e-05, + "loss": 0.0361, + "step": 15 + }, + { + "epoch": 0.11616650532429816, + "eval_loss": 0.032478053122758865, + "eval_runtime": 4.3871, + "eval_samples_per_second": 11.397, + "eval_steps_per_second": 2.963, + "step": 15 + }, + { + "epoch": 0.12391093901258471, + "grad_norm": 0.06632386893033981, + "learning_rate": 2.461538461538462e-05, + "loss": 0.0436, + "step": 16 + }, + { + "epoch": 0.13165537270087124, + "grad_norm": 0.049268174916505814, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.0365, + "step": 17 + }, + { + "epoch": 0.1393998063891578, + "grad_norm": 0.05043736472725868, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.0322, + "step": 18 + }, + { + "epoch": 0.14714424007744434, + "grad_norm": 0.04957738518714905, + "learning_rate": 2.9230769230769234e-05, + "loss": 0.0354, + "step": 19 + }, + { + "epoch": 0.15488867376573087, + "grad_norm": 0.05300221964716911, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0478, + "step": 20 + }, + { + "epoch": 0.15488867376573087, + "eval_loss": 0.030403098091483116, + "eval_runtime": 4.3847, + "eval_samples_per_second": 11.403, + "eval_steps_per_second": 2.965, + "step": 20 + }, + { + "epoch": 0.16263310745401743, + "grad_norm": 0.049294158816337585, + "learning_rate": 3.230769230769231e-05, + "loss": 0.037, + "step": 21 + }, + { + "epoch": 0.17037754114230397, + "grad_norm": 0.05417300760746002, + "learning_rate": 3.384615384615385e-05, + "loss": 0.0427, + "step": 22 + }, + { + "epoch": 0.1781219748305905, + "grad_norm": 0.05300293490290642, + "learning_rate": 3.538461538461539e-05, + "loss": 0.0396, + "step": 23 + }, + { + "epoch": 0.18586640851887706, + "grad_norm": 0.043480049818754196, + "learning_rate": 3.692307692307693e-05, + "loss": 0.035, + "step": 24 + }, + { + "epoch": 0.1936108422071636, + "grad_norm": 0.041117988526821136, + "learning_rate": 3.846153846153846e-05, + "loss": 0.0293, + "step": 25 + }, + { + "epoch": 0.1936108422071636, + "eval_loss": 0.02704680897295475, + "eval_runtime": 4.4019, + "eval_samples_per_second": 11.359, + "eval_steps_per_second": 2.953, + "step": 25 + }, + { + "epoch": 0.20135527589545016, + "grad_norm": 0.037952277809381485, + "learning_rate": 4e-05, + "loss": 0.034, + "step": 26 + }, + { + "epoch": 0.2090997095837367, + "grad_norm": 0.03611045330762863, + "learning_rate": 4.1538461538461544e-05, + "loss": 0.0296, + "step": 27 + }, + { + "epoch": 0.21684414327202323, + "grad_norm": 0.04834708571434021, + "learning_rate": 4.3076923076923084e-05, + "loss": 0.0448, + "step": 28 + }, + { + "epoch": 0.2245885769603098, + "grad_norm": 0.03826717659831047, + "learning_rate": 4.461538461538462e-05, + "loss": 0.0306, + "step": 29 + }, + { + "epoch": 0.23233301064859632, + "grad_norm": 0.03805238753557205, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0429, + "step": 30 + }, + { + "epoch": 0.23233301064859632, + "eval_loss": 0.025311218574643135, + "eval_runtime": 4.3856, + "eval_samples_per_second": 11.401, + "eval_steps_per_second": 2.964, + "step": 30 + }, + { + "epoch": 0.24007744433688286, + "grad_norm": 0.03321514651179314, + "learning_rate": 4.76923076923077e-05, + "loss": 0.0378, + "step": 31 + }, + { + "epoch": 0.24782187802516942, + "grad_norm": 0.03013491816818714, + "learning_rate": 4.923076923076924e-05, + "loss": 0.0389, + "step": 32 + }, + { + "epoch": 0.25556631171345595, + "grad_norm": 0.03460712358355522, + "learning_rate": 5.0769230769230766e-05, + "loss": 0.027, + "step": 33 + }, + { + "epoch": 0.2633107454017425, + "grad_norm": 0.02806415595114231, + "learning_rate": 5.230769230769231e-05, + "loss": 0.0306, + "step": 34 + }, + { + "epoch": 0.271055179090029, + "grad_norm": 0.03351674601435661, + "learning_rate": 5.384615384615385e-05, + "loss": 0.0368, + "step": 35 + }, + { + "epoch": 0.271055179090029, + "eval_loss": 0.024370329454541206, + "eval_runtime": 4.3804, + "eval_samples_per_second": 11.414, + "eval_steps_per_second": 2.968, + "step": 35 + }, + { + "epoch": 0.2787996127783156, + "grad_norm": 0.028795143589377403, + "learning_rate": 5.538461538461539e-05, + "loss": 0.0302, + "step": 36 + }, + { + "epoch": 0.28654404646660214, + "grad_norm": 0.027806995436549187, + "learning_rate": 5.692307692307692e-05, + "loss": 0.025, + "step": 37 + }, + { + "epoch": 0.2942884801548887, + "grad_norm": 0.02950594201683998, + "learning_rate": 5.846153846153847e-05, + "loss": 0.0282, + "step": 38 + }, + { + "epoch": 0.3020329138431752, + "grad_norm": 0.025643320754170418, + "learning_rate": 6e-05, + "loss": 0.017, + "step": 39 + }, + { + "epoch": 0.30977734753146174, + "grad_norm": 0.03114083595573902, + "learning_rate": 6.153846153846155e-05, + "loss": 0.0323, + "step": 40 + }, + { + "epoch": 0.30977734753146174, + "eval_loss": 0.022903937846422195, + "eval_runtime": 4.4182, + "eval_samples_per_second": 11.317, + "eval_steps_per_second": 2.942, + "step": 40 + }, + { + "epoch": 0.31752178121974833, + "grad_norm": 0.03194240480661392, + "learning_rate": 6.307692307692308e-05, + "loss": 0.0261, + "step": 41 + }, + { + "epoch": 0.32526621490803487, + "grad_norm": 0.02684875763952732, + "learning_rate": 6.461538461538462e-05, + "loss": 0.0219, + "step": 42 + }, + { + "epoch": 0.3330106485963214, + "grad_norm": 0.025787649676203728, + "learning_rate": 6.615384615384616e-05, + "loss": 0.0332, + "step": 43 + }, + { + "epoch": 0.34075508228460794, + "grad_norm": 0.022300513461232185, + "learning_rate": 6.76923076923077e-05, + "loss": 0.0267, + "step": 44 + }, + { + "epoch": 0.34849951597289447, + "grad_norm": 0.025305645540356636, + "learning_rate": 6.923076923076924e-05, + "loss": 0.0223, + "step": 45 + }, + { + "epoch": 0.34849951597289447, + "eval_loss": 0.022478284314274788, + "eval_runtime": 4.3839, + "eval_samples_per_second": 11.405, + "eval_steps_per_second": 2.965, + "step": 45 + }, + { + "epoch": 0.356243949661181, + "grad_norm": 0.03382590040564537, + "learning_rate": 7.076923076923078e-05, + "loss": 0.0446, + "step": 46 + }, + { + "epoch": 0.3639883833494676, + "grad_norm": 0.026729533448815346, + "learning_rate": 7.23076923076923e-05, + "loss": 0.0336, + "step": 47 + }, + { + "epoch": 0.3717328170377541, + "grad_norm": 0.02412431500852108, + "learning_rate": 7.384615384615386e-05, + "loss": 0.0438, + "step": 48 + }, + { + "epoch": 0.37947725072604066, + "grad_norm": 0.03072945401072502, + "learning_rate": 7.538461538461539e-05, + "loss": 0.0331, + "step": 49 + }, + { + "epoch": 0.3872216844143272, + "grad_norm": 0.026063738390803337, + "learning_rate": 7.692307692307693e-05, + "loss": 0.0327, + "step": 50 + }, + { + "epoch": 0.3872216844143272, + "eval_loss": 0.021576760336756706, + "eval_runtime": 4.3771, + "eval_samples_per_second": 11.423, + "eval_steps_per_second": 2.97, + "step": 50 + }, + { + "epoch": 0.39496611810261373, + "grad_norm": 0.02626851014792919, + "learning_rate": 7.846153846153847e-05, + "loss": 0.0295, + "step": 51 + }, + { + "epoch": 0.4027105517909003, + "grad_norm": 0.030156496912240982, + "learning_rate": 8e-05, + "loss": 0.02, + "step": 52 + }, + { + "epoch": 0.41045498547918685, + "grad_norm": 0.028987275436520576, + "learning_rate": 8.153846153846155e-05, + "loss": 0.0311, + "step": 53 + }, + { + "epoch": 0.4181994191674734, + "grad_norm": 0.02352583222091198, + "learning_rate": 8.307692307692309e-05, + "loss": 0.0192, + "step": 54 + }, + { + "epoch": 0.4259438528557599, + "grad_norm": 0.03398854285478592, + "learning_rate": 8.461538461538461e-05, + "loss": 0.0237, + "step": 55 + }, + { + "epoch": 0.4259438528557599, + "eval_loss": 0.020917313173413277, + "eval_runtime": 4.3812, + "eval_samples_per_second": 11.412, + "eval_steps_per_second": 2.967, + "step": 55 + }, + { + "epoch": 0.43368828654404645, + "grad_norm": 0.029533132910728455, + "learning_rate": 8.615384615384617e-05, + "loss": 0.0227, + "step": 56 + }, + { + "epoch": 0.441432720232333, + "grad_norm": 0.024922896176576614, + "learning_rate": 8.76923076923077e-05, + "loss": 0.0229, + "step": 57 + }, + { + "epoch": 0.4491771539206196, + "grad_norm": 0.022748827934265137, + "learning_rate": 8.923076923076924e-05, + "loss": 0.0214, + "step": 58 + }, + { + "epoch": 0.4569215876089061, + "grad_norm": 0.03145488351583481, + "learning_rate": 9.076923076923078e-05, + "loss": 0.0274, + "step": 59 + }, + { + "epoch": 0.46466602129719264, + "grad_norm": 0.02403653971850872, + "learning_rate": 9.230769230769232e-05, + "loss": 0.0255, + "step": 60 + }, + { + "epoch": 0.46466602129719264, + "eval_loss": 0.02042277343571186, + "eval_runtime": 4.409, + "eval_samples_per_second": 11.34, + "eval_steps_per_second": 2.949, + "step": 60 + }, + { + "epoch": 0.4724104549854792, + "grad_norm": 0.0241488516330719, + "learning_rate": 9.384615384615386e-05, + "loss": 0.0213, + "step": 61 + }, + { + "epoch": 0.4801548886737657, + "grad_norm": 0.030788561329245567, + "learning_rate": 9.53846153846154e-05, + "loss": 0.0259, + "step": 62 + }, + { + "epoch": 0.4878993223620523, + "grad_norm": 0.027498334646224976, + "learning_rate": 9.692307692307692e-05, + "loss": 0.0299, + "step": 63 + }, + { + "epoch": 0.49564375605033884, + "grad_norm": 0.0270383283495903, + "learning_rate": 9.846153846153848e-05, + "loss": 0.018, + "step": 64 + }, + { + "epoch": 0.5033881897386253, + "grad_norm": 0.026719942688941956, + "learning_rate": 0.0001, + "loss": 0.0237, + "step": 65 + }, + { + "epoch": 0.5033881897386253, + "eval_loss": 0.019689319655299187, + "eval_runtime": 4.3801, + "eval_samples_per_second": 11.415, + "eval_steps_per_second": 2.968, + "step": 65 + }, + { + "epoch": 0.5111326234269119, + "grad_norm": 0.02332148514688015, + "learning_rate": 9.999926652940913e-05, + "loss": 0.016, + "step": 66 + }, + { + "epoch": 0.5188770571151985, + "grad_norm": 0.0363909974694252, + "learning_rate": 9.999706613915566e-05, + "loss": 0.0243, + "step": 67 + }, + { + "epoch": 0.526621490803485, + "grad_norm": 0.02725972980260849, + "learning_rate": 9.999339889379647e-05, + "loss": 0.0243, + "step": 68 + }, + { + "epoch": 0.5343659244917716, + "grad_norm": 0.026129694655537605, + "learning_rate": 9.998826490092421e-05, + "loss": 0.0289, + "step": 69 + }, + { + "epoch": 0.542110358180058, + "grad_norm": 0.024957949295639992, + "learning_rate": 9.99816643111642e-05, + "loss": 0.0273, + "step": 70 + }, + { + "epoch": 0.542110358180058, + "eval_loss": 0.019689122214913368, + "eval_runtime": 4.3815, + "eval_samples_per_second": 11.412, + "eval_steps_per_second": 2.967, + "step": 70 + }, + { + "epoch": 0.5498547918683446, + "grad_norm": 0.02722254954278469, + "learning_rate": 9.997359731816998e-05, + "loss": 0.0269, + "step": 71 + }, + { + "epoch": 0.5575992255566312, + "grad_norm": 0.03509791940450668, + "learning_rate": 9.996406415861763e-05, + "loss": 0.0358, + "step": 72 + }, + { + "epoch": 0.5653436592449177, + "grad_norm": 0.02415742725133896, + "learning_rate": 9.995306511219885e-05, + "loss": 0.0321, + "step": 73 + }, + { + "epoch": 0.5730880929332043, + "grad_norm": 0.024679476395249367, + "learning_rate": 9.994060050161269e-05, + "loss": 0.0199, + "step": 74 + }, + { + "epoch": 0.5808325266214908, + "grad_norm": 0.02391170710325241, + "learning_rate": 9.992667069255619e-05, + "loss": 0.0192, + "step": 75 + }, + { + "epoch": 0.5808325266214908, + "eval_loss": 0.019159631803631783, + "eval_runtime": 4.3838, + "eval_samples_per_second": 11.406, + "eval_steps_per_second": 2.965, + "step": 75 + }, + { + "epoch": 0.5885769603097774, + "grad_norm": 0.025728462263941765, + "learning_rate": 9.991127609371356e-05, + "loss": 0.0347, + "step": 76 + }, + { + "epoch": 0.5963213939980639, + "grad_norm": 0.023557180538773537, + "learning_rate": 9.989441715674422e-05, + "loss": 0.0294, + "step": 77 + }, + { + "epoch": 0.6040658276863504, + "grad_norm": 0.030054917559027672, + "learning_rate": 9.987609437626955e-05, + "loss": 0.0302, + "step": 78 + }, + { + "epoch": 0.611810261374637, + "grad_norm": 0.023635441437363625, + "learning_rate": 9.985630828985835e-05, + "loss": 0.0228, + "step": 79 + }, + { + "epoch": 0.6195546950629235, + "grad_norm": 0.04299585148692131, + "learning_rate": 9.983505947801115e-05, + "loss": 0.0459, + "step": 80 + }, + { + "epoch": 0.6195546950629235, + "eval_loss": 0.018766988068819046, + "eval_runtime": 4.3783, + "eval_samples_per_second": 11.42, + "eval_steps_per_second": 2.969, + "step": 80 + }, + { + "epoch": 0.6272991287512101, + "grad_norm": 0.024934271350502968, + "learning_rate": 9.981234856414307e-05, + "loss": 0.0236, + "step": 81 + }, + { + "epoch": 0.6350435624394967, + "grad_norm": 0.030999857932329178, + "learning_rate": 9.978817621456562e-05, + "loss": 0.0218, + "step": 82 + }, + { + "epoch": 0.6427879961277831, + "grad_norm": 0.02605932205915451, + "learning_rate": 9.97625431384671e-05, + "loss": 0.0262, + "step": 83 + }, + { + "epoch": 0.6505324298160697, + "grad_norm": 0.02674640342593193, + "learning_rate": 9.973545008789181e-05, + "loss": 0.0334, + "step": 84 + }, + { + "epoch": 0.6582768635043562, + "grad_norm": 0.026963254436850548, + "learning_rate": 9.970689785771798e-05, + "loss": 0.0203, + "step": 85 + }, + { + "epoch": 0.6582768635043562, + "eval_loss": 0.01849793642759323, + "eval_runtime": 4.3938, + "eval_samples_per_second": 11.38, + "eval_steps_per_second": 2.959, + "step": 85 + }, + { + "epoch": 0.6660212971926428, + "grad_norm": 0.026988934725522995, + "learning_rate": 9.967688728563446e-05, + "loss": 0.0164, + "step": 86 + }, + { + "epoch": 0.6737657308809293, + "grad_norm": 0.0342542827129364, + "learning_rate": 9.964541925211612e-05, + "loss": 0.0189, + "step": 87 + }, + { + "epoch": 0.6815101645692159, + "grad_norm": 0.0271429605782032, + "learning_rate": 9.961249468039807e-05, + "loss": 0.0293, + "step": 88 + }, + { + "epoch": 0.6892545982575025, + "grad_norm": 0.037191689014434814, + "learning_rate": 9.957811453644847e-05, + "loss": 0.0204, + "step": 89 + }, + { + "epoch": 0.6969990319457889, + "grad_norm": 0.02466176636517048, + "learning_rate": 9.954227982894034e-05, + "loss": 0.032, + "step": 90 + }, + { + "epoch": 0.6969990319457889, + "eval_loss": 0.018251437693834305, + "eval_runtime": 4.3926, + "eval_samples_per_second": 11.383, + "eval_steps_per_second": 2.96, + "step": 90 + }, + { + "epoch": 0.7047434656340755, + "grad_norm": 0.025401102378964424, + "learning_rate": 9.950499160922183e-05, + "loss": 0.0287, + "step": 91 + }, + { + "epoch": 0.712487899322362, + "grad_norm": 0.02289285883307457, + "learning_rate": 9.946625097128543e-05, + "loss": 0.023, + "step": 92 + }, + { + "epoch": 0.7202323330106486, + "grad_norm": 0.047656841576099396, + "learning_rate": 9.942605905173592e-05, + "loss": 0.0229, + "step": 93 + }, + { + "epoch": 0.7279767666989352, + "grad_norm": 0.027169659733772278, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0216, + "step": 94 + }, + { + "epoch": 0.7357212003872217, + "grad_norm": 0.027544977143406868, + "learning_rate": 9.934132612707632e-05, + "loss": 0.0145, + "step": 95 + }, + { + "epoch": 0.7357212003872217, + "eval_loss": 0.01838814653456211, + "eval_runtime": 4.4091, + "eval_samples_per_second": 11.34, + "eval_steps_per_second": 2.948, + "step": 95 + }, + { + "epoch": 0.7434656340755083, + "grad_norm": 0.03812320902943611, + "learning_rate": 9.929678760793057e-05, + "loss": 0.0293, + "step": 96 + }, + { + "epoch": 0.7512100677637947, + "grad_norm": 0.026603760197758675, + "learning_rate": 9.925080277902743e-05, + "loss": 0.0237, + "step": 97 + }, + { + "epoch": 0.7589545014520813, + "grad_norm": 0.023724529892206192, + "learning_rate": 9.920337298950765e-05, + "loss": 0.0213, + "step": 98 + }, + { + "epoch": 0.7666989351403679, + "grad_norm": 0.02539847232401371, + "learning_rate": 9.91544996309055e-05, + "loss": 0.0213, + "step": 99 + }, + { + "epoch": 0.7744433688286544, + "grad_norm": 0.031199516728520393, + "learning_rate": 9.91041841371078e-05, + "loss": 0.0299, + "step": 100 + }, + { + "epoch": 0.7744433688286544, + "eval_loss": 0.018127141520380974, + "eval_runtime": 4.3744, + "eval_samples_per_second": 11.43, + "eval_steps_per_second": 2.972, + "step": 100 + }, + { + "epoch": 0.782187802516941, + "grad_norm": 0.01887812837958336, + "learning_rate": 9.905242798431196e-05, + "loss": 0.026, + "step": 101 + }, + { + "epoch": 0.7899322362052275, + "grad_norm": 0.029699862003326416, + "learning_rate": 9.899923269098262e-05, + "loss": 0.0275, + "step": 102 + }, + { + "epoch": 0.797676669893514, + "grad_norm": 0.023589355871081352, + "learning_rate": 9.894459981780711e-05, + "loss": 0.0223, + "step": 103 + }, + { + "epoch": 0.8054211035818006, + "grad_norm": 0.020126909017562866, + "learning_rate": 9.888853096764964e-05, + "loss": 0.0301, + "step": 104 + }, + { + "epoch": 0.8131655372700871, + "grad_norm": 0.027466170489788055, + "learning_rate": 9.883102778550434e-05, + "loss": 0.0186, + "step": 105 + }, + { + "epoch": 0.8131655372700871, + "eval_loss": 0.018270503729581833, + "eval_runtime": 4.4268, + "eval_samples_per_second": 11.295, + "eval_steps_per_second": 2.937, + "step": 105 + }, + { + "epoch": 0.8209099709583737, + "grad_norm": 0.029163997620344162, + "learning_rate": 9.877209195844692e-05, + "loss": 0.0274, + "step": 106 + }, + { + "epoch": 0.8286544046466602, + "grad_norm": 0.031382910907268524, + "learning_rate": 9.871172521558523e-05, + "loss": 0.0308, + "step": 107 + }, + { + "epoch": 0.8363988383349468, + "grad_norm": 0.02163223922252655, + "learning_rate": 9.864992932800845e-05, + "loss": 0.0231, + "step": 108 + }, + { + "epoch": 0.8441432720232332, + "grad_norm": 0.03102894499897957, + "learning_rate": 9.858670610873528e-05, + "loss": 0.0252, + "step": 109 + }, + { + "epoch": 0.8518877057115198, + "grad_norm": 0.02512267790734768, + "learning_rate": 9.852205741266058e-05, + "loss": 0.0255, + "step": 110 + }, + { + "epoch": 0.8518877057115198, + "eval_loss": 0.017753126099705696, + "eval_runtime": 4.3888, + "eval_samples_per_second": 11.393, + "eval_steps_per_second": 2.962, + "step": 110 + }, + { + "epoch": 0.8596321393998064, + "grad_norm": 0.021887609735131264, + "learning_rate": 9.845598513650103e-05, + "loss": 0.0203, + "step": 111 + }, + { + "epoch": 0.8673765730880929, + "grad_norm": 0.026221172884106636, + "learning_rate": 9.838849121873949e-05, + "loss": 0.0216, + "step": 112 + }, + { + "epoch": 0.8751210067763795, + "grad_norm": 0.01997440867125988, + "learning_rate": 9.831957763956813e-05, + "loss": 0.0226, + "step": 113 + }, + { + "epoch": 0.882865440464666, + "grad_norm": 0.01921810209751129, + "learning_rate": 9.824924642083026e-05, + "loss": 0.0133, + "step": 114 + }, + { + "epoch": 0.8906098741529526, + "grad_norm": 0.021300863474607468, + "learning_rate": 9.817749962596115e-05, + "loss": 0.0199, + "step": 115 + }, + { + "epoch": 0.8906098741529526, + "eval_loss": 0.017665784806013107, + "eval_runtime": 4.3883, + "eval_samples_per_second": 11.394, + "eval_steps_per_second": 2.962, + "step": 115 + }, + { + "epoch": 0.8983543078412392, + "grad_norm": 0.02153032273054123, + "learning_rate": 9.810433935992733e-05, + "loss": 0.0238, + "step": 116 + }, + { + "epoch": 0.9060987415295256, + "grad_norm": 0.021676376461982727, + "learning_rate": 9.802976776916494e-05, + "loss": 0.0167, + "step": 117 + }, + { + "epoch": 0.9138431752178122, + "grad_norm": 0.021497417241334915, + "learning_rate": 9.795378704151675e-05, + "loss": 0.0189, + "step": 118 + }, + { + "epoch": 0.9215876089060987, + "grad_norm": 0.022522611543536186, + "learning_rate": 9.787639940616788e-05, + "loss": 0.024, + "step": 119 + }, + { + "epoch": 0.9293320425943853, + "grad_norm": 0.023478692397475243, + "learning_rate": 9.779760713358059e-05, + "loss": 0.0216, + "step": 120 + }, + { + "epoch": 0.9293320425943853, + "eval_loss": 0.01727675460278988, + "eval_runtime": 4.3786, + "eval_samples_per_second": 11.419, + "eval_steps_per_second": 2.969, + "step": 120 + }, + { + "epoch": 0.9370764762826719, + "grad_norm": 0.029589442536234856, + "learning_rate": 9.771741253542741e-05, + "loss": 0.0234, + "step": 121 + }, + { + "epoch": 0.9448209099709584, + "grad_norm": 0.021279161795973778, + "learning_rate": 9.763581796452353e-05, + "loss": 0.0163, + "step": 122 + }, + { + "epoch": 0.952565343659245, + "grad_norm": 0.036104779690504074, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0328, + "step": 123 + }, + { + "epoch": 0.9603097773475314, + "grad_norm": 0.020669342949986458, + "learning_rate": 9.74684385210219e-05, + "loss": 0.0155, + "step": 124 + }, + { + "epoch": 0.968054211035818, + "grad_norm": 0.021985569968819618, + "learning_rate": 9.738265855914013e-05, + "loss": 0.024, + "step": 125 + }, + { + "epoch": 0.968054211035818, + "eval_loss": 0.017624683678150177, + "eval_runtime": 4.3751, + "eval_samples_per_second": 11.428, + "eval_steps_per_second": 2.971, + "step": 125 + }, + { + "epoch": 0.9757986447241046, + "grad_norm": 0.026250576600432396, + "learning_rate": 9.729548844579552e-05, + "loss": 0.0158, + "step": 126 + }, + { + "epoch": 0.9835430784123911, + "grad_norm": 0.028716014698147774, + "learning_rate": 9.720693073845667e-05, + "loss": 0.0293, + "step": 127 + }, + { + "epoch": 0.9912875121006777, + "grad_norm": 0.025626949965953827, + "learning_rate": 9.711698803530254e-05, + "loss": 0.0321, + "step": 128 + }, + { + "epoch": 0.9990319457889641, + "grad_norm": 0.021693823859095573, + "learning_rate": 9.70256629751462e-05, + "loss": 0.0149, + "step": 129 + }, + { + "epoch": 1.0067763794772506, + "grad_norm": 0.049767978489398956, + "learning_rate": 9.693295823735753e-05, + "loss": 0.0319, + "step": 130 + }, + { + "epoch": 1.0067763794772506, + "eval_loss": 0.01727178506553173, + "eval_runtime": 4.3778, + "eval_samples_per_second": 11.421, + "eval_steps_per_second": 2.97, + "step": 130 + }, + { + "epoch": 1.0145208131655372, + "grad_norm": 0.024738334119319916, + "learning_rate": 9.683887654178445e-05, + "loss": 0.0239, + "step": 131 + }, + { + "epoch": 1.0222652468538238, + "grad_norm": 0.024547995999455452, + "learning_rate": 9.674342064867326e-05, + "loss": 0.0206, + "step": 132 + }, + { + "epoch": 1.0300096805421104, + "grad_norm": 0.031586963683366776, + "learning_rate": 9.664659335858755e-05, + "loss": 0.0138, + "step": 133 + }, + { + "epoch": 1.037754114230397, + "grad_norm": 0.018796470016241074, + "learning_rate": 9.654839751232611e-05, + "loss": 0.0209, + "step": 134 + }, + { + "epoch": 1.0454985479186834, + "grad_norm": 0.0231508519500494, + "learning_rate": 9.644883599083958e-05, + "loss": 0.0202, + "step": 135 + }, + { + "epoch": 1.0454985479186834, + "eval_loss": 0.017550285905599594, + "eval_runtime": 4.3777, + "eval_samples_per_second": 11.422, + "eval_steps_per_second": 2.97, + "step": 135 + }, + { + "epoch": 1.05324298160697, + "grad_norm": 0.029408982023596764, + "learning_rate": 9.634791171514585e-05, + "loss": 0.0278, + "step": 136 + }, + { + "epoch": 1.0609874152952565, + "grad_norm": 0.027235113084316254, + "learning_rate": 9.624562764624445e-05, + "loss": 0.0149, + "step": 137 + }, + { + "epoch": 1.0687318489835431, + "grad_norm": 0.024401573464274406, + "learning_rate": 9.614198678502965e-05, + "loss": 0.015, + "step": 138 + }, + { + "epoch": 1.0764762826718297, + "grad_norm": 0.028705554082989693, + "learning_rate": 9.603699217220239e-05, + "loss": 0.0196, + "step": 139 + }, + { + "epoch": 1.084220716360116, + "grad_norm": 0.02626665309071541, + "learning_rate": 9.59306468881811e-05, + "loss": 0.0167, + "step": 140 + }, + { + "epoch": 1.084220716360116, + "eval_loss": 0.017066117376089096, + "eval_runtime": 4.3787, + "eval_samples_per_second": 11.419, + "eval_steps_per_second": 2.969, + "step": 140 + }, + { + "epoch": 1.0919651500484027, + "grad_norm": 0.024959465488791466, + "learning_rate": 9.582295405301131e-05, + "loss": 0.0212, + "step": 141 + }, + { + "epoch": 1.0997095837366893, + "grad_norm": 0.02779693529009819, + "learning_rate": 9.571391682627412e-05, + "loss": 0.0178, + "step": 142 + }, + { + "epoch": 1.1074540174249758, + "grad_norm": 0.029659366235136986, + "learning_rate": 9.56035384069935e-05, + "loss": 0.04, + "step": 143 + }, + { + "epoch": 1.1151984511132624, + "grad_norm": 0.025969749316573143, + "learning_rate": 9.549182203354242e-05, + "loss": 0.0225, + "step": 144 + }, + { + "epoch": 1.1229428848015488, + "grad_norm": 0.026329027488827705, + "learning_rate": 9.537877098354786e-05, + "loss": 0.0205, + "step": 145 + }, + { + "epoch": 1.1229428848015488, + "eval_loss": 0.01678595133125782, + "eval_runtime": 4.3744, + "eval_samples_per_second": 11.43, + "eval_steps_per_second": 2.972, + "step": 145 + }, + { + "epoch": 1.1306873184898354, + "grad_norm": 0.022639548406004906, + "learning_rate": 9.526438857379463e-05, + "loss": 0.0174, + "step": 146 + }, + { + "epoch": 1.138431752178122, + "grad_norm": 0.019363639876246452, + "learning_rate": 9.514867816012809e-05, + "loss": 0.0178, + "step": 147 + }, + { + "epoch": 1.1461761858664086, + "grad_norm": 0.024218933656811714, + "learning_rate": 9.503164313735566e-05, + "loss": 0.0191, + "step": 148 + }, + { + "epoch": 1.1539206195546952, + "grad_norm": 0.02254585176706314, + "learning_rate": 9.491328693914722e-05, + "loss": 0.0216, + "step": 149 + }, + { + "epoch": 1.1616650532429815, + "grad_norm": 0.021929148584604263, + "learning_rate": 9.47936130379344e-05, + "loss": 0.0164, + "step": 150 + }, + { + "epoch": 1.1616650532429815, + "eval_loss": 0.016734711825847626, + "eval_runtime": 4.3868, + "eval_samples_per_second": 11.398, + "eval_steps_per_second": 2.963, + "step": 150 + }, + { + "epoch": 1.1694094869312681, + "grad_norm": 0.02163657918572426, + "learning_rate": 9.467262494480869e-05, + "loss": 0.0239, + "step": 151 + }, + { + "epoch": 1.1771539206195547, + "grad_norm": 0.020555593073368073, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0212, + "step": 152 + }, + { + "epoch": 1.1848983543078413, + "grad_norm": 0.02054913528263569, + "learning_rate": 9.442672041986457e-05, + "loss": 0.0338, + "step": 153 + }, + { + "epoch": 1.1926427879961277, + "grad_norm": 0.017929015681147575, + "learning_rate": 9.430181120259565e-05, + "loss": 0.021, + "step": 154 + }, + { + "epoch": 1.2003872216844143, + "grad_norm": 0.02343195676803589, + "learning_rate": 9.417560222230115e-05, + "loss": 0.0303, + "step": 155 + }, + { + "epoch": 1.2003872216844143, + "eval_loss": 0.0167617779225111, + "eval_runtime": 4.4167, + "eval_samples_per_second": 11.321, + "eval_steps_per_second": 2.943, + "step": 155 + }, + { + "epoch": 1.2081316553727008, + "grad_norm": 0.026128176599740982, + "learning_rate": 9.404809718180407e-05, + "loss": 0.0245, + "step": 156 + }, + { + "epoch": 1.2158760890609874, + "grad_norm": 0.02399616688489914, + "learning_rate": 9.391929982195232e-05, + "loss": 0.032, + "step": 157 + }, + { + "epoch": 1.223620522749274, + "grad_norm": 0.027354400604963303, + "learning_rate": 9.378921392150892e-05, + "loss": 0.0198, + "step": 158 + }, + { + "epoch": 1.2313649564375604, + "grad_norm": 0.0203176848590374, + "learning_rate": 9.365784329704115e-05, + "loss": 0.0184, + "step": 159 + }, + { + "epoch": 1.239109390125847, + "grad_norm": 0.022227482870221138, + "learning_rate": 9.35251918028086e-05, + "loss": 0.0201, + "step": 160 + }, + { + "epoch": 1.239109390125847, + "eval_loss": 0.016485435888171196, + "eval_runtime": 4.376, + "eval_samples_per_second": 11.426, + "eval_steps_per_second": 2.971, + "step": 160 + }, + { + "epoch": 1.2468538238141336, + "grad_norm": 0.020081602036952972, + "learning_rate": 9.339126333065007e-05, + "loss": 0.0191, + "step": 161 + }, + { + "epoch": 1.2545982575024202, + "grad_norm": 0.018554236739873886, + "learning_rate": 9.325606180986939e-05, + "loss": 0.0154, + "step": 162 + }, + { + "epoch": 1.2623426911907067, + "grad_norm": 0.02304654009640217, + "learning_rate": 9.31195912071201e-05, + "loss": 0.0319, + "step": 163 + }, + { + "epoch": 1.2700871248789931, + "grad_norm": 0.02430463396012783, + "learning_rate": 9.298185552628917e-05, + "loss": 0.0157, + "step": 164 + }, + { + "epoch": 1.2778315585672797, + "grad_norm": 0.023204822093248367, + "learning_rate": 9.284285880837946e-05, + "loss": 0.0183, + "step": 165 + }, + { + "epoch": 1.2778315585672797, + "eval_loss": 0.016406066715717316, + "eval_runtime": 4.4127, + "eval_samples_per_second": 11.331, + "eval_steps_per_second": 2.946, + "step": 165 + }, + { + "epoch": 1.2855759922555663, + "grad_norm": 0.02340216562151909, + "learning_rate": 9.270260513139116e-05, + "loss": 0.0329, + "step": 166 + }, + { + "epoch": 1.2933204259438529, + "grad_norm": 0.023041503503918648, + "learning_rate": 9.256109861020213e-05, + "loss": 0.0181, + "step": 167 + }, + { + "epoch": 1.3010648596321395, + "grad_norm": 0.02382810041308403, + "learning_rate": 9.241834339644726e-05, + "loss": 0.0199, + "step": 168 + }, + { + "epoch": 1.3088092933204258, + "grad_norm": 0.021713877096772194, + "learning_rate": 9.22743436783966e-05, + "loss": 0.0183, + "step": 169 + }, + { + "epoch": 1.3165537270087124, + "grad_norm": 0.02216421440243721, + "learning_rate": 9.212910368083245e-05, + "loss": 0.0221, + "step": 170 + }, + { + "epoch": 1.3165537270087124, + "eval_loss": 0.016315914690494537, + "eval_runtime": 4.3752, + "eval_samples_per_second": 11.428, + "eval_steps_per_second": 2.971, + "step": 170 + }, + { + "epoch": 1.324298160696999, + "grad_norm": 0.022575192153453827, + "learning_rate": 9.198262766492554e-05, + "loss": 0.0251, + "step": 171 + }, + { + "epoch": 1.3320425943852856, + "grad_norm": 0.02038014493882656, + "learning_rate": 9.183491992810979e-05, + "loss": 0.0257, + "step": 172 + }, + { + "epoch": 1.3397870280735722, + "grad_norm": 0.022038010880351067, + "learning_rate": 9.168598480395651e-05, + "loss": 0.0192, + "step": 173 + }, + { + "epoch": 1.3475314617618586, + "grad_norm": 0.022859683260321617, + "learning_rate": 9.153582666204701e-05, + "loss": 0.0246, + "step": 174 + }, + { + "epoch": 1.3552758954501452, + "grad_norm": 0.02186562865972519, + "learning_rate": 9.138444990784453e-05, + "loss": 0.0132, + "step": 175 + }, + { + "epoch": 1.3552758954501452, + "eval_loss": 0.01624121144413948, + "eval_runtime": 4.3851, + "eval_samples_per_second": 11.402, + "eval_steps_per_second": 2.965, + "step": 175 + }, + { + "epoch": 1.3630203291384317, + "grad_norm": 0.022015810012817383, + "learning_rate": 9.123185898256496e-05, + "loss": 0.0209, + "step": 176 + }, + { + "epoch": 1.3707647628267183, + "grad_norm": 0.022759562358260155, + "learning_rate": 9.107805836304658e-05, + "loss": 0.0215, + "step": 177 + }, + { + "epoch": 1.378509196515005, + "grad_norm": 0.0224290881305933, + "learning_rate": 9.092305256161859e-05, + "loss": 0.0137, + "step": 178 + }, + { + "epoch": 1.3862536302032913, + "grad_norm": 0.02067345194518566, + "learning_rate": 9.076684612596891e-05, + "loss": 0.0234, + "step": 179 + }, + { + "epoch": 1.3939980638915779, + "grad_norm": 0.0244379211217165, + "learning_rate": 9.060944363901056e-05, + "loss": 0.0226, + "step": 180 + }, + { + "epoch": 1.3939980638915779, + "eval_loss": 0.01581944338977337, + "eval_runtime": 4.39, + "eval_samples_per_second": 11.39, + "eval_steps_per_second": 2.961, + "step": 180 + }, + { + "epoch": 1.4017424975798645, + "grad_norm": 0.025064224377274513, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0137, + "step": 181 + }, + { + "epoch": 1.409486931268151, + "grad_norm": 0.022704744711518288, + "learning_rate": 9.029106901813839e-05, + "loss": 0.0224, + "step": 182 + }, + { + "epoch": 1.4172313649564376, + "grad_norm": 0.023529507219791412, + "learning_rate": 9.013010622496144e-05, + "loss": 0.0148, + "step": 183 + }, + { + "epoch": 1.424975798644724, + "grad_norm": 0.022351229563355446, + "learning_rate": 8.996796606167548e-05, + "loss": 0.0186, + "step": 184 + }, + { + "epoch": 1.4327202323330106, + "grad_norm": 0.02896580472588539, + "learning_rate": 8.980465328528219e-05, + "loss": 0.0173, + "step": 185 + }, + { + "epoch": 1.4327202323330106, + "eval_loss": 0.0159382913261652, + "eval_runtime": 4.3905, + "eval_samples_per_second": 11.388, + "eval_steps_per_second": 2.961, + "step": 185 + }, + { + "epoch": 1.4404646660212972, + "grad_norm": 0.022037331014871597, + "learning_rate": 8.96401726871863e-05, + "loss": 0.0116, + "step": 186 + }, + { + "epoch": 1.4482090997095838, + "grad_norm": 0.02559385821223259, + "learning_rate": 8.94745290930551e-05, + "loss": 0.023, + "step": 187 + }, + { + "epoch": 1.4559535333978704, + "grad_norm": 0.03725734353065491, + "learning_rate": 8.930772736267674e-05, + "loss": 0.0351, + "step": 188 + }, + { + "epoch": 1.4636979670861567, + "grad_norm": 0.021388601511716843, + "learning_rate": 8.913977238981778e-05, + "loss": 0.0169, + "step": 189 + }, + { + "epoch": 1.4714424007744433, + "grad_norm": 0.019340962171554565, + "learning_rate": 8.897066910207958e-05, + "loss": 0.0304, + "step": 190 + }, + { + "epoch": 1.4714424007744433, + "eval_loss": 0.016351182013750076, + "eval_runtime": 4.3963, + "eval_samples_per_second": 11.373, + "eval_steps_per_second": 2.957, + "step": 190 + }, + { + "epoch": 1.47918683446273, + "grad_norm": 0.02193869836628437, + "learning_rate": 8.880042246075365e-05, + "loss": 0.0219, + "step": 191 + }, + { + "epoch": 1.4869312681510165, + "grad_norm": 0.020674917846918106, + "learning_rate": 8.862903746067618e-05, + "loss": 0.017, + "step": 192 + }, + { + "epoch": 1.494675701839303, + "grad_norm": 0.01704789139330387, + "learning_rate": 8.845651913008145e-05, + "loss": 0.0121, + "step": 193 + }, + { + "epoch": 1.5024201355275895, + "grad_norm": 0.02329368144273758, + "learning_rate": 8.828287253045435e-05, + "loss": 0.0156, + "step": 194 + }, + { + "epoch": 1.510164569215876, + "grad_norm": 0.023794591426849365, + "learning_rate": 8.810810275638183e-05, + "loss": 0.0177, + "step": 195 + }, + { + "epoch": 1.510164569215876, + "eval_loss": 0.016085928305983543, + "eval_runtime": 4.391, + "eval_samples_per_second": 11.387, + "eval_steps_per_second": 2.961, + "step": 195 + }, + { + "epoch": 1.5179090029041626, + "grad_norm": 0.01901249960064888, + "learning_rate": 8.793221493540347e-05, + "loss": 0.0138, + "step": 196 + }, + { + "epoch": 1.5256534365924492, + "grad_norm": 0.020555458962917328, + "learning_rate": 8.775521422786104e-05, + "loss": 0.0154, + "step": 197 + }, + { + "epoch": 1.5333978702807358, + "grad_norm": 0.02266288548707962, + "learning_rate": 8.757710582674707e-05, + "loss": 0.0196, + "step": 198 + }, + { + "epoch": 1.5411423039690222, + "grad_norm": 0.016408788040280342, + "learning_rate": 8.739789495755253e-05, + "loss": 0.0147, + "step": 199 + }, + { + "epoch": 1.5488867376573088, + "grad_norm": 0.024890296161174774, + "learning_rate": 8.721758687811352e-05, + "loss": 0.0155, + "step": 200 + }, + { + "epoch": 1.5488867376573088, + "eval_loss": 0.016014162451028824, + "eval_runtime": 4.3722, + "eval_samples_per_second": 11.436, + "eval_steps_per_second": 2.973, + "step": 200 + }, + { + "epoch": 1.5566311713455954, + "grad_norm": 0.023205876350402832, + "learning_rate": 8.703618687845696e-05, + "loss": 0.0158, + "step": 201 + }, + { + "epoch": 1.5643756050338817, + "grad_norm": 0.023312438279390335, + "learning_rate": 8.685370028064546e-05, + "loss": 0.0249, + "step": 202 + }, + { + "epoch": 1.5721200387221685, + "grad_norm": 0.02218124084174633, + "learning_rate": 8.667013243862113e-05, + "loss": 0.0236, + "step": 203 + }, + { + "epoch": 1.579864472410455, + "grad_norm": 0.021021878346800804, + "learning_rate": 8.64854887380485e-05, + "loss": 0.0172, + "step": 204 + }, + { + "epoch": 1.5876089060987415, + "grad_norm": 0.022670872509479523, + "learning_rate": 8.629977459615655e-05, + "loss": 0.0258, + "step": 205 + }, + { + "epoch": 1.5876089060987415, + "eval_loss": 0.015911860391497612, + "eval_runtime": 4.3722, + "eval_samples_per_second": 11.436, + "eval_steps_per_second": 2.973, + "step": 205 + }, + { + "epoch": 1.595353339787028, + "grad_norm": 0.02052771858870983, + "learning_rate": 8.611299546157974e-05, + "loss": 0.0291, + "step": 206 + }, + { + "epoch": 1.6030977734753145, + "grad_norm": 0.023315995931625366, + "learning_rate": 8.592515681419813e-05, + "loss": 0.0258, + "step": 207 + }, + { + "epoch": 1.6108422071636013, + "grad_norm": 0.03357525169849396, + "learning_rate": 8.573626416497668e-05, + "loss": 0.0187, + "step": 208 + }, + { + "epoch": 1.6185866408518876, + "grad_norm": 0.021634763106703758, + "learning_rate": 8.554632305580354e-05, + "loss": 0.0212, + "step": 209 + }, + { + "epoch": 1.6263310745401742, + "grad_norm": 0.02398337796330452, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0217, + "step": 210 + }, + { + "epoch": 1.6263310745401742, + "eval_loss": 0.01627761498093605, + "eval_runtime": 4.39, + "eval_samples_per_second": 11.389, + "eval_steps_per_second": 2.961, + "step": 210 + }, + { + "epoch": 1.6340755082284608, + "grad_norm": 0.024927034974098206, + "learning_rate": 8.5163317778794e-05, + "loss": 0.0225, + "step": 211 + }, + { + "epoch": 1.6418199419167472, + "grad_norm": 0.022668635472655296, + "learning_rate": 8.497026484788189e-05, + "loss": 0.0261, + "step": 212 + }, + { + "epoch": 1.649564375605034, + "grad_norm": 0.020460564643144608, + "learning_rate": 8.477618593053693e-05, + "loss": 0.0273, + "step": 213 + }, + { + "epoch": 1.6573088092933204, + "grad_norm": 0.027241550385951996, + "learning_rate": 8.458108672080624e-05, + "loss": 0.0255, + "step": 214 + }, + { + "epoch": 1.665053242981607, + "grad_norm": 0.019933342933654785, + "learning_rate": 8.438497294267117e-05, + "loss": 0.0197, + "step": 215 + }, + { + "epoch": 1.665053242981607, + "eval_loss": 0.016057245433330536, + "eval_runtime": 4.3758, + "eval_samples_per_second": 11.427, + "eval_steps_per_second": 2.971, + "step": 215 + }, + { + "epoch": 1.6727976766698935, + "grad_norm": 0.016015920788049698, + "learning_rate": 8.418785034987921e-05, + "loss": 0.0148, + "step": 216 + }, + { + "epoch": 1.68054211035818, + "grad_norm": 0.020762229338288307, + "learning_rate": 8.39897247257754e-05, + "loss": 0.022, + "step": 217 + }, + { + "epoch": 1.6882865440464667, + "grad_norm": 0.0254424549639225, + "learning_rate": 8.379060188313244e-05, + "loss": 0.0286, + "step": 218 + }, + { + "epoch": 1.696030977734753, + "grad_norm": 0.01940356194972992, + "learning_rate": 8.359048766398031e-05, + "loss": 0.0171, + "step": 219 + }, + { + "epoch": 1.7037754114230397, + "grad_norm": 0.020493976771831512, + "learning_rate": 8.338938793943478e-05, + "loss": 0.0124, + "step": 220 + }, + { + "epoch": 1.7037754114230397, + "eval_loss": 0.015820881351828575, + "eval_runtime": 4.3798, + "eval_samples_per_second": 11.416, + "eval_steps_per_second": 2.968, + "step": 220 + }, + { + "epoch": 1.7115198451113263, + "grad_norm": 0.0217495858669281, + "learning_rate": 8.318730860952522e-05, + "loss": 0.0229, + "step": 221 + }, + { + "epoch": 1.7192642787996126, + "grad_norm": 0.021223610267043114, + "learning_rate": 8.298425560302146e-05, + "loss": 0.0233, + "step": 222 + }, + { + "epoch": 1.7270087124878994, + "grad_norm": 0.02781669795513153, + "learning_rate": 8.278023487725982e-05, + "loss": 0.032, + "step": 223 + }, + { + "epoch": 1.7347531461761858, + "grad_norm": 0.02280505932867527, + "learning_rate": 8.257525241796838e-05, + "loss": 0.0162, + "step": 224 + }, + { + "epoch": 1.7424975798644724, + "grad_norm": 0.021283939480781555, + "learning_rate": 8.236931423909138e-05, + "loss": 0.0248, + "step": 225 + }, + { + "epoch": 1.7424975798644724, + "eval_loss": 0.01560266874730587, + "eval_runtime": 4.3796, + "eval_samples_per_second": 11.416, + "eval_steps_per_second": 2.968, + "step": 225 + }, + { + "epoch": 1.750242013552759, + "grad_norm": 0.025019438937306404, + "learning_rate": 8.216242638261276e-05, + "loss": 0.0261, + "step": 226 + }, + { + "epoch": 1.7579864472410454, + "grad_norm": 0.020630711689591408, + "learning_rate": 8.19545949183788e-05, + "loss": 0.0171, + "step": 227 + }, + { + "epoch": 1.7657308809293322, + "grad_norm": 0.022184062749147415, + "learning_rate": 8.17458259439202e-05, + "loss": 0.0276, + "step": 228 + }, + { + "epoch": 1.7734753146176185, + "grad_norm": 0.018736379221081734, + "learning_rate": 8.153612558427311e-05, + "loss": 0.0199, + "step": 229 + }, + { + "epoch": 1.7812197483059051, + "grad_norm": 0.02617255039513111, + "learning_rate": 8.132549999179933e-05, + "loss": 0.017, + "step": 230 + }, + { + "epoch": 1.7812197483059051, + "eval_loss": 0.015948235988616943, + "eval_runtime": 4.3895, + "eval_samples_per_second": 11.391, + "eval_steps_per_second": 2.962, + "step": 230 + }, + { + "epoch": 1.7889641819941917, + "grad_norm": 0.024188663810491562, + "learning_rate": 8.111395534600603e-05, + "loss": 0.0184, + "step": 231 + }, + { + "epoch": 1.796708615682478, + "grad_norm": 0.020861351862549782, + "learning_rate": 8.090149785336425e-05, + "loss": 0.0185, + "step": 232 + }, + { + "epoch": 1.804453049370765, + "grad_norm": 0.02357521653175354, + "learning_rate": 8.068813374712688e-05, + "loss": 0.031, + "step": 233 + }, + { + "epoch": 1.8121974830590513, + "grad_norm": 0.022738052532076836, + "learning_rate": 8.047386928714582e-05, + "loss": 0.0136, + "step": 234 + }, + { + "epoch": 1.8199419167473379, + "grad_norm": 0.02491193450987339, + "learning_rate": 8.025871075968828e-05, + "loss": 0.0248, + "step": 235 + }, + { + "epoch": 1.8199419167473379, + "eval_loss": 0.015809817239642143, + "eval_runtime": 4.3863, + "eval_samples_per_second": 11.399, + "eval_steps_per_second": 2.964, + "step": 235 + }, + { + "epoch": 1.8276863504356244, + "grad_norm": 0.021618014201521873, + "learning_rate": 8.00426644772523e-05, + "loss": 0.0184, + "step": 236 + }, + { + "epoch": 1.8354307841239108, + "grad_norm": 0.026012565940618515, + "learning_rate": 7.982573677838172e-05, + "loss": 0.0136, + "step": 237 + }, + { + "epoch": 1.8431752178121976, + "grad_norm": 0.023996589705348015, + "learning_rate": 7.960793402748002e-05, + "loss": 0.0288, + "step": 238 + }, + { + "epoch": 1.850919651500484, + "grad_norm": 0.025611311197280884, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0254, + "step": 239 + }, + { + "epoch": 1.8586640851887706, + "grad_norm": 0.01994282752275467, + "learning_rate": 7.916972895537471e-05, + "loss": 0.0189, + "step": 240 + }, + { + "epoch": 1.8586640851887706, + "eval_loss": 0.015514240600168705, + "eval_runtime": 4.3816, + "eval_samples_per_second": 11.411, + "eval_steps_per_second": 2.967, + "step": 240 + }, + { + "epoch": 1.8664085188770572, + "grad_norm": 0.02635214664041996, + "learning_rate": 7.894933949059245e-05, + "loss": 0.0268, + "step": 241 + }, + { + "epoch": 1.8741529525653435, + "grad_norm": 0.022437842562794685, + "learning_rate": 7.872810068624451e-05, + "loss": 0.0158, + "step": 242 + }, + { + "epoch": 1.8818973862536303, + "grad_norm": 0.022370561957359314, + "learning_rate": 7.850601903321716e-05, + "loss": 0.0188, + "step": 243 + }, + { + "epoch": 1.8896418199419167, + "grad_norm": 0.027548542246222496, + "learning_rate": 7.828310104712489e-05, + "loss": 0.0164, + "step": 244 + }, + { + "epoch": 1.8973862536302033, + "grad_norm": 0.02081076055765152, + "learning_rate": 7.805935326811912e-05, + "loss": 0.0185, + "step": 245 + }, + { + "epoch": 1.8973862536302033, + "eval_loss": 0.015138417482376099, + "eval_runtime": 4.386, + "eval_samples_per_second": 11.4, + "eval_steps_per_second": 2.964, + "step": 245 + }, + { + "epoch": 1.90513068731849, + "grad_norm": 0.01958652399480343, + "learning_rate": 7.783478226069651e-05, + "loss": 0.0164, + "step": 246 + }, + { + "epoch": 1.9128751210067763, + "grad_norm": 0.02671448513865471, + "learning_rate": 7.760939461350623e-05, + "loss": 0.018, + "step": 247 + }, + { + "epoch": 1.920619554695063, + "grad_norm": 0.02252519316971302, + "learning_rate": 7.738319693915672e-05, + "loss": 0.0248, + "step": 248 + }, + { + "epoch": 1.9283639883833494, + "grad_norm": 0.023020565509796143, + "learning_rate": 7.715619587402164e-05, + "loss": 0.0244, + "step": 249 + }, + { + "epoch": 1.936108422071636, + "grad_norm": 0.021742597222328186, + "learning_rate": 7.692839807804521e-05, + "loss": 0.0154, + "step": 250 + }, + { + "epoch": 1.936108422071636, + "eval_loss": 0.015089833177626133, + "eval_runtime": 4.3844, + "eval_samples_per_second": 11.404, + "eval_steps_per_second": 2.965, + "step": 250 + }, + { + "epoch": 1.9438528557599226, + "grad_norm": 0.025942670181393623, + "learning_rate": 7.669981023454682e-05, + "loss": 0.0317, + "step": 251 + }, + { + "epoch": 1.951597289448209, + "grad_norm": 0.021227659657597542, + "learning_rate": 7.647043905002484e-05, + "loss": 0.0187, + "step": 252 + }, + { + "epoch": 1.9593417231364958, + "grad_norm": 0.020006388425827026, + "learning_rate": 7.624029125396004e-05, + "loss": 0.0184, + "step": 253 + }, + { + "epoch": 1.9670861568247822, + "grad_norm": 0.024099906906485558, + "learning_rate": 7.6009373598618e-05, + "loss": 0.0182, + "step": 254 + }, + { + "epoch": 1.9748305905130688, + "grad_norm": 0.025464504957199097, + "learning_rate": 7.577769285885109e-05, + "loss": 0.0223, + "step": 255 + }, + { + "epoch": 1.9748305905130688, + "eval_loss": 0.015248560346662998, + "eval_runtime": 4.3709, + "eval_samples_per_second": 11.439, + "eval_steps_per_second": 2.974, + "step": 255 + }, + { + "epoch": 1.9825750242013553, + "grad_norm": 0.025076473131775856, + "learning_rate": 7.554525583189969e-05, + "loss": 0.0188, + "step": 256 + }, + { + "epoch": 1.9903194578896417, + "grad_norm": 0.016828592866659164, + "learning_rate": 7.53120693371927e-05, + "loss": 0.0182, + "step": 257 + }, + { + "epoch": 1.9980638915779285, + "grad_norm": 0.02599474973976612, + "learning_rate": 7.507814021614761e-05, + "loss": 0.0282, + "step": 258 + }, + { + "epoch": 2.005808325266215, + "grad_norm": 0.040355827659368515, + "learning_rate": 7.484347533196961e-05, + "loss": 0.0184, + "step": 259 + }, + { + "epoch": 2.0135527589545013, + "grad_norm": 0.01993139646947384, + "learning_rate": 7.460808156945036e-05, + "loss": 0.0161, + "step": 260 + }, + { + "epoch": 2.0135527589545013, + "eval_loss": 0.01520194485783577, + "eval_runtime": 4.405, + "eval_samples_per_second": 11.351, + "eval_steps_per_second": 2.951, + "step": 260 + }, + { + "epoch": 2.021297192642788, + "grad_norm": 0.021229611709713936, + "learning_rate": 7.437196583476596e-05, + "loss": 0.0161, + "step": 261 + }, + { + "epoch": 2.0290416263310744, + "grad_norm": 0.0274257343262434, + "learning_rate": 7.413513505527429e-05, + "loss": 0.0223, + "step": 262 + }, + { + "epoch": 2.0367860600193612, + "grad_norm": 0.020992042496800423, + "learning_rate": 7.389759617931182e-05, + "loss": 0.0109, + "step": 263 + }, + { + "epoch": 2.0445304937076476, + "grad_norm": 0.021474428474903107, + "learning_rate": 7.365935617598975e-05, + "loss": 0.0101, + "step": 264 + }, + { + "epoch": 2.052274927395934, + "grad_norm": 0.021412339061498642, + "learning_rate": 7.342042203498951e-05, + "loss": 0.0139, + "step": 265 + }, + { + "epoch": 2.052274927395934, + "eval_loss": 0.015354767441749573, + "eval_runtime": 4.3913, + "eval_samples_per_second": 11.386, + "eval_steps_per_second": 2.96, + "step": 265 + }, + { + "epoch": 2.060019361084221, + "grad_norm": 0.026226527988910675, + "learning_rate": 7.318080076635772e-05, + "loss": 0.013, + "step": 266 + }, + { + "epoch": 2.067763794772507, + "grad_norm": 0.018183927983045578, + "learning_rate": 7.294049940030055e-05, + "loss": 0.0088, + "step": 267 + }, + { + "epoch": 2.075508228460794, + "grad_norm": 0.019244108349084854, + "learning_rate": 7.269952498697734e-05, + "loss": 0.0131, + "step": 268 + }, + { + "epoch": 2.0832526621490803, + "grad_norm": 0.027739770710468292, + "learning_rate": 7.245788459629396e-05, + "loss": 0.0191, + "step": 269 + }, + { + "epoch": 2.0909970958373667, + "grad_norm": 0.022437987849116325, + "learning_rate": 7.221558531769519e-05, + "loss": 0.0173, + "step": 270 + }, + { + "epoch": 2.0909970958373667, + "eval_loss": 0.015298700891435146, + "eval_runtime": 4.3814, + "eval_samples_per_second": 11.412, + "eval_steps_per_second": 2.967, + "step": 270 + }, + { + "epoch": 2.0987415295256535, + "grad_norm": 0.02765963226556778, + "learning_rate": 7.197263425995682e-05, + "loss": 0.0192, + "step": 271 + }, + { + "epoch": 2.10648596321394, + "grad_norm": 0.022411804646253586, + "learning_rate": 7.172903855097711e-05, + "loss": 0.0207, + "step": 272 + }, + { + "epoch": 2.1142303969022267, + "grad_norm": 0.017790112644433975, + "learning_rate": 7.14848053375676e-05, + "loss": 0.0229, + "step": 273 + }, + { + "epoch": 2.121974830590513, + "grad_norm": 0.019638855010271072, + "learning_rate": 7.123994178524345e-05, + "loss": 0.0189, + "step": 274 + }, + { + "epoch": 2.1297192642787994, + "grad_norm": 0.022188464179635048, + "learning_rate": 7.099445507801323e-05, + "loss": 0.0237, + "step": 275 + }, + { + "epoch": 2.1297192642787994, + "eval_loss": 0.015217526815831661, + "eval_runtime": 4.3816, + "eval_samples_per_second": 11.411, + "eval_steps_per_second": 2.967, + "step": 275 + }, + { + "epoch": 2.1374636979670862, + "grad_norm": 0.02580423839390278, + "learning_rate": 7.074835241816817e-05, + "loss": 0.0271, + "step": 276 + }, + { + "epoch": 2.1452081316553726, + "grad_norm": 0.022569693624973297, + "learning_rate": 7.05016410260708e-05, + "loss": 0.0161, + "step": 277 + }, + { + "epoch": 2.1529525653436594, + "grad_norm": 0.023885734379291534, + "learning_rate": 7.025432813994315e-05, + "loss": 0.0182, + "step": 278 + }, + { + "epoch": 2.160696999031946, + "grad_norm": 0.024710629135370255, + "learning_rate": 7.000642101565434e-05, + "loss": 0.0105, + "step": 279 + }, + { + "epoch": 2.168441432720232, + "grad_norm": 0.023105064406991005, + "learning_rate": 6.975792692650777e-05, + "loss": 0.0167, + "step": 280 + }, + { + "epoch": 2.168441432720232, + "eval_loss": 0.01506556011736393, + "eval_runtime": 4.375, + "eval_samples_per_second": 11.428, + "eval_steps_per_second": 2.971, + "step": 280 + }, + { + "epoch": 2.176185866408519, + "grad_norm": 0.022073717787861824, + "learning_rate": 6.950885316302773e-05, + "loss": 0.0191, + "step": 281 + }, + { + "epoch": 2.1839303000968053, + "grad_norm": 0.01825207658112049, + "learning_rate": 6.925920703274541e-05, + "loss": 0.0151, + "step": 282 + }, + { + "epoch": 2.191674733785092, + "grad_norm": 0.03273662552237511, + "learning_rate": 6.90089958599846e-05, + "loss": 0.0266, + "step": 283 + }, + { + "epoch": 2.1994191674733785, + "grad_norm": 0.022013463079929352, + "learning_rate": 6.875822698564679e-05, + "loss": 0.0228, + "step": 284 + }, + { + "epoch": 2.207163601161665, + "grad_norm": 0.016240620985627174, + "learning_rate": 6.850690776699573e-05, + "loss": 0.0086, + "step": 285 + }, + { + "epoch": 2.207163601161665, + "eval_loss": 0.014874408952891827, + "eval_runtime": 4.403, + "eval_samples_per_second": 11.356, + "eval_steps_per_second": 2.953, + "step": 285 + }, + { + "epoch": 2.2149080348499517, + "grad_norm": 0.025109486654400826, + "learning_rate": 6.825504557744167e-05, + "loss": 0.0234, + "step": 286 + }, + { + "epoch": 2.222652468538238, + "grad_norm": 0.02156895585358143, + "learning_rate": 6.800264780632494e-05, + "loss": 0.0094, + "step": 287 + }, + { + "epoch": 2.230396902226525, + "grad_norm": 0.024725494906306267, + "learning_rate": 6.774972185869927e-05, + "loss": 0.0165, + "step": 288 + }, + { + "epoch": 2.2381413359148112, + "grad_norm": 0.022492345422506332, + "learning_rate": 6.749627515511442e-05, + "loss": 0.0196, + "step": 289 + }, + { + "epoch": 2.2458857696030976, + "grad_norm": 0.020706169307231903, + "learning_rate": 6.724231513139852e-05, + "loss": 0.012, + "step": 290 + }, + { + "epoch": 2.2458857696030976, + "eval_loss": 0.01468344684690237, + "eval_runtime": 4.3794, + "eval_samples_per_second": 11.417, + "eval_steps_per_second": 2.968, + "step": 290 + }, + { + "epoch": 2.2536302032913844, + "grad_norm": 0.02125599980354309, + "learning_rate": 6.698784923843992e-05, + "loss": 0.0173, + "step": 291 + }, + { + "epoch": 2.261374636979671, + "grad_norm": 0.029972407966852188, + "learning_rate": 6.673288494196858e-05, + "loss": 0.0225, + "step": 292 + }, + { + "epoch": 2.2691190706679576, + "grad_norm": 0.02421470358967781, + "learning_rate": 6.647742972233703e-05, + "loss": 0.0211, + "step": 293 + }, + { + "epoch": 2.276863504356244, + "grad_norm": 0.02178541198372841, + "learning_rate": 6.622149107430088e-05, + "loss": 0.0113, + "step": 294 + }, + { + "epoch": 2.2846079380445303, + "grad_norm": 0.023280832916498184, + "learning_rate": 6.5965076506799e-05, + "loss": 0.015, + "step": 295 + }, + { + "epoch": 2.2846079380445303, + "eval_loss": 0.014939627610147, + "eval_runtime": 4.424, + "eval_samples_per_second": 11.302, + "eval_steps_per_second": 2.938, + "step": 295 + }, + { + "epoch": 2.292352371732817, + "grad_norm": 0.026383766904473305, + "learning_rate": 6.570819354273317e-05, + "loss": 0.0208, + "step": 296 + }, + { + "epoch": 2.3000968054211035, + "grad_norm": 0.028219886124134064, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0234, + "step": 297 + }, + { + "epoch": 2.3078412391093903, + "grad_norm": 0.025900105014443398, + "learning_rate": 6.519305258500666e-05, + "loss": 0.01, + "step": 298 + }, + { + "epoch": 2.3155856727976767, + "grad_norm": 0.033304035663604736, + "learning_rate": 6.493480970497569e-05, + "loss": 0.0291, + "step": 299 + }, + { + "epoch": 2.323330106485963, + "grad_norm": 0.01692046783864498, + "learning_rate": 6.467612865519674e-05, + "loss": 0.0165, + "step": 300 + }, + { + "epoch": 2.323330106485963, + "eval_loss": 0.015093058347702026, + "eval_runtime": 4.382, + "eval_samples_per_second": 11.41, + "eval_steps_per_second": 2.967, + "step": 300 + }, + { + "epoch": 2.33107454017425, + "grad_norm": 0.021342158317565918, + "learning_rate": 6.441701702506754e-05, + "loss": 0.0193, + "step": 301 + }, + { + "epoch": 2.3388189738625362, + "grad_norm": 0.01914130710065365, + "learning_rate": 6.415748241661851e-05, + "loss": 0.017, + "step": 302 + }, + { + "epoch": 2.346563407550823, + "grad_norm": 0.03137556463479996, + "learning_rate": 6.389753244428972e-05, + "loss": 0.0239, + "step": 303 + }, + { + "epoch": 2.3543078412391094, + "grad_norm": 0.02187853306531906, + "learning_rate": 6.363717473470759e-05, + "loss": 0.0162, + "step": 304 + }, + { + "epoch": 2.362052274927396, + "grad_norm": 0.024960605427622795, + "learning_rate": 6.337641692646106e-05, + "loss": 0.0183, + "step": 305 + }, + { + "epoch": 2.362052274927396, + "eval_loss": 0.015014478005468845, + "eval_runtime": 4.394, + "eval_samples_per_second": 11.379, + "eval_steps_per_second": 2.959, + "step": 305 + }, + { + "epoch": 2.3697967086156826, + "grad_norm": 0.022767778486013412, + "learning_rate": 6.311526666987743e-05, + "loss": 0.0135, + "step": 306 + }, + { + "epoch": 2.377541142303969, + "grad_norm": 0.031544558703899384, + "learning_rate": 6.285373162679803e-05, + "loss": 0.0314, + "step": 307 + }, + { + "epoch": 2.3852855759922553, + "grad_norm": 0.022678814828395844, + "learning_rate": 6.259181947035342e-05, + "loss": 0.0127, + "step": 308 + }, + { + "epoch": 2.393030009680542, + "grad_norm": 0.024432960897684097, + "learning_rate": 6.232953788473811e-05, + "loss": 0.0113, + "step": 309 + }, + { + "epoch": 2.4007744433688285, + "grad_norm": 0.031243745237588882, + "learning_rate": 6.206689456498529e-05, + "loss": 0.0233, + "step": 310 + }, + { + "epoch": 2.4007744433688285, + "eval_loss": 0.015055688098073006, + "eval_runtime": 4.3853, + "eval_samples_per_second": 11.402, + "eval_steps_per_second": 2.964, + "step": 310 + }, + { + "epoch": 2.4085188770571153, + "grad_norm": 0.02543908730149269, + "learning_rate": 6.1803897216741e-05, + "loss": 0.0167, + "step": 311 + }, + { + "epoch": 2.4162633107454017, + "grad_norm": 0.03086618147790432, + "learning_rate": 6.154055355603807e-05, + "loss": 0.0167, + "step": 312 + }, + { + "epoch": 2.4240077444336885, + "grad_norm": 0.027871888130903244, + "learning_rate": 6.127687130906972e-05, + "loss": 0.0142, + "step": 313 + }, + { + "epoch": 2.431752178121975, + "grad_norm": 0.02505462057888508, + "learning_rate": 6.101285821196285e-05, + "loss": 0.0172, + "step": 314 + }, + { + "epoch": 2.4394966118102612, + "grad_norm": 0.02217499166727066, + "learning_rate": 6.0748522010551215e-05, + "loss": 0.0163, + "step": 315 + }, + { + "epoch": 2.4394966118102612, + "eval_loss": 0.014902649447321892, + "eval_runtime": 4.4117, + "eval_samples_per_second": 11.334, + "eval_steps_per_second": 2.947, + "step": 315 + }, + { + "epoch": 2.447241045498548, + "grad_norm": 0.0311166662722826, + "learning_rate": 6.048387046014795e-05, + "loss": 0.0216, + "step": 316 + }, + { + "epoch": 2.4549854791868344, + "grad_norm": 0.023563671857118607, + "learning_rate": 6.021891132531825e-05, + "loss": 0.0163, + "step": 317 + }, + { + "epoch": 2.4627299128751208, + "grad_norm": 0.022931981831789017, + "learning_rate": 5.995365237965144e-05, + "loss": 0.0293, + "step": 318 + }, + { + "epoch": 2.4704743465634076, + "grad_norm": 0.022824544459581375, + "learning_rate": 5.9688101405532925e-05, + "loss": 0.0212, + "step": 319 + }, + { + "epoch": 2.478218780251694, + "grad_norm": 0.018992481753230095, + "learning_rate": 5.9422266193915924e-05, + "loss": 0.0121, + "step": 320 + }, + { + "epoch": 2.478218780251694, + "eval_loss": 0.014721118845045567, + "eval_runtime": 4.4027, + "eval_samples_per_second": 11.357, + "eval_steps_per_second": 2.953, + "step": 320 + }, + { + "epoch": 2.4859632139399808, + "grad_norm": 0.02364177815616131, + "learning_rate": 5.9156154544092815e-05, + "loss": 0.0214, + "step": 321 + }, + { + "epoch": 2.493707647628267, + "grad_norm": 0.019365180283784866, + "learning_rate": 5.8889774263466355e-05, + "loss": 0.0135, + "step": 322 + }, + { + "epoch": 2.501452081316554, + "grad_norm": 0.025308910757303238, + "learning_rate": 5.862313316732063e-05, + "loss": 0.0162, + "step": 323 + }, + { + "epoch": 2.5091965150048403, + "grad_norm": 0.019596999511122704, + "learning_rate": 5.8356239078591724e-05, + "loss": 0.0118, + "step": 324 + }, + { + "epoch": 2.5169409486931267, + "grad_norm": 0.029731974005699158, + "learning_rate": 5.808909982763825e-05, + "loss": 0.0213, + "step": 325 + }, + { + "epoch": 2.5169409486931267, + "eval_loss": 0.014542792923748493, + "eval_runtime": 4.3988, + "eval_samples_per_second": 11.367, + "eval_steps_per_second": 2.955, + "step": 325 + }, + { + "epoch": 2.5246853823814135, + "grad_norm": 0.021189380437135696, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0122, + "step": 326 + }, + { + "epoch": 2.5324298160697, + "grad_norm": 0.030742768198251724, + "learning_rate": 5.7554117196225846e-05, + "loss": 0.0271, + "step": 327 + }, + { + "epoch": 2.5401742497579862, + "grad_norm": 0.01944803074002266, + "learning_rate": 5.728628951152799e-05, + "loss": 0.0171, + "step": 328 + }, + { + "epoch": 2.547918683446273, + "grad_norm": 0.021265676245093346, + "learning_rate": 5.701824805566722e-05, + "loss": 0.0178, + "step": 329 + }, + { + "epoch": 2.5556631171345594, + "grad_norm": 0.02614085003733635, + "learning_rate": 5.675000069266451e-05, + "loss": 0.0253, + "step": 330 + }, + { + "epoch": 2.5556631171345594, + "eval_loss": 0.014535368420183659, + "eval_runtime": 4.4134, + "eval_samples_per_second": 11.329, + "eval_steps_per_second": 2.946, + "step": 330 + }, + { + "epoch": 2.563407550822846, + "grad_norm": 0.021601444110274315, + "learning_rate": 5.6481555292581946e-05, + "loss": 0.0115, + "step": 331 + }, + { + "epoch": 2.5711519845111326, + "grad_norm": 0.02360411174595356, + "learning_rate": 5.621291973129177e-05, + "loss": 0.0149, + "step": 332 + }, + { + "epoch": 2.5788964181994194, + "grad_norm": 0.024015702307224274, + "learning_rate": 5.5944101890245324e-05, + "loss": 0.0197, + "step": 333 + }, + { + "epoch": 2.5866408518877058, + "grad_norm": 0.025345394387841225, + "learning_rate": 5.5675109656241876e-05, + "loss": 0.0116, + "step": 334 + }, + { + "epoch": 2.594385285575992, + "grad_norm": 0.03014366328716278, + "learning_rate": 5.540595092119709e-05, + "loss": 0.023, + "step": 335 + }, + { + "epoch": 2.594385285575992, + "eval_loss": 0.014927403070032597, + "eval_runtime": 4.3882, + "eval_samples_per_second": 11.394, + "eval_steps_per_second": 2.962, + "step": 335 + }, + { + "epoch": 2.602129719264279, + "grad_norm": 0.028806153684854507, + "learning_rate": 5.5136633581911655e-05, + "loss": 0.0245, + "step": 336 + }, + { + "epoch": 2.6098741529525653, + "grad_norm": 0.028237810358405113, + "learning_rate": 5.486716553983951e-05, + "loss": 0.017, + "step": 337 + }, + { + "epoch": 2.6176185866408517, + "grad_norm": 0.020786168053746223, + "learning_rate": 5.4597554700855946e-05, + "loss": 0.0103, + "step": 338 + }, + { + "epoch": 2.6253630203291385, + "grad_norm": 0.024947011843323708, + "learning_rate": 5.432780897502589e-05, + "loss": 0.0182, + "step": 339 + }, + { + "epoch": 2.633107454017425, + "grad_norm": 0.02584216557443142, + "learning_rate": 5.4057936276371565e-05, + "loss": 0.014, + "step": 340 + }, + { + "epoch": 2.633107454017425, + "eval_loss": 0.014420481398701668, + "eval_runtime": 4.3828, + "eval_samples_per_second": 11.408, + "eval_steps_per_second": 2.966, + "step": 340 + }, + { + "epoch": 2.6408518877057117, + "grad_norm": 0.02316311001777649, + "learning_rate": 5.378794452264053e-05, + "loss": 0.0132, + "step": 341 + }, + { + "epoch": 2.648596321393998, + "grad_norm": 0.02283734641969204, + "learning_rate": 5.351784163507319e-05, + "loss": 0.0144, + "step": 342 + }, + { + "epoch": 2.656340755082285, + "grad_norm": 0.02243635058403015, + "learning_rate": 5.324763553817054e-05, + "loss": 0.0173, + "step": 343 + }, + { + "epoch": 2.664085188770571, + "grad_norm": 0.028348112478852272, + "learning_rate": 5.2977334159461614e-05, + "loss": 0.0222, + "step": 344 + }, + { + "epoch": 2.6718296224588576, + "grad_norm": 0.021189652383327484, + "learning_rate": 5.270694542927088e-05, + "loss": 0.0156, + "step": 345 + }, + { + "epoch": 2.6718296224588576, + "eval_loss": 0.014493023045361042, + "eval_runtime": 4.3854, + "eval_samples_per_second": 11.402, + "eval_steps_per_second": 2.964, + "step": 345 + }, + { + "epoch": 2.6795740561471444, + "grad_norm": 0.02965528517961502, + "learning_rate": 5.2436477280485605e-05, + "loss": 0.018, + "step": 346 + }, + { + "epoch": 2.6873184898354308, + "grad_norm": 0.026447108015418053, + "learning_rate": 5.216593764832311e-05, + "loss": 0.0227, + "step": 347 + }, + { + "epoch": 2.695062923523717, + "grad_norm": 0.02978765405714512, + "learning_rate": 5.189533447009794e-05, + "loss": 0.0234, + "step": 348 + }, + { + "epoch": 2.702807357212004, + "grad_norm": 0.02861112542450428, + "learning_rate": 5.162467568498903e-05, + "loss": 0.0177, + "step": 349 + }, + { + "epoch": 2.7105517909002903, + "grad_norm": 0.020247234031558037, + "learning_rate": 5.135396923380673e-05, + "loss": 0.0164, + "step": 350 + }, + { + "epoch": 2.7105517909002903, + "eval_loss": 0.014293421059846878, + "eval_runtime": 4.3961, + "eval_samples_per_second": 11.374, + "eval_steps_per_second": 2.957, + "step": 350 + }, + { + "epoch": 2.718296224588577, + "grad_norm": 0.02749788947403431, + "learning_rate": 5.108322305875988e-05, + "loss": 0.0129, + "step": 351 + }, + { + "epoch": 2.7260406582768635, + "grad_norm": 0.02441268227994442, + "learning_rate": 5.081244510322274e-05, + "loss": 0.0154, + "step": 352 + }, + { + "epoch": 2.7337850919651503, + "grad_norm": 0.0233867596834898, + "learning_rate": 5.0541643311502e-05, + "loss": 0.0138, + "step": 353 + }, + { + "epoch": 2.7415295256534367, + "grad_norm": 0.024917351081967354, + "learning_rate": 5.027082562860368e-05, + "loss": 0.0148, + "step": 354 + }, + { + "epoch": 2.749273959341723, + "grad_norm": 0.029262401163578033, + "learning_rate": 5e-05, + "loss": 0.0262, + "step": 355 + }, + { + "epoch": 2.749273959341723, + "eval_loss": 0.01397681050002575, + "eval_runtime": 4.3833, + "eval_samples_per_second": 11.407, + "eval_steps_per_second": 2.966, + "step": 355 + }, + { + "epoch": 2.75701839303001, + "grad_norm": 0.020170222967863083, + "learning_rate": 4.9729174371396334e-05, + "loss": 0.0116, + "step": 356 + }, + { + "epoch": 2.764762826718296, + "grad_norm": 0.020452341064810753, + "learning_rate": 4.945835668849801e-05, + "loss": 0.0124, + "step": 357 + }, + { + "epoch": 2.7725072604065826, + "grad_norm": 0.0310356542468071, + "learning_rate": 4.9187554896777285e-05, + "loss": 0.0168, + "step": 358 + }, + { + "epoch": 2.7802516940948694, + "grad_norm": 0.024840321391820908, + "learning_rate": 4.8916776941240135e-05, + "loss": 0.0153, + "step": 359 + }, + { + "epoch": 2.7879961277831558, + "grad_norm": 0.020108085125684738, + "learning_rate": 4.8646030766193285e-05, + "loss": 0.0134, + "step": 360 + }, + { + "epoch": 2.7879961277831558, + "eval_loss": 0.014164156280457973, + "eval_runtime": 4.391, + "eval_samples_per_second": 11.387, + "eval_steps_per_second": 2.961, + "step": 360 + }, + { + "epoch": 2.7957405614714426, + "grad_norm": 0.020801270380616188, + "learning_rate": 4.837532431501098e-05, + "loss": 0.0194, + "step": 361 + }, + { + "epoch": 2.803484995159729, + "grad_norm": 0.024626409634947777, + "learning_rate": 4.8104665529902075e-05, + "loss": 0.0152, + "step": 362 + }, + { + "epoch": 2.8112294288480157, + "grad_norm": 0.02485991269350052, + "learning_rate": 4.78340623516769e-05, + "loss": 0.0156, + "step": 363 + }, + { + "epoch": 2.818973862536302, + "grad_norm": 0.02390717901289463, + "learning_rate": 4.756352271951441e-05, + "loss": 0.0208, + "step": 364 + }, + { + "epoch": 2.8267182962245885, + "grad_norm": 0.021397821605205536, + "learning_rate": 4.729305457072913e-05, + "loss": 0.018, + "step": 365 + }, + { + "epoch": 2.8267182962245885, + "eval_loss": 0.014422168955206871, + "eval_runtime": 4.4089, + "eval_samples_per_second": 11.341, + "eval_steps_per_second": 2.949, + "step": 365 + }, + { + "epoch": 2.8344627299128753, + "grad_norm": 0.023740559816360474, + "learning_rate": 4.70226658405384e-05, + "loss": 0.0139, + "step": 366 + }, + { + "epoch": 2.8422071636011617, + "grad_norm": 0.025647273287177086, + "learning_rate": 4.675236446182946e-05, + "loss": 0.0162, + "step": 367 + }, + { + "epoch": 2.849951597289448, + "grad_norm": 0.030185095965862274, + "learning_rate": 4.648215836492682e-05, + "loss": 0.0153, + "step": 368 + }, + { + "epoch": 2.857696030977735, + "grad_norm": 0.02621537074446678, + "learning_rate": 4.6212055477359486e-05, + "loss": 0.0175, + "step": 369 + }, + { + "epoch": 2.865440464666021, + "grad_norm": 0.02325296215713024, + "learning_rate": 4.594206372362845e-05, + "loss": 0.0166, + "step": 370 + }, + { + "epoch": 2.865440464666021, + "eval_loss": 0.01447114534676075, + "eval_runtime": 4.3858, + "eval_samples_per_second": 11.4, + "eval_steps_per_second": 2.964, + "step": 370 + }, + { + "epoch": 2.8731848983543076, + "grad_norm": 0.022154103964567184, + "learning_rate": 4.567219102497412e-05, + "loss": 0.0135, + "step": 371 + }, + { + "epoch": 2.8809293320425944, + "grad_norm": 0.025845184922218323, + "learning_rate": 4.540244529914406e-05, + "loss": 0.0123, + "step": 372 + }, + { + "epoch": 2.888673765730881, + "grad_norm": 0.029730072245001793, + "learning_rate": 4.5132834460160524e-05, + "loss": 0.017, + "step": 373 + }, + { + "epoch": 2.8964181994191676, + "grad_norm": 0.024403782561421394, + "learning_rate": 4.486336641808835e-05, + "loss": 0.0115, + "step": 374 + }, + { + "epoch": 2.904162633107454, + "grad_norm": 0.023468228057026863, + "learning_rate": 4.4594049078802925e-05, + "loss": 0.0204, + "step": 375 + }, + { + "epoch": 2.904162633107454, + "eval_loss": 0.01408495381474495, + "eval_runtime": 4.3803, + "eval_samples_per_second": 11.415, + "eval_steps_per_second": 2.968, + "step": 375 + }, + { + "epoch": 2.9119070667957407, + "grad_norm": 0.022963017225265503, + "learning_rate": 4.4324890343758136e-05, + "loss": 0.0099, + "step": 376 + }, + { + "epoch": 2.919651500484027, + "grad_norm": 0.020131012424826622, + "learning_rate": 4.405589810975468e-05, + "loss": 0.0086, + "step": 377 + }, + { + "epoch": 2.9273959341723135, + "grad_norm": 0.023492030799388885, + "learning_rate": 4.3787080268708244e-05, + "loss": 0.0124, + "step": 378 + }, + { + "epoch": 2.9351403678606003, + "grad_norm": 0.025374602526426315, + "learning_rate": 4.351844470741808e-05, + "loss": 0.0092, + "step": 379 + }, + { + "epoch": 2.9428848015488867, + "grad_norm": 0.03317565843462944, + "learning_rate": 4.3249999307335495e-05, + "loss": 0.0284, + "step": 380 + }, + { + "epoch": 2.9428848015488867, + "eval_loss": 0.013897891156375408, + "eval_runtime": 4.3747, + "eval_samples_per_second": 11.429, + "eval_steps_per_second": 2.972, + "step": 380 + }, + { + "epoch": 2.950629235237173, + "grad_norm": 0.02796320803463459, + "learning_rate": 4.298175194433279e-05, + "loss": 0.0157, + "step": 381 + }, + { + "epoch": 2.95837366892546, + "grad_norm": 0.01982416957616806, + "learning_rate": 4.2713710488472006e-05, + "loss": 0.0084, + "step": 382 + }, + { + "epoch": 2.9661181026137466, + "grad_norm": 0.027833865955471992, + "learning_rate": 4.244588280377417e-05, + "loss": 0.0189, + "step": 383 + }, + { + "epoch": 2.973862536302033, + "grad_norm": 0.024295540526509285, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0124, + "step": 384 + }, + { + "epoch": 2.9816069699903194, + "grad_norm": 0.03249318525195122, + "learning_rate": 4.1910900172361764e-05, + "loss": 0.021, + "step": 385 + }, + { + "epoch": 2.9816069699903194, + "eval_loss": 0.013895859941840172, + "eval_runtime": 4.3776, + "eval_samples_per_second": 11.422, + "eval_steps_per_second": 2.97, + "step": 385 + }, + { + "epoch": 2.989351403678606, + "grad_norm": 0.031626492738723755, + "learning_rate": 4.164376092140828e-05, + "loss": 0.0255, + "step": 386 + }, + { + "epoch": 2.9970958373668926, + "grad_norm": 0.02495499886572361, + "learning_rate": 4.1376866832679385e-05, + "loss": 0.0174, + "step": 387 + }, + { + "epoch": 3.004840271055179, + "grad_norm": 0.05411753058433533, + "learning_rate": 4.1110225736533664e-05, + "loss": 0.0246, + "step": 388 + }, + { + "epoch": 3.0125847047434657, + "grad_norm": 0.018482988700270653, + "learning_rate": 4.084384545590719e-05, + "loss": 0.0066, + "step": 389 + }, + { + "epoch": 3.020329138431752, + "grad_norm": 0.024023229256272316, + "learning_rate": 4.057773380608411e-05, + "loss": 0.0125, + "step": 390 + }, + { + "epoch": 3.020329138431752, + "eval_loss": 0.014542804099619389, + "eval_runtime": 4.4398, + "eval_samples_per_second": 11.262, + "eval_steps_per_second": 2.928, + "step": 390 + }, + { + "epoch": 3.028073572120039, + "grad_norm": 0.023777876049280167, + "learning_rate": 4.0311898594467086e-05, + "loss": 0.0129, + "step": 391 + }, + { + "epoch": 3.0358180058083253, + "grad_norm": 0.02729886770248413, + "learning_rate": 4.0046347620348586e-05, + "loss": 0.0146, + "step": 392 + }, + { + "epoch": 3.0435624394966116, + "grad_norm": 0.02031349390745163, + "learning_rate": 3.9781088674681764e-05, + "loss": 0.0097, + "step": 393 + }, + { + "epoch": 3.0513068731848985, + "grad_norm": 0.04125319421291351, + "learning_rate": 3.951612953985207e-05, + "loss": 0.018, + "step": 394 + }, + { + "epoch": 3.059051306873185, + "grad_norm": 0.02806476317346096, + "learning_rate": 3.92514779894488e-05, + "loss": 0.0157, + "step": 395 + }, + { + "epoch": 3.059051306873185, + "eval_loss": 0.014474487863481045, + "eval_runtime": 4.3861, + "eval_samples_per_second": 11.4, + "eval_steps_per_second": 2.964, + "step": 395 + }, + { + "epoch": 3.0667957405614716, + "grad_norm": 0.03150569275021553, + "learning_rate": 3.8987141788037154e-05, + "loss": 0.0104, + "step": 396 + }, + { + "epoch": 3.074540174249758, + "grad_norm": 0.034014422446489334, + "learning_rate": 3.8723128690930296e-05, + "loss": 0.0128, + "step": 397 + }, + { + "epoch": 3.0822846079380444, + "grad_norm": 0.024559644982218742, + "learning_rate": 3.8459446443961944e-05, + "loss": 0.0073, + "step": 398 + }, + { + "epoch": 3.090029041626331, + "grad_norm": 0.03375939279794693, + "learning_rate": 3.8196102783258994e-05, + "loss": 0.0184, + "step": 399 + }, + { + "epoch": 3.0977734753146176, + "grad_norm": 0.026912059634923935, + "learning_rate": 3.793310543501473e-05, + "loss": 0.0136, + "step": 400 + }, + { + "epoch": 3.0977734753146176, + "eval_loss": 0.014197892509400845, + "eval_runtime": 4.3827, + "eval_samples_per_second": 11.408, + "eval_steps_per_second": 2.966, + "step": 400 + }, + { + "epoch": 3.1055179090029044, + "grad_norm": 0.0277080275118351, + "learning_rate": 3.7670462115261906e-05, + "loss": 0.0191, + "step": 401 + }, + { + "epoch": 3.1132623426911907, + "grad_norm": 0.03689959645271301, + "learning_rate": 3.7408180529646596e-05, + "loss": 0.0167, + "step": 402 + }, + { + "epoch": 3.121006776379477, + "grad_norm": 0.025296786800026894, + "learning_rate": 3.714626837320195e-05, + "loss": 0.0104, + "step": 403 + }, + { + "epoch": 3.128751210067764, + "grad_norm": 0.03194635733962059, + "learning_rate": 3.688473333012259e-05, + "loss": 0.0185, + "step": 404 + }, + { + "epoch": 3.1364956437560503, + "grad_norm": 0.030948929488658905, + "learning_rate": 3.6623583073538966e-05, + "loss": 0.0087, + "step": 405 + }, + { + "epoch": 3.1364956437560503, + "eval_loss": 0.014084648340940475, + "eval_runtime": 4.3751, + "eval_samples_per_second": 11.428, + "eval_steps_per_second": 2.971, + "step": 405 + }, + { + "epoch": 3.144240077444337, + "grad_norm": 0.028673294931650162, + "learning_rate": 3.636282526529242e-05, + "loss": 0.0149, + "step": 406 + }, + { + "epoch": 3.1519845111326235, + "grad_norm": 0.030361266806721687, + "learning_rate": 3.6102467555710295e-05, + "loss": 0.0133, + "step": 407 + }, + { + "epoch": 3.15972894482091, + "grad_norm": 0.03140443190932274, + "learning_rate": 3.584251758338151e-05, + "loss": 0.0136, + "step": 408 + }, + { + "epoch": 3.1674733785091966, + "grad_norm": 0.02556728571653366, + "learning_rate": 3.558298297493247e-05, + "loss": 0.0093, + "step": 409 + }, + { + "epoch": 3.175217812197483, + "grad_norm": 0.03045126423239708, + "learning_rate": 3.5323871344803263e-05, + "loss": 0.0217, + "step": 410 + }, + { + "epoch": 3.175217812197483, + "eval_loss": 0.01385459117591381, + "eval_runtime": 4.3774, + "eval_samples_per_second": 11.422, + "eval_steps_per_second": 2.97, + "step": 410 + }, + { + "epoch": 3.1829622458857694, + "grad_norm": 0.021814046427607536, + "learning_rate": 3.506519029502433e-05, + "loss": 0.0118, + "step": 411 + }, + { + "epoch": 3.190706679574056, + "grad_norm": 0.04128853231668472, + "learning_rate": 3.480694741499334e-05, + "loss": 0.0143, + "step": 412 + }, + { + "epoch": 3.1984511132623425, + "grad_norm": 0.02113756537437439, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0119, + "step": 413 + }, + { + "epoch": 3.2061955469506294, + "grad_norm": 0.030161535367369652, + "learning_rate": 3.4291806457266826e-05, + "loss": 0.0138, + "step": 414 + }, + { + "epoch": 3.2139399806389157, + "grad_norm": 0.023096317425370216, + "learning_rate": 3.403492349320101e-05, + "loss": 0.0125, + "step": 415 + }, + { + "epoch": 3.2139399806389157, + "eval_loss": 0.013588756322860718, + "eval_runtime": 4.3818, + "eval_samples_per_second": 11.411, + "eval_steps_per_second": 2.967, + "step": 415 + }, + { + "epoch": 3.2216844143272025, + "grad_norm": 0.032038476318120956, + "learning_rate": 3.3778508925699124e-05, + "loss": 0.0246, + "step": 416 + }, + { + "epoch": 3.229428848015489, + "grad_norm": 0.019922303035855293, + "learning_rate": 3.3522570277662985e-05, + "loss": 0.0084, + "step": 417 + }, + { + "epoch": 3.2371732817037753, + "grad_norm": 0.031198205426335335, + "learning_rate": 3.326711505803142e-05, + "loss": 0.0114, + "step": 418 + }, + { + "epoch": 3.244917715392062, + "grad_norm": 0.03261866793036461, + "learning_rate": 3.3012150761560085e-05, + "loss": 0.0224, + "step": 419 + }, + { + "epoch": 3.2526621490803485, + "grad_norm": 0.02441861294209957, + "learning_rate": 3.275768486860149e-05, + "loss": 0.0115, + "step": 420 + }, + { + "epoch": 3.2526621490803485, + "eval_loss": 0.01377787534147501, + "eval_runtime": 4.4109, + "eval_samples_per_second": 11.336, + "eval_steps_per_second": 2.947, + "step": 420 + }, + { + "epoch": 3.260406582768635, + "grad_norm": 0.023703662678599358, + "learning_rate": 3.250372484488558e-05, + "loss": 0.0096, + "step": 421 + }, + { + "epoch": 3.2681510164569216, + "grad_norm": 0.03410341590642929, + "learning_rate": 3.225027814130074e-05, + "loss": 0.012, + "step": 422 + }, + { + "epoch": 3.275895450145208, + "grad_norm": 0.0334019772708416, + "learning_rate": 3.199735219367507e-05, + "loss": 0.0134, + "step": 423 + }, + { + "epoch": 3.283639883833495, + "grad_norm": 0.024044804275035858, + "learning_rate": 3.174495442255836e-05, + "loss": 0.0088, + "step": 424 + }, + { + "epoch": 3.291384317521781, + "grad_norm": 0.027393875643610954, + "learning_rate": 3.149309223300428e-05, + "loss": 0.0128, + "step": 425 + }, + { + "epoch": 3.291384317521781, + "eval_loss": 0.013857954181730747, + "eval_runtime": 4.3701, + "eval_samples_per_second": 11.441, + "eval_steps_per_second": 2.975, + "step": 425 + }, + { + "epoch": 3.299128751210068, + "grad_norm": 0.02405543252825737, + "learning_rate": 3.124177301435324e-05, + "loss": 0.0133, + "step": 426 + }, + { + "epoch": 3.3068731848983544, + "grad_norm": 0.02057347074151039, + "learning_rate": 3.09910041400154e-05, + "loss": 0.0097, + "step": 427 + }, + { + "epoch": 3.3146176185866407, + "grad_norm": 0.025036605075001717, + "learning_rate": 3.0740792967254604e-05, + "loss": 0.0129, + "step": 428 + }, + { + "epoch": 3.3223620522749275, + "grad_norm": 0.03239162638783455, + "learning_rate": 3.0491146836972272e-05, + "loss": 0.0177, + "step": 429 + }, + { + "epoch": 3.330106485963214, + "grad_norm": 0.03456740453839302, + "learning_rate": 3.024207307349224e-05, + "loss": 0.0278, + "step": 430 + }, + { + "epoch": 3.330106485963214, + "eval_loss": 0.013759260065853596, + "eval_runtime": 4.3788, + "eval_samples_per_second": 11.419, + "eval_steps_per_second": 2.969, + "step": 430 + }, + { + "epoch": 3.3378509196515003, + "grad_norm": 0.02619517222046852, + "learning_rate": 2.9993578984345672e-05, + "loss": 0.0109, + "step": 431 + }, + { + "epoch": 3.345595353339787, + "grad_norm": 0.02865227498114109, + "learning_rate": 2.9745671860056868e-05, + "loss": 0.0142, + "step": 432 + }, + { + "epoch": 3.3533397870280734, + "grad_norm": 0.028698042035102844, + "learning_rate": 2.9498358973929196e-05, + "loss": 0.0121, + "step": 433 + }, + { + "epoch": 3.3610842207163603, + "grad_norm": 0.029013466089963913, + "learning_rate": 2.9251647581831836e-05, + "loss": 0.0158, + "step": 434 + }, + { + "epoch": 3.3688286544046466, + "grad_norm": 0.0390060655772686, + "learning_rate": 2.900554492198677e-05, + "loss": 0.0197, + "step": 435 + }, + { + "epoch": 3.3688286544046466, + "eval_loss": 0.013566420413553715, + "eval_runtime": 4.3727, + "eval_samples_per_second": 11.435, + "eval_steps_per_second": 2.973, + "step": 435 + }, + { + "epoch": 3.3765730880929334, + "grad_norm": 0.024725977331399918, + "learning_rate": 2.876005821475657e-05, + "loss": 0.0104, + "step": 436 + }, + { + "epoch": 3.38431752178122, + "grad_norm": 0.029143916442990303, + "learning_rate": 2.851519466243242e-05, + "loss": 0.0104, + "step": 437 + }, + { + "epoch": 3.392061955469506, + "grad_norm": 0.03371770679950714, + "learning_rate": 2.8270961449022893e-05, + "loss": 0.0192, + "step": 438 + }, + { + "epoch": 3.399806389157793, + "grad_norm": 0.026737425476312637, + "learning_rate": 2.802736574004319e-05, + "loss": 0.0172, + "step": 439 + }, + { + "epoch": 3.4075508228460794, + "grad_norm": 0.02045326493680477, + "learning_rate": 2.7784414682304832e-05, + "loss": 0.0095, + "step": 440 + }, + { + "epoch": 3.4075508228460794, + "eval_loss": 0.013339264318346977, + "eval_runtime": 4.3959, + "eval_samples_per_second": 11.374, + "eval_steps_per_second": 2.957, + "step": 440 + }, + { + "epoch": 3.4152952565343657, + "grad_norm": 0.02013881504535675, + "learning_rate": 2.7542115403706063e-05, + "loss": 0.0076, + "step": 441 + }, + { + "epoch": 3.4230396902226525, + "grad_norm": 0.028215833008289337, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0118, + "step": 442 + }, + { + "epoch": 3.430784123910939, + "grad_norm": 0.02091830037534237, + "learning_rate": 2.7059500599699476e-05, + "loss": 0.0098, + "step": 443 + }, + { + "epoch": 3.4385285575992257, + "grad_norm": 0.035130295902490616, + "learning_rate": 2.6819199233642278e-05, + "loss": 0.0151, + "step": 444 + }, + { + "epoch": 3.446272991287512, + "grad_norm": 0.01884886436164379, + "learning_rate": 2.65795779650105e-05, + "loss": 0.0075, + "step": 445 + }, + { + "epoch": 3.446272991287512, + "eval_loss": 0.01332936156541109, + "eval_runtime": 4.3734, + "eval_samples_per_second": 11.433, + "eval_steps_per_second": 2.973, + "step": 445 + }, + { + "epoch": 3.454017424975799, + "grad_norm": 0.01937946490943432, + "learning_rate": 2.6340643824010247e-05, + "loss": 0.0102, + "step": 446 + }, + { + "epoch": 3.4617618586640853, + "grad_norm": 0.03142572566866875, + "learning_rate": 2.6102403820688177e-05, + "loss": 0.0182, + "step": 447 + }, + { + "epoch": 3.4695062923523716, + "grad_norm": 0.024633850902318954, + "learning_rate": 2.586486494472572e-05, + "loss": 0.0121, + "step": 448 + }, + { + "epoch": 3.4772507260406584, + "grad_norm": 0.03723684325814247, + "learning_rate": 2.562803416523405e-05, + "loss": 0.0131, + "step": 449 + }, + { + "epoch": 3.484995159728945, + "grad_norm": 0.029163116589188576, + "learning_rate": 2.539191843054963e-05, + "loss": 0.0112, + "step": 450 + }, + { + "epoch": 3.484995159728945, + "eval_loss": 0.01355398166924715, + "eval_runtime": 4.377, + "eval_samples_per_second": 11.423, + "eval_steps_per_second": 2.97, + "step": 450 + } + ], + "logging_steps": 1, + "max_steps": 645, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.047483370030367e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}