{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.987212276214834, "eval_steps": 500, "global_step": 975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05115089514066496, "grad_norm": 214.7565900278413, "learning_rate": 1.0204081632653063e-06, "loss": 4.0075, "step": 10 }, { "epoch": 0.10230179028132992, "grad_norm": 22.211069433232247, "learning_rate": 2.0408163265306125e-06, "loss": 0.6282, "step": 20 }, { "epoch": 0.1534526854219949, "grad_norm": 12.11809889509627, "learning_rate": 3.0612244897959185e-06, "loss": 0.2107, "step": 30 }, { "epoch": 0.20460358056265984, "grad_norm": 2.8569999074326495, "learning_rate": 4.081632653061225e-06, "loss": 0.2135, "step": 40 }, { "epoch": 0.2557544757033248, "grad_norm": 6.867087399363433, "learning_rate": 5.1020408163265315e-06, "loss": 0.2072, "step": 50 }, { "epoch": 0.3069053708439898, "grad_norm": 3.4564712619293663, "learning_rate": 6.122448979591837e-06, "loss": 0.2002, "step": 60 }, { "epoch": 0.35805626598465473, "grad_norm": 3.69055204043271, "learning_rate": 7.1428571428571436e-06, "loss": 0.2, "step": 70 }, { "epoch": 0.4092071611253197, "grad_norm": 1.0696324041329028, "learning_rate": 8.16326530612245e-06, "loss": 0.1964, "step": 80 }, { "epoch": 0.46035805626598464, "grad_norm": 2.964347091843066, "learning_rate": 9.183673469387756e-06, "loss": 0.1952, "step": 90 }, { "epoch": 0.5115089514066496, "grad_norm": 3.646664998250867, "learning_rate": 9.99987167871469e-06, "loss": 0.2045, "step": 100 }, { "epoch": 0.5626598465473146, "grad_norm": 1.6191665925446022, "learning_rate": 9.995381125277747e-06, "loss": 0.2018, "step": 110 }, { "epoch": 0.6138107416879796, "grad_norm": 4.017961040145252, "learning_rate": 9.984481092596683e-06, "loss": 0.1975, "step": 120 }, { "epoch": 0.6649616368286445, "grad_norm": 2.9108813585819817, "learning_rate": 9.967185566297713e-06, "loss": 0.199, "step": 130 }, { "epoch": 0.7161125319693095, "grad_norm": 3.1021824472509767, "learning_rate": 9.94351673794421e-06, "loss": 0.1978, "step": 140 }, { "epoch": 0.7672634271099744, "grad_norm": 3.497183421035605, "learning_rate": 9.913504976563138e-06, "loss": 0.1972, "step": 150 }, { "epoch": 0.8184143222506394, "grad_norm": 2.667271306285725, "learning_rate": 9.877188789679111e-06, "loss": 0.1972, "step": 160 }, { "epoch": 0.8695652173913043, "grad_norm": 1.939938871403963, "learning_rate": 9.834614773906125e-06, "loss": 0.1983, "step": 170 }, { "epoch": 0.9207161125319693, "grad_norm": 3.277095189836375, "learning_rate": 9.785837555160322e-06, "loss": 0.2001, "step": 180 }, { "epoch": 0.9718670076726342, "grad_norm": 2.1349632530669784, "learning_rate": 9.730919718570501e-06, "loss": 0.1985, "step": 190 }, { "epoch": 1.0230179028132993, "grad_norm": 1.9921171293582758, "learning_rate": 9.669931728176329e-06, "loss": 0.2044, "step": 200 }, { "epoch": 1.0741687979539642, "grad_norm": 1.6800643640667052, "learning_rate": 9.602951836517255e-06, "loss": 0.1968, "step": 210 }, { "epoch": 1.1253196930946292, "grad_norm": 3.6566429458754515, "learning_rate": 9.530065984228156e-06, "loss": 0.1972, "step": 220 }, { "epoch": 1.1764705882352942, "grad_norm": 1.0633803952953227, "learning_rate": 9.451367689770532e-06, "loss": 0.1957, "step": 230 }, { "epoch": 1.227621483375959, "grad_norm": 1.4368952344952126, "learning_rate": 9.366957929440731e-06, "loss": 0.1942, "step": 240 }, { "epoch": 1.278772378516624, "grad_norm": 2.8115853641591015, "learning_rate": 9.276945007809167e-06, "loss": 0.1975, "step": 250 }, { "epoch": 1.329923273657289, "grad_norm": 2.2368726208053515, "learning_rate": 9.181444418756774e-06, "loss": 0.1941, "step": 260 }, { "epoch": 1.381074168797954, "grad_norm": 1.718287417414397, "learning_rate": 9.080578697286986e-06, "loss": 0.1955, "step": 270 }, { "epoch": 1.432225063938619, "grad_norm": 2.6480547516752373, "learning_rate": 8.974477262303382e-06, "loss": 0.1965, "step": 280 }, { "epoch": 1.4833759590792839, "grad_norm": 2.1323772112759856, "learning_rate": 8.86327625055474e-06, "loss": 0.1928, "step": 290 }, { "epoch": 1.5345268542199488, "grad_norm": 1.3695013623025927, "learning_rate": 8.747118341960542e-06, "loss": 0.1964, "step": 300 }, { "epoch": 1.5856777493606138, "grad_norm": 2.816490216879556, "learning_rate": 8.626152576541059e-06, "loss": 0.1926, "step": 310 }, { "epoch": 1.6368286445012787, "grad_norm": 2.63475775763493, "learning_rate": 8.500534163186914e-06, "loss": 0.1944, "step": 320 }, { "epoch": 1.6879795396419437, "grad_norm": 1.2976999470451909, "learning_rate": 8.37042428051349e-06, "loss": 0.1935, "step": 330 }, { "epoch": 1.7391304347826086, "grad_norm": 3.0528789638369314, "learning_rate": 8.235989870055684e-06, "loss": 0.1931, "step": 340 }, { "epoch": 1.7902813299232738, "grad_norm": 2.761510505813076, "learning_rate": 8.097403422068377e-06, "loss": 0.1901, "step": 350 }, { "epoch": 1.8414322250639388, "grad_norm": 1.2860400398022556, "learning_rate": 7.954842754207442e-06, "loss": 0.1846, "step": 360 }, { "epoch": 1.8925831202046037, "grad_norm": 1.640012696939897, "learning_rate": 7.80849078337527e-06, "loss": 0.1801, "step": 370 }, { "epoch": 1.9437340153452687, "grad_norm": 2.959712367273353, "learning_rate": 7.658535291023533e-06, "loss": 0.1766, "step": 380 }, { "epoch": 1.9948849104859336, "grad_norm": 2.582572305335923, "learning_rate": 7.5051686822143614e-06, "loss": 0.1592, "step": 390 }, { "epoch": 2.0460358056265986, "grad_norm": 2.8473557994991165, "learning_rate": 7.348587738749037e-06, "loss": 0.1467, "step": 400 }, { "epoch": 2.0971867007672635, "grad_norm": 2.246956202126538, "learning_rate": 7.18899336668097e-06, "loss": 0.1102, "step": 410 }, { "epoch": 2.1483375959079285, "grad_norm": 2.3922923060664685, "learning_rate": 7.0265903385369385e-06, "loss": 0.0799, "step": 420 }, { "epoch": 2.1994884910485935, "grad_norm": 2.235940967501667, "learning_rate": 6.861587030577326e-06, "loss": 0.0643, "step": 430 }, { "epoch": 2.2506393861892584, "grad_norm": 2.1749048219884615, "learning_rate": 6.694195155432473e-06, "loss": 0.039, "step": 440 }, { "epoch": 2.3017902813299234, "grad_norm": 2.4706549250013494, "learning_rate": 6.524629490458193e-06, "loss": 0.0287, "step": 450 }, { "epoch": 2.3529411764705883, "grad_norm": 2.1875953818358975, "learning_rate": 6.353107602159005e-06, "loss": 0.0211, "step": 460 }, { "epoch": 2.4040920716112533, "grad_norm": 1.1932819949506583, "learning_rate": 6.179849567032658e-06, "loss": 0.0199, "step": 470 }, { "epoch": 2.455242966751918, "grad_norm": 3.043082138284078, "learning_rate": 6.0050776891941266e-06, "loss": 0.0138, "step": 480 }, { "epoch": 2.506393861892583, "grad_norm": 1.4066811981696432, "learning_rate": 5.8290162151414e-06, "loss": 0.0102, "step": 490 }, { "epoch": 2.557544757033248, "grad_norm": 1.089275292286627, "learning_rate": 5.651891046029031e-06, "loss": 0.0091, "step": 500 }, { "epoch": 2.608695652173913, "grad_norm": 1.4386597224188378, "learning_rate": 5.473929447818628e-06, "loss": 0.0093, "step": 510 }, { "epoch": 2.659846547314578, "grad_norm": 1.6800741407957291, "learning_rate": 5.295359759678187e-06, "loss": 0.0062, "step": 520 }, { "epoch": 2.710997442455243, "grad_norm": 2.0143218617779004, "learning_rate": 5.1164111010044225e-06, "loss": 0.0058, "step": 530 }, { "epoch": 2.762148337595908, "grad_norm": 2.927032625013264, "learning_rate": 4.937313077443985e-06, "loss": 0.0071, "step": 540 }, { "epoch": 2.813299232736573, "grad_norm": 0.7049312278321461, "learning_rate": 4.7582954862907896e-06, "loss": 0.0041, "step": 550 }, { "epoch": 2.864450127877238, "grad_norm": 0.39393926121790496, "learning_rate": 4.579588021637448e-06, "loss": 0.0029, "step": 560 }, { "epoch": 2.915601023017903, "grad_norm": 0.7023810428810136, "learning_rate": 4.401419979659119e-06, "loss": 0.0036, "step": 570 }, { "epoch": 2.9667519181585678, "grad_norm": 1.7272328840467275, "learning_rate": 4.224019964407902e-06, "loss": 0.0028, "step": 580 }, { "epoch": 3.0179028132992327, "grad_norm": 0.19616140979782443, "learning_rate": 4.047615594495299e-06, "loss": 0.0017, "step": 590 }, { "epoch": 3.0690537084398977, "grad_norm": 0.060475506150692124, "learning_rate": 3.872433211039067e-06, "loss": 0.0013, "step": 600 }, { "epoch": 3.1202046035805626, "grad_norm": 0.060879802064906625, "learning_rate": 3.698697587249208e-06, "loss": 0.0012, "step": 610 }, { "epoch": 3.1713554987212276, "grad_norm": 0.8953617552425924, "learning_rate": 3.526631640025697e-06, "loss": 0.0009, "step": 620 }, { "epoch": 3.2225063938618925, "grad_norm": 0.400415376801392, "learning_rate": 3.3564561439380206e-06, "loss": 0.0006, "step": 630 }, { "epoch": 3.2736572890025575, "grad_norm": 0.13846989650131333, "learning_rate": 3.1883894479534882e-06, "loss": 0.0005, "step": 640 }, { "epoch": 3.3248081841432224, "grad_norm": 0.03190740929655757, "learning_rate": 3.022647195277799e-06, "loss": 0.0009, "step": 650 }, { "epoch": 3.3759590792838874, "grad_norm": 0.01044371613769382, "learning_rate": 2.8594420466673133e-06, "loss": 0.0001, "step": 660 }, { "epoch": 3.4271099744245523, "grad_norm": 0.021747528690115973, "learning_rate": 2.6989834075680452e-06, "loss": 0.0003, "step": 670 }, { "epoch": 3.4782608695652173, "grad_norm": 0.003734147207667446, "learning_rate": 2.541477159431489e-06, "loss": 0.0002, "step": 680 }, { "epoch": 3.5294117647058822, "grad_norm": 0.02578812155709838, "learning_rate": 2.3871253955520163e-06, "loss": 0.0003, "step": 690 }, { "epoch": 3.580562659846547, "grad_norm": 0.07582199264749429, "learning_rate": 2.2361261617647873e-06, "loss": 0.0001, "step": 700 }, { "epoch": 3.631713554987212, "grad_norm": 0.003064177461855074, "learning_rate": 2.0886732023368637e-06, "loss": 0.0001, "step": 710 }, { "epoch": 3.682864450127877, "grad_norm": 0.0034307231208645776, "learning_rate": 1.9449557113776157e-06, "loss": 0.0, "step": 720 }, { "epoch": 3.734015345268542, "grad_norm": 0.003406179275735569, "learning_rate": 1.8051580900873112e-06, "loss": 0.0003, "step": 730 }, { "epoch": 3.785166240409207, "grad_norm": 0.0017568341440027282, "learning_rate": 1.6694597101554354e-06, "loss": 0.0001, "step": 740 }, { "epoch": 3.836317135549872, "grad_norm": 0.001639728689373477, "learning_rate": 1.5380346836122429e-06, "loss": 0.0001, "step": 750 }, { "epoch": 3.887468030690537, "grad_norm": 0.001901581956312772, "learning_rate": 1.4110516394289103e-06, "loss": 0.0001, "step": 760 }, { "epoch": 3.938618925831202, "grad_norm": 0.001484483455691581, "learning_rate": 1.2886735071528829e-06, "loss": 0.0001, "step": 770 }, { "epoch": 3.9897698209718673, "grad_norm": 0.0018431338087995083, "learning_rate": 1.1710573078560406e-06, "loss": 0.0001, "step": 780 }, { "epoch": 4.040920716112532, "grad_norm": 0.06550643685551072, "learning_rate": 1.058353952663937e-06, "loss": 0.0002, "step": 790 }, { "epoch": 4.092071611253197, "grad_norm": 0.06509818369816833, "learning_rate": 9.50708049124568e-07, "loss": 0.0001, "step": 800 }, { "epoch": 4.143222506393862, "grad_norm": 0.001965866467798876, "learning_rate": 8.482577156651683e-07, "loss": 0.0001, "step": 810 }, { "epoch": 4.194373401534527, "grad_norm": 0.003659181883344629, "learning_rate": 7.511344043750546e-07, "loss": 0.0001, "step": 820 }, { "epoch": 4.245524296675192, "grad_norm": 0.0008421352975079428, "learning_rate": 6.594627323419383e-07, "loss": 0.0001, "step": 830 }, { "epoch": 4.296675191815857, "grad_norm": 0.0015741296233894467, "learning_rate": 5.733603217580885e-07, "loss": 0.0001, "step": 840 }, { "epoch": 4.3478260869565215, "grad_norm": 0.07134647602151516, "learning_rate": 4.929376490015148e-07, "loss": 0.0001, "step": 850 }, { "epoch": 4.398976982097187, "grad_norm": 0.0009193314183040581, "learning_rate": 4.1829790288581694e-07, "loss": 0.0, "step": 860 }, { "epoch": 4.450127877237851, "grad_norm": 0.0011078081696583465, "learning_rate": 3.495368522605602e-07, "loss": 0.0, "step": 870 }, { "epoch": 4.501278772378517, "grad_norm": 0.002913655514720279, "learning_rate": 2.867427231320774e-07, "loss": 0.0001, "step": 880 }, { "epoch": 4.552429667519181, "grad_norm": 0.0011670444227254474, "learning_rate": 2.2999608546234063e-07, "loss": 0.0001, "step": 890 }, { "epoch": 4.603580562659847, "grad_norm": 0.0010703222073159705, "learning_rate": 1.7936974979116505e-07, "loss": 0.0001, "step": 900 }, { "epoch": 4.654731457800511, "grad_norm": 0.0010341753002907296, "learning_rate": 1.349286738143829e-07, "loss": 0.0001, "step": 910 }, { "epoch": 4.705882352941177, "grad_norm": 0.12688243780881078, "learning_rate": 9.672987903784237e-08, "loss": 0.0002, "step": 920 }, { "epoch": 4.757033248081841, "grad_norm": 0.0013506462774850975, "learning_rate": 6.482237761419652e-08, "loss": 0.0001, "step": 930 }, { "epoch": 4.8081841432225065, "grad_norm": 0.009021779442771162, "learning_rate": 3.9247109456332925e-08, "loss": 0.0002, "step": 940 }, { "epoch": 4.859335038363171, "grad_norm": 0.0010536719662307653, "learning_rate": 2.003688970814377e-08, "loss": 0.0001, "step": 950 }, { "epoch": 4.910485933503836, "grad_norm": 0.0010263273253712944, "learning_rate": 7.216366640032668e-09, "loss": 0.0001, "step": 960 }, { "epoch": 4.961636828644501, "grad_norm": 0.060715491871115294, "learning_rate": 8.019900231881483e-10, "loss": 0.0001, "step": 970 }, { "epoch": 4.987212276214834, "step": 975, "total_flos": 261426857902080.0, "train_loss": 0.12791604029011908, "train_runtime": 13376.8379, "train_samples_per_second": 18.689, "train_steps_per_second": 0.073 } ], "logging_steps": 10, "max_steps": 975, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 261426857902080.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }