{ "best_metric": 11.038614273071289, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.09813542688910697, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009813542688910696, "grad_norm": 4.1202497482299805, "learning_rate": 5.000000000000001e-07, "loss": 44.3701, "step": 1 }, { "epoch": 0.0009813542688910696, "eval_loss": 11.094230651855469, "eval_runtime": 5.8483, "eval_samples_per_second": 293.417, "eval_steps_per_second": 36.763, "step": 1 }, { "epoch": 0.001962708537782139, "grad_norm": 4.6952314376831055, "learning_rate": 1.0000000000000002e-06, "loss": 44.357, "step": 2 }, { "epoch": 0.002944062806673209, "grad_norm": 4.975703239440918, "learning_rate": 1.5e-06, "loss": 44.3399, "step": 3 }, { "epoch": 0.003925417075564278, "grad_norm": 4.898032188415527, "learning_rate": 2.0000000000000003e-06, "loss": 44.3496, "step": 4 }, { "epoch": 0.004906771344455349, "grad_norm": 4.699200630187988, "learning_rate": 2.5e-06, "loss": 44.3727, "step": 5 }, { "epoch": 0.005888125613346418, "grad_norm": 4.840972423553467, "learning_rate": 3e-06, "loss": 44.3562, "step": 6 }, { "epoch": 0.0068694798822374874, "grad_norm": 4.837454795837402, "learning_rate": 3.5e-06, "loss": 44.3481, "step": 7 }, { "epoch": 0.007850834151128557, "grad_norm": 4.493252754211426, "learning_rate": 4.000000000000001e-06, "loss": 44.3513, "step": 8 }, { "epoch": 0.008832188420019628, "grad_norm": 4.787064552307129, "learning_rate": 4.5e-06, "loss": 44.3664, "step": 9 }, { "epoch": 0.009813542688910697, "grad_norm": 4.681407928466797, "learning_rate": 5e-06, "loss": 44.3545, "step": 10 }, { "epoch": 0.010794896957801767, "grad_norm": 4.4647321701049805, "learning_rate": 4.99847706754774e-06, "loss": 44.3586, "step": 11 }, { "epoch": 0.011776251226692836, "grad_norm": 4.408224105834961, "learning_rate": 4.993910125649561e-06, "loss": 44.3444, "step": 12 }, { "epoch": 0.012757605495583905, "grad_norm": 4.694049835205078, "learning_rate": 4.986304738420684e-06, "loss": 44.3296, "step": 13 }, { "epoch": 0.013738959764474975, "grad_norm": 4.30063009262085, "learning_rate": 4.975670171853926e-06, "loss": 44.3525, "step": 14 }, { "epoch": 0.014720314033366046, "grad_norm": 4.59287691116333, "learning_rate": 4.962019382530521e-06, "loss": 44.3156, "step": 15 }, { "epoch": 0.015701668302257114, "grad_norm": 4.614467620849609, "learning_rate": 4.9453690018345144e-06, "loss": 44.3097, "step": 16 }, { "epoch": 0.016683022571148183, "grad_norm": 4.312943935394287, "learning_rate": 4.925739315689991e-06, "loss": 44.3131, "step": 17 }, { "epoch": 0.017664376840039256, "grad_norm": 4.630992412567139, "learning_rate": 4.903154239845798e-06, "loss": 44.3009, "step": 18 }, { "epoch": 0.018645731108930325, "grad_norm": 4.261321544647217, "learning_rate": 4.8776412907378845e-06, "loss": 44.294, "step": 19 }, { "epoch": 0.019627085377821395, "grad_norm": 4.367868423461914, "learning_rate": 4.849231551964771e-06, "loss": 44.3004, "step": 20 }, { "epoch": 0.020608439646712464, "grad_norm": 4.4335503578186035, "learning_rate": 4.817959636416969e-06, "loss": 44.3056, "step": 21 }, { "epoch": 0.021589793915603533, "grad_norm": 4.420947074890137, "learning_rate": 4.783863644106502e-06, "loss": 44.2962, "step": 22 }, { "epoch": 0.022571148184494603, "grad_norm": 4.479759693145752, "learning_rate": 4.746985115747918e-06, "loss": 44.2849, "step": 23 }, { "epoch": 0.023552502453385672, "grad_norm": 4.4245805740356445, "learning_rate": 4.707368982147318e-06, "loss": 44.3022, "step": 24 }, { "epoch": 0.02453385672227674, "grad_norm": 4.920063495635986, "learning_rate": 4.665063509461098e-06, "loss": 44.2652, "step": 25 }, { "epoch": 0.02551521099116781, "grad_norm": 4.60530424118042, "learning_rate": 4.620120240391065e-06, "loss": 44.2587, "step": 26 }, { "epoch": 0.02649656526005888, "grad_norm": 4.810507774353027, "learning_rate": 4.572593931387604e-06, "loss": 44.2725, "step": 27 }, { "epoch": 0.02747791952894995, "grad_norm": 4.324153900146484, "learning_rate": 4.522542485937369e-06, "loss": 44.2904, "step": 28 }, { "epoch": 0.02845927379784102, "grad_norm": 4.457505226135254, "learning_rate": 4.470026884016805e-06, "loss": 44.2651, "step": 29 }, { "epoch": 0.029440628066732092, "grad_norm": 4.2836151123046875, "learning_rate": 4.415111107797445e-06, "loss": 44.272, "step": 30 }, { "epoch": 0.03042198233562316, "grad_norm": 4.381106853485107, "learning_rate": 4.357862063693486e-06, "loss": 44.2687, "step": 31 }, { "epoch": 0.03140333660451423, "grad_norm": 4.747635841369629, "learning_rate": 4.2983495008466285e-06, "loss": 44.2819, "step": 32 }, { "epoch": 0.0323846908734053, "grad_norm": 4.578216075897217, "learning_rate": 4.236645926147493e-06, "loss": 44.2072, "step": 33 }, { "epoch": 0.033366045142296366, "grad_norm": 4.499361515045166, "learning_rate": 4.172826515897146e-06, "loss": 44.2359, "step": 34 }, { "epoch": 0.03434739941118744, "grad_norm": 4.1071553230285645, "learning_rate": 4.106969024216348e-06, "loss": 44.2548, "step": 35 }, { "epoch": 0.03532875368007851, "grad_norm": 4.5971550941467285, "learning_rate": 4.039153688314146e-06, "loss": 44.2519, "step": 36 }, { "epoch": 0.03631010794896958, "grad_norm": 4.54329776763916, "learning_rate": 3.969463130731183e-06, "loss": 44.2257, "step": 37 }, { "epoch": 0.03729146221786065, "grad_norm": 4.047177791595459, "learning_rate": 3.897982258676867e-06, "loss": 44.2495, "step": 38 }, { "epoch": 0.038272816486751716, "grad_norm": 3.584639549255371, "learning_rate": 3.824798160583012e-06, "loss": 44.2784, "step": 39 }, { "epoch": 0.03925417075564279, "grad_norm": 3.700026750564575, "learning_rate": 3.7500000000000005e-06, "loss": 44.2736, "step": 40 }, { "epoch": 0.040235525024533855, "grad_norm": 4.082536220550537, "learning_rate": 3.6736789069647273e-06, "loss": 44.2867, "step": 41 }, { "epoch": 0.04121687929342493, "grad_norm": 3.2533934116363525, "learning_rate": 3.595927866972694e-06, "loss": 44.319, "step": 42 }, { "epoch": 0.042198233562315994, "grad_norm": 3.820909261703491, "learning_rate": 3.516841607689501e-06, "loss": 44.2602, "step": 43 }, { "epoch": 0.04317958783120707, "grad_norm": 3.5113043785095215, "learning_rate": 3.436516483539781e-06, "loss": 44.2805, "step": 44 }, { "epoch": 0.04416094210009813, "grad_norm": 3.5770328044891357, "learning_rate": 3.3550503583141726e-06, "loss": 44.3724, "step": 45 }, { "epoch": 0.045142296368989206, "grad_norm": 3.635390043258667, "learning_rate": 3.272542485937369e-06, "loss": 44.4049, "step": 46 }, { "epoch": 0.04612365063788027, "grad_norm": 3.494272232055664, "learning_rate": 3.189093389542498e-06, "loss": 44.3339, "step": 47 }, { "epoch": 0.047105004906771344, "grad_norm": 4.5640668869018555, "learning_rate": 3.1048047389991693e-06, "loss": 44.3798, "step": 48 }, { "epoch": 0.04808635917566242, "grad_norm": 5.724422931671143, "learning_rate": 3.019779227044398e-06, "loss": 44.29, "step": 49 }, { "epoch": 0.04906771344455348, "grad_norm": 6.1241936683654785, "learning_rate": 2.9341204441673267e-06, "loss": 44.4495, "step": 50 }, { "epoch": 0.04906771344455348, "eval_loss": 11.050891876220703, "eval_runtime": 5.0876, "eval_samples_per_second": 337.292, "eval_steps_per_second": 42.26, "step": 50 }, { "epoch": 0.050049067713444556, "grad_norm": 4.3872294425964355, "learning_rate": 2.847932752400164e-06, "loss": 44.1583, "step": 51 }, { "epoch": 0.05103042198233562, "grad_norm": 4.958473205566406, "learning_rate": 2.761321158169134e-06, "loss": 44.1556, "step": 52 }, { "epoch": 0.052011776251226695, "grad_norm": 4.641199588775635, "learning_rate": 2.6743911843603134e-06, "loss": 44.1518, "step": 53 }, { "epoch": 0.05299313052011776, "grad_norm": 4.831917762756348, "learning_rate": 2.587248741756253e-06, "loss": 44.1574, "step": 54 }, { "epoch": 0.053974484789008834, "grad_norm": 4.641740322113037, "learning_rate": 2.5e-06, "loss": 44.177, "step": 55 }, { "epoch": 0.0549558390578999, "grad_norm": 5.027724742889404, "learning_rate": 2.4127512582437486e-06, "loss": 44.1403, "step": 56 }, { "epoch": 0.05593719332679097, "grad_norm": 4.93941068649292, "learning_rate": 2.325608815639687e-06, "loss": 44.1304, "step": 57 }, { "epoch": 0.05691854759568204, "grad_norm": 4.8098063468933105, "learning_rate": 2.238678841830867e-06, "loss": 44.1245, "step": 58 }, { "epoch": 0.05789990186457311, "grad_norm": 4.973179340362549, "learning_rate": 2.1520672475998374e-06, "loss": 44.141, "step": 59 }, { "epoch": 0.058881256133464184, "grad_norm": 4.688238620758057, "learning_rate": 2.0658795558326745e-06, "loss": 44.154, "step": 60 }, { "epoch": 0.05986261040235525, "grad_norm": 4.850657939910889, "learning_rate": 1.9802207729556023e-06, "loss": 44.1483, "step": 61 }, { "epoch": 0.06084396467124632, "grad_norm": 4.537477016448975, "learning_rate": 1.895195261000831e-06, "loss": 44.1491, "step": 62 }, { "epoch": 0.06182531894013739, "grad_norm": 4.47567081451416, "learning_rate": 1.8109066104575023e-06, "loss": 44.1562, "step": 63 }, { "epoch": 0.06280667320902845, "grad_norm": 4.615531921386719, "learning_rate": 1.7274575140626318e-06, "loss": 44.1694, "step": 64 }, { "epoch": 0.06378802747791953, "grad_norm": 4.739807605743408, "learning_rate": 1.6449496416858285e-06, "loss": 44.1339, "step": 65 }, { "epoch": 0.0647693817468106, "grad_norm": 4.978174686431885, "learning_rate": 1.56348351646022e-06, "loss": 44.1016, "step": 66 }, { "epoch": 0.06575073601570167, "grad_norm": 4.427630424499512, "learning_rate": 1.4831583923105e-06, "loss": 44.1453, "step": 67 }, { "epoch": 0.06673209028459273, "grad_norm": 4.460508823394775, "learning_rate": 1.4040721330273063e-06, "loss": 44.1541, "step": 68 }, { "epoch": 0.06771344455348381, "grad_norm": 4.592861175537109, "learning_rate": 1.3263210930352737e-06, "loss": 44.1361, "step": 69 }, { "epoch": 0.06869479882237488, "grad_norm": 4.343865394592285, "learning_rate": 1.2500000000000007e-06, "loss": 44.1732, "step": 70 }, { "epoch": 0.06967615309126594, "grad_norm": 4.420068264007568, "learning_rate": 1.1752018394169882e-06, "loss": 44.1383, "step": 71 }, { "epoch": 0.07065750736015702, "grad_norm": 4.614556789398193, "learning_rate": 1.1020177413231334e-06, "loss": 44.142, "step": 72 }, { "epoch": 0.07163886162904809, "grad_norm": 4.291627883911133, "learning_rate": 1.0305368692688175e-06, "loss": 44.1535, "step": 73 }, { "epoch": 0.07262021589793916, "grad_norm": 4.367705345153809, "learning_rate": 9.608463116858544e-07, "loss": 44.1653, "step": 74 }, { "epoch": 0.07360157016683022, "grad_norm": 4.751893997192383, "learning_rate": 8.930309757836517e-07, "loss": 44.1325, "step": 75 }, { "epoch": 0.0745829244357213, "grad_norm": 4.403656959533691, "learning_rate": 8.271734841028553e-07, "loss": 44.1413, "step": 76 }, { "epoch": 0.07556427870461237, "grad_norm": 4.389298915863037, "learning_rate": 7.633540738525066e-07, "loss": 44.1718, "step": 77 }, { "epoch": 0.07654563297350343, "grad_norm": 4.401954174041748, "learning_rate": 7.016504991533727e-07, "loss": 44.1616, "step": 78 }, { "epoch": 0.0775269872423945, "grad_norm": 4.580694198608398, "learning_rate": 6.421379363065142e-07, "loss": 44.1418, "step": 79 }, { "epoch": 0.07850834151128558, "grad_norm": 4.462921619415283, "learning_rate": 5.848888922025553e-07, "loss": 44.1092, "step": 80 }, { "epoch": 0.07948969578017664, "grad_norm": 4.829708099365234, "learning_rate": 5.299731159831953e-07, "loss": 44.1295, "step": 81 }, { "epoch": 0.08047105004906771, "grad_norm": 4.394340991973877, "learning_rate": 4.774575140626317e-07, "loss": 44.1685, "step": 82 }, { "epoch": 0.08145240431795878, "grad_norm": 4.454510688781738, "learning_rate": 4.27406068612396e-07, "loss": 44.1542, "step": 83 }, { "epoch": 0.08243375858684986, "grad_norm": 4.497045040130615, "learning_rate": 3.798797596089351e-07, "loss": 44.1375, "step": 84 }, { "epoch": 0.08341511285574092, "grad_norm": 4.434609889984131, "learning_rate": 3.3493649053890325e-07, "loss": 44.1345, "step": 85 }, { "epoch": 0.08439646712463199, "grad_norm": 4.5244035720825195, "learning_rate": 2.9263101785268253e-07, "loss": 44.1467, "step": 86 }, { "epoch": 0.08537782139352307, "grad_norm": 4.269883632659912, "learning_rate": 2.53014884252083e-07, "loss": 44.1526, "step": 87 }, { "epoch": 0.08635917566241413, "grad_norm": 4.184451103210449, "learning_rate": 2.1613635589349756e-07, "loss": 44.1822, "step": 88 }, { "epoch": 0.0873405299313052, "grad_norm": 4.249412536621094, "learning_rate": 1.8204036358303173e-07, "loss": 44.1716, "step": 89 }, { "epoch": 0.08832188420019627, "grad_norm": 3.827402114868164, "learning_rate": 1.507684480352292e-07, "loss": 44.1924, "step": 90 }, { "epoch": 0.08930323846908735, "grad_norm": 4.114834785461426, "learning_rate": 1.223587092621162e-07, "loss": 44.2127, "step": 91 }, { "epoch": 0.09028459273797841, "grad_norm": 3.819640636444092, "learning_rate": 9.684576015420277e-08, "loss": 44.2051, "step": 92 }, { "epoch": 0.09126594700686948, "grad_norm": 4.212294578552246, "learning_rate": 7.426068431000883e-08, "loss": 44.1976, "step": 93 }, { "epoch": 0.09224730127576054, "grad_norm": 3.624086618423462, "learning_rate": 5.463099816548578e-08, "loss": 44.2229, "step": 94 }, { "epoch": 0.09322865554465162, "grad_norm": 3.4577605724334717, "learning_rate": 3.798061746947995e-08, "loss": 44.2466, "step": 95 }, { "epoch": 0.09421000981354269, "grad_norm": 3.643979072570801, "learning_rate": 2.4329828146074096e-08, "loss": 44.3109, "step": 96 }, { "epoch": 0.09519136408243375, "grad_norm": 4.863288879394531, "learning_rate": 1.3695261579316776e-08, "loss": 44.1834, "step": 97 }, { "epoch": 0.09617271835132483, "grad_norm": 3.8488268852233887, "learning_rate": 6.089874350439507e-09, "loss": 44.2992, "step": 98 }, { "epoch": 0.0971540726202159, "grad_norm": 4.9372148513793945, "learning_rate": 1.5229324522605949e-09, "loss": 44.3093, "step": 99 }, { "epoch": 0.09813542688910697, "grad_norm": 5.151540756225586, "learning_rate": 0.0, "loss": 44.4666, "step": 100 }, { "epoch": 0.09813542688910697, "eval_loss": 11.038614273071289, "eval_runtime": 5.0993, "eval_samples_per_second": 336.516, "eval_steps_per_second": 42.163, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 15407373090816.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }