{ "best_metric": 6.792733192443848, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.11763321962122103, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023526643924244207, "eval_loss": 6.933060169219971, "eval_runtime": 3.2084, "eval_samples_per_second": 557.906, "eval_steps_per_second": 139.632, "step": 1 }, { "epoch": 0.0023526643924244206, "grad_norm": 0.6924540996551514, "learning_rate": 4.2800000000000004e-05, "loss": 6.9289, "step": 10 }, { "epoch": 0.004705328784848841, "grad_norm": 0.6077510118484497, "learning_rate": 8.560000000000001e-05, "loss": 6.9322, "step": 20 }, { "epoch": 0.007057993177273262, "grad_norm": 0.5621480941772461, "learning_rate": 0.0001284, "loss": 6.9184, "step": 30 }, { "epoch": 0.009410657569697682, "grad_norm": 0.8597236275672913, "learning_rate": 0.00017120000000000001, "loss": 6.8977, "step": 40 }, { "epoch": 0.011763321962122103, "grad_norm": 0.7145811319351196, "learning_rate": 0.000214, "loss": 6.8562, "step": 50 }, { "epoch": 0.011763321962122103, "eval_loss": 6.864169597625732, "eval_runtime": 3.1888, "eval_samples_per_second": 561.335, "eval_steps_per_second": 140.491, "step": 50 }, { "epoch": 0.014115986354546525, "grad_norm": 0.49429962038993835, "learning_rate": 0.00021373935337780118, "loss": 6.8976, "step": 60 }, { "epoch": 0.016468650746970945, "grad_norm": 0.4391353726387024, "learning_rate": 0.00021295868335534802, "loss": 6.8643, "step": 70 }, { "epoch": 0.018821315139395365, "grad_norm": 0.4785984754562378, "learning_rate": 0.0002116617932785172, "loss": 6.8459, "step": 80 }, { "epoch": 0.021173979531819785, "grad_norm": 0.5224839448928833, "learning_rate": 0.00020985500146540012, "loss": 6.8126, "step": 90 }, { "epoch": 0.023526643924244205, "grad_norm": 0.7273280024528503, "learning_rate": 0.0002075471104240922, "loss": 6.8084, "step": 100 }, { "epoch": 0.023526643924244205, "eval_loss": 6.832787036895752, "eval_runtime": 3.2348, "eval_samples_per_second": 553.365, "eval_steps_per_second": 138.496, "step": 100 }, { "epoch": 0.025879308316668626, "grad_norm": 0.907507598400116, "learning_rate": 0.00020474936396775828, "loss": 6.8848, "step": 110 }, { "epoch": 0.02823197270909305, "grad_norm": 0.47431233525276184, "learning_rate": 0.00020147539243590517, "loss": 6.8506, "step": 120 }, { "epoch": 0.03058463710151747, "grad_norm": 0.5569249391555786, "learning_rate": 0.00019774114628873756, "loss": 6.8298, "step": 130 }, { "epoch": 0.03293730149394189, "grad_norm": 0.5526637434959412, "learning_rate": 0.00019356481839811937, "loss": 6.7974, "step": 140 }, { "epoch": 0.03528996588636631, "grad_norm": 0.7836293578147888, "learning_rate": 0.00018896675541373064, "loss": 6.7893, "step": 150 }, { "epoch": 0.03528996588636631, "eval_loss": 6.818791389465332, "eval_runtime": 3.2485, "eval_samples_per_second": 551.016, "eval_steps_per_second": 137.908, "step": 150 }, { "epoch": 0.03764263027879073, "grad_norm": 0.6871911883354187, "learning_rate": 0.00018396935863623567, "loss": 6.8675, "step": 160 }, { "epoch": 0.039995294671215154, "grad_norm": 0.5481328964233398, "learning_rate": 0.00017859697488039784, "loss": 6.8491, "step": 170 }, { "epoch": 0.04234795906363957, "grad_norm": 0.5421447157859802, "learning_rate": 0.00017287577785984542, "loss": 6.8076, "step": 180 }, { "epoch": 0.044700623456063994, "grad_norm": 0.5816627740859985, "learning_rate": 0.0001668336406713699, "loss": 6.7903, "step": 190 }, { "epoch": 
0.04705328784848841, "grad_norm": 0.8540289402008057, "learning_rate": 0.0001605, "loss": 6.7782, "step": 200 }, { "epoch": 0.04705328784848841, "eval_loss": 6.80977725982666, "eval_runtime": 3.211, "eval_samples_per_second": 557.461, "eval_steps_per_second": 139.521, "step": 200 }, { "epoch": 0.049405952240912834, "grad_norm": 0.6835295557975769, "learning_rate": 0.00015390571270643128, "loss": 6.8697, "step": 210 }, { "epoch": 0.05175861663333725, "grad_norm": 0.723054826259613, "learning_rate": 0.0001470829054955026, "loss": 6.8365, "step": 220 }, { "epoch": 0.054111281025761675, "grad_norm": 0.6531696319580078, "learning_rate": 0.00014006481839811937, "loss": 6.8019, "step": 230 }, { "epoch": 0.0564639454181861, "grad_norm": 0.5882543921470642, "learning_rate": 0.00013288564282916442, "loss": 6.7801, "step": 240 }, { "epoch": 0.058816609810610515, "grad_norm": 0.7119730710983276, "learning_rate": 0.00012558035501036158, "loss": 6.7837, "step": 250 }, { "epoch": 0.058816609810610515, "eval_loss": 6.803854465484619, "eval_runtime": 3.3022, "eval_samples_per_second": 542.059, "eval_steps_per_second": 135.666, "step": 250 }, { "epoch": 0.06116927420303494, "grad_norm": 0.7334095239639282, "learning_rate": 0.00011818454556963892, "loss": 6.8623, "step": 260 }, { "epoch": 0.06352193859545936, "grad_norm": 0.4964336156845093, "learning_rate": 0.00011073424614716762, "loss": 6.8151, "step": 270 }, { "epoch": 0.06587460298788378, "grad_norm": 0.6035408973693848, "learning_rate": 0.00010326575385283242, "loss": 6.8015, "step": 280 }, { "epoch": 0.0682272673803082, "grad_norm": 0.723812460899353, "learning_rate": 9.58154544303611e-05, "loss": 6.7626, "step": 290 }, { "epoch": 0.07057993177273263, "grad_norm": 1.4514328241348267, "learning_rate": 8.841964498963846e-05, "loss": 6.7715, "step": 300 }, { "epoch": 0.07057993177273263, "eval_loss": 6.798593044281006, "eval_runtime": 3.2084, "eval_samples_per_second": 557.906, "eval_steps_per_second": 139.632, "step": 300 }, { "epoch": 0.07293259616515704, "grad_norm": 0.9387493133544922, "learning_rate": 8.111435717083556e-05, "loss": 6.8594, "step": 310 }, { "epoch": 0.07528526055758146, "grad_norm": 0.4966665506362915, "learning_rate": 7.393518160188063e-05, "loss": 6.8124, "step": 320 }, { "epoch": 0.07763792495000588, "grad_norm": 0.5690203905105591, "learning_rate": 6.69170945044974e-05, "loss": 6.7967, "step": 330 }, { "epoch": 0.07999058934243031, "grad_norm": 0.5846540927886963, "learning_rate": 6.009428729356871e-05, "loss": 6.788, "step": 340 }, { "epoch": 0.08234325373485472, "grad_norm": 0.8786140084266663, "learning_rate": 5.3500000000000026e-05, "loss": 6.7735, "step": 350 }, { "epoch": 0.08234325373485472, "eval_loss": 6.795702934265137, "eval_runtime": 3.2583, "eval_samples_per_second": 549.37, "eval_steps_per_second": 137.496, "step": 350 }, { "epoch": 0.08469591812727914, "grad_norm": 0.5708802342414856, "learning_rate": 4.7166359328630106e-05, "loss": 6.8333, "step": 360 }, { "epoch": 0.08704858251970357, "grad_norm": 0.5529999136924744, "learning_rate": 4.112422214015456e-05, "loss": 6.8258, "step": 370 }, { "epoch": 0.08940124691212799, "grad_norm": 0.6375375390052795, "learning_rate": 3.5403025119602206e-05, "loss": 6.7989, "step": 380 }, { "epoch": 0.0917539113045524, "grad_norm": 0.974999189376831, "learning_rate": 3.0030641363764346e-05, "loss": 6.751, "step": 390 }, { "epoch": 0.09410657569697682, "grad_norm": 1.1716803312301636, "learning_rate": 2.5033244586269365e-05, "loss": 6.7699, "step": 400 }, { "epoch": 
0.09410657569697682, "eval_loss": 6.793699741363525, "eval_runtime": 3.2263, "eval_samples_per_second": 554.821, "eval_steps_per_second": 138.86, "step": 400 }, { "epoch": 0.09645924008940125, "grad_norm": 0.6257851719856262, "learning_rate": 2.0435181601880635e-05, "loss": 6.8415, "step": 410 }, { "epoch": 0.09881190448182567, "grad_norm": 0.5798721313476562, "learning_rate": 1.625885371126242e-05, "loss": 6.825, "step": 420 }, { "epoch": 0.10116456887425009, "grad_norm": 0.5897112488746643, "learning_rate": 1.2524607564094813e-05, "loss": 6.797, "step": 430 }, { "epoch": 0.1035172332666745, "grad_norm": 0.7111787796020508, "learning_rate": 9.250636032241695e-06, "loss": 6.772, "step": 440 }, { "epoch": 0.10586989765909893, "grad_norm": 1.2426642179489136, "learning_rate": 6.45288957590781e-06, "loss": 6.7647, "step": 450 }, { "epoch": 0.10586989765909893, "eval_loss": 6.792733192443848, "eval_runtime": 3.2252, "eval_samples_per_second": 555.008, "eval_steps_per_second": 138.907, "step": 450 }, { "epoch": 0.10822256205152335, "grad_norm": 0.5731855630874634, "learning_rate": 4.144998534599878e-06, "loss": 6.8432, "step": 460 }, { "epoch": 0.11057522644394777, "grad_norm": 0.7041200399398804, "learning_rate": 2.3382067214827915e-06, "loss": 6.8216, "step": 470 }, { "epoch": 0.1129278908363722, "grad_norm": 0.7549869418144226, "learning_rate": 1.0413166446519713e-06, "loss": 6.7779, "step": 480 }, { "epoch": 0.11528055522879661, "grad_norm": 0.7202994227409363, "learning_rate": 2.6064662219881083e-07, "loss": 6.7666, "step": 490 }, { "epoch": 0.11763321962122103, "grad_norm": 1.0722975730895996, "learning_rate": 0.0, "loss": 6.7527, "step": 500 }, { "epoch": 0.11763321962122103, "eval_loss": 6.792914867401123, "eval_runtime": 3.2274, "eval_samples_per_second": 554.628, "eval_steps_per_second": 138.812, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1201410048000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }