{ "best_metric": 2.0367350578308105, "best_model_checkpoint": "/home/sr2464/scratch/C2S_Files/C2S_training_runs/multicell_v2_pretraining_runs/finetuning-EleutherAI/pythia-410m-multicell_v2_pretraining-2024-07-22_18-32-23/checkpoint-15600", "epoch": 0.030125854619963952, "eval_steps": 100, "global_step": 15600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019311445269207662, "grad_norm": 25.695459365844727, "learning_rate": 9.655678504528514e-08, "loss": 5.0491, "step": 100 }, { "epoch": 0.00019311445269207662, "eval_loss": 4.909114360809326, "eval_runtime": 453.5777, "eval_samples_per_second": 4.414, "eval_steps_per_second": 1.473, "step": 100 }, { "epoch": 0.00038622890538415324, "grad_norm": 9.293487548828125, "learning_rate": 1.931135700905703e-07, "loss": 4.7166, "step": 200 }, { "epoch": 0.00038622890538415324, "eval_loss": 4.411260604858398, "eval_runtime": 451.7078, "eval_samples_per_second": 4.432, "eval_steps_per_second": 1.479, "step": 200 }, { "epoch": 0.0005793433580762299, "grad_norm": 7.290333271026611, "learning_rate": 2.896703551358554e-07, "loss": 4.2079, "step": 300 }, { "epoch": 0.0005793433580762299, "eval_loss": 3.940201759338379, "eval_runtime": 451.9244, "eval_samples_per_second": 4.43, "eval_steps_per_second": 1.478, "step": 300 }, { "epoch": 0.0007724578107683065, "grad_norm": 4.443881511688232, "learning_rate": 3.862271401811406e-07, "loss": 3.8061, "step": 400 }, { "epoch": 0.0007724578107683065, "eval_loss": 3.649780511856079, "eval_runtime": 453.8369, "eval_samples_per_second": 4.411, "eval_steps_per_second": 1.472, "step": 400 }, { "epoch": 0.0009655722634603831, "grad_norm": 4.046736717224121, "learning_rate": 4.827839252264257e-07, "loss": 3.5886, "step": 500 }, { "epoch": 0.0009655722634603831, "eval_loss": 3.4819984436035156, "eval_runtime": 454.1608, "eval_samples_per_second": 4.408, "eval_steps_per_second": 1.471, "step": 500 }, { "epoch": 0.0011586867161524597, "grad_norm": 4.199274063110352, "learning_rate": 5.793407102717108e-07, "loss": 3.4235, "step": 600 }, { "epoch": 0.0011586867161524597, "eval_loss": 3.3766062259674072, "eval_runtime": 451.2045, "eval_samples_per_second": 4.437, "eval_steps_per_second": 1.48, "step": 600 }, { "epoch": 0.0013518011688445364, "grad_norm": 3.957104444503784, "learning_rate": 6.75897495316996e-07, "loss": 3.3368, "step": 700 }, { "epoch": 0.0013518011688445364, "eval_loss": 3.298933982849121, "eval_runtime": 452.0648, "eval_samples_per_second": 4.429, "eval_steps_per_second": 1.478, "step": 700 }, { "epoch": 0.001544915621536613, "grad_norm": 3.570394992828369, "learning_rate": 7.724542803622812e-07, "loss": 3.258, "step": 800 }, { "epoch": 0.001544915621536613, "eval_loss": 3.2292919158935547, "eval_runtime": 455.0475, "eval_samples_per_second": 4.4, "eval_steps_per_second": 1.468, "step": 800 }, { "epoch": 0.0017380300742286896, "grad_norm": 2.5731029510498047, "learning_rate": 8.690110654075663e-07, "loss": 3.1963, "step": 900 }, { "epoch": 0.0017380300742286896, "eval_loss": 3.1713345050811768, "eval_runtime": 451.7557, "eval_samples_per_second": 4.432, "eval_steps_per_second": 1.479, "step": 900 }, { "epoch": 0.0019311445269207662, "grad_norm": 4.118990898132324, "learning_rate": 9.655678504528513e-07, "loss": 3.146, "step": 1000 }, { "epoch": 0.0019311445269207662, "eval_loss": 3.1244723796844482, "eval_runtime": 445.9466, "eval_samples_per_second": 4.489, "eval_steps_per_second": 1.498, "step": 1000 }, { "epoch": 0.002124258979612843, "grad_norm": 3.6328647136688232, "learning_rate": 1.0621246354981365e-06, "loss": 3.1092, "step": 1100 }, { "epoch": 0.002124258979612843, "eval_loss": 3.0821051597595215, "eval_runtime": 445.7792, "eval_samples_per_second": 4.491, "eval_steps_per_second": 1.498, "step": 1100 }, { "epoch": 0.0023173734323049195, "grad_norm": 3.1631813049316406, "learning_rate": 1.1586814205434216e-06, "loss": 3.0401, "step": 1200 }, { "epoch": 0.0023173734323049195, "eval_loss": 3.04966402053833, "eval_runtime": 445.2862, "eval_samples_per_second": 4.496, "eval_steps_per_second": 1.5, "step": 1200 }, { "epoch": 0.0025104878849969963, "grad_norm": 3.6014091968536377, "learning_rate": 1.2552382055887068e-06, "loss": 3.0162, "step": 1300 }, { "epoch": 0.0025104878849969963, "eval_loss": 3.0158376693725586, "eval_runtime": 445.2448, "eval_samples_per_second": 4.496, "eval_steps_per_second": 1.5, "step": 1300 }, { "epoch": 0.0027036023376890727, "grad_norm": 3.6174280643463135, "learning_rate": 1.351794990633992e-06, "loss": 2.9725, "step": 1400 }, { "epoch": 0.0027036023376890727, "eval_loss": 2.979250431060791, "eval_runtime": 448.3628, "eval_samples_per_second": 4.465, "eval_steps_per_second": 1.49, "step": 1400 }, { "epoch": 0.0028967167903811496, "grad_norm": 4.053787708282471, "learning_rate": 1.448351775679277e-06, "loss": 2.9443, "step": 1500 }, { "epoch": 0.0028967167903811496, "eval_loss": 2.938361883163452, "eval_runtime": 445.077, "eval_samples_per_second": 4.498, "eval_steps_per_second": 1.501, "step": 1500 }, { "epoch": 0.003089831243073226, "grad_norm": 4.192756652832031, "learning_rate": 1.5449085607245623e-06, "loss": 2.9164, "step": 1600 }, { "epoch": 0.003089831243073226, "eval_loss": 2.89266037940979, "eval_runtime": 474.6898, "eval_samples_per_second": 4.217, "eval_steps_per_second": 1.407, "step": 1600 }, { "epoch": 0.003282945695765303, "grad_norm": 4.469438076019287, "learning_rate": 1.6414653457698475e-06, "loss": 2.8643, "step": 1700 }, { "epoch": 0.003282945695765303, "eval_loss": 2.851994752883911, "eval_runtime": 470.3677, "eval_samples_per_second": 4.256, "eval_steps_per_second": 1.42, "step": 1700 }, { "epoch": 0.003476060148457379, "grad_norm": 4.589786052703857, "learning_rate": 1.7380221308151326e-06, "loss": 2.8158, "step": 1800 }, { "epoch": 0.003476060148457379, "eval_loss": 2.8121631145477295, "eval_runtime": 474.8649, "eval_samples_per_second": 4.216, "eval_steps_per_second": 1.407, "step": 1800 }, { "epoch": 0.003669174601149456, "grad_norm": 4.862005710601807, "learning_rate": 1.8345789158604178e-06, "loss": 2.7741, "step": 1900 }, { "epoch": 0.003669174601149456, "eval_loss": 2.767763614654541, "eval_runtime": 451.5831, "eval_samples_per_second": 4.433, "eval_steps_per_second": 1.479, "step": 1900 }, { "epoch": 0.0038622890538415324, "grad_norm": 8.818233489990234, "learning_rate": 1.9311357009057026e-06, "loss": 2.7214, "step": 2000 }, { "epoch": 0.0038622890538415324, "eval_loss": 2.727350950241089, "eval_runtime": 450.1021, "eval_samples_per_second": 4.448, "eval_steps_per_second": 1.484, "step": 2000 }, { "epoch": 0.004055403506533609, "grad_norm": 6.361968040466309, "learning_rate": 2.027692485950988e-06, "loss": 2.6898, "step": 2100 }, { "epoch": 0.004055403506533609, "eval_loss": 2.6967461109161377, "eval_runtime": 455.1751, "eval_samples_per_second": 4.398, "eval_steps_per_second": 1.468, "step": 2100 }, { "epoch": 0.004248517959225686, "grad_norm": 3.338510513305664, "learning_rate": 2.124249270996273e-06, "loss": 2.6505, "step": 2200 }, { "epoch": 0.004248517959225686, "eval_loss": 2.6587347984313965, "eval_runtime": 454.0116, "eval_samples_per_second": 4.41, "eval_steps_per_second": 1.471, "step": 2200 }, { "epoch": 0.004441632411917762, "grad_norm": 3.8161659240722656, "learning_rate": 2.2208060560415583e-06, "loss": 2.6135, "step": 2300 }, { "epoch": 0.004441632411917762, "eval_loss": 2.6250338554382324, "eval_runtime": 455.8437, "eval_samples_per_second": 4.392, "eval_steps_per_second": 1.465, "step": 2300 }, { "epoch": 0.004634746864609839, "grad_norm": 4.693429946899414, "learning_rate": 2.317362841086843e-06, "loss": 2.5805, "step": 2400 }, { "epoch": 0.004634746864609839, "eval_loss": 2.5991430282592773, "eval_runtime": 450.2467, "eval_samples_per_second": 4.446, "eval_steps_per_second": 1.484, "step": 2400 }, { "epoch": 0.004827861317301916, "grad_norm": 3.4596662521362305, "learning_rate": 2.4139196261321284e-06, "loss": 2.5585, "step": 2500 }, { "epoch": 0.004827861317301916, "eval_loss": 2.563077926635742, "eval_runtime": 454.4394, "eval_samples_per_second": 4.405, "eval_steps_per_second": 1.47, "step": 2500 }, { "epoch": 0.005020975769993993, "grad_norm": 5.340704917907715, "learning_rate": 2.5104764111774136e-06, "loss": 2.5134, "step": 2600 }, { "epoch": 0.005020975769993993, "eval_loss": 2.5388693809509277, "eval_runtime": 454.2794, "eval_samples_per_second": 4.407, "eval_steps_per_second": 1.47, "step": 2600 }, { "epoch": 0.005214090222686069, "grad_norm": 3.7562334537506104, "learning_rate": 2.6070331962226984e-06, "loss": 2.4956, "step": 2700 }, { "epoch": 0.005214090222686069, "eval_loss": 2.5175445079803467, "eval_runtime": 454.0268, "eval_samples_per_second": 4.409, "eval_steps_per_second": 1.471, "step": 2700 }, { "epoch": 0.005407204675378145, "grad_norm": 4.657394886016846, "learning_rate": 2.703589981267984e-06, "loss": 2.484, "step": 2800 }, { "epoch": 0.005407204675378145, "eval_loss": 2.493380069732666, "eval_runtime": 451.8282, "eval_samples_per_second": 4.431, "eval_steps_per_second": 1.478, "step": 2800 }, { "epoch": 0.005600319128070222, "grad_norm": 4.6835150718688965, "learning_rate": 2.800146766313269e-06, "loss": 2.4705, "step": 2900 }, { "epoch": 0.005600319128070222, "eval_loss": 2.4784457683563232, "eval_runtime": 451.6775, "eval_samples_per_second": 4.432, "eval_steps_per_second": 1.479, "step": 2900 }, { "epoch": 0.005793433580762299, "grad_norm": 5.81275749206543, "learning_rate": 2.896703551358554e-06, "loss": 2.4405, "step": 3000 }, { "epoch": 0.005793433580762299, "eval_loss": 2.4617481231689453, "eval_runtime": 453.6192, "eval_samples_per_second": 4.413, "eval_steps_per_second": 1.473, "step": 3000 }, { "epoch": 0.005986548033454375, "grad_norm": 3.0662126541137695, "learning_rate": 2.9932603364038394e-06, "loss": 2.427, "step": 3100 }, { "epoch": 0.005986548033454375, "eval_loss": 2.448239326477051, "eval_runtime": 451.3577, "eval_samples_per_second": 4.436, "eval_steps_per_second": 1.48, "step": 3100 }, { "epoch": 0.006179662486146452, "grad_norm": 7.125177383422852, "learning_rate": 3.0898171214491246e-06, "loss": 2.41, "step": 3200 }, { "epoch": 0.006179662486146452, "eval_loss": 2.435983896255493, "eval_runtime": 451.9197, "eval_samples_per_second": 4.43, "eval_steps_per_second": 1.478, "step": 3200 }, { "epoch": 0.006372776938838529, "grad_norm": 5.4418439865112305, "learning_rate": 3.1863739064944094e-06, "loss": 2.401, "step": 3300 }, { "epoch": 0.006372776938838529, "eval_loss": 2.42331862449646, "eval_runtime": 449.2333, "eval_samples_per_second": 4.456, "eval_steps_per_second": 1.487, "step": 3300 }, { "epoch": 0.006565891391530606, "grad_norm": 10.877897262573242, "learning_rate": 3.282930691539695e-06, "loss": 2.3891, "step": 3400 }, { "epoch": 0.006565891391530606, "eval_loss": 2.417658567428589, "eval_runtime": 453.4294, "eval_samples_per_second": 4.415, "eval_steps_per_second": 1.473, "step": 3400 }, { "epoch": 0.006759005844222682, "grad_norm": 5.651215076446533, "learning_rate": 3.37948747658498e-06, "loss": 2.3852, "step": 3500 }, { "epoch": 0.006759005844222682, "eval_loss": 2.4044547080993652, "eval_runtime": 456.5451, "eval_samples_per_second": 4.385, "eval_steps_per_second": 1.463, "step": 3500 }, { "epoch": 0.006952120296914758, "grad_norm": 4.415964126586914, "learning_rate": 3.476044261630265e-06, "loss": 2.3623, "step": 3600 }, { "epoch": 0.006952120296914758, "eval_loss": 2.3921079635620117, "eval_runtime": 455.241, "eval_samples_per_second": 4.398, "eval_steps_per_second": 1.467, "step": 3600 }, { "epoch": 0.007145234749606835, "grad_norm": 4.4306793212890625, "learning_rate": 3.57260104667555e-06, "loss": 2.3519, "step": 3700 }, { "epoch": 0.007145234749606835, "eval_loss": 2.3823041915893555, "eval_runtime": 453.1329, "eval_samples_per_second": 4.418, "eval_steps_per_second": 1.474, "step": 3700 }, { "epoch": 0.007338349202298912, "grad_norm": 4.322727203369141, "learning_rate": 3.6691578317208356e-06, "loss": 2.3449, "step": 3800 }, { "epoch": 0.007338349202298912, "eval_loss": 2.3733458518981934, "eval_runtime": 452.5517, "eval_samples_per_second": 4.424, "eval_steps_per_second": 1.476, "step": 3800 }, { "epoch": 0.007531463654990988, "grad_norm": 3.902733325958252, "learning_rate": 3.76571461676612e-06, "loss": 2.3425, "step": 3900 }, { "epoch": 0.007531463654990988, "eval_loss": 2.3656857013702393, "eval_runtime": 470.1556, "eval_samples_per_second": 4.258, "eval_steps_per_second": 1.421, "step": 3900 }, { "epoch": 0.007724578107683065, "grad_norm": 8.922844886779785, "learning_rate": 3.862271401811405e-06, "loss": 2.3372, "step": 4000 }, { "epoch": 0.007724578107683065, "eval_loss": 2.3647260665893555, "eval_runtime": 473.4275, "eval_samples_per_second": 4.229, "eval_steps_per_second": 1.411, "step": 4000 }, { "epoch": 0.00791769256037514, "grad_norm": 5.44854736328125, "learning_rate": 3.9588281868566905e-06, "loss": 2.3265, "step": 4100 }, { "epoch": 0.00791769256037514, "eval_loss": 2.354055643081665, "eval_runtime": 469.6055, "eval_samples_per_second": 4.263, "eval_steps_per_second": 1.422, "step": 4100 }, { "epoch": 0.008110807013067219, "grad_norm": 3.362586736679077, "learning_rate": 4.055384971901976e-06, "loss": 2.3184, "step": 4200 }, { "epoch": 0.008110807013067219, "eval_loss": 2.3412978649139404, "eval_runtime": 475.6279, "eval_samples_per_second": 4.209, "eval_steps_per_second": 1.404, "step": 4200 }, { "epoch": 0.008303921465759295, "grad_norm": 2.9980242252349854, "learning_rate": 4.151941756947261e-06, "loss": 2.3133, "step": 4300 }, { "epoch": 0.008303921465759295, "eval_loss": 2.3344852924346924, "eval_runtime": 471.0361, "eval_samples_per_second": 4.25, "eval_steps_per_second": 1.418, "step": 4300 }, { "epoch": 0.008497035918451372, "grad_norm": 2.9420857429504395, "learning_rate": 4.248498541992546e-06, "loss": 2.298, "step": 4400 }, { "epoch": 0.008497035918451372, "eval_loss": 2.3255553245544434, "eval_runtime": 469.2896, "eval_samples_per_second": 4.266, "eval_steps_per_second": 1.423, "step": 4400 }, { "epoch": 0.008690150371143448, "grad_norm": 3.5292468070983887, "learning_rate": 4.345055327037831e-06, "loss": 2.2892, "step": 4500 }, { "epoch": 0.008690150371143448, "eval_loss": 2.3173961639404297, "eval_runtime": 446.9797, "eval_samples_per_second": 4.479, "eval_steps_per_second": 1.494, "step": 4500 }, { "epoch": 0.008883264823835524, "grad_norm": 3.7100796699523926, "learning_rate": 4.441612112083117e-06, "loss": 2.2829, "step": 4600 }, { "epoch": 0.008883264823835524, "eval_loss": 2.313410520553589, "eval_runtime": 443.1163, "eval_samples_per_second": 4.518, "eval_steps_per_second": 1.508, "step": 4600 }, { "epoch": 0.009076379276527602, "grad_norm": 3.060458183288574, "learning_rate": 4.538168897128402e-06, "loss": 2.2622, "step": 4700 }, { "epoch": 0.009076379276527602, "eval_loss": 2.303041934967041, "eval_runtime": 515.3629, "eval_samples_per_second": 3.885, "eval_steps_per_second": 1.296, "step": 4700 }, { "epoch": 0.009269493729219678, "grad_norm": 2.960869312286377, "learning_rate": 4.634725682173686e-06, "loss": 2.2691, "step": 4800 }, { "epoch": 0.009269493729219678, "eval_loss": 2.2965316772460938, "eval_runtime": 519.7712, "eval_samples_per_second": 3.852, "eval_steps_per_second": 1.285, "step": 4800 }, { "epoch": 0.009462608181911754, "grad_norm": 4.1272993087768555, "learning_rate": 4.7312824672189715e-06, "loss": 2.2662, "step": 4900 }, { "epoch": 0.009462608181911754, "eval_loss": 2.2936575412750244, "eval_runtime": 517.9358, "eval_samples_per_second": 3.865, "eval_steps_per_second": 1.29, "step": 4900 }, { "epoch": 0.009655722634603832, "grad_norm": 3.13708758354187, "learning_rate": 4.827839252264257e-06, "loss": 2.2566, "step": 5000 }, { "epoch": 0.009655722634603832, "eval_loss": 2.282074451446533, "eval_runtime": 518.3534, "eval_samples_per_second": 3.862, "eval_steps_per_second": 1.289, "step": 5000 }, { "epoch": 0.009848837087295908, "grad_norm": 6.472213268280029, "learning_rate": 4.924396037309542e-06, "loss": 2.2491, "step": 5100 }, { "epoch": 0.009848837087295908, "eval_loss": 2.2818562984466553, "eval_runtime": 516.8354, "eval_samples_per_second": 3.874, "eval_steps_per_second": 1.292, "step": 5100 }, { "epoch": 0.010041951539987985, "grad_norm": 3.39032244682312, "learning_rate": 5.020952822354827e-06, "loss": 2.2439, "step": 5200 }, { "epoch": 0.010041951539987985, "eval_loss": 2.2728891372680664, "eval_runtime": 519.2594, "eval_samples_per_second": 3.855, "eval_steps_per_second": 1.286, "step": 5200 }, { "epoch": 0.010235065992680061, "grad_norm": 2.733482837677002, "learning_rate": 5.1175096074001125e-06, "loss": 2.2377, "step": 5300 }, { "epoch": 0.010235065992680061, "eval_loss": 2.266763687133789, "eval_runtime": 516.9841, "eval_samples_per_second": 3.872, "eval_steps_per_second": 1.292, "step": 5300 }, { "epoch": 0.010428180445372137, "grad_norm": 3.014375686645508, "learning_rate": 5.214066392445397e-06, "loss": 2.229, "step": 5400 }, { "epoch": 0.010428180445372137, "eval_loss": 2.2636349201202393, "eval_runtime": 518.5144, "eval_samples_per_second": 3.861, "eval_steps_per_second": 1.288, "step": 5400 }, { "epoch": 0.010621294898064215, "grad_norm": 3.083723306655884, "learning_rate": 5.310623177490683e-06, "loss": 2.2328, "step": 5500 }, { "epoch": 0.010621294898064215, "eval_loss": 2.258470058441162, "eval_runtime": 516.2515, "eval_samples_per_second": 3.878, "eval_steps_per_second": 1.294, "step": 5500 }, { "epoch": 0.01081440935075629, "grad_norm": 2.823312997817993, "learning_rate": 5.407179962535968e-06, "loss": 2.2203, "step": 5600 }, { "epoch": 0.01081440935075629, "eval_loss": 2.2555058002471924, "eval_runtime": 517.1852, "eval_samples_per_second": 3.871, "eval_steps_per_second": 1.292, "step": 5600 }, { "epoch": 0.011007523803448367, "grad_norm": 2.4301838874816895, "learning_rate": 5.5037367475812526e-06, "loss": 2.219, "step": 5700 }, { "epoch": 0.011007523803448367, "eval_loss": 2.2486064434051514, "eval_runtime": 519.4056, "eval_samples_per_second": 3.854, "eval_steps_per_second": 1.286, "step": 5700 }, { "epoch": 0.011200638256140445, "grad_norm": 2.8203349113464355, "learning_rate": 5.600293532626538e-06, "loss": 2.2194, "step": 5800 }, { "epoch": 0.011200638256140445, "eval_loss": 2.2447359561920166, "eval_runtime": 517.18, "eval_samples_per_second": 3.871, "eval_steps_per_second": 1.292, "step": 5800 }, { "epoch": 0.01139375270883252, "grad_norm": 3.018662214279175, "learning_rate": 5.696850317671823e-06, "loss": 2.216, "step": 5900 }, { "epoch": 0.01139375270883252, "eval_loss": 2.2398221492767334, "eval_runtime": 526.6953, "eval_samples_per_second": 3.801, "eval_steps_per_second": 1.268, "step": 5900 }, { "epoch": 0.011586867161524598, "grad_norm": 3.5099599361419678, "learning_rate": 5.793407102717108e-06, "loss": 2.2066, "step": 6000 }, { "epoch": 0.011586867161524598, "eval_loss": 2.2389719486236572, "eval_runtime": 454.8275, "eval_samples_per_second": 4.402, "eval_steps_per_second": 1.469, "step": 6000 }, { "epoch": 0.011779981614216674, "grad_norm": 2.642719268798828, "learning_rate": 5.8899638877623935e-06, "loss": 2.199, "step": 6100 }, { "epoch": 0.011779981614216674, "eval_loss": 2.2292520999908447, "eval_runtime": 453.5237, "eval_samples_per_second": 4.414, "eval_steps_per_second": 1.473, "step": 6100 }, { "epoch": 0.01197309606690875, "grad_norm": 2.63216233253479, "learning_rate": 5.986520672807679e-06, "loss": 2.1942, "step": 6200 }, { "epoch": 0.01197309606690875, "eval_loss": 2.224433183670044, "eval_runtime": 454.9765, "eval_samples_per_second": 4.4, "eval_steps_per_second": 1.468, "step": 6200 }, { "epoch": 0.012166210519600828, "grad_norm": 3.0821280479431152, "learning_rate": 6.083077457852963e-06, "loss": 2.1926, "step": 6300 }, { "epoch": 0.012166210519600828, "eval_loss": 2.2213406562805176, "eval_runtime": 455.9375, "eval_samples_per_second": 4.391, "eval_steps_per_second": 1.465, "step": 6300 }, { "epoch": 0.012359324972292904, "grad_norm": 3.2778172492980957, "learning_rate": 6.179634242898249e-06, "loss": 2.1771, "step": 6400 }, { "epoch": 0.012359324972292904, "eval_loss": 2.221287488937378, "eval_runtime": 451.8099, "eval_samples_per_second": 4.431, "eval_steps_per_second": 1.478, "step": 6400 }, { "epoch": 0.01255243942498498, "grad_norm": 1.9902693033218384, "learning_rate": 6.2761910279435345e-06, "loss": 2.1654, "step": 6500 }, { "epoch": 0.01255243942498498, "eval_loss": 2.2130887508392334, "eval_runtime": 452.4518, "eval_samples_per_second": 4.425, "eval_steps_per_second": 1.476, "step": 6500 }, { "epoch": 0.012745553877677058, "grad_norm": 3.490743637084961, "learning_rate": 6.372747812988819e-06, "loss": 2.1891, "step": 6600 }, { "epoch": 0.012745553877677058, "eval_loss": 2.210174083709717, "eval_runtime": 481.3251, "eval_samples_per_second": 4.159, "eval_steps_per_second": 1.388, "step": 6600 }, { "epoch": 0.012938668330369133, "grad_norm": 2.3788084983825684, "learning_rate": 6.469304598034104e-06, "loss": 2.1834, "step": 6700 }, { "epoch": 0.012938668330369133, "eval_loss": 2.2062604427337646, "eval_runtime": 452.5564, "eval_samples_per_second": 4.424, "eval_steps_per_second": 1.476, "step": 6700 }, { "epoch": 0.013131782783061211, "grad_norm": 2.141946792602539, "learning_rate": 6.56586138307939e-06, "loss": 2.1605, "step": 6800 }, { "epoch": 0.013131782783061211, "eval_loss": 2.2010421752929688, "eval_runtime": 455.2341, "eval_samples_per_second": 4.398, "eval_steps_per_second": 1.467, "step": 6800 }, { "epoch": 0.013324897235753287, "grad_norm": 2.3113036155700684, "learning_rate": 6.6624181681246746e-06, "loss": 2.1643, "step": 6900 }, { "epoch": 0.013324897235753287, "eval_loss": 2.2002882957458496, "eval_runtime": 454.153, "eval_samples_per_second": 4.408, "eval_steps_per_second": 1.471, "step": 6900 }, { "epoch": 0.013518011688445363, "grad_norm": 2.2467916011810303, "learning_rate": 6.75897495316996e-06, "loss": 2.1609, "step": 7000 }, { "epoch": 0.013518011688445363, "eval_loss": 2.1954784393310547, "eval_runtime": 455.4356, "eval_samples_per_second": 4.396, "eval_steps_per_second": 1.467, "step": 7000 }, { "epoch": 0.01371112614113744, "grad_norm": 2.992947816848755, "learning_rate": 6.855531738215244e-06, "loss": 2.1617, "step": 7100 }, { "epoch": 0.01371112614113744, "eval_loss": 2.192232608795166, "eval_runtime": 455.4904, "eval_samples_per_second": 4.395, "eval_steps_per_second": 1.467, "step": 7100 }, { "epoch": 0.013904240593829517, "grad_norm": 2.2335407733917236, "learning_rate": 6.95208852326053e-06, "loss": 2.1525, "step": 7200 }, { "epoch": 0.013904240593829517, "eval_loss": 2.1886117458343506, "eval_runtime": 451.074, "eval_samples_per_second": 4.438, "eval_steps_per_second": 1.481, "step": 7200 }, { "epoch": 0.014097355046521593, "grad_norm": 2.345625877380371, "learning_rate": 7.0486453083058155e-06, "loss": 2.15, "step": 7300 }, { "epoch": 0.014097355046521593, "eval_loss": 2.1845381259918213, "eval_runtime": 442.662, "eval_samples_per_second": 4.523, "eval_steps_per_second": 1.509, "step": 7300 }, { "epoch": 0.01429046949921367, "grad_norm": 2.1838791370391846, "learning_rate": 7.1452020933511e-06, "loss": 2.1579, "step": 7400 }, { "epoch": 0.01429046949921367, "eval_loss": 2.1800084114074707, "eval_runtime": 448.4458, "eval_samples_per_second": 4.464, "eval_steps_per_second": 1.49, "step": 7400 }, { "epoch": 0.014483583951905746, "grad_norm": 2.7497525215148926, "learning_rate": 7.241758878396384e-06, "loss": 2.1352, "step": 7500 }, { "epoch": 0.014483583951905746, "eval_loss": 2.178600311279297, "eval_runtime": 449.0785, "eval_samples_per_second": 4.458, "eval_steps_per_second": 1.487, "step": 7500 }, { "epoch": 0.014676698404597824, "grad_norm": 5.904616355895996, "learning_rate": 7.338315663441671e-06, "loss": 2.151, "step": 7600 }, { "epoch": 0.014676698404597824, "eval_loss": 2.197683334350586, "eval_runtime": 449.0498, "eval_samples_per_second": 4.458, "eval_steps_per_second": 1.488, "step": 7600 }, { "epoch": 0.0148698128572899, "grad_norm": 2.8403713703155518, "learning_rate": 7.434872448486956e-06, "loss": 2.1309, "step": 7700 }, { "epoch": 0.0148698128572899, "eval_loss": 2.1773977279663086, "eval_runtime": 451.4103, "eval_samples_per_second": 4.435, "eval_steps_per_second": 1.48, "step": 7700 }, { "epoch": 0.015062927309981976, "grad_norm": 6.5780415534973145, "learning_rate": 7.53142923353224e-06, "loss": 2.1228, "step": 7800 }, { "epoch": 0.015062927309981976, "eval_loss": 2.1752445697784424, "eval_runtime": 450.5777, "eval_samples_per_second": 4.443, "eval_steps_per_second": 1.483, "step": 7800 }, { "epoch": 0.015256041762674054, "grad_norm": 1.9982277154922485, "learning_rate": 7.627986018577526e-06, "loss": 2.1401, "step": 7900 }, { "epoch": 0.015256041762674054, "eval_loss": 2.170668363571167, "eval_runtime": 453.2726, "eval_samples_per_second": 4.417, "eval_steps_per_second": 1.474, "step": 7900 }, { "epoch": 0.01544915621536613, "grad_norm": 2.037261486053467, "learning_rate": 7.72454280362281e-06, "loss": 2.1276, "step": 8000 }, { "epoch": 0.01544915621536613, "eval_loss": 2.166703462600708, "eval_runtime": 451.1428, "eval_samples_per_second": 4.438, "eval_steps_per_second": 1.481, "step": 8000 }, { "epoch": 0.015642270668058206, "grad_norm": 2.6199100017547607, "learning_rate": 7.821099588668096e-06, "loss": 2.1366, "step": 8100 }, { "epoch": 0.015642270668058206, "eval_loss": 2.1716530323028564, "eval_runtime": 450.9985, "eval_samples_per_second": 4.439, "eval_steps_per_second": 1.481, "step": 8100 }, { "epoch": 0.01583538512075028, "grad_norm": 2.100360870361328, "learning_rate": 7.917656373713381e-06, "loss": 2.1219, "step": 8200 }, { "epoch": 0.01583538512075028, "eval_loss": 2.1657602787017822, "eval_runtime": 449.8619, "eval_samples_per_second": 4.45, "eval_steps_per_second": 1.485, "step": 8200 }, { "epoch": 0.01602849957344236, "grad_norm": 2.637589931488037, "learning_rate": 8.014213158758666e-06, "loss": 2.1232, "step": 8300 }, { "epoch": 0.01602849957344236, "eval_loss": 2.1619670391082764, "eval_runtime": 444.6116, "eval_samples_per_second": 4.503, "eval_steps_per_second": 1.502, "step": 8300 }, { "epoch": 0.016221614026134437, "grad_norm": 2.937350273132324, "learning_rate": 8.110769943803951e-06, "loss": 2.1195, "step": 8400 }, { "epoch": 0.016221614026134437, "eval_loss": 2.160137891769409, "eval_runtime": 452.1301, "eval_samples_per_second": 4.428, "eval_steps_per_second": 1.477, "step": 8400 }, { "epoch": 0.016414728478826513, "grad_norm": 2.5765292644500732, "learning_rate": 8.207326728849237e-06, "loss": 2.124, "step": 8500 }, { "epoch": 0.016414728478826513, "eval_loss": 2.1571407318115234, "eval_runtime": 449.1301, "eval_samples_per_second": 4.458, "eval_steps_per_second": 1.487, "step": 8500 }, { "epoch": 0.01660784293151859, "grad_norm": 3.157248020172119, "learning_rate": 8.303883513894522e-06, "loss": 2.1231, "step": 8600 }, { "epoch": 0.01660784293151859, "eval_loss": 2.1576952934265137, "eval_runtime": 450.206, "eval_samples_per_second": 4.447, "eval_steps_per_second": 1.484, "step": 8600 }, { "epoch": 0.016800957384210665, "grad_norm": 2.0448155403137207, "learning_rate": 8.400440298939807e-06, "loss": 2.1095, "step": 8700 }, { "epoch": 0.016800957384210665, "eval_loss": 2.1506128311157227, "eval_runtime": 459.0304, "eval_samples_per_second": 4.361, "eval_steps_per_second": 1.455, "step": 8700 }, { "epoch": 0.016994071836902745, "grad_norm": 2.436048746109009, "learning_rate": 8.496997083985092e-06, "loss": 2.1052, "step": 8800 }, { "epoch": 0.016994071836902745, "eval_loss": 2.1523828506469727, "eval_runtime": 448.3924, "eval_samples_per_second": 4.465, "eval_steps_per_second": 1.49, "step": 8800 }, { "epoch": 0.01718718628959482, "grad_norm": 2.020001173019409, "learning_rate": 8.593553869030378e-06, "loss": 2.1114, "step": 8900 }, { "epoch": 0.01718718628959482, "eval_loss": 2.149531126022339, "eval_runtime": 473.4917, "eval_samples_per_second": 4.228, "eval_steps_per_second": 1.411, "step": 8900 }, { "epoch": 0.017380300742286896, "grad_norm": 1.7989071607589722, "learning_rate": 8.690110654075663e-06, "loss": 2.1113, "step": 9000 }, { "epoch": 0.017380300742286896, "eval_loss": 2.145468235015869, "eval_runtime": 447.8226, "eval_samples_per_second": 4.471, "eval_steps_per_second": 1.492, "step": 9000 }, { "epoch": 0.017573415194978972, "grad_norm": 1.8341110944747925, "learning_rate": 8.786667439120948e-06, "loss": 2.1072, "step": 9100 }, { "epoch": 0.017573415194978972, "eval_loss": 2.1460776329040527, "eval_runtime": 449.2918, "eval_samples_per_second": 4.456, "eval_steps_per_second": 1.487, "step": 9100 }, { "epoch": 0.01776652964767105, "grad_norm": 2.2147250175476074, "learning_rate": 8.883224224166233e-06, "loss": 2.1023, "step": 9200 }, { "epoch": 0.01776652964767105, "eval_loss": 2.144148349761963, "eval_runtime": 449.952, "eval_samples_per_second": 4.449, "eval_steps_per_second": 1.485, "step": 9200 }, { "epoch": 0.017959644100363124, "grad_norm": 1.647185206413269, "learning_rate": 8.979781009211517e-06, "loss": 2.1091, "step": 9300 }, { "epoch": 0.017959644100363124, "eval_loss": 2.1457300186157227, "eval_runtime": 453.9364, "eval_samples_per_second": 4.41, "eval_steps_per_second": 1.472, "step": 9300 }, { "epoch": 0.018152758553055204, "grad_norm": 1.5562564134597778, "learning_rate": 9.076337794256804e-06, "loss": 2.097, "step": 9400 }, { "epoch": 0.018152758553055204, "eval_loss": 2.13842511177063, "eval_runtime": 451.2721, "eval_samples_per_second": 4.436, "eval_steps_per_second": 1.48, "step": 9400 }, { "epoch": 0.01834587300574728, "grad_norm": 1.7713780403137207, "learning_rate": 9.172894579302089e-06, "loss": 2.0898, "step": 9500 }, { "epoch": 0.01834587300574728, "eval_loss": 2.1376545429229736, "eval_runtime": 448.606, "eval_samples_per_second": 4.463, "eval_steps_per_second": 1.489, "step": 9500 }, { "epoch": 0.018538987458439356, "grad_norm": 2.3007733821868896, "learning_rate": 9.269451364347373e-06, "loss": 2.093, "step": 9600 }, { "epoch": 0.018538987458439356, "eval_loss": 2.1340465545654297, "eval_runtime": 446.1625, "eval_samples_per_second": 4.487, "eval_steps_per_second": 1.497, "step": 9600 }, { "epoch": 0.01873210191113143, "grad_norm": 1.4584991931915283, "learning_rate": 9.366008149392658e-06, "loss": 2.0982, "step": 9700 }, { "epoch": 0.01873210191113143, "eval_loss": 2.1380152702331543, "eval_runtime": 448.4165, "eval_samples_per_second": 4.465, "eval_steps_per_second": 1.49, "step": 9700 }, { "epoch": 0.018925216363823508, "grad_norm": 1.7227883338928223, "learning_rate": 9.462564934437943e-06, "loss": 2.0887, "step": 9800 }, { "epoch": 0.018925216363823508, "eval_loss": 2.128800868988037, "eval_runtime": 448.7222, "eval_samples_per_second": 4.462, "eval_steps_per_second": 1.489, "step": 9800 }, { "epoch": 0.019118330816515587, "grad_norm": 7.744117736816406, "learning_rate": 9.559121719483228e-06, "loss": 2.0938, "step": 9900 }, { "epoch": 0.019118330816515587, "eval_loss": 2.1388182640075684, "eval_runtime": 449.3968, "eval_samples_per_second": 4.455, "eval_steps_per_second": 1.486, "step": 9900 }, { "epoch": 0.019311445269207663, "grad_norm": 1.3925788402557373, "learning_rate": 9.655678504528514e-06, "loss": 2.0923, "step": 10000 }, { "epoch": 0.019311445269207663, "eval_loss": 2.127917528152466, "eval_runtime": 474.9682, "eval_samples_per_second": 4.215, "eval_steps_per_second": 1.406, "step": 10000 }, { "epoch": 0.01950455972189974, "grad_norm": 1.4508131742477417, "learning_rate": 9.752235289573799e-06, "loss": 2.1154, "step": 10100 }, { "epoch": 0.01950455972189974, "eval_loss": 2.132688045501709, "eval_runtime": 747.3867, "eval_samples_per_second": 2.679, "eval_steps_per_second": 0.894, "step": 10100 }, { "epoch": 0.019697674174591815, "grad_norm": 1.6229538917541504, "learning_rate": 9.848792074619084e-06, "loss": 2.0896, "step": 10200 }, { "epoch": 0.019697674174591815, "eval_loss": 2.1259098052978516, "eval_runtime": 746.7156, "eval_samples_per_second": 2.681, "eval_steps_per_second": 0.895, "step": 10200 }, { "epoch": 0.01989078862728389, "grad_norm": 1.4115490913391113, "learning_rate": 9.94534885966437e-06, "loss": 2.0825, "step": 10300 }, { "epoch": 0.01989078862728389, "eval_loss": 2.119873523712158, "eval_runtime": 746.4042, "eval_samples_per_second": 2.682, "eval_steps_per_second": 0.895, "step": 10300 }, { "epoch": 0.02008390307997597, "grad_norm": 1.349443793296814, "learning_rate": 1.0041905644709654e-05, "loss": 2.0791, "step": 10400 }, { "epoch": 0.02008390307997597, "eval_loss": 2.1162357330322266, "eval_runtime": 745.7602, "eval_samples_per_second": 2.685, "eval_steps_per_second": 0.896, "step": 10400 }, { "epoch": 0.020277017532668046, "grad_norm": 1.5818018913269043, "learning_rate": 1.013846242975494e-05, "loss": 2.0742, "step": 10500 }, { "epoch": 0.020277017532668046, "eval_loss": 2.1109495162963867, "eval_runtime": 751.1179, "eval_samples_per_second": 2.665, "eval_steps_per_second": 0.889, "step": 10500 }, { "epoch": 0.020470131985360122, "grad_norm": 1.2264171838760376, "learning_rate": 1.0235019214800225e-05, "loss": 2.069, "step": 10600 }, { "epoch": 0.020470131985360122, "eval_loss": 2.108383893966675, "eval_runtime": 745.7151, "eval_samples_per_second": 2.685, "eval_steps_per_second": 0.896, "step": 10600 }, { "epoch": 0.0206632464380522, "grad_norm": 1.3637938499450684, "learning_rate": 1.033157599984551e-05, "loss": 2.0665, "step": 10700 }, { "epoch": 0.0206632464380522, "eval_loss": 2.1054770946502686, "eval_runtime": 747.0672, "eval_samples_per_second": 2.68, "eval_steps_per_second": 0.894, "step": 10700 }, { "epoch": 0.020856360890744274, "grad_norm": 1.5810554027557373, "learning_rate": 1.0428132784890794e-05, "loss": 2.0599, "step": 10800 }, { "epoch": 0.020856360890744274, "eval_loss": 2.106874704360962, "eval_runtime": 749.8957, "eval_samples_per_second": 2.67, "eval_steps_per_second": 0.891, "step": 10800 }, { "epoch": 0.02104947534343635, "grad_norm": 1.1894757747650146, "learning_rate": 1.052468956993608e-05, "loss": 2.0616, "step": 10900 }, { "epoch": 0.02104947534343635, "eval_loss": 2.0996124744415283, "eval_runtime": 747.4095, "eval_samples_per_second": 2.679, "eval_steps_per_second": 0.894, "step": 10900 }, { "epoch": 0.02124258979612843, "grad_norm": 1.0796828269958496, "learning_rate": 1.0621246354981366e-05, "loss": 2.0657, "step": 11000 }, { "epoch": 0.02124258979612843, "eval_loss": 2.098177671432495, "eval_runtime": 748.3753, "eval_samples_per_second": 2.675, "eval_steps_per_second": 0.893, "step": 11000 }, { "epoch": 0.021435704248820506, "grad_norm": 1.5541911125183105, "learning_rate": 1.071780314002665e-05, "loss": 2.0451, "step": 11100 }, { "epoch": 0.021435704248820506, "eval_loss": 2.0963504314422607, "eval_runtime": 445.3814, "eval_samples_per_second": 4.495, "eval_steps_per_second": 1.5, "step": 11100 }, { "epoch": 0.02162881870151258, "grad_norm": 1.224878191947937, "learning_rate": 1.0814359925071936e-05, "loss": 2.0636, "step": 11200 }, { "epoch": 0.02162881870151258, "eval_loss": 2.0935559272766113, "eval_runtime": 446.9887, "eval_samples_per_second": 4.479, "eval_steps_per_second": 1.494, "step": 11200 }, { "epoch": 0.021821933154204658, "grad_norm": 1.1165440082550049, "learning_rate": 1.0910916710117222e-05, "loss": 2.0675, "step": 11300 }, { "epoch": 0.021821933154204658, "eval_loss": 2.096463203430176, "eval_runtime": 442.2835, "eval_samples_per_second": 4.527, "eval_steps_per_second": 1.51, "step": 11300 }, { "epoch": 0.022015047606896734, "grad_norm": 1.2041581869125366, "learning_rate": 1.1007473495162505e-05, "loss": 2.0488, "step": 11400 }, { "epoch": 0.022015047606896734, "eval_loss": 2.092092990875244, "eval_runtime": 441.9553, "eval_samples_per_second": 4.53, "eval_steps_per_second": 1.511, "step": 11400 }, { "epoch": 0.022208162059588813, "grad_norm": 1.3523286581039429, "learning_rate": 1.110403028020779e-05, "loss": 2.05, "step": 11500 }, { "epoch": 0.022208162059588813, "eval_loss": 2.092932939529419, "eval_runtime": 445.7387, "eval_samples_per_second": 4.491, "eval_steps_per_second": 1.499, "step": 11500 }, { "epoch": 0.02240127651228089, "grad_norm": 1.2852915525436401, "learning_rate": 1.1200587065253076e-05, "loss": 2.0451, "step": 11600 }, { "epoch": 0.02240127651228089, "eval_loss": 2.08709716796875, "eval_runtime": 442.9466, "eval_samples_per_second": 4.52, "eval_steps_per_second": 1.508, "step": 11600 }, { "epoch": 0.022594390964972965, "grad_norm": 1.2034499645233154, "learning_rate": 1.129714385029836e-05, "loss": 2.05, "step": 11700 }, { "epoch": 0.022594390964972965, "eval_loss": 2.087824821472168, "eval_runtime": 441.7707, "eval_samples_per_second": 4.532, "eval_steps_per_second": 1.512, "step": 11700 }, { "epoch": 0.02278750541766504, "grad_norm": 4.378060340881348, "learning_rate": 1.1393700635343646e-05, "loss": 2.0418, "step": 11800 }, { "epoch": 0.02278750541766504, "eval_loss": 2.086357593536377, "eval_runtime": 443.3936, "eval_samples_per_second": 4.515, "eval_steps_per_second": 1.507, "step": 11800 }, { "epoch": 0.022980619870357117, "grad_norm": 1.2029449939727783, "learning_rate": 1.1490257420388931e-05, "loss": 2.0516, "step": 11900 }, { "epoch": 0.022980619870357117, "eval_loss": 2.084092140197754, "eval_runtime": 443.8581, "eval_samples_per_second": 4.51, "eval_steps_per_second": 1.505, "step": 11900 }, { "epoch": 0.023173734323049196, "grad_norm": 1.160551905632019, "learning_rate": 1.1586814205434217e-05, "loss": 2.0531, "step": 12000 }, { "epoch": 0.023173734323049196, "eval_loss": 2.0821826457977295, "eval_runtime": 450.152, "eval_samples_per_second": 4.447, "eval_steps_per_second": 1.484, "step": 12000 }, { "epoch": 0.023366848775741272, "grad_norm": 1.156447410583496, "learning_rate": 1.1683370990479502e-05, "loss": 2.0445, "step": 12100 }, { "epoch": 0.023366848775741272, "eval_loss": 2.079378366470337, "eval_runtime": 442.5582, "eval_samples_per_second": 4.524, "eval_steps_per_second": 1.509, "step": 12100 }, { "epoch": 0.02355996322843335, "grad_norm": 1.1055281162261963, "learning_rate": 1.1779927775524787e-05, "loss": 2.0482, "step": 12200 }, { "epoch": 0.02355996322843335, "eval_loss": 2.075855016708374, "eval_runtime": 442.7884, "eval_samples_per_second": 4.521, "eval_steps_per_second": 1.509, "step": 12200 }, { "epoch": 0.023753077681125424, "grad_norm": 1.0532505512237549, "learning_rate": 1.187648456057007e-05, "loss": 2.0446, "step": 12300 }, { "epoch": 0.023753077681125424, "eval_loss": 2.0749075412750244, "eval_runtime": 443.7026, "eval_samples_per_second": 4.512, "eval_steps_per_second": 1.506, "step": 12300 }, { "epoch": 0.0239461921338175, "grad_norm": 0.9835991859436035, "learning_rate": 1.1973041345615357e-05, "loss": 2.0393, "step": 12400 }, { "epoch": 0.0239461921338175, "eval_loss": 2.072516918182373, "eval_runtime": 442.5377, "eval_samples_per_second": 4.524, "eval_steps_per_second": 1.509, "step": 12400 }, { "epoch": 0.024139306586509576, "grad_norm": 1.1653251647949219, "learning_rate": 1.2069598130660643e-05, "loss": 2.0376, "step": 12500 }, { "epoch": 0.024139306586509576, "eval_loss": 2.0766026973724365, "eval_runtime": 442.5122, "eval_samples_per_second": 4.524, "eval_steps_per_second": 1.51, "step": 12500 }, { "epoch": 0.024332421039201656, "grad_norm": 1.0789021253585815, "learning_rate": 1.2166154915705926e-05, "loss": 2.0355, "step": 12600 }, { "epoch": 0.024332421039201656, "eval_loss": 2.0720646381378174, "eval_runtime": 440.799, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.515, "step": 12600 }, { "epoch": 0.02452553549189373, "grad_norm": 1.171810507774353, "learning_rate": 1.2262711700751213e-05, "loss": 2.0397, "step": 12700 }, { "epoch": 0.02452553549189373, "eval_loss": 2.0727598667144775, "eval_runtime": 442.7395, "eval_samples_per_second": 4.522, "eval_steps_per_second": 1.509, "step": 12700 }, { "epoch": 0.024718649944585808, "grad_norm": 1.255620002746582, "learning_rate": 1.2359268485796498e-05, "loss": 2.0305, "step": 12800 }, { "epoch": 0.024718649944585808, "eval_loss": 2.0683271884918213, "eval_runtime": 444.9818, "eval_samples_per_second": 4.499, "eval_steps_per_second": 1.501, "step": 12800 }, { "epoch": 0.024911764397277884, "grad_norm": 1.0595488548278809, "learning_rate": 1.2455825270841782e-05, "loss": 2.0362, "step": 12900 }, { "epoch": 0.024911764397277884, "eval_loss": 2.0663349628448486, "eval_runtime": 496.75, "eval_samples_per_second": 4.03, "eval_steps_per_second": 1.345, "step": 12900 }, { "epoch": 0.02510487884996996, "grad_norm": 1.1099809408187866, "learning_rate": 1.2552382055887069e-05, "loss": 2.0466, "step": 13000 }, { "epoch": 0.02510487884996996, "eval_loss": 2.064277410507202, "eval_runtime": 445.7251, "eval_samples_per_second": 4.492, "eval_steps_per_second": 1.499, "step": 13000 }, { "epoch": 0.02529799330266204, "grad_norm": 1.263329029083252, "learning_rate": 1.2648938840932354e-05, "loss": 2.0265, "step": 13100 }, { "epoch": 0.02529799330266204, "eval_loss": 2.062872886657715, "eval_runtime": 446.6164, "eval_samples_per_second": 4.483, "eval_steps_per_second": 1.496, "step": 13100 }, { "epoch": 0.025491107755354115, "grad_norm": 1.2612566947937012, "learning_rate": 1.2745495625977638e-05, "loss": 2.0343, "step": 13200 }, { "epoch": 0.025491107755354115, "eval_loss": 2.061845541000366, "eval_runtime": 444.0257, "eval_samples_per_second": 4.509, "eval_steps_per_second": 1.504, "step": 13200 }, { "epoch": 0.02568422220804619, "grad_norm": 1.1635104417800903, "learning_rate": 1.2842052411022923e-05, "loss": 2.0218, "step": 13300 }, { "epoch": 0.02568422220804619, "eval_loss": 2.0589962005615234, "eval_runtime": 447.3235, "eval_samples_per_second": 4.476, "eval_steps_per_second": 1.493, "step": 13300 }, { "epoch": 0.025877336660738267, "grad_norm": 1.0829100608825684, "learning_rate": 1.2938609196068208e-05, "loss": 2.022, "step": 13400 }, { "epoch": 0.025877336660738267, "eval_loss": 2.058737277984619, "eval_runtime": 441.9917, "eval_samples_per_second": 4.529, "eval_steps_per_second": 1.511, "step": 13400 }, { "epoch": 0.026070451113430343, "grad_norm": 0.9413105249404907, "learning_rate": 1.3035165981113492e-05, "loss": 2.0146, "step": 13500 }, { "epoch": 0.026070451113430343, "eval_loss": 2.0579559803009033, "eval_runtime": 454.9936, "eval_samples_per_second": 4.4, "eval_steps_per_second": 1.468, "step": 13500 }, { "epoch": 0.026263565566122422, "grad_norm": 0.9331383109092712, "learning_rate": 1.313172276615878e-05, "loss": 2.0266, "step": 13600 }, { "epoch": 0.026263565566122422, "eval_loss": 2.0555694103240967, "eval_runtime": 449.5577, "eval_samples_per_second": 4.453, "eval_steps_per_second": 1.486, "step": 13600 }, { "epoch": 0.0264566800188145, "grad_norm": 1.066735029220581, "learning_rate": 1.3228279551204064e-05, "loss": 2.0291, "step": 13700 }, { "epoch": 0.0264566800188145, "eval_loss": 2.054654598236084, "eval_runtime": 448.1523, "eval_samples_per_second": 4.467, "eval_steps_per_second": 1.491, "step": 13700 }, { "epoch": 0.026649794471506574, "grad_norm": 0.9554963111877441, "learning_rate": 1.3324836336249349e-05, "loss": 2.0108, "step": 13800 }, { "epoch": 0.026649794471506574, "eval_loss": 2.0530266761779785, "eval_runtime": 447.7752, "eval_samples_per_second": 4.471, "eval_steps_per_second": 1.492, "step": 13800 }, { "epoch": 0.02684290892419865, "grad_norm": 0.9701889753341675, "learning_rate": 1.3421393121294634e-05, "loss": 2.0306, "step": 13900 }, { "epoch": 0.02684290892419865, "eval_loss": 2.0509798526763916, "eval_runtime": 448.7204, "eval_samples_per_second": 4.462, "eval_steps_per_second": 1.489, "step": 13900 }, { "epoch": 0.027036023376890726, "grad_norm": 0.9781773090362549, "learning_rate": 1.351794990633992e-05, "loss": 2.0182, "step": 14000 }, { "epoch": 0.027036023376890726, "eval_loss": 2.0510973930358887, "eval_runtime": 449.0716, "eval_samples_per_second": 4.458, "eval_steps_per_second": 1.488, "step": 14000 }, { "epoch": 0.027229137829582802, "grad_norm": 0.9371834397315979, "learning_rate": 1.3614506691385203e-05, "loss": 2.0255, "step": 14100 }, { "epoch": 0.027229137829582802, "eval_loss": 2.048981189727783, "eval_runtime": 448.6448, "eval_samples_per_second": 4.462, "eval_steps_per_second": 1.489, "step": 14100 }, { "epoch": 0.02742225228227488, "grad_norm": 0.8181398510932922, "learning_rate": 1.3711063476430488e-05, "loss": 2.0152, "step": 14200 }, { "epoch": 0.02742225228227488, "eval_loss": 2.047391176223755, "eval_runtime": 442.5828, "eval_samples_per_second": 4.523, "eval_steps_per_second": 1.509, "step": 14200 }, { "epoch": 0.027615366734966958, "grad_norm": 0.9860553741455078, "learning_rate": 1.3807620261475774e-05, "loss": 2.0272, "step": 14300 }, { "epoch": 0.027615366734966958, "eval_loss": 2.047621250152588, "eval_runtime": 447.4945, "eval_samples_per_second": 4.474, "eval_steps_per_second": 1.493, "step": 14300 }, { "epoch": 0.027808481187659034, "grad_norm": 0.8708497881889343, "learning_rate": 1.390417704652106e-05, "loss": 2.0148, "step": 14400 }, { "epoch": 0.027808481187659034, "eval_loss": 2.0451223850250244, "eval_runtime": 442.4977, "eval_samples_per_second": 4.524, "eval_steps_per_second": 1.51, "step": 14400 }, { "epoch": 0.02800159564035111, "grad_norm": 0.7860450744628906, "learning_rate": 1.4000733831566346e-05, "loss": 2.0203, "step": 14500 }, { "epoch": 0.02800159564035111, "eval_loss": 2.0454912185668945, "eval_runtime": 446.3855, "eval_samples_per_second": 4.485, "eval_steps_per_second": 1.496, "step": 14500 }, { "epoch": 0.028194710093043186, "grad_norm": 0.9461175203323364, "learning_rate": 1.4097290616611631e-05, "loss": 2.0018, "step": 14600 }, { "epoch": 0.028194710093043186, "eval_loss": 2.0433313846588135, "eval_runtime": 444.3827, "eval_samples_per_second": 4.505, "eval_steps_per_second": 1.503, "step": 14600 }, { "epoch": 0.028387824545735265, "grad_norm": 0.9540334939956665, "learning_rate": 1.4193847401656915e-05, "loss": 2.0076, "step": 14700 }, { "epoch": 0.028387824545735265, "eval_loss": 2.0425238609313965, "eval_runtime": 446.3018, "eval_samples_per_second": 4.486, "eval_steps_per_second": 1.497, "step": 14700 }, { "epoch": 0.02858093899842734, "grad_norm": 0.8437981605529785, "learning_rate": 1.42904041867022e-05, "loss": 2.0098, "step": 14800 }, { "epoch": 0.02858093899842734, "eval_loss": 2.0422203540802, "eval_runtime": 442.7055, "eval_samples_per_second": 4.522, "eval_steps_per_second": 1.509, "step": 14800 }, { "epoch": 0.028774053451119417, "grad_norm": 1.0205272436141968, "learning_rate": 1.4386960971747485e-05, "loss": 2.0116, "step": 14900 }, { "epoch": 0.028774053451119417, "eval_loss": 2.040599822998047, "eval_runtime": 444.0043, "eval_samples_per_second": 4.509, "eval_steps_per_second": 1.504, "step": 14900 }, { "epoch": 0.028967167903811493, "grad_norm": 0.9906435608863831, "learning_rate": 1.4483517756792769e-05, "loss": 2.0212, "step": 15000 }, { "epoch": 0.028967167903811493, "eval_loss": 2.0395123958587646, "eval_runtime": 448.0337, "eval_samples_per_second": 4.468, "eval_steps_per_second": 1.491, "step": 15000 }, { "epoch": 0.02916028235650357, "grad_norm": 1.0708587169647217, "learning_rate": 1.4580074541838057e-05, "loss": 2.0119, "step": 15100 }, { "epoch": 0.02916028235650357, "eval_loss": 2.047750473022461, "eval_runtime": 443.2078, "eval_samples_per_second": 4.517, "eval_steps_per_second": 1.507, "step": 15100 }, { "epoch": 0.02935339680919565, "grad_norm": 1.0378001928329468, "learning_rate": 1.4676631326883342e-05, "loss": 2.0154, "step": 15200 }, { "epoch": 0.02935339680919565, "eval_loss": 2.0439839363098145, "eval_runtime": 441.3501, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.514, "step": 15200 }, { "epoch": 0.029546511261887724, "grad_norm": 0.8756581544876099, "learning_rate": 1.4773188111928626e-05, "loss": 1.9966, "step": 15300 }, { "epoch": 0.029546511261887724, "eval_loss": 2.04148530960083, "eval_runtime": 444.4494, "eval_samples_per_second": 4.504, "eval_steps_per_second": 1.503, "step": 15300 }, { "epoch": 0.0297396257145798, "grad_norm": 0.9758510589599609, "learning_rate": 1.4869744896973911e-05, "loss": 2.0051, "step": 15400 }, { "epoch": 0.0297396257145798, "eval_loss": 2.0388731956481934, "eval_runtime": 447.5472, "eval_samples_per_second": 4.473, "eval_steps_per_second": 1.493, "step": 15400 }, { "epoch": 0.029932740167271876, "grad_norm": 0.8933822512626648, "learning_rate": 1.4966301682019196e-05, "loss": 2.004, "step": 15500 }, { "epoch": 0.029932740167271876, "eval_loss": 2.0385355949401855, "eval_runtime": 448.7984, "eval_samples_per_second": 4.461, "eval_steps_per_second": 1.488, "step": 15500 }, { "epoch": 0.030125854619963952, "grad_norm": 1.0588964223861694, "learning_rate": 1.506285846706448e-05, "loss": 1.995, "step": 15600 }, { "epoch": 0.030125854619963952, "eval_loss": 2.0367350578308105, "eval_runtime": 446.8061, "eval_samples_per_second": 4.481, "eval_steps_per_second": 1.495, "step": 15600 } ], "logging_steps": 100, "max_steps": 517827, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.108193260524032e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }