{ "best_metric": 0.002703184960409999, "best_model_checkpoint": "./results/checkpoint-8540", "epoch": 1.0, "eval_steps": 10, "global_step": 9386, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001065416577881952, "grad_norm": 2.598374366760254, "learning_rate": 1.9978691668442363e-05, "loss": 0.6478, "step": 10 }, { "epoch": 0.001065416577881952, "eval_loss": 0.5497280955314636, "eval_runtime": 34.7845, "eval_samples_per_second": 4317.273, "eval_steps_per_second": 67.473, "step": 10 }, { "epoch": 0.002130833155763904, "grad_norm": 2.8029165267944336, "learning_rate": 1.9957383336884725e-05, "loss": 0.4639, "step": 20 }, { "epoch": 0.002130833155763904, "eval_loss": 0.3493800461292267, "eval_runtime": 34.8493, "eval_samples_per_second": 4309.239, "eval_steps_per_second": 67.347, "step": 20 }, { "epoch": 0.0031962497336458554, "grad_norm": 1.549603819847107, "learning_rate": 1.9936075005327084e-05, "loss": 0.2637, "step": 30 }, { "epoch": 0.0031962497336458554, "eval_loss": 0.17470526695251465, "eval_runtime": 35.0922, "eval_samples_per_second": 4279.407, "eval_steps_per_second": 66.881, "step": 30 }, { "epoch": 0.004261666311527808, "grad_norm": 0.5110528469085693, "learning_rate": 1.9914766673769446e-05, "loss": 0.1509, "step": 40 }, { "epoch": 0.004261666311527808, "eval_loss": 0.10934165120124817, "eval_runtime": 34.8506, "eval_samples_per_second": 4309.077, "eval_steps_per_second": 67.345, "step": 40 }, { "epoch": 0.005327082889409759, "grad_norm": 0.5021756887435913, "learning_rate": 1.9893458342211807e-05, "loss": 0.1135, "step": 50 }, { "epoch": 0.005327082889409759, "eval_loss": 0.08590664714574814, "eval_runtime": 34.8851, "eval_samples_per_second": 4304.813, "eval_steps_per_second": 67.278, "step": 50 }, { "epoch": 0.006392499467291711, "grad_norm": 0.8349336385726929, "learning_rate": 1.987215001065417e-05, "loss": 0.1037, "step": 60 }, { "epoch": 0.006392499467291711, "eval_loss": 0.07590685039758682, "eval_runtime": 34.9244, "eval_samples_per_second": 4299.977, "eval_steps_per_second": 67.202, "step": 60 }, { "epoch": 0.007457916045173663, "grad_norm": 0.5514059066772461, "learning_rate": 1.9850841679096528e-05, "loss": 0.0652, "step": 70 }, { "epoch": 0.007457916045173663, "eval_loss": 0.06897564232349396, "eval_runtime": 34.8575, "eval_samples_per_second": 4308.227, "eval_steps_per_second": 67.331, "step": 70 }, { "epoch": 0.008523332623055616, "grad_norm": 0.4209994673728943, "learning_rate": 1.982953334753889e-05, "loss": 0.0596, "step": 80 }, { "epoch": 0.008523332623055616, "eval_loss": 0.060382526367902756, "eval_runtime": 34.9303, "eval_samples_per_second": 4299.245, "eval_steps_per_second": 67.191, "step": 80 }, { "epoch": 0.009588749200937566, "grad_norm": 0.4787921905517578, "learning_rate": 1.980822501598125e-05, "loss": 0.0729, "step": 90 }, { "epoch": 0.009588749200937566, "eval_loss": 0.04821014031767845, "eval_runtime": 34.9624, "eval_samples_per_second": 4295.297, "eval_steps_per_second": 67.129, "step": 90 }, { "epoch": 0.010654165778819518, "grad_norm": 0.5219190120697021, "learning_rate": 1.9786916684423613e-05, "loss": 0.0312, "step": 100 }, { "epoch": 0.010654165778819518, "eval_loss": 0.0431891568005085, "eval_runtime": 34.9505, "eval_samples_per_second": 4296.761, "eval_steps_per_second": 67.152, "step": 100 }, { "epoch": 0.01171958235670147, "grad_norm": 1.1521271467208862, "learning_rate": 1.976560835286597e-05, "loss": 0.037, "step": 110 }, { "epoch": 0.01171958235670147, "eval_loss": 0.03519212082028389, "eval_runtime": 34.9299, "eval_samples_per_second": 4299.301, "eval_steps_per_second": 67.192, "step": 110 }, { "epoch": 0.012784998934583422, "grad_norm": 0.20447871088981628, "learning_rate": 1.9744300021308334e-05, "loss": 0.0279, "step": 120 }, { "epoch": 0.012784998934583422, "eval_loss": 0.02899288199841976, "eval_runtime": 34.9533, "eval_samples_per_second": 4296.421, "eval_steps_per_second": 67.147, "step": 120 }, { "epoch": 0.013850415512465374, "grad_norm": 0.18650375306606293, "learning_rate": 1.9722991689750695e-05, "loss": 0.0215, "step": 130 }, { "epoch": 0.013850415512465374, "eval_loss": 0.025864260271191597, "eval_runtime": 35.0071, "eval_samples_per_second": 4289.821, "eval_steps_per_second": 67.044, "step": 130 }, { "epoch": 0.014915832090347326, "grad_norm": 0.32700005173683167, "learning_rate": 1.9701683358193057e-05, "loss": 0.0259, "step": 140 }, { "epoch": 0.014915832090347326, "eval_loss": 0.0220870953053236, "eval_runtime": 34.9891, "eval_samples_per_second": 4292.02, "eval_steps_per_second": 67.078, "step": 140 }, { "epoch": 0.015981248668229277, "grad_norm": 0.601952850818634, "learning_rate": 1.9680375026635416e-05, "loss": 0.0227, "step": 150 }, { "epoch": 0.015981248668229277, "eval_loss": 0.01984524168074131, "eval_runtime": 34.9806, "eval_samples_per_second": 4293.068, "eval_steps_per_second": 67.094, "step": 150 }, { "epoch": 0.01704666524611123, "grad_norm": 0.5832983255386353, "learning_rate": 1.9659066695077777e-05, "loss": 0.0129, "step": 160 }, { "epoch": 0.01704666524611123, "eval_loss": 0.018882030621170998, "eval_runtime": 34.9919, "eval_samples_per_second": 4291.685, "eval_steps_per_second": 67.073, "step": 160 }, { "epoch": 0.01811208182399318, "grad_norm": 0.5878477096557617, "learning_rate": 1.963775836352014e-05, "loss": 0.0142, "step": 170 }, { "epoch": 0.01811208182399318, "eval_loss": 0.01820511370897293, "eval_runtime": 34.9938, "eval_samples_per_second": 4291.445, "eval_steps_per_second": 67.069, "step": 170 }, { "epoch": 0.01917749840187513, "grad_norm": 0.11928685754537582, "learning_rate": 1.96164500319625e-05, "loss": 0.0112, "step": 180 }, { "epoch": 0.01917749840187513, "eval_loss": 0.015623863786458969, "eval_runtime": 34.9739, "eval_samples_per_second": 4293.893, "eval_steps_per_second": 67.107, "step": 180 }, { "epoch": 0.020242914979757085, "grad_norm": 0.16800431907176971, "learning_rate": 1.959514170040486e-05, "loss": 0.0204, "step": 190 }, { "epoch": 0.020242914979757085, "eval_loss": 0.014880867674946785, "eval_runtime": 34.9967, "eval_samples_per_second": 4291.092, "eval_steps_per_second": 67.063, "step": 190 }, { "epoch": 0.021308331557639035, "grad_norm": 0.18934427201747894, "learning_rate": 1.957383336884722e-05, "loss": 0.0217, "step": 200 }, { "epoch": 0.021308331557639035, "eval_loss": 0.015829147771000862, "eval_runtime": 35.0483, "eval_samples_per_second": 4284.768, "eval_steps_per_second": 66.965, "step": 200 }, { "epoch": 0.02237374813552099, "grad_norm": 0.13677361607551575, "learning_rate": 1.9552525037289583e-05, "loss": 0.0242, "step": 210 }, { "epoch": 0.02237374813552099, "eval_loss": 0.013792283833026886, "eval_runtime": 35.0652, "eval_samples_per_second": 4282.709, "eval_steps_per_second": 66.932, "step": 210 }, { "epoch": 0.02343916471340294, "grad_norm": 0.09131798893213272, "learning_rate": 1.9531216705731945e-05, "loss": 0.0149, "step": 220 }, { "epoch": 0.02343916471340294, "eval_loss": 0.012414357624948025, "eval_runtime": 35.0415, "eval_samples_per_second": 4285.601, "eval_steps_per_second": 66.978, "step": 220 }, { "epoch": 0.024504581291284893, "grad_norm": 1.9855362176895142, "learning_rate": 1.9509908374174304e-05, "loss": 0.0213, "step": 230 }, { "epoch": 0.024504581291284893, "eval_loss": 0.013272976502776146, "eval_runtime": 35.023, "eval_samples_per_second": 4287.865, "eval_steps_per_second": 67.013, "step": 230 }, { "epoch": 0.025569997869166843, "grad_norm": 0.9082479476928711, "learning_rate": 1.9488600042616665e-05, "loss": 0.0199, "step": 240 }, { "epoch": 0.025569997869166843, "eval_loss": 0.01087925210595131, "eval_runtime": 35.0338, "eval_samples_per_second": 4286.545, "eval_steps_per_second": 66.992, "step": 240 }, { "epoch": 0.026635414447048797, "grad_norm": 0.14924249053001404, "learning_rate": 1.9467291711059027e-05, "loss": 0.0169, "step": 250 }, { "epoch": 0.026635414447048797, "eval_loss": 0.012179452925920486, "eval_runtime": 35.0707, "eval_samples_per_second": 4282.032, "eval_steps_per_second": 66.922, "step": 250 }, { "epoch": 0.027700831024930747, "grad_norm": 0.08655331283807755, "learning_rate": 1.944598337950139e-05, "loss": 0.01, "step": 260 }, { "epoch": 0.027700831024930747, "eval_loss": 0.00992455706000328, "eval_runtime": 35.0605, "eval_samples_per_second": 4283.286, "eval_steps_per_second": 66.941, "step": 260 }, { "epoch": 0.0287662476028127, "grad_norm": 0.0683509036898613, "learning_rate": 1.9424675047943748e-05, "loss": 0.0064, "step": 270 }, { "epoch": 0.0287662476028127, "eval_loss": 0.011564863845705986, "eval_runtime": 35.1547, "eval_samples_per_second": 4271.81, "eval_steps_per_second": 66.762, "step": 270 }, { "epoch": 0.02983166418069465, "grad_norm": 0.07224300503730774, "learning_rate": 1.940336671638611e-05, "loss": 0.0053, "step": 280 }, { "epoch": 0.02983166418069465, "eval_loss": 0.011479129083454609, "eval_runtime": 35.0371, "eval_samples_per_second": 4286.14, "eval_steps_per_second": 66.986, "step": 280 }, { "epoch": 0.030897080758576605, "grad_norm": 0.07006030529737473, "learning_rate": 1.938205838482847e-05, "loss": 0.0096, "step": 290 }, { "epoch": 0.030897080758576605, "eval_loss": 0.009303942322731018, "eval_runtime": 35.042, "eval_samples_per_second": 4285.543, "eval_steps_per_second": 66.977, "step": 290 }, { "epoch": 0.031962497336458555, "grad_norm": 0.13934771716594696, "learning_rate": 1.9360750053270833e-05, "loss": 0.0038, "step": 300 }, { "epoch": 0.031962497336458555, "eval_loss": 0.009287681430578232, "eval_runtime": 35.0082, "eval_samples_per_second": 4289.686, "eval_steps_per_second": 67.042, "step": 300 }, { "epoch": 0.033027913914340505, "grad_norm": 0.13616764545440674, "learning_rate": 1.933944172171319e-05, "loss": 0.0062, "step": 310 }, { "epoch": 0.033027913914340505, "eval_loss": 0.009377561509609222, "eval_runtime": 34.9955, "eval_samples_per_second": 4291.232, "eval_steps_per_second": 67.066, "step": 310 }, { "epoch": 0.03409333049222246, "grad_norm": 0.05875202640891075, "learning_rate": 1.9318133390155553e-05, "loss": 0.0049, "step": 320 }, { "epoch": 0.03409333049222246, "eval_loss": 0.008939397521317005, "eval_runtime": 35.0043, "eval_samples_per_second": 4290.158, "eval_steps_per_second": 67.049, "step": 320 }, { "epoch": 0.03515874707010441, "grad_norm": 1.2608754634857178, "learning_rate": 1.9296825058597915e-05, "loss": 0.0047, "step": 330 }, { "epoch": 0.03515874707010441, "eval_loss": 0.00868003349751234, "eval_runtime": 35.0087, "eval_samples_per_second": 4289.62, "eval_steps_per_second": 67.04, "step": 330 }, { "epoch": 0.03622416364798636, "grad_norm": 0.045649394392967224, "learning_rate": 1.9275516727040277e-05, "loss": 0.0039, "step": 340 }, { "epoch": 0.03622416364798636, "eval_loss": 0.00883927196264267, "eval_runtime": 34.9953, "eval_samples_per_second": 4291.265, "eval_steps_per_second": 67.066, "step": 340 }, { "epoch": 0.03728958022586831, "grad_norm": 0.052812762558460236, "learning_rate": 1.9254208395482635e-05, "loss": 0.0029, "step": 350 }, { "epoch": 0.03728958022586831, "eval_loss": 0.008632567711174488, "eval_runtime": 35.0069, "eval_samples_per_second": 4289.834, "eval_steps_per_second": 67.044, "step": 350 }, { "epoch": 0.03835499680375026, "grad_norm": 0.36764615774154663, "learning_rate": 1.9232900063924994e-05, "loss": 0.0082, "step": 360 }, { "epoch": 0.03835499680375026, "eval_loss": 0.008467404171824455, "eval_runtime": 34.9954, "eval_samples_per_second": 4291.254, "eval_steps_per_second": 67.066, "step": 360 }, { "epoch": 0.03942041338163222, "grad_norm": 0.046024467796087265, "learning_rate": 1.9211591732367356e-05, "loss": 0.0091, "step": 370 }, { "epoch": 0.03942041338163222, "eval_loss": 0.008618910796940327, "eval_runtime": 34.9555, "eval_samples_per_second": 4296.153, "eval_steps_per_second": 67.143, "step": 370 }, { "epoch": 0.04048582995951417, "grad_norm": 2.4632503986358643, "learning_rate": 1.9190283400809718e-05, "loss": 0.0088, "step": 380 }, { "epoch": 0.04048582995951417, "eval_loss": 0.010347607545554638, "eval_runtime": 34.9756, "eval_samples_per_second": 4293.684, "eval_steps_per_second": 67.104, "step": 380 }, { "epoch": 0.04155124653739612, "grad_norm": 0.060661379247903824, "learning_rate": 1.916897506925208e-05, "loss": 0.0144, "step": 390 }, { "epoch": 0.04155124653739612, "eval_loss": 0.007851127535104752, "eval_runtime": 35.016, "eval_samples_per_second": 4288.73, "eval_steps_per_second": 67.027, "step": 390 }, { "epoch": 0.04261666311527807, "grad_norm": 0.0612405426800251, "learning_rate": 1.9147666737694438e-05, "loss": 0.0212, "step": 400 }, { "epoch": 0.04261666311527807, "eval_loss": 0.008376965299248695, "eval_runtime": 35.0394, "eval_samples_per_second": 4285.857, "eval_steps_per_second": 66.982, "step": 400 }, { "epoch": 0.04368207969316003, "grad_norm": 0.05073362961411476, "learning_rate": 1.91263584061368e-05, "loss": 0.008, "step": 410 }, { "epoch": 0.04368207969316003, "eval_loss": 0.008724682033061981, "eval_runtime": 35.0067, "eval_samples_per_second": 4289.87, "eval_steps_per_second": 67.044, "step": 410 }, { "epoch": 0.04474749627104198, "grad_norm": 0.06536891311407089, "learning_rate": 1.910505007457916e-05, "loss": 0.0055, "step": 420 }, { "epoch": 0.04474749627104198, "eval_loss": 0.007841785438358784, "eval_runtime": 35.0751, "eval_samples_per_second": 4281.496, "eval_steps_per_second": 66.914, "step": 420 }, { "epoch": 0.04581291284892393, "grad_norm": 0.1056961938738823, "learning_rate": 1.9083741743021523e-05, "loss": 0.0028, "step": 430 }, { "epoch": 0.04581291284892393, "eval_loss": 0.0076719019562006, "eval_runtime": 34.9762, "eval_samples_per_second": 4293.602, "eval_steps_per_second": 67.103, "step": 430 }, { "epoch": 0.04687832942680588, "grad_norm": 0.0451618917286396, "learning_rate": 1.9062433411463882e-05, "loss": 0.0133, "step": 440 }, { "epoch": 0.04687832942680588, "eval_loss": 0.007590805646032095, "eval_runtime": 35.0087, "eval_samples_per_second": 4289.616, "eval_steps_per_second": 67.04, "step": 440 }, { "epoch": 0.047943746004687836, "grad_norm": 1.1059614419937134, "learning_rate": 1.9041125079906244e-05, "loss": 0.0168, "step": 450 }, { "epoch": 0.047943746004687836, "eval_loss": 0.007430546451359987, "eval_runtime": 35.0188, "eval_samples_per_second": 4288.385, "eval_steps_per_second": 67.021, "step": 450 }, { "epoch": 0.049009162582569786, "grad_norm": 0.06677515804767609, "learning_rate": 1.9019816748348605e-05, "loss": 0.0069, "step": 460 }, { "epoch": 0.049009162582569786, "eval_loss": 0.008023254573345184, "eval_runtime": 34.9882, "eval_samples_per_second": 4292.131, "eval_steps_per_second": 67.08, "step": 460 }, { "epoch": 0.050074579160451736, "grad_norm": 0.04259790852665901, "learning_rate": 1.8998508416790967e-05, "loss": 0.0047, "step": 470 }, { "epoch": 0.050074579160451736, "eval_loss": 0.007901841774582863, "eval_runtime": 35.0652, "eval_samples_per_second": 4282.71, "eval_steps_per_second": 66.932, "step": 470 }, { "epoch": 0.051139995738333686, "grad_norm": 0.12410833686590195, "learning_rate": 1.8977200085233326e-05, "loss": 0.0066, "step": 480 }, { "epoch": 0.051139995738333686, "eval_loss": 0.00759873166680336, "eval_runtime": 35.0487, "eval_samples_per_second": 4284.728, "eval_steps_per_second": 66.964, "step": 480 }, { "epoch": 0.052205412316215644, "grad_norm": 0.08035538345575333, "learning_rate": 1.8955891753675688e-05, "loss": 0.0023, "step": 490 }, { "epoch": 0.052205412316215644, "eval_loss": 0.007343141362071037, "eval_runtime": 35.0739, "eval_samples_per_second": 4281.64, "eval_steps_per_second": 66.916, "step": 490 }, { "epoch": 0.053270828894097594, "grad_norm": 0.04620998725295067, "learning_rate": 1.893458342211805e-05, "loss": 0.0087, "step": 500 }, { "epoch": 0.053270828894097594, "eval_loss": 0.007286736276000738, "eval_runtime": 35.0515, "eval_samples_per_second": 4284.378, "eval_steps_per_second": 66.959, "step": 500 }, { "epoch": 0.054336245471979544, "grad_norm": 0.03123115375638008, "learning_rate": 1.891327509056041e-05, "loss": 0.0029, "step": 510 }, { "epoch": 0.054336245471979544, "eval_loss": 0.007131603546440601, "eval_runtime": 35.0687, "eval_samples_per_second": 4282.275, "eval_steps_per_second": 66.926, "step": 510 }, { "epoch": 0.055401662049861494, "grad_norm": 0.06445208191871643, "learning_rate": 1.889196675900277e-05, "loss": 0.0107, "step": 520 }, { "epoch": 0.055401662049861494, "eval_loss": 0.007308864034712315, "eval_runtime": 35.0344, "eval_samples_per_second": 4286.475, "eval_steps_per_second": 66.991, "step": 520 }, { "epoch": 0.056467078627743444, "grad_norm": 0.026168525218963623, "learning_rate": 1.887065842744513e-05, "loss": 0.0074, "step": 530 }, { "epoch": 0.056467078627743444, "eval_loss": 0.007467833813279867, "eval_runtime": 35.1394, "eval_samples_per_second": 4273.663, "eval_steps_per_second": 66.791, "step": 530 }, { "epoch": 0.0575324952056254, "grad_norm": 0.06097254157066345, "learning_rate": 1.8849350095887493e-05, "loss": 0.0079, "step": 540 }, { "epoch": 0.0575324952056254, "eval_loss": 0.006862245034426451, "eval_runtime": 35.0843, "eval_samples_per_second": 4280.379, "eval_steps_per_second": 66.896, "step": 540 }, { "epoch": 0.05859791178350735, "grad_norm": 1.4416311979293823, "learning_rate": 1.8828041764329855e-05, "loss": 0.003, "step": 550 }, { "epoch": 0.05859791178350735, "eval_loss": 0.006868099793791771, "eval_runtime": 35.0895, "eval_samples_per_second": 4279.74, "eval_steps_per_second": 66.886, "step": 550 }, { "epoch": 0.0596633283613893, "grad_norm": 0.02442428097128868, "learning_rate": 1.8806733432772214e-05, "loss": 0.007, "step": 560 }, { "epoch": 0.0596633283613893, "eval_loss": 0.0072341980412602425, "eval_runtime": 35.1045, "eval_samples_per_second": 4277.91, "eval_steps_per_second": 66.857, "step": 560 }, { "epoch": 0.06072874493927125, "grad_norm": 0.024394547566771507, "learning_rate": 1.8785425101214576e-05, "loss": 0.0202, "step": 570 }, { "epoch": 0.06072874493927125, "eval_loss": 0.009562548249959946, "eval_runtime": 35.0609, "eval_samples_per_second": 4283.231, "eval_steps_per_second": 66.941, "step": 570 }, { "epoch": 0.06179416151715321, "grad_norm": 0.14542266726493835, "learning_rate": 1.8764116769656937e-05, "loss": 0.0023, "step": 580 }, { "epoch": 0.06179416151715321, "eval_loss": 0.0071628945879638195, "eval_runtime": 35.1203, "eval_samples_per_second": 4275.993, "eval_steps_per_second": 66.828, "step": 580 }, { "epoch": 0.06285957809503516, "grad_norm": 0.027121223509311676, "learning_rate": 1.87428084380993e-05, "loss": 0.007, "step": 590 }, { "epoch": 0.06285957809503516, "eval_loss": 0.00703906686976552, "eval_runtime": 35.0628, "eval_samples_per_second": 4283.001, "eval_steps_per_second": 66.937, "step": 590 }, { "epoch": 0.06392499467291711, "grad_norm": 0.052346475422382355, "learning_rate": 1.8721500106541658e-05, "loss": 0.0104, "step": 600 }, { "epoch": 0.06392499467291711, "eval_loss": 0.006780738476663828, "eval_runtime": 35.0665, "eval_samples_per_second": 4282.547, "eval_steps_per_second": 66.93, "step": 600 }, { "epoch": 0.06499041125079906, "grad_norm": 0.026305731385946274, "learning_rate": 1.870019177498402e-05, "loss": 0.0021, "step": 610 }, { "epoch": 0.06499041125079906, "eval_loss": 0.006772972177714109, "eval_runtime": 35.0776, "eval_samples_per_second": 4281.194, "eval_steps_per_second": 66.909, "step": 610 }, { "epoch": 0.06605582782868101, "grad_norm": 0.029565811157226562, "learning_rate": 1.867888344342638e-05, "loss": 0.0033, "step": 620 }, { "epoch": 0.06605582782868101, "eval_loss": 0.00664109131321311, "eval_runtime": 35.0977, "eval_samples_per_second": 4278.741, "eval_steps_per_second": 66.87, "step": 620 }, { "epoch": 0.06712124440656296, "grad_norm": 0.020861292257905006, "learning_rate": 1.8657575111868743e-05, "loss": 0.0016, "step": 630 }, { "epoch": 0.06712124440656296, "eval_loss": 0.006744803860783577, "eval_runtime": 35.0927, "eval_samples_per_second": 4279.354, "eval_steps_per_second": 66.88, "step": 630 }, { "epoch": 0.06818666098444492, "grad_norm": 0.08481285721063614, "learning_rate": 1.86362667803111e-05, "loss": 0.0021, "step": 640 }, { "epoch": 0.06818666098444492, "eval_loss": 0.007654301356524229, "eval_runtime": 35.0712, "eval_samples_per_second": 4281.973, "eval_steps_per_second": 66.921, "step": 640 }, { "epoch": 0.06925207756232687, "grad_norm": 0.0536779910326004, "learning_rate": 1.8614958448753463e-05, "loss": 0.0182, "step": 650 }, { "epoch": 0.06925207756232687, "eval_loss": 0.00671360595151782, "eval_runtime": 35.107, "eval_samples_per_second": 4277.604, "eval_steps_per_second": 66.853, "step": 650 }, { "epoch": 0.07031749414020882, "grad_norm": 0.5193659067153931, "learning_rate": 1.8593650117195825e-05, "loss": 0.0069, "step": 660 }, { "epoch": 0.07031749414020882, "eval_loss": 0.00693098409101367, "eval_runtime": 35.1433, "eval_samples_per_second": 4273.191, "eval_steps_per_second": 66.784, "step": 660 }, { "epoch": 0.07138291071809078, "grad_norm": 0.020888999104499817, "learning_rate": 1.8572341785638187e-05, "loss": 0.0013, "step": 670 }, { "epoch": 0.07138291071809078, "eval_loss": 0.006469358690083027, "eval_runtime": 35.1021, "eval_samples_per_second": 4278.206, "eval_steps_per_second": 66.862, "step": 670 }, { "epoch": 0.07244832729597273, "grad_norm": 0.019426677376031876, "learning_rate": 1.8551033454080546e-05, "loss": 0.0014, "step": 680 }, { "epoch": 0.07244832729597273, "eval_loss": 0.006414386909455061, "eval_runtime": 35.1154, "eval_samples_per_second": 4276.585, "eval_steps_per_second": 66.837, "step": 680 }, { "epoch": 0.07351374387385468, "grad_norm": 0.019121970981359482, "learning_rate": 1.8529725122522907e-05, "loss": 0.0015, "step": 690 }, { "epoch": 0.07351374387385468, "eval_loss": 0.00648439209908247, "eval_runtime": 35.1, "eval_samples_per_second": 4278.46, "eval_steps_per_second": 66.866, "step": 690 }, { "epoch": 0.07457916045173663, "grad_norm": 0.02870786562561989, "learning_rate": 1.850841679096527e-05, "loss": 0.0161, "step": 700 }, { "epoch": 0.07457916045173663, "eval_loss": 0.006600781809538603, "eval_runtime": 35.1446, "eval_samples_per_second": 4273.032, "eval_steps_per_second": 66.781, "step": 700 }, { "epoch": 0.07564457702961858, "grad_norm": 0.048932578414678574, "learning_rate": 1.848710845940763e-05, "loss": 0.0033, "step": 710 }, { "epoch": 0.07564457702961858, "eval_loss": 0.00685811135917902, "eval_runtime": 35.0851, "eval_samples_per_second": 4280.274, "eval_steps_per_second": 66.894, "step": 710 }, { "epoch": 0.07670999360750053, "grad_norm": 0.020688414573669434, "learning_rate": 1.846580012784999e-05, "loss": 0.0022, "step": 720 }, { "epoch": 0.07670999360750053, "eval_loss": 0.006442280951887369, "eval_runtime": 35.0598, "eval_samples_per_second": 4283.366, "eval_steps_per_second": 66.943, "step": 720 }, { "epoch": 0.07777541018538249, "grad_norm": 0.02903449535369873, "learning_rate": 1.844449179629235e-05, "loss": 0.0167, "step": 730 }, { "epoch": 0.07777541018538249, "eval_loss": 0.0070753456093370914, "eval_runtime": 35.0516, "eval_samples_per_second": 4284.367, "eval_steps_per_second": 66.958, "step": 730 }, { "epoch": 0.07884082676326444, "grad_norm": 0.02733609639108181, "learning_rate": 1.8423183464734713e-05, "loss": 0.0014, "step": 740 }, { "epoch": 0.07884082676326444, "eval_loss": 0.007332003675401211, "eval_runtime": 35.1395, "eval_samples_per_second": 4273.647, "eval_steps_per_second": 66.791, "step": 740 }, { "epoch": 0.07990624334114639, "grad_norm": 0.2964134216308594, "learning_rate": 1.8401875133177075e-05, "loss": 0.0061, "step": 750 }, { "epoch": 0.07990624334114639, "eval_loss": 0.006313994061201811, "eval_runtime": 35.0697, "eval_samples_per_second": 4282.154, "eval_steps_per_second": 66.924, "step": 750 }, { "epoch": 0.08097165991902834, "grad_norm": 0.01741037145256996, "learning_rate": 1.8380566801619433e-05, "loss": 0.0012, "step": 760 }, { "epoch": 0.08097165991902834, "eval_loss": 0.006638936698436737, "eval_runtime": 35.0747, "eval_samples_per_second": 4281.544, "eval_steps_per_second": 66.914, "step": 760 }, { "epoch": 0.08203707649691029, "grad_norm": 0.016884565353393555, "learning_rate": 1.8359258470061795e-05, "loss": 0.0072, "step": 770 }, { "epoch": 0.08203707649691029, "eval_loss": 0.006715176161378622, "eval_runtime": 35.0656, "eval_samples_per_second": 4282.664, "eval_steps_per_second": 66.932, "step": 770 }, { "epoch": 0.08310249307479224, "grad_norm": 0.023544272407889366, "learning_rate": 1.8337950138504157e-05, "loss": 0.0115, "step": 780 }, { "epoch": 0.08310249307479224, "eval_loss": 0.006417686585336924, "eval_runtime": 35.0831, "eval_samples_per_second": 4280.519, "eval_steps_per_second": 66.898, "step": 780 }, { "epoch": 0.08416790965267419, "grad_norm": 0.03194332495331764, "learning_rate": 1.831664180694652e-05, "loss": 0.0023, "step": 790 }, { "epoch": 0.08416790965267419, "eval_loss": 0.00630133505910635, "eval_runtime": 35.0749, "eval_samples_per_second": 4281.519, "eval_steps_per_second": 66.914, "step": 790 }, { "epoch": 0.08523332623055614, "grad_norm": 0.017488490790128708, "learning_rate": 1.8295333475388877e-05, "loss": 0.0117, "step": 800 }, { "epoch": 0.08523332623055614, "eval_loss": 0.006475712638348341, "eval_runtime": 35.0689, "eval_samples_per_second": 4282.252, "eval_steps_per_second": 66.925, "step": 800 }, { "epoch": 0.0862987428084381, "grad_norm": 0.8892996311187744, "learning_rate": 1.827402514383124e-05, "loss": 0.0157, "step": 810 }, { "epoch": 0.0862987428084381, "eval_loss": 0.005997061729431152, "eval_runtime": 35.0766, "eval_samples_per_second": 4281.311, "eval_steps_per_second": 66.911, "step": 810 }, { "epoch": 0.08736415938632006, "grad_norm": 0.019312532618641853, "learning_rate": 1.82527168122736e-05, "loss": 0.0204, "step": 820 }, { "epoch": 0.08736415938632006, "eval_loss": 0.0064412918873131275, "eval_runtime": 35.0849, "eval_samples_per_second": 4280.301, "eval_steps_per_second": 66.895, "step": 820 }, { "epoch": 0.088429575964202, "grad_norm": 0.07955110818147659, "learning_rate": 1.8231408480715963e-05, "loss": 0.0084, "step": 830 }, { "epoch": 0.088429575964202, "eval_loss": 0.006166558247059584, "eval_runtime": 35.1204, "eval_samples_per_second": 4275.979, "eval_steps_per_second": 66.827, "step": 830 }, { "epoch": 0.08949499254208396, "grad_norm": 0.2547236680984497, "learning_rate": 1.821010014915832e-05, "loss": 0.0036, "step": 840 }, { "epoch": 0.08949499254208396, "eval_loss": 0.0059661865234375, "eval_runtime": 35.1305, "eval_samples_per_second": 4274.751, "eval_steps_per_second": 66.808, "step": 840 }, { "epoch": 0.0905604091199659, "grad_norm": 0.028440352529287338, "learning_rate": 1.8188791817600683e-05, "loss": 0.0026, "step": 850 }, { "epoch": 0.0905604091199659, "eval_loss": 0.0058583482168614864, "eval_runtime": 35.0777, "eval_samples_per_second": 4281.186, "eval_steps_per_second": 66.909, "step": 850 }, { "epoch": 0.09162582569784786, "grad_norm": 0.015186217613518238, "learning_rate": 1.8167483486043045e-05, "loss": 0.0053, "step": 860 }, { "epoch": 0.09162582569784786, "eval_loss": 0.0058418079279363155, "eval_runtime": 35.0918, "eval_samples_per_second": 4279.466, "eval_steps_per_second": 66.882, "step": 860 }, { "epoch": 0.09269124227572981, "grad_norm": 0.0264381542801857, "learning_rate": 1.8146175154485407e-05, "loss": 0.0071, "step": 870 }, { "epoch": 0.09269124227572981, "eval_loss": 0.0058640833012759686, "eval_runtime": 35.0547, "eval_samples_per_second": 4283.985, "eval_steps_per_second": 66.952, "step": 870 }, { "epoch": 0.09375665885361176, "grad_norm": 0.04548242315649986, "learning_rate": 1.8124866822927765e-05, "loss": 0.0047, "step": 880 }, { "epoch": 0.09375665885361176, "eval_loss": 0.005854357033967972, "eval_runtime": 35.0612, "eval_samples_per_second": 4283.196, "eval_steps_per_second": 66.94, "step": 880 }, { "epoch": 0.09482207543149371, "grad_norm": 0.05402829125523567, "learning_rate": 1.8103558491370127e-05, "loss": 0.0017, "step": 890 }, { "epoch": 0.09482207543149371, "eval_loss": 0.0061132400296628475, "eval_runtime": 35.0448, "eval_samples_per_second": 4285.206, "eval_steps_per_second": 66.972, "step": 890 }, { "epoch": 0.09588749200937567, "grad_norm": 0.23751606047153473, "learning_rate": 1.808225015981249e-05, "loss": 0.006, "step": 900 }, { "epoch": 0.09588749200937567, "eval_loss": 0.006291827652603388, "eval_runtime": 35.0656, "eval_samples_per_second": 4282.655, "eval_steps_per_second": 66.932, "step": 900 }, { "epoch": 0.09695290858725762, "grad_norm": 0.03594108670949936, "learning_rate": 1.806094182825485e-05, "loss": 0.0021, "step": 910 }, { "epoch": 0.09695290858725762, "eval_loss": 0.006167920306324959, "eval_runtime": 35.0452, "eval_samples_per_second": 4285.146, "eval_steps_per_second": 66.971, "step": 910 }, { "epoch": 0.09801832516513957, "grad_norm": 0.013563692569732666, "learning_rate": 1.803963349669721e-05, "loss": 0.0057, "step": 920 }, { "epoch": 0.09801832516513957, "eval_loss": 0.006407948210835457, "eval_runtime": 35.0591, "eval_samples_per_second": 4283.448, "eval_steps_per_second": 66.944, "step": 920 }, { "epoch": 0.09908374174302152, "grad_norm": 0.013167720288038254, "learning_rate": 1.801832516513957e-05, "loss": 0.0043, "step": 930 }, { "epoch": 0.09908374174302152, "eval_loss": 0.005963355768471956, "eval_runtime": 34.9944, "eval_samples_per_second": 4291.378, "eval_steps_per_second": 67.068, "step": 930 }, { "epoch": 0.10014915832090347, "grad_norm": 0.6629673838615417, "learning_rate": 1.7997016833581933e-05, "loss": 0.0263, "step": 940 }, { "epoch": 0.10014915832090347, "eval_loss": 0.00560146477073431, "eval_runtime": 35.028, "eval_samples_per_second": 4287.256, "eval_steps_per_second": 67.004, "step": 940 }, { "epoch": 0.10121457489878542, "grad_norm": 0.01741507463157177, "learning_rate": 1.7975708502024295e-05, "loss": 0.0075, "step": 950 }, { "epoch": 0.10121457489878542, "eval_loss": 0.006059127859771252, "eval_runtime": 35.0757, "eval_samples_per_second": 4281.429, "eval_steps_per_second": 66.912, "step": 950 }, { "epoch": 0.10227999147666737, "grad_norm": 0.11731712520122528, "learning_rate": 1.7954400170466653e-05, "loss": 0.0034, "step": 960 }, { "epoch": 0.10227999147666737, "eval_loss": 0.005581183824688196, "eval_runtime": 35.0641, "eval_samples_per_second": 4282.843, "eval_steps_per_second": 66.935, "step": 960 }, { "epoch": 0.10334540805454932, "grad_norm": 0.15285035967826843, "learning_rate": 1.7933091838909015e-05, "loss": 0.0138, "step": 970 }, { "epoch": 0.10334540805454932, "eval_loss": 0.005551150534301996, "eval_runtime": 35.0458, "eval_samples_per_second": 4285.083, "eval_steps_per_second": 66.97, "step": 970 }, { "epoch": 0.10441082463243129, "grad_norm": 0.0942121297121048, "learning_rate": 1.7911783507351377e-05, "loss": 0.0033, "step": 980 }, { "epoch": 0.10441082463243129, "eval_loss": 0.005906397942453623, "eval_runtime": 35.0753, "eval_samples_per_second": 4281.47, "eval_steps_per_second": 66.913, "step": 980 }, { "epoch": 0.10547624121031324, "grad_norm": 0.01376664824783802, "learning_rate": 1.789047517579374e-05, "loss": 0.0013, "step": 990 }, { "epoch": 0.10547624121031324, "eval_loss": 0.006121132522821426, "eval_runtime": 35.0395, "eval_samples_per_second": 4285.847, "eval_steps_per_second": 66.982, "step": 990 }, { "epoch": 0.10654165778819519, "grad_norm": 1.00563645362854, "learning_rate": 1.7869166844236097e-05, "loss": 0.005, "step": 1000 }, { "epoch": 0.10654165778819519, "eval_loss": 0.005701792426407337, "eval_runtime": 34.9988, "eval_samples_per_second": 4290.838, "eval_steps_per_second": 67.06, "step": 1000 }, { "epoch": 0.10760707436607714, "grad_norm": 0.012580779381096363, "learning_rate": 1.784785851267846e-05, "loss": 0.017, "step": 1010 }, { "epoch": 0.10760707436607714, "eval_loss": 0.006706910207867622, "eval_runtime": 35.1083, "eval_samples_per_second": 4277.451, "eval_steps_per_second": 66.85, "step": 1010 }, { "epoch": 0.10867249094395909, "grad_norm": 0.6065902709960938, "learning_rate": 1.782655018112082e-05, "loss": 0.0074, "step": 1020 }, { "epoch": 0.10867249094395909, "eval_loss": 0.005461184773594141, "eval_runtime": 35.2265, "eval_samples_per_second": 4263.103, "eval_steps_per_second": 66.626, "step": 1020 }, { "epoch": 0.10973790752184104, "grad_norm": 0.5651019215583801, "learning_rate": 1.780524184956318e-05, "loss": 0.0072, "step": 1030 }, { "epoch": 0.10973790752184104, "eval_loss": 0.005571197718381882, "eval_runtime": 35.0715, "eval_samples_per_second": 4281.944, "eval_steps_per_second": 66.921, "step": 1030 }, { "epoch": 0.11080332409972299, "grad_norm": 0.015482685528695583, "learning_rate": 1.778393351800554e-05, "loss": 0.0089, "step": 1040 }, { "epoch": 0.11080332409972299, "eval_loss": 0.008133814670145512, "eval_runtime": 35.2657, "eval_samples_per_second": 4258.363, "eval_steps_per_second": 66.552, "step": 1040 }, { "epoch": 0.11186874067760494, "grad_norm": 0.016248241066932678, "learning_rate": 1.7762625186447903e-05, "loss": 0.004, "step": 1050 }, { "epoch": 0.11186874067760494, "eval_loss": 0.00686853239312768, "eval_runtime": 35.1303, "eval_samples_per_second": 4274.77, "eval_steps_per_second": 66.808, "step": 1050 }, { "epoch": 0.11293415725548689, "grad_norm": 0.16063354909420013, "learning_rate": 1.774131685489026e-05, "loss": 0.018, "step": 1060 }, { "epoch": 0.11293415725548689, "eval_loss": 0.005687262862920761, "eval_runtime": 35.0941, "eval_samples_per_second": 4279.181, "eval_steps_per_second": 66.877, "step": 1060 }, { "epoch": 0.11399957383336885, "grad_norm": 0.012475158087909222, "learning_rate": 1.7720008523332623e-05, "loss": 0.0026, "step": 1070 }, { "epoch": 0.11399957383336885, "eval_loss": 0.0057092043571174145, "eval_runtime": 35.285, "eval_samples_per_second": 4256.028, "eval_steps_per_second": 66.515, "step": 1070 }, { "epoch": 0.1150649904112508, "grad_norm": 0.12960150837898254, "learning_rate": 1.7698700191774985e-05, "loss": 0.0081, "step": 1080 }, { "epoch": 0.1150649904112508, "eval_loss": 0.005676012486219406, "eval_runtime": 35.0659, "eval_samples_per_second": 4282.619, "eval_steps_per_second": 66.931, "step": 1080 }, { "epoch": 0.11613040698913275, "grad_norm": 0.015063290484249592, "learning_rate": 1.7677391860217347e-05, "loss": 0.0109, "step": 1090 }, { "epoch": 0.11613040698913275, "eval_loss": 0.005491800140589476, "eval_runtime": 35.1395, "eval_samples_per_second": 4273.656, "eval_steps_per_second": 66.791, "step": 1090 }, { "epoch": 0.1171958235670147, "grad_norm": 0.35197392106056213, "learning_rate": 1.7656083528659705e-05, "loss": 0.0015, "step": 1100 }, { "epoch": 0.1171958235670147, "eval_loss": 0.005604551173746586, "eval_runtime": 35.0678, "eval_samples_per_second": 4282.389, "eval_steps_per_second": 66.927, "step": 1100 }, { "epoch": 0.11826124014489665, "grad_norm": 0.37558114528656006, "learning_rate": 1.7634775197102067e-05, "loss": 0.0109, "step": 1110 }, { "epoch": 0.11826124014489665, "eval_loss": 0.005907374434173107, "eval_runtime": 35.1031, "eval_samples_per_second": 4278.078, "eval_steps_per_second": 66.86, "step": 1110 }, { "epoch": 0.1193266567227786, "grad_norm": 0.040187984704971313, "learning_rate": 1.761346686554443e-05, "loss": 0.0014, "step": 1120 }, { "epoch": 0.1193266567227786, "eval_loss": 0.0068566263653337955, "eval_runtime": 35.0588, "eval_samples_per_second": 4283.485, "eval_steps_per_second": 66.945, "step": 1120 }, { "epoch": 0.12039207330066055, "grad_norm": 0.029192611575126648, "learning_rate": 1.759215853398679e-05, "loss": 0.0068, "step": 1130 }, { "epoch": 0.12039207330066055, "eval_loss": 0.005835311952978373, "eval_runtime": 35.0615, "eval_samples_per_second": 4283.161, "eval_steps_per_second": 66.94, "step": 1130 }, { "epoch": 0.1214574898785425, "grad_norm": 0.03659944236278534, "learning_rate": 1.757085020242915e-05, "loss": 0.0141, "step": 1140 }, { "epoch": 0.1214574898785425, "eval_loss": 0.0052527920342981815, "eval_runtime": 35.0493, "eval_samples_per_second": 4284.648, "eval_steps_per_second": 66.963, "step": 1140 }, { "epoch": 0.12252290645642447, "grad_norm": 0.011439694091677666, "learning_rate": 1.754954187087151e-05, "loss": 0.0015, "step": 1150 }, { "epoch": 0.12252290645642447, "eval_loss": 0.005540814250707626, "eval_runtime": 35.0762, "eval_samples_per_second": 4281.361, "eval_steps_per_second": 66.911, "step": 1150 }, { "epoch": 0.12358832303430642, "grad_norm": 0.010304667986929417, "learning_rate": 1.7528233539313873e-05, "loss": 0.0044, "step": 1160 }, { "epoch": 0.12358832303430642, "eval_loss": 0.005671035032719374, "eval_runtime": 35.1111, "eval_samples_per_second": 4277.113, "eval_steps_per_second": 66.845, "step": 1160 }, { "epoch": 0.12465373961218837, "grad_norm": 1.2378696203231812, "learning_rate": 1.7506925207756235e-05, "loss": 0.0167, "step": 1170 }, { "epoch": 0.12465373961218837, "eval_loss": 0.005685662850737572, "eval_runtime": 35.0507, "eval_samples_per_second": 4284.477, "eval_steps_per_second": 66.96, "step": 1170 }, { "epoch": 0.12571915619007032, "grad_norm": 0.029286779463291168, "learning_rate": 1.7485616876198593e-05, "loss": 0.0092, "step": 1180 }, { "epoch": 0.12571915619007032, "eval_loss": 0.0064590792171657085, "eval_runtime": 35.0487, "eval_samples_per_second": 4284.729, "eval_steps_per_second": 66.964, "step": 1180 }, { "epoch": 0.12678457276795227, "grad_norm": 0.08294548094272614, "learning_rate": 1.7464308544640955e-05, "loss": 0.002, "step": 1190 }, { "epoch": 0.12678457276795227, "eval_loss": 0.0068722073920071125, "eval_runtime": 35.0733, "eval_samples_per_second": 4281.721, "eval_steps_per_second": 66.917, "step": 1190 }, { "epoch": 0.12784998934583422, "grad_norm": 0.10596469789743423, "learning_rate": 1.7443000213083317e-05, "loss": 0.0144, "step": 1200 }, { "epoch": 0.12784998934583422, "eval_loss": 0.005687403492629528, "eval_runtime": 35.0846, "eval_samples_per_second": 4280.338, "eval_steps_per_second": 66.895, "step": 1200 }, { "epoch": 0.12891540592371617, "grad_norm": 0.03988677263259888, "learning_rate": 1.742169188152568e-05, "loss": 0.0016, "step": 1210 }, { "epoch": 0.12891540592371617, "eval_loss": 0.005167008843272924, "eval_runtime": 35.0969, "eval_samples_per_second": 4278.845, "eval_steps_per_second": 66.872, "step": 1210 }, { "epoch": 0.12998082250159812, "grad_norm": 0.16653232276439667, "learning_rate": 1.7400383549968037e-05, "loss": 0.0045, "step": 1220 }, { "epoch": 0.12998082250159812, "eval_loss": 0.005050142761319876, "eval_runtime": 35.0847, "eval_samples_per_second": 4280.322, "eval_steps_per_second": 66.895, "step": 1220 }, { "epoch": 0.13104623907948007, "grad_norm": 1.8148539066314697, "learning_rate": 1.73790752184104e-05, "loss": 0.0052, "step": 1230 }, { "epoch": 0.13104623907948007, "eval_loss": 0.0050388118252158165, "eval_runtime": 35.0834, "eval_samples_per_second": 4280.482, "eval_steps_per_second": 66.898, "step": 1230 }, { "epoch": 0.13211165565736202, "grad_norm": 0.027562782168388367, "learning_rate": 1.735776688685276e-05, "loss": 0.0068, "step": 1240 }, { "epoch": 0.13211165565736202, "eval_loss": 0.005193131044507027, "eval_runtime": 35.0886, "eval_samples_per_second": 4279.855, "eval_steps_per_second": 66.888, "step": 1240 }, { "epoch": 0.13317707223524397, "grad_norm": 2.252390146255493, "learning_rate": 1.7336458555295123e-05, "loss": 0.0084, "step": 1250 }, { "epoch": 0.13317707223524397, "eval_loss": 0.005359725095331669, "eval_runtime": 35.056, "eval_samples_per_second": 4283.831, "eval_steps_per_second": 66.95, "step": 1250 }, { "epoch": 0.13424248881312592, "grad_norm": 0.13568538427352905, "learning_rate": 1.731515022373748e-05, "loss": 0.0055, "step": 1260 }, { "epoch": 0.13424248881312592, "eval_loss": 0.005222136154770851, "eval_runtime": 35.0994, "eval_samples_per_second": 4278.531, "eval_steps_per_second": 66.867, "step": 1260 }, { "epoch": 0.13530790539100787, "grad_norm": 0.020430419594049454, "learning_rate": 1.7293841892179843e-05, "loss": 0.0052, "step": 1270 }, { "epoch": 0.13530790539100787, "eval_loss": 0.005026769824326038, "eval_runtime": 35.0367, "eval_samples_per_second": 4286.186, "eval_steps_per_second": 66.987, "step": 1270 }, { "epoch": 0.13637332196888985, "grad_norm": 0.033307794481515884, "learning_rate": 1.7272533560622205e-05, "loss": 0.0111, "step": 1280 }, { "epoch": 0.13637332196888985, "eval_loss": 0.004983577877283096, "eval_runtime": 35.0924, "eval_samples_per_second": 4279.39, "eval_steps_per_second": 66.881, "step": 1280 }, { "epoch": 0.1374387385467718, "grad_norm": 0.010311348363757133, "learning_rate": 1.7251225229064567e-05, "loss": 0.0017, "step": 1290 }, { "epoch": 0.1374387385467718, "eval_loss": 0.005127367097884417, "eval_runtime": 35.0723, "eval_samples_per_second": 4281.841, "eval_steps_per_second": 66.919, "step": 1290 }, { "epoch": 0.13850415512465375, "grad_norm": 0.009298436343669891, "learning_rate": 1.7229916897506925e-05, "loss": 0.0055, "step": 1300 }, { "epoch": 0.13850415512465375, "eval_loss": 0.0049048615619540215, "eval_runtime": 35.062, "eval_samples_per_second": 4283.093, "eval_steps_per_second": 66.938, "step": 1300 }, { "epoch": 0.1395695717025357, "grad_norm": 0.009401198476552963, "learning_rate": 1.7208608565949287e-05, "loss": 0.0041, "step": 1310 }, { "epoch": 0.1395695717025357, "eval_loss": 0.005013017915189266, "eval_runtime": 35.0683, "eval_samples_per_second": 4282.332, "eval_steps_per_second": 66.927, "step": 1310 }, { "epoch": 0.14063498828041765, "grad_norm": 0.030792182311415672, "learning_rate": 1.718730023439165e-05, "loss": 0.0054, "step": 1320 }, { "epoch": 0.14063498828041765, "eval_loss": 0.0054897707886993885, "eval_runtime": 35.0864, "eval_samples_per_second": 4280.117, "eval_steps_per_second": 66.892, "step": 1320 }, { "epoch": 0.1417004048582996, "grad_norm": 0.5939790606498718, "learning_rate": 1.716599190283401e-05, "loss": 0.0137, "step": 1330 }, { "epoch": 0.1417004048582996, "eval_loss": 0.005527508445084095, "eval_runtime": 35.068, "eval_samples_per_second": 4282.366, "eval_steps_per_second": 66.927, "step": 1330 }, { "epoch": 0.14276582143618155, "grad_norm": 0.2677154242992401, "learning_rate": 1.714468357127637e-05, "loss": 0.0039, "step": 1340 }, { "epoch": 0.14276582143618155, "eval_loss": 0.004874934908002615, "eval_runtime": 35.0254, "eval_samples_per_second": 4287.575, "eval_steps_per_second": 67.009, "step": 1340 }, { "epoch": 0.1438312380140635, "grad_norm": 1.0665801763534546, "learning_rate": 1.712337523971873e-05, "loss": 0.0044, "step": 1350 }, { "epoch": 0.1438312380140635, "eval_loss": 0.004903439898043871, "eval_runtime": 35.0969, "eval_samples_per_second": 4278.835, "eval_steps_per_second": 66.872, "step": 1350 }, { "epoch": 0.14489665459194545, "grad_norm": 0.016081418842077255, "learning_rate": 1.7102066908161093e-05, "loss": 0.0006, "step": 1360 }, { "epoch": 0.14489665459194545, "eval_loss": 0.004955473821610212, "eval_runtime": 35.0973, "eval_samples_per_second": 4278.788, "eval_steps_per_second": 66.871, "step": 1360 }, { "epoch": 0.1459620711698274, "grad_norm": 0.012007024139165878, "learning_rate": 1.7080758576603455e-05, "loss": 0.0009, "step": 1370 }, { "epoch": 0.1459620711698274, "eval_loss": 0.005029810592532158, "eval_runtime": 35.1077, "eval_samples_per_second": 4277.52, "eval_steps_per_second": 66.851, "step": 1370 }, { "epoch": 0.14702748774770935, "grad_norm": 0.01866872049868107, "learning_rate": 1.7059450245045813e-05, "loss": 0.0005, "step": 1380 }, { "epoch": 0.14702748774770935, "eval_loss": 0.005090104416012764, "eval_runtime": 35.1091, "eval_samples_per_second": 4277.357, "eval_steps_per_second": 66.849, "step": 1380 }, { "epoch": 0.1480929043255913, "grad_norm": 0.030831903219223022, "learning_rate": 1.7038141913488175e-05, "loss": 0.0008, "step": 1390 }, { "epoch": 0.1480929043255913, "eval_loss": 0.005153083708137274, "eval_runtime": 35.0702, "eval_samples_per_second": 4282.1, "eval_steps_per_second": 66.923, "step": 1390 }, { "epoch": 0.14915832090347325, "grad_norm": 0.012699014507234097, "learning_rate": 1.7016833581930537e-05, "loss": 0.0068, "step": 1400 }, { "epoch": 0.14915832090347325, "eval_loss": 0.005071562714874744, "eval_runtime": 35.1126, "eval_samples_per_second": 4276.932, "eval_steps_per_second": 66.842, "step": 1400 }, { "epoch": 0.1502237374813552, "grad_norm": 0.022656958550214767, "learning_rate": 1.69955252503729e-05, "loss": 0.0035, "step": 1410 }, { "epoch": 0.1502237374813552, "eval_loss": 0.005125128198415041, "eval_runtime": 35.0598, "eval_samples_per_second": 4283.37, "eval_steps_per_second": 66.943, "step": 1410 }, { "epoch": 0.15128915405923715, "grad_norm": 0.00814993865787983, "learning_rate": 1.6974216918815257e-05, "loss": 0.001, "step": 1420 }, { "epoch": 0.15128915405923715, "eval_loss": 0.005287368781864643, "eval_runtime": 35.0555, "eval_samples_per_second": 4283.894, "eval_steps_per_second": 66.951, "step": 1420 }, { "epoch": 0.1523545706371191, "grad_norm": 0.06181171163916588, "learning_rate": 1.695290858725762e-05, "loss": 0.0006, "step": 1430 }, { "epoch": 0.1523545706371191, "eval_loss": 0.005389242433011532, "eval_runtime": 35.0758, "eval_samples_per_second": 4281.412, "eval_steps_per_second": 66.912, "step": 1430 }, { "epoch": 0.15341998721500105, "grad_norm": 0.018909545615315437, "learning_rate": 1.693160025569998e-05, "loss": 0.0101, "step": 1440 }, { "epoch": 0.15341998721500105, "eval_loss": 0.0051095616072416306, "eval_runtime": 35.0847, "eval_samples_per_second": 4280.327, "eval_steps_per_second": 66.895, "step": 1440 }, { "epoch": 0.15448540379288303, "grad_norm": 0.007403901778161526, "learning_rate": 1.6910291924142343e-05, "loss": 0.001, "step": 1450 }, { "epoch": 0.15448540379288303, "eval_loss": 0.0049713412299752235, "eval_runtime": 35.051, "eval_samples_per_second": 4284.439, "eval_steps_per_second": 66.96, "step": 1450 }, { "epoch": 0.15555082037076498, "grad_norm": 0.21636687219142914, "learning_rate": 1.68889835925847e-05, "loss": 0.0056, "step": 1460 }, { "epoch": 0.15555082037076498, "eval_loss": 0.0050105685368180275, "eval_runtime": 35.0503, "eval_samples_per_second": 4284.527, "eval_steps_per_second": 66.961, "step": 1460 }, { "epoch": 0.15661623694864693, "grad_norm": 0.021923823282122612, "learning_rate": 1.6867675261027063e-05, "loss": 0.0173, "step": 1470 }, { "epoch": 0.15661623694864693, "eval_loss": 0.004856941290199757, "eval_runtime": 35.1061, "eval_samples_per_second": 4277.715, "eval_steps_per_second": 66.854, "step": 1470 }, { "epoch": 0.15768165352652888, "grad_norm": 0.16257305443286896, "learning_rate": 1.6846366929469425e-05, "loss": 0.0056, "step": 1480 }, { "epoch": 0.15768165352652888, "eval_loss": 0.005984555929899216, "eval_runtime": 35.0991, "eval_samples_per_second": 4278.575, "eval_steps_per_second": 66.868, "step": 1480 }, { "epoch": 0.15874707010441083, "grad_norm": 0.3750154674053192, "learning_rate": 1.6825058597911787e-05, "loss": 0.0187, "step": 1490 }, { "epoch": 0.15874707010441083, "eval_loss": 0.006785502657294273, "eval_runtime": 35.0856, "eval_samples_per_second": 4280.213, "eval_steps_per_second": 66.893, "step": 1490 }, { "epoch": 0.15981248668229278, "grad_norm": 0.009664146229624748, "learning_rate": 1.6803750266354145e-05, "loss": 0.0037, "step": 1500 }, { "epoch": 0.15981248668229278, "eval_loss": 0.0051004113629460335, "eval_runtime": 35.0502, "eval_samples_per_second": 4284.534, "eval_steps_per_second": 66.961, "step": 1500 }, { "epoch": 0.16087790326017473, "grad_norm": 0.019265178591012955, "learning_rate": 1.6782441934796507e-05, "loss": 0.0048, "step": 1510 }, { "epoch": 0.16087790326017473, "eval_loss": 0.004774358589202166, "eval_runtime": 35.0608, "eval_samples_per_second": 4283.242, "eval_steps_per_second": 66.941, "step": 1510 }, { "epoch": 0.16194331983805668, "grad_norm": 0.02663380466401577, "learning_rate": 1.676113360323887e-05, "loss": 0.0008, "step": 1520 }, { "epoch": 0.16194331983805668, "eval_loss": 0.004892702680081129, "eval_runtime": 35.1044, "eval_samples_per_second": 4277.93, "eval_steps_per_second": 66.858, "step": 1520 }, { "epoch": 0.16300873641593863, "grad_norm": 0.559873104095459, "learning_rate": 1.673982527168123e-05, "loss": 0.0085, "step": 1530 }, { "epoch": 0.16300873641593863, "eval_loss": 0.005092754494398832, "eval_runtime": 35.0253, "eval_samples_per_second": 4287.587, "eval_steps_per_second": 67.009, "step": 1530 }, { "epoch": 0.16407415299382058, "grad_norm": 0.2393077164888382, "learning_rate": 1.671851694012359e-05, "loss": 0.0045, "step": 1540 }, { "epoch": 0.16407415299382058, "eval_loss": 0.005027854815125465, "eval_runtime": 35.0738, "eval_samples_per_second": 4281.653, "eval_steps_per_second": 66.916, "step": 1540 }, { "epoch": 0.16513956957170253, "grad_norm": 0.01315162144601345, "learning_rate": 1.669720860856595e-05, "loss": 0.0017, "step": 1550 }, { "epoch": 0.16513956957170253, "eval_loss": 0.004932031966745853, "eval_runtime": 35.05, "eval_samples_per_second": 4284.571, "eval_steps_per_second": 66.962, "step": 1550 }, { "epoch": 0.16620498614958448, "grad_norm": 1.2454622983932495, "learning_rate": 1.6675900277008313e-05, "loss": 0.0032, "step": 1560 }, { "epoch": 0.16620498614958448, "eval_loss": 0.005039810668677092, "eval_runtime": 35.064, "eval_samples_per_second": 4282.858, "eval_steps_per_second": 66.935, "step": 1560 }, { "epoch": 0.16727040272746643, "grad_norm": 0.07791124284267426, "learning_rate": 1.6654591945450675e-05, "loss": 0.0041, "step": 1570 }, { "epoch": 0.16727040272746643, "eval_loss": 0.004877708852291107, "eval_runtime": 35.1101, "eval_samples_per_second": 4277.232, "eval_steps_per_second": 66.847, "step": 1570 }, { "epoch": 0.16833581930534838, "grad_norm": 0.006707167252898216, "learning_rate": 1.6633283613893033e-05, "loss": 0.0098, "step": 1580 }, { "epoch": 0.16833581930534838, "eval_loss": 0.005282689351588488, "eval_runtime": 35.0557, "eval_samples_per_second": 4283.873, "eval_steps_per_second": 66.951, "step": 1580 }, { "epoch": 0.16940123588323033, "grad_norm": 0.12985199689865112, "learning_rate": 1.6611975282335395e-05, "loss": 0.006, "step": 1590 }, { "epoch": 0.16940123588323033, "eval_loss": 0.00591092836111784, "eval_runtime": 35.0663, "eval_samples_per_second": 4282.579, "eval_steps_per_second": 66.93, "step": 1590 }, { "epoch": 0.17046665246111228, "grad_norm": 0.006378485355526209, "learning_rate": 1.6590666950777757e-05, "loss": 0.0061, "step": 1600 }, { "epoch": 0.17046665246111228, "eval_loss": 0.005602375138550997, "eval_runtime": 35.0822, "eval_samples_per_second": 4280.633, "eval_steps_per_second": 66.9, "step": 1600 }, { "epoch": 0.17153206903899423, "grad_norm": 0.27390819787979126, "learning_rate": 1.656935861922012e-05, "loss": 0.0026, "step": 1610 }, { "epoch": 0.17153206903899423, "eval_loss": 0.005227269604802132, "eval_runtime": 35.064, "eval_samples_per_second": 4282.849, "eval_steps_per_second": 66.935, "step": 1610 }, { "epoch": 0.1725974856168762, "grad_norm": 0.01282795425504446, "learning_rate": 1.6548050287662477e-05, "loss": 0.0013, "step": 1620 }, { "epoch": 0.1725974856168762, "eval_loss": 0.005013093817979097, "eval_runtime": 35.0468, "eval_samples_per_second": 4284.953, "eval_steps_per_second": 66.968, "step": 1620 }, { "epoch": 0.17366290219475816, "grad_norm": 0.0058741201646625996, "learning_rate": 1.652674195610484e-05, "loss": 0.0015, "step": 1630 }, { "epoch": 0.17366290219475816, "eval_loss": 0.005092192441225052, "eval_runtime": 35.0896, "eval_samples_per_second": 4279.726, "eval_steps_per_second": 66.886, "step": 1630 }, { "epoch": 0.1747283187726401, "grad_norm": 0.022938504815101624, "learning_rate": 1.65054336245472e-05, "loss": 0.0006, "step": 1640 }, { "epoch": 0.1747283187726401, "eval_loss": 0.005200070794671774, "eval_runtime": 35.0531, "eval_samples_per_second": 4284.182, "eval_steps_per_second": 66.956, "step": 1640 }, { "epoch": 0.17579373535052206, "grad_norm": 0.03323187306523323, "learning_rate": 1.6484125292989562e-05, "loss": 0.0047, "step": 1650 }, { "epoch": 0.17579373535052206, "eval_loss": 0.005280145909637213, "eval_runtime": 35.0383, "eval_samples_per_second": 4285.991, "eval_steps_per_second": 66.984, "step": 1650 }, { "epoch": 0.176859151928404, "grad_norm": 0.005984348710626364, "learning_rate": 1.646281696143192e-05, "loss": 0.005, "step": 1660 }, { "epoch": 0.176859151928404, "eval_loss": 0.005126793868839741, "eval_runtime": 35.0851, "eval_samples_per_second": 4280.28, "eval_steps_per_second": 66.895, "step": 1660 }, { "epoch": 0.17792456850628596, "grad_norm": 0.02346760779619217, "learning_rate": 1.6441508629874283e-05, "loss": 0.0042, "step": 1670 }, { "epoch": 0.17792456850628596, "eval_loss": 0.005286376923322678, "eval_runtime": 35.0619, "eval_samples_per_second": 4283.111, "eval_steps_per_second": 66.939, "step": 1670 }, { "epoch": 0.1789899850841679, "grad_norm": 0.00583941163495183, "learning_rate": 1.6420200298316645e-05, "loss": 0.0059, "step": 1680 }, { "epoch": 0.1789899850841679, "eval_loss": 0.0052011506631970406, "eval_runtime": 35.0433, "eval_samples_per_second": 4285.389, "eval_steps_per_second": 66.974, "step": 1680 }, { "epoch": 0.18005540166204986, "grad_norm": 0.020624622702598572, "learning_rate": 1.6398891966759006e-05, "loss": 0.0019, "step": 1690 }, { "epoch": 0.18005540166204986, "eval_loss": 0.004804234951734543, "eval_runtime": 35.0973, "eval_samples_per_second": 4278.789, "eval_steps_per_second": 66.871, "step": 1690 }, { "epoch": 0.1811208182399318, "grad_norm": 1.2170437574386597, "learning_rate": 1.6377583635201365e-05, "loss": 0.0071, "step": 1700 }, { "epoch": 0.1811208182399318, "eval_loss": 0.004888548050075769, "eval_runtime": 35.1137, "eval_samples_per_second": 4276.79, "eval_steps_per_second": 66.84, "step": 1700 }, { "epoch": 0.18218623481781376, "grad_norm": 0.030709806829690933, "learning_rate": 1.6356275303643723e-05, "loss": 0.0025, "step": 1710 }, { "epoch": 0.18218623481781376, "eval_loss": 0.004803914111107588, "eval_runtime": 35.0555, "eval_samples_per_second": 4283.895, "eval_steps_per_second": 66.951, "step": 1710 }, { "epoch": 0.1832516513956957, "grad_norm": 4.421119213104248, "learning_rate": 1.6334966972086085e-05, "loss": 0.0183, "step": 1720 }, { "epoch": 0.1832516513956957, "eval_loss": 0.004751955159008503, "eval_runtime": 35.0418, "eval_samples_per_second": 4285.571, "eval_steps_per_second": 66.977, "step": 1720 }, { "epoch": 0.18431706797357766, "grad_norm": 0.009466009214520454, "learning_rate": 1.6313658640528447e-05, "loss": 0.0035, "step": 1730 }, { "epoch": 0.18431706797357766, "eval_loss": 0.0049493880942463875, "eval_runtime": 35.0694, "eval_samples_per_second": 4282.201, "eval_steps_per_second": 66.925, "step": 1730 }, { "epoch": 0.18538248455145961, "grad_norm": 0.259084016084671, "learning_rate": 1.629235030897081e-05, "loss": 0.0062, "step": 1740 }, { "epoch": 0.18538248455145961, "eval_loss": 0.00492563983425498, "eval_runtime": 35.0548, "eval_samples_per_second": 4283.98, "eval_steps_per_second": 66.952, "step": 1740 }, { "epoch": 0.18644790112934156, "grad_norm": 0.07872737944126129, "learning_rate": 1.6271041977413167e-05, "loss": 0.0044, "step": 1750 }, { "epoch": 0.18644790112934156, "eval_loss": 0.004815262276679277, "eval_runtime": 35.0635, "eval_samples_per_second": 4282.919, "eval_steps_per_second": 66.936, "step": 1750 }, { "epoch": 0.18751331770722351, "grad_norm": 0.008825350552797318, "learning_rate": 1.624973364585553e-05, "loss": 0.001, "step": 1760 }, { "epoch": 0.18751331770722351, "eval_loss": 0.00481291301548481, "eval_runtime": 35.0737, "eval_samples_per_second": 4281.672, "eval_steps_per_second": 66.916, "step": 1760 }, { "epoch": 0.18857873428510546, "grad_norm": 0.03307470306754112, "learning_rate": 1.622842531429789e-05, "loss": 0.0128, "step": 1770 }, { "epoch": 0.18857873428510546, "eval_loss": 0.004740755073726177, "eval_runtime": 35.0789, "eval_samples_per_second": 4281.038, "eval_steps_per_second": 66.906, "step": 1770 }, { "epoch": 0.18964415086298742, "grad_norm": 0.058342017233371735, "learning_rate": 1.6207116982740253e-05, "loss": 0.0021, "step": 1780 }, { "epoch": 0.18964415086298742, "eval_loss": 0.00447422219440341, "eval_runtime": 35.0637, "eval_samples_per_second": 4282.894, "eval_steps_per_second": 66.935, "step": 1780 }, { "epoch": 0.1907095674408694, "grad_norm": 0.5684050917625427, "learning_rate": 1.618580865118261e-05, "loss": 0.0051, "step": 1790 }, { "epoch": 0.1907095674408694, "eval_loss": 0.004538466222584248, "eval_runtime": 35.0598, "eval_samples_per_second": 4283.37, "eval_steps_per_second": 66.943, "step": 1790 }, { "epoch": 0.19177498401875134, "grad_norm": 4.642019748687744, "learning_rate": 1.6164500319624973e-05, "loss": 0.0041, "step": 1800 }, { "epoch": 0.19177498401875134, "eval_loss": 0.004466844256967306, "eval_runtime": 35.066, "eval_samples_per_second": 4282.609, "eval_steps_per_second": 66.931, "step": 1800 }, { "epoch": 0.1928404005966333, "grad_norm": 0.08696554601192474, "learning_rate": 1.6143191988067335e-05, "loss": 0.001, "step": 1810 }, { "epoch": 0.1928404005966333, "eval_loss": 0.004406987689435482, "eval_runtime": 35.0911, "eval_samples_per_second": 4279.541, "eval_steps_per_second": 66.883, "step": 1810 }, { "epoch": 0.19390581717451524, "grad_norm": 2.991973876953125, "learning_rate": 1.6121883656509697e-05, "loss": 0.0039, "step": 1820 }, { "epoch": 0.19390581717451524, "eval_loss": 0.004455452784895897, "eval_runtime": 35.0798, "eval_samples_per_second": 4280.926, "eval_steps_per_second": 66.905, "step": 1820 }, { "epoch": 0.1949712337523972, "grad_norm": 0.005264118313789368, "learning_rate": 1.6100575324952055e-05, "loss": 0.0006, "step": 1830 }, { "epoch": 0.1949712337523972, "eval_loss": 0.004717789124697447, "eval_runtime": 35.0708, "eval_samples_per_second": 4282.025, "eval_steps_per_second": 66.922, "step": 1830 }, { "epoch": 0.19603665033027914, "grad_norm": 0.03296063467860222, "learning_rate": 1.6079266993394417e-05, "loss": 0.0006, "step": 1840 }, { "epoch": 0.19603665033027914, "eval_loss": 0.004698717035353184, "eval_runtime": 35.1299, "eval_samples_per_second": 4274.824, "eval_steps_per_second": 66.809, "step": 1840 }, { "epoch": 0.1971020669081611, "grad_norm": 0.0412713959813118, "learning_rate": 1.605795866183678e-05, "loss": 0.0032, "step": 1850 }, { "epoch": 0.1971020669081611, "eval_loss": 0.0045752511359751225, "eval_runtime": 35.0645, "eval_samples_per_second": 4282.79, "eval_steps_per_second": 66.934, "step": 1850 }, { "epoch": 0.19816748348604304, "grad_norm": 0.03118061274290085, "learning_rate": 1.603665033027914e-05, "loss": 0.0009, "step": 1860 }, { "epoch": 0.19816748348604304, "eval_loss": 0.0048514497466385365, "eval_runtime": 35.0481, "eval_samples_per_second": 4284.795, "eval_steps_per_second": 66.965, "step": 1860 }, { "epoch": 0.199232900063925, "grad_norm": 0.778282105922699, "learning_rate": 1.6015341998721503e-05, "loss": 0.0098, "step": 1870 }, { "epoch": 0.199232900063925, "eval_loss": 0.00497164111584425, "eval_runtime": 35.0663, "eval_samples_per_second": 4282.574, "eval_steps_per_second": 66.93, "step": 1870 }, { "epoch": 0.20029831664180694, "grad_norm": 0.007694170344620943, "learning_rate": 1.599403366716386e-05, "loss": 0.0005, "step": 1880 }, { "epoch": 0.20029831664180694, "eval_loss": 0.004604881163686514, "eval_runtime": 35.0134, "eval_samples_per_second": 4289.046, "eval_steps_per_second": 67.032, "step": 1880 }, { "epoch": 0.2013637332196889, "grad_norm": 0.9302027821540833, "learning_rate": 1.5972725335606223e-05, "loss": 0.0048, "step": 1890 }, { "epoch": 0.2013637332196889, "eval_loss": 0.004474525805562735, "eval_runtime": 35.0823, "eval_samples_per_second": 4280.622, "eval_steps_per_second": 66.9, "step": 1890 }, { "epoch": 0.20242914979757085, "grad_norm": 0.00496167317032814, "learning_rate": 1.5951417004048585e-05, "loss": 0.0038, "step": 1900 }, { "epoch": 0.20242914979757085, "eval_loss": 0.004204958211630583, "eval_runtime": 35.0994, "eval_samples_per_second": 4278.533, "eval_steps_per_second": 66.867, "step": 1900 }, { "epoch": 0.2034945663754528, "grad_norm": 0.24391968548297882, "learning_rate": 1.5930108672490947e-05, "loss": 0.0014, "step": 1910 }, { "epoch": 0.2034945663754528, "eval_loss": 0.004156021401286125, "eval_runtime": 35.055, "eval_samples_per_second": 4283.949, "eval_steps_per_second": 66.952, "step": 1910 }, { "epoch": 0.20455998295333475, "grad_norm": 0.21614207327365875, "learning_rate": 1.5908800340933305e-05, "loss": 0.0038, "step": 1920 }, { "epoch": 0.20455998295333475, "eval_loss": 0.004162695724517107, "eval_runtime": 35.0833, "eval_samples_per_second": 4280.504, "eval_steps_per_second": 66.898, "step": 1920 }, { "epoch": 0.2056253995312167, "grad_norm": 0.08646874129772186, "learning_rate": 1.5887492009375667e-05, "loss": 0.0007, "step": 1930 }, { "epoch": 0.2056253995312167, "eval_loss": 0.00422044238075614, "eval_runtime": 35.0779, "eval_samples_per_second": 4281.163, "eval_steps_per_second": 66.908, "step": 1930 }, { "epoch": 0.20669081610909865, "grad_norm": 0.004879661835730076, "learning_rate": 1.586618367781803e-05, "loss": 0.0123, "step": 1940 }, { "epoch": 0.20669081610909865, "eval_loss": 0.004559625405818224, "eval_runtime": 35.0448, "eval_samples_per_second": 4285.202, "eval_steps_per_second": 66.971, "step": 1940 }, { "epoch": 0.2077562326869806, "grad_norm": 0.15028773248195648, "learning_rate": 1.584487534626039e-05, "loss": 0.0124, "step": 1950 }, { "epoch": 0.2077562326869806, "eval_loss": 0.005765740759670734, "eval_runtime": 35.0383, "eval_samples_per_second": 4285.999, "eval_steps_per_second": 66.984, "step": 1950 }, { "epoch": 0.20882164926486257, "grad_norm": 0.004790129140019417, "learning_rate": 1.582356701470275e-05, "loss": 0.0089, "step": 1960 }, { "epoch": 0.20882164926486257, "eval_loss": 0.004500931594520807, "eval_runtime": 35.0794, "eval_samples_per_second": 4280.971, "eval_steps_per_second": 66.905, "step": 1960 }, { "epoch": 0.20988706584274452, "grad_norm": 3.749185562133789, "learning_rate": 1.580225868314511e-05, "loss": 0.0115, "step": 1970 }, { "epoch": 0.20988706584274452, "eval_loss": 0.004156744107604027, "eval_runtime": 35.0591, "eval_samples_per_second": 4283.456, "eval_steps_per_second": 66.944, "step": 1970 }, { "epoch": 0.21095248242062647, "grad_norm": 0.009407439269125462, "learning_rate": 1.5780950351587473e-05, "loss": 0.001, "step": 1980 }, { "epoch": 0.21095248242062647, "eval_loss": 0.005346408113837242, "eval_runtime": 35.0873, "eval_samples_per_second": 4280.009, "eval_steps_per_second": 66.89, "step": 1980 }, { "epoch": 0.21201789899850843, "grad_norm": 4.62002420425415, "learning_rate": 1.5759642020029834e-05, "loss": 0.0068, "step": 1990 }, { "epoch": 0.21201789899850843, "eval_loss": 0.006435367278754711, "eval_runtime": 35.1066, "eval_samples_per_second": 4277.661, "eval_steps_per_second": 66.854, "step": 1990 }, { "epoch": 0.21308331557639038, "grad_norm": 0.005438864231109619, "learning_rate": 1.5738333688472193e-05, "loss": 0.0027, "step": 2000 }, { "epoch": 0.21308331557639038, "eval_loss": 0.00453655980527401, "eval_runtime": 35.0641, "eval_samples_per_second": 4282.841, "eval_steps_per_second": 66.935, "step": 2000 }, { "epoch": 0.21414873215427233, "grad_norm": 0.05276772007346153, "learning_rate": 1.5717025356914555e-05, "loss": 0.0049, "step": 2010 }, { "epoch": 0.21414873215427233, "eval_loss": 0.004296323750168085, "eval_runtime": 35.0599, "eval_samples_per_second": 4283.356, "eval_steps_per_second": 66.943, "step": 2010 }, { "epoch": 0.21521414873215428, "grad_norm": 0.0866408571600914, "learning_rate": 1.5695717025356917e-05, "loss": 0.0005, "step": 2020 }, { "epoch": 0.21521414873215428, "eval_loss": 0.0043297079391777515, "eval_runtime": 35.0108, "eval_samples_per_second": 4289.364, "eval_steps_per_second": 67.036, "step": 2020 }, { "epoch": 0.21627956531003623, "grad_norm": 0.03794199973344803, "learning_rate": 1.567440869379928e-05, "loss": 0.0007, "step": 2030 }, { "epoch": 0.21627956531003623, "eval_loss": 0.004399747122079134, "eval_runtime": 35.043, "eval_samples_per_second": 4285.424, "eval_steps_per_second": 66.975, "step": 2030 }, { "epoch": 0.21734498188791818, "grad_norm": 0.1419890820980072, "learning_rate": 1.5653100362241637e-05, "loss": 0.011, "step": 2040 }, { "epoch": 0.21734498188791818, "eval_loss": 0.004798985552042723, "eval_runtime": 35.2407, "eval_samples_per_second": 4261.377, "eval_steps_per_second": 66.599, "step": 2040 }, { "epoch": 0.21841039846580013, "grad_norm": 0.004484011325985193, "learning_rate": 1.5631792030684e-05, "loss": 0.0051, "step": 2050 }, { "epoch": 0.21841039846580013, "eval_loss": 0.0046847849152982235, "eval_runtime": 35.045, "eval_samples_per_second": 4285.182, "eval_steps_per_second": 66.971, "step": 2050 }, { "epoch": 0.21947581504368208, "grad_norm": 0.04429204761981964, "learning_rate": 1.561048369912636e-05, "loss": 0.0012, "step": 2060 }, { "epoch": 0.21947581504368208, "eval_loss": 0.004873516503721476, "eval_runtime": 35.0605, "eval_samples_per_second": 4283.283, "eval_steps_per_second": 66.941, "step": 2060 }, { "epoch": 0.22054123162156403, "grad_norm": 2.6432743072509766, "learning_rate": 1.5589175367568722e-05, "loss": 0.0134, "step": 2070 }, { "epoch": 0.22054123162156403, "eval_loss": 0.004669446498155594, "eval_runtime": 35.0459, "eval_samples_per_second": 4285.071, "eval_steps_per_second": 66.969, "step": 2070 }, { "epoch": 0.22160664819944598, "grad_norm": 0.004335370380431414, "learning_rate": 1.556786703601108e-05, "loss": 0.0006, "step": 2080 }, { "epoch": 0.22160664819944598, "eval_loss": 0.0043738046661019325, "eval_runtime": 35.0814, "eval_samples_per_second": 4280.727, "eval_steps_per_second": 66.902, "step": 2080 }, { "epoch": 0.22267206477732793, "grad_norm": 0.09280374646186829, "learning_rate": 1.5546558704453443e-05, "loss": 0.0006, "step": 2090 }, { "epoch": 0.22267206477732793, "eval_loss": 0.0046281940303742886, "eval_runtime": 35.0542, "eval_samples_per_second": 4284.053, "eval_steps_per_second": 66.953, "step": 2090 }, { "epoch": 0.22373748135520988, "grad_norm": 0.005173602141439915, "learning_rate": 1.5525250372895804e-05, "loss": 0.0012, "step": 2100 }, { "epoch": 0.22373748135520988, "eval_loss": 0.004610604140907526, "eval_runtime": 35.0581, "eval_samples_per_second": 4283.58, "eval_steps_per_second": 66.946, "step": 2100 }, { "epoch": 0.22480289793309183, "grad_norm": 2.3304152488708496, "learning_rate": 1.5503942041338166e-05, "loss": 0.006, "step": 2110 }, { "epoch": 0.22480289793309183, "eval_loss": 0.004738961812108755, "eval_runtime": 35.0641, "eval_samples_per_second": 4282.84, "eval_steps_per_second": 66.935, "step": 2110 }, { "epoch": 0.22586831451097378, "grad_norm": 0.004037824459373951, "learning_rate": 1.5482633709780525e-05, "loss": 0.0004, "step": 2120 }, { "epoch": 0.22586831451097378, "eval_loss": 0.004827072378247976, "eval_runtime": 35.0285, "eval_samples_per_second": 4287.2, "eval_steps_per_second": 67.003, "step": 2120 }, { "epoch": 0.22693373108885576, "grad_norm": 0.018360449001193047, "learning_rate": 1.5461325378222887e-05, "loss": 0.0176, "step": 2130 }, { "epoch": 0.22693373108885576, "eval_loss": 0.004639809485524893, "eval_runtime": 35.0055, "eval_samples_per_second": 4290.006, "eval_steps_per_second": 67.047, "step": 2130 }, { "epoch": 0.2279991476667377, "grad_norm": 0.14667311310768127, "learning_rate": 1.544001704666525e-05, "loss": 0.0166, "step": 2140 }, { "epoch": 0.2279991476667377, "eval_loss": 0.004509914666414261, "eval_runtime": 35.0516, "eval_samples_per_second": 4284.375, "eval_steps_per_second": 66.959, "step": 2140 }, { "epoch": 0.22906456424461966, "grad_norm": 0.1163237988948822, "learning_rate": 1.541870871510761e-05, "loss": 0.0022, "step": 2150 }, { "epoch": 0.22906456424461966, "eval_loss": 0.004260140936821699, "eval_runtime": 35.0677, "eval_samples_per_second": 4282.404, "eval_steps_per_second": 66.928, "step": 2150 }, { "epoch": 0.2301299808225016, "grad_norm": 0.11981041729450226, "learning_rate": 1.539740038354997e-05, "loss": 0.0021, "step": 2160 }, { "epoch": 0.2301299808225016, "eval_loss": 0.0041451407596468925, "eval_runtime": 35.0698, "eval_samples_per_second": 4282.144, "eval_steps_per_second": 66.924, "step": 2160 }, { "epoch": 0.23119539740038356, "grad_norm": 0.009077006950974464, "learning_rate": 1.537609205199233e-05, "loss": 0.0037, "step": 2170 }, { "epoch": 0.23119539740038356, "eval_loss": 0.00424983911216259, "eval_runtime": 35.0442, "eval_samples_per_second": 4285.276, "eval_steps_per_second": 66.973, "step": 2170 }, { "epoch": 0.2322608139782655, "grad_norm": 0.8403615355491638, "learning_rate": 1.5354783720434692e-05, "loss": 0.0086, "step": 2180 }, { "epoch": 0.2322608139782655, "eval_loss": 0.004347871523350477, "eval_runtime": 35.0585, "eval_samples_per_second": 4283.529, "eval_steps_per_second": 66.945, "step": 2180 }, { "epoch": 0.23332623055614746, "grad_norm": 0.7031656503677368, "learning_rate": 1.5333475388877054e-05, "loss": 0.0087, "step": 2190 }, { "epoch": 0.23332623055614746, "eval_loss": 0.004369079601019621, "eval_runtime": 35.0569, "eval_samples_per_second": 4283.716, "eval_steps_per_second": 66.948, "step": 2190 }, { "epoch": 0.2343916471340294, "grad_norm": 0.004467003047466278, "learning_rate": 1.5312167057319413e-05, "loss": 0.0013, "step": 2200 }, { "epoch": 0.2343916471340294, "eval_loss": 0.004685032181441784, "eval_runtime": 35.0162, "eval_samples_per_second": 4288.697, "eval_steps_per_second": 67.026, "step": 2200 }, { "epoch": 0.23545706371191136, "grad_norm": 0.3929450809955597, "learning_rate": 1.5290858725761775e-05, "loss": 0.0034, "step": 2210 }, { "epoch": 0.23545706371191136, "eval_loss": 0.0044579585082829, "eval_runtime": 35.0558, "eval_samples_per_second": 4283.855, "eval_steps_per_second": 66.95, "step": 2210 }, { "epoch": 0.2365224802897933, "grad_norm": 0.004594275262206793, "learning_rate": 1.5269550394204136e-05, "loss": 0.0065, "step": 2220 }, { "epoch": 0.2365224802897933, "eval_loss": 0.005013572052121162, "eval_runtime": 35.0748, "eval_samples_per_second": 4281.54, "eval_steps_per_second": 66.914, "step": 2220 }, { "epoch": 0.23758789686767526, "grad_norm": 0.0050141457468271255, "learning_rate": 1.5248242062646496e-05, "loss": 0.0054, "step": 2230 }, { "epoch": 0.23758789686767526, "eval_loss": 0.004857253283262253, "eval_runtime": 35.057, "eval_samples_per_second": 4283.714, "eval_steps_per_second": 66.948, "step": 2230 }, { "epoch": 0.2386533134455572, "grad_norm": 0.041468288749456406, "learning_rate": 1.5226933731088858e-05, "loss": 0.0007, "step": 2240 }, { "epoch": 0.2386533134455572, "eval_loss": 0.004677619785070419, "eval_runtime": 35.0514, "eval_samples_per_second": 4284.389, "eval_steps_per_second": 66.959, "step": 2240 }, { "epoch": 0.23971873002343916, "grad_norm": 0.004301860462874174, "learning_rate": 1.5205625399531218e-05, "loss": 0.0009, "step": 2250 }, { "epoch": 0.23971873002343916, "eval_loss": 0.004459399729967117, "eval_runtime": 35.0712, "eval_samples_per_second": 4281.976, "eval_steps_per_second": 66.921, "step": 2250 }, { "epoch": 0.2407841466013211, "grad_norm": 0.38491347432136536, "learning_rate": 1.518431706797358e-05, "loss": 0.0018, "step": 2260 }, { "epoch": 0.2407841466013211, "eval_loss": 0.004341489169746637, "eval_runtime": 35.09, "eval_samples_per_second": 4279.681, "eval_steps_per_second": 66.885, "step": 2260 }, { "epoch": 0.24184956317920306, "grad_norm": 0.0182588379830122, "learning_rate": 1.516300873641594e-05, "loss": 0.0049, "step": 2270 }, { "epoch": 0.24184956317920306, "eval_loss": 0.00429992750287056, "eval_runtime": 35.0536, "eval_samples_per_second": 4284.123, "eval_steps_per_second": 66.955, "step": 2270 }, { "epoch": 0.242914979757085, "grad_norm": 0.0038155666552484035, "learning_rate": 1.5141700404858302e-05, "loss": 0.0012, "step": 2280 }, { "epoch": 0.242914979757085, "eval_loss": 0.0042260088957846165, "eval_runtime": 35.0614, "eval_samples_per_second": 4283.169, "eval_steps_per_second": 66.94, "step": 2280 }, { "epoch": 0.24398039633496696, "grad_norm": 0.003924189601093531, "learning_rate": 1.5120392073300662e-05, "loss": 0.0077, "step": 2290 }, { "epoch": 0.24398039633496696, "eval_loss": 0.004574434366077185, "eval_runtime": 35.0275, "eval_samples_per_second": 4287.317, "eval_steps_per_second": 67.004, "step": 2290 }, { "epoch": 0.24504581291284894, "grad_norm": 0.025482522323727608, "learning_rate": 1.5099083741743024e-05, "loss": 0.0071, "step": 2300 }, { "epoch": 0.24504581291284894, "eval_loss": 0.005042645614594221, "eval_runtime": 35.027, "eval_samples_per_second": 4287.374, "eval_steps_per_second": 67.005, "step": 2300 }, { "epoch": 0.2461112294907309, "grad_norm": 0.003832248505204916, "learning_rate": 1.5077775410185384e-05, "loss": 0.0026, "step": 2310 }, { "epoch": 0.2461112294907309, "eval_loss": 0.0048546576872467995, "eval_runtime": 35.0534, "eval_samples_per_second": 4284.146, "eval_steps_per_second": 66.955, "step": 2310 }, { "epoch": 0.24717664606861284, "grad_norm": 0.0844670832157135, "learning_rate": 1.5056467078627746e-05, "loss": 0.0047, "step": 2320 }, { "epoch": 0.24717664606861284, "eval_loss": 0.00422197300940752, "eval_runtime": 35.0619, "eval_samples_per_second": 4283.111, "eval_steps_per_second": 66.939, "step": 2320 }, { "epoch": 0.2482420626464948, "grad_norm": 0.039526067674160004, "learning_rate": 1.5035158747070106e-05, "loss": 0.0008, "step": 2330 }, { "epoch": 0.2482420626464948, "eval_loss": 0.004363663960248232, "eval_runtime": 35.0057, "eval_samples_per_second": 4289.986, "eval_steps_per_second": 67.046, "step": 2330 }, { "epoch": 0.24930747922437674, "grad_norm": 0.036807432770729065, "learning_rate": 1.5013850415512468e-05, "loss": 0.005, "step": 2340 }, { "epoch": 0.24930747922437674, "eval_loss": 0.00450093112885952, "eval_runtime": 35.0239, "eval_samples_per_second": 4287.753, "eval_steps_per_second": 67.011, "step": 2340 }, { "epoch": 0.2503728958022587, "grad_norm": 0.013508515432476997, "learning_rate": 1.4992542083954828e-05, "loss": 0.0011, "step": 2350 }, { "epoch": 0.2503728958022587, "eval_loss": 0.004736943170428276, "eval_runtime": 35.0146, "eval_samples_per_second": 4288.902, "eval_steps_per_second": 67.029, "step": 2350 }, { "epoch": 0.25143831238014064, "grad_norm": 0.13655096292495728, "learning_rate": 1.497123375239719e-05, "loss": 0.0006, "step": 2360 }, { "epoch": 0.25143831238014064, "eval_loss": 0.004960217047482729, "eval_runtime": 35.0563, "eval_samples_per_second": 4283.792, "eval_steps_per_second": 66.949, "step": 2360 }, { "epoch": 0.2525037289580226, "grad_norm": 1.2185442447662354, "learning_rate": 1.494992542083955e-05, "loss": 0.0104, "step": 2370 }, { "epoch": 0.2525037289580226, "eval_loss": 0.004864447750151157, "eval_runtime": 35.0658, "eval_samples_per_second": 4282.637, "eval_steps_per_second": 66.931, "step": 2370 }, { "epoch": 0.25356914553590454, "grad_norm": 0.036553967744112015, "learning_rate": 1.4928617089281912e-05, "loss": 0.0074, "step": 2380 }, { "epoch": 0.25356914553590454, "eval_loss": 0.004570557735860348, "eval_runtime": 35.027, "eval_samples_per_second": 4287.378, "eval_steps_per_second": 67.005, "step": 2380 }, { "epoch": 0.2546345621137865, "grad_norm": 0.027491575106978416, "learning_rate": 1.490730875772427e-05, "loss": 0.0011, "step": 2390 }, { "epoch": 0.2546345621137865, "eval_loss": 0.004696827381849289, "eval_runtime": 35.0112, "eval_samples_per_second": 4289.31, "eval_steps_per_second": 67.036, "step": 2390 }, { "epoch": 0.25569997869166844, "grad_norm": 0.23158523440361023, "learning_rate": 1.488600042616663e-05, "loss": 0.0104, "step": 2400 }, { "epoch": 0.25569997869166844, "eval_loss": 0.004320676904171705, "eval_runtime": 35.0305, "eval_samples_per_second": 4286.953, "eval_steps_per_second": 66.999, "step": 2400 }, { "epoch": 0.2567653952695504, "grad_norm": 0.007454337552189827, "learning_rate": 1.4864692094608993e-05, "loss": 0.0093, "step": 2410 }, { "epoch": 0.2567653952695504, "eval_loss": 0.004152194131165743, "eval_runtime": 35.0245, "eval_samples_per_second": 4287.684, "eval_steps_per_second": 67.01, "step": 2410 }, { "epoch": 0.25783081184743234, "grad_norm": 0.006590006407350302, "learning_rate": 1.4843383763051353e-05, "loss": 0.0007, "step": 2420 }, { "epoch": 0.25783081184743234, "eval_loss": 0.004091034177690744, "eval_runtime": 35.0369, "eval_samples_per_second": 4286.17, "eval_steps_per_second": 66.987, "step": 2420 }, { "epoch": 0.2588962284253143, "grad_norm": 0.08318906277418137, "learning_rate": 1.4822075431493715e-05, "loss": 0.0032, "step": 2430 }, { "epoch": 0.2588962284253143, "eval_loss": 0.004115572199225426, "eval_runtime": 35.0695, "eval_samples_per_second": 4282.183, "eval_steps_per_second": 66.924, "step": 2430 }, { "epoch": 0.25996164500319624, "grad_norm": 0.25250810384750366, "learning_rate": 1.4800767099936075e-05, "loss": 0.0025, "step": 2440 }, { "epoch": 0.25996164500319624, "eval_loss": 0.004098709672689438, "eval_runtime": 35.0339, "eval_samples_per_second": 4286.54, "eval_steps_per_second": 66.992, "step": 2440 }, { "epoch": 0.2610270615810782, "grad_norm": 0.004992151632905006, "learning_rate": 1.4779458768378437e-05, "loss": 0.0019, "step": 2450 }, { "epoch": 0.2610270615810782, "eval_loss": 0.00410530436784029, "eval_runtime": 35.0414, "eval_samples_per_second": 4285.617, "eval_steps_per_second": 66.978, "step": 2450 }, { "epoch": 0.26209247815896014, "grad_norm": 2.30206036567688, "learning_rate": 1.4758150436820797e-05, "loss": 0.0107, "step": 2460 }, { "epoch": 0.26209247815896014, "eval_loss": 0.004170614294707775, "eval_runtime": 35.0183, "eval_samples_per_second": 4288.444, "eval_steps_per_second": 67.022, "step": 2460 }, { "epoch": 0.2631578947368421, "grad_norm": 0.07904893159866333, "learning_rate": 1.4736842105263159e-05, "loss": 0.0032, "step": 2470 }, { "epoch": 0.2631578947368421, "eval_loss": 0.004105927422642708, "eval_runtime": 35.0424, "eval_samples_per_second": 4285.494, "eval_steps_per_second": 66.976, "step": 2470 }, { "epoch": 0.26422331131472404, "grad_norm": 0.0035470998845994473, "learning_rate": 1.4715533773705519e-05, "loss": 0.0074, "step": 2480 }, { "epoch": 0.26422331131472404, "eval_loss": 0.004295279737561941, "eval_runtime": 35.0454, "eval_samples_per_second": 4285.128, "eval_steps_per_second": 66.97, "step": 2480 }, { "epoch": 0.265288727892606, "grad_norm": 0.4439772665500641, "learning_rate": 1.469422544214788e-05, "loss": 0.0048, "step": 2490 }, { "epoch": 0.265288727892606, "eval_loss": 0.004732625558972359, "eval_runtime": 35.0313, "eval_samples_per_second": 4286.848, "eval_steps_per_second": 66.997, "step": 2490 }, { "epoch": 0.26635414447048794, "grad_norm": 0.0037416014820337296, "learning_rate": 1.467291711059024e-05, "loss": 0.0026, "step": 2500 }, { "epoch": 0.26635414447048794, "eval_loss": 0.004636832047253847, "eval_runtime": 35.0307, "eval_samples_per_second": 4286.927, "eval_steps_per_second": 66.998, "step": 2500 }, { "epoch": 0.2674195610483699, "grad_norm": 0.003717947518453002, "learning_rate": 1.4651608779032603e-05, "loss": 0.0048, "step": 2510 }, { "epoch": 0.2674195610483699, "eval_loss": 0.004365737084299326, "eval_runtime": 35.0714, "eval_samples_per_second": 4281.948, "eval_steps_per_second": 66.921, "step": 2510 }, { "epoch": 0.26848497762625184, "grad_norm": 0.031179407611489296, "learning_rate": 1.4630300447474963e-05, "loss": 0.0009, "step": 2520 }, { "epoch": 0.26848497762625184, "eval_loss": 0.004146276507526636, "eval_runtime": 35.0595, "eval_samples_per_second": 4283.409, "eval_steps_per_second": 66.943, "step": 2520 }, { "epoch": 0.2695503942041338, "grad_norm": 0.05639449879527092, "learning_rate": 1.4608992115917325e-05, "loss": 0.0079, "step": 2530 }, { "epoch": 0.2695503942041338, "eval_loss": 0.004130475223064423, "eval_runtime": 35.0379, "eval_samples_per_second": 4286.044, "eval_steps_per_second": 66.985, "step": 2530 }, { "epoch": 0.27061581078201574, "grad_norm": 0.003375578671693802, "learning_rate": 1.4587683784359685e-05, "loss": 0.0008, "step": 2540 }, { "epoch": 0.27061581078201574, "eval_loss": 0.004228705074638128, "eval_runtime": 35.0234, "eval_samples_per_second": 4287.822, "eval_steps_per_second": 67.012, "step": 2540 }, { "epoch": 0.2716812273598977, "grad_norm": 0.004005058668553829, "learning_rate": 1.4566375452802046e-05, "loss": 0.0138, "step": 2550 }, { "epoch": 0.2716812273598977, "eval_loss": 0.004597960971295834, "eval_runtime": 35.0266, "eval_samples_per_second": 4287.433, "eval_steps_per_second": 67.006, "step": 2550 }, { "epoch": 0.2727466439377797, "grad_norm": 0.0035304948687553406, "learning_rate": 1.4545067121244407e-05, "loss": 0.0055, "step": 2560 }, { "epoch": 0.2727466439377797, "eval_loss": 0.005362317897379398, "eval_runtime": 35.0017, "eval_samples_per_second": 4290.48, "eval_steps_per_second": 67.054, "step": 2560 }, { "epoch": 0.27381206051566165, "grad_norm": 0.00467054545879364, "learning_rate": 1.4523758789686768e-05, "loss": 0.0132, "step": 2570 }, { "epoch": 0.27381206051566165, "eval_loss": 0.005131016951054335, "eval_runtime": 35.0481, "eval_samples_per_second": 4284.791, "eval_steps_per_second": 66.965, "step": 2570 }, { "epoch": 0.2748774770935436, "grad_norm": 0.025946978479623795, "learning_rate": 1.4502450458129129e-05, "loss": 0.0017, "step": 2580 }, { "epoch": 0.2748774770935436, "eval_loss": 0.011761846020817757, "eval_runtime": 35.0511, "eval_samples_per_second": 4284.43, "eval_steps_per_second": 66.959, "step": 2580 }, { "epoch": 0.27594289367142555, "grad_norm": 0.01493908278644085, "learning_rate": 1.448114212657149e-05, "loss": 0.0043, "step": 2590 }, { "epoch": 0.27594289367142555, "eval_loss": 0.006850136443972588, "eval_runtime": 35.0353, "eval_samples_per_second": 4286.368, "eval_steps_per_second": 66.99, "step": 2590 }, { "epoch": 0.2770083102493075, "grad_norm": 0.5396614670753479, "learning_rate": 1.445983379501385e-05, "loss": 0.0047, "step": 2600 }, { "epoch": 0.2770083102493075, "eval_loss": 0.004564850591123104, "eval_runtime": 35.0642, "eval_samples_per_second": 4282.829, "eval_steps_per_second": 66.934, "step": 2600 }, { "epoch": 0.27807372682718945, "grad_norm": 0.003329735714942217, "learning_rate": 1.4438525463456212e-05, "loss": 0.0052, "step": 2610 }, { "epoch": 0.27807372682718945, "eval_loss": 0.004351920913904905, "eval_runtime": 35.0437, "eval_samples_per_second": 4285.335, "eval_steps_per_second": 66.974, "step": 2610 }, { "epoch": 0.2791391434050714, "grad_norm": 0.007534320000559092, "learning_rate": 1.4417217131898573e-05, "loss": 0.0039, "step": 2620 }, { "epoch": 0.2791391434050714, "eval_loss": 0.004768616519868374, "eval_runtime": 35.078, "eval_samples_per_second": 4281.139, "eval_steps_per_second": 66.908, "step": 2620 }, { "epoch": 0.28020455998295335, "grad_norm": 0.0036227928940206766, "learning_rate": 1.4395908800340934e-05, "loss": 0.0197, "step": 2630 }, { "epoch": 0.28020455998295335, "eval_loss": 0.004250204190611839, "eval_runtime": 35.0223, "eval_samples_per_second": 4287.952, "eval_steps_per_second": 67.014, "step": 2630 }, { "epoch": 0.2812699765608353, "grad_norm": 0.057612668722867966, "learning_rate": 1.4374600468783295e-05, "loss": 0.0017, "step": 2640 }, { "epoch": 0.2812699765608353, "eval_loss": 0.004129552282392979, "eval_runtime": 35.0617, "eval_samples_per_second": 4283.136, "eval_steps_per_second": 66.939, "step": 2640 }, { "epoch": 0.28233539313871725, "grad_norm": 0.05967571586370468, "learning_rate": 1.4353292137225656e-05, "loss": 0.0034, "step": 2650 }, { "epoch": 0.28233539313871725, "eval_loss": 0.004171199630945921, "eval_runtime": 35.0514, "eval_samples_per_second": 4284.394, "eval_steps_per_second": 66.959, "step": 2650 }, { "epoch": 0.2834008097165992, "grad_norm": 0.027145517989993095, "learning_rate": 1.4331983805668017e-05, "loss": 0.0016, "step": 2660 }, { "epoch": 0.2834008097165992, "eval_loss": 0.004205272998660803, "eval_runtime": 35.0565, "eval_samples_per_second": 4283.776, "eval_steps_per_second": 66.949, "step": 2660 }, { "epoch": 0.28446622629448115, "grad_norm": 0.028178216889500618, "learning_rate": 1.4310675474110378e-05, "loss": 0.0008, "step": 2670 }, { "epoch": 0.28446622629448115, "eval_loss": 0.004260431043803692, "eval_runtime": 35.049, "eval_samples_per_second": 4284.69, "eval_steps_per_second": 66.963, "step": 2670 }, { "epoch": 0.2855316428723631, "grad_norm": 0.17948974668979645, "learning_rate": 1.4289367142552739e-05, "loss": 0.0183, "step": 2680 }, { "epoch": 0.2855316428723631, "eval_loss": 0.004343624692410231, "eval_runtime": 35.0373, "eval_samples_per_second": 4286.113, "eval_steps_per_second": 66.986, "step": 2680 }, { "epoch": 0.28659705945024505, "grad_norm": 0.06010470911860466, "learning_rate": 1.42680588109951e-05, "loss": 0.0017, "step": 2690 }, { "epoch": 0.28659705945024505, "eval_loss": 0.004411335103213787, "eval_runtime": 35.0836, "eval_samples_per_second": 4280.467, "eval_steps_per_second": 66.897, "step": 2690 }, { "epoch": 0.287662476028127, "grad_norm": 0.005281396675854921, "learning_rate": 1.424675047943746e-05, "loss": 0.0007, "step": 2700 }, { "epoch": 0.287662476028127, "eval_loss": 0.0045102485455572605, "eval_runtime": 35.0122, "eval_samples_per_second": 4289.185, "eval_steps_per_second": 67.034, "step": 2700 }, { "epoch": 0.28872789260600895, "grad_norm": 0.3176427185535431, "learning_rate": 1.4225442147879822e-05, "loss": 0.0024, "step": 2710 }, { "epoch": 0.28872789260600895, "eval_loss": 0.004357383586466312, "eval_runtime": 35.0509, "eval_samples_per_second": 4284.451, "eval_steps_per_second": 66.96, "step": 2710 }, { "epoch": 0.2897933091838909, "grad_norm": 0.3456588685512543, "learning_rate": 1.4204133816322182e-05, "loss": 0.0013, "step": 2720 }, { "epoch": 0.2897933091838909, "eval_loss": 0.004329455550760031, "eval_runtime": 35.0464, "eval_samples_per_second": 4285.011, "eval_steps_per_second": 66.968, "step": 2720 }, { "epoch": 0.29085872576177285, "grad_norm": 0.003269694047048688, "learning_rate": 1.4182825484764544e-05, "loss": 0.0006, "step": 2730 }, { "epoch": 0.29085872576177285, "eval_loss": 0.004743185359984636, "eval_runtime": 35.0411, "eval_samples_per_second": 4285.649, "eval_steps_per_second": 66.978, "step": 2730 }, { "epoch": 0.2919241423396548, "grad_norm": 0.0067397127859294415, "learning_rate": 1.4161517153206904e-05, "loss": 0.0035, "step": 2740 }, { "epoch": 0.2919241423396548, "eval_loss": 0.0047289966605603695, "eval_runtime": 35.0343, "eval_samples_per_second": 4286.487, "eval_steps_per_second": 66.992, "step": 2740 }, { "epoch": 0.29298955891753675, "grad_norm": 0.0032168785110116005, "learning_rate": 1.4140208821649266e-05, "loss": 0.0003, "step": 2750 }, { "epoch": 0.29298955891753675, "eval_loss": 0.004747165832668543, "eval_runtime": 35.0422, "eval_samples_per_second": 4285.518, "eval_steps_per_second": 66.976, "step": 2750 }, { "epoch": 0.2940549754954187, "grad_norm": 0.014863832853734493, "learning_rate": 1.4118900490091626e-05, "loss": 0.0012, "step": 2760 }, { "epoch": 0.2940549754954187, "eval_loss": 0.004687744192779064, "eval_runtime": 35.0762, "eval_samples_per_second": 4281.363, "eval_steps_per_second": 66.911, "step": 2760 }, { "epoch": 0.29512039207330065, "grad_norm": 0.011120929382741451, "learning_rate": 1.4097592158533988e-05, "loss": 0.0007, "step": 2770 }, { "epoch": 0.29512039207330065, "eval_loss": 0.0050058220513165, "eval_runtime": 35.0543, "eval_samples_per_second": 4284.039, "eval_steps_per_second": 66.953, "step": 2770 }, { "epoch": 0.2961858086511826, "grad_norm": 0.006813787389546633, "learning_rate": 1.4076283826976348e-05, "loss": 0.0033, "step": 2780 }, { "epoch": 0.2961858086511826, "eval_loss": 0.006108899600803852, "eval_runtime": 35.0497, "eval_samples_per_second": 4284.604, "eval_steps_per_second": 66.962, "step": 2780 }, { "epoch": 0.29725122522906455, "grad_norm": 0.004185474012047052, "learning_rate": 1.405497549541871e-05, "loss": 0.001, "step": 2790 }, { "epoch": 0.29725122522906455, "eval_loss": 0.006051088683307171, "eval_runtime": 35.0464, "eval_samples_per_second": 4285.005, "eval_steps_per_second": 66.968, "step": 2790 }, { "epoch": 0.2983166418069465, "grad_norm": 0.0032837213948369026, "learning_rate": 1.403366716386107e-05, "loss": 0.0027, "step": 2800 }, { "epoch": 0.2983166418069465, "eval_loss": 0.005179966799914837, "eval_runtime": 35.0247, "eval_samples_per_second": 4287.657, "eval_steps_per_second": 67.01, "step": 2800 }, { "epoch": 0.29938205838482845, "grad_norm": 0.018226496875286102, "learning_rate": 1.4012358832303432e-05, "loss": 0.0003, "step": 2810 }, { "epoch": 0.29938205838482845, "eval_loss": 0.00502545852214098, "eval_runtime": 35.0582, "eval_samples_per_second": 4283.567, "eval_steps_per_second": 66.946, "step": 2810 }, { "epoch": 0.3004474749627104, "grad_norm": 0.013967903330922127, "learning_rate": 1.3991050500745792e-05, "loss": 0.0109, "step": 2820 }, { "epoch": 0.3004474749627104, "eval_loss": 0.004795020446181297, "eval_runtime": 35.0562, "eval_samples_per_second": 4283.809, "eval_steps_per_second": 66.95, "step": 2820 }, { "epoch": 0.30151289154059235, "grad_norm": 0.00310189975425601, "learning_rate": 1.3969742169188154e-05, "loss": 0.0116, "step": 2830 }, { "epoch": 0.30151289154059235, "eval_loss": 0.005139603745192289, "eval_runtime": 35.0512, "eval_samples_per_second": 4284.424, "eval_steps_per_second": 66.959, "step": 2830 }, { "epoch": 0.3025783081184743, "grad_norm": 0.003773763542994857, "learning_rate": 1.3948433837630514e-05, "loss": 0.0115, "step": 2840 }, { "epoch": 0.3025783081184743, "eval_loss": 0.004601133055984974, "eval_runtime": 35.0881, "eval_samples_per_second": 4279.908, "eval_steps_per_second": 66.889, "step": 2840 }, { "epoch": 0.30364372469635625, "grad_norm": 0.00316253793425858, "learning_rate": 1.3927125506072876e-05, "loss": 0.0058, "step": 2850 }, { "epoch": 0.30364372469635625, "eval_loss": 0.004557873122394085, "eval_runtime": 35.0701, "eval_samples_per_second": 4282.113, "eval_steps_per_second": 66.923, "step": 2850 }, { "epoch": 0.3047091412742382, "grad_norm": 0.010574285872280598, "learning_rate": 1.3905817174515236e-05, "loss": 0.0058, "step": 2860 }, { "epoch": 0.3047091412742382, "eval_loss": 0.005399353802204132, "eval_runtime": 35.0344, "eval_samples_per_second": 4286.476, "eval_steps_per_second": 66.991, "step": 2860 }, { "epoch": 0.30577455785212015, "grad_norm": 0.019116273149847984, "learning_rate": 1.3884508842957598e-05, "loss": 0.0027, "step": 2870 }, { "epoch": 0.30577455785212015, "eval_loss": 0.004699068609625101, "eval_runtime": 35.0451, "eval_samples_per_second": 4285.167, "eval_steps_per_second": 66.971, "step": 2870 }, { "epoch": 0.3068399744300021, "grad_norm": 0.03568415716290474, "learning_rate": 1.3863200511399958e-05, "loss": 0.009, "step": 2880 }, { "epoch": 0.3068399744300021, "eval_loss": 0.0043353792279958725, "eval_runtime": 35.0693, "eval_samples_per_second": 4282.212, "eval_steps_per_second": 66.925, "step": 2880 }, { "epoch": 0.30790539100788406, "grad_norm": 2.934440851211548, "learning_rate": 1.384189217984232e-05, "loss": 0.004, "step": 2890 }, { "epoch": 0.30790539100788406, "eval_loss": 0.0042236242443323135, "eval_runtime": 35.0591, "eval_samples_per_second": 4283.447, "eval_steps_per_second": 66.944, "step": 2890 }, { "epoch": 0.30897080758576606, "grad_norm": 0.4145413637161255, "learning_rate": 1.382058384828468e-05, "loss": 0.0038, "step": 2900 }, { "epoch": 0.30897080758576606, "eval_loss": 0.004244114272296429, "eval_runtime": 35.0187, "eval_samples_per_second": 4288.391, "eval_steps_per_second": 67.021, "step": 2900 }, { "epoch": 0.310036224163648, "grad_norm": 0.0071656289510428905, "learning_rate": 1.3799275516727042e-05, "loss": 0.001, "step": 2910 }, { "epoch": 0.310036224163648, "eval_loss": 0.004367951303720474, "eval_runtime": 35.0279, "eval_samples_per_second": 4287.263, "eval_steps_per_second": 67.004, "step": 2910 }, { "epoch": 0.31110164074152996, "grad_norm": 1.2467246055603027, "learning_rate": 1.3777967185169402e-05, "loss": 0.0074, "step": 2920 }, { "epoch": 0.31110164074152996, "eval_loss": 0.003949224948883057, "eval_runtime": 35.0452, "eval_samples_per_second": 4285.157, "eval_steps_per_second": 66.971, "step": 2920 }, { "epoch": 0.3121670573194119, "grad_norm": 0.0029619967099279165, "learning_rate": 1.3756658853611764e-05, "loss": 0.0005, "step": 2930 }, { "epoch": 0.3121670573194119, "eval_loss": 0.004097965080291033, "eval_runtime": 35.0575, "eval_samples_per_second": 4283.649, "eval_steps_per_second": 66.947, "step": 2930 }, { "epoch": 0.31323247389729386, "grad_norm": 0.003615755122154951, "learning_rate": 1.3735350522054124e-05, "loss": 0.0098, "step": 2940 }, { "epoch": 0.31323247389729386, "eval_loss": 0.00408256845548749, "eval_runtime": 35.0611, "eval_samples_per_second": 4283.203, "eval_steps_per_second": 66.94, "step": 2940 }, { "epoch": 0.3142978904751758, "grad_norm": 0.0028381391894072294, "learning_rate": 1.3714042190496486e-05, "loss": 0.0006, "step": 2950 }, { "epoch": 0.3142978904751758, "eval_loss": 0.003918228670954704, "eval_runtime": 35.0452, "eval_samples_per_second": 4285.146, "eval_steps_per_second": 66.971, "step": 2950 }, { "epoch": 0.31536330705305776, "grad_norm": 0.06406796723604202, "learning_rate": 1.3692733858938846e-05, "loss": 0.0007, "step": 2960 }, { "epoch": 0.31536330705305776, "eval_loss": 0.003923286683857441, "eval_runtime": 35.0548, "eval_samples_per_second": 4283.984, "eval_steps_per_second": 66.952, "step": 2960 }, { "epoch": 0.3164287236309397, "grad_norm": 0.16647638380527496, "learning_rate": 1.3671425527381208e-05, "loss": 0.0009, "step": 2970 }, { "epoch": 0.3164287236309397, "eval_loss": 0.003978800494223833, "eval_runtime": 35.0109, "eval_samples_per_second": 4289.344, "eval_steps_per_second": 67.036, "step": 2970 }, { "epoch": 0.31749414020882166, "grad_norm": 1.183781623840332, "learning_rate": 1.3650117195823568e-05, "loss": 0.0041, "step": 2980 }, { "epoch": 0.31749414020882166, "eval_loss": 0.004037255886942148, "eval_runtime": 35.0257, "eval_samples_per_second": 4287.532, "eval_steps_per_second": 67.008, "step": 2980 }, { "epoch": 0.3185595567867036, "grad_norm": 0.002885080175474286, "learning_rate": 1.362880886426593e-05, "loss": 0.0102, "step": 2990 }, { "epoch": 0.3185595567867036, "eval_loss": 0.003956990782171488, "eval_runtime": 35.0222, "eval_samples_per_second": 4287.965, "eval_steps_per_second": 67.015, "step": 2990 }, { "epoch": 0.31962497336458556, "grad_norm": 0.13790345191955566, "learning_rate": 1.360750053270829e-05, "loss": 0.0014, "step": 3000 }, { "epoch": 0.31962497336458556, "eval_loss": 0.004563441965728998, "eval_runtime": 35.0586, "eval_samples_per_second": 4283.514, "eval_steps_per_second": 66.945, "step": 3000 }, { "epoch": 0.3206903899424675, "grad_norm": 0.23745372891426086, "learning_rate": 1.3586192201150652e-05, "loss": 0.0033, "step": 3010 }, { "epoch": 0.3206903899424675, "eval_loss": 0.004914409015327692, "eval_runtime": 35.0379, "eval_samples_per_second": 4286.05, "eval_steps_per_second": 66.985, "step": 3010 }, { "epoch": 0.32175580652034946, "grad_norm": 0.043057333678007126, "learning_rate": 1.3564883869593012e-05, "loss": 0.0017, "step": 3020 }, { "epoch": 0.32175580652034946, "eval_loss": 0.004360364284366369, "eval_runtime": 35.0253, "eval_samples_per_second": 4287.587, "eval_steps_per_second": 67.009, "step": 3020 }, { "epoch": 0.3228212230982314, "grad_norm": 0.037039484828710556, "learning_rate": 1.3543575538035374e-05, "loss": 0.0013, "step": 3030 }, { "epoch": 0.3228212230982314, "eval_loss": 0.004530046600848436, "eval_runtime": 35.0216, "eval_samples_per_second": 4288.035, "eval_steps_per_second": 67.016, "step": 3030 }, { "epoch": 0.32388663967611336, "grad_norm": 3.6495485305786133, "learning_rate": 1.3522267206477734e-05, "loss": 0.021, "step": 3040 }, { "epoch": 0.32388663967611336, "eval_loss": 0.006539896596223116, "eval_runtime": 35.0111, "eval_samples_per_second": 4289.326, "eval_steps_per_second": 67.036, "step": 3040 }, { "epoch": 0.3249520562539953, "grad_norm": 4.4830756187438965, "learning_rate": 1.3500958874920096e-05, "loss": 0.0152, "step": 3050 }, { "epoch": 0.3249520562539953, "eval_loss": 0.004404969979077578, "eval_runtime": 35.0765, "eval_samples_per_second": 4281.333, "eval_steps_per_second": 66.911, "step": 3050 }, { "epoch": 0.32601747283187726, "grad_norm": 0.06523173302412033, "learning_rate": 1.3479650543362456e-05, "loss": 0.0013, "step": 3060 }, { "epoch": 0.32601747283187726, "eval_loss": 0.004014772828668356, "eval_runtime": 35.1746, "eval_samples_per_second": 4269.389, "eval_steps_per_second": 66.724, "step": 3060 }, { "epoch": 0.3270828894097592, "grad_norm": 0.0030045281164348125, "learning_rate": 1.3458342211804816e-05, "loss": 0.001, "step": 3070 }, { "epoch": 0.3270828894097592, "eval_loss": 0.003926475998014212, "eval_runtime": 35.0278, "eval_samples_per_second": 4287.284, "eval_steps_per_second": 67.004, "step": 3070 }, { "epoch": 0.32814830598764116, "grad_norm": 0.19878143072128296, "learning_rate": 1.3437033880247176e-05, "loss": 0.0038, "step": 3080 }, { "epoch": 0.32814830598764116, "eval_loss": 0.0038352280389517546, "eval_runtime": 35.0379, "eval_samples_per_second": 4286.044, "eval_steps_per_second": 66.985, "step": 3080 }, { "epoch": 0.3292137225655231, "grad_norm": 0.010688831098377705, "learning_rate": 1.3415725548689538e-05, "loss": 0.0012, "step": 3090 }, { "epoch": 0.3292137225655231, "eval_loss": 0.0038455103058367968, "eval_runtime": 35.0172, "eval_samples_per_second": 4288.572, "eval_steps_per_second": 67.024, "step": 3090 }, { "epoch": 0.33027913914340506, "grad_norm": 2.0539698600769043, "learning_rate": 1.3394417217131898e-05, "loss": 0.0102, "step": 3100 }, { "epoch": 0.33027913914340506, "eval_loss": 0.0038040748331695795, "eval_runtime": 35.0546, "eval_samples_per_second": 4284.007, "eval_steps_per_second": 66.953, "step": 3100 }, { "epoch": 0.331344555721287, "grad_norm": 0.01796787604689598, "learning_rate": 1.337310888557426e-05, "loss": 0.0054, "step": 3110 }, { "epoch": 0.331344555721287, "eval_loss": 0.003991218749433756, "eval_runtime": 35.0314, "eval_samples_per_second": 4286.841, "eval_steps_per_second": 66.997, "step": 3110 }, { "epoch": 0.33240997229916897, "grad_norm": 0.40324315428733826, "learning_rate": 1.335180055401662e-05, "loss": 0.0131, "step": 3120 }, { "epoch": 0.33240997229916897, "eval_loss": 0.004118100740015507, "eval_runtime": 35.0329, "eval_samples_per_second": 4286.658, "eval_steps_per_second": 66.994, "step": 3120 }, { "epoch": 0.3334753888770509, "grad_norm": 0.14840951561927795, "learning_rate": 1.3330492222458982e-05, "loss": 0.0075, "step": 3130 }, { "epoch": 0.3334753888770509, "eval_loss": 0.003930113278329372, "eval_runtime": 35.0237, "eval_samples_per_second": 4287.785, "eval_steps_per_second": 67.012, "step": 3130 }, { "epoch": 0.33454080545493287, "grad_norm": 0.053780388087034225, "learning_rate": 1.3309183890901342e-05, "loss": 0.0009, "step": 3140 }, { "epoch": 0.33454080545493287, "eval_loss": 0.0038190835621207952, "eval_runtime": 34.9943, "eval_samples_per_second": 4291.383, "eval_steps_per_second": 67.068, "step": 3140 }, { "epoch": 0.3356062220328148, "grad_norm": 0.002771625993773341, "learning_rate": 1.3287875559343704e-05, "loss": 0.0037, "step": 3150 }, { "epoch": 0.3356062220328148, "eval_loss": 0.0038646007888019085, "eval_runtime": 35.023, "eval_samples_per_second": 4287.864, "eval_steps_per_second": 67.013, "step": 3150 }, { "epoch": 0.33667163861069677, "grad_norm": 0.003614110639318824, "learning_rate": 1.3266567227786064e-05, "loss": 0.0025, "step": 3160 }, { "epoch": 0.33667163861069677, "eval_loss": 0.0038535690400749445, "eval_runtime": 35.0472, "eval_samples_per_second": 4284.901, "eval_steps_per_second": 66.967, "step": 3160 }, { "epoch": 0.3377370551885787, "grad_norm": 0.05751369893550873, "learning_rate": 1.3245258896228426e-05, "loss": 0.0078, "step": 3170 }, { "epoch": 0.3377370551885787, "eval_loss": 0.0038923989050090313, "eval_runtime": 35.0433, "eval_samples_per_second": 4285.383, "eval_steps_per_second": 66.974, "step": 3170 }, { "epoch": 0.33880247176646067, "grad_norm": 0.05214925855398178, "learning_rate": 1.3223950564670786e-05, "loss": 0.0004, "step": 3180 }, { "epoch": 0.33880247176646067, "eval_loss": 0.003956311382353306, "eval_runtime": 35.0271, "eval_samples_per_second": 4287.364, "eval_steps_per_second": 67.005, "step": 3180 }, { "epoch": 0.3398678883443426, "grad_norm": 0.1537170559167862, "learning_rate": 1.3202642233113148e-05, "loss": 0.004, "step": 3190 }, { "epoch": 0.3398678883443426, "eval_loss": 0.003928401041775942, "eval_runtime": 35.0635, "eval_samples_per_second": 4282.914, "eval_steps_per_second": 66.936, "step": 3190 }, { "epoch": 0.34093330492222457, "grad_norm": 0.12932927906513214, "learning_rate": 1.3181333901555508e-05, "loss": 0.0033, "step": 3200 }, { "epoch": 0.34093330492222457, "eval_loss": 0.0038969647139310837, "eval_runtime": 35.079, "eval_samples_per_second": 4281.024, "eval_steps_per_second": 66.906, "step": 3200 }, { "epoch": 0.3419987215001065, "grad_norm": 0.5066677927970886, "learning_rate": 1.316002556999787e-05, "loss": 0.0059, "step": 3210 }, { "epoch": 0.3419987215001065, "eval_loss": 0.0044798399321734905, "eval_runtime": 34.9897, "eval_samples_per_second": 4291.945, "eval_steps_per_second": 67.077, "step": 3210 }, { "epoch": 0.34306413807798847, "grad_norm": 0.7912442088127136, "learning_rate": 1.313871723844023e-05, "loss": 0.0104, "step": 3220 }, { "epoch": 0.34306413807798847, "eval_loss": 0.004379452206194401, "eval_runtime": 35.0419, "eval_samples_per_second": 4285.558, "eval_steps_per_second": 66.977, "step": 3220 }, { "epoch": 0.3441295546558704, "grad_norm": 0.0026291459798812866, "learning_rate": 1.3117408906882592e-05, "loss": 0.0149, "step": 3230 }, { "epoch": 0.3441295546558704, "eval_loss": 0.0038042503874748945, "eval_runtime": 35.032, "eval_samples_per_second": 4286.769, "eval_steps_per_second": 66.996, "step": 3230 }, { "epoch": 0.3451949712337524, "grad_norm": 0.012699414044618607, "learning_rate": 1.3096100575324952e-05, "loss": 0.0003, "step": 3240 }, { "epoch": 0.3451949712337524, "eval_loss": 0.0037444639019668102, "eval_runtime": 34.9976, "eval_samples_per_second": 4290.975, "eval_steps_per_second": 67.062, "step": 3240 }, { "epoch": 0.3462603878116344, "grad_norm": 0.06426554918289185, "learning_rate": 1.3074792243767314e-05, "loss": 0.0038, "step": 3250 }, { "epoch": 0.3462603878116344, "eval_loss": 0.0037567310500890017, "eval_runtime": 35.0155, "eval_samples_per_second": 4288.782, "eval_steps_per_second": 67.027, "step": 3250 }, { "epoch": 0.3473258043895163, "grad_norm": 0.002508602337911725, "learning_rate": 1.3053483912209674e-05, "loss": 0.0071, "step": 3260 }, { "epoch": 0.3473258043895163, "eval_loss": 0.004010562784969807, "eval_runtime": 35.0264, "eval_samples_per_second": 4287.447, "eval_steps_per_second": 67.007, "step": 3260 }, { "epoch": 0.3483912209673983, "grad_norm": 0.11929357796907425, "learning_rate": 1.3032175580652036e-05, "loss": 0.0016, "step": 3270 }, { "epoch": 0.3483912209673983, "eval_loss": 0.004778635688126087, "eval_runtime": 35.0281, "eval_samples_per_second": 4287.247, "eval_steps_per_second": 67.003, "step": 3270 }, { "epoch": 0.3494566375452802, "grad_norm": 2.577575922012329, "learning_rate": 1.3010867249094396e-05, "loss": 0.0043, "step": 3280 }, { "epoch": 0.3494566375452802, "eval_loss": 0.004186397884041071, "eval_runtime": 35.0663, "eval_samples_per_second": 4282.569, "eval_steps_per_second": 66.93, "step": 3280 }, { "epoch": 0.3505220541231622, "grad_norm": 0.003140628570690751, "learning_rate": 1.2989558917536758e-05, "loss": 0.0002, "step": 3290 }, { "epoch": 0.3505220541231622, "eval_loss": 0.004019039683043957, "eval_runtime": 35.0117, "eval_samples_per_second": 4289.253, "eval_steps_per_second": 67.035, "step": 3290 }, { "epoch": 0.3515874707010441, "grad_norm": 3.4825243949890137, "learning_rate": 1.2968250585979118e-05, "loss": 0.0222, "step": 3300 }, { "epoch": 0.3515874707010441, "eval_loss": 0.003909524530172348, "eval_runtime": 35.026, "eval_samples_per_second": 4287.501, "eval_steps_per_second": 67.007, "step": 3300 }, { "epoch": 0.3526528872789261, "grad_norm": 0.0026517182122915983, "learning_rate": 1.294694225442148e-05, "loss": 0.0015, "step": 3310 }, { "epoch": 0.3526528872789261, "eval_loss": 0.003752995515242219, "eval_runtime": 35.0277, "eval_samples_per_second": 4287.298, "eval_steps_per_second": 67.004, "step": 3310 }, { "epoch": 0.353718303856808, "grad_norm": 0.011982251890003681, "learning_rate": 1.292563392286384e-05, "loss": 0.0014, "step": 3320 }, { "epoch": 0.353718303856808, "eval_loss": 0.003981877584010363, "eval_runtime": 35.0094, "eval_samples_per_second": 4289.53, "eval_steps_per_second": 67.039, "step": 3320 }, { "epoch": 0.35478372043469, "grad_norm": 0.009307941421866417, "learning_rate": 1.2904325591306202e-05, "loss": 0.0018, "step": 3330 }, { "epoch": 0.35478372043469, "eval_loss": 0.004219081252813339, "eval_runtime": 35.037, "eval_samples_per_second": 4286.157, "eval_steps_per_second": 66.986, "step": 3330 }, { "epoch": 0.3558491370125719, "grad_norm": 0.003159622196108103, "learning_rate": 1.2883017259748562e-05, "loss": 0.0006, "step": 3340 }, { "epoch": 0.3558491370125719, "eval_loss": 0.004214595537632704, "eval_runtime": 35.0219, "eval_samples_per_second": 4288.002, "eval_steps_per_second": 67.015, "step": 3340 }, { "epoch": 0.3569145535904539, "grad_norm": 0.008063999935984612, "learning_rate": 1.2861708928190924e-05, "loss": 0.0009, "step": 3350 }, { "epoch": 0.3569145535904539, "eval_loss": 0.0040070428512990475, "eval_runtime": 35.0574, "eval_samples_per_second": 4283.666, "eval_steps_per_second": 66.947, "step": 3350 }, { "epoch": 0.3579799701683358, "grad_norm": 0.005213271360844374, "learning_rate": 1.2840400596633284e-05, "loss": 0.0003, "step": 3360 }, { "epoch": 0.3579799701683358, "eval_loss": 0.00408785417675972, "eval_runtime": 35.0643, "eval_samples_per_second": 4282.822, "eval_steps_per_second": 66.934, "step": 3360 }, { "epoch": 0.3590453867462178, "grad_norm": 0.09602358192205429, "learning_rate": 1.2819092265075646e-05, "loss": 0.0004, "step": 3370 }, { "epoch": 0.3590453867462178, "eval_loss": 0.004157669842243195, "eval_runtime": 35.041, "eval_samples_per_second": 4285.661, "eval_steps_per_second": 66.979, "step": 3370 }, { "epoch": 0.3601108033240997, "grad_norm": 0.00831978116184473, "learning_rate": 1.2797783933518006e-05, "loss": 0.0002, "step": 3380 }, { "epoch": 0.3601108033240997, "eval_loss": 0.0041765193454921246, "eval_runtime": 35.0633, "eval_samples_per_second": 4282.934, "eval_steps_per_second": 66.936, "step": 3380 }, { "epoch": 0.3611762199019817, "grad_norm": 0.011790602467954159, "learning_rate": 1.2776475601960368e-05, "loss": 0.0003, "step": 3390 }, { "epoch": 0.3611762199019817, "eval_loss": 0.004201879724860191, "eval_runtime": 35.0369, "eval_samples_per_second": 4286.161, "eval_steps_per_second": 66.986, "step": 3390 }, { "epoch": 0.3622416364798636, "grad_norm": 0.009733389131724834, "learning_rate": 1.2755167270402728e-05, "loss": 0.0018, "step": 3400 }, { "epoch": 0.3622416364798636, "eval_loss": 0.00434511061757803, "eval_runtime": 35.0659, "eval_samples_per_second": 4282.623, "eval_steps_per_second": 66.931, "step": 3400 }, { "epoch": 0.3633070530577456, "grad_norm": 0.02603778801858425, "learning_rate": 1.273385893884509e-05, "loss": 0.0006, "step": 3410 }, { "epoch": 0.3633070530577456, "eval_loss": 0.004456702154129744, "eval_runtime": 35.0386, "eval_samples_per_second": 4285.953, "eval_steps_per_second": 66.983, "step": 3410 }, { "epoch": 0.3643724696356275, "grad_norm": 0.0025256723165512085, "learning_rate": 1.271255060728745e-05, "loss": 0.0003, "step": 3420 }, { "epoch": 0.3643724696356275, "eval_loss": 0.004565515089780092, "eval_runtime": 35.0713, "eval_samples_per_second": 4281.965, "eval_steps_per_second": 66.921, "step": 3420 }, { "epoch": 0.3654378862135095, "grad_norm": 0.0844619870185852, "learning_rate": 1.2691242275729812e-05, "loss": 0.0005, "step": 3430 }, { "epoch": 0.3654378862135095, "eval_loss": 0.004752400331199169, "eval_runtime": 35.0162, "eval_samples_per_second": 4288.697, "eval_steps_per_second": 67.026, "step": 3430 }, { "epoch": 0.3665033027913914, "grad_norm": 0.0021026332397013903, "learning_rate": 1.2669933944172172e-05, "loss": 0.008, "step": 3440 }, { "epoch": 0.3665033027913914, "eval_loss": 0.005648768972605467, "eval_runtime": 35.0423, "eval_samples_per_second": 4285.511, "eval_steps_per_second": 66.976, "step": 3440 }, { "epoch": 0.3675687193692734, "grad_norm": 0.0023316419683396816, "learning_rate": 1.2648625612614534e-05, "loss": 0.0228, "step": 3450 }, { "epoch": 0.3675687193692734, "eval_loss": 0.004643257707357407, "eval_runtime": 35.0494, "eval_samples_per_second": 4284.635, "eval_steps_per_second": 66.963, "step": 3450 }, { "epoch": 0.36863413594715533, "grad_norm": 0.07622463256120682, "learning_rate": 1.2627317281056894e-05, "loss": 0.0007, "step": 3460 }, { "epoch": 0.36863413594715533, "eval_loss": 0.0040694731287658215, "eval_runtime": 35.0492, "eval_samples_per_second": 4284.666, "eval_steps_per_second": 66.963, "step": 3460 }, { "epoch": 0.3696995525250373, "grad_norm": 0.047097232192754745, "learning_rate": 1.2606008949499256e-05, "loss": 0.0038, "step": 3470 }, { "epoch": 0.3696995525250373, "eval_loss": 0.004037928301841021, "eval_runtime": 35.0666, "eval_samples_per_second": 4282.532, "eval_steps_per_second": 66.93, "step": 3470 }, { "epoch": 0.37076496910291923, "grad_norm": 0.0033278772607445717, "learning_rate": 1.2584700617941616e-05, "loss": 0.0003, "step": 3480 }, { "epoch": 0.37076496910291923, "eval_loss": 0.003961279056966305, "eval_runtime": 35.017, "eval_samples_per_second": 4288.604, "eval_steps_per_second": 67.025, "step": 3480 }, { "epoch": 0.3718303856808012, "grad_norm": 0.0027841716073453426, "learning_rate": 1.2563392286383978e-05, "loss": 0.0034, "step": 3490 }, { "epoch": 0.3718303856808012, "eval_loss": 0.0038393058348447084, "eval_runtime": 35.0566, "eval_samples_per_second": 4283.76, "eval_steps_per_second": 66.949, "step": 3490 }, { "epoch": 0.37289580225868313, "grad_norm": 0.01204370055347681, "learning_rate": 1.2542083954826338e-05, "loss": 0.0003, "step": 3500 }, { "epoch": 0.37289580225868313, "eval_loss": 0.0038628741167485714, "eval_runtime": 35.0746, "eval_samples_per_second": 4281.557, "eval_steps_per_second": 66.914, "step": 3500 }, { "epoch": 0.3739612188365651, "grad_norm": 0.0020387719850987196, "learning_rate": 1.25207756232687e-05, "loss": 0.0005, "step": 3510 }, { "epoch": 0.3739612188365651, "eval_loss": 0.0038964590057730675, "eval_runtime": 35.0528, "eval_samples_per_second": 4284.217, "eval_steps_per_second": 66.956, "step": 3510 }, { "epoch": 0.37502663541444703, "grad_norm": 0.0024983766488730907, "learning_rate": 1.249946729171106e-05, "loss": 0.0002, "step": 3520 }, { "epoch": 0.37502663541444703, "eval_loss": 0.003918655216693878, "eval_runtime": 35.0622, "eval_samples_per_second": 4283.07, "eval_steps_per_second": 66.938, "step": 3520 }, { "epoch": 0.376092051992329, "grad_norm": 0.008725779131054878, "learning_rate": 1.2478158960153422e-05, "loss": 0.0002, "step": 3530 }, { "epoch": 0.376092051992329, "eval_loss": 0.003952388651669025, "eval_runtime": 35.0543, "eval_samples_per_second": 4284.036, "eval_steps_per_second": 66.953, "step": 3530 }, { "epoch": 0.37715746857021093, "grad_norm": 0.02480531670153141, "learning_rate": 1.2456850628595782e-05, "loss": 0.0009, "step": 3540 }, { "epoch": 0.37715746857021093, "eval_loss": 0.004151148721575737, "eval_runtime": 35.0767, "eval_samples_per_second": 4281.306, "eval_steps_per_second": 66.911, "step": 3540 }, { "epoch": 0.3782228851480929, "grad_norm": 0.0019680445548146963, "learning_rate": 1.2435542297038144e-05, "loss": 0.0005, "step": 3550 }, { "epoch": 0.3782228851480929, "eval_loss": 0.004354014992713928, "eval_runtime": 35.0834, "eval_samples_per_second": 4280.481, "eval_steps_per_second": 66.898, "step": 3550 }, { "epoch": 0.37928830172597483, "grad_norm": 0.002003498375415802, "learning_rate": 1.2414233965480504e-05, "loss": 0.0101, "step": 3560 }, { "epoch": 0.37928830172597483, "eval_loss": 0.003934256266802549, "eval_runtime": 35.06, "eval_samples_per_second": 4283.344, "eval_steps_per_second": 66.942, "step": 3560 }, { "epoch": 0.3803537183038568, "grad_norm": 0.12127909809350967, "learning_rate": 1.2392925633922866e-05, "loss": 0.0007, "step": 3570 }, { "epoch": 0.3803537183038568, "eval_loss": 0.0039041999261826277, "eval_runtime": 35.0747, "eval_samples_per_second": 4281.545, "eval_steps_per_second": 66.914, "step": 3570 }, { "epoch": 0.3814191348817388, "grad_norm": 0.0020610857754945755, "learning_rate": 1.2371617302365226e-05, "loss": 0.011, "step": 3580 }, { "epoch": 0.3814191348817388, "eval_loss": 0.0038424658123403788, "eval_runtime": 35.1049, "eval_samples_per_second": 4277.859, "eval_steps_per_second": 66.857, "step": 3580 }, { "epoch": 0.38248455145962074, "grad_norm": 0.002351719420403242, "learning_rate": 1.2350308970807588e-05, "loss": 0.0021, "step": 3590 }, { "epoch": 0.38248455145962074, "eval_loss": 0.003790972288697958, "eval_runtime": 35.104, "eval_samples_per_second": 4277.973, "eval_steps_per_second": 66.858, "step": 3590 }, { "epoch": 0.3835499680375027, "grad_norm": 0.002572182798758149, "learning_rate": 1.2329000639249948e-05, "loss": 0.0002, "step": 3600 }, { "epoch": 0.3835499680375027, "eval_loss": 0.00375328934751451, "eval_runtime": 35.0611, "eval_samples_per_second": 4283.213, "eval_steps_per_second": 66.94, "step": 3600 }, { "epoch": 0.38461538461538464, "grad_norm": 0.11054070293903351, "learning_rate": 1.230769230769231e-05, "loss": 0.0045, "step": 3610 }, { "epoch": 0.38461538461538464, "eval_loss": 0.00388448778539896, "eval_runtime": 35.06, "eval_samples_per_second": 4283.349, "eval_steps_per_second": 66.942, "step": 3610 }, { "epoch": 0.3856808011932666, "grad_norm": 0.0022837959695607424, "learning_rate": 1.228638397613467e-05, "loss": 0.0001, "step": 3620 }, { "epoch": 0.3856808011932666, "eval_loss": 0.004068476613610983, "eval_runtime": 35.0813, "eval_samples_per_second": 4280.748, "eval_steps_per_second": 66.902, "step": 3620 }, { "epoch": 0.38674621777114854, "grad_norm": 0.15774469077587128, "learning_rate": 1.2265075644577032e-05, "loss": 0.0141, "step": 3630 }, { "epoch": 0.38674621777114854, "eval_loss": 0.0036877018865197897, "eval_runtime": 35.0629, "eval_samples_per_second": 4282.984, "eval_steps_per_second": 66.937, "step": 3630 }, { "epoch": 0.3878116343490305, "grad_norm": 0.0019772418309003115, "learning_rate": 1.2243767313019392e-05, "loss": 0.0084, "step": 3640 }, { "epoch": 0.3878116343490305, "eval_loss": 0.0037544872611761093, "eval_runtime": 35.0694, "eval_samples_per_second": 4282.201, "eval_steps_per_second": 66.925, "step": 3640 }, { "epoch": 0.38887705092691244, "grad_norm": 0.2123226374387741, "learning_rate": 1.2222458981461754e-05, "loss": 0.0056, "step": 3650 }, { "epoch": 0.38887705092691244, "eval_loss": 0.0039087338373064995, "eval_runtime": 35.0526, "eval_samples_per_second": 4284.253, "eval_steps_per_second": 66.957, "step": 3650 }, { "epoch": 0.3899424675047944, "grad_norm": 0.0640728697180748, "learning_rate": 1.2201150649904114e-05, "loss": 0.0004, "step": 3660 }, { "epoch": 0.3899424675047944, "eval_loss": 0.004001296125352383, "eval_runtime": 35.07, "eval_samples_per_second": 4282.117, "eval_steps_per_second": 66.923, "step": 3660 }, { "epoch": 0.39100788408267634, "grad_norm": 0.0062736626714468, "learning_rate": 1.2179842318346476e-05, "loss": 0.0097, "step": 3670 }, { "epoch": 0.39100788408267634, "eval_loss": 0.0038057903293520212, "eval_runtime": 35.1173, "eval_samples_per_second": 4276.355, "eval_steps_per_second": 66.833, "step": 3670 }, { "epoch": 0.3920733006605583, "grad_norm": 0.00930896308273077, "learning_rate": 1.2158533986788836e-05, "loss": 0.0023, "step": 3680 }, { "epoch": 0.3920733006605583, "eval_loss": 0.0034806670155376196, "eval_runtime": 35.0473, "eval_samples_per_second": 4284.891, "eval_steps_per_second": 66.967, "step": 3680 }, { "epoch": 0.39313871723844024, "grad_norm": 0.002325017936527729, "learning_rate": 1.2137225655231198e-05, "loss": 0.0025, "step": 3690 }, { "epoch": 0.39313871723844024, "eval_loss": 0.003411883721128106, "eval_runtime": 35.0649, "eval_samples_per_second": 4282.74, "eval_steps_per_second": 66.933, "step": 3690 }, { "epoch": 0.3942041338163222, "grad_norm": 0.005867179948836565, "learning_rate": 1.2115917323673558e-05, "loss": 0.0032, "step": 3700 }, { "epoch": 0.3942041338163222, "eval_loss": 0.0034860384184867144, "eval_runtime": 35.0541, "eval_samples_per_second": 4284.061, "eval_steps_per_second": 66.954, "step": 3700 }, { "epoch": 0.39526955039420414, "grad_norm": 0.00318440911360085, "learning_rate": 1.209460899211592e-05, "loss": 0.0023, "step": 3710 }, { "epoch": 0.39526955039420414, "eval_loss": 0.003476213663816452, "eval_runtime": 35.0529, "eval_samples_per_second": 4284.216, "eval_steps_per_second": 66.956, "step": 3710 }, { "epoch": 0.3963349669720861, "grad_norm": 0.04958747327327728, "learning_rate": 1.207330066055828e-05, "loss": 0.0015, "step": 3720 }, { "epoch": 0.3963349669720861, "eval_loss": 0.003567621810361743, "eval_runtime": 35.0515, "eval_samples_per_second": 4284.38, "eval_steps_per_second": 66.959, "step": 3720 }, { "epoch": 0.39740038354996804, "grad_norm": 0.0030473291408270597, "learning_rate": 1.2051992329000642e-05, "loss": 0.0002, "step": 3730 }, { "epoch": 0.39740038354996804, "eval_loss": 0.0036918912082910538, "eval_runtime": 35.0549, "eval_samples_per_second": 4283.967, "eval_steps_per_second": 66.952, "step": 3730 }, { "epoch": 0.39846580012785, "grad_norm": 0.028261132538318634, "learning_rate": 1.2030683997443002e-05, "loss": 0.0038, "step": 3740 }, { "epoch": 0.39846580012785, "eval_loss": 0.0036734293680638075, "eval_runtime": 35.05, "eval_samples_per_second": 4284.562, "eval_steps_per_second": 66.961, "step": 3740 }, { "epoch": 0.39953121670573194, "grad_norm": 0.0021167814265936613, "learning_rate": 1.2009375665885362e-05, "loss": 0.0101, "step": 3750 }, { "epoch": 0.39953121670573194, "eval_loss": 0.0038234253879636526, "eval_runtime": 35.0671, "eval_samples_per_second": 4282.476, "eval_steps_per_second": 66.929, "step": 3750 }, { "epoch": 0.4005966332836139, "grad_norm": 0.00868785660713911, "learning_rate": 1.1988067334327722e-05, "loss": 0.0021, "step": 3760 }, { "epoch": 0.4005966332836139, "eval_loss": 0.0038062850944697857, "eval_runtime": 35.0761, "eval_samples_per_second": 4281.374, "eval_steps_per_second": 66.912, "step": 3760 }, { "epoch": 0.40166204986149584, "grad_norm": 0.010587544180452824, "learning_rate": 1.1966759002770084e-05, "loss": 0.0008, "step": 3770 }, { "epoch": 0.40166204986149584, "eval_loss": 0.0039014420472085476, "eval_runtime": 35.0702, "eval_samples_per_second": 4282.097, "eval_steps_per_second": 66.923, "step": 3770 }, { "epoch": 0.4027274664393778, "grad_norm": 0.001994876191020012, "learning_rate": 1.1945450671212444e-05, "loss": 0.0044, "step": 3780 }, { "epoch": 0.4027274664393778, "eval_loss": 0.0038195240776985884, "eval_runtime": 35.0371, "eval_samples_per_second": 4286.142, "eval_steps_per_second": 66.986, "step": 3780 }, { "epoch": 0.40379288301725974, "grad_norm": 1.4522329568862915, "learning_rate": 1.1924142339654806e-05, "loss": 0.0052, "step": 3790 }, { "epoch": 0.40379288301725974, "eval_loss": 0.003863760968670249, "eval_runtime": 35.0456, "eval_samples_per_second": 4285.108, "eval_steps_per_second": 66.97, "step": 3790 }, { "epoch": 0.4048582995951417, "grad_norm": 0.22452107071876526, "learning_rate": 1.1902834008097166e-05, "loss": 0.0019, "step": 3800 }, { "epoch": 0.4048582995951417, "eval_loss": 0.003808986861258745, "eval_runtime": 35.0644, "eval_samples_per_second": 4282.803, "eval_steps_per_second": 66.934, "step": 3800 }, { "epoch": 0.40592371617302364, "grad_norm": 0.0063810450956225395, "learning_rate": 1.1881525676539528e-05, "loss": 0.0045, "step": 3810 }, { "epoch": 0.40592371617302364, "eval_loss": 0.003760164137929678, "eval_runtime": 35.0902, "eval_samples_per_second": 4279.652, "eval_steps_per_second": 66.885, "step": 3810 }, { "epoch": 0.4069891327509056, "grad_norm": 0.004251557867974043, "learning_rate": 1.1860217344981888e-05, "loss": 0.003, "step": 3820 }, { "epoch": 0.4069891327509056, "eval_loss": 0.003843538695946336, "eval_runtime": 35.04, "eval_samples_per_second": 4285.79, "eval_steps_per_second": 66.981, "step": 3820 }, { "epoch": 0.40805454932878754, "grad_norm": 0.03963892534375191, "learning_rate": 1.183890901342425e-05, "loss": 0.001, "step": 3830 }, { "epoch": 0.40805454932878754, "eval_loss": 0.0038712327368557453, "eval_runtime": 35.0958, "eval_samples_per_second": 4278.975, "eval_steps_per_second": 66.874, "step": 3830 }, { "epoch": 0.4091199659066695, "grad_norm": 0.4857088625431061, "learning_rate": 1.181760068186661e-05, "loss": 0.0014, "step": 3840 }, { "epoch": 0.4091199659066695, "eval_loss": 0.003814863506704569, "eval_runtime": 35.1055, "eval_samples_per_second": 4277.786, "eval_steps_per_second": 66.856, "step": 3840 }, { "epoch": 0.41018538248455144, "grad_norm": 1.0623544454574585, "learning_rate": 1.1796292350308972e-05, "loss": 0.0041, "step": 3850 }, { "epoch": 0.41018538248455144, "eval_loss": 0.00388675881549716, "eval_runtime": 35.0714, "eval_samples_per_second": 4281.952, "eval_steps_per_second": 66.921, "step": 3850 }, { "epoch": 0.4112507990624334, "grad_norm": 0.25712525844573975, "learning_rate": 1.1774984018751332e-05, "loss": 0.0095, "step": 3860 }, { "epoch": 0.4112507990624334, "eval_loss": 0.0036819500382989645, "eval_runtime": 35.0976, "eval_samples_per_second": 4278.758, "eval_steps_per_second": 66.871, "step": 3860 }, { "epoch": 0.41231621564031534, "grad_norm": 0.1655515432357788, "learning_rate": 1.1753675687193694e-05, "loss": 0.0005, "step": 3870 }, { "epoch": 0.41231621564031534, "eval_loss": 0.003723361063748598, "eval_runtime": 35.055, "eval_samples_per_second": 4283.959, "eval_steps_per_second": 66.952, "step": 3870 }, { "epoch": 0.4133816322181973, "grad_norm": 0.0033157425932586193, "learning_rate": 1.1732367355636054e-05, "loss": 0.0011, "step": 3880 }, { "epoch": 0.4133816322181973, "eval_loss": 0.003979133442044258, "eval_runtime": 35.0914, "eval_samples_per_second": 4279.505, "eval_steps_per_second": 66.882, "step": 3880 }, { "epoch": 0.41444704879607924, "grad_norm": 0.0025206347927451134, "learning_rate": 1.1711059024078416e-05, "loss": 0.0009, "step": 3890 }, { "epoch": 0.41444704879607924, "eval_loss": 0.0041635469533503056, "eval_runtime": 35.0535, "eval_samples_per_second": 4284.141, "eval_steps_per_second": 66.955, "step": 3890 }, { "epoch": 0.4155124653739612, "grad_norm": 0.034843962639570236, "learning_rate": 1.1689750692520776e-05, "loss": 0.0003, "step": 3900 }, { "epoch": 0.4155124653739612, "eval_loss": 0.00432234350591898, "eval_runtime": 35.0686, "eval_samples_per_second": 4282.288, "eval_steps_per_second": 66.926, "step": 3900 }, { "epoch": 0.41657788195184314, "grad_norm": 0.009492074139416218, "learning_rate": 1.1668442360963138e-05, "loss": 0.0048, "step": 3910 }, { "epoch": 0.41657788195184314, "eval_loss": 0.004398560617119074, "eval_runtime": 35.073, "eval_samples_per_second": 4281.752, "eval_steps_per_second": 66.918, "step": 3910 }, { "epoch": 0.41764329852972515, "grad_norm": 0.0028107059188187122, "learning_rate": 1.1647134029405498e-05, "loss": 0.0003, "step": 3920 }, { "epoch": 0.41764329852972515, "eval_loss": 0.004599866457283497, "eval_runtime": 35.0365, "eval_samples_per_second": 4286.217, "eval_steps_per_second": 66.987, "step": 3920 }, { "epoch": 0.4187087151076071, "grad_norm": 0.024568969383835793, "learning_rate": 1.162582569784786e-05, "loss": 0.0005, "step": 3930 }, { "epoch": 0.4187087151076071, "eval_loss": 0.004718529060482979, "eval_runtime": 35.0643, "eval_samples_per_second": 4282.822, "eval_steps_per_second": 66.934, "step": 3930 }, { "epoch": 0.41977413168548905, "grad_norm": 0.0036480259150266647, "learning_rate": 1.160451736629022e-05, "loss": 0.0018, "step": 3940 }, { "epoch": 0.41977413168548905, "eval_loss": 0.004873115103691816, "eval_runtime": 35.1052, "eval_samples_per_second": 4277.824, "eval_steps_per_second": 66.856, "step": 3940 }, { "epoch": 0.420839548263371, "grad_norm": 0.017611248418688774, "learning_rate": 1.1583209034732582e-05, "loss": 0.0002, "step": 3950 }, { "epoch": 0.420839548263371, "eval_loss": 0.005019678734242916, "eval_runtime": 35.0986, "eval_samples_per_second": 4278.636, "eval_steps_per_second": 66.869, "step": 3950 }, { "epoch": 0.42190496484125295, "grad_norm": 0.0019490675767883658, "learning_rate": 1.1561900703174942e-05, "loss": 0.001, "step": 3960 }, { "epoch": 0.42190496484125295, "eval_loss": 0.005005026701837778, "eval_runtime": 35.1168, "eval_samples_per_second": 4276.412, "eval_steps_per_second": 66.834, "step": 3960 }, { "epoch": 0.4229703814191349, "grad_norm": 0.0025157982017844915, "learning_rate": 1.1540592371617304e-05, "loss": 0.0002, "step": 3970 }, { "epoch": 0.4229703814191349, "eval_loss": 0.0050827213563025, "eval_runtime": 35.0907, "eval_samples_per_second": 4279.602, "eval_steps_per_second": 66.884, "step": 3970 }, { "epoch": 0.42403579799701685, "grad_norm": 0.002603873610496521, "learning_rate": 1.1519284040059664e-05, "loss": 0.0001, "step": 3980 }, { "epoch": 0.42403579799701685, "eval_loss": 0.0051656016148626804, "eval_runtime": 35.0682, "eval_samples_per_second": 4282.34, "eval_steps_per_second": 66.927, "step": 3980 }, { "epoch": 0.4251012145748988, "grad_norm": 0.0055221510119736195, "learning_rate": 1.1497975708502026e-05, "loss": 0.0005, "step": 3990 }, { "epoch": 0.4251012145748988, "eval_loss": 0.005205425899475813, "eval_runtime": 35.0712, "eval_samples_per_second": 4281.979, "eval_steps_per_second": 66.921, "step": 3990 }, { "epoch": 0.42616663115278075, "grad_norm": 0.0023050708696246147, "learning_rate": 1.1476667376944386e-05, "loss": 0.0001, "step": 4000 }, { "epoch": 0.42616663115278075, "eval_loss": 0.005290038418024778, "eval_runtime": 35.045, "eval_samples_per_second": 4285.177, "eval_steps_per_second": 66.971, "step": 4000 }, { "epoch": 0.4272320477306627, "grad_norm": 0.11549913138151169, "learning_rate": 1.1455359045386748e-05, "loss": 0.0003, "step": 4010 }, { "epoch": 0.4272320477306627, "eval_loss": 0.00536829000338912, "eval_runtime": 35.094, "eval_samples_per_second": 4279.197, "eval_steps_per_second": 66.878, "step": 4010 }, { "epoch": 0.42829746430854465, "grad_norm": 0.017903966829180717, "learning_rate": 1.1434050713829108e-05, "loss": 0.0001, "step": 4020 }, { "epoch": 0.42829746430854465, "eval_loss": 0.005488130263984203, "eval_runtime": 35.0917, "eval_samples_per_second": 4279.475, "eval_steps_per_second": 66.882, "step": 4020 }, { "epoch": 0.4293628808864266, "grad_norm": 0.0017556482926011086, "learning_rate": 1.141274238227147e-05, "loss": 0.017, "step": 4030 }, { "epoch": 0.4293628808864266, "eval_loss": 0.005172598175704479, "eval_runtime": 35.0759, "eval_samples_per_second": 4281.396, "eval_steps_per_second": 66.912, "step": 4030 }, { "epoch": 0.43042829746430855, "grad_norm": 0.005225505214184523, "learning_rate": 1.139143405071383e-05, "loss": 0.0001, "step": 4040 }, { "epoch": 0.43042829746430855, "eval_loss": 0.004632376134395599, "eval_runtime": 35.0926, "eval_samples_per_second": 4279.37, "eval_steps_per_second": 66.88, "step": 4040 }, { "epoch": 0.4314937140421905, "grad_norm": 0.001705207396298647, "learning_rate": 1.1370125719156192e-05, "loss": 0.0005, "step": 4050 }, { "epoch": 0.4314937140421905, "eval_loss": 0.0044433241710066795, "eval_runtime": 35.0773, "eval_samples_per_second": 4281.231, "eval_steps_per_second": 66.909, "step": 4050 }, { "epoch": 0.43255913062007245, "grad_norm": 0.0016584375407546759, "learning_rate": 1.1348817387598552e-05, "loss": 0.0001, "step": 4060 }, { "epoch": 0.43255913062007245, "eval_loss": 0.004409145098179579, "eval_runtime": 35.0654, "eval_samples_per_second": 4282.689, "eval_steps_per_second": 66.932, "step": 4060 }, { "epoch": 0.4336245471979544, "grad_norm": 0.0026151298079639673, "learning_rate": 1.1327509056040914e-05, "loss": 0.0001, "step": 4070 }, { "epoch": 0.4336245471979544, "eval_loss": 0.004420367535203695, "eval_runtime": 35.1047, "eval_samples_per_second": 4277.889, "eval_steps_per_second": 66.857, "step": 4070 }, { "epoch": 0.43468996377583635, "grad_norm": 0.0077970316633582115, "learning_rate": 1.1306200724483274e-05, "loss": 0.0002, "step": 4080 }, { "epoch": 0.43468996377583635, "eval_loss": 0.004452695604413748, "eval_runtime": 35.193, "eval_samples_per_second": 4267.161, "eval_steps_per_second": 66.689, "step": 4080 }, { "epoch": 0.4357553803537183, "grad_norm": 0.03631202504038811, "learning_rate": 1.1284892392925636e-05, "loss": 0.0007, "step": 4090 }, { "epoch": 0.4357553803537183, "eval_loss": 0.004691319074481726, "eval_runtime": 35.0695, "eval_samples_per_second": 4282.187, "eval_steps_per_second": 66.924, "step": 4090 }, { "epoch": 0.43682079693160025, "grad_norm": 1.5857198238372803, "learning_rate": 1.1263584061367996e-05, "loss": 0.0085, "step": 4100 }, { "epoch": 0.43682079693160025, "eval_loss": 0.004954234231263399, "eval_runtime": 35.0751, "eval_samples_per_second": 4281.502, "eval_steps_per_second": 66.914, "step": 4100 }, { "epoch": 0.4378862135094822, "grad_norm": 0.001573398825712502, "learning_rate": 1.1242275729810358e-05, "loss": 0.0047, "step": 4110 }, { "epoch": 0.4378862135094822, "eval_loss": 0.004456042777746916, "eval_runtime": 35.0967, "eval_samples_per_second": 4278.862, "eval_steps_per_second": 66.872, "step": 4110 }, { "epoch": 0.43895163008736415, "grad_norm": 0.00567322364076972, "learning_rate": 1.1220967398252718e-05, "loss": 0.0208, "step": 4120 }, { "epoch": 0.43895163008736415, "eval_loss": 0.004521696828305721, "eval_runtime": 35.0786, "eval_samples_per_second": 4281.069, "eval_steps_per_second": 66.907, "step": 4120 }, { "epoch": 0.4400170466652461, "grad_norm": 0.23694893717765808, "learning_rate": 1.119965906669508e-05, "loss": 0.0006, "step": 4130 }, { "epoch": 0.4400170466652461, "eval_loss": 0.00439961813390255, "eval_runtime": 35.0751, "eval_samples_per_second": 4281.495, "eval_steps_per_second": 66.914, "step": 4130 }, { "epoch": 0.44108246324312805, "grad_norm": 0.0016371961683034897, "learning_rate": 1.117835073513744e-05, "loss": 0.0004, "step": 4140 }, { "epoch": 0.44108246324312805, "eval_loss": 0.004129570908844471, "eval_runtime": 35.0666, "eval_samples_per_second": 4282.542, "eval_steps_per_second": 66.93, "step": 4140 }, { "epoch": 0.44214787982101, "grad_norm": 0.001793770119547844, "learning_rate": 1.1157042403579802e-05, "loss": 0.0005, "step": 4150 }, { "epoch": 0.44214787982101, "eval_loss": 0.0041097295470535755, "eval_runtime": 35.0581, "eval_samples_per_second": 4283.58, "eval_steps_per_second": 66.946, "step": 4150 }, { "epoch": 0.44321329639889195, "grad_norm": 0.025650380179286003, "learning_rate": 1.1135734072022162e-05, "loss": 0.0115, "step": 4160 }, { "epoch": 0.44321329639889195, "eval_loss": 0.004528433550149202, "eval_runtime": 35.0417, "eval_samples_per_second": 4285.579, "eval_steps_per_second": 66.977, "step": 4160 }, { "epoch": 0.4442787129767739, "grad_norm": 0.001788902678526938, "learning_rate": 1.1114425740464523e-05, "loss": 0.0054, "step": 4170 }, { "epoch": 0.4442787129767739, "eval_loss": 0.004935861565172672, "eval_runtime": 35.0797, "eval_samples_per_second": 4280.932, "eval_steps_per_second": 66.905, "step": 4170 }, { "epoch": 0.44534412955465585, "grad_norm": 0.07157997041940689, "learning_rate": 1.1093117408906884e-05, "loss": 0.0044, "step": 4180 }, { "epoch": 0.44534412955465585, "eval_loss": 0.005132563877850771, "eval_runtime": 35.0861, "eval_samples_per_second": 4280.157, "eval_steps_per_second": 66.893, "step": 4180 }, { "epoch": 0.4464095461325378, "grad_norm": 0.0018303110264241695, "learning_rate": 1.1071809077349245e-05, "loss": 0.001, "step": 4190 }, { "epoch": 0.4464095461325378, "eval_loss": 0.005342422518879175, "eval_runtime": 35.0513, "eval_samples_per_second": 4284.412, "eval_steps_per_second": 66.959, "step": 4190 }, { "epoch": 0.44747496271041975, "grad_norm": 0.0073822783306241035, "learning_rate": 1.1050500745791606e-05, "loss": 0.0018, "step": 4200 }, { "epoch": 0.44747496271041975, "eval_loss": 0.0050468165427446365, "eval_runtime": 35.0561, "eval_samples_per_second": 4283.816, "eval_steps_per_second": 66.95, "step": 4200 }, { "epoch": 0.4485403792883017, "grad_norm": 0.06104118749499321, "learning_rate": 1.1029192414233967e-05, "loss": 0.0005, "step": 4210 }, { "epoch": 0.4485403792883017, "eval_loss": 0.004778716247528791, "eval_runtime": 35.0462, "eval_samples_per_second": 4285.024, "eval_steps_per_second": 66.969, "step": 4210 }, { "epoch": 0.44960579586618366, "grad_norm": 0.07653524726629257, "learning_rate": 1.1007884082676328e-05, "loss": 0.0009, "step": 4220 }, { "epoch": 0.44960579586618366, "eval_loss": 0.0042607756331563, "eval_runtime": 35.0848, "eval_samples_per_second": 4280.319, "eval_steps_per_second": 66.895, "step": 4220 }, { "epoch": 0.4506712124440656, "grad_norm": 0.0015622730134055018, "learning_rate": 1.098657575111869e-05, "loss": 0.0007, "step": 4230 }, { "epoch": 0.4506712124440656, "eval_loss": 0.004101978614926338, "eval_runtime": 35.0595, "eval_samples_per_second": 4283.398, "eval_steps_per_second": 66.943, "step": 4230 }, { "epoch": 0.45173662902194756, "grad_norm": 0.16386055946350098, "learning_rate": 1.096526741956105e-05, "loss": 0.0008, "step": 4240 }, { "epoch": 0.45173662902194756, "eval_loss": 0.004037812352180481, "eval_runtime": 35.0852, "eval_samples_per_second": 4280.269, "eval_steps_per_second": 66.894, "step": 4240 }, { "epoch": 0.4528020455998295, "grad_norm": 0.05913758650422096, "learning_rate": 1.0943959088003411e-05, "loss": 0.0029, "step": 4250 }, { "epoch": 0.4528020455998295, "eval_loss": 0.003882251214236021, "eval_runtime": 35.0103, "eval_samples_per_second": 4289.42, "eval_steps_per_second": 67.037, "step": 4250 }, { "epoch": 0.4538674621777115, "grad_norm": 0.0016470799455419183, "learning_rate": 1.0922650756445772e-05, "loss": 0.0008, "step": 4260 }, { "epoch": 0.4538674621777115, "eval_loss": 0.003832570044323802, "eval_runtime": 35.0235, "eval_samples_per_second": 4287.813, "eval_steps_per_second": 67.012, "step": 4260 }, { "epoch": 0.45493287875559346, "grad_norm": 0.0037195871118456125, "learning_rate": 1.0901342424888133e-05, "loss": 0.0002, "step": 4270 }, { "epoch": 0.45493287875559346, "eval_loss": 0.0038891404401510954, "eval_runtime": 35.053, "eval_samples_per_second": 4284.196, "eval_steps_per_second": 66.956, "step": 4270 }, { "epoch": 0.4559982953334754, "grad_norm": 0.001607783604413271, "learning_rate": 1.0880034093330494e-05, "loss": 0.0012, "step": 4280 }, { "epoch": 0.4559982953334754, "eval_loss": 0.0040294453501701355, "eval_runtime": 35.0398, "eval_samples_per_second": 4285.807, "eval_steps_per_second": 66.981, "step": 4280 }, { "epoch": 0.45706371191135736, "grad_norm": 0.005903988610953093, "learning_rate": 1.0858725761772855e-05, "loss": 0.0016, "step": 4290 }, { "epoch": 0.45706371191135736, "eval_loss": 0.003837657393887639, "eval_runtime": 35.0606, "eval_samples_per_second": 4283.274, "eval_steps_per_second": 66.941, "step": 4290 }, { "epoch": 0.4581291284892393, "grad_norm": 0.017637008801102638, "learning_rate": 1.0837417430215216e-05, "loss": 0.0003, "step": 4300 }, { "epoch": 0.4581291284892393, "eval_loss": 0.003823323640972376, "eval_runtime": 35.0483, "eval_samples_per_second": 4284.777, "eval_steps_per_second": 66.965, "step": 4300 }, { "epoch": 0.45919454506712126, "grad_norm": 0.0024709682911634445, "learning_rate": 1.0816109098657577e-05, "loss": 0.0002, "step": 4310 }, { "epoch": 0.45919454506712126, "eval_loss": 0.003842473030090332, "eval_runtime": 35.0497, "eval_samples_per_second": 4284.604, "eval_steps_per_second": 66.962, "step": 4310 }, { "epoch": 0.4602599616450032, "grad_norm": 0.0015808714088052511, "learning_rate": 1.0794800767099937e-05, "loss": 0.0067, "step": 4320 }, { "epoch": 0.4602599616450032, "eval_loss": 0.0036013866774737835, "eval_runtime": 35.0348, "eval_samples_per_second": 4286.421, "eval_steps_per_second": 66.99, "step": 4320 }, { "epoch": 0.46132537822288516, "grad_norm": 0.0048879231326282024, "learning_rate": 1.07734924355423e-05, "loss": 0.0008, "step": 4330 }, { "epoch": 0.46132537822288516, "eval_loss": 0.0035459273494780064, "eval_runtime": 35.0239, "eval_samples_per_second": 4287.753, "eval_steps_per_second": 67.011, "step": 4330 }, { "epoch": 0.4623907948007671, "grad_norm": 2.5852835178375244, "learning_rate": 1.075218410398466e-05, "loss": 0.0059, "step": 4340 }, { "epoch": 0.4623907948007671, "eval_loss": 0.003548369277268648, "eval_runtime": 35.0675, "eval_samples_per_second": 4282.425, "eval_steps_per_second": 66.928, "step": 4340 }, { "epoch": 0.46345621137864906, "grad_norm": 0.017402660101652145, "learning_rate": 1.0730875772427021e-05, "loss": 0.0001, "step": 4350 }, { "epoch": 0.46345621137864906, "eval_loss": 0.0034816220868378878, "eval_runtime": 35.0635, "eval_samples_per_second": 4282.911, "eval_steps_per_second": 66.936, "step": 4350 }, { "epoch": 0.464521627956531, "grad_norm": 0.001518838806077838, "learning_rate": 1.0709567440869381e-05, "loss": 0.0032, "step": 4360 }, { "epoch": 0.464521627956531, "eval_loss": 0.0034797603730112314, "eval_runtime": 35.0071, "eval_samples_per_second": 4289.821, "eval_steps_per_second": 67.044, "step": 4360 }, { "epoch": 0.46558704453441296, "grad_norm": 0.0015628690598532557, "learning_rate": 1.0688259109311743e-05, "loss": 0.0054, "step": 4370 }, { "epoch": 0.46558704453441296, "eval_loss": 0.0033893610816448927, "eval_runtime": 35.0177, "eval_samples_per_second": 4288.516, "eval_steps_per_second": 67.023, "step": 4370 }, { "epoch": 0.4666524611122949, "grad_norm": 0.015727238729596138, "learning_rate": 1.0666950777754103e-05, "loss": 0.001, "step": 4380 }, { "epoch": 0.4666524611122949, "eval_loss": 0.003377847606316209, "eval_runtime": 35.0481, "eval_samples_per_second": 4284.803, "eval_steps_per_second": 66.965, "step": 4380 }, { "epoch": 0.46771787769017686, "grad_norm": 0.001751308562234044, "learning_rate": 1.0645642446196465e-05, "loss": 0.0095, "step": 4390 }, { "epoch": 0.46771787769017686, "eval_loss": 0.003439757041633129, "eval_runtime": 35.0358, "eval_samples_per_second": 4286.298, "eval_steps_per_second": 66.989, "step": 4390 }, { "epoch": 0.4687832942680588, "grad_norm": 0.003558347700163722, "learning_rate": 1.0624334114638825e-05, "loss": 0.0014, "step": 4400 }, { "epoch": 0.4687832942680588, "eval_loss": 0.0034394925460219383, "eval_runtime": 35.0439, "eval_samples_per_second": 4285.308, "eval_steps_per_second": 66.973, "step": 4400 }, { "epoch": 0.46984871084594076, "grad_norm": 0.025436507537961006, "learning_rate": 1.0603025783081187e-05, "loss": 0.0011, "step": 4410 }, { "epoch": 0.46984871084594076, "eval_loss": 0.003332258900627494, "eval_runtime": 35.0777, "eval_samples_per_second": 4281.18, "eval_steps_per_second": 66.909, "step": 4410 }, { "epoch": 0.4709141274238227, "grad_norm": 0.0017181203002110124, "learning_rate": 1.0581717451523547e-05, "loss": 0.0051, "step": 4420 }, { "epoch": 0.4709141274238227, "eval_loss": 0.0032833644654601812, "eval_runtime": 35.0364, "eval_samples_per_second": 4286.225, "eval_steps_per_second": 66.987, "step": 4420 }, { "epoch": 0.47197954400170467, "grad_norm": 0.3103368282318115, "learning_rate": 1.0560409119965906e-05, "loss": 0.0009, "step": 4430 }, { "epoch": 0.47197954400170467, "eval_loss": 0.003310458268970251, "eval_runtime": 35.0137, "eval_samples_per_second": 4289.005, "eval_steps_per_second": 67.031, "step": 4430 }, { "epoch": 0.4730449605795866, "grad_norm": 0.0034083034843206406, "learning_rate": 1.0539100788408268e-05, "loss": 0.0016, "step": 4440 }, { "epoch": 0.4730449605795866, "eval_loss": 0.00335130887106061, "eval_runtime": 35.028, "eval_samples_per_second": 4287.254, "eval_steps_per_second": 67.004, "step": 4440 }, { "epoch": 0.47411037715746857, "grad_norm": 0.0014370749704539776, "learning_rate": 1.0517792456850628e-05, "loss": 0.0026, "step": 4450 }, { "epoch": 0.47411037715746857, "eval_loss": 0.0034298617392778397, "eval_runtime": 35.0892, "eval_samples_per_second": 4279.782, "eval_steps_per_second": 66.887, "step": 4450 }, { "epoch": 0.4751757937353505, "grad_norm": 0.004712993744760752, "learning_rate": 1.049648412529299e-05, "loss": 0.0001, "step": 4460 }, { "epoch": 0.4751757937353505, "eval_loss": 0.003493973519653082, "eval_runtime": 35.0547, "eval_samples_per_second": 4283.994, "eval_steps_per_second": 66.953, "step": 4460 }, { "epoch": 0.47624121031323247, "grad_norm": 0.0023907856084406376, "learning_rate": 1.047517579373535e-05, "loss": 0.0092, "step": 4470 }, { "epoch": 0.47624121031323247, "eval_loss": 0.003423650749027729, "eval_runtime": 35.0747, "eval_samples_per_second": 4281.543, "eval_steps_per_second": 66.914, "step": 4470 }, { "epoch": 0.4773066268911144, "grad_norm": 0.6058014035224915, "learning_rate": 1.0453867462177712e-05, "loss": 0.0027, "step": 4480 }, { "epoch": 0.4773066268911144, "eval_loss": 0.0034181708469986916, "eval_runtime": 35.0269, "eval_samples_per_second": 4287.387, "eval_steps_per_second": 67.006, "step": 4480 }, { "epoch": 0.47837204346899637, "grad_norm": 0.002385197440162301, "learning_rate": 1.0432559130620072e-05, "loss": 0.0003, "step": 4490 }, { "epoch": 0.47837204346899637, "eval_loss": 0.003402228932827711, "eval_runtime": 35.0808, "eval_samples_per_second": 4280.801, "eval_steps_per_second": 66.903, "step": 4490 }, { "epoch": 0.4794374600468783, "grad_norm": 0.004214168526232243, "learning_rate": 1.0411250799062434e-05, "loss": 0.0093, "step": 4500 }, { "epoch": 0.4794374600468783, "eval_loss": 0.0033664952497929335, "eval_runtime": 35.004, "eval_samples_per_second": 4290.198, "eval_steps_per_second": 67.05, "step": 4500 }, { "epoch": 0.48050287662476027, "grad_norm": 0.0015161953633651137, "learning_rate": 1.0389942467504794e-05, "loss": 0.0001, "step": 4510 }, { "epoch": 0.48050287662476027, "eval_loss": 0.003395343665033579, "eval_runtime": 35.0498, "eval_samples_per_second": 4284.584, "eval_steps_per_second": 66.962, "step": 4510 }, { "epoch": 0.4815682932026422, "grad_norm": 0.0015271385200321674, "learning_rate": 1.0368634135947156e-05, "loss": 0.0096, "step": 4520 }, { "epoch": 0.4815682932026422, "eval_loss": 0.0033128561917692423, "eval_runtime": 35.0436, "eval_samples_per_second": 4285.349, "eval_steps_per_second": 66.974, "step": 4520 }, { "epoch": 0.48263370978052417, "grad_norm": 0.004237685352563858, "learning_rate": 1.0347325804389516e-05, "loss": 0.0003, "step": 4530 }, { "epoch": 0.48263370978052417, "eval_loss": 0.0032125210855156183, "eval_runtime": 35.0305, "eval_samples_per_second": 4286.947, "eval_steps_per_second": 66.999, "step": 4530 }, { "epoch": 0.4836991263584061, "grad_norm": 0.0016559308860450983, "learning_rate": 1.0326017472831878e-05, "loss": 0.0141, "step": 4540 }, { "epoch": 0.4836991263584061, "eval_loss": 0.003183180931955576, "eval_runtime": 35.0073, "eval_samples_per_second": 4289.791, "eval_steps_per_second": 67.043, "step": 4540 }, { "epoch": 0.48476454293628807, "grad_norm": 0.0016300799325108528, "learning_rate": 1.0304709141274238e-05, "loss": 0.0042, "step": 4550 }, { "epoch": 0.48476454293628807, "eval_loss": 0.0034217729698866606, "eval_runtime": 35.0329, "eval_samples_per_second": 4286.656, "eval_steps_per_second": 66.994, "step": 4550 }, { "epoch": 0.48582995951417, "grad_norm": 0.39741653203964233, "learning_rate": 1.02834008097166e-05, "loss": 0.0005, "step": 4560 }, { "epoch": 0.48582995951417, "eval_loss": 0.003642290597781539, "eval_runtime": 35.09, "eval_samples_per_second": 4279.682, "eval_steps_per_second": 66.885, "step": 4560 }, { "epoch": 0.48689537609205197, "grad_norm": 0.18953844904899597, "learning_rate": 1.026209247815896e-05, "loss": 0.0007, "step": 4570 }, { "epoch": 0.48689537609205197, "eval_loss": 0.003431662917137146, "eval_runtime": 35.0419, "eval_samples_per_second": 4285.553, "eval_steps_per_second": 66.977, "step": 4570 }, { "epoch": 0.4879607926699339, "grad_norm": 0.0029646658804267645, "learning_rate": 1.0240784146601322e-05, "loss": 0.0013, "step": 4580 }, { "epoch": 0.4879607926699339, "eval_loss": 0.0033544725738465786, "eval_runtime": 35.0086, "eval_samples_per_second": 4289.63, "eval_steps_per_second": 67.041, "step": 4580 }, { "epoch": 0.48902620924781587, "grad_norm": 0.008971764706075191, "learning_rate": 1.0219475815043682e-05, "loss": 0.001, "step": 4590 }, { "epoch": 0.48902620924781587, "eval_loss": 0.003337480593472719, "eval_runtime": 35.0018, "eval_samples_per_second": 4290.465, "eval_steps_per_second": 67.054, "step": 4590 }, { "epoch": 0.4900916258256979, "grad_norm": 0.0016449299873784184, "learning_rate": 1.0198167483486044e-05, "loss": 0.0011, "step": 4600 }, { "epoch": 0.4900916258256979, "eval_loss": 0.0033237505704164505, "eval_runtime": 35.0433, "eval_samples_per_second": 4285.388, "eval_steps_per_second": 66.974, "step": 4600 }, { "epoch": 0.4911570424035798, "grad_norm": 0.013473814353346825, "learning_rate": 1.0176859151928404e-05, "loss": 0.0017, "step": 4610 }, { "epoch": 0.4911570424035798, "eval_loss": 0.003368969541043043, "eval_runtime": 35.0263, "eval_samples_per_second": 4287.467, "eval_steps_per_second": 67.007, "step": 4610 }, { "epoch": 0.4922224589814618, "grad_norm": 0.0029099630191922188, "learning_rate": 1.0155550820370766e-05, "loss": 0.0007, "step": 4620 }, { "epoch": 0.4922224589814618, "eval_loss": 0.003540628356859088, "eval_runtime": 35.0376, "eval_samples_per_second": 4286.083, "eval_steps_per_second": 66.985, "step": 4620 }, { "epoch": 0.4932878755593437, "grad_norm": 0.020577091723680496, "learning_rate": 1.0134242488813126e-05, "loss": 0.0005, "step": 4630 }, { "epoch": 0.4932878755593437, "eval_loss": 0.0037534397561103106, "eval_runtime": 35.0156, "eval_samples_per_second": 4288.773, "eval_steps_per_second": 67.027, "step": 4630 }, { "epoch": 0.4943532921372257, "grad_norm": 0.07291168719530106, "learning_rate": 1.0112934157255487e-05, "loss": 0.0003, "step": 4640 }, { "epoch": 0.4943532921372257, "eval_loss": 0.003838547272607684, "eval_runtime": 35.0423, "eval_samples_per_second": 4285.511, "eval_steps_per_second": 66.976, "step": 4640 }, { "epoch": 0.4954187087151076, "grad_norm": 0.02392764948308468, "learning_rate": 1.0091625825697848e-05, "loss": 0.0059, "step": 4650 }, { "epoch": 0.4954187087151076, "eval_loss": 0.0035981247201561928, "eval_runtime": 35.0354, "eval_samples_per_second": 4286.356, "eval_steps_per_second": 66.989, "step": 4650 }, { "epoch": 0.4964841252929896, "grad_norm": 0.001953831873834133, "learning_rate": 1.007031749414021e-05, "loss": 0.0002, "step": 4660 }, { "epoch": 0.4964841252929896, "eval_loss": 0.003541785990819335, "eval_runtime": 35.0691, "eval_samples_per_second": 4282.23, "eval_steps_per_second": 66.925, "step": 4660 }, { "epoch": 0.4975495418708715, "grad_norm": 0.05205778032541275, "learning_rate": 1.004900916258257e-05, "loss": 0.0008, "step": 4670 }, { "epoch": 0.4975495418708715, "eval_loss": 0.0035847588442265987, "eval_runtime": 35.0465, "eval_samples_per_second": 4284.988, "eval_steps_per_second": 66.968, "step": 4670 }, { "epoch": 0.4986149584487535, "grad_norm": 0.0016077999025583267, "learning_rate": 1.0027700831024931e-05, "loss": 0.0001, "step": 4680 }, { "epoch": 0.4986149584487535, "eval_loss": 0.0036463753785938025, "eval_runtime": 35.0411, "eval_samples_per_second": 4285.648, "eval_steps_per_second": 66.978, "step": 4680 }, { "epoch": 0.4996803750266354, "grad_norm": 0.1424156129360199, "learning_rate": 1.0006392499467292e-05, "loss": 0.0035, "step": 4690 }, { "epoch": 0.4996803750266354, "eval_loss": 0.003566704923287034, "eval_runtime": 35.0012, "eval_samples_per_second": 4290.537, "eval_steps_per_second": 67.055, "step": 4690 }, { "epoch": 0.5007457916045174, "grad_norm": 0.0017005006084218621, "learning_rate": 9.985084167909653e-06, "loss": 0.0003, "step": 4700 }, { "epoch": 0.5007457916045174, "eval_loss": 0.0034857653081417084, "eval_runtime": 35.004, "eval_samples_per_second": 4290.198, "eval_steps_per_second": 67.05, "step": 4700 }, { "epoch": 0.5018112081823993, "grad_norm": 0.11611221730709076, "learning_rate": 9.963775836352014e-06, "loss": 0.0007, "step": 4710 }, { "epoch": 0.5018112081823993, "eval_loss": 0.0035124989226460457, "eval_runtime": 35.059, "eval_samples_per_second": 4283.467, "eval_steps_per_second": 66.944, "step": 4710 }, { "epoch": 0.5028766247602813, "grad_norm": 0.023515425622463226, "learning_rate": 9.942467504794375e-06, "loss": 0.0065, "step": 4720 }, { "epoch": 0.5028766247602813, "eval_loss": 0.0037029124796390533, "eval_runtime": 35.0494, "eval_samples_per_second": 4284.644, "eval_steps_per_second": 66.963, "step": 4720 }, { "epoch": 0.5039420413381632, "grad_norm": 0.022330928593873978, "learning_rate": 9.921159173236736e-06, "loss": 0.0003, "step": 4730 }, { "epoch": 0.5039420413381632, "eval_loss": 0.003991840872913599, "eval_runtime": 35.0528, "eval_samples_per_second": 4284.226, "eval_steps_per_second": 66.956, "step": 4730 }, { "epoch": 0.5050074579160452, "grad_norm": 0.0015543290646746755, "learning_rate": 9.899850841679097e-06, "loss": 0.0005, "step": 4740 }, { "epoch": 0.5050074579160452, "eval_loss": 0.004205774050205946, "eval_runtime": 35.0589, "eval_samples_per_second": 4283.481, "eval_steps_per_second": 66.945, "step": 4740 }, { "epoch": 0.5060728744939271, "grad_norm": 0.0047645787708461285, "learning_rate": 9.878542510121458e-06, "loss": 0.0097, "step": 4750 }, { "epoch": 0.5060728744939271, "eval_loss": 0.004034877754747868, "eval_runtime": 35.029, "eval_samples_per_second": 4287.128, "eval_steps_per_second": 67.002, "step": 4750 }, { "epoch": 0.5071382910718091, "grad_norm": 0.018391378223896027, "learning_rate": 9.85723417856382e-06, "loss": 0.0033, "step": 4760 }, { "epoch": 0.5071382910718091, "eval_loss": 0.0034893695265054703, "eval_runtime": 35.0565, "eval_samples_per_second": 4283.77, "eval_steps_per_second": 66.949, "step": 4760 }, { "epoch": 0.508203707649691, "grad_norm": 0.09393607079982758, "learning_rate": 9.83592584700618e-06, "loss": 0.0061, "step": 4770 }, { "epoch": 0.508203707649691, "eval_loss": 0.003367075929418206, "eval_runtime": 35.0294, "eval_samples_per_second": 4287.079, "eval_steps_per_second": 67.001, "step": 4770 }, { "epoch": 0.509269124227573, "grad_norm": 0.05805261433124542, "learning_rate": 9.814617515448541e-06, "loss": 0.0029, "step": 4780 }, { "epoch": 0.509269124227573, "eval_loss": 0.0032794128637760878, "eval_runtime": 35.0347, "eval_samples_per_second": 4286.435, "eval_steps_per_second": 66.991, "step": 4780 }, { "epoch": 0.5103345408054549, "grad_norm": 0.218561589717865, "learning_rate": 9.793309183890901e-06, "loss": 0.0011, "step": 4790 }, { "epoch": 0.5103345408054549, "eval_loss": 0.0032986998558044434, "eval_runtime": 35.042, "eval_samples_per_second": 4285.545, "eval_steps_per_second": 66.977, "step": 4790 }, { "epoch": 0.5113999573833369, "grad_norm": 0.0016841794131323695, "learning_rate": 9.772000852333263e-06, "loss": 0.0005, "step": 4800 }, { "epoch": 0.5113999573833369, "eval_loss": 0.003459086874499917, "eval_runtime": 35.0677, "eval_samples_per_second": 4282.407, "eval_steps_per_second": 66.928, "step": 4800 }, { "epoch": 0.5124653739612188, "grad_norm": 0.16237737238407135, "learning_rate": 9.750692520775623e-06, "loss": 0.0029, "step": 4810 }, { "epoch": 0.5124653739612188, "eval_loss": 0.0034708159510046244, "eval_runtime": 35.1028, "eval_samples_per_second": 4278.124, "eval_steps_per_second": 66.861, "step": 4810 }, { "epoch": 0.5135307905391008, "grad_norm": 0.0015488864155486226, "learning_rate": 9.729384189217985e-06, "loss": 0.0133, "step": 4820 }, { "epoch": 0.5135307905391008, "eval_loss": 0.0035250100772827864, "eval_runtime": 35.0425, "eval_samples_per_second": 4285.48, "eval_steps_per_second": 66.976, "step": 4820 }, { "epoch": 0.5145962071169827, "grad_norm": 0.0018307908903807402, "learning_rate": 9.708075857660345e-06, "loss": 0.0003, "step": 4830 }, { "epoch": 0.5145962071169827, "eval_loss": 0.0035982467234134674, "eval_runtime": 35.0382, "eval_samples_per_second": 4286.011, "eval_steps_per_second": 66.984, "step": 4830 }, { "epoch": 0.5156616236948647, "grad_norm": 0.001923054805956781, "learning_rate": 9.686767526102707e-06, "loss": 0.0067, "step": 4840 }, { "epoch": 0.5156616236948647, "eval_loss": 0.003679609391838312, "eval_runtime": 35.0048, "eval_samples_per_second": 4290.095, "eval_steps_per_second": 67.048, "step": 4840 }, { "epoch": 0.5167270402727466, "grad_norm": 0.004570928402245045, "learning_rate": 9.665459194545067e-06, "loss": 0.0002, "step": 4850 }, { "epoch": 0.5167270402727466, "eval_loss": 0.003768416354432702, "eval_runtime": 35.0717, "eval_samples_per_second": 4281.911, "eval_steps_per_second": 66.92, "step": 4850 }, { "epoch": 0.5177924568506286, "grad_norm": 0.002467310754582286, "learning_rate": 9.64415086298743e-06, "loss": 0.0002, "step": 4860 }, { "epoch": 0.5177924568506286, "eval_loss": 0.0038299639709293842, "eval_runtime": 35.0546, "eval_samples_per_second": 4283.999, "eval_steps_per_second": 66.953, "step": 4860 }, { "epoch": 0.5188578734285105, "grad_norm": 0.007724090479314327, "learning_rate": 9.62284253142979e-06, "loss": 0.0002, "step": 4870 }, { "epoch": 0.5188578734285105, "eval_loss": 0.0038635297678411007, "eval_runtime": 35.0595, "eval_samples_per_second": 4283.406, "eval_steps_per_second": 66.943, "step": 4870 }, { "epoch": 0.5199232900063925, "grad_norm": 0.0018088623182848096, "learning_rate": 9.601534199872151e-06, "loss": 0.0021, "step": 4880 }, { "epoch": 0.5199232900063925, "eval_loss": 0.00368054979480803, "eval_runtime": 35.0441, "eval_samples_per_second": 4285.285, "eval_steps_per_second": 66.973, "step": 4880 }, { "epoch": 0.5209887065842744, "grad_norm": 0.001608138787560165, "learning_rate": 9.580225868314511e-06, "loss": 0.0003, "step": 4890 }, { "epoch": 0.5209887065842744, "eval_loss": 0.0037215733900666237, "eval_runtime": 35.0462, "eval_samples_per_second": 4285.034, "eval_steps_per_second": 66.969, "step": 4890 }, { "epoch": 0.5220541231621564, "grad_norm": 0.006339292973279953, "learning_rate": 9.558917536756873e-06, "loss": 0.0002, "step": 4900 }, { "epoch": 0.5220541231621564, "eval_loss": 0.003766452893614769, "eval_runtime": 35.0484, "eval_samples_per_second": 4284.756, "eval_steps_per_second": 66.964, "step": 4900 }, { "epoch": 0.5231195397400383, "grad_norm": 0.0025099278427660465, "learning_rate": 9.537609205199233e-06, "loss": 0.0002, "step": 4910 }, { "epoch": 0.5231195397400383, "eval_loss": 0.003810285124927759, "eval_runtime": 35.0708, "eval_samples_per_second": 4282.026, "eval_steps_per_second": 66.922, "step": 4910 }, { "epoch": 0.5241849563179203, "grad_norm": 0.013173098675906658, "learning_rate": 9.516300873641595e-06, "loss": 0.0146, "step": 4920 }, { "epoch": 0.5241849563179203, "eval_loss": 0.0033065176103264093, "eval_runtime": 35.0434, "eval_samples_per_second": 4285.372, "eval_steps_per_second": 66.974, "step": 4920 }, { "epoch": 0.5252503728958022, "grad_norm": 0.010503698140382767, "learning_rate": 9.494992542083955e-06, "loss": 0.0005, "step": 4930 }, { "epoch": 0.5252503728958022, "eval_loss": 0.003174431389197707, "eval_runtime": 35.0738, "eval_samples_per_second": 4281.663, "eval_steps_per_second": 66.916, "step": 4930 }, { "epoch": 0.5263157894736842, "grad_norm": 0.001948002027347684, "learning_rate": 9.473684210526315e-06, "loss": 0.0067, "step": 4940 }, { "epoch": 0.5263157894736842, "eval_loss": 0.0031368620693683624, "eval_runtime": 35.0438, "eval_samples_per_second": 4285.322, "eval_steps_per_second": 66.973, "step": 4940 }, { "epoch": 0.5273812060515661, "grad_norm": 0.2971310019493103, "learning_rate": 9.452375878968677e-06, "loss": 0.0068, "step": 4950 }, { "epoch": 0.5273812060515661, "eval_loss": 0.003268307074904442, "eval_runtime": 35.0314, "eval_samples_per_second": 4286.844, "eval_steps_per_second": 66.997, "step": 4950 }, { "epoch": 0.5284466226294481, "grad_norm": 0.0015663893427699804, "learning_rate": 9.431067547411037e-06, "loss": 0.0024, "step": 4960 }, { "epoch": 0.5284466226294481, "eval_loss": 0.003325843717902899, "eval_runtime": 35.0579, "eval_samples_per_second": 4283.604, "eval_steps_per_second": 66.946, "step": 4960 }, { "epoch": 0.52951203920733, "grad_norm": 0.0014791989233344793, "learning_rate": 9.4097592158534e-06, "loss": 0.0008, "step": 4970 }, { "epoch": 0.52951203920733, "eval_loss": 0.003260023193433881, "eval_runtime": 35.0543, "eval_samples_per_second": 4284.038, "eval_steps_per_second": 66.953, "step": 4970 }, { "epoch": 0.530577455785212, "grad_norm": 0.0014208897482603788, "learning_rate": 9.38845088429576e-06, "loss": 0.0007, "step": 4980 }, { "epoch": 0.530577455785212, "eval_loss": 0.0032248280476778746, "eval_runtime": 35.0876, "eval_samples_per_second": 4279.972, "eval_steps_per_second": 66.89, "step": 4980 }, { "epoch": 0.5316428723630939, "grad_norm": 2.7022790908813477, "learning_rate": 9.367142552738121e-06, "loss": 0.0094, "step": 4990 }, { "epoch": 0.5316428723630939, "eval_loss": 0.0041928887367248535, "eval_runtime": 35.0333, "eval_samples_per_second": 4286.607, "eval_steps_per_second": 66.993, "step": 4990 }, { "epoch": 0.5327082889409759, "grad_norm": 0.0015129174571484327, "learning_rate": 9.345834221180481e-06, "loss": 0.0006, "step": 5000 }, { "epoch": 0.5327082889409759, "eval_loss": 0.004537190776318312, "eval_runtime": 35.0351, "eval_samples_per_second": 4286.385, "eval_steps_per_second": 66.99, "step": 5000 }, { "epoch": 0.5337737055188578, "grad_norm": 0.20955073833465576, "learning_rate": 9.324525889622843e-06, "loss": 0.0027, "step": 5010 }, { "epoch": 0.5337737055188578, "eval_loss": 0.0031157478224486113, "eval_runtime": 35.053, "eval_samples_per_second": 4284.197, "eval_steps_per_second": 66.956, "step": 5010 }, { "epoch": 0.5348391220967398, "grad_norm": 0.0013660400873050094, "learning_rate": 9.303217558065203e-06, "loss": 0.0002, "step": 5020 }, { "epoch": 0.5348391220967398, "eval_loss": 0.003070124424993992, "eval_runtime": 35.0725, "eval_samples_per_second": 4281.816, "eval_steps_per_second": 66.919, "step": 5020 }, { "epoch": 0.5359045386746217, "grad_norm": 0.9235541224479675, "learning_rate": 9.281909226507565e-06, "loss": 0.0087, "step": 5030 }, { "epoch": 0.5359045386746217, "eval_loss": 0.0031868915539234877, "eval_runtime": 35.0389, "eval_samples_per_second": 4285.924, "eval_steps_per_second": 66.983, "step": 5030 }, { "epoch": 0.5369699552525037, "grad_norm": 0.003905409947037697, "learning_rate": 9.260600894949925e-06, "loss": 0.0065, "step": 5040 }, { "epoch": 0.5369699552525037, "eval_loss": 0.0031315067317336798, "eval_runtime": 35.0444, "eval_samples_per_second": 4285.252, "eval_steps_per_second": 66.972, "step": 5040 }, { "epoch": 0.5380353718303856, "grad_norm": 0.14299072325229645, "learning_rate": 9.239292563392287e-06, "loss": 0.0014, "step": 5050 }, { "epoch": 0.5380353718303856, "eval_loss": 0.0031951293349266052, "eval_runtime": 35.0455, "eval_samples_per_second": 4285.118, "eval_steps_per_second": 66.97, "step": 5050 }, { "epoch": 0.5391007884082676, "grad_norm": 0.18088282644748688, "learning_rate": 9.217984231834647e-06, "loss": 0.001, "step": 5060 }, { "epoch": 0.5391007884082676, "eval_loss": 0.003341434756293893, "eval_runtime": 35.0797, "eval_samples_per_second": 4280.938, "eval_steps_per_second": 66.905, "step": 5060 }, { "epoch": 0.5401662049861495, "grad_norm": 0.10255859047174454, "learning_rate": 9.19667590027701e-06, "loss": 0.0042, "step": 5070 }, { "epoch": 0.5401662049861495, "eval_loss": 0.0033879380207508802, "eval_runtime": 35.0607, "eval_samples_per_second": 4283.26, "eval_steps_per_second": 66.941, "step": 5070 }, { "epoch": 0.5412316215640315, "grad_norm": 0.6156185865402222, "learning_rate": 9.17536756871937e-06, "loss": 0.0037, "step": 5080 }, { "epoch": 0.5412316215640315, "eval_loss": 0.0033736974000930786, "eval_runtime": 35.0571, "eval_samples_per_second": 4283.693, "eval_steps_per_second": 66.948, "step": 5080 }, { "epoch": 0.5422970381419134, "grad_norm": 0.001436607213690877, "learning_rate": 9.154059237161731e-06, "loss": 0.0008, "step": 5090 }, { "epoch": 0.5422970381419134, "eval_loss": 0.0032905188854783773, "eval_runtime": 35.0708, "eval_samples_per_second": 4282.026, "eval_steps_per_second": 66.922, "step": 5090 }, { "epoch": 0.5433624547197954, "grad_norm": 0.013623624108731747, "learning_rate": 9.132750905604091e-06, "loss": 0.0002, "step": 5100 }, { "epoch": 0.5433624547197954, "eval_loss": 0.0032300897873938084, "eval_runtime": 35.1794, "eval_samples_per_second": 4268.799, "eval_steps_per_second": 66.715, "step": 5100 }, { "epoch": 0.5444278712976774, "grad_norm": 0.0022984424140304327, "learning_rate": 9.111442574046453e-06, "loss": 0.0021, "step": 5110 }, { "epoch": 0.5444278712976774, "eval_loss": 0.0031744264997541904, "eval_runtime": 35.0537, "eval_samples_per_second": 4284.114, "eval_steps_per_second": 66.954, "step": 5110 }, { "epoch": 0.5454932878755594, "grad_norm": 0.0016335392137989402, "learning_rate": 9.090134242488813e-06, "loss": 0.0037, "step": 5120 }, { "epoch": 0.5454932878755594, "eval_loss": 0.0030962612945586443, "eval_runtime": 35.0514, "eval_samples_per_second": 4284.394, "eval_steps_per_second": 66.959, "step": 5120 }, { "epoch": 0.5465587044534413, "grad_norm": 0.001437659957446158, "learning_rate": 9.068825910931175e-06, "loss": 0.0003, "step": 5130 }, { "epoch": 0.5465587044534413, "eval_loss": 0.003103644121438265, "eval_runtime": 35.083, "eval_samples_per_second": 4280.534, "eval_steps_per_second": 66.898, "step": 5130 }, { "epoch": 0.5476241210313233, "grad_norm": 0.0013070203131064773, "learning_rate": 9.047517579373535e-06, "loss": 0.0118, "step": 5140 }, { "epoch": 0.5476241210313233, "eval_loss": 0.0030982240568846464, "eval_runtime": 35.0299, "eval_samples_per_second": 4287.028, "eval_steps_per_second": 67.0, "step": 5140 }, { "epoch": 0.5486895376092052, "grad_norm": 0.963262677192688, "learning_rate": 9.026209247815897e-06, "loss": 0.0018, "step": 5150 }, { "epoch": 0.5486895376092052, "eval_loss": 0.003294318215921521, "eval_runtime": 35.0551, "eval_samples_per_second": 4283.947, "eval_steps_per_second": 66.952, "step": 5150 }, { "epoch": 0.5497549541870872, "grad_norm": 0.01332628633826971, "learning_rate": 9.004900916258257e-06, "loss": 0.0002, "step": 5160 }, { "epoch": 0.5497549541870872, "eval_loss": 0.004062108229845762, "eval_runtime": 35.073, "eval_samples_per_second": 4281.76, "eval_steps_per_second": 66.918, "step": 5160 }, { "epoch": 0.5508203707649691, "grad_norm": 0.022138891741633415, "learning_rate": 8.983592584700619e-06, "loss": 0.0013, "step": 5170 }, { "epoch": 0.5508203707649691, "eval_loss": 0.0037723940331488848, "eval_runtime": 35.0486, "eval_samples_per_second": 4284.742, "eval_steps_per_second": 66.964, "step": 5170 }, { "epoch": 0.5518857873428511, "grad_norm": 0.5359131693840027, "learning_rate": 8.96228425314298e-06, "loss": 0.0077, "step": 5180 }, { "epoch": 0.5518857873428511, "eval_loss": 0.004283850081264973, "eval_runtime": 35.0668, "eval_samples_per_second": 4282.51, "eval_steps_per_second": 66.929, "step": 5180 }, { "epoch": 0.552951203920733, "grad_norm": 0.0019443683559074998, "learning_rate": 8.940975921585341e-06, "loss": 0.0016, "step": 5190 }, { "epoch": 0.552951203920733, "eval_loss": 0.004361128434538841, "eval_runtime": 35.0534, "eval_samples_per_second": 4284.152, "eval_steps_per_second": 66.955, "step": 5190 }, { "epoch": 0.554016620498615, "grad_norm": 0.07410170882940292, "learning_rate": 8.919667590027701e-06, "loss": 0.0074, "step": 5200 }, { "epoch": 0.554016620498615, "eval_loss": 0.0039838762022554874, "eval_runtime": 35.0317, "eval_samples_per_second": 4286.807, "eval_steps_per_second": 66.997, "step": 5200 }, { "epoch": 0.555082037076497, "grad_norm": 0.0014316923916339874, "learning_rate": 8.898359258470063e-06, "loss": 0.0032, "step": 5210 }, { "epoch": 0.555082037076497, "eval_loss": 0.0032119122333824635, "eval_runtime": 35.0299, "eval_samples_per_second": 4287.025, "eval_steps_per_second": 67.0, "step": 5210 }, { "epoch": 0.5561474536543789, "grad_norm": 0.0034566791728138924, "learning_rate": 8.877050926912423e-06, "loss": 0.0013, "step": 5220 }, { "epoch": 0.5561474536543789, "eval_loss": 0.0030109714716672897, "eval_runtime": 35.0631, "eval_samples_per_second": 4282.969, "eval_steps_per_second": 66.937, "step": 5220 }, { "epoch": 0.5572128702322608, "grad_norm": 0.00445709191262722, "learning_rate": 8.855742595354785e-06, "loss": 0.0028, "step": 5230 }, { "epoch": 0.5572128702322608, "eval_loss": 0.0030201044864952564, "eval_runtime": 35.0621, "eval_samples_per_second": 4283.091, "eval_steps_per_second": 66.938, "step": 5230 }, { "epoch": 0.5582782868101428, "grad_norm": 0.22698596119880676, "learning_rate": 8.834434263797145e-06, "loss": 0.0003, "step": 5240 }, { "epoch": 0.5582782868101428, "eval_loss": 0.0030886309687048197, "eval_runtime": 35.0687, "eval_samples_per_second": 4282.284, "eval_steps_per_second": 66.926, "step": 5240 }, { "epoch": 0.5593437033880247, "grad_norm": 0.08581502735614777, "learning_rate": 8.813125932239507e-06, "loss": 0.001, "step": 5250 }, { "epoch": 0.5593437033880247, "eval_loss": 0.003185087814927101, "eval_runtime": 35.025, "eval_samples_per_second": 4287.618, "eval_steps_per_second": 67.009, "step": 5250 }, { "epoch": 0.5604091199659067, "grad_norm": 0.002484232885763049, "learning_rate": 8.791817600681867e-06, "loss": 0.0128, "step": 5260 }, { "epoch": 0.5604091199659067, "eval_loss": 0.0033555706031620502, "eval_runtime": 35.0536, "eval_samples_per_second": 4284.12, "eval_steps_per_second": 66.955, "step": 5260 }, { "epoch": 0.5614745365437886, "grad_norm": 5.450259208679199, "learning_rate": 8.770509269124229e-06, "loss": 0.0125, "step": 5270 }, { "epoch": 0.5614745365437886, "eval_loss": 0.0033903010189533234, "eval_runtime": 35.0949, "eval_samples_per_second": 4279.088, "eval_steps_per_second": 66.876, "step": 5270 }, { "epoch": 0.5625399531216706, "grad_norm": 0.05760002136230469, "learning_rate": 8.74920093756659e-06, "loss": 0.0049, "step": 5280 }, { "epoch": 0.5625399531216706, "eval_loss": 0.00321973511017859, "eval_runtime": 35.0435, "eval_samples_per_second": 4285.361, "eval_steps_per_second": 66.974, "step": 5280 }, { "epoch": 0.5636053696995525, "grad_norm": 0.005817291792482138, "learning_rate": 8.72789260600895e-06, "loss": 0.0016, "step": 5290 }, { "epoch": 0.5636053696995525, "eval_loss": 0.0032233393285423517, "eval_runtime": 35.0468, "eval_samples_per_second": 4284.959, "eval_steps_per_second": 66.968, "step": 5290 }, { "epoch": 0.5646707862774345, "grad_norm": 0.009052244946360588, "learning_rate": 8.706584274451311e-06, "loss": 0.0013, "step": 5300 }, { "epoch": 0.5646707862774345, "eval_loss": 0.0031393333338201046, "eval_runtime": 35.0484, "eval_samples_per_second": 4284.761, "eval_steps_per_second": 66.965, "step": 5300 }, { "epoch": 0.5657362028553165, "grad_norm": 0.022343887016177177, "learning_rate": 8.685275942893671e-06, "loss": 0.0048, "step": 5310 }, { "epoch": 0.5657362028553165, "eval_loss": 0.0030866351444274187, "eval_runtime": 35.0511, "eval_samples_per_second": 4284.433, "eval_steps_per_second": 66.959, "step": 5310 }, { "epoch": 0.5668016194331984, "grad_norm": 0.001582018448971212, "learning_rate": 8.663967611336033e-06, "loss": 0.0005, "step": 5320 }, { "epoch": 0.5668016194331984, "eval_loss": 0.0030833673663437366, "eval_runtime": 35.0498, "eval_samples_per_second": 4284.586, "eval_steps_per_second": 66.962, "step": 5320 }, { "epoch": 0.5678670360110804, "grad_norm": 0.005051956046372652, "learning_rate": 8.642659279778393e-06, "loss": 0.0006, "step": 5330 }, { "epoch": 0.5678670360110804, "eval_loss": 0.003096715547144413, "eval_runtime": 35.0295, "eval_samples_per_second": 4287.074, "eval_steps_per_second": 67.001, "step": 5330 }, { "epoch": 0.5689324525889623, "grad_norm": 0.4084688425064087, "learning_rate": 8.621350948220755e-06, "loss": 0.0027, "step": 5340 }, { "epoch": 0.5689324525889623, "eval_loss": 0.0030978054273873568, "eval_runtime": 35.0254, "eval_samples_per_second": 4287.573, "eval_steps_per_second": 67.008, "step": 5340 }, { "epoch": 0.5699978691668443, "grad_norm": 0.0028698795940726995, "learning_rate": 8.600042616663115e-06, "loss": 0.0002, "step": 5350 }, { "epoch": 0.5699978691668443, "eval_loss": 0.003147188574075699, "eval_runtime": 35.0568, "eval_samples_per_second": 4283.732, "eval_steps_per_second": 66.948, "step": 5350 }, { "epoch": 0.5710632857447262, "grad_norm": 0.0017005919944494963, "learning_rate": 8.578734285105477e-06, "loss": 0.0005, "step": 5360 }, { "epoch": 0.5710632857447262, "eval_loss": 0.003185020759701729, "eval_runtime": 35.0384, "eval_samples_per_second": 4285.988, "eval_steps_per_second": 66.984, "step": 5360 }, { "epoch": 0.5721287023226082, "grad_norm": 0.005401854868978262, "learning_rate": 8.557425953547837e-06, "loss": 0.0001, "step": 5370 }, { "epoch": 0.5721287023226082, "eval_loss": 0.0032068644650280476, "eval_runtime": 35.0844, "eval_samples_per_second": 4280.363, "eval_steps_per_second": 66.896, "step": 5370 }, { "epoch": 0.5731941189004901, "grad_norm": 0.0929129421710968, "learning_rate": 8.536117621990199e-06, "loss": 0.0025, "step": 5380 }, { "epoch": 0.5731941189004901, "eval_loss": 0.0031966594979166985, "eval_runtime": 35.0084, "eval_samples_per_second": 4289.653, "eval_steps_per_second": 67.041, "step": 5380 }, { "epoch": 0.574259535478372, "grad_norm": 0.0012115921126678586, "learning_rate": 8.51480929043256e-06, "loss": 0.0014, "step": 5390 }, { "epoch": 0.574259535478372, "eval_loss": 0.0032315885182470083, "eval_runtime": 35.0764, "eval_samples_per_second": 4281.346, "eval_steps_per_second": 66.911, "step": 5390 }, { "epoch": 0.575324952056254, "grad_norm": 0.00769865233451128, "learning_rate": 8.493500958874921e-06, "loss": 0.0001, "step": 5400 }, { "epoch": 0.575324952056254, "eval_loss": 0.0032829763367772102, "eval_runtime": 35.0689, "eval_samples_per_second": 4282.254, "eval_steps_per_second": 66.925, "step": 5400 }, { "epoch": 0.576390368634136, "grad_norm": 1.7847949266433716, "learning_rate": 8.472192627317281e-06, "loss": 0.0047, "step": 5410 }, { "epoch": 0.576390368634136, "eval_loss": 0.0033038435503840446, "eval_runtime": 35.0485, "eval_samples_per_second": 4284.75, "eval_steps_per_second": 66.964, "step": 5410 }, { "epoch": 0.5774557852120179, "grad_norm": 0.0020609069615602493, "learning_rate": 8.450884295759643e-06, "loss": 0.0007, "step": 5420 }, { "epoch": 0.5774557852120179, "eval_loss": 0.003206141060218215, "eval_runtime": 35.0791, "eval_samples_per_second": 4281.014, "eval_steps_per_second": 66.906, "step": 5420 }, { "epoch": 0.5785212017898999, "grad_norm": 0.08441135287284851, "learning_rate": 8.429575964202003e-06, "loss": 0.0105, "step": 5430 }, { "epoch": 0.5785212017898999, "eval_loss": 0.0032010802533477545, "eval_runtime": 35.0223, "eval_samples_per_second": 4287.952, "eval_steps_per_second": 67.014, "step": 5430 }, { "epoch": 0.5795866183677818, "grad_norm": 0.03874306008219719, "learning_rate": 8.408267632644365e-06, "loss": 0.0007, "step": 5440 }, { "epoch": 0.5795866183677818, "eval_loss": 0.003258783370256424, "eval_runtime": 35.062, "eval_samples_per_second": 4283.099, "eval_steps_per_second": 66.939, "step": 5440 }, { "epoch": 0.5806520349456638, "grad_norm": 0.07270818948745728, "learning_rate": 8.386959301086725e-06, "loss": 0.0157, "step": 5450 }, { "epoch": 0.5806520349456638, "eval_loss": 0.0032654814422130585, "eval_runtime": 35.0557, "eval_samples_per_second": 4283.864, "eval_steps_per_second": 66.951, "step": 5450 }, { "epoch": 0.5817174515235457, "grad_norm": 0.006695209536701441, "learning_rate": 8.365650969529087e-06, "loss": 0.0081, "step": 5460 }, { "epoch": 0.5817174515235457, "eval_loss": 0.0033340235240757465, "eval_runtime": 35.0804, "eval_samples_per_second": 4280.854, "eval_steps_per_second": 66.903, "step": 5460 }, { "epoch": 0.5827828681014277, "grad_norm": 0.02671169675886631, "learning_rate": 8.344342637971447e-06, "loss": 0.0002, "step": 5470 }, { "epoch": 0.5827828681014277, "eval_loss": 0.0034461417235434055, "eval_runtime": 35.0658, "eval_samples_per_second": 4282.633, "eval_steps_per_second": 66.931, "step": 5470 }, { "epoch": 0.5838482846793096, "grad_norm": 0.012659654952585697, "learning_rate": 8.323034306413809e-06, "loss": 0.0002, "step": 5480 }, { "epoch": 0.5838482846793096, "eval_loss": 0.0034948259126394987, "eval_runtime": 35.0375, "eval_samples_per_second": 4286.093, "eval_steps_per_second": 66.985, "step": 5480 }, { "epoch": 0.5849137012571916, "grad_norm": 0.005894053727388382, "learning_rate": 8.301725974856169e-06, "loss": 0.0055, "step": 5490 }, { "epoch": 0.5849137012571916, "eval_loss": 0.0036419378593564034, "eval_runtime": 35.0948, "eval_samples_per_second": 4279.095, "eval_steps_per_second": 66.876, "step": 5490 }, { "epoch": 0.5859791178350735, "grad_norm": 2.166231155395508, "learning_rate": 8.280417643298531e-06, "loss": 0.0048, "step": 5500 }, { "epoch": 0.5859791178350735, "eval_loss": 0.0034717011731117964, "eval_runtime": 35.0707, "eval_samples_per_second": 4282.038, "eval_steps_per_second": 66.922, "step": 5500 }, { "epoch": 0.5870445344129555, "grad_norm": 0.006031760014593601, "learning_rate": 8.259109311740891e-06, "loss": 0.0007, "step": 5510 }, { "epoch": 0.5870445344129555, "eval_loss": 0.0032065894920378923, "eval_runtime": 35.0352, "eval_samples_per_second": 4286.372, "eval_steps_per_second": 66.99, "step": 5510 }, { "epoch": 0.5881099509908374, "grad_norm": 0.023564601317048073, "learning_rate": 8.237800980183253e-06, "loss": 0.0009, "step": 5520 }, { "epoch": 0.5881099509908374, "eval_loss": 0.003217566292732954, "eval_runtime": 35.048, "eval_samples_per_second": 4284.815, "eval_steps_per_second": 66.965, "step": 5520 }, { "epoch": 0.5891753675687194, "grad_norm": 0.04442958906292915, "learning_rate": 8.216492648625613e-06, "loss": 0.0004, "step": 5530 }, { "epoch": 0.5891753675687194, "eval_loss": 0.0032989357132464647, "eval_runtime": 35.0666, "eval_samples_per_second": 4282.534, "eval_steps_per_second": 66.93, "step": 5530 }, { "epoch": 0.5902407841466013, "grad_norm": 0.004105782601982355, "learning_rate": 8.195184317067975e-06, "loss": 0.0068, "step": 5540 }, { "epoch": 0.5902407841466013, "eval_loss": 0.0033785353880375624, "eval_runtime": 35.067, "eval_samples_per_second": 4282.491, "eval_steps_per_second": 66.929, "step": 5540 }, { "epoch": 0.5913062007244833, "grad_norm": 1.0616731643676758, "learning_rate": 8.173875985510335e-06, "loss": 0.0134, "step": 5550 }, { "epoch": 0.5913062007244833, "eval_loss": 0.0031998585909605026, "eval_runtime": 35.01, "eval_samples_per_second": 4289.463, "eval_steps_per_second": 67.038, "step": 5550 }, { "epoch": 0.5923716173023652, "grad_norm": 0.0035948033910244703, "learning_rate": 8.152567653952697e-06, "loss": 0.0048, "step": 5560 }, { "epoch": 0.5923716173023652, "eval_loss": 0.0032753869891166687, "eval_runtime": 35.0539, "eval_samples_per_second": 4284.083, "eval_steps_per_second": 66.954, "step": 5560 }, { "epoch": 0.5934370338802472, "grad_norm": 0.00209414167329669, "learning_rate": 8.131259322395057e-06, "loss": 0.0006, "step": 5570 }, { "epoch": 0.5934370338802472, "eval_loss": 0.0032072330359369516, "eval_runtime": 35.0701, "eval_samples_per_second": 4282.11, "eval_steps_per_second": 66.923, "step": 5570 }, { "epoch": 0.5945024504581291, "grad_norm": 0.04995543509721756, "learning_rate": 8.109950990837419e-06, "loss": 0.0029, "step": 5580 }, { "epoch": 0.5945024504581291, "eval_loss": 0.003245977219194174, "eval_runtime": 35.0719, "eval_samples_per_second": 4281.884, "eval_steps_per_second": 66.92, "step": 5580 }, { "epoch": 0.5955678670360111, "grad_norm": 0.001491761882789433, "learning_rate": 8.088642659279779e-06, "loss": 0.0054, "step": 5590 }, { "epoch": 0.5955678670360111, "eval_loss": 0.0031598976347595453, "eval_runtime": 35.0642, "eval_samples_per_second": 4282.827, "eval_steps_per_second": 66.934, "step": 5590 }, { "epoch": 0.596633283613893, "grad_norm": 0.003034034511074424, "learning_rate": 8.06733432772214e-06, "loss": 0.0027, "step": 5600 }, { "epoch": 0.596633283613893, "eval_loss": 0.0031364411115646362, "eval_runtime": 35.0383, "eval_samples_per_second": 4285.992, "eval_steps_per_second": 66.984, "step": 5600 }, { "epoch": 0.597698700191775, "grad_norm": 0.004649047274142504, "learning_rate": 8.046025996164501e-06, "loss": 0.0003, "step": 5610 }, { "epoch": 0.597698700191775, "eval_loss": 0.0031330641359090805, "eval_runtime": 35.0411, "eval_samples_per_second": 4285.656, "eval_steps_per_second": 66.979, "step": 5610 }, { "epoch": 0.5987641167696569, "grad_norm": 0.06239793077111244, "learning_rate": 8.024717664606861e-06, "loss": 0.0098, "step": 5620 }, { "epoch": 0.5987641167696569, "eval_loss": 0.003176827682182193, "eval_runtime": 35.0911, "eval_samples_per_second": 4279.541, "eval_steps_per_second": 66.883, "step": 5620 }, { "epoch": 0.5998295333475389, "grad_norm": 5.838839530944824, "learning_rate": 8.003409333049223e-06, "loss": 0.0072, "step": 5630 }, { "epoch": 0.5998295333475389, "eval_loss": 0.003169504227116704, "eval_runtime": 35.0513, "eval_samples_per_second": 4284.4, "eval_steps_per_second": 66.959, "step": 5630 }, { "epoch": 0.6008949499254208, "grad_norm": 0.13171178102493286, "learning_rate": 7.982101001491583e-06, "loss": 0.0007, "step": 5640 }, { "epoch": 0.6008949499254208, "eval_loss": 0.003053726628422737, "eval_runtime": 35.0276, "eval_samples_per_second": 4287.3, "eval_steps_per_second": 67.004, "step": 5640 }, { "epoch": 0.6019603665033028, "grad_norm": 0.0015700625954195857, "learning_rate": 7.960792669933945e-06, "loss": 0.0065, "step": 5650 }, { "epoch": 0.6019603665033028, "eval_loss": 0.0035806247033178806, "eval_runtime": 35.0609, "eval_samples_per_second": 4283.23, "eval_steps_per_second": 66.941, "step": 5650 }, { "epoch": 0.6030257830811847, "grad_norm": 0.005014845635741949, "learning_rate": 7.939484338376305e-06, "loss": 0.0002, "step": 5660 }, { "epoch": 0.6030257830811847, "eval_loss": 0.004452229011803865, "eval_runtime": 35.0085, "eval_samples_per_second": 4289.644, "eval_steps_per_second": 67.041, "step": 5660 }, { "epoch": 0.6040911996590667, "grad_norm": 0.05154247581958771, "learning_rate": 7.918176006818667e-06, "loss": 0.0005, "step": 5670 }, { "epoch": 0.6040911996590667, "eval_loss": 0.004699897486716509, "eval_runtime": 35.0347, "eval_samples_per_second": 4286.437, "eval_steps_per_second": 66.991, "step": 5670 }, { "epoch": 0.6051566162369486, "grad_norm": 0.0041319397278130054, "learning_rate": 7.896867675261027e-06, "loss": 0.0028, "step": 5680 }, { "epoch": 0.6051566162369486, "eval_loss": 0.0034233941696584225, "eval_runtime": 35.0512, "eval_samples_per_second": 4284.419, "eval_steps_per_second": 66.959, "step": 5680 }, { "epoch": 0.6062220328148306, "grad_norm": 0.0012949644587934017, "learning_rate": 7.875559343703389e-06, "loss": 0.0001, "step": 5690 }, { "epoch": 0.6062220328148306, "eval_loss": 0.0033080654684454203, "eval_runtime": 35.0534, "eval_samples_per_second": 4284.15, "eval_steps_per_second": 66.955, "step": 5690 }, { "epoch": 0.6072874493927125, "grad_norm": 0.0013649500906467438, "learning_rate": 7.854251012145749e-06, "loss": 0.0069, "step": 5700 }, { "epoch": 0.6072874493927125, "eval_loss": 0.00345489289611578, "eval_runtime": 35.0169, "eval_samples_per_second": 4288.615, "eval_steps_per_second": 67.025, "step": 5700 }, { "epoch": 0.6083528659705945, "grad_norm": 0.29954442381858826, "learning_rate": 7.832942680588111e-06, "loss": 0.0013, "step": 5710 }, { "epoch": 0.6083528659705945, "eval_loss": 0.003461030311882496, "eval_runtime": 35.0232, "eval_samples_per_second": 4287.846, "eval_steps_per_second": 67.013, "step": 5710 }, { "epoch": 0.6094182825484764, "grad_norm": 0.001273061498068273, "learning_rate": 7.811634349030471e-06, "loss": 0.0007, "step": 5720 }, { "epoch": 0.6094182825484764, "eval_loss": 0.0033160303719341755, "eval_runtime": 34.994, "eval_samples_per_second": 4291.424, "eval_steps_per_second": 67.069, "step": 5720 }, { "epoch": 0.6104836991263584, "grad_norm": 0.0023903066758066416, "learning_rate": 7.790326017472833e-06, "loss": 0.0007, "step": 5730 }, { "epoch": 0.6104836991263584, "eval_loss": 0.0033325697295367718, "eval_runtime": 35.0304, "eval_samples_per_second": 4286.968, "eval_steps_per_second": 66.999, "step": 5730 }, { "epoch": 0.6115491157042403, "grad_norm": 0.0014730616239830852, "learning_rate": 7.769017685915193e-06, "loss": 0.0084, "step": 5740 }, { "epoch": 0.6115491157042403, "eval_loss": 0.00339673925191164, "eval_runtime": 35.0363, "eval_samples_per_second": 4286.245, "eval_steps_per_second": 66.988, "step": 5740 }, { "epoch": 0.6126145322821223, "grad_norm": 0.19337864220142365, "learning_rate": 7.747709354357555e-06, "loss": 0.001, "step": 5750 }, { "epoch": 0.6126145322821223, "eval_loss": 0.003394161816686392, "eval_runtime": 35.0308, "eval_samples_per_second": 4286.914, "eval_steps_per_second": 66.998, "step": 5750 }, { "epoch": 0.6136799488600042, "grad_norm": 0.001471309456974268, "learning_rate": 7.726401022799915e-06, "loss": 0.0036, "step": 5760 }, { "epoch": 0.6136799488600042, "eval_loss": 0.003426865441724658, "eval_runtime": 35.0019, "eval_samples_per_second": 4290.458, "eval_steps_per_second": 67.054, "step": 5760 }, { "epoch": 0.6147453654378862, "grad_norm": 0.0012775680515915155, "learning_rate": 7.705092691242277e-06, "loss": 0.0003, "step": 5770 }, { "epoch": 0.6147453654378862, "eval_loss": 0.003422880545258522, "eval_runtime": 35.0286, "eval_samples_per_second": 4287.183, "eval_steps_per_second": 67.002, "step": 5770 }, { "epoch": 0.6158107820157681, "grad_norm": 0.0013159505324438214, "learning_rate": 7.683784359684637e-06, "loss": 0.0084, "step": 5780 }, { "epoch": 0.6158107820157681, "eval_loss": 0.0034935129806399345, "eval_runtime": 35.0203, "eval_samples_per_second": 4288.193, "eval_steps_per_second": 67.018, "step": 5780 }, { "epoch": 0.6168761985936502, "grad_norm": 0.0015752206090837717, "learning_rate": 7.662476028126999e-06, "loss": 0.0003, "step": 5790 }, { "epoch": 0.6168761985936502, "eval_loss": 0.003633267944678664, "eval_runtime": 35.0278, "eval_samples_per_second": 4287.283, "eval_steps_per_second": 67.004, "step": 5790 }, { "epoch": 0.6179416151715321, "grad_norm": 0.010617982596158981, "learning_rate": 7.641167696569359e-06, "loss": 0.005, "step": 5800 }, { "epoch": 0.6179416151715321, "eval_loss": 0.003393057268112898, "eval_runtime": 35.0376, "eval_samples_per_second": 4286.076, "eval_steps_per_second": 66.985, "step": 5800 }, { "epoch": 0.6190070317494141, "grad_norm": 2.5578744411468506, "learning_rate": 7.61985936501172e-06, "loss": 0.0063, "step": 5810 }, { "epoch": 0.6190070317494141, "eval_loss": 0.003425801871344447, "eval_runtime": 35.0481, "eval_samples_per_second": 4284.799, "eval_steps_per_second": 66.965, "step": 5810 }, { "epoch": 0.620072448327296, "grad_norm": 0.010255936533212662, "learning_rate": 7.598551033454081e-06, "loss": 0.0001, "step": 5820 }, { "epoch": 0.620072448327296, "eval_loss": 0.003427485004067421, "eval_runtime": 35.0171, "eval_samples_per_second": 4288.594, "eval_steps_per_second": 67.024, "step": 5820 }, { "epoch": 0.621137864905178, "grad_norm": 0.1109393909573555, "learning_rate": 7.577242701896442e-06, "loss": 0.0003, "step": 5830 }, { "epoch": 0.621137864905178, "eval_loss": 0.0034398355055600405, "eval_runtime": 35.0582, "eval_samples_per_second": 4283.569, "eval_steps_per_second": 66.946, "step": 5830 }, { "epoch": 0.6222032814830599, "grad_norm": 0.12083720415830612, "learning_rate": 7.555934370338803e-06, "loss": 0.0006, "step": 5840 }, { "epoch": 0.6222032814830599, "eval_loss": 0.003469038987532258, "eval_runtime": 35.0472, "eval_samples_per_second": 4284.904, "eval_steps_per_second": 66.967, "step": 5840 }, { "epoch": 0.6232686980609419, "grad_norm": 0.0032793928403407335, "learning_rate": 7.534626038781164e-06, "loss": 0.0025, "step": 5850 }, { "epoch": 0.6232686980609419, "eval_loss": 0.0036529472563415766, "eval_runtime": 35.0157, "eval_samples_per_second": 4288.764, "eval_steps_per_second": 67.027, "step": 5850 }, { "epoch": 0.6243341146388238, "grad_norm": 0.02544957958161831, "learning_rate": 7.513317707223525e-06, "loss": 0.0011, "step": 5860 }, { "epoch": 0.6243341146388238, "eval_loss": 0.0036848068702965975, "eval_runtime": 35.0117, "eval_samples_per_second": 4289.249, "eval_steps_per_second": 67.035, "step": 5860 }, { "epoch": 0.6253995312167058, "grad_norm": 0.005889591760933399, "learning_rate": 7.492009375665886e-06, "loss": 0.0052, "step": 5870 }, { "epoch": 0.6253995312167058, "eval_loss": 0.003564575221389532, "eval_runtime": 35.044, "eval_samples_per_second": 4285.3, "eval_steps_per_second": 66.973, "step": 5870 }, { "epoch": 0.6264649477945877, "grad_norm": 0.5814864635467529, "learning_rate": 7.470701044108247e-06, "loss": 0.0052, "step": 5880 }, { "epoch": 0.6264649477945877, "eval_loss": 0.0036007657181471586, "eval_runtime": 35.0287, "eval_samples_per_second": 4287.168, "eval_steps_per_second": 67.002, "step": 5880 }, { "epoch": 0.6275303643724697, "grad_norm": 0.009390910156071186, "learning_rate": 7.449392712550608e-06, "loss": 0.0018, "step": 5890 }, { "epoch": 0.6275303643724697, "eval_loss": 0.0035891227889806032, "eval_runtime": 35.0066, "eval_samples_per_second": 4289.876, "eval_steps_per_second": 67.044, "step": 5890 }, { "epoch": 0.6285957809503516, "grad_norm": 0.020240269601345062, "learning_rate": 7.428084380992969e-06, "loss": 0.0046, "step": 5900 }, { "epoch": 0.6285957809503516, "eval_loss": 0.0035373272839933634, "eval_runtime": 35.0097, "eval_samples_per_second": 4289.498, "eval_steps_per_second": 67.039, "step": 5900 }, { "epoch": 0.6296611975282336, "grad_norm": 0.10366514325141907, "learning_rate": 7.40677604943533e-06, "loss": 0.0004, "step": 5910 }, { "epoch": 0.6296611975282336, "eval_loss": 0.003493980038911104, "eval_runtime": 35.0399, "eval_samples_per_second": 4285.803, "eval_steps_per_second": 66.981, "step": 5910 }, { "epoch": 0.6307266141061155, "grad_norm": 0.03924533352255821, "learning_rate": 7.385467717877691e-06, "loss": 0.0001, "step": 5920 }, { "epoch": 0.6307266141061155, "eval_loss": 0.0034797810949385166, "eval_runtime": 35.024, "eval_samples_per_second": 4287.747, "eval_steps_per_second": 67.011, "step": 5920 }, { "epoch": 0.6317920306839975, "grad_norm": 0.011868029832839966, "learning_rate": 7.364159386320052e-06, "loss": 0.0109, "step": 5930 }, { "epoch": 0.6317920306839975, "eval_loss": 0.0033822518307715654, "eval_runtime": 35.0226, "eval_samples_per_second": 4287.919, "eval_steps_per_second": 67.014, "step": 5930 }, { "epoch": 0.6328574472618794, "grad_norm": 1.3012027740478516, "learning_rate": 7.342851054762413e-06, "loss": 0.0049, "step": 5940 }, { "epoch": 0.6328574472618794, "eval_loss": 0.0033151600509881973, "eval_runtime": 35.0547, "eval_samples_per_second": 4283.989, "eval_steps_per_second": 66.952, "step": 5940 }, { "epoch": 0.6339228638397614, "grad_norm": 0.0013931491412222385, "learning_rate": 7.321542723204774e-06, "loss": 0.0052, "step": 5950 }, { "epoch": 0.6339228638397614, "eval_loss": 0.00324883870780468, "eval_runtime": 35.0289, "eval_samples_per_second": 4287.151, "eval_steps_per_second": 67.002, "step": 5950 }, { "epoch": 0.6349882804176433, "grad_norm": 0.05665739253163338, "learning_rate": 7.300234391647134e-06, "loss": 0.0003, "step": 5960 }, { "epoch": 0.6349882804176433, "eval_loss": 0.003316541202366352, "eval_runtime": 35.0598, "eval_samples_per_second": 4283.368, "eval_steps_per_second": 66.943, "step": 5960 }, { "epoch": 0.6360536969955253, "grad_norm": 0.014257961884140968, "learning_rate": 7.278926060089495e-06, "loss": 0.0061, "step": 5970 }, { "epoch": 0.6360536969955253, "eval_loss": 0.003145574824884534, "eval_runtime": 35.0387, "eval_samples_per_second": 4285.951, "eval_steps_per_second": 66.983, "step": 5970 }, { "epoch": 0.6371191135734072, "grad_norm": 0.019166210666298866, "learning_rate": 7.257617728531856e-06, "loss": 0.0049, "step": 5980 }, { "epoch": 0.6371191135734072, "eval_loss": 0.003021866548806429, "eval_runtime": 35.044, "eval_samples_per_second": 4285.301, "eval_steps_per_second": 66.973, "step": 5980 }, { "epoch": 0.6381845301512892, "grad_norm": 0.0279945507645607, "learning_rate": 7.236309396974217e-06, "loss": 0.0067, "step": 5990 }, { "epoch": 0.6381845301512892, "eval_loss": 0.002979971468448639, "eval_runtime": 34.9997, "eval_samples_per_second": 4290.723, "eval_steps_per_second": 67.058, "step": 5990 }, { "epoch": 0.6392499467291711, "grad_norm": 0.005042492412030697, "learning_rate": 7.215001065416578e-06, "loss": 0.0007, "step": 6000 }, { "epoch": 0.6392499467291711, "eval_loss": 0.0029915031045675278, "eval_runtime": 35.0149, "eval_samples_per_second": 4288.857, "eval_steps_per_second": 67.029, "step": 6000 }, { "epoch": 0.6403153633070531, "grad_norm": 0.0013033768627792597, "learning_rate": 7.193692733858939e-06, "loss": 0.0006, "step": 6010 }, { "epoch": 0.6403153633070531, "eval_loss": 0.002996724331751466, "eval_runtime": 35.0561, "eval_samples_per_second": 4283.823, "eval_steps_per_second": 66.95, "step": 6010 }, { "epoch": 0.641380779884935, "grad_norm": 0.0022245654836297035, "learning_rate": 7.1723844023013e-06, "loss": 0.0005, "step": 6020 }, { "epoch": 0.641380779884935, "eval_loss": 0.002998237032443285, "eval_runtime": 35.0276, "eval_samples_per_second": 4287.302, "eval_steps_per_second": 67.004, "step": 6020 }, { "epoch": 0.642446196462817, "grad_norm": 4.448103427886963, "learning_rate": 7.151076070743661e-06, "loss": 0.0092, "step": 6030 }, { "epoch": 0.642446196462817, "eval_loss": 0.0030165978241711855, "eval_runtime": 35.0225, "eval_samples_per_second": 4287.935, "eval_steps_per_second": 67.014, "step": 6030 }, { "epoch": 0.6435116130406989, "grad_norm": 0.0021644230000674725, "learning_rate": 7.129767739186022e-06, "loss": 0.0017, "step": 6040 }, { "epoch": 0.6435116130406989, "eval_loss": 0.0030744208488613367, "eval_runtime": 35.0525, "eval_samples_per_second": 4284.256, "eval_steps_per_second": 66.957, "step": 6040 }, { "epoch": 0.6445770296185809, "grad_norm": 0.0013590834569185972, "learning_rate": 7.108459407628383e-06, "loss": 0.0061, "step": 6050 }, { "epoch": 0.6445770296185809, "eval_loss": 0.003088417463004589, "eval_runtime": 35.0506, "eval_samples_per_second": 4284.487, "eval_steps_per_second": 66.96, "step": 6050 }, { "epoch": 0.6456424461964628, "grad_norm": 0.15340279042720795, "learning_rate": 7.087151076070744e-06, "loss": 0.0017, "step": 6060 }, { "epoch": 0.6456424461964628, "eval_loss": 0.0031277111265808344, "eval_runtime": 35.0518, "eval_samples_per_second": 4284.343, "eval_steps_per_second": 66.958, "step": 6060 }, { "epoch": 0.6467078627743448, "grad_norm": 0.03221344202756882, "learning_rate": 7.065842744513105e-06, "loss": 0.0027, "step": 6070 }, { "epoch": 0.6467078627743448, "eval_loss": 0.0032699485309422016, "eval_runtime": 35.0636, "eval_samples_per_second": 4282.901, "eval_steps_per_second": 66.935, "step": 6070 }, { "epoch": 0.6477732793522267, "grad_norm": 0.0018749163718894124, "learning_rate": 7.044534412955466e-06, "loss": 0.0008, "step": 6080 }, { "epoch": 0.6477732793522267, "eval_loss": 0.00332645233720541, "eval_runtime": 35.0515, "eval_samples_per_second": 4284.379, "eval_steps_per_second": 66.959, "step": 6080 }, { "epoch": 0.6488386959301087, "grad_norm": 0.0626567080616951, "learning_rate": 7.023226081397827e-06, "loss": 0.0001, "step": 6090 }, { "epoch": 0.6488386959301087, "eval_loss": 0.003338114358484745, "eval_runtime": 35.0324, "eval_samples_per_second": 4286.713, "eval_steps_per_second": 66.995, "step": 6090 }, { "epoch": 0.6499041125079906, "grad_norm": 0.0010921815410256386, "learning_rate": 7.001917749840188e-06, "loss": 0.0007, "step": 6100 }, { "epoch": 0.6499041125079906, "eval_loss": 0.0033058812841773033, "eval_runtime": 35.0347, "eval_samples_per_second": 4286.435, "eval_steps_per_second": 66.991, "step": 6100 }, { "epoch": 0.6509695290858726, "grad_norm": 0.0011606470216065645, "learning_rate": 6.980609418282549e-06, "loss": 0.0006, "step": 6110 }, { "epoch": 0.6509695290858726, "eval_loss": 0.0032852557487785816, "eval_runtime": 35.1398, "eval_samples_per_second": 4273.614, "eval_steps_per_second": 66.79, "step": 6110 }, { "epoch": 0.6520349456637545, "grad_norm": 0.0011158619308844209, "learning_rate": 6.95930108672491e-06, "loss": 0.0038, "step": 6120 }, { "epoch": 0.6520349456637545, "eval_loss": 0.0032803104259073734, "eval_runtime": 35.166, "eval_samples_per_second": 4270.431, "eval_steps_per_second": 66.741, "step": 6120 }, { "epoch": 0.6531003622416365, "grad_norm": 0.3906470537185669, "learning_rate": 6.937992755167271e-06, "loss": 0.0012, "step": 6130 }, { "epoch": 0.6531003622416365, "eval_loss": 0.003150229575112462, "eval_runtime": 35.0766, "eval_samples_per_second": 4281.314, "eval_steps_per_second": 66.911, "step": 6130 }, { "epoch": 0.6541657788195184, "grad_norm": 0.022889362648129463, "learning_rate": 6.916684423609632e-06, "loss": 0.001, "step": 6140 }, { "epoch": 0.6541657788195184, "eval_loss": 0.0030826658476144075, "eval_runtime": 35.0315, "eval_samples_per_second": 4286.834, "eval_steps_per_second": 66.997, "step": 6140 }, { "epoch": 0.6552311953974004, "grad_norm": 0.0011571752838790417, "learning_rate": 6.895376092051993e-06, "loss": 0.0002, "step": 6150 }, { "epoch": 0.6552311953974004, "eval_loss": 0.003100884146988392, "eval_runtime": 35.065, "eval_samples_per_second": 4282.728, "eval_steps_per_second": 66.933, "step": 6150 }, { "epoch": 0.6562966119752823, "grad_norm": 0.0019666426815092564, "learning_rate": 6.874067760494354e-06, "loss": 0.0002, "step": 6160 }, { "epoch": 0.6562966119752823, "eval_loss": 0.0031208472792059183, "eval_runtime": 35.064, "eval_samples_per_second": 4282.858, "eval_steps_per_second": 66.935, "step": 6160 }, { "epoch": 0.6573620285531643, "grad_norm": 0.0021635943558067083, "learning_rate": 6.852759428936715e-06, "loss": 0.0143, "step": 6170 }, { "epoch": 0.6573620285531643, "eval_loss": 0.0030296596232801676, "eval_runtime": 35.056, "eval_samples_per_second": 4283.836, "eval_steps_per_second": 66.95, "step": 6170 }, { "epoch": 0.6584274451310462, "grad_norm": 0.001522368867881596, "learning_rate": 6.831451097379076e-06, "loss": 0.0004, "step": 6180 }, { "epoch": 0.6584274451310462, "eval_loss": 0.003006124868988991, "eval_runtime": 35.0506, "eval_samples_per_second": 4284.497, "eval_steps_per_second": 66.96, "step": 6180 }, { "epoch": 0.6594928617089282, "grad_norm": 0.001939168432727456, "learning_rate": 6.810142765821437e-06, "loss": 0.0007, "step": 6190 }, { "epoch": 0.6594928617089282, "eval_loss": 0.003031767439097166, "eval_runtime": 35.07, "eval_samples_per_second": 4282.121, "eval_steps_per_second": 66.923, "step": 6190 }, { "epoch": 0.6605582782868101, "grad_norm": 0.0015014013042673469, "learning_rate": 6.788834434263798e-06, "loss": 0.0003, "step": 6200 }, { "epoch": 0.6605582782868101, "eval_loss": 0.003057195106521249, "eval_runtime": 35.0456, "eval_samples_per_second": 4285.1, "eval_steps_per_second": 66.97, "step": 6200 }, { "epoch": 0.6616236948646921, "grad_norm": 0.01135373953729868, "learning_rate": 6.767526102706159e-06, "loss": 0.0022, "step": 6210 }, { "epoch": 0.6616236948646921, "eval_loss": 0.0030318093486130238, "eval_runtime": 35.0502, "eval_samples_per_second": 4284.546, "eval_steps_per_second": 66.961, "step": 6210 }, { "epoch": 0.662689111442574, "grad_norm": 0.002891425509005785, "learning_rate": 6.74621777114852e-06, "loss": 0.0076, "step": 6220 }, { "epoch": 0.662689111442574, "eval_loss": 0.0030160024762153625, "eval_runtime": 35.0854, "eval_samples_per_second": 4280.246, "eval_steps_per_second": 66.894, "step": 6220 }, { "epoch": 0.663754528020456, "grad_norm": 0.004777186084538698, "learning_rate": 6.724909439590881e-06, "loss": 0.0025, "step": 6230 }, { "epoch": 0.663754528020456, "eval_loss": 0.0030639716424047947, "eval_runtime": 35.0034, "eval_samples_per_second": 4290.265, "eval_steps_per_second": 67.051, "step": 6230 }, { "epoch": 0.6648199445983379, "grad_norm": 0.19623669981956482, "learning_rate": 6.703601108033242e-06, "loss": 0.0231, "step": 6240 }, { "epoch": 0.6648199445983379, "eval_loss": 0.003115487052127719, "eval_runtime": 35.0302, "eval_samples_per_second": 4286.99, "eval_steps_per_second": 66.999, "step": 6240 }, { "epoch": 0.6658853611762199, "grad_norm": 0.0012964721536263824, "learning_rate": 6.682292776475603e-06, "loss": 0.0032, "step": 6250 }, { "epoch": 0.6658853611762199, "eval_loss": 0.003056860063225031, "eval_runtime": 35.0191, "eval_samples_per_second": 4288.342, "eval_steps_per_second": 67.021, "step": 6250 }, { "epoch": 0.6669507777541018, "grad_norm": 0.001261876430362463, "learning_rate": 6.660984444917964e-06, "loss": 0.0061, "step": 6260 }, { "epoch": 0.6669507777541018, "eval_loss": 0.0029731402173638344, "eval_runtime": 35.0181, "eval_samples_per_second": 4288.465, "eval_steps_per_second": 67.022, "step": 6260 }, { "epoch": 0.6680161943319838, "grad_norm": 0.05022572726011276, "learning_rate": 6.639676113360325e-06, "loss": 0.0005, "step": 6270 }, { "epoch": 0.6680161943319838, "eval_loss": 0.0029431069269776344, "eval_runtime": 35.033, "eval_samples_per_second": 4286.645, "eval_steps_per_second": 66.994, "step": 6270 }, { "epoch": 0.6690816109098657, "grad_norm": 0.0013886064989492297, "learning_rate": 6.618367781802686e-06, "loss": 0.0012, "step": 6280 }, { "epoch": 0.6690816109098657, "eval_loss": 0.0029925217386335135, "eval_runtime": 35.0227, "eval_samples_per_second": 4287.908, "eval_steps_per_second": 67.014, "step": 6280 }, { "epoch": 0.6701470274877477, "grad_norm": 0.0013931123539805412, "learning_rate": 6.597059450245047e-06, "loss": 0.0012, "step": 6290 }, { "epoch": 0.6701470274877477, "eval_loss": 0.003136566374450922, "eval_runtime": 35.002, "eval_samples_per_second": 4290.435, "eval_steps_per_second": 67.053, "step": 6290 }, { "epoch": 0.6712124440656296, "grad_norm": 0.13292770087718964, "learning_rate": 6.575751118687407e-06, "loss": 0.0034, "step": 6300 }, { "epoch": 0.6712124440656296, "eval_loss": 0.0031445687636733055, "eval_runtime": 35.0137, "eval_samples_per_second": 4289.007, "eval_steps_per_second": 67.031, "step": 6300 }, { "epoch": 0.6722778606435116, "grad_norm": 0.24445843696594238, "learning_rate": 6.554442787129768e-06, "loss": 0.0052, "step": 6310 }, { "epoch": 0.6722778606435116, "eval_loss": 0.003103400580585003, "eval_runtime": 35.0057, "eval_samples_per_second": 4289.981, "eval_steps_per_second": 67.046, "step": 6310 }, { "epoch": 0.6733432772213935, "grad_norm": 0.0012035582913085818, "learning_rate": 6.533134455572129e-06, "loss": 0.0006, "step": 6320 }, { "epoch": 0.6733432772213935, "eval_loss": 0.003125393996015191, "eval_runtime": 35.0617, "eval_samples_per_second": 4283.134, "eval_steps_per_second": 66.939, "step": 6320 }, { "epoch": 0.6744086937992755, "grad_norm": 0.0018411766504868865, "learning_rate": 6.51182612401449e-06, "loss": 0.0003, "step": 6330 }, { "epoch": 0.6744086937992755, "eval_loss": 0.0031454197596758604, "eval_runtime": 35.0566, "eval_samples_per_second": 4283.761, "eval_steps_per_second": 66.949, "step": 6330 }, { "epoch": 0.6754741103771574, "grad_norm": 0.0026676368433982134, "learning_rate": 6.490517792456851e-06, "loss": 0.0005, "step": 6340 }, { "epoch": 0.6754741103771574, "eval_loss": 0.0031759522389620543, "eval_runtime": 35.0199, "eval_samples_per_second": 4288.251, "eval_steps_per_second": 67.019, "step": 6340 }, { "epoch": 0.6765395269550394, "grad_norm": 0.11416032165288925, "learning_rate": 6.469209460899212e-06, "loss": 0.0005, "step": 6350 }, { "epoch": 0.6765395269550394, "eval_loss": 0.0031935395672917366, "eval_runtime": 35.0086, "eval_samples_per_second": 4289.63, "eval_steps_per_second": 67.041, "step": 6350 }, { "epoch": 0.6776049435329213, "grad_norm": 0.08742302656173706, "learning_rate": 6.447901129341573e-06, "loss": 0.0056, "step": 6360 }, { "epoch": 0.6776049435329213, "eval_loss": 0.0031826442573219538, "eval_runtime": 35.021, "eval_samples_per_second": 4288.113, "eval_steps_per_second": 67.017, "step": 6360 }, { "epoch": 0.6786703601108033, "grad_norm": 0.04388425499200821, "learning_rate": 6.426592797783934e-06, "loss": 0.0002, "step": 6370 }, { "epoch": 0.6786703601108033, "eval_loss": 0.003147592768073082, "eval_runtime": 34.9947, "eval_samples_per_second": 4291.33, "eval_steps_per_second": 67.067, "step": 6370 }, { "epoch": 0.6797357766886852, "grad_norm": 0.37059757113456726, "learning_rate": 6.405284466226295e-06, "loss": 0.001, "step": 6380 }, { "epoch": 0.6797357766886852, "eval_loss": 0.003108437405899167, "eval_runtime": 35.0218, "eval_samples_per_second": 4288.015, "eval_steps_per_second": 67.015, "step": 6380 }, { "epoch": 0.6808011932665672, "grad_norm": 0.03148869425058365, "learning_rate": 6.383976134668656e-06, "loss": 0.0043, "step": 6390 }, { "epoch": 0.6808011932665672, "eval_loss": 0.0030745782423764467, "eval_runtime": 35.0146, "eval_samples_per_second": 4288.898, "eval_steps_per_second": 67.029, "step": 6390 }, { "epoch": 0.6818666098444491, "grad_norm": 0.0011743833310902119, "learning_rate": 6.362667803111017e-06, "loss": 0.0003, "step": 6400 }, { "epoch": 0.6818666098444491, "eval_loss": 0.0030515496619045734, "eval_runtime": 34.9735, "eval_samples_per_second": 4293.942, "eval_steps_per_second": 67.108, "step": 6400 }, { "epoch": 0.6829320264223311, "grad_norm": 5.519503116607666, "learning_rate": 6.341359471553378e-06, "loss": 0.0112, "step": 6410 }, { "epoch": 0.6829320264223311, "eval_loss": 0.0030856935773044825, "eval_runtime": 35.0092, "eval_samples_per_second": 4289.562, "eval_steps_per_second": 67.04, "step": 6410 }, { "epoch": 0.683997443000213, "grad_norm": 0.0011778927873820066, "learning_rate": 6.320051139995739e-06, "loss": 0.0028, "step": 6420 }, { "epoch": 0.683997443000213, "eval_loss": 0.0030590456444770098, "eval_runtime": 35.0267, "eval_samples_per_second": 4287.421, "eval_steps_per_second": 67.006, "step": 6420 }, { "epoch": 0.685062859578095, "grad_norm": 0.19133904576301575, "learning_rate": 6.2987428084381e-06, "loss": 0.0007, "step": 6430 }, { "epoch": 0.685062859578095, "eval_loss": 0.0030688135884702206, "eval_runtime": 35.0199, "eval_samples_per_second": 4288.245, "eval_steps_per_second": 67.019, "step": 6430 }, { "epoch": 0.6861282761559769, "grad_norm": 4.050024509429932, "learning_rate": 6.277434476880461e-06, "loss": 0.013, "step": 6440 }, { "epoch": 0.6861282761559769, "eval_loss": 0.0031101179774850607, "eval_runtime": 35.0365, "eval_samples_per_second": 4286.212, "eval_steps_per_second": 66.987, "step": 6440 }, { "epoch": 0.6871936927338589, "grad_norm": 0.0026636181864887476, "learning_rate": 6.256126145322822e-06, "loss": 0.0181, "step": 6450 }, { "epoch": 0.6871936927338589, "eval_loss": 0.0030249811243265867, "eval_runtime": 35.0288, "eval_samples_per_second": 4287.161, "eval_steps_per_second": 67.002, "step": 6450 }, { "epoch": 0.6882591093117408, "grad_norm": 0.0036579566076397896, "learning_rate": 6.234817813765183e-06, "loss": 0.0005, "step": 6460 }, { "epoch": 0.6882591093117408, "eval_loss": 0.0030106704216450453, "eval_runtime": 35.0327, "eval_samples_per_second": 4286.684, "eval_steps_per_second": 66.995, "step": 6460 }, { "epoch": 0.6893245258896229, "grad_norm": 0.003752629505470395, "learning_rate": 6.213509482207544e-06, "loss": 0.0006, "step": 6470 }, { "epoch": 0.6893245258896229, "eval_loss": 0.0030355704948306084, "eval_runtime": 35.0648, "eval_samples_per_second": 4282.758, "eval_steps_per_second": 66.933, "step": 6470 }, { "epoch": 0.6903899424675048, "grad_norm": 0.06187931075692177, "learning_rate": 6.192201150649905e-06, "loss": 0.0014, "step": 6480 }, { "epoch": 0.6903899424675048, "eval_loss": 0.003116002306342125, "eval_runtime": 35.0665, "eval_samples_per_second": 4282.551, "eval_steps_per_second": 66.93, "step": 6480 }, { "epoch": 0.6914553590453868, "grad_norm": 0.03547167405486107, "learning_rate": 6.1708928190922656e-06, "loss": 0.0002, "step": 6490 }, { "epoch": 0.6914553590453868, "eval_loss": 0.003167262999340892, "eval_runtime": 35.0077, "eval_samples_per_second": 4289.742, "eval_steps_per_second": 67.042, "step": 6490 }, { "epoch": 0.6925207756232687, "grad_norm": 0.04050152748823166, "learning_rate": 6.1495844875346266e-06, "loss": 0.0003, "step": 6500 }, { "epoch": 0.6925207756232687, "eval_loss": 0.0032066998537629843, "eval_runtime": 35.0207, "eval_samples_per_second": 4288.147, "eval_steps_per_second": 67.017, "step": 6500 }, { "epoch": 0.6935861922011507, "grad_norm": 0.14706210792064667, "learning_rate": 6.1282761559769876e-06, "loss": 0.0001, "step": 6510 }, { "epoch": 0.6935861922011507, "eval_loss": 0.0032550478354096413, "eval_runtime": 35.0392, "eval_samples_per_second": 4285.881, "eval_steps_per_second": 66.982, "step": 6510 }, { "epoch": 0.6946516087790326, "grad_norm": 1.0719351768493652, "learning_rate": 6.1069678244193485e-06, "loss": 0.0039, "step": 6520 }, { "epoch": 0.6946516087790326, "eval_loss": 0.003266693092882633, "eval_runtime": 35.0566, "eval_samples_per_second": 4283.762, "eval_steps_per_second": 66.949, "step": 6520 }, { "epoch": 0.6957170253569146, "grad_norm": 0.0011848441790789366, "learning_rate": 6.0856594928617095e-06, "loss": 0.0002, "step": 6530 }, { "epoch": 0.6957170253569146, "eval_loss": 0.0031671386677771807, "eval_runtime": 35.0738, "eval_samples_per_second": 4281.652, "eval_steps_per_second": 66.916, "step": 6530 }, { "epoch": 0.6967824419347965, "grad_norm": 0.042776867747306824, "learning_rate": 6.0643511613040705e-06, "loss": 0.0004, "step": 6540 }, { "epoch": 0.6967824419347965, "eval_loss": 0.003157460829243064, "eval_runtime": 35.0847, "eval_samples_per_second": 4280.324, "eval_steps_per_second": 66.895, "step": 6540 }, { "epoch": 0.6978478585126785, "grad_norm": 0.1637280434370041, "learning_rate": 6.0430428297464315e-06, "loss": 0.0006, "step": 6550 }, { "epoch": 0.6978478585126785, "eval_loss": 0.00318445498123765, "eval_runtime": 35.0605, "eval_samples_per_second": 4283.276, "eval_steps_per_second": 66.941, "step": 6550 }, { "epoch": 0.6989132750905604, "grad_norm": 0.04782974347472191, "learning_rate": 6.0217344981887925e-06, "loss": 0.0002, "step": 6560 }, { "epoch": 0.6989132750905604, "eval_loss": 0.0032346732914447784, "eval_runtime": 35.05, "eval_samples_per_second": 4284.569, "eval_steps_per_second": 66.962, "step": 6560 }, { "epoch": 0.6999786916684424, "grad_norm": 0.003285630140453577, "learning_rate": 6.0004261666311535e-06, "loss": 0.0067, "step": 6570 }, { "epoch": 0.6999786916684424, "eval_loss": 0.003163369372487068, "eval_runtime": 35.0327, "eval_samples_per_second": 4286.681, "eval_steps_per_second": 66.995, "step": 6570 }, { "epoch": 0.7010441082463243, "grad_norm": 0.0016075136372819543, "learning_rate": 5.9791178350735145e-06, "loss": 0.0004, "step": 6580 }, { "epoch": 0.7010441082463243, "eval_loss": 0.003068899270147085, "eval_runtime": 35.0951, "eval_samples_per_second": 4279.057, "eval_steps_per_second": 66.875, "step": 6580 }, { "epoch": 0.7021095248242063, "grad_norm": 0.0011133512016385794, "learning_rate": 5.9578095035158755e-06, "loss": 0.0014, "step": 6590 }, { "epoch": 0.7021095248242063, "eval_loss": 0.003099815221503377, "eval_runtime": 35.0502, "eval_samples_per_second": 4284.536, "eval_steps_per_second": 66.961, "step": 6590 }, { "epoch": 0.7031749414020882, "grad_norm": 0.002385763917118311, "learning_rate": 5.9365011719582365e-06, "loss": 0.0002, "step": 6600 }, { "epoch": 0.7031749414020882, "eval_loss": 0.003167761955410242, "eval_runtime": 35.0594, "eval_samples_per_second": 4283.418, "eval_steps_per_second": 66.944, "step": 6600 }, { "epoch": 0.7042403579799702, "grad_norm": 0.0011592097580432892, "learning_rate": 5.9151928404005975e-06, "loss": 0.0031, "step": 6610 }, { "epoch": 0.7042403579799702, "eval_loss": 0.0031276061199605465, "eval_runtime": 35.0211, "eval_samples_per_second": 4288.1, "eval_steps_per_second": 67.017, "step": 6610 }, { "epoch": 0.7053057745578521, "grad_norm": 0.0014141725841909647, "learning_rate": 5.8938845088429584e-06, "loss": 0.0002, "step": 6620 }, { "epoch": 0.7053057745578521, "eval_loss": 0.0030569627415388823, "eval_runtime": 35.0162, "eval_samples_per_second": 4288.698, "eval_steps_per_second": 67.026, "step": 6620 }, { "epoch": 0.7063711911357341, "grad_norm": 0.0018372322665527463, "learning_rate": 5.8725761772853194e-06, "loss": 0.008, "step": 6630 }, { "epoch": 0.7063711911357341, "eval_loss": 0.003044996177777648, "eval_runtime": 35.0321, "eval_samples_per_second": 4286.754, "eval_steps_per_second": 66.996, "step": 6630 }, { "epoch": 0.707436607713616, "grad_norm": 0.0027874810621142387, "learning_rate": 5.8512678457276796e-06, "loss": 0.0012, "step": 6640 }, { "epoch": 0.707436607713616, "eval_loss": 0.0030959330033510923, "eval_runtime": 35.0844, "eval_samples_per_second": 4280.361, "eval_steps_per_second": 66.896, "step": 6640 }, { "epoch": 0.708502024291498, "grad_norm": 2.3545823097229004, "learning_rate": 5.8299595141700406e-06, "loss": 0.0009, "step": 6650 }, { "epoch": 0.708502024291498, "eval_loss": 0.003153095720335841, "eval_runtime": 34.9933, "eval_samples_per_second": 4291.507, "eval_steps_per_second": 67.07, "step": 6650 }, { "epoch": 0.70956744086938, "grad_norm": 0.0011235169367864728, "learning_rate": 5.8086511826124016e-06, "loss": 0.0005, "step": 6660 }, { "epoch": 0.70956744086938, "eval_loss": 0.0032801416236907244, "eval_runtime": 35.0278, "eval_samples_per_second": 4287.279, "eval_steps_per_second": 67.004, "step": 6660 }, { "epoch": 0.7106328574472619, "grad_norm": 0.023665864020586014, "learning_rate": 5.7873428510547625e-06, "loss": 0.0004, "step": 6670 }, { "epoch": 0.7106328574472619, "eval_loss": 0.0033080640714615583, "eval_runtime": 35.0146, "eval_samples_per_second": 4288.902, "eval_steps_per_second": 67.029, "step": 6670 }, { "epoch": 0.7116982740251439, "grad_norm": 0.00826460961252451, "learning_rate": 5.7660345194971235e-06, "loss": 0.0006, "step": 6680 }, { "epoch": 0.7116982740251439, "eval_loss": 0.003354353830218315, "eval_runtime": 35.1018, "eval_samples_per_second": 4278.238, "eval_steps_per_second": 66.863, "step": 6680 }, { "epoch": 0.7127636906030258, "grad_norm": 0.2588113248348236, "learning_rate": 5.7447261879394845e-06, "loss": 0.001, "step": 6690 }, { "epoch": 0.7127636906030258, "eval_loss": 0.00345269194804132, "eval_runtime": 35.0432, "eval_samples_per_second": 4285.398, "eval_steps_per_second": 66.975, "step": 6690 }, { "epoch": 0.7138291071809078, "grad_norm": 0.0016366565832868218, "learning_rate": 5.7234178563818455e-06, "loss": 0.0001, "step": 6700 }, { "epoch": 0.7138291071809078, "eval_loss": 0.0035686830524355173, "eval_runtime": 35.0532, "eval_samples_per_second": 4284.173, "eval_steps_per_second": 66.955, "step": 6700 }, { "epoch": 0.7148945237587897, "grad_norm": 0.0024288988206535578, "learning_rate": 5.7021095248242065e-06, "loss": 0.0051, "step": 6710 }, { "epoch": 0.7148945237587897, "eval_loss": 0.0036059534177184105, "eval_runtime": 35.0346, "eval_samples_per_second": 4286.443, "eval_steps_per_second": 66.991, "step": 6710 }, { "epoch": 0.7159599403366717, "grad_norm": 0.0010271297069266438, "learning_rate": 5.6808011932665675e-06, "loss": 0.0032, "step": 6720 }, { "epoch": 0.7159599403366717, "eval_loss": 0.0035558068193495274, "eval_runtime": 35.0227, "eval_samples_per_second": 4287.901, "eval_steps_per_second": 67.014, "step": 6720 }, { "epoch": 0.7170253569145536, "grad_norm": 0.0013646967709064484, "learning_rate": 5.6594928617089285e-06, "loss": 0.0001, "step": 6730 }, { "epoch": 0.7170253569145536, "eval_loss": 0.003483639331534505, "eval_runtime": 35.0068, "eval_samples_per_second": 4289.851, "eval_steps_per_second": 67.044, "step": 6730 }, { "epoch": 0.7180907734924356, "grad_norm": 0.0010676413075998425, "learning_rate": 5.6381845301512895e-06, "loss": 0.0003, "step": 6740 }, { "epoch": 0.7180907734924356, "eval_loss": 0.0034696413204073906, "eval_runtime": 35.0092, "eval_samples_per_second": 4289.554, "eval_steps_per_second": 67.039, "step": 6740 }, { "epoch": 0.7191561900703175, "grad_norm": 0.0070797838270664215, "learning_rate": 5.6168761985936505e-06, "loss": 0.0002, "step": 6750 }, { "epoch": 0.7191561900703175, "eval_loss": 0.003489007707685232, "eval_runtime": 35.0141, "eval_samples_per_second": 4288.963, "eval_steps_per_second": 67.03, "step": 6750 }, { "epoch": 0.7202216066481995, "grad_norm": 0.0010801940225064754, "learning_rate": 5.5955678670360115e-06, "loss": 0.0055, "step": 6760 }, { "epoch": 0.7202216066481995, "eval_loss": 0.003356917528435588, "eval_runtime": 35.0544, "eval_samples_per_second": 4284.03, "eval_steps_per_second": 66.953, "step": 6760 }, { "epoch": 0.7212870232260814, "grad_norm": 0.0018471528310328722, "learning_rate": 5.5742595354783724e-06, "loss": 0.0002, "step": 6770 }, { "epoch": 0.7212870232260814, "eval_loss": 0.00331767532043159, "eval_runtime": 35.06, "eval_samples_per_second": 4283.344, "eval_steps_per_second": 66.942, "step": 6770 }, { "epoch": 0.7223524398039634, "grad_norm": 0.0017673600232228637, "learning_rate": 5.5529512039207334e-06, "loss": 0.0056, "step": 6780 }, { "epoch": 0.7223524398039634, "eval_loss": 0.0031172942835837603, "eval_runtime": 35.0612, "eval_samples_per_second": 4283.197, "eval_steps_per_second": 66.94, "step": 6780 }, { "epoch": 0.7234178563818453, "grad_norm": 0.0015435615787282586, "learning_rate": 5.5316428723630944e-06, "loss": 0.0007, "step": 6790 }, { "epoch": 0.7234178563818453, "eval_loss": 0.0030721002258360386, "eval_runtime": 35.0224, "eval_samples_per_second": 4287.935, "eval_steps_per_second": 67.014, "step": 6790 }, { "epoch": 0.7244832729597273, "grad_norm": 0.04698014259338379, "learning_rate": 5.510334540805455e-06, "loss": 0.001, "step": 6800 }, { "epoch": 0.7244832729597273, "eval_loss": 0.00307706487365067, "eval_runtime": 35.0244, "eval_samples_per_second": 4287.692, "eval_steps_per_second": 67.01, "step": 6800 }, { "epoch": 0.7255486895376092, "grad_norm": 0.002133553382009268, "learning_rate": 5.489026209247816e-06, "loss": 0.0001, "step": 6810 }, { "epoch": 0.7255486895376092, "eval_loss": 0.003107481636106968, "eval_runtime": 35.0466, "eval_samples_per_second": 4284.985, "eval_steps_per_second": 66.968, "step": 6810 }, { "epoch": 0.7266141061154912, "grad_norm": 0.0030837086960673332, "learning_rate": 5.467717877690177e-06, "loss": 0.0012, "step": 6820 }, { "epoch": 0.7266141061154912, "eval_loss": 0.003099891124293208, "eval_runtime": 35.0508, "eval_samples_per_second": 4284.463, "eval_steps_per_second": 66.96, "step": 6820 }, { "epoch": 0.7276795226933731, "grad_norm": 0.0010295656975358725, "learning_rate": 5.446409546132538e-06, "loss": 0.017, "step": 6830 }, { "epoch": 0.7276795226933731, "eval_loss": 0.0031501969788223505, "eval_runtime": 35.0735, "eval_samples_per_second": 4281.691, "eval_steps_per_second": 66.917, "step": 6830 }, { "epoch": 0.728744939271255, "grad_norm": 0.0011918977834284306, "learning_rate": 5.425101214574899e-06, "loss": 0.0015, "step": 6840 }, { "epoch": 0.728744939271255, "eval_loss": 0.0031602659728378057, "eval_runtime": 35.0557, "eval_samples_per_second": 4283.869, "eval_steps_per_second": 66.951, "step": 6840 }, { "epoch": 0.729810355849137, "grad_norm": 0.002359379781410098, "learning_rate": 5.40379288301726e-06, "loss": 0.0017, "step": 6850 }, { "epoch": 0.729810355849137, "eval_loss": 0.0032126172445714474, "eval_runtime": 35.0649, "eval_samples_per_second": 4282.74, "eval_steps_per_second": 66.933, "step": 6850 }, { "epoch": 0.730875772427019, "grad_norm": 0.002211513929069042, "learning_rate": 5.382484551459621e-06, "loss": 0.0001, "step": 6860 }, { "epoch": 0.730875772427019, "eval_loss": 0.0033024682197719812, "eval_runtime": 35.0584, "eval_samples_per_second": 4283.538, "eval_steps_per_second": 66.945, "step": 6860 }, { "epoch": 0.7319411890049009, "grad_norm": 0.0362793393433094, "learning_rate": 5.361176219901982e-06, "loss": 0.0138, "step": 6870 }, { "epoch": 0.7319411890049009, "eval_loss": 0.003259913297370076, "eval_runtime": 35.0459, "eval_samples_per_second": 4285.062, "eval_steps_per_second": 66.969, "step": 6870 }, { "epoch": 0.7330066055827829, "grad_norm": 0.0012098865117877722, "learning_rate": 5.339867888344343e-06, "loss": 0.0001, "step": 6880 }, { "epoch": 0.7330066055827829, "eval_loss": 0.0032443315722048283, "eval_runtime": 35.0439, "eval_samples_per_second": 4285.305, "eval_steps_per_second": 66.973, "step": 6880 }, { "epoch": 0.7340720221606648, "grad_norm": 0.0010898082982748747, "learning_rate": 5.318559556786704e-06, "loss": 0.0007, "step": 6890 }, { "epoch": 0.7340720221606648, "eval_loss": 0.0032403902150690556, "eval_runtime": 35.0508, "eval_samples_per_second": 4284.466, "eval_steps_per_second": 66.96, "step": 6890 }, { "epoch": 0.7351374387385468, "grad_norm": 0.016424862667918205, "learning_rate": 5.297251225229065e-06, "loss": 0.0163, "step": 6900 }, { "epoch": 0.7351374387385468, "eval_loss": 0.003254901384934783, "eval_runtime": 35.042, "eval_samples_per_second": 4285.543, "eval_steps_per_second": 66.977, "step": 6900 }, { "epoch": 0.7362028553164287, "grad_norm": 0.0012151696719229221, "learning_rate": 5.275942893671426e-06, "loss": 0.0002, "step": 6910 }, { "epoch": 0.7362028553164287, "eval_loss": 0.003262386191636324, "eval_runtime": 35.0134, "eval_samples_per_second": 4289.044, "eval_steps_per_second": 67.031, "step": 6910 }, { "epoch": 0.7372682718943107, "grad_norm": 0.3996301293373108, "learning_rate": 5.254634562113787e-06, "loss": 0.0087, "step": 6920 }, { "epoch": 0.7372682718943107, "eval_loss": 0.003233132418245077, "eval_runtime": 35.0817, "eval_samples_per_second": 4280.688, "eval_steps_per_second": 66.901, "step": 6920 }, { "epoch": 0.7383336884721926, "grad_norm": 0.0016513338778167963, "learning_rate": 5.233326230556148e-06, "loss": 0.0003, "step": 6930 }, { "epoch": 0.7383336884721926, "eval_loss": 0.003179131541401148, "eval_runtime": 35.056, "eval_samples_per_second": 4283.832, "eval_steps_per_second": 66.95, "step": 6930 }, { "epoch": 0.7393991050500746, "grad_norm": 0.0020407168194651604, "learning_rate": 5.212017898998509e-06, "loss": 0.0001, "step": 6940 }, { "epoch": 0.7393991050500746, "eval_loss": 0.003184954635798931, "eval_runtime": 35.0381, "eval_samples_per_second": 4286.014, "eval_steps_per_second": 66.984, "step": 6940 }, { "epoch": 0.7404645216279565, "grad_norm": 0.0016329142963513732, "learning_rate": 5.19070956744087e-06, "loss": 0.0001, "step": 6950 }, { "epoch": 0.7404645216279565, "eval_loss": 0.0031928608659654856, "eval_runtime": 35.0685, "eval_samples_per_second": 4282.299, "eval_steps_per_second": 66.926, "step": 6950 }, { "epoch": 0.7415299382058385, "grad_norm": 0.001757573802024126, "learning_rate": 5.169401235883231e-06, "loss": 0.0008, "step": 6960 }, { "epoch": 0.7415299382058385, "eval_loss": 0.0032357927411794662, "eval_runtime": 35.0463, "eval_samples_per_second": 4285.021, "eval_steps_per_second": 66.969, "step": 6960 }, { "epoch": 0.7425953547837204, "grad_norm": 0.0012253515888005495, "learning_rate": 5.148092904325592e-06, "loss": 0.0123, "step": 6970 }, { "epoch": 0.7425953547837204, "eval_loss": 0.003153954865410924, "eval_runtime": 35.056, "eval_samples_per_second": 4283.835, "eval_steps_per_second": 66.95, "step": 6970 }, { "epoch": 0.7436607713616024, "grad_norm": 0.011123016476631165, "learning_rate": 5.126784572767952e-06, "loss": 0.0002, "step": 6980 }, { "epoch": 0.7436607713616024, "eval_loss": 0.003115166211500764, "eval_runtime": 35.0253, "eval_samples_per_second": 4287.583, "eval_steps_per_second": 67.009, "step": 6980 }, { "epoch": 0.7447261879394843, "grad_norm": 0.0014360809000208974, "learning_rate": 5.105476241210313e-06, "loss": 0.0025, "step": 6990 }, { "epoch": 0.7447261879394843, "eval_loss": 0.0031400981824845076, "eval_runtime": 35.0588, "eval_samples_per_second": 4283.494, "eval_steps_per_second": 66.945, "step": 6990 }, { "epoch": 0.7457916045173663, "grad_norm": 0.11274624615907669, "learning_rate": 5.084167909652674e-06, "loss": 0.0034, "step": 7000 }, { "epoch": 0.7457916045173663, "eval_loss": 0.00313013419508934, "eval_runtime": 35.1033, "eval_samples_per_second": 4278.065, "eval_steps_per_second": 66.86, "step": 7000 }, { "epoch": 0.7468570210952482, "grad_norm": 0.0017726977821439505, "learning_rate": 5.062859578095035e-06, "loss": 0.0028, "step": 7010 }, { "epoch": 0.7468570210952482, "eval_loss": 0.003186985617503524, "eval_runtime": 35.0513, "eval_samples_per_second": 4284.409, "eval_steps_per_second": 66.959, "step": 7010 }, { "epoch": 0.7479224376731302, "grad_norm": 0.001665642368607223, "learning_rate": 5.041551246537396e-06, "loss": 0.0023, "step": 7020 }, { "epoch": 0.7479224376731302, "eval_loss": 0.003349791280925274, "eval_runtime": 35.0399, "eval_samples_per_second": 4285.805, "eval_steps_per_second": 66.981, "step": 7020 }, { "epoch": 0.7489878542510121, "grad_norm": 0.015697909519076347, "learning_rate": 5.020242914979757e-06, "loss": 0.0007, "step": 7030 }, { "epoch": 0.7489878542510121, "eval_loss": 0.003393676597625017, "eval_runtime": 35.0379, "eval_samples_per_second": 4286.046, "eval_steps_per_second": 66.985, "step": 7030 }, { "epoch": 0.7500532708288941, "grad_norm": 0.0011734378058463335, "learning_rate": 4.998934583422118e-06, "loss": 0.0013, "step": 7040 }, { "epoch": 0.7500532708288941, "eval_loss": 0.003516310593113303, "eval_runtime": 35.0437, "eval_samples_per_second": 4285.337, "eval_steps_per_second": 66.974, "step": 7040 }, { "epoch": 0.751118687406776, "grad_norm": 0.001946283970028162, "learning_rate": 4.977626251864479e-06, "loss": 0.0021, "step": 7050 }, { "epoch": 0.751118687406776, "eval_loss": 0.003597394796088338, "eval_runtime": 35.0327, "eval_samples_per_second": 4286.677, "eval_steps_per_second": 66.994, "step": 7050 }, { "epoch": 0.752184103984658, "grad_norm": 0.0019929111003875732, "learning_rate": 4.95631792030684e-06, "loss": 0.0004, "step": 7060 }, { "epoch": 0.752184103984658, "eval_loss": 0.003620902309194207, "eval_runtime": 35.0599, "eval_samples_per_second": 4283.349, "eval_steps_per_second": 66.942, "step": 7060 }, { "epoch": 0.7532495205625399, "grad_norm": 0.0011990427738055587, "learning_rate": 4.935009588749201e-06, "loss": 0.0018, "step": 7070 }, { "epoch": 0.7532495205625399, "eval_loss": 0.0034169661812484264, "eval_runtime": 35.0447, "eval_samples_per_second": 4285.211, "eval_steps_per_second": 66.972, "step": 7070 }, { "epoch": 0.7543149371404219, "grad_norm": 0.10688398033380508, "learning_rate": 4.913701257191562e-06, "loss": 0.0024, "step": 7080 }, { "epoch": 0.7543149371404219, "eval_loss": 0.0034140669740736485, "eval_runtime": 35.0525, "eval_samples_per_second": 4284.26, "eval_steps_per_second": 66.957, "step": 7080 }, { "epoch": 0.7553803537183038, "grad_norm": 0.005744527094066143, "learning_rate": 4.892392925633923e-06, "loss": 0.0007, "step": 7090 }, { "epoch": 0.7553803537183038, "eval_loss": 0.0033137863501906395, "eval_runtime": 35.0391, "eval_samples_per_second": 4285.902, "eval_steps_per_second": 66.982, "step": 7090 }, { "epoch": 0.7564457702961858, "grad_norm": 0.0011864439584314823, "learning_rate": 4.871084594076284e-06, "loss": 0.0004, "step": 7100 }, { "epoch": 0.7564457702961858, "eval_loss": 0.0032741157338023186, "eval_runtime": 35.0598, "eval_samples_per_second": 4283.366, "eval_steps_per_second": 66.943, "step": 7100 }, { "epoch": 0.7575111868740677, "grad_norm": 0.003718329593539238, "learning_rate": 4.849776262518645e-06, "loss": 0.0005, "step": 7110 }, { "epoch": 0.7575111868740677, "eval_loss": 0.0032938900403678417, "eval_runtime": 34.9977, "eval_samples_per_second": 4290.962, "eval_steps_per_second": 67.061, "step": 7110 }, { "epoch": 0.7585766034519497, "grad_norm": 0.0011979677947238088, "learning_rate": 4.828467930961006e-06, "loss": 0.0045, "step": 7120 }, { "epoch": 0.7585766034519497, "eval_loss": 0.0033634670544415712, "eval_runtime": 35.0131, "eval_samples_per_second": 4289.081, "eval_steps_per_second": 67.032, "step": 7120 }, { "epoch": 0.7596420200298316, "grad_norm": 0.0033819531090557575, "learning_rate": 4.807159599403367e-06, "loss": 0.0209, "step": 7130 }, { "epoch": 0.7596420200298316, "eval_loss": 0.0031614580657333136, "eval_runtime": 35.028, "eval_samples_per_second": 4287.261, "eval_steps_per_second": 67.004, "step": 7130 }, { "epoch": 0.7607074366077136, "grad_norm": 0.0051054502837359905, "learning_rate": 4.785851267845728e-06, "loss": 0.0107, "step": 7140 }, { "epoch": 0.7607074366077136, "eval_loss": 0.003080246038734913, "eval_runtime": 35.1422, "eval_samples_per_second": 4273.324, "eval_steps_per_second": 66.786, "step": 7140 }, { "epoch": 0.7617728531855956, "grad_norm": 0.13544993102550507, "learning_rate": 4.764542936288089e-06, "loss": 0.0006, "step": 7150 }, { "epoch": 0.7617728531855956, "eval_loss": 0.0030894039664417505, "eval_runtime": 35.0199, "eval_samples_per_second": 4288.25, "eval_steps_per_second": 67.019, "step": 7150 }, { "epoch": 0.7628382697634776, "grad_norm": 0.0017130186315625906, "learning_rate": 4.74323460473045e-06, "loss": 0.0001, "step": 7160 }, { "epoch": 0.7628382697634776, "eval_loss": 0.003118880558758974, "eval_runtime": 35.0319, "eval_samples_per_second": 4286.777, "eval_steps_per_second": 66.996, "step": 7160 }, { "epoch": 0.7639036863413595, "grad_norm": 1.1553536653518677, "learning_rate": 4.721926273172811e-06, "loss": 0.0155, "step": 7170 }, { "epoch": 0.7639036863413595, "eval_loss": 0.0030046890024095774, "eval_runtime": 35.0791, "eval_samples_per_second": 4281.007, "eval_steps_per_second": 66.906, "step": 7170 }, { "epoch": 0.7649691029192415, "grad_norm": 0.0015282640233635902, "learning_rate": 4.700617941615172e-06, "loss": 0.0071, "step": 7180 }, { "epoch": 0.7649691029192415, "eval_loss": 0.0028774854727089405, "eval_runtime": 35.0386, "eval_samples_per_second": 4285.954, "eval_steps_per_second": 66.983, "step": 7180 }, { "epoch": 0.7660345194971234, "grad_norm": 0.001786403707228601, "learning_rate": 4.679309610057533e-06, "loss": 0.0042, "step": 7190 }, { "epoch": 0.7660345194971234, "eval_loss": 0.002872324315831065, "eval_runtime": 35.0037, "eval_samples_per_second": 4290.235, "eval_steps_per_second": 67.05, "step": 7190 }, { "epoch": 0.7670999360750054, "grad_norm": 0.002205133670940995, "learning_rate": 4.658001278499894e-06, "loss": 0.0006, "step": 7200 }, { "epoch": 0.7670999360750054, "eval_loss": 0.0029324537608772516, "eval_runtime": 35.0174, "eval_samples_per_second": 4288.558, "eval_steps_per_second": 67.024, "step": 7200 }, { "epoch": 0.7681653526528873, "grad_norm": 0.002565717324614525, "learning_rate": 4.636692946942255e-06, "loss": 0.0005, "step": 7210 }, { "epoch": 0.7681653526528873, "eval_loss": 0.0029697574209421873, "eval_runtime": 35.0534, "eval_samples_per_second": 4284.145, "eval_steps_per_second": 66.955, "step": 7210 }, { "epoch": 0.7692307692307693, "grad_norm": 1.1513327360153198, "learning_rate": 4.615384615384616e-06, "loss": 0.0162, "step": 7220 }, { "epoch": 0.7692307692307693, "eval_loss": 0.002979603363201022, "eval_runtime": 35.0461, "eval_samples_per_second": 4285.043, "eval_steps_per_second": 66.969, "step": 7220 }, { "epoch": 0.7702961858086512, "grad_norm": 0.002426127204671502, "learning_rate": 4.594076283826976e-06, "loss": 0.0009, "step": 7230 }, { "epoch": 0.7702961858086512, "eval_loss": 0.002993279369547963, "eval_runtime": 35.0161, "eval_samples_per_second": 4288.714, "eval_steps_per_second": 67.026, "step": 7230 }, { "epoch": 0.7713616023865332, "grad_norm": 0.0015931341331452131, "learning_rate": 4.572767952269337e-06, "loss": 0.0004, "step": 7240 }, { "epoch": 0.7713616023865332, "eval_loss": 0.0029843186493963003, "eval_runtime": 35.0478, "eval_samples_per_second": 4284.837, "eval_steps_per_second": 66.966, "step": 7240 }, { "epoch": 0.7724270189644151, "grad_norm": 0.0018893532687798142, "learning_rate": 4.551459620711698e-06, "loss": 0.0078, "step": 7250 }, { "epoch": 0.7724270189644151, "eval_loss": 0.0029693315736949444, "eval_runtime": 35.0331, "eval_samples_per_second": 4286.632, "eval_steps_per_second": 66.994, "step": 7250 }, { "epoch": 0.7734924355422971, "grad_norm": 0.009522825479507446, "learning_rate": 4.530151289154059e-06, "loss": 0.02, "step": 7260 }, { "epoch": 0.7734924355422971, "eval_loss": 0.0028855737764388323, "eval_runtime": 35.057, "eval_samples_per_second": 4283.708, "eval_steps_per_second": 66.948, "step": 7260 }, { "epoch": 0.774557852120179, "grad_norm": 0.00656323553994298, "learning_rate": 4.50884295759642e-06, "loss": 0.0004, "step": 7270 }, { "epoch": 0.774557852120179, "eval_loss": 0.002848101779818535, "eval_runtime": 35.0368, "eval_samples_per_second": 4286.18, "eval_steps_per_second": 66.987, "step": 7270 }, { "epoch": 0.775623268698061, "grad_norm": 0.07277275621891022, "learning_rate": 4.487534626038781e-06, "loss": 0.0022, "step": 7280 }, { "epoch": 0.775623268698061, "eval_loss": 0.00286454102024436, "eval_runtime": 35.0012, "eval_samples_per_second": 4290.544, "eval_steps_per_second": 67.055, "step": 7280 }, { "epoch": 0.7766886852759429, "grad_norm": 0.0020550009794533253, "learning_rate": 4.466226294481142e-06, "loss": 0.0009, "step": 7290 }, { "epoch": 0.7766886852759429, "eval_loss": 0.002871564356610179, "eval_runtime": 35.0429, "eval_samples_per_second": 4285.428, "eval_steps_per_second": 66.975, "step": 7290 }, { "epoch": 0.7777541018538249, "grad_norm": 0.0024138211738318205, "learning_rate": 4.444917962923503e-06, "loss": 0.0017, "step": 7300 }, { "epoch": 0.7777541018538249, "eval_loss": 0.0029342793859541416, "eval_runtime": 35.0165, "eval_samples_per_second": 4288.667, "eval_steps_per_second": 67.026, "step": 7300 }, { "epoch": 0.7788195184317068, "grad_norm": 0.002006649738177657, "learning_rate": 4.423609631365864e-06, "loss": 0.0001, "step": 7310 }, { "epoch": 0.7788195184317068, "eval_loss": 0.0029674111865460873, "eval_runtime": 35.0269, "eval_samples_per_second": 4287.391, "eval_steps_per_second": 67.006, "step": 7310 }, { "epoch": 0.7798849350095888, "grad_norm": 0.23964039981365204, "learning_rate": 4.402301299808225e-06, "loss": 0.0005, "step": 7320 }, { "epoch": 0.7798849350095888, "eval_loss": 0.0029816378373652697, "eval_runtime": 35.0273, "eval_samples_per_second": 4287.336, "eval_steps_per_second": 67.005, "step": 7320 }, { "epoch": 0.7809503515874707, "grad_norm": 0.07510890811681747, "learning_rate": 4.380992968250586e-06, "loss": 0.0171, "step": 7330 }, { "epoch": 0.7809503515874707, "eval_loss": 0.002983283717185259, "eval_runtime": 35.0339, "eval_samples_per_second": 4286.533, "eval_steps_per_second": 66.992, "step": 7330 }, { "epoch": 0.7820157681653527, "grad_norm": 0.026817040517926216, "learning_rate": 4.359684636692947e-06, "loss": 0.0002, "step": 7340 }, { "epoch": 0.7820157681653527, "eval_loss": 0.0029749777168035507, "eval_runtime": 35.0352, "eval_samples_per_second": 4286.376, "eval_steps_per_second": 66.99, "step": 7340 }, { "epoch": 0.7830811847432346, "grad_norm": 0.002611766569316387, "learning_rate": 4.338376305135308e-06, "loss": 0.002, "step": 7350 }, { "epoch": 0.7830811847432346, "eval_loss": 0.003011771710589528, "eval_runtime": 35.0182, "eval_samples_per_second": 4288.458, "eval_steps_per_second": 67.022, "step": 7350 }, { "epoch": 0.7841466013211166, "grad_norm": 0.0021272755693644285, "learning_rate": 4.317067973577669e-06, "loss": 0.0019, "step": 7360 }, { "epoch": 0.7841466013211166, "eval_loss": 0.0032490803860127926, "eval_runtime": 35.0078, "eval_samples_per_second": 4289.734, "eval_steps_per_second": 67.042, "step": 7360 }, { "epoch": 0.7852120178989985, "grad_norm": 0.0024452470242977142, "learning_rate": 4.29575964202003e-06, "loss": 0.0066, "step": 7370 }, { "epoch": 0.7852120178989985, "eval_loss": 0.0033675709273666143, "eval_runtime": 35.0296, "eval_samples_per_second": 4287.054, "eval_steps_per_second": 67.0, "step": 7370 }, { "epoch": 0.7862774344768805, "grad_norm": 0.31848591566085815, "learning_rate": 4.274451310462391e-06, "loss": 0.0024, "step": 7380 }, { "epoch": 0.7862774344768805, "eval_loss": 0.0031276163645088673, "eval_runtime": 35.0063, "eval_samples_per_second": 4289.91, "eval_steps_per_second": 67.045, "step": 7380 }, { "epoch": 0.7873428510547624, "grad_norm": 0.018683720380067825, "learning_rate": 4.253142978904752e-06, "loss": 0.0012, "step": 7390 }, { "epoch": 0.7873428510547624, "eval_loss": 0.003029879881069064, "eval_runtime": 35.0335, "eval_samples_per_second": 4286.585, "eval_steps_per_second": 66.993, "step": 7390 }, { "epoch": 0.7884082676326444, "grad_norm": 0.39703598618507385, "learning_rate": 4.231834647347113e-06, "loss": 0.0031, "step": 7400 }, { "epoch": 0.7884082676326444, "eval_loss": 0.0030737167689949274, "eval_runtime": 35.0217, "eval_samples_per_second": 4288.024, "eval_steps_per_second": 67.016, "step": 7400 }, { "epoch": 0.7894736842105263, "grad_norm": 0.003899503033608198, "learning_rate": 4.210526315789474e-06, "loss": 0.0039, "step": 7410 }, { "epoch": 0.7894736842105263, "eval_loss": 0.0030558835715055466, "eval_runtime": 35.0174, "eval_samples_per_second": 4288.552, "eval_steps_per_second": 67.024, "step": 7410 }, { "epoch": 0.7905391007884083, "grad_norm": 0.0023945241700857878, "learning_rate": 4.189217984231835e-06, "loss": 0.0009, "step": 7420 }, { "epoch": 0.7905391007884083, "eval_loss": 0.003028090111911297, "eval_runtime": 35.0152, "eval_samples_per_second": 4288.821, "eval_steps_per_second": 67.028, "step": 7420 }, { "epoch": 0.7916045173662902, "grad_norm": 0.007479314226657152, "learning_rate": 4.167909652674196e-06, "loss": 0.0027, "step": 7430 }, { "epoch": 0.7916045173662902, "eval_loss": 0.0029746410436928272, "eval_runtime": 35.0281, "eval_samples_per_second": 4287.249, "eval_steps_per_second": 67.003, "step": 7430 }, { "epoch": 0.7926699339441722, "grad_norm": 0.0067185997031629086, "learning_rate": 4.146601321116557e-06, "loss": 0.0106, "step": 7440 }, { "epoch": 0.7926699339441722, "eval_loss": 0.0029829549603164196, "eval_runtime": 35.033, "eval_samples_per_second": 4286.641, "eval_steps_per_second": 66.994, "step": 7440 }, { "epoch": 0.7937353505220541, "grad_norm": 0.13891682028770447, "learning_rate": 4.125292989558918e-06, "loss": 0.0004, "step": 7450 }, { "epoch": 0.7937353505220541, "eval_loss": 0.002984261605888605, "eval_runtime": 35.0187, "eval_samples_per_second": 4288.393, "eval_steps_per_second": 67.021, "step": 7450 }, { "epoch": 0.7948007670999361, "grad_norm": 0.13743676245212555, "learning_rate": 4.103984658001279e-06, "loss": 0.0002, "step": 7460 }, { "epoch": 0.7948007670999361, "eval_loss": 0.0029855200555175543, "eval_runtime": 35.0451, "eval_samples_per_second": 4285.16, "eval_steps_per_second": 66.971, "step": 7460 }, { "epoch": 0.795866183677818, "grad_norm": 0.2898567020893097, "learning_rate": 4.08267632644364e-06, "loss": 0.0004, "step": 7470 }, { "epoch": 0.795866183677818, "eval_loss": 0.002985232975333929, "eval_runtime": 35.0329, "eval_samples_per_second": 4286.662, "eval_steps_per_second": 66.994, "step": 7470 }, { "epoch": 0.7969316002557, "grad_norm": 0.0027324198745191097, "learning_rate": 4.061367994886001e-06, "loss": 0.0088, "step": 7480 }, { "epoch": 0.7969316002557, "eval_loss": 0.002921548206359148, "eval_runtime": 35.0082, "eval_samples_per_second": 4289.676, "eval_steps_per_second": 67.041, "step": 7480 }, { "epoch": 0.7979970168335819, "grad_norm": 0.6680575609207153, "learning_rate": 4.040059663328362e-06, "loss": 0.0046, "step": 7490 }, { "epoch": 0.7979970168335819, "eval_loss": 0.00286526489071548, "eval_runtime": 35.0436, "eval_samples_per_second": 4285.344, "eval_steps_per_second": 66.974, "step": 7490 }, { "epoch": 0.7990624334114639, "grad_norm": 0.25453507900238037, "learning_rate": 4.018751331770723e-06, "loss": 0.001, "step": 7500 }, { "epoch": 0.7990624334114639, "eval_loss": 0.0029128112364560366, "eval_runtime": 35.0244, "eval_samples_per_second": 4287.697, "eval_steps_per_second": 67.01, "step": 7500 }, { "epoch": 0.8001278499893458, "grad_norm": 0.29362809658050537, "learning_rate": 3.997443000213084e-06, "loss": 0.0062, "step": 7510 }, { "epoch": 0.8001278499893458, "eval_loss": 0.002864515408873558, "eval_runtime": 35.0496, "eval_samples_per_second": 4284.613, "eval_steps_per_second": 66.962, "step": 7510 }, { "epoch": 0.8011932665672278, "grad_norm": 0.0018628902034834027, "learning_rate": 3.976134668655445e-06, "loss": 0.0013, "step": 7520 }, { "epoch": 0.8011932665672278, "eval_loss": 0.002836798317730427, "eval_runtime": 35.0704, "eval_samples_per_second": 4282.074, "eval_steps_per_second": 66.923, "step": 7520 }, { "epoch": 0.8022586831451097, "grad_norm": 0.0024648455437272787, "learning_rate": 3.954826337097806e-06, "loss": 0.0083, "step": 7530 }, { "epoch": 0.8022586831451097, "eval_loss": 0.0029008083511143923, "eval_runtime": 35.0548, "eval_samples_per_second": 4283.981, "eval_steps_per_second": 66.952, "step": 7530 }, { "epoch": 0.8033240997229917, "grad_norm": 0.002339346567168832, "learning_rate": 3.933518005540167e-06, "loss": 0.0002, "step": 7540 }, { "epoch": 0.8033240997229917, "eval_loss": 0.0029889014549553394, "eval_runtime": 35.0496, "eval_samples_per_second": 4284.61, "eval_steps_per_second": 66.962, "step": 7540 }, { "epoch": 0.8043895163008736, "grad_norm": 0.04572073370218277, "learning_rate": 3.912209673982528e-06, "loss": 0.0003, "step": 7550 }, { "epoch": 0.8043895163008736, "eval_loss": 0.0030233801808208227, "eval_runtime": 35.0316, "eval_samples_per_second": 4286.82, "eval_steps_per_second": 66.997, "step": 7550 }, { "epoch": 0.8054549328787556, "grad_norm": 0.09433967620134354, "learning_rate": 3.890901342424889e-06, "loss": 0.0003, "step": 7560 }, { "epoch": 0.8054549328787556, "eval_loss": 0.0030390520114451647, "eval_runtime": 35.0528, "eval_samples_per_second": 4284.227, "eval_steps_per_second": 66.956, "step": 7560 }, { "epoch": 0.8065203494566375, "grad_norm": 0.004556519910693169, "learning_rate": 3.869593010867249e-06, "loss": 0.0119, "step": 7570 }, { "epoch": 0.8065203494566375, "eval_loss": 0.0029982631094753742, "eval_runtime": 35.0447, "eval_samples_per_second": 4285.217, "eval_steps_per_second": 66.972, "step": 7570 }, { "epoch": 0.8075857660345195, "grad_norm": 0.004369661677628756, "learning_rate": 3.84828467930961e-06, "loss": 0.0074, "step": 7580 }, { "epoch": 0.8075857660345195, "eval_loss": 0.0028656297363340855, "eval_runtime": 35.0204, "eval_samples_per_second": 4288.185, "eval_steps_per_second": 67.018, "step": 7580 }, { "epoch": 0.8086511826124014, "grad_norm": 0.37288787961006165, "learning_rate": 3.826976347751971e-06, "loss": 0.0027, "step": 7590 }, { "epoch": 0.8086511826124014, "eval_loss": 0.0028651338070631027, "eval_runtime": 35.0362, "eval_samples_per_second": 4286.255, "eval_steps_per_second": 66.988, "step": 7590 }, { "epoch": 0.8097165991902834, "grad_norm": 0.007273674942553043, "learning_rate": 3.805668016194332e-06, "loss": 0.0075, "step": 7600 }, { "epoch": 0.8097165991902834, "eval_loss": 0.002874514786526561, "eval_runtime": 35.0364, "eval_samples_per_second": 4286.227, "eval_steps_per_second": 66.987, "step": 7600 }, { "epoch": 0.8107820157681653, "grad_norm": 0.003154418431222439, "learning_rate": 3.784359684636693e-06, "loss": 0.0016, "step": 7610 }, { "epoch": 0.8107820157681653, "eval_loss": 0.0028334720991551876, "eval_runtime": 35.0032, "eval_samples_per_second": 4290.298, "eval_steps_per_second": 67.051, "step": 7610 }, { "epoch": 0.8118474323460473, "grad_norm": 0.16729117929935455, "learning_rate": 3.763051353079054e-06, "loss": 0.0003, "step": 7620 }, { "epoch": 0.8118474323460473, "eval_loss": 0.0028151795268058777, "eval_runtime": 35.1147, "eval_samples_per_second": 4276.674, "eval_steps_per_second": 66.838, "step": 7620 }, { "epoch": 0.8129128489239292, "grad_norm": 0.11129946261644363, "learning_rate": 3.741743021521415e-06, "loss": 0.0036, "step": 7630 }, { "epoch": 0.8129128489239292, "eval_loss": 0.0028255251236259937, "eval_runtime": 35.0946, "eval_samples_per_second": 4279.114, "eval_steps_per_second": 66.876, "step": 7630 }, { "epoch": 0.8139782655018112, "grad_norm": 0.006738661322742701, "learning_rate": 3.720434689963776e-06, "loss": 0.0038, "step": 7640 }, { "epoch": 0.8139782655018112, "eval_loss": 0.002836094470694661, "eval_runtime": 35.0731, "eval_samples_per_second": 4281.737, "eval_steps_per_second": 66.917, "step": 7640 }, { "epoch": 0.8150436820796931, "grad_norm": 0.008290871046483517, "learning_rate": 3.699126358406137e-06, "loss": 0.0003, "step": 7650 }, { "epoch": 0.8150436820796931, "eval_loss": 0.002835857914760709, "eval_runtime": 35.0577, "eval_samples_per_second": 4283.622, "eval_steps_per_second": 66.947, "step": 7650 }, { "epoch": 0.8161090986575751, "grad_norm": 0.0021515628322958946, "learning_rate": 3.677818026848498e-06, "loss": 0.0003, "step": 7660 }, { "epoch": 0.8161090986575751, "eval_loss": 0.0028307398315519094, "eval_runtime": 35.0504, "eval_samples_per_second": 4284.522, "eval_steps_per_second": 66.961, "step": 7660 }, { "epoch": 0.817174515235457, "grad_norm": 0.0018256115727126598, "learning_rate": 3.656509695290859e-06, "loss": 0.0024, "step": 7670 }, { "epoch": 0.817174515235457, "eval_loss": 0.002814466366544366, "eval_runtime": 35.0893, "eval_samples_per_second": 4279.763, "eval_steps_per_second": 66.886, "step": 7670 }, { "epoch": 0.818239931813339, "grad_norm": 0.6402817368507385, "learning_rate": 3.63520136373322e-06, "loss": 0.0021, "step": 7680 }, { "epoch": 0.818239931813339, "eval_loss": 0.0028463115449994802, "eval_runtime": 35.067, "eval_samples_per_second": 4282.489, "eval_steps_per_second": 66.929, "step": 7680 }, { "epoch": 0.8193053483912209, "grad_norm": 0.0017836468759924173, "learning_rate": 3.613893032175581e-06, "loss": 0.0006, "step": 7690 }, { "epoch": 0.8193053483912209, "eval_loss": 0.0028544815722852945, "eval_runtime": 35.0337, "eval_samples_per_second": 4286.553, "eval_steps_per_second": 66.993, "step": 7690 }, { "epoch": 0.8203707649691029, "grad_norm": 0.005954293999820948, "learning_rate": 3.592584700617942e-06, "loss": 0.0004, "step": 7700 }, { "epoch": 0.8203707649691029, "eval_loss": 0.0028285484295338392, "eval_runtime": 35.055, "eval_samples_per_second": 4283.948, "eval_steps_per_second": 66.952, "step": 7700 }, { "epoch": 0.8214361815469848, "grad_norm": 0.002660792786628008, "learning_rate": 3.571276369060303e-06, "loss": 0.0053, "step": 7710 }, { "epoch": 0.8214361815469848, "eval_loss": 0.0027588389348238707, "eval_runtime": 35.0245, "eval_samples_per_second": 4287.679, "eval_steps_per_second": 67.01, "step": 7710 }, { "epoch": 0.8225015981248668, "grad_norm": 0.006484444718807936, "learning_rate": 3.549968037502664e-06, "loss": 0.0029, "step": 7720 }, { "epoch": 0.8225015981248668, "eval_loss": 0.0027692620642483234, "eval_runtime": 35.0538, "eval_samples_per_second": 4284.096, "eval_steps_per_second": 66.954, "step": 7720 }, { "epoch": 0.8235670147027487, "grad_norm": 0.003297739662230015, "learning_rate": 3.528659705945025e-06, "loss": 0.0017, "step": 7730 }, { "epoch": 0.8235670147027487, "eval_loss": 0.0028382448945194483, "eval_runtime": 35.0157, "eval_samples_per_second": 4288.766, "eval_steps_per_second": 67.027, "step": 7730 }, { "epoch": 0.8246324312806307, "grad_norm": 0.001944978255778551, "learning_rate": 3.5073513743873855e-06, "loss": 0.0025, "step": 7740 }, { "epoch": 0.8246324312806307, "eval_loss": 0.0028590108267962933, "eval_runtime": 35.0189, "eval_samples_per_second": 4288.367, "eval_steps_per_second": 67.021, "step": 7740 }, { "epoch": 0.8256978478585126, "grad_norm": 0.0017903875559568405, "learning_rate": 3.4860430428297465e-06, "loss": 0.0133, "step": 7750 }, { "epoch": 0.8256978478585126, "eval_loss": 0.0028446416836231947, "eval_runtime": 35.0345, "eval_samples_per_second": 4286.465, "eval_steps_per_second": 66.991, "step": 7750 }, { "epoch": 0.8267632644363946, "grad_norm": 0.0015631518326699734, "learning_rate": 3.4647347112721075e-06, "loss": 0.0064, "step": 7760 }, { "epoch": 0.8267632644363946, "eval_loss": 0.0028156498447060585, "eval_runtime": 35.0207, "eval_samples_per_second": 4288.149, "eval_steps_per_second": 67.017, "step": 7760 }, { "epoch": 0.8278286810142765, "grad_norm": 0.0022290684282779694, "learning_rate": 3.4434263797144685e-06, "loss": 0.0021, "step": 7770 }, { "epoch": 0.8278286810142765, "eval_loss": 0.0028021347243338823, "eval_runtime": 35.0059, "eval_samples_per_second": 4289.966, "eval_steps_per_second": 67.046, "step": 7770 }, { "epoch": 0.8288940975921585, "grad_norm": 0.001918564666993916, "learning_rate": 3.4221180481568295e-06, "loss": 0.0002, "step": 7780 }, { "epoch": 0.8288940975921585, "eval_loss": 0.0027851953636854887, "eval_runtime": 35.0363, "eval_samples_per_second": 4286.241, "eval_steps_per_second": 66.988, "step": 7780 }, { "epoch": 0.8299595141700404, "grad_norm": 0.027464309707283974, "learning_rate": 3.4008097165991905e-06, "loss": 0.0016, "step": 7790 }, { "epoch": 0.8299595141700404, "eval_loss": 0.002777885412797332, "eval_runtime": 35.0127, "eval_samples_per_second": 4289.128, "eval_steps_per_second": 67.033, "step": 7790 }, { "epoch": 0.8310249307479224, "grad_norm": 0.004105029162019491, "learning_rate": 3.3795013850415515e-06, "loss": 0.0035, "step": 7800 }, { "epoch": 0.8310249307479224, "eval_loss": 0.0027947339694947004, "eval_runtime": 35.0353, "eval_samples_per_second": 4286.367, "eval_steps_per_second": 66.99, "step": 7800 }, { "epoch": 0.8320903473258043, "grad_norm": 0.5708588361740112, "learning_rate": 3.3581930534839125e-06, "loss": 0.0043, "step": 7810 }, { "epoch": 0.8320903473258043, "eval_loss": 0.0027894387021660805, "eval_runtime": 35.0245, "eval_samples_per_second": 4287.678, "eval_steps_per_second": 67.01, "step": 7810 }, { "epoch": 0.8331557639036863, "grad_norm": 0.002082349034026265, "learning_rate": 3.3368847219262734e-06, "loss": 0.0073, "step": 7820 }, { "epoch": 0.8331557639036863, "eval_loss": 0.002818479435518384, "eval_runtime": 35.0308, "eval_samples_per_second": 4286.913, "eval_steps_per_second": 66.998, "step": 7820 }, { "epoch": 0.8342211804815683, "grad_norm": 0.0014790042769163847, "learning_rate": 3.3155763903686344e-06, "loss": 0.001, "step": 7830 }, { "epoch": 0.8342211804815683, "eval_loss": 0.002822867361828685, "eval_runtime": 35.0817, "eval_samples_per_second": 4280.692, "eval_steps_per_second": 66.901, "step": 7830 }, { "epoch": 0.8352865970594503, "grad_norm": 0.025291219353675842, "learning_rate": 3.2942680588109954e-06, "loss": 0.0011, "step": 7840 }, { "epoch": 0.8352865970594503, "eval_loss": 0.0028085343074053526, "eval_runtime": 35.0213, "eval_samples_per_second": 4288.076, "eval_steps_per_second": 67.016, "step": 7840 }, { "epoch": 0.8363520136373322, "grad_norm": 0.0024894457310438156, "learning_rate": 3.2729597272533564e-06, "loss": 0.0004, "step": 7850 }, { "epoch": 0.8363520136373322, "eval_loss": 0.0028004287742078304, "eval_runtime": 34.9977, "eval_samples_per_second": 4290.973, "eval_steps_per_second": 67.062, "step": 7850 }, { "epoch": 0.8374174302152142, "grad_norm": 0.001692480524070561, "learning_rate": 3.2516513956957174e-06, "loss": 0.0002, "step": 7860 }, { "epoch": 0.8374174302152142, "eval_loss": 0.0027998967561870813, "eval_runtime": 35.0112, "eval_samples_per_second": 4289.317, "eval_steps_per_second": 67.036, "step": 7860 }, { "epoch": 0.8384828467930961, "grad_norm": 0.0016112946905195713, "learning_rate": 3.2303430641380784e-06, "loss": 0.0061, "step": 7870 }, { "epoch": 0.8384828467930961, "eval_loss": 0.0027837178204208612, "eval_runtime": 35.0425, "eval_samples_per_second": 4285.484, "eval_steps_per_second": 66.976, "step": 7870 }, { "epoch": 0.8395482633709781, "grad_norm": 0.047582581639289856, "learning_rate": 3.2090347325804394e-06, "loss": 0.0168, "step": 7880 }, { "epoch": 0.8395482633709781, "eval_loss": 0.002785380929708481, "eval_runtime": 35.0378, "eval_samples_per_second": 4286.056, "eval_steps_per_second": 66.985, "step": 7880 }, { "epoch": 0.84061367994886, "grad_norm": 0.0018558768788352609, "learning_rate": 3.1877264010228004e-06, "loss": 0.0011, "step": 7890 }, { "epoch": 0.84061367994886, "eval_loss": 0.002787909237667918, "eval_runtime": 35.0402, "eval_samples_per_second": 4285.761, "eval_steps_per_second": 66.98, "step": 7890 }, { "epoch": 0.841679096526742, "grad_norm": 0.0328022725880146, "learning_rate": 3.1664180694651614e-06, "loss": 0.0006, "step": 7900 }, { "epoch": 0.841679096526742, "eval_loss": 0.0027848321478813887, "eval_runtime": 35.0488, "eval_samples_per_second": 4284.716, "eval_steps_per_second": 66.964, "step": 7900 }, { "epoch": 0.842744513104624, "grad_norm": 0.0023002829402685165, "learning_rate": 3.145109737907522e-06, "loss": 0.0015, "step": 7910 }, { "epoch": 0.842744513104624, "eval_loss": 0.00278343609534204, "eval_runtime": 35.0285, "eval_samples_per_second": 4287.2, "eval_steps_per_second": 67.003, "step": 7910 }, { "epoch": 0.8438099296825059, "grad_norm": 0.001770269824191928, "learning_rate": 3.123801406349883e-06, "loss": 0.0035, "step": 7920 }, { "epoch": 0.8438099296825059, "eval_loss": 0.0027938741259276867, "eval_runtime": 35.0162, "eval_samples_per_second": 4288.704, "eval_steps_per_second": 67.026, "step": 7920 }, { "epoch": 0.8448753462603878, "grad_norm": 0.0020561525598168373, "learning_rate": 3.102493074792244e-06, "loss": 0.0004, "step": 7930 }, { "epoch": 0.8448753462603878, "eval_loss": 0.0028133615851402283, "eval_runtime": 35.0294, "eval_samples_per_second": 4287.089, "eval_steps_per_second": 67.001, "step": 7930 }, { "epoch": 0.8459407628382698, "grad_norm": 0.023834535852074623, "learning_rate": 3.081184743234605e-06, "loss": 0.0002, "step": 7940 }, { "epoch": 0.8459407628382698, "eval_loss": 0.002823204966261983, "eval_runtime": 34.9882, "eval_samples_per_second": 4292.129, "eval_steps_per_second": 67.08, "step": 7940 }, { "epoch": 0.8470061794161517, "grad_norm": 0.0046548559330403805, "learning_rate": 3.059876411676966e-06, "loss": 0.0004, "step": 7950 }, { "epoch": 0.8470061794161517, "eval_loss": 0.0028295184019953012, "eval_runtime": 34.9818, "eval_samples_per_second": 4292.921, "eval_steps_per_second": 67.092, "step": 7950 }, { "epoch": 0.8480715959940337, "grad_norm": 0.007586074061691761, "learning_rate": 3.038568080119327e-06, "loss": 0.0004, "step": 7960 }, { "epoch": 0.8480715959940337, "eval_loss": 0.0028241388499736786, "eval_runtime": 34.9874, "eval_samples_per_second": 4292.226, "eval_steps_per_second": 67.081, "step": 7960 }, { "epoch": 0.8491370125719157, "grad_norm": 0.0014697522856295109, "learning_rate": 3.017259748561688e-06, "loss": 0.0023, "step": 7970 }, { "epoch": 0.8491370125719157, "eval_loss": 0.002786256605759263, "eval_runtime": 35.0455, "eval_samples_per_second": 4285.115, "eval_steps_per_second": 66.97, "step": 7970 }, { "epoch": 0.8502024291497976, "grad_norm": 0.006472844164818525, "learning_rate": 2.995951417004049e-06, "loss": 0.0005, "step": 7980 }, { "epoch": 0.8502024291497976, "eval_loss": 0.0027799701783806086, "eval_runtime": 35.0498, "eval_samples_per_second": 4284.593, "eval_steps_per_second": 66.962, "step": 7980 }, { "epoch": 0.8512678457276796, "grad_norm": 0.16366152465343475, "learning_rate": 2.97464308544641e-06, "loss": 0.0025, "step": 7990 }, { "epoch": 0.8512678457276796, "eval_loss": 0.002784137846902013, "eval_runtime": 35.0048, "eval_samples_per_second": 4290.101, "eval_steps_per_second": 67.048, "step": 7990 }, { "epoch": 0.8523332623055615, "grad_norm": 0.00848406832665205, "learning_rate": 2.953334753888771e-06, "loss": 0.0001, "step": 8000 }, { "epoch": 0.8523332623055615, "eval_loss": 0.002791937440633774, "eval_runtime": 35.0313, "eval_samples_per_second": 4286.847, "eval_steps_per_second": 66.997, "step": 8000 }, { "epoch": 0.8533986788834435, "grad_norm": 0.0016626849537715316, "learning_rate": 2.932026422331132e-06, "loss": 0.0016, "step": 8010 }, { "epoch": 0.8533986788834435, "eval_loss": 0.0027943544555455446, "eval_runtime": 35.0983, "eval_samples_per_second": 4278.668, "eval_steps_per_second": 66.869, "step": 8010 }, { "epoch": 0.8544640954613254, "grad_norm": 0.0065400260500609875, "learning_rate": 2.910718090773493e-06, "loss": 0.0024, "step": 8020 }, { "epoch": 0.8544640954613254, "eval_loss": 0.0027912973891943693, "eval_runtime": 35.0664, "eval_samples_per_second": 4282.557, "eval_steps_per_second": 66.93, "step": 8020 }, { "epoch": 0.8555295120392074, "grad_norm": 0.002638779580593109, "learning_rate": 2.889409759215854e-06, "loss": 0.007, "step": 8030 }, { "epoch": 0.8555295120392074, "eval_loss": 0.002751028398051858, "eval_runtime": 35.0158, "eval_samples_per_second": 4288.751, "eval_steps_per_second": 67.027, "step": 8030 }, { "epoch": 0.8565949286170893, "grad_norm": 0.1178533062338829, "learning_rate": 2.868101427658215e-06, "loss": 0.001, "step": 8040 }, { "epoch": 0.8565949286170893, "eval_loss": 0.0027414588257670403, "eval_runtime": 35.0552, "eval_samples_per_second": 4283.934, "eval_steps_per_second": 66.952, "step": 8040 }, { "epoch": 0.8576603451949713, "grad_norm": 0.008728962391614914, "learning_rate": 2.846793096100576e-06, "loss": 0.0006, "step": 8050 }, { "epoch": 0.8576603451949713, "eval_loss": 0.00275249220430851, "eval_runtime": 35.0312, "eval_samples_per_second": 4286.86, "eval_steps_per_second": 66.997, "step": 8050 }, { "epoch": 0.8587257617728532, "grad_norm": 0.004858131520450115, "learning_rate": 2.8254847645429368e-06, "loss": 0.0002, "step": 8060 }, { "epoch": 0.8587257617728532, "eval_loss": 0.002762093674391508, "eval_runtime": 35.0276, "eval_samples_per_second": 4287.304, "eval_steps_per_second": 67.004, "step": 8060 }, { "epoch": 0.8597911783507352, "grad_norm": 0.0031513080466538668, "learning_rate": 2.8041764329852978e-06, "loss": 0.0019, "step": 8070 }, { "epoch": 0.8597911783507352, "eval_loss": 0.0027698467019945383, "eval_runtime": 35.0124, "eval_samples_per_second": 4289.171, "eval_steps_per_second": 67.033, "step": 8070 }, { "epoch": 0.8608565949286171, "grad_norm": 0.0038100427482277155, "learning_rate": 2.7828681014276583e-06, "loss": 0.0007, "step": 8080 }, { "epoch": 0.8608565949286171, "eval_loss": 0.0027781969401985407, "eval_runtime": 35.0691, "eval_samples_per_second": 4282.23, "eval_steps_per_second": 66.925, "step": 8080 }, { "epoch": 0.861922011506499, "grad_norm": 0.003881295910105109, "learning_rate": 2.7615597698700193e-06, "loss": 0.0011, "step": 8090 }, { "epoch": 0.861922011506499, "eval_loss": 0.0027934699319303036, "eval_runtime": 35.0896, "eval_samples_per_second": 4279.729, "eval_steps_per_second": 66.886, "step": 8090 }, { "epoch": 0.862987428084381, "grad_norm": 0.0016517649637535214, "learning_rate": 2.7402514383123803e-06, "loss": 0.0007, "step": 8100 }, { "epoch": 0.862987428084381, "eval_loss": 0.0028144221287220716, "eval_runtime": 35.0148, "eval_samples_per_second": 4288.867, "eval_steps_per_second": 67.029, "step": 8100 }, { "epoch": 0.864052844662263, "grad_norm": 1.7808645963668823, "learning_rate": 2.7189431067547413e-06, "loss": 0.0011, "step": 8110 }, { "epoch": 0.864052844662263, "eval_loss": 0.0028445336502045393, "eval_runtime": 35.0679, "eval_samples_per_second": 4282.38, "eval_steps_per_second": 66.927, "step": 8110 }, { "epoch": 0.8651182612401449, "grad_norm": 3.285395383834839, "learning_rate": 2.6976347751971023e-06, "loss": 0.004, "step": 8120 }, { "epoch": 0.8651182612401449, "eval_loss": 0.0028635459020733833, "eval_runtime": 35.0252, "eval_samples_per_second": 4287.6, "eval_steps_per_second": 67.009, "step": 8120 }, { "epoch": 0.8661836778180269, "grad_norm": 0.033276911824941635, "learning_rate": 2.6763264436394633e-06, "loss": 0.0161, "step": 8130 }, { "epoch": 0.8661836778180269, "eval_loss": 0.002783233532682061, "eval_runtime": 35.0088, "eval_samples_per_second": 4289.605, "eval_steps_per_second": 67.04, "step": 8130 }, { "epoch": 0.8672490943959088, "grad_norm": 0.015310313552618027, "learning_rate": 2.6550181120818243e-06, "loss": 0.0001, "step": 8140 }, { "epoch": 0.8672490943959088, "eval_loss": 0.002757697133347392, "eval_runtime": 35.0601, "eval_samples_per_second": 4283.329, "eval_steps_per_second": 66.942, "step": 8140 }, { "epoch": 0.8683145109737908, "grad_norm": 0.012751123867928982, "learning_rate": 2.6337097805241853e-06, "loss": 0.0011, "step": 8150 }, { "epoch": 0.8683145109737908, "eval_loss": 0.002762366319075227, "eval_runtime": 35.047, "eval_samples_per_second": 4284.926, "eval_steps_per_second": 66.967, "step": 8150 }, { "epoch": 0.8693799275516727, "grad_norm": 0.05020337924361229, "learning_rate": 2.6124014489665463e-06, "loss": 0.0173, "step": 8160 }, { "epoch": 0.8693799275516727, "eval_loss": 0.0027494090609252453, "eval_runtime": 35.1731, "eval_samples_per_second": 4269.564, "eval_steps_per_second": 66.727, "step": 8160 }, { "epoch": 0.8704453441295547, "grad_norm": 0.029232144355773926, "learning_rate": 2.5910931174089072e-06, "loss": 0.0003, "step": 8170 }, { "epoch": 0.8704453441295547, "eval_loss": 0.0027434728108346462, "eval_runtime": 35.0502, "eval_samples_per_second": 4284.538, "eval_steps_per_second": 66.961, "step": 8170 }, { "epoch": 0.8715107607074366, "grad_norm": 0.07336370646953583, "learning_rate": 2.5697847858512682e-06, "loss": 0.0007, "step": 8180 }, { "epoch": 0.8715107607074366, "eval_loss": 0.002746333135291934, "eval_runtime": 35.0401, "eval_samples_per_second": 4285.776, "eval_steps_per_second": 66.98, "step": 8180 }, { "epoch": 0.8725761772853186, "grad_norm": 0.009558520279824734, "learning_rate": 2.5484764542936292e-06, "loss": 0.0039, "step": 8190 }, { "epoch": 0.8725761772853186, "eval_loss": 0.002741629723459482, "eval_runtime": 35.0868, "eval_samples_per_second": 4280.076, "eval_steps_per_second": 66.891, "step": 8190 }, { "epoch": 0.8736415938632005, "grad_norm": 0.030061665922403336, "learning_rate": 2.5271681227359902e-06, "loss": 0.0003, "step": 8200 }, { "epoch": 0.8736415938632005, "eval_loss": 0.0027512703090906143, "eval_runtime": 35.0542, "eval_samples_per_second": 4284.054, "eval_steps_per_second": 66.953, "step": 8200 }, { "epoch": 0.8747070104410825, "grad_norm": 0.0030335835181176662, "learning_rate": 2.505859791178351e-06, "loss": 0.0004, "step": 8210 }, { "epoch": 0.8747070104410825, "eval_loss": 0.0027557830326259136, "eval_runtime": 35.0459, "eval_samples_per_second": 4285.067, "eval_steps_per_second": 66.969, "step": 8210 }, { "epoch": 0.8757724270189644, "grad_norm": 0.005516626872122288, "learning_rate": 2.4845514596207118e-06, "loss": 0.0015, "step": 8220 }, { "epoch": 0.8757724270189644, "eval_loss": 0.00276589160785079, "eval_runtime": 35.0691, "eval_samples_per_second": 4282.235, "eval_steps_per_second": 66.925, "step": 8220 }, { "epoch": 0.8768378435968464, "grad_norm": 2.0706310272216797, "learning_rate": 2.4632431280630728e-06, "loss": 0.0165, "step": 8230 }, { "epoch": 0.8768378435968464, "eval_loss": 0.002770791994407773, "eval_runtime": 35.0091, "eval_samples_per_second": 4289.571, "eval_steps_per_second": 67.04, "step": 8230 }, { "epoch": 0.8779032601747283, "grad_norm": 0.862779974937439, "learning_rate": 2.4419347965054338e-06, "loss": 0.0037, "step": 8240 }, { "epoch": 0.8779032601747283, "eval_loss": 0.0027604245115071535, "eval_runtime": 35.002, "eval_samples_per_second": 4290.441, "eval_steps_per_second": 67.053, "step": 8240 }, { "epoch": 0.8789686767526103, "grad_norm": 0.07593127340078354, "learning_rate": 2.4206264649477947e-06, "loss": 0.0013, "step": 8250 }, { "epoch": 0.8789686767526103, "eval_loss": 0.0027596699073910713, "eval_runtime": 35.0447, "eval_samples_per_second": 4285.216, "eval_steps_per_second": 66.972, "step": 8250 }, { "epoch": 0.8800340933304922, "grad_norm": 0.004259423352777958, "learning_rate": 2.3993181333901557e-06, "loss": 0.0007, "step": 8260 }, { "epoch": 0.8800340933304922, "eval_loss": 0.002764316974207759, "eval_runtime": 35.0623, "eval_samples_per_second": 4283.062, "eval_steps_per_second": 66.938, "step": 8260 }, { "epoch": 0.8810995099083742, "grad_norm": 0.0013831878313794732, "learning_rate": 2.3780098018325167e-06, "loss": 0.0007, "step": 8270 }, { "epoch": 0.8810995099083742, "eval_loss": 0.002769648330286145, "eval_runtime": 35.0422, "eval_samples_per_second": 4285.516, "eval_steps_per_second": 66.976, "step": 8270 }, { "epoch": 0.8821649264862561, "grad_norm": 0.002447050530463457, "learning_rate": 2.3567014702748777e-06, "loss": 0.0035, "step": 8280 }, { "epoch": 0.8821649264862561, "eval_loss": 0.002767772413790226, "eval_runtime": 35.0206, "eval_samples_per_second": 4288.167, "eval_steps_per_second": 67.018, "step": 8280 }, { "epoch": 0.8832303430641381, "grad_norm": 0.0015266514383256435, "learning_rate": 2.3353931387172387e-06, "loss": 0.0047, "step": 8290 }, { "epoch": 0.8832303430641381, "eval_loss": 0.002763263415545225, "eval_runtime": 35.0695, "eval_samples_per_second": 4282.187, "eval_steps_per_second": 66.924, "step": 8290 }, { "epoch": 0.88429575964202, "grad_norm": 0.08378314226865768, "learning_rate": 2.3140848071595997e-06, "loss": 0.0028, "step": 8300 }, { "epoch": 0.88429575964202, "eval_loss": 0.0027693863958120346, "eval_runtime": 35.0809, "eval_samples_per_second": 4280.795, "eval_steps_per_second": 66.903, "step": 8300 }, { "epoch": 0.885361176219902, "grad_norm": 0.002748900791630149, "learning_rate": 2.2927764756019607e-06, "loss": 0.0034, "step": 8310 }, { "epoch": 0.885361176219902, "eval_loss": 0.002776265610009432, "eval_runtime": 35.022, "eval_samples_per_second": 4287.987, "eval_steps_per_second": 67.015, "step": 8310 }, { "epoch": 0.8864265927977839, "grad_norm": 0.025571728125214577, "learning_rate": 2.2714681440443217e-06, "loss": 0.0004, "step": 8320 }, { "epoch": 0.8864265927977839, "eval_loss": 0.0027706564869731665, "eval_runtime": 35.0191, "eval_samples_per_second": 4288.346, "eval_steps_per_second": 67.021, "step": 8320 }, { "epoch": 0.8874920093756659, "grad_norm": 0.007666856050491333, "learning_rate": 2.2501598124866827e-06, "loss": 0.0123, "step": 8330 }, { "epoch": 0.8874920093756659, "eval_loss": 0.002764170989394188, "eval_runtime": 35.008, "eval_samples_per_second": 4289.7, "eval_steps_per_second": 67.042, "step": 8330 }, { "epoch": 0.8885574259535478, "grad_norm": 0.0209694541990757, "learning_rate": 2.2288514809290437e-06, "loss": 0.0004, "step": 8340 }, { "epoch": 0.8885574259535478, "eval_loss": 0.002746229525655508, "eval_runtime": 35.0307, "eval_samples_per_second": 4286.93, "eval_steps_per_second": 66.998, "step": 8340 }, { "epoch": 0.8896228425314298, "grad_norm": 0.0023976133670657873, "learning_rate": 2.2075431493714046e-06, "loss": 0.001, "step": 8350 }, { "epoch": 0.8896228425314298, "eval_loss": 0.002741154283285141, "eval_runtime": 35.0325, "eval_samples_per_second": 4286.707, "eval_steps_per_second": 66.995, "step": 8350 }, { "epoch": 0.8906882591093117, "grad_norm": 0.24398835003376007, "learning_rate": 2.1862348178137656e-06, "loss": 0.0009, "step": 8360 }, { "epoch": 0.8906882591093117, "eval_loss": 0.002744528232142329, "eval_runtime": 35.0342, "eval_samples_per_second": 4286.492, "eval_steps_per_second": 66.992, "step": 8360 }, { "epoch": 0.8917536756871937, "grad_norm": 0.03572320565581322, "learning_rate": 2.1649264862561266e-06, "loss": 0.0004, "step": 8370 }, { "epoch": 0.8917536756871937, "eval_loss": 0.0027611658442765474, "eval_runtime": 35.0407, "eval_samples_per_second": 4285.701, "eval_steps_per_second": 66.979, "step": 8370 }, { "epoch": 0.8928190922650756, "grad_norm": 0.6422826647758484, "learning_rate": 2.143618154698487e-06, "loss": 0.0024, "step": 8380 }, { "epoch": 0.8928190922650756, "eval_loss": 0.0027707030531018972, "eval_runtime": 35.062, "eval_samples_per_second": 4283.101, "eval_steps_per_second": 66.939, "step": 8380 }, { "epoch": 0.8938845088429576, "grad_norm": 0.0015818601241335273, "learning_rate": 2.122309823140848e-06, "loss": 0.0045, "step": 8390 }, { "epoch": 0.8938845088429576, "eval_loss": 0.0027499543502926826, "eval_runtime": 35.0496, "eval_samples_per_second": 4284.608, "eval_steps_per_second": 66.962, "step": 8390 }, { "epoch": 0.8949499254208395, "grad_norm": 0.0156484916806221, "learning_rate": 2.101001491583209e-06, "loss": 0.0084, "step": 8400 }, { "epoch": 0.8949499254208395, "eval_loss": 0.0027432774659246206, "eval_runtime": 35.0349, "eval_samples_per_second": 4286.407, "eval_steps_per_second": 66.99, "step": 8400 }, { "epoch": 0.8960153419987215, "grad_norm": 0.0045946515165269375, "learning_rate": 2.07969316002557e-06, "loss": 0.0011, "step": 8410 }, { "epoch": 0.8960153419987215, "eval_loss": 0.002739608520641923, "eval_runtime": 35.0416, "eval_samples_per_second": 4285.598, "eval_steps_per_second": 66.978, "step": 8410 }, { "epoch": 0.8970807585766034, "grad_norm": 0.0017706300131976604, "learning_rate": 2.058384828467931e-06, "loss": 0.0034, "step": 8420 }, { "epoch": 0.8970807585766034, "eval_loss": 0.0027431268244981766, "eval_runtime": 35.0275, "eval_samples_per_second": 4287.323, "eval_steps_per_second": 67.005, "step": 8420 }, { "epoch": 0.8981461751544854, "grad_norm": 0.002228514524176717, "learning_rate": 2.037076496910292e-06, "loss": 0.0015, "step": 8430 }, { "epoch": 0.8981461751544854, "eval_loss": 0.0027508740313351154, "eval_runtime": 35.0118, "eval_samples_per_second": 4289.244, "eval_steps_per_second": 67.035, "step": 8430 }, { "epoch": 0.8992115917323673, "grad_norm": 0.0016633226769044995, "learning_rate": 2.015768165352653e-06, "loss": 0.0007, "step": 8440 }, { "epoch": 0.8992115917323673, "eval_loss": 0.0027630003169178963, "eval_runtime": 35.0398, "eval_samples_per_second": 4285.81, "eval_steps_per_second": 66.981, "step": 8440 }, { "epoch": 0.9002770083102493, "grad_norm": 0.0019834586419165134, "learning_rate": 1.994459833795014e-06, "loss": 0.0059, "step": 8450 }, { "epoch": 0.9002770083102493, "eval_loss": 0.002752769272774458, "eval_runtime": 35.0758, "eval_samples_per_second": 4281.418, "eval_steps_per_second": 66.912, "step": 8450 }, { "epoch": 0.9013424248881312, "grad_norm": 0.17420539259910583, "learning_rate": 1.9731515022373747e-06, "loss": 0.0006, "step": 8460 }, { "epoch": 0.9013424248881312, "eval_loss": 0.002748524770140648, "eval_runtime": 35.0817, "eval_samples_per_second": 4280.698, "eval_steps_per_second": 66.901, "step": 8460 }, { "epoch": 0.9024078414660132, "grad_norm": 0.003549454268068075, "learning_rate": 1.9518431706797357e-06, "loss": 0.0001, "step": 8470 }, { "epoch": 0.9024078414660132, "eval_loss": 0.0027478199917823076, "eval_runtime": 35.0417, "eval_samples_per_second": 4285.583, "eval_steps_per_second": 66.977, "step": 8470 }, { "epoch": 0.9034732580438951, "grad_norm": 0.0015697539784014225, "learning_rate": 1.9305348391220967e-06, "loss": 0.0011, "step": 8480 }, { "epoch": 0.9034732580438951, "eval_loss": 0.0027480670250952244, "eval_runtime": 35.0442, "eval_samples_per_second": 4285.279, "eval_steps_per_second": 66.973, "step": 8480 }, { "epoch": 0.9045386746217771, "grad_norm": 0.002247209195047617, "learning_rate": 1.9092265075644577e-06, "loss": 0.0001, "step": 8490 }, { "epoch": 0.9045386746217771, "eval_loss": 0.0027491068467497826, "eval_runtime": 35.0476, "eval_samples_per_second": 4284.857, "eval_steps_per_second": 66.966, "step": 8490 }, { "epoch": 0.905604091199659, "grad_norm": 0.009732640348374844, "learning_rate": 1.8879181760068189e-06, "loss": 0.0006, "step": 8500 }, { "epoch": 0.905604091199659, "eval_loss": 0.002738188486546278, "eval_runtime": 35.0452, "eval_samples_per_second": 4285.154, "eval_steps_per_second": 66.971, "step": 8500 }, { "epoch": 0.9066695077775411, "grad_norm": 0.0018489729845896363, "learning_rate": 1.8666098444491799e-06, "loss": 0.0006, "step": 8510 }, { "epoch": 0.9066695077775411, "eval_loss": 0.002733604284003377, "eval_runtime": 35.0616, "eval_samples_per_second": 4283.15, "eval_steps_per_second": 66.939, "step": 8510 }, { "epoch": 0.907734924355423, "grad_norm": 0.0026071134489029646, "learning_rate": 1.8453015128915408e-06, "loss": 0.0117, "step": 8520 }, { "epoch": 0.907734924355423, "eval_loss": 0.0027230686973780394, "eval_runtime": 35.0647, "eval_samples_per_second": 4282.773, "eval_steps_per_second": 66.933, "step": 8520 }, { "epoch": 0.908800340933305, "grad_norm": 3.456970453262329, "learning_rate": 1.8239931813339018e-06, "loss": 0.0195, "step": 8530 }, { "epoch": 0.908800340933305, "eval_loss": 0.002711121691390872, "eval_runtime": 35.0266, "eval_samples_per_second": 4287.425, "eval_steps_per_second": 67.006, "step": 8530 }, { "epoch": 0.9098657575111869, "grad_norm": 0.0017407455015927553, "learning_rate": 1.8026848497762628e-06, "loss": 0.0015, "step": 8540 }, { "epoch": 0.9098657575111869, "eval_loss": 0.002703184960409999, "eval_runtime": 35.0438, "eval_samples_per_second": 4285.323, "eval_steps_per_second": 66.973, "step": 8540 }, { "epoch": 0.9109311740890689, "grad_norm": 0.05513337254524231, "learning_rate": 1.7813765182186236e-06, "loss": 0.009, "step": 8550 }, { "epoch": 0.9109311740890689, "eval_loss": 0.0027215650770813227, "eval_runtime": 35.0904, "eval_samples_per_second": 4279.629, "eval_steps_per_second": 66.884, "step": 8550 }, { "epoch": 0.9119965906669508, "grad_norm": 0.014360551722347736, "learning_rate": 1.7600681866609846e-06, "loss": 0.0044, "step": 8560 }, { "epoch": 0.9119965906669508, "eval_loss": 0.0027318003121763468, "eval_runtime": 35.0623, "eval_samples_per_second": 4283.058, "eval_steps_per_second": 66.938, "step": 8560 }, { "epoch": 0.9130620072448328, "grad_norm": 0.02252795174717903, "learning_rate": 1.7387598551033456e-06, "loss": 0.0012, "step": 8570 }, { "epoch": 0.9130620072448328, "eval_loss": 0.0027427198365330696, "eval_runtime": 35.086, "eval_samples_per_second": 4280.172, "eval_steps_per_second": 66.893, "step": 8570 }, { "epoch": 0.9141274238227147, "grad_norm": 0.016549358144402504, "learning_rate": 1.7174515235457066e-06, "loss": 0.0009, "step": 8580 }, { "epoch": 0.9141274238227147, "eval_loss": 0.0027572920080274343, "eval_runtime": 35.0789, "eval_samples_per_second": 4281.031, "eval_steps_per_second": 66.906, "step": 8580 }, { "epoch": 0.9151928404005967, "grad_norm": 0.0019249654142186046, "learning_rate": 1.6961431919880676e-06, "loss": 0.0002, "step": 8590 }, { "epoch": 0.9151928404005967, "eval_loss": 0.0027666096575558186, "eval_runtime": 35.0568, "eval_samples_per_second": 4283.731, "eval_steps_per_second": 66.948, "step": 8590 }, { "epoch": 0.9162582569784786, "grad_norm": 0.003874736838042736, "learning_rate": 1.6748348604304286e-06, "loss": 0.0003, "step": 8600 }, { "epoch": 0.9162582569784786, "eval_loss": 0.002771862084046006, "eval_runtime": 35.0593, "eval_samples_per_second": 4283.426, "eval_steps_per_second": 66.944, "step": 8600 }, { "epoch": 0.9173236735563606, "grad_norm": 0.0042558941058814526, "learning_rate": 1.6535265288727895e-06, "loss": 0.0003, "step": 8610 }, { "epoch": 0.9173236735563606, "eval_loss": 0.002779304748401046, "eval_runtime": 35.0443, "eval_samples_per_second": 4285.264, "eval_steps_per_second": 66.972, "step": 8610 }, { "epoch": 0.9183890901342425, "grad_norm": 0.0014816632028669119, "learning_rate": 1.6322181973151505e-06, "loss": 0.0003, "step": 8620 }, { "epoch": 0.9183890901342425, "eval_loss": 0.0027969153597950935, "eval_runtime": 35.0602, "eval_samples_per_second": 4283.317, "eval_steps_per_second": 66.942, "step": 8620 }, { "epoch": 0.9194545067121245, "grad_norm": 0.005812318064272404, "learning_rate": 1.610909865757511e-06, "loss": 0.0032, "step": 8630 }, { "epoch": 0.9194545067121245, "eval_loss": 0.002787941135466099, "eval_runtime": 35.0357, "eval_samples_per_second": 4286.314, "eval_steps_per_second": 66.989, "step": 8630 }, { "epoch": 0.9205199232900064, "grad_norm": 0.006544212810695171, "learning_rate": 1.589601534199872e-06, "loss": 0.0013, "step": 8640 }, { "epoch": 0.9205199232900064, "eval_loss": 0.0028067566454410553, "eval_runtime": 35.0446, "eval_samples_per_second": 4285.22, "eval_steps_per_second": 66.972, "step": 8640 }, { "epoch": 0.9215853398678884, "grad_norm": 0.00216415012255311, "learning_rate": 1.568293202642233e-06, "loss": 0.0011, "step": 8650 }, { "epoch": 0.9215853398678884, "eval_loss": 0.0028414882253855467, "eval_runtime": 35.0524, "eval_samples_per_second": 4284.268, "eval_steps_per_second": 66.957, "step": 8650 }, { "epoch": 0.9226507564457703, "grad_norm": 0.0019254203652963042, "learning_rate": 1.546984871084594e-06, "loss": 0.0011, "step": 8660 }, { "epoch": 0.9226507564457703, "eval_loss": 0.0028817523270845413, "eval_runtime": 35.0563, "eval_samples_per_second": 4283.796, "eval_steps_per_second": 66.949, "step": 8660 }, { "epoch": 0.9237161730236523, "grad_norm": 0.4283369183540344, "learning_rate": 1.525676539526955e-06, "loss": 0.0017, "step": 8670 }, { "epoch": 0.9237161730236523, "eval_loss": 0.002892641816288233, "eval_runtime": 35.0339, "eval_samples_per_second": 4286.532, "eval_steps_per_second": 66.992, "step": 8670 }, { "epoch": 0.9247815896015342, "grad_norm": 0.002897687954828143, "learning_rate": 1.504368207969316e-06, "loss": 0.0003, "step": 8680 }, { "epoch": 0.9247815896015342, "eval_loss": 0.0029174918308854103, "eval_runtime": 35.0612, "eval_samples_per_second": 4283.199, "eval_steps_per_second": 66.94, "step": 8680 }, { "epoch": 0.9258470061794162, "grad_norm": 0.8872772455215454, "learning_rate": 1.4830598764116772e-06, "loss": 0.0053, "step": 8690 }, { "epoch": 0.9258470061794162, "eval_loss": 0.0028924746438860893, "eval_runtime": 35.0912, "eval_samples_per_second": 4279.538, "eval_steps_per_second": 66.883, "step": 8690 }, { "epoch": 0.9269124227572981, "grad_norm": 0.004901398438960314, "learning_rate": 1.4617515448540382e-06, "loss": 0.0001, "step": 8700 }, { "epoch": 0.9269124227572981, "eval_loss": 0.002856872510164976, "eval_runtime": 35.0855, "eval_samples_per_second": 4280.234, "eval_steps_per_second": 66.894, "step": 8700 }, { "epoch": 0.9279778393351801, "grad_norm": 0.0021194189321249723, "learning_rate": 1.4404432132963992e-06, "loss": 0.0153, "step": 8710 }, { "epoch": 0.9279778393351801, "eval_loss": 0.002821860834956169, "eval_runtime": 35.0497, "eval_samples_per_second": 4284.598, "eval_steps_per_second": 66.962, "step": 8710 }, { "epoch": 0.929043255913062, "grad_norm": 0.10005082935094833, "learning_rate": 1.4191348817387598e-06, "loss": 0.0002, "step": 8720 }, { "epoch": 0.929043255913062, "eval_loss": 0.0028019933961331844, "eval_runtime": 35.0643, "eval_samples_per_second": 4282.821, "eval_steps_per_second": 66.934, "step": 8720 }, { "epoch": 0.930108672490944, "grad_norm": 0.002553171245381236, "learning_rate": 1.3978265501811208e-06, "loss": 0.0007, "step": 8730 }, { "epoch": 0.930108672490944, "eval_loss": 0.0027997682336717844, "eval_runtime": 35.0344, "eval_samples_per_second": 4286.473, "eval_steps_per_second": 66.991, "step": 8730 }, { "epoch": 0.9311740890688259, "grad_norm": 0.002466765232384205, "learning_rate": 1.3765182186234818e-06, "loss": 0.0003, "step": 8740 }, { "epoch": 0.9311740890688259, "eval_loss": 0.002808566903695464, "eval_runtime": 34.9819, "eval_samples_per_second": 4292.906, "eval_steps_per_second": 67.092, "step": 8740 }, { "epoch": 0.9322395056467079, "grad_norm": 0.0318465530872345, "learning_rate": 1.3552098870658428e-06, "loss": 0.0035, "step": 8750 }, { "epoch": 0.9322395056467079, "eval_loss": 0.0027872417122125626, "eval_runtime": 35.0627, "eval_samples_per_second": 4283.009, "eval_steps_per_second": 66.937, "step": 8750 }, { "epoch": 0.9333049222245898, "grad_norm": 0.0013958633644506335, "learning_rate": 1.3339015555082038e-06, "loss": 0.0002, "step": 8760 }, { "epoch": 0.9333049222245898, "eval_loss": 0.002777666551992297, "eval_runtime": 35.0407, "eval_samples_per_second": 4285.707, "eval_steps_per_second": 66.979, "step": 8760 }, { "epoch": 0.9343703388024718, "grad_norm": 0.00333898956887424, "learning_rate": 1.3125932239505647e-06, "loss": 0.0079, "step": 8770 }, { "epoch": 0.9343703388024718, "eval_loss": 0.0027819545939564705, "eval_runtime": 35.0694, "eval_samples_per_second": 4282.194, "eval_steps_per_second": 66.924, "step": 8770 }, { "epoch": 0.9354357553803537, "grad_norm": 0.30577364563941956, "learning_rate": 1.2912848923929257e-06, "loss": 0.0013, "step": 8780 }, { "epoch": 0.9354357553803537, "eval_loss": 0.0027924508322030306, "eval_runtime": 35.0496, "eval_samples_per_second": 4284.612, "eval_steps_per_second": 66.962, "step": 8780 }, { "epoch": 0.9365011719582357, "grad_norm": 0.040551621466875076, "learning_rate": 1.2699765608352867e-06, "loss": 0.0008, "step": 8790 }, { "epoch": 0.9365011719582357, "eval_loss": 0.0028144929092377424, "eval_runtime": 35.0432, "eval_samples_per_second": 4285.396, "eval_steps_per_second": 66.974, "step": 8790 }, { "epoch": 0.9375665885361176, "grad_norm": 0.265171080827713, "learning_rate": 1.2486682292776477e-06, "loss": 0.0008, "step": 8800 }, { "epoch": 0.9375665885361176, "eval_loss": 0.002830737503245473, "eval_runtime": 35.0513, "eval_samples_per_second": 4284.411, "eval_steps_per_second": 66.959, "step": 8800 }, { "epoch": 0.9386320051139996, "grad_norm": 0.02604043483734131, "learning_rate": 1.2273598977200087e-06, "loss": 0.0001, "step": 8810 }, { "epoch": 0.9386320051139996, "eval_loss": 0.0028454142156988382, "eval_runtime": 35.0453, "eval_samples_per_second": 4285.141, "eval_steps_per_second": 66.97, "step": 8810 }, { "epoch": 0.9396974216918815, "grad_norm": 0.0020725736394524574, "learning_rate": 1.2060515661623697e-06, "loss": 0.0006, "step": 8820 }, { "epoch": 0.9396974216918815, "eval_loss": 0.002855469472706318, "eval_runtime": 35.0566, "eval_samples_per_second": 4283.757, "eval_steps_per_second": 66.949, "step": 8820 }, { "epoch": 0.9407628382697635, "grad_norm": 0.09126020967960358, "learning_rate": 1.1847432346047305e-06, "loss": 0.0004, "step": 8830 }, { "epoch": 0.9407628382697635, "eval_loss": 0.0028640632517635822, "eval_runtime": 35.0432, "eval_samples_per_second": 4285.401, "eval_steps_per_second": 66.975, "step": 8830 }, { "epoch": 0.9418282548476454, "grad_norm": 0.0015713806496933103, "learning_rate": 1.1634349030470915e-06, "loss": 0.0008, "step": 8840 }, { "epoch": 0.9418282548476454, "eval_loss": 0.0028736621607095003, "eval_runtime": 35.0791, "eval_samples_per_second": 4281.007, "eval_steps_per_second": 66.906, "step": 8840 }, { "epoch": 0.9428936714255274, "grad_norm": 0.001434053760021925, "learning_rate": 1.1421265714894525e-06, "loss": 0.0066, "step": 8850 }, { "epoch": 0.9428936714255274, "eval_loss": 0.0028831621166318655, "eval_runtime": 35.0581, "eval_samples_per_second": 4283.57, "eval_steps_per_second": 66.946, "step": 8850 }, { "epoch": 0.9439590880034093, "grad_norm": 0.0015935949049890041, "learning_rate": 1.1208182399318134e-06, "loss": 0.0049, "step": 8860 }, { "epoch": 0.9439590880034093, "eval_loss": 0.0028416782151907682, "eval_runtime": 35.0629, "eval_samples_per_second": 4282.99, "eval_steps_per_second": 66.937, "step": 8860 }, { "epoch": 0.9450245045812913, "grad_norm": 0.0035525760613381863, "learning_rate": 1.0995099083741744e-06, "loss": 0.0024, "step": 8870 }, { "epoch": 0.9450245045812913, "eval_loss": 0.002823463175445795, "eval_runtime": 35.0712, "eval_samples_per_second": 4281.969, "eval_steps_per_second": 66.921, "step": 8870 }, { "epoch": 0.9460899211591732, "grad_norm": 0.003755433950573206, "learning_rate": 1.0782015768165354e-06, "loss": 0.0005, "step": 8880 }, { "epoch": 0.9460899211591732, "eval_loss": 0.002827939111739397, "eval_runtime": 35.0628, "eval_samples_per_second": 4282.996, "eval_steps_per_second": 66.937, "step": 8880 }, { "epoch": 0.9471553377370552, "grad_norm": 0.005914956331253052, "learning_rate": 1.0568932452588964e-06, "loss": 0.001, "step": 8890 }, { "epoch": 0.9471553377370552, "eval_loss": 0.0028441580943763256, "eval_runtime": 35.0361, "eval_samples_per_second": 4286.265, "eval_steps_per_second": 66.988, "step": 8890 }, { "epoch": 0.9482207543149371, "grad_norm": 0.007786376867443323, "learning_rate": 1.0355849137012574e-06, "loss": 0.0025, "step": 8900 }, { "epoch": 0.9482207543149371, "eval_loss": 0.0028372537344694138, "eval_runtime": 35.0638, "eval_samples_per_second": 4282.882, "eval_steps_per_second": 66.935, "step": 8900 }, { "epoch": 0.9492861708928191, "grad_norm": 0.01083774771541357, "learning_rate": 1.0142765821436182e-06, "loss": 0.0007, "step": 8910 }, { "epoch": 0.9492861708928191, "eval_loss": 0.0028390076477080584, "eval_runtime": 35.0497, "eval_samples_per_second": 4284.598, "eval_steps_per_second": 66.962, "step": 8910 }, { "epoch": 0.950351587470701, "grad_norm": 0.0015453147934749722, "learning_rate": 9.929682505859792e-07, "loss": 0.0024, "step": 8920 }, { "epoch": 0.950351587470701, "eval_loss": 0.002836124738678336, "eval_runtime": 35.0833, "eval_samples_per_second": 4280.504, "eval_steps_per_second": 66.898, "step": 8920 }, { "epoch": 0.951417004048583, "grad_norm": 0.003299474949017167, "learning_rate": 9.716599190283402e-07, "loss": 0.0002, "step": 8930 }, { "epoch": 0.951417004048583, "eval_loss": 0.0028344527818262577, "eval_runtime": 35.0755, "eval_samples_per_second": 4281.445, "eval_steps_per_second": 66.913, "step": 8930 }, { "epoch": 0.9524824206264649, "grad_norm": 0.001612770720385015, "learning_rate": 9.503515874707012e-07, "loss": 0.0002, "step": 8940 }, { "epoch": 0.9524824206264649, "eval_loss": 0.0028346776962280273, "eval_runtime": 35.0638, "eval_samples_per_second": 4282.881, "eval_steps_per_second": 66.935, "step": 8940 }, { "epoch": 0.9535478372043469, "grad_norm": 0.00152910640463233, "learning_rate": 9.29043255913062e-07, "loss": 0.0001, "step": 8950 }, { "epoch": 0.9535478372043469, "eval_loss": 0.0028353093657642603, "eval_runtime": 35.047, "eval_samples_per_second": 4284.935, "eval_steps_per_second": 66.967, "step": 8950 }, { "epoch": 0.9546132537822288, "grad_norm": 0.0018316495697945356, "learning_rate": 9.07734924355423e-07, "loss": 0.0001, "step": 8960 }, { "epoch": 0.9546132537822288, "eval_loss": 0.0028363701421767473, "eval_runtime": 35.0263, "eval_samples_per_second": 4287.461, "eval_steps_per_second": 67.007, "step": 8960 }, { "epoch": 0.9556786703601108, "grad_norm": 0.0018473445670679212, "learning_rate": 8.86426592797784e-07, "loss": 0.0048, "step": 8970 }, { "epoch": 0.9556786703601108, "eval_loss": 0.002827234333381057, "eval_runtime": 35.0517, "eval_samples_per_second": 4284.351, "eval_steps_per_second": 66.958, "step": 8970 }, { "epoch": 0.9567440869379927, "grad_norm": 0.0031582904048264027, "learning_rate": 8.65118261240145e-07, "loss": 0.0006, "step": 8980 }, { "epoch": 0.9567440869379927, "eval_loss": 0.0028260373510420322, "eval_runtime": 35.0605, "eval_samples_per_second": 4283.28, "eval_steps_per_second": 66.941, "step": 8980 }, { "epoch": 0.9578095035158747, "grad_norm": 0.001939519657753408, "learning_rate": 8.43809929682506e-07, "loss": 0.0007, "step": 8990 }, { "epoch": 0.9578095035158747, "eval_loss": 0.0028355128597468138, "eval_runtime": 35.1082, "eval_samples_per_second": 4277.462, "eval_steps_per_second": 66.85, "step": 8990 }, { "epoch": 0.9588749200937566, "grad_norm": 0.0012444235617294908, "learning_rate": 8.225015981248669e-07, "loss": 0.0005, "step": 9000 }, { "epoch": 0.9588749200937566, "eval_loss": 0.002848832868039608, "eval_runtime": 35.0665, "eval_samples_per_second": 4282.551, "eval_steps_per_second": 66.93, "step": 9000 }, { "epoch": 0.9599403366716386, "grad_norm": 0.028200862929224968, "learning_rate": 8.011932665672279e-07, "loss": 0.0089, "step": 9010 }, { "epoch": 0.9599403366716386, "eval_loss": 0.0028532061260193586, "eval_runtime": 35.0678, "eval_samples_per_second": 4282.385, "eval_steps_per_second": 66.927, "step": 9010 }, { "epoch": 0.9610057532495205, "grad_norm": 0.07253487408161163, "learning_rate": 7.798849350095889e-07, "loss": 0.0003, "step": 9020 }, { "epoch": 0.9610057532495205, "eval_loss": 0.0028550319839268923, "eval_runtime": 35.0826, "eval_samples_per_second": 4280.586, "eval_steps_per_second": 66.899, "step": 9020 }, { "epoch": 0.9620711698274025, "grad_norm": 0.0023238463327288628, "learning_rate": 7.585766034519499e-07, "loss": 0.0026, "step": 9030 }, { "epoch": 0.9620711698274025, "eval_loss": 0.002850407036021352, "eval_runtime": 35.0397, "eval_samples_per_second": 4285.823, "eval_steps_per_second": 66.981, "step": 9030 }, { "epoch": 0.9631365864052844, "grad_norm": 0.0029777686577290297, "learning_rate": 7.372682718943107e-07, "loss": 0.0151, "step": 9040 }, { "epoch": 0.9631365864052844, "eval_loss": 0.002831405494362116, "eval_runtime": 35.0457, "eval_samples_per_second": 4285.097, "eval_steps_per_second": 66.97, "step": 9040 }, { "epoch": 0.9642020029831664, "grad_norm": 0.0014590666396543384, "learning_rate": 7.159599403366717e-07, "loss": 0.0007, "step": 9050 }, { "epoch": 0.9642020029831664, "eval_loss": 0.002821285743266344, "eval_runtime": 35.0131, "eval_samples_per_second": 4289.085, "eval_steps_per_second": 67.032, "step": 9050 }, { "epoch": 0.9652674195610483, "grad_norm": 0.01177013386040926, "learning_rate": 6.946516087790327e-07, "loss": 0.0005, "step": 9060 }, { "epoch": 0.9652674195610483, "eval_loss": 0.002820658963173628, "eval_runtime": 35.0093, "eval_samples_per_second": 4289.544, "eval_steps_per_second": 67.039, "step": 9060 }, { "epoch": 0.9663328361389303, "grad_norm": 0.016290990635752678, "learning_rate": 6.733432772213937e-07, "loss": 0.0005, "step": 9070 }, { "epoch": 0.9663328361389303, "eval_loss": 0.002823216374963522, "eval_runtime": 35.0349, "eval_samples_per_second": 4286.409, "eval_steps_per_second": 66.99, "step": 9070 }, { "epoch": 0.9673982527168122, "grad_norm": 0.03478045016527176, "learning_rate": 6.520349456637545e-07, "loss": 0.0027, "step": 9080 }, { "epoch": 0.9673982527168122, "eval_loss": 0.0028139299247413874, "eval_runtime": 35.0138, "eval_samples_per_second": 4288.995, "eval_steps_per_second": 67.031, "step": 9080 }, { "epoch": 0.9684636692946942, "grad_norm": 0.001305173384025693, "learning_rate": 6.307266141061155e-07, "loss": 0.0035, "step": 9090 }, { "epoch": 0.9684636692946942, "eval_loss": 0.0028059857431799173, "eval_runtime": 35.0574, "eval_samples_per_second": 4283.659, "eval_steps_per_second": 66.947, "step": 9090 }, { "epoch": 0.9695290858725761, "grad_norm": 0.017361849546432495, "learning_rate": 6.094182825484765e-07, "loss": 0.0022, "step": 9100 }, { "epoch": 0.9695290858725761, "eval_loss": 0.002785086864605546, "eval_runtime": 35.026, "eval_samples_per_second": 4287.499, "eval_steps_per_second": 67.007, "step": 9100 }, { "epoch": 0.9705945024504581, "grad_norm": 0.0025778792332857847, "learning_rate": 5.881099509908375e-07, "loss": 0.0006, "step": 9110 }, { "epoch": 0.9705945024504581, "eval_loss": 0.002780887531116605, "eval_runtime": 35.0435, "eval_samples_per_second": 4285.355, "eval_steps_per_second": 66.974, "step": 9110 }, { "epoch": 0.97165991902834, "grad_norm": 0.7138797044754028, "learning_rate": 5.668016194331984e-07, "loss": 0.0061, "step": 9120 }, { "epoch": 0.97165991902834, "eval_loss": 0.002781209535896778, "eval_runtime": 34.9997, "eval_samples_per_second": 4290.726, "eval_steps_per_second": 67.058, "step": 9120 }, { "epoch": 0.972725335606222, "grad_norm": 0.11075238883495331, "learning_rate": 5.454932878755593e-07, "loss": 0.0007, "step": 9130 }, { "epoch": 0.972725335606222, "eval_loss": 0.0027841285336762667, "eval_runtime": 35.0754, "eval_samples_per_second": 4281.462, "eval_steps_per_second": 66.913, "step": 9130 }, { "epoch": 0.9737907521841039, "grad_norm": 0.0027794514317065477, "learning_rate": 5.241849563179203e-07, "loss": 0.0002, "step": 9140 }, { "epoch": 0.9737907521841039, "eval_loss": 0.0027846924494951963, "eval_runtime": 35.0165, "eval_samples_per_second": 4288.667, "eval_steps_per_second": 67.026, "step": 9140 }, { "epoch": 0.9748561687619859, "grad_norm": 0.00787454191595316, "learning_rate": 5.028766247602813e-07, "loss": 0.006, "step": 9150 }, { "epoch": 0.9748561687619859, "eval_loss": 0.0027796956710517406, "eval_runtime": 35.0181, "eval_samples_per_second": 4288.464, "eval_steps_per_second": 67.022, "step": 9150 }, { "epoch": 0.9759215853398678, "grad_norm": 0.04696900025010109, "learning_rate": 4.815682932026423e-07, "loss": 0.0103, "step": 9160 }, { "epoch": 0.9759215853398678, "eval_loss": 0.002768127480521798, "eval_runtime": 35.0038, "eval_samples_per_second": 4290.223, "eval_steps_per_second": 67.05, "step": 9160 }, { "epoch": 0.9769870019177498, "grad_norm": 0.0021065620239824057, "learning_rate": 4.6025996164500324e-07, "loss": 0.0017, "step": 9170 }, { "epoch": 0.9769870019177498, "eval_loss": 0.0027594445273280144, "eval_runtime": 35.1389, "eval_samples_per_second": 4273.724, "eval_steps_per_second": 66.792, "step": 9170 }, { "epoch": 0.9780524184956317, "grad_norm": 0.01714223064482212, "learning_rate": 4.389516300873642e-07, "loss": 0.0008, "step": 9180 }, { "epoch": 0.9780524184956317, "eval_loss": 0.00275549478828907, "eval_runtime": 35.0542, "eval_samples_per_second": 4284.05, "eval_steps_per_second": 66.953, "step": 9180 }, { "epoch": 0.9791178350735138, "grad_norm": 0.0016872499836608768, "learning_rate": 4.1764329852972517e-07, "loss": 0.0007, "step": 9190 }, { "epoch": 0.9791178350735138, "eval_loss": 0.0027567828074097633, "eval_runtime": 35.0654, "eval_samples_per_second": 4282.682, "eval_steps_per_second": 66.932, "step": 9190 }, { "epoch": 0.9801832516513957, "grad_norm": 0.06704606115818024, "learning_rate": 3.963349669720861e-07, "loss": 0.0025, "step": 9200 }, { "epoch": 0.9801832516513957, "eval_loss": 0.00275617279112339, "eval_runtime": 35.0662, "eval_samples_per_second": 4282.591, "eval_steps_per_second": 66.931, "step": 9200 }, { "epoch": 0.9812486682292777, "grad_norm": 0.0024131489917635918, "learning_rate": 3.750266354144471e-07, "loss": 0.001, "step": 9210 }, { "epoch": 0.9812486682292777, "eval_loss": 0.0027577125001698732, "eval_runtime": 35.0511, "eval_samples_per_second": 4284.426, "eval_steps_per_second": 66.959, "step": 9210 }, { "epoch": 0.9823140848071596, "grad_norm": 0.0015416039386764169, "learning_rate": 3.5371830385680803e-07, "loss": 0.002, "step": 9220 }, { "epoch": 0.9823140848071596, "eval_loss": 0.0027562561444938183, "eval_runtime": 35.0509, "eval_samples_per_second": 4284.459, "eval_steps_per_second": 66.96, "step": 9220 }, { "epoch": 0.9833795013850416, "grad_norm": 0.6929004192352295, "learning_rate": 3.32409972299169e-07, "loss": 0.0022, "step": 9230 }, { "epoch": 0.9833795013850416, "eval_loss": 0.0027580568566918373, "eval_runtime": 35.0336, "eval_samples_per_second": 4286.575, "eval_steps_per_second": 66.993, "step": 9230 }, { "epoch": 0.9844449179629235, "grad_norm": 0.0025643545668572187, "learning_rate": 3.1110164074152996e-07, "loss": 0.0008, "step": 9240 }, { "epoch": 0.9844449179629235, "eval_loss": 0.002759452909231186, "eval_runtime": 35.0344, "eval_samples_per_second": 4286.471, "eval_steps_per_second": 66.991, "step": 9240 }, { "epoch": 0.9855103345408055, "grad_norm": 0.002499173628166318, "learning_rate": 2.8979330918389095e-07, "loss": 0.0002, "step": 9250 }, { "epoch": 0.9855103345408055, "eval_loss": 0.0027610480319708586, "eval_runtime": 35.0262, "eval_samples_per_second": 4287.473, "eval_steps_per_second": 67.007, "step": 9250 }, { "epoch": 0.9865757511186874, "grad_norm": 0.0013231937773525715, "learning_rate": 2.684849776262519e-07, "loss": 0.0014, "step": 9260 }, { "epoch": 0.9865757511186874, "eval_loss": 0.002763622673228383, "eval_runtime": 35.0291, "eval_samples_per_second": 4287.125, "eval_steps_per_second": 67.001, "step": 9260 }, { "epoch": 0.9876411676965694, "grad_norm": 0.001755521516315639, "learning_rate": 2.471766460686129e-07, "loss": 0.0029, "step": 9270 }, { "epoch": 0.9876411676965694, "eval_loss": 0.002763139782473445, "eval_runtime": 35.0386, "eval_samples_per_second": 4285.957, "eval_steps_per_second": 66.983, "step": 9270 }, { "epoch": 0.9887065842744513, "grad_norm": 0.20544728636741638, "learning_rate": 2.258683145109738e-07, "loss": 0.0009, "step": 9280 }, { "epoch": 0.9887065842744513, "eval_loss": 0.0027635886799544096, "eval_runtime": 35.0717, "eval_samples_per_second": 4281.912, "eval_steps_per_second": 66.92, "step": 9280 }, { "epoch": 0.9897720008523333, "grad_norm": 0.06573604047298431, "learning_rate": 2.0455998295333478e-07, "loss": 0.0002, "step": 9290 }, { "epoch": 0.9897720008523333, "eval_loss": 0.0027646832168102264, "eval_runtime": 34.9956, "eval_samples_per_second": 4291.22, "eval_steps_per_second": 67.065, "step": 9290 }, { "epoch": 0.9908374174302153, "grad_norm": 0.002524553332477808, "learning_rate": 1.8325165139569574e-07, "loss": 0.0004, "step": 9300 }, { "epoch": 0.9908374174302153, "eval_loss": 0.0027653006836771965, "eval_runtime": 35.0245, "eval_samples_per_second": 4287.681, "eval_steps_per_second": 67.01, "step": 9300 }, { "epoch": 0.9919028340080972, "grad_norm": 0.0033023718278855085, "learning_rate": 1.619433198380567e-07, "loss": 0.0058, "step": 9310 }, { "epoch": 0.9919028340080972, "eval_loss": 0.0027657628525048494, "eval_runtime": 35.0047, "eval_samples_per_second": 4290.116, "eval_steps_per_second": 67.048, "step": 9310 }, { "epoch": 0.9929682505859792, "grad_norm": 0.058614350855350494, "learning_rate": 1.4063498828041767e-07, "loss": 0.0003, "step": 9320 }, { "epoch": 0.9929682505859792, "eval_loss": 0.002766667865216732, "eval_runtime": 34.9979, "eval_samples_per_second": 4290.938, "eval_steps_per_second": 67.061, "step": 9320 }, { "epoch": 0.9940336671638611, "grad_norm": 0.13814635574817657, "learning_rate": 1.193266567227786e-07, "loss": 0.0087, "step": 9330 }, { "epoch": 0.9940336671638611, "eval_loss": 0.0027665847446769476, "eval_runtime": 35.0199, "eval_samples_per_second": 4288.253, "eval_steps_per_second": 67.019, "step": 9330 }, { "epoch": 0.995099083741743, "grad_norm": 0.007654257118701935, "learning_rate": 9.801832516513957e-08, "loss": 0.0071, "step": 9340 }, { "epoch": 0.995099083741743, "eval_loss": 0.002764417789876461, "eval_runtime": 35.033, "eval_samples_per_second": 4286.649, "eval_steps_per_second": 66.994, "step": 9340 }, { "epoch": 0.996164500319625, "grad_norm": 0.0012485783081501722, "learning_rate": 7.670999360750054e-08, "loss": 0.0159, "step": 9350 }, { "epoch": 0.996164500319625, "eval_loss": 0.0027642108034342527, "eval_runtime": 35.05, "eval_samples_per_second": 4284.566, "eval_steps_per_second": 66.962, "step": 9350 }, { "epoch": 0.997229916897507, "grad_norm": 0.0015925171319395304, "learning_rate": 5.54016620498615e-08, "loss": 0.0003, "step": 9360 }, { "epoch": 0.997229916897507, "eval_loss": 0.002764443401247263, "eval_runtime": 35.0619, "eval_samples_per_second": 4283.109, "eval_steps_per_second": 66.939, "step": 9360 }, { "epoch": 0.9982953334753889, "grad_norm": 0.004035618621855974, "learning_rate": 3.409333049222246e-08, "loss": 0.0108, "step": 9370 }, { "epoch": 0.9982953334753889, "eval_loss": 0.002764328382909298, "eval_runtime": 34.9973, "eval_samples_per_second": 4291.017, "eval_steps_per_second": 67.062, "step": 9370 }, { "epoch": 0.9993607500532709, "grad_norm": 0.0013912487775087357, "learning_rate": 1.2784998934583423e-08, "loss": 0.0002, "step": 9380 }, { "epoch": 0.9993607500532709, "eval_loss": 0.0027642655186355114, "eval_runtime": 35.0448, "eval_samples_per_second": 4285.205, "eval_steps_per_second": 66.971, "step": 9380 } ], "logging_steps": 10, "max_steps": 9386, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 302396856261120.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }