{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9997069597069597, "eval_steps": 100, "global_step": 1706, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005860805860805861, "grad_norm": 2.521568536758423, "learning_rate": 0.00019941383352872216, "loss": 2.3132, "step": 5 }, { "epoch": 0.011721611721611722, "grad_norm": 2.104935884475708, "learning_rate": 0.00019882766705744433, "loss": 0.9444, "step": 10 }, { "epoch": 0.017582417582417582, "grad_norm": 1.2950881719589233, "learning_rate": 0.00019824150058616647, "loss": 0.357, "step": 15 }, { "epoch": 0.023443223443223443, "grad_norm": 1.1862170696258545, "learning_rate": 0.00019765533411488865, "loss": 0.2105, "step": 20 }, { "epoch": 0.029304029304029304, "grad_norm": 0.5247148871421814, "learning_rate": 0.0001970691676436108, "loss": 0.1087, "step": 25 }, { "epoch": 0.035164835164835165, "grad_norm": 1.2535285949707031, "learning_rate": 0.00019648300117233296, "loss": 0.1185, "step": 30 }, { "epoch": 0.041025641025641026, "grad_norm": 0.5407606959342957, "learning_rate": 0.0001958968347010551, "loss": 0.0775, "step": 35 }, { "epoch": 0.046886446886446886, "grad_norm": 1.137266993522644, "learning_rate": 0.00019531066822977726, "loss": 0.0728, "step": 40 }, { "epoch": 0.05274725274725275, "grad_norm": 0.5244052410125732, "learning_rate": 0.00019472450175849943, "loss": 0.056, "step": 45 }, { "epoch": 0.05860805860805861, "grad_norm": 0.4511496126651764, "learning_rate": 0.00019413833528722157, "loss": 0.0648, "step": 50 }, { "epoch": 0.06446886446886448, "grad_norm": 0.33913975954055786, "learning_rate": 0.00019355216881594375, "loss": 0.0479, "step": 55 }, { "epoch": 0.07032967032967033, "grad_norm": 0.354777991771698, "learning_rate": 0.0001929660023446659, "loss": 0.0758, "step": 60 }, { "epoch": 0.0761904761904762, "grad_norm": 0.3968910276889801, "learning_rate": 0.00019237983587338807, "loss": 0.0776, "step": 65 }, { "epoch": 0.08205128205128205, "grad_norm": 0.5061652660369873, "learning_rate": 0.0001917936694021102, "loss": 0.0349, "step": 70 }, { "epoch": 0.08791208791208792, "grad_norm": 0.24455250799655914, "learning_rate": 0.00019120750293083236, "loss": 0.0369, "step": 75 }, { "epoch": 0.09377289377289377, "grad_norm": 0.3959537446498871, "learning_rate": 0.00019062133645955453, "loss": 0.0513, "step": 80 }, { "epoch": 0.09963369963369964, "grad_norm": 0.4534469544887543, "learning_rate": 0.00019003516998827668, "loss": 0.0459, "step": 85 }, { "epoch": 0.1054945054945055, "grad_norm": 0.30694451928138733, "learning_rate": 0.00018944900351699885, "loss": 0.0377, "step": 90 }, { "epoch": 0.11135531135531136, "grad_norm": 0.15078052878379822, "learning_rate": 0.000188862837045721, "loss": 0.0297, "step": 95 }, { "epoch": 0.11721611721611722, "grad_norm": 0.3153330981731415, "learning_rate": 0.00018827667057444317, "loss": 0.0301, "step": 100 }, { "epoch": 0.11721611721611722, "eval_loss": 0.028654273599386215, "eval_runtime": 4.8016, "eval_samples_per_second": 5.831, "eval_steps_per_second": 2.916, "step": 100 }, { "epoch": 0.12307692307692308, "grad_norm": 0.5777396559715271, "learning_rate": 0.0001876905041031653, "loss": 0.0515, "step": 105 }, { "epoch": 0.12893772893772895, "grad_norm": 0.3677718937397003, "learning_rate": 0.00018710433763188746, "loss": 0.0315, "step": 110 }, { "epoch": 0.1347985347985348, "grad_norm": 0.30658507347106934, "learning_rate": 0.0001865181711606096, "loss": 0.0248, "step": 115 }, { "epoch": 0.14065934065934066, "grad_norm": 0.21649648249149323, "learning_rate": 0.00018593200468933178, "loss": 0.0352, "step": 120 }, { "epoch": 0.14652014652014653, "grad_norm": 0.2877885103225708, "learning_rate": 0.00018534583821805395, "loss": 0.0456, "step": 125 }, { "epoch": 0.1523809523809524, "grad_norm": 0.1782904863357544, "learning_rate": 0.0001847596717467761, "loss": 0.0257, "step": 130 }, { "epoch": 0.15824175824175823, "grad_norm": 0.1211300641298294, "learning_rate": 0.00018417350527549827, "loss": 0.0437, "step": 135 }, { "epoch": 0.1641025641025641, "grad_norm": 0.298053503036499, "learning_rate": 0.0001835873388042204, "loss": 0.036, "step": 140 }, { "epoch": 0.16996336996336997, "grad_norm": 0.2280658483505249, "learning_rate": 0.00018300117233294256, "loss": 0.0331, "step": 145 }, { "epoch": 0.17582417582417584, "grad_norm": 0.14896267652511597, "learning_rate": 0.0001824150058616647, "loss": 0.0239, "step": 150 }, { "epoch": 0.18168498168498168, "grad_norm": 0.1769961267709732, "learning_rate": 0.00018182883939038688, "loss": 0.0375, "step": 155 }, { "epoch": 0.18754578754578755, "grad_norm": 0.19849297404289246, "learning_rate": 0.00018124267291910902, "loss": 0.0357, "step": 160 }, { "epoch": 0.1934065934065934, "grad_norm": 0.2294420450925827, "learning_rate": 0.0001806565064478312, "loss": 0.0504, "step": 165 }, { "epoch": 0.19926739926739928, "grad_norm": 0.10744224488735199, "learning_rate": 0.00018007033997655337, "loss": 0.0209, "step": 170 }, { "epoch": 0.20512820512820512, "grad_norm": 0.06066066771745682, "learning_rate": 0.00017948417350527551, "loss": 0.0175, "step": 175 }, { "epoch": 0.210989010989011, "grad_norm": 0.5421485304832458, "learning_rate": 0.00017889800703399766, "loss": 0.0398, "step": 180 }, { "epoch": 0.21684981684981686, "grad_norm": 0.1306767612695694, "learning_rate": 0.0001783118405627198, "loss": 0.0258, "step": 185 }, { "epoch": 0.22271062271062272, "grad_norm": 0.11510124802589417, "learning_rate": 0.00017772567409144198, "loss": 0.0253, "step": 190 }, { "epoch": 0.22857142857142856, "grad_norm": 0.20577751100063324, "learning_rate": 0.00017713950762016412, "loss": 0.0277, "step": 195 }, { "epoch": 0.23443223443223443, "grad_norm": 0.20588932931423187, "learning_rate": 0.0001765533411488863, "loss": 0.0296, "step": 200 }, { "epoch": 0.23443223443223443, "eval_loss": 0.019306689500808716, "eval_runtime": 4.7344, "eval_samples_per_second": 5.914, "eval_steps_per_second": 2.957, "step": 200 }, { "epoch": 0.2402930402930403, "grad_norm": 0.13713975250720978, "learning_rate": 0.00017596717467760847, "loss": 0.0372, "step": 205 }, { "epoch": 0.24615384615384617, "grad_norm": 0.14788508415222168, "learning_rate": 0.00017538100820633061, "loss": 0.033, "step": 210 }, { "epoch": 0.252014652014652, "grad_norm": 0.2372630536556244, "learning_rate": 0.00017479484173505276, "loss": 0.0281, "step": 215 }, { "epoch": 0.2578754578754579, "grad_norm": 0.2357954978942871, "learning_rate": 0.0001742086752637749, "loss": 0.0295, "step": 220 }, { "epoch": 0.26373626373626374, "grad_norm": 0.2537606358528137, "learning_rate": 0.00017362250879249708, "loss": 0.036, "step": 225 }, { "epoch": 0.2695970695970696, "grad_norm": 0.2222289741039276, "learning_rate": 0.00017303634232121922, "loss": 0.0402, "step": 230 }, { "epoch": 0.2754578754578755, "grad_norm": 0.19696177542209625, "learning_rate": 0.0001724501758499414, "loss": 0.025, "step": 235 }, { "epoch": 0.2813186813186813, "grad_norm": 0.08915109932422638, "learning_rate": 0.00017186400937866357, "loss": 0.0292, "step": 240 }, { "epoch": 0.28717948717948716, "grad_norm": 0.39625948667526245, "learning_rate": 0.00017127784290738572, "loss": 0.0324, "step": 245 }, { "epoch": 0.29304029304029305, "grad_norm": 0.05654177442193031, "learning_rate": 0.00017069167643610786, "loss": 0.0384, "step": 250 }, { "epoch": 0.2989010989010989, "grad_norm": 0.23707075417041779, "learning_rate": 0.00017010550996483, "loss": 0.0311, "step": 255 }, { "epoch": 0.3047619047619048, "grad_norm": 0.2619571387767792, "learning_rate": 0.00016951934349355218, "loss": 0.0275, "step": 260 }, { "epoch": 0.31062271062271063, "grad_norm": 0.11028550565242767, "learning_rate": 0.00016893317702227432, "loss": 0.0194, "step": 265 }, { "epoch": 0.31648351648351647, "grad_norm": 0.26710912585258484, "learning_rate": 0.0001683470105509965, "loss": 0.0318, "step": 270 }, { "epoch": 0.32234432234432236, "grad_norm": 0.20064710080623627, "learning_rate": 0.00016776084407971864, "loss": 0.0517, "step": 275 }, { "epoch": 0.3282051282051282, "grad_norm": 0.06760745495557785, "learning_rate": 0.00016717467760844082, "loss": 0.0223, "step": 280 }, { "epoch": 0.33406593406593404, "grad_norm": 0.14518442749977112, "learning_rate": 0.00016658851113716296, "loss": 0.0216, "step": 285 }, { "epoch": 0.33992673992673994, "grad_norm": 0.35427016019821167, "learning_rate": 0.0001660023446658851, "loss": 0.0268, "step": 290 }, { "epoch": 0.3457875457875458, "grad_norm": 0.14693213999271393, "learning_rate": 0.00016541617819460728, "loss": 0.0246, "step": 295 }, { "epoch": 0.3516483516483517, "grad_norm": 0.2351713478565216, "learning_rate": 0.00016483001172332943, "loss": 0.0399, "step": 300 }, { "epoch": 0.3516483516483517, "eval_loss": 0.02264154888689518, "eval_runtime": 4.8013, "eval_samples_per_second": 5.832, "eval_steps_per_second": 2.916, "step": 300 }, { "epoch": 0.3575091575091575, "grad_norm": 0.16985514760017395, "learning_rate": 0.0001642438452520516, "loss": 0.0243, "step": 305 }, { "epoch": 0.36336996336996336, "grad_norm": 0.199642613530159, "learning_rate": 0.00016365767878077374, "loss": 0.0329, "step": 310 }, { "epoch": 0.36923076923076925, "grad_norm": 0.05538804084062576, "learning_rate": 0.00016307151230949592, "loss": 0.0474, "step": 315 }, { "epoch": 0.3750915750915751, "grad_norm": 0.1542443037033081, "learning_rate": 0.00016248534583821806, "loss": 0.0232, "step": 320 }, { "epoch": 0.38095238095238093, "grad_norm": 0.16808335483074188, "learning_rate": 0.0001618991793669402, "loss": 0.0272, "step": 325 }, { "epoch": 0.3868131868131868, "grad_norm": 0.20816853642463684, "learning_rate": 0.00016131301289566238, "loss": 0.0375, "step": 330 }, { "epoch": 0.39267399267399267, "grad_norm": 0.10338038206100464, "learning_rate": 0.00016072684642438453, "loss": 0.0233, "step": 335 }, { "epoch": 0.39853479853479856, "grad_norm": 0.06522126495838165, "learning_rate": 0.0001601406799531067, "loss": 0.0288, "step": 340 }, { "epoch": 0.4043956043956044, "grad_norm": 0.11830403655767441, "learning_rate": 0.00015955451348182884, "loss": 0.0164, "step": 345 }, { "epoch": 0.41025641025641024, "grad_norm": 0.2777006924152374, "learning_rate": 0.00015896834701055102, "loss": 0.0345, "step": 350 }, { "epoch": 0.41611721611721614, "grad_norm": 0.1277918815612793, "learning_rate": 0.00015838218053927316, "loss": 0.0229, "step": 355 }, { "epoch": 0.421978021978022, "grad_norm": 0.09861145913600922, "learning_rate": 0.0001577960140679953, "loss": 0.0181, "step": 360 }, { "epoch": 0.4278388278388278, "grad_norm": 0.08698171377182007, "learning_rate": 0.00015720984759671748, "loss": 0.0365, "step": 365 }, { "epoch": 0.4336996336996337, "grad_norm": 0.23488883674144745, "learning_rate": 0.00015662368112543963, "loss": 0.0352, "step": 370 }, { "epoch": 0.43956043956043955, "grad_norm": 0.05140375718474388, "learning_rate": 0.0001560375146541618, "loss": 0.0235, "step": 375 }, { "epoch": 0.44542124542124545, "grad_norm": 0.15759135782718658, "learning_rate": 0.00015545134818288395, "loss": 0.0236, "step": 380 }, { "epoch": 0.4512820512820513, "grad_norm": 0.07523085922002792, "learning_rate": 0.00015486518171160612, "loss": 0.0218, "step": 385 }, { "epoch": 0.45714285714285713, "grad_norm": 0.1892630159854889, "learning_rate": 0.00015427901524032826, "loss": 0.0264, "step": 390 }, { "epoch": 0.463003663003663, "grad_norm": 0.12749487161636353, "learning_rate": 0.0001536928487690504, "loss": 0.0414, "step": 395 }, { "epoch": 0.46886446886446886, "grad_norm": 0.11519593745470047, "learning_rate": 0.00015310668229777258, "loss": 0.0173, "step": 400 }, { "epoch": 0.46886446886446886, "eval_loss": 0.01765686459839344, "eval_runtime": 4.8182, "eval_samples_per_second": 5.811, "eval_steps_per_second": 2.906, "step": 400 }, { "epoch": 0.4747252747252747, "grad_norm": 0.14240577816963196, "learning_rate": 0.00015252051582649473, "loss": 0.0214, "step": 405 }, { "epoch": 0.4805860805860806, "grad_norm": 0.11283282935619354, "learning_rate": 0.0001519343493552169, "loss": 0.0266, "step": 410 }, { "epoch": 0.48644688644688644, "grad_norm": 0.16279707849025726, "learning_rate": 0.00015134818288393905, "loss": 0.0274, "step": 415 }, { "epoch": 0.49230769230769234, "grad_norm": 0.07319923490285873, "learning_rate": 0.00015076201641266122, "loss": 0.0216, "step": 420 }, { "epoch": 0.4981684981684982, "grad_norm": 0.13532328605651855, "learning_rate": 0.00015017584994138336, "loss": 0.0308, "step": 425 }, { "epoch": 0.504029304029304, "grad_norm": 0.12896127998828888, "learning_rate": 0.0001495896834701055, "loss": 0.0129, "step": 430 }, { "epoch": 0.5098901098901099, "grad_norm": 0.03718201071023941, "learning_rate": 0.00014900351699882766, "loss": 0.0181, "step": 435 }, { "epoch": 0.5157509157509158, "grad_norm": 0.05926808714866638, "learning_rate": 0.00014841735052754983, "loss": 0.0146, "step": 440 }, { "epoch": 0.5216117216117216, "grad_norm": 0.09976931661367416, "learning_rate": 0.00014783118405627197, "loss": 0.0231, "step": 445 }, { "epoch": 0.5274725274725275, "grad_norm": 0.18460391461849213, "learning_rate": 0.00014724501758499415, "loss": 0.02, "step": 450 }, { "epoch": 0.5333333333333333, "grad_norm": 0.07457377016544342, "learning_rate": 0.00014665885111371632, "loss": 0.0209, "step": 455 }, { "epoch": 0.5391941391941392, "grad_norm": 0.03992030769586563, "learning_rate": 0.00014607268464243847, "loss": 0.0251, "step": 460 }, { "epoch": 0.545054945054945, "grad_norm": 0.196414053440094, "learning_rate": 0.0001454865181711606, "loss": 0.0344, "step": 465 }, { "epoch": 0.550915750915751, "grad_norm": 0.19978556036949158, "learning_rate": 0.00014490035169988276, "loss": 0.0159, "step": 470 }, { "epoch": 0.5567765567765568, "grad_norm": 0.11389517784118652, "learning_rate": 0.00014431418522860493, "loss": 0.0197, "step": 475 }, { "epoch": 0.5626373626373626, "grad_norm": 0.07047716528177261, "learning_rate": 0.00014372801875732708, "loss": 0.0138, "step": 480 }, { "epoch": 0.5684981684981685, "grad_norm": 0.10614708811044693, "learning_rate": 0.00014314185228604925, "loss": 0.0208, "step": 485 }, { "epoch": 0.5743589743589743, "grad_norm": 0.20569799840450287, "learning_rate": 0.00014255568581477142, "loss": 0.0203, "step": 490 }, { "epoch": 0.5802197802197803, "grad_norm": 0.19309553503990173, "learning_rate": 0.00014196951934349357, "loss": 0.0282, "step": 495 }, { "epoch": 0.5860805860805861, "grad_norm": 0.07542768865823746, "learning_rate": 0.0001413833528722157, "loss": 0.0173, "step": 500 }, { "epoch": 0.5860805860805861, "eval_loss": 0.022126102820038795, "eval_runtime": 4.8755, "eval_samples_per_second": 5.743, "eval_steps_per_second": 2.872, "step": 500 }, { "epoch": 0.591941391941392, "grad_norm": 0.1205630674958229, "learning_rate": 0.00014079718640093786, "loss": 0.0322, "step": 505 }, { "epoch": 0.5978021978021978, "grad_norm": 0.13761042058467865, "learning_rate": 0.00014021101992966003, "loss": 0.0203, "step": 510 }, { "epoch": 0.6036630036630036, "grad_norm": 0.08595598489046097, "learning_rate": 0.00013962485345838218, "loss": 0.0145, "step": 515 }, { "epoch": 0.6095238095238096, "grad_norm": 0.11087319999933243, "learning_rate": 0.00013903868698710435, "loss": 0.0218, "step": 520 }, { "epoch": 0.6153846153846154, "grad_norm": 0.14962054789066315, "learning_rate": 0.00013845252051582652, "loss": 0.0322, "step": 525 }, { "epoch": 0.6212454212454213, "grad_norm": 0.07573894411325455, "learning_rate": 0.00013786635404454867, "loss": 0.0275, "step": 530 }, { "epoch": 0.6271062271062271, "grad_norm": 0.069780133664608, "learning_rate": 0.0001372801875732708, "loss": 0.0235, "step": 535 }, { "epoch": 0.6329670329670329, "grad_norm": 0.07833613455295563, "learning_rate": 0.00013669402110199296, "loss": 0.0344, "step": 540 }, { "epoch": 0.6388278388278388, "grad_norm": 0.07331829518079758, "learning_rate": 0.00013610785463071513, "loss": 0.0135, "step": 545 }, { "epoch": 0.6446886446886447, "grad_norm": 0.22369089722633362, "learning_rate": 0.00013552168815943728, "loss": 0.0222, "step": 550 }, { "epoch": 0.6505494505494506, "grad_norm": 0.1465146392583847, "learning_rate": 0.00013493552168815945, "loss": 0.0307, "step": 555 }, { "epoch": 0.6564102564102564, "grad_norm": 0.06348715722560883, "learning_rate": 0.00013434935521688162, "loss": 0.0358, "step": 560 }, { "epoch": 0.6622710622710622, "grad_norm": 0.09298256784677505, "learning_rate": 0.00013376318874560377, "loss": 0.0224, "step": 565 }, { "epoch": 0.6681318681318681, "grad_norm": 0.18280836939811707, "learning_rate": 0.00013317702227432591, "loss": 0.0263, "step": 570 }, { "epoch": 0.673992673992674, "grad_norm": 0.07080171257257462, "learning_rate": 0.00013259085580304806, "loss": 0.0192, "step": 575 }, { "epoch": 0.6798534798534799, "grad_norm": 0.11019092798233032, "learning_rate": 0.00013200468933177023, "loss": 0.0211, "step": 580 }, { "epoch": 0.6857142857142857, "grad_norm": 0.13162659108638763, "learning_rate": 0.00013141852286049238, "loss": 0.0284, "step": 585 }, { "epoch": 0.6915750915750916, "grad_norm": 0.19205588102340698, "learning_rate": 0.00013083235638921455, "loss": 0.0245, "step": 590 }, { "epoch": 0.6974358974358974, "grad_norm": 0.0971173569560051, "learning_rate": 0.0001302461899179367, "loss": 0.0216, "step": 595 }, { "epoch": 0.7032967032967034, "grad_norm": 0.2179749310016632, "learning_rate": 0.00012966002344665887, "loss": 0.0268, "step": 600 }, { "epoch": 0.7032967032967034, "eval_loss": 0.01850169710814953, "eval_runtime": 4.6688, "eval_samples_per_second": 5.997, "eval_steps_per_second": 2.999, "step": 600 }, { "epoch": 0.7091575091575092, "grad_norm": 0.23079490661621094, "learning_rate": 0.00012907385697538101, "loss": 0.0269, "step": 605 }, { "epoch": 0.715018315018315, "grad_norm": 0.08325810730457306, "learning_rate": 0.00012848769050410316, "loss": 0.0211, "step": 610 }, { "epoch": 0.7208791208791209, "grad_norm": 0.05983910337090492, "learning_rate": 0.00012790152403282533, "loss": 0.0217, "step": 615 }, { "epoch": 0.7267399267399267, "grad_norm": 0.13831888139247894, "learning_rate": 0.00012731535756154748, "loss": 0.0142, "step": 620 }, { "epoch": 0.7326007326007326, "grad_norm": 0.12296965718269348, "learning_rate": 0.00012672919109026965, "loss": 0.0253, "step": 625 }, { "epoch": 0.7384615384615385, "grad_norm": 0.13777951896190643, "learning_rate": 0.0001261430246189918, "loss": 0.0214, "step": 630 }, { "epoch": 0.7443223443223443, "grad_norm": 0.12136834859848022, "learning_rate": 0.00012555685814771397, "loss": 0.0244, "step": 635 }, { "epoch": 0.7501831501831502, "grad_norm": 0.050576552748680115, "learning_rate": 0.00012497069167643612, "loss": 0.0137, "step": 640 }, { "epoch": 0.756043956043956, "grad_norm": 0.22222141921520233, "learning_rate": 0.00012438452520515826, "loss": 0.0254, "step": 645 }, { "epoch": 0.7619047619047619, "grad_norm": 0.06815624237060547, "learning_rate": 0.00012379835873388043, "loss": 0.0231, "step": 650 }, { "epoch": 0.7677655677655678, "grad_norm": 0.19518345594406128, "learning_rate": 0.00012321219226260258, "loss": 0.0218, "step": 655 }, { "epoch": 0.7736263736263737, "grad_norm": 0.06349798291921616, "learning_rate": 0.00012262602579132475, "loss": 0.0265, "step": 660 }, { "epoch": 0.7794871794871795, "grad_norm": 0.09031341969966888, "learning_rate": 0.00012203985932004688, "loss": 0.0299, "step": 665 }, { "epoch": 0.7853479853479853, "grad_norm": 0.060232892632484436, "learning_rate": 0.00012145369284876906, "loss": 0.0227, "step": 670 }, { "epoch": 0.7912087912087912, "grad_norm": 0.23972396552562714, "learning_rate": 0.00012086752637749122, "loss": 0.0233, "step": 675 }, { "epoch": 0.7970695970695971, "grad_norm": 0.06141636520624161, "learning_rate": 0.00012028135990621336, "loss": 0.017, "step": 680 }, { "epoch": 0.802930402930403, "grad_norm": 0.05603253096342087, "learning_rate": 0.00011969519343493553, "loss": 0.025, "step": 685 }, { "epoch": 0.8087912087912088, "grad_norm": 0.06840907782316208, "learning_rate": 0.0001191090269636577, "loss": 0.0164, "step": 690 }, { "epoch": 0.8146520146520146, "grad_norm": 0.1270790845155716, "learning_rate": 0.00011852286049237984, "loss": 0.0237, "step": 695 }, { "epoch": 0.8205128205128205, "grad_norm": 0.03222518041729927, "learning_rate": 0.00011793669402110198, "loss": 0.0169, "step": 700 }, { "epoch": 0.8205128205128205, "eval_loss": 0.026108432561159134, "eval_runtime": 4.6865, "eval_samples_per_second": 5.975, "eval_steps_per_second": 2.987, "step": 700 }, { "epoch": 0.8263736263736263, "grad_norm": 0.06273896247148514, "learning_rate": 0.00011735052754982416, "loss": 0.0245, "step": 705 }, { "epoch": 0.8322344322344323, "grad_norm": 0.1422451138496399, "learning_rate": 0.00011676436107854632, "loss": 0.0218, "step": 710 }, { "epoch": 0.8380952380952381, "grad_norm": 0.07166247069835663, "learning_rate": 0.00011617819460726846, "loss": 0.0259, "step": 715 }, { "epoch": 0.843956043956044, "grad_norm": 0.13188450038433075, "learning_rate": 0.00011559202813599064, "loss": 0.0185, "step": 720 }, { "epoch": 0.8498168498168498, "grad_norm": 0.11839079111814499, "learning_rate": 0.0001150058616647128, "loss": 0.0196, "step": 725 }, { "epoch": 0.8556776556776556, "grad_norm": 0.09421879053115845, "learning_rate": 0.00011441969519343494, "loss": 0.0207, "step": 730 }, { "epoch": 0.8615384615384616, "grad_norm": 0.11722107976675034, "learning_rate": 0.00011383352872215709, "loss": 0.0286, "step": 735 }, { "epoch": 0.8673992673992674, "grad_norm": 0.07790110260248184, "learning_rate": 0.00011324736225087926, "loss": 0.0157, "step": 740 }, { "epoch": 0.8732600732600733, "grad_norm": 0.11153840273618698, "learning_rate": 0.00011266119577960142, "loss": 0.0184, "step": 745 }, { "epoch": 0.8791208791208791, "grad_norm": 0.07105362415313721, "learning_rate": 0.00011207502930832356, "loss": 0.0193, "step": 750 }, { "epoch": 0.884981684981685, "grad_norm": 0.11616308242082596, "learning_rate": 0.00011148886283704571, "loss": 0.0219, "step": 755 }, { "epoch": 0.8908424908424909, "grad_norm": 0.10045047104358673, "learning_rate": 0.0001109026963657679, "loss": 0.0177, "step": 760 }, { "epoch": 0.8967032967032967, "grad_norm": 0.07033990323543549, "learning_rate": 0.00011031652989449004, "loss": 0.0227, "step": 765 }, { "epoch": 0.9025641025641026, "grad_norm": 0.07648850232362747, "learning_rate": 0.00010973036342321219, "loss": 0.023, "step": 770 }, { "epoch": 0.9084249084249084, "grad_norm": 0.05392804369330406, "learning_rate": 0.00010914419695193436, "loss": 0.0136, "step": 775 }, { "epoch": 0.9142857142857143, "grad_norm": 0.17311276495456696, "learning_rate": 0.00010855803048065652, "loss": 0.0257, "step": 780 }, { "epoch": 0.9201465201465201, "grad_norm": 0.07022574543952942, "learning_rate": 0.00010797186400937866, "loss": 0.0282, "step": 785 }, { "epoch": 0.926007326007326, "grad_norm": 0.15858297049999237, "learning_rate": 0.00010738569753810081, "loss": 0.0219, "step": 790 }, { "epoch": 0.9318681318681319, "grad_norm": 0.06796769052743912, "learning_rate": 0.00010679953106682298, "loss": 0.0288, "step": 795 }, { "epoch": 0.9377289377289377, "grad_norm": 0.11868051439523697, "learning_rate": 0.00010621336459554514, "loss": 0.0248, "step": 800 }, { "epoch": 0.9377289377289377, "eval_loss": 0.019950957968831062, "eval_runtime": 4.8291, "eval_samples_per_second": 5.798, "eval_steps_per_second": 2.899, "step": 800 }, { "epoch": 0.9435897435897436, "grad_norm": 0.19787561893463135, "learning_rate": 0.00010562719812426729, "loss": 0.0249, "step": 805 }, { "epoch": 0.9494505494505494, "grad_norm": 0.06437662243843079, "learning_rate": 0.00010504103165298946, "loss": 0.0141, "step": 810 }, { "epoch": 0.9553113553113554, "grad_norm": 0.09178975969552994, "learning_rate": 0.00010445486518171162, "loss": 0.0218, "step": 815 }, { "epoch": 0.9611721611721612, "grad_norm": 0.09567834436893463, "learning_rate": 0.00010386869871043376, "loss": 0.0229, "step": 820 }, { "epoch": 0.967032967032967, "grad_norm": 0.039594829082489014, "learning_rate": 0.00010328253223915591, "loss": 0.0186, "step": 825 }, { "epoch": 0.9728937728937729, "grad_norm": 0.18495650589466095, "learning_rate": 0.00010269636576787808, "loss": 0.0237, "step": 830 }, { "epoch": 0.9787545787545787, "grad_norm": 0.1861388385295868, "learning_rate": 0.00010211019929660024, "loss": 0.0367, "step": 835 }, { "epoch": 0.9846153846153847, "grad_norm": 0.05491223558783531, "learning_rate": 0.00010152403282532239, "loss": 0.015, "step": 840 }, { "epoch": 0.9904761904761905, "grad_norm": 0.04110349714756012, "learning_rate": 0.00010093786635404456, "loss": 0.0258, "step": 845 }, { "epoch": 0.9963369963369964, "grad_norm": 0.07649147510528564, "learning_rate": 0.00010035169988276672, "loss": 0.0235, "step": 850 }, { "epoch": 1.0021978021978022, "grad_norm": 0.11078579723834991, "learning_rate": 9.976553341148887e-05, "loss": 0.0204, "step": 855 }, { "epoch": 1.008058608058608, "grad_norm": 0.08302613347768784, "learning_rate": 9.917936694021102e-05, "loss": 0.0188, "step": 860 }, { "epoch": 1.0139194139194139, "grad_norm": 0.19045108556747437, "learning_rate": 9.859320046893318e-05, "loss": 0.0226, "step": 865 }, { "epoch": 1.0197802197802197, "grad_norm": 0.04657626897096634, "learning_rate": 9.800703399765534e-05, "loss": 0.0205, "step": 870 }, { "epoch": 1.0256410256410255, "grad_norm": 0.10237232595682144, "learning_rate": 9.742086752637749e-05, "loss": 0.0259, "step": 875 }, { "epoch": 1.0315018315018314, "grad_norm": 0.1746947020292282, "learning_rate": 9.683470105509965e-05, "loss": 0.0229, "step": 880 }, { "epoch": 1.0373626373626375, "grad_norm": 0.1032433807849884, "learning_rate": 9.624853458382182e-05, "loss": 0.0277, "step": 885 }, { "epoch": 1.0432234432234433, "grad_norm": 0.08428288996219635, "learning_rate": 9.566236811254397e-05, "loss": 0.0161, "step": 890 }, { "epoch": 1.0490842490842491, "grad_norm": 0.16661523282527924, "learning_rate": 9.507620164126613e-05, "loss": 0.0212, "step": 895 }, { "epoch": 1.054945054945055, "grad_norm": 0.08473166078329086, "learning_rate": 9.449003516998827e-05, "loss": 0.0285, "step": 900 }, { "epoch": 1.054945054945055, "eval_loss": 0.018958253785967827, "eval_runtime": 4.8179, "eval_samples_per_second": 5.812, "eval_steps_per_second": 2.906, "step": 900 }, { "epoch": 1.0608058608058608, "grad_norm": 0.04149739816784859, "learning_rate": 9.390386869871044e-05, "loss": 0.0138, "step": 905 }, { "epoch": 1.0666666666666667, "grad_norm": 0.05616866052150726, "learning_rate": 9.331770222743259e-05, "loss": 0.0194, "step": 910 }, { "epoch": 1.0725274725274725, "grad_norm": 0.16730394959449768, "learning_rate": 9.273153575615475e-05, "loss": 0.0359, "step": 915 }, { "epoch": 1.0783882783882783, "grad_norm": 0.06336849927902222, "learning_rate": 9.214536928487691e-05, "loss": 0.0147, "step": 920 }, { "epoch": 1.0842490842490842, "grad_norm": 0.0882687047123909, "learning_rate": 9.155920281359907e-05, "loss": 0.015, "step": 925 }, { "epoch": 1.09010989010989, "grad_norm": 0.05476200208067894, "learning_rate": 9.097303634232123e-05, "loss": 0.019, "step": 930 }, { "epoch": 1.095970695970696, "grad_norm": 0.05358652025461197, "learning_rate": 9.038686987104337e-05, "loss": 0.0174, "step": 935 }, { "epoch": 1.101831501831502, "grad_norm": 0.24095569550991058, "learning_rate": 8.980070339976554e-05, "loss": 0.0293, "step": 940 }, { "epoch": 1.1076923076923078, "grad_norm": 0.06653840094804764, "learning_rate": 8.921453692848769e-05, "loss": 0.0133, "step": 945 }, { "epoch": 1.1135531135531136, "grad_norm": 0.08839567750692368, "learning_rate": 8.862837045720985e-05, "loss": 0.0208, "step": 950 }, { "epoch": 1.1194139194139194, "grad_norm": 0.032916922122240067, "learning_rate": 8.804220398593201e-05, "loss": 0.0267, "step": 955 }, { "epoch": 1.1252747252747253, "grad_norm": 0.11819420754909515, "learning_rate": 8.745603751465417e-05, "loss": 0.0384, "step": 960 }, { "epoch": 1.1311355311355311, "grad_norm": 0.06757565587759018, "learning_rate": 8.686987104337633e-05, "loss": 0.0135, "step": 965 }, { "epoch": 1.136996336996337, "grad_norm": 0.0970228835940361, "learning_rate": 8.628370457209847e-05, "loss": 0.0166, "step": 970 }, { "epoch": 1.1428571428571428, "grad_norm": 0.13436350226402283, "learning_rate": 8.569753810082065e-05, "loss": 0.016, "step": 975 }, { "epoch": 1.1487179487179486, "grad_norm": 0.10584839433431625, "learning_rate": 8.511137162954279e-05, "loss": 0.0177, "step": 980 }, { "epoch": 1.1545787545787545, "grad_norm": 0.21206024289131165, "learning_rate": 8.452520515826495e-05, "loss": 0.0325, "step": 985 }, { "epoch": 1.1604395604395605, "grad_norm": 0.04815613850951195, "learning_rate": 8.393903868698711e-05, "loss": 0.0137, "step": 990 }, { "epoch": 1.1663003663003664, "grad_norm": 0.07466138154268265, "learning_rate": 8.335287221570927e-05, "loss": 0.0173, "step": 995 }, { "epoch": 1.1721611721611722, "grad_norm": 0.09366811066865921, "learning_rate": 8.276670574443143e-05, "loss": 0.025, "step": 1000 }, { "epoch": 1.1721611721611722, "eval_loss": 0.019907595589756966, "eval_runtime": 4.819, "eval_samples_per_second": 5.81, "eval_steps_per_second": 2.905, "step": 1000 }, { "epoch": 1.178021978021978, "grad_norm": 0.08527784794569016, "learning_rate": 8.218053927315357e-05, "loss": 0.0208, "step": 1005 }, { "epoch": 1.183882783882784, "grad_norm": 0.08328138291835785, "learning_rate": 8.159437280187575e-05, "loss": 0.0216, "step": 1010 }, { "epoch": 1.1897435897435897, "grad_norm": 0.05000188946723938, "learning_rate": 8.100820633059789e-05, "loss": 0.0211, "step": 1015 }, { "epoch": 1.1956043956043956, "grad_norm": 0.028807902708649635, "learning_rate": 8.042203985932005e-05, "loss": 0.0096, "step": 1020 }, { "epoch": 1.2014652014652014, "grad_norm": 0.20507606863975525, "learning_rate": 7.983587338804221e-05, "loss": 0.0222, "step": 1025 }, { "epoch": 1.2073260073260073, "grad_norm": 0.04885656014084816, "learning_rate": 7.924970691676437e-05, "loss": 0.0215, "step": 1030 }, { "epoch": 1.213186813186813, "grad_norm": 0.047489382326602936, "learning_rate": 7.866354044548652e-05, "loss": 0.0178, "step": 1035 }, { "epoch": 1.2190476190476192, "grad_norm": 0.05971779301762581, "learning_rate": 7.807737397420867e-05, "loss": 0.0176, "step": 1040 }, { "epoch": 1.224908424908425, "grad_norm": 0.04695736616849899, "learning_rate": 7.749120750293083e-05, "loss": 0.0148, "step": 1045 }, { "epoch": 1.2307692307692308, "grad_norm": 0.08131909370422363, "learning_rate": 7.690504103165299e-05, "loss": 0.0201, "step": 1050 }, { "epoch": 1.2366300366300367, "grad_norm": 0.06954577565193176, "learning_rate": 7.631887456037515e-05, "loss": 0.0149, "step": 1055 }, { "epoch": 1.2424908424908425, "grad_norm": 0.054430391639471054, "learning_rate": 7.57327080890973e-05, "loss": 0.0078, "step": 1060 }, { "epoch": 1.2483516483516484, "grad_norm": 0.12231959402561188, "learning_rate": 7.514654161781947e-05, "loss": 0.0258, "step": 1065 }, { "epoch": 1.2542124542124542, "grad_norm": 0.04983118548989296, "learning_rate": 7.456037514654162e-05, "loss": 0.0189, "step": 1070 }, { "epoch": 1.26007326007326, "grad_norm": 0.11981873214244843, "learning_rate": 7.397420867526378e-05, "loss": 0.0156, "step": 1075 }, { "epoch": 1.2659340659340659, "grad_norm": 0.03829724341630936, "learning_rate": 7.338804220398593e-05, "loss": 0.0162, "step": 1080 }, { "epoch": 1.2717948717948717, "grad_norm": 0.1572490632534027, "learning_rate": 7.28018757327081e-05, "loss": 0.0188, "step": 1085 }, { "epoch": 1.2776556776556776, "grad_norm": 0.122514508664608, "learning_rate": 7.221570926143025e-05, "loss": 0.0229, "step": 1090 }, { "epoch": 1.2835164835164834, "grad_norm": 0.06537042558193207, "learning_rate": 7.16295427901524e-05, "loss": 0.0222, "step": 1095 }, { "epoch": 1.2893772893772895, "grad_norm": 0.1269371509552002, "learning_rate": 7.104337631887457e-05, "loss": 0.0272, "step": 1100 }, { "epoch": 1.2893772893772895, "eval_loss": 0.015397748909890652, "eval_runtime": 4.7526, "eval_samples_per_second": 5.892, "eval_steps_per_second": 2.946, "step": 1100 }, { "epoch": 1.2952380952380953, "grad_norm": 0.07507819682359695, "learning_rate": 7.045720984759672e-05, "loss": 0.0256, "step": 1105 }, { "epoch": 1.3010989010989011, "grad_norm": 0.04198193550109863, "learning_rate": 6.987104337631888e-05, "loss": 0.0102, "step": 1110 }, { "epoch": 1.306959706959707, "grad_norm": 0.053751297295093536, "learning_rate": 6.928487690504104e-05, "loss": 0.0141, "step": 1115 }, { "epoch": 1.3128205128205128, "grad_norm": 0.12076237797737122, "learning_rate": 6.86987104337632e-05, "loss": 0.0165, "step": 1120 }, { "epoch": 1.3186813186813187, "grad_norm": 0.0769004300236702, "learning_rate": 6.811254396248535e-05, "loss": 0.0191, "step": 1125 }, { "epoch": 1.3245421245421245, "grad_norm": 0.08021704852581024, "learning_rate": 6.75263774912075e-05, "loss": 0.0253, "step": 1130 }, { "epoch": 1.3304029304029303, "grad_norm": 0.09786754846572876, "learning_rate": 6.694021101992967e-05, "loss": 0.0191, "step": 1135 }, { "epoch": 1.3362637362637364, "grad_norm": 0.06878714263439178, "learning_rate": 6.635404454865182e-05, "loss": 0.0326, "step": 1140 }, { "epoch": 1.3421245421245422, "grad_norm": 0.11297193914651871, "learning_rate": 6.576787807737398e-05, "loss": 0.0185, "step": 1145 }, { "epoch": 1.347985347985348, "grad_norm": 0.10731342434883118, "learning_rate": 6.518171160609614e-05, "loss": 0.0168, "step": 1150 }, { "epoch": 1.353846153846154, "grad_norm": 0.08888328820466995, "learning_rate": 6.45955451348183e-05, "loss": 0.0182, "step": 1155 }, { "epoch": 1.3597069597069598, "grad_norm": 0.1666301190853119, "learning_rate": 6.400937866354045e-05, "loss": 0.0254, "step": 1160 }, { "epoch": 1.3655677655677656, "grad_norm": 0.1334419697523117, "learning_rate": 6.34232121922626e-05, "loss": 0.0215, "step": 1165 }, { "epoch": 1.3714285714285714, "grad_norm": 0.05616243556141853, "learning_rate": 6.283704572098477e-05, "loss": 0.0156, "step": 1170 }, { "epoch": 1.3772893772893773, "grad_norm": 0.1660885214805603, "learning_rate": 6.225087924970692e-05, "loss": 0.0241, "step": 1175 }, { "epoch": 1.3831501831501831, "grad_norm": 0.09245380759239197, "learning_rate": 6.166471277842908e-05, "loss": 0.0222, "step": 1180 }, { "epoch": 1.389010989010989, "grad_norm": 0.08635041117668152, "learning_rate": 6.107854630715122e-05, "loss": 0.0203, "step": 1185 }, { "epoch": 1.3948717948717948, "grad_norm": 0.07752135396003723, "learning_rate": 6.049237983587339e-05, "loss": 0.0216, "step": 1190 }, { "epoch": 1.4007326007326006, "grad_norm": 0.10603225976228714, "learning_rate": 5.990621336459554e-05, "loss": 0.0205, "step": 1195 }, { "epoch": 1.4065934065934065, "grad_norm": 0.04343140870332718, "learning_rate": 5.932004689331771e-05, "loss": 0.0105, "step": 1200 }, { "epoch": 1.4065934065934065, "eval_loss": 0.015952473506331444, "eval_runtime": 4.7686, "eval_samples_per_second": 5.872, "eval_steps_per_second": 2.936, "step": 1200 }, { "epoch": 1.4124542124542123, "grad_norm": 0.12541887164115906, "learning_rate": 5.873388042203987e-05, "loss": 0.0217, "step": 1205 }, { "epoch": 1.4183150183150184, "grad_norm": 0.05417335778474808, "learning_rate": 5.814771395076202e-05, "loss": 0.0205, "step": 1210 }, { "epoch": 1.4241758241758242, "grad_norm": 0.04461506009101868, "learning_rate": 5.756154747948418e-05, "loss": 0.0211, "step": 1215 }, { "epoch": 1.43003663003663, "grad_norm": 0.09423286467790604, "learning_rate": 5.697538100820633e-05, "loss": 0.0238, "step": 1220 }, { "epoch": 1.435897435897436, "grad_norm": 0.050094570964574814, "learning_rate": 5.638921453692849e-05, "loss": 0.0163, "step": 1225 }, { "epoch": 1.4417582417582417, "grad_norm": 0.13104532659053802, "learning_rate": 5.580304806565064e-05, "loss": 0.0219, "step": 1230 }, { "epoch": 1.4476190476190476, "grad_norm": 0.08238503336906433, "learning_rate": 5.52168815943728e-05, "loss": 0.0126, "step": 1235 }, { "epoch": 1.4534798534798534, "grad_norm": 0.1029452383518219, "learning_rate": 5.463071512309497e-05, "loss": 0.0247, "step": 1240 }, { "epoch": 1.4593406593406593, "grad_norm": 0.05564792454242706, "learning_rate": 5.404454865181712e-05, "loss": 0.0212, "step": 1245 }, { "epoch": 1.4652014652014653, "grad_norm": 0.08589282631874084, "learning_rate": 5.345838218053928e-05, "loss": 0.0184, "step": 1250 }, { "epoch": 1.4710622710622712, "grad_norm": 0.15644195675849915, "learning_rate": 5.287221570926143e-05, "loss": 0.0165, "step": 1255 }, { "epoch": 1.476923076923077, "grad_norm": 0.11301274597644806, "learning_rate": 5.228604923798359e-05, "loss": 0.0322, "step": 1260 }, { "epoch": 1.4827838827838828, "grad_norm": 0.044553741812705994, "learning_rate": 5.1699882766705743e-05, "loss": 0.0183, "step": 1265 }, { "epoch": 1.4886446886446887, "grad_norm": 0.06141185760498047, "learning_rate": 5.11137162954279e-05, "loss": 0.0102, "step": 1270 }, { "epoch": 1.4945054945054945, "grad_norm": 0.08107537031173706, "learning_rate": 5.0527549824150055e-05, "loss": 0.0215, "step": 1275 }, { "epoch": 1.5003663003663004, "grad_norm": 0.06017793342471123, "learning_rate": 4.9941383352872214e-05, "loss": 0.0176, "step": 1280 }, { "epoch": 1.5062271062271062, "grad_norm": 0.051033902913331985, "learning_rate": 4.9355216881594373e-05, "loss": 0.0149, "step": 1285 }, { "epoch": 1.512087912087912, "grad_norm": 0.124452143907547, "learning_rate": 4.876905041031653e-05, "loss": 0.0209, "step": 1290 }, { "epoch": 1.5179487179487179, "grad_norm": 0.1616523712873459, "learning_rate": 4.8182883939038685e-05, "loss": 0.0192, "step": 1295 }, { "epoch": 1.5238095238095237, "grad_norm": 0.07067764550447464, "learning_rate": 4.759671746776085e-05, "loss": 0.0279, "step": 1300 }, { "epoch": 1.5238095238095237, "eval_loss": 0.017229218035936356, "eval_runtime": 4.6702, "eval_samples_per_second": 5.996, "eval_steps_per_second": 2.998, "step": 1300 }, { "epoch": 1.5296703296703296, "grad_norm": 0.06332267820835114, "learning_rate": 4.7010550996483003e-05, "loss": 0.0253, "step": 1305 }, { "epoch": 1.5355311355311354, "grad_norm": 0.08032066375017166, "learning_rate": 4.642438452520516e-05, "loss": 0.0128, "step": 1310 }, { "epoch": 1.5413919413919412, "grad_norm": 0.11456907540559769, "learning_rate": 4.5838218053927315e-05, "loss": 0.0127, "step": 1315 }, { "epoch": 1.5472527472527473, "grad_norm": 0.19653138518333435, "learning_rate": 4.5252051582649474e-05, "loss": 0.0236, "step": 1320 }, { "epoch": 1.5531135531135531, "grad_norm": 0.08195839822292328, "learning_rate": 4.4665885111371633e-05, "loss": 0.0173, "step": 1325 }, { "epoch": 1.558974358974359, "grad_norm": 0.11376089602708817, "learning_rate": 4.4079718640093786e-05, "loss": 0.0115, "step": 1330 }, { "epoch": 1.5648351648351648, "grad_norm": 0.055264201015233994, "learning_rate": 4.3493552168815945e-05, "loss": 0.0198, "step": 1335 }, { "epoch": 1.5706959706959707, "grad_norm": 0.13038881123065948, "learning_rate": 4.2907385697538104e-05, "loss": 0.0231, "step": 1340 }, { "epoch": 1.5765567765567765, "grad_norm": 0.0317939892411232, "learning_rate": 4.2321219226260263e-05, "loss": 0.0083, "step": 1345 }, { "epoch": 1.5824175824175826, "grad_norm": 0.151336207985878, "learning_rate": 4.1735052754982416e-05, "loss": 0.0225, "step": 1350 }, { "epoch": 1.5882783882783884, "grad_norm": 0.07817093282938004, "learning_rate": 4.1148886283704575e-05, "loss": 0.0226, "step": 1355 }, { "epoch": 1.5941391941391942, "grad_norm": 0.1341279149055481, "learning_rate": 4.056271981242673e-05, "loss": 0.0263, "step": 1360 }, { "epoch": 1.6, "grad_norm": 0.06353727728128433, "learning_rate": 3.997655334114889e-05, "loss": 0.0198, "step": 1365 }, { "epoch": 1.605860805860806, "grad_norm": 0.11177901178598404, "learning_rate": 3.9390386869871046e-05, "loss": 0.0172, "step": 1370 }, { "epoch": 1.6117216117216118, "grad_norm": 0.047024596482515335, "learning_rate": 3.88042203985932e-05, "loss": 0.0207, "step": 1375 }, { "epoch": 1.6175824175824176, "grad_norm": 0.04343528300523758, "learning_rate": 3.8218053927315364e-05, "loss": 0.0214, "step": 1380 }, { "epoch": 1.6234432234432234, "grad_norm": 0.08330193161964417, "learning_rate": 3.763188745603752e-05, "loss": 0.0286, "step": 1385 }, { "epoch": 1.6293040293040293, "grad_norm": 0.0811009556055069, "learning_rate": 3.7045720984759676e-05, "loss": 0.0148, "step": 1390 }, { "epoch": 1.6351648351648351, "grad_norm": 0.1049441322684288, "learning_rate": 3.645955451348183e-05, "loss": 0.0184, "step": 1395 }, { "epoch": 1.641025641025641, "grad_norm": 0.11944428086280823, "learning_rate": 3.587338804220399e-05, "loss": 0.0122, "step": 1400 }, { "epoch": 1.641025641025641, "eval_loss": 0.017561230808496475, "eval_runtime": 4.685, "eval_samples_per_second": 5.977, "eval_steps_per_second": 2.988, "step": 1400 }, { "epoch": 1.6468864468864468, "grad_norm": 0.14023366570472717, "learning_rate": 3.528722157092615e-05, "loss": 0.0178, "step": 1405 }, { "epoch": 1.6527472527472526, "grad_norm": 0.14057691395282745, "learning_rate": 3.47010550996483e-05, "loss": 0.0268, "step": 1410 }, { "epoch": 1.6586080586080585, "grad_norm": 0.1253061443567276, "learning_rate": 3.411488862837046e-05, "loss": 0.0266, "step": 1415 }, { "epoch": 1.6644688644688643, "grad_norm": 0.03431854769587517, "learning_rate": 3.352872215709262e-05, "loss": 0.02, "step": 1420 }, { "epoch": 1.6703296703296702, "grad_norm": 0.13929079473018646, "learning_rate": 3.294255568581478e-05, "loss": 0.0226, "step": 1425 }, { "epoch": 1.6761904761904762, "grad_norm": 0.06429693102836609, "learning_rate": 3.235638921453693e-05, "loss": 0.0225, "step": 1430 }, { "epoch": 1.682051282051282, "grad_norm": 0.029311953112483025, "learning_rate": 3.177022274325909e-05, "loss": 0.0161, "step": 1435 }, { "epoch": 1.687912087912088, "grad_norm": 0.04346455633640289, "learning_rate": 3.118405627198124e-05, "loss": 0.0155, "step": 1440 }, { "epoch": 1.6937728937728938, "grad_norm": 0.09009824693202972, "learning_rate": 3.05978898007034e-05, "loss": 0.0153, "step": 1445 }, { "epoch": 1.6996336996336996, "grad_norm": 0.071926549077034, "learning_rate": 3.0011723329425556e-05, "loss": 0.0136, "step": 1450 }, { "epoch": 1.7054945054945057, "grad_norm": 0.06461833417415619, "learning_rate": 2.9425556858147718e-05, "loss": 0.0237, "step": 1455 }, { "epoch": 1.7113553113553115, "grad_norm": 0.039929524064064026, "learning_rate": 2.8839390386869874e-05, "loss": 0.0187, "step": 1460 }, { "epoch": 1.7172161172161173, "grad_norm": 0.0534372515976429, "learning_rate": 2.825322391559203e-05, "loss": 0.0192, "step": 1465 }, { "epoch": 1.7230769230769232, "grad_norm": 0.1467376947402954, "learning_rate": 2.7667057444314186e-05, "loss": 0.0203, "step": 1470 }, { "epoch": 1.728937728937729, "grad_norm": 0.0830003172159195, "learning_rate": 2.7080890973036345e-05, "loss": 0.0188, "step": 1475 }, { "epoch": 1.7347985347985349, "grad_norm": 0.07220768928527832, "learning_rate": 2.64947245017585e-05, "loss": 0.0118, "step": 1480 }, { "epoch": 1.7406593406593407, "grad_norm": 0.0751115009188652, "learning_rate": 2.5908558030480656e-05, "loss": 0.0156, "step": 1485 }, { "epoch": 1.7465201465201465, "grad_norm": 0.07690921425819397, "learning_rate": 2.5322391559202812e-05, "loss": 0.0347, "step": 1490 }, { "epoch": 1.7523809523809524, "grad_norm": 0.05416159704327583, "learning_rate": 2.473622508792497e-05, "loss": 0.0167, "step": 1495 }, { "epoch": 1.7582417582417582, "grad_norm": 0.0676250010728836, "learning_rate": 2.4150058616647127e-05, "loss": 0.0205, "step": 1500 }, { "epoch": 1.7582417582417582, "eval_loss": 0.019522378221154213, "eval_runtime": 4.6942, "eval_samples_per_second": 5.965, "eval_steps_per_second": 2.982, "step": 1500 }, { "epoch": 1.764102564102564, "grad_norm": 0.08909754455089569, "learning_rate": 2.3563892145369286e-05, "loss": 0.0249, "step": 1505 }, { "epoch": 1.76996336996337, "grad_norm": 0.042161233723163605, "learning_rate": 2.2977725674091442e-05, "loss": 0.0117, "step": 1510 }, { "epoch": 1.7758241758241757, "grad_norm": 0.07136218249797821, "learning_rate": 2.23915592028136e-05, "loss": 0.0212, "step": 1515 }, { "epoch": 1.7816849816849816, "grad_norm": 0.14128735661506653, "learning_rate": 2.1805392731535757e-05, "loss": 0.0189, "step": 1520 }, { "epoch": 1.7875457875457874, "grad_norm": 0.05959760770201683, "learning_rate": 2.1219226260257916e-05, "loss": 0.0119, "step": 1525 }, { "epoch": 1.7934065934065933, "grad_norm": 0.038479190319776535, "learning_rate": 2.0633059788980072e-05, "loss": 0.013, "step": 1530 }, { "epoch": 1.7992673992673993, "grad_norm": 0.09512809664011002, "learning_rate": 2.0046893317702228e-05, "loss": 0.0148, "step": 1535 }, { "epoch": 1.8051282051282052, "grad_norm": 0.14848454296588898, "learning_rate": 1.9460726846424384e-05, "loss": 0.019, "step": 1540 }, { "epoch": 1.810989010989011, "grad_norm": 0.10240516811609268, "learning_rate": 1.8874560375146543e-05, "loss": 0.017, "step": 1545 }, { "epoch": 1.8168498168498168, "grad_norm": 0.09345954656600952, "learning_rate": 1.82883939038687e-05, "loss": 0.0237, "step": 1550 }, { "epoch": 1.8227106227106227, "grad_norm": 0.03815275430679321, "learning_rate": 1.7702227432590858e-05, "loss": 0.0188, "step": 1555 }, { "epoch": 1.8285714285714287, "grad_norm": 0.027827398851513863, "learning_rate": 1.7116060961313014e-05, "loss": 0.0183, "step": 1560 }, { "epoch": 1.8344322344322346, "grad_norm": 0.08811303228139877, "learning_rate": 1.6529894490035173e-05, "loss": 0.0191, "step": 1565 }, { "epoch": 1.8402930402930404, "grad_norm": 0.03119056299328804, "learning_rate": 1.594372801875733e-05, "loss": 0.0183, "step": 1570 }, { "epoch": 1.8461538461538463, "grad_norm": 0.09752997010946274, "learning_rate": 1.5357561547479485e-05, "loss": 0.0161, "step": 1575 }, { "epoch": 1.852014652014652, "grad_norm": 0.0855243131518364, "learning_rate": 1.477139507620164e-05, "loss": 0.015, "step": 1580 }, { "epoch": 1.857875457875458, "grad_norm": 0.08388842642307281, "learning_rate": 1.41852286049238e-05, "loss": 0.0148, "step": 1585 }, { "epoch": 1.8637362637362638, "grad_norm": 0.10147551447153091, "learning_rate": 1.3599062133645957e-05, "loss": 0.0154, "step": 1590 }, { "epoch": 1.8695970695970696, "grad_norm": 0.0457012839615345, "learning_rate": 1.3012895662368113e-05, "loss": 0.0186, "step": 1595 }, { "epoch": 1.8754578754578755, "grad_norm": 0.03654688224196434, "learning_rate": 1.242672919109027e-05, "loss": 0.0321, "step": 1600 }, { "epoch": 1.8754578754578755, "eval_loss": 0.017443044111132622, "eval_runtime": 4.7108, "eval_samples_per_second": 5.944, "eval_steps_per_second": 2.972, "step": 1600 }, { "epoch": 1.8813186813186813, "grad_norm": 0.07887323200702667, "learning_rate": 1.1840562719812428e-05, "loss": 0.0142, "step": 1605 }, { "epoch": 1.8871794871794871, "grad_norm": 0.11328335106372833, "learning_rate": 1.1254396248534585e-05, "loss": 0.025, "step": 1610 }, { "epoch": 1.893040293040293, "grad_norm": 0.09318089485168457, "learning_rate": 1.0668229777256741e-05, "loss": 0.0204, "step": 1615 }, { "epoch": 1.8989010989010988, "grad_norm": 0.06992164254188538, "learning_rate": 1.0082063305978899e-05, "loss": 0.0135, "step": 1620 }, { "epoch": 1.9047619047619047, "grad_norm": 0.14927181601524353, "learning_rate": 9.495896834701056e-06, "loss": 0.0245, "step": 1625 }, { "epoch": 1.9106227106227105, "grad_norm": 0.11408836394548416, "learning_rate": 8.909730363423214e-06, "loss": 0.0161, "step": 1630 }, { "epoch": 1.9164835164835163, "grad_norm": 0.06911155581474304, "learning_rate": 8.32356389214537e-06, "loss": 0.0154, "step": 1635 }, { "epoch": 1.9223443223443224, "grad_norm": 0.11621779948472977, "learning_rate": 7.737397420867527e-06, "loss": 0.0144, "step": 1640 }, { "epoch": 1.9282051282051282, "grad_norm": 0.046058397740125656, "learning_rate": 7.151230949589684e-06, "loss": 0.0093, "step": 1645 }, { "epoch": 1.934065934065934, "grad_norm": 0.11228576302528381, "learning_rate": 6.565064478311841e-06, "loss": 0.022, "step": 1650 }, { "epoch": 1.93992673992674, "grad_norm": 0.1315338909626007, "learning_rate": 5.978898007033998e-06, "loss": 0.0193, "step": 1655 }, { "epoch": 1.9457875457875458, "grad_norm": 0.040056392550468445, "learning_rate": 5.3927315357561546e-06, "loss": 0.0132, "step": 1660 }, { "epoch": 1.9516483516483516, "grad_norm": 0.10741738229990005, "learning_rate": 4.806565064478312e-06, "loss": 0.0352, "step": 1665 }, { "epoch": 1.9575091575091577, "grad_norm": 0.059029560536146164, "learning_rate": 4.220398593200469e-06, "loss": 0.019, "step": 1670 }, { "epoch": 1.9633699633699635, "grad_norm": 0.06789711117744446, "learning_rate": 3.6342321219226262e-06, "loss": 0.0189, "step": 1675 }, { "epoch": 1.9692307692307693, "grad_norm": 0.02918117679655552, "learning_rate": 3.0480656506447833e-06, "loss": 0.0259, "step": 1680 }, { "epoch": 1.9750915750915752, "grad_norm": 0.08073403686285019, "learning_rate": 2.4618991793669404e-06, "loss": 0.0286, "step": 1685 }, { "epoch": 1.980952380952381, "grad_norm": 0.1617717295885086, "learning_rate": 1.8757327080890972e-06, "loss": 0.0191, "step": 1690 }, { "epoch": 1.9868131868131869, "grad_norm": 0.06613462418317795, "learning_rate": 1.2895662368112545e-06, "loss": 0.0128, "step": 1695 }, { "epoch": 1.9926739926739927, "grad_norm": 0.08398256450891495, "learning_rate": 7.033997655334116e-07, "loss": 0.0118, "step": 1700 }, { "epoch": 1.9926739926739927, "eval_loss": 0.018166696652770042, "eval_runtime": 4.6959, "eval_samples_per_second": 5.963, "eval_steps_per_second": 2.981, "step": 1700 }, { "epoch": 1.9985347985347985, "grad_norm": 0.0773661658167839, "learning_rate": 1.1723329425556858e-07, "loss": 0.0146, "step": 1705 } ], "logging_steps": 5, "max_steps": 1706, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.764499308335456e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }