{ "best_metric": 11.208358764648438, "best_model_checkpoint": "./FT_models/[LDH]0219_all_llama31_docs_nodocs/checkpoint-1000", "epoch": 2.716904276985743, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009051821679112922, "grad_norm": 0.874797523021698, "learning_rate": 0.0001999995055317446, "loss": 1.9293, "step": 10 }, { "epoch": 0.018103643358225844, "grad_norm": 0.5239140391349792, "learning_rate": 0.0001999955498150411, "loss": 1.1773, "step": 20 }, { "epoch": 0.027155465037338764, "grad_norm": 0.49293628334999084, "learning_rate": 0.00019998763853811184, "loss": 1.1369, "step": 30 }, { "epoch": 0.03620728671645169, "grad_norm": 0.4514484405517578, "learning_rate": 0.00019997577201390606, "loss": 1.1049, "step": 40 }, { "epoch": 0.04525910839556461, "grad_norm": 0.480444073677063, "learning_rate": 0.0001999599507118322, "loss": 1.0472, "step": 50 }, { "epoch": 0.05431093007467753, "grad_norm": 0.6258535385131836, "learning_rate": 0.00019994017525773913, "loss": 1.0656, "step": 60 }, { "epoch": 0.06336275175379046, "grad_norm": 0.5886121392250061, "learning_rate": 0.0001999164464338918, "loss": 0.9311, "step": 70 }, { "epoch": 0.07241457343290338, "grad_norm": 0.6107162833213806, "learning_rate": 0.0001998887651789398, "loss": 1.0476, "step": 80 }, { "epoch": 0.0814663951120163, "grad_norm": 0.5811272859573364, "learning_rate": 0.0001998571325878806, "loss": 0.9568, "step": 90 }, { "epoch": 0.09051821679112922, "grad_norm": 0.6300623416900635, "learning_rate": 0.00019982154991201608, "loss": 0.9121, "step": 100 }, { "epoch": 0.09957003847024214, "grad_norm": 0.6835796236991882, "learning_rate": 0.00019978201855890308, "loss": 0.8682, "step": 110 }, { "epoch": 0.10862186014935506, "grad_norm": 0.5953958034515381, "learning_rate": 0.00019973854009229763, "loss": 0.9708, "step": 120 }, { "epoch": 0.11767368182846798, "grad_norm": 0.6399095058441162, "learning_rate": 0.00019969111623209323, "loss": 0.9063, "step": 130 }, { "epoch": 0.1267255035075809, "grad_norm": 0.5774116516113281, "learning_rate": 0.00019963974885425266, "loss": 0.9571, "step": 140 }, { "epoch": 0.13577732518669383, "grad_norm": 0.6394885182380676, "learning_rate": 0.00019958443999073397, "loss": 0.9187, "step": 150 }, { "epoch": 0.14482914686580675, "grad_norm": 0.6276863813400269, "learning_rate": 0.00019952519182940993, "loss": 0.9531, "step": 160 }, { "epoch": 0.15388096854491967, "grad_norm": 0.6715622544288635, "learning_rate": 0.0001994620067139815, "loss": 0.8558, "step": 170 }, { "epoch": 0.1629327902240326, "grad_norm": 0.7086506485939026, "learning_rate": 0.00019939488714388524, "loss": 0.8611, "step": 180 }, { "epoch": 0.1719846119031455, "grad_norm": 0.614281177520752, "learning_rate": 0.00019932383577419432, "loss": 0.7985, "step": 190 }, { "epoch": 0.18103643358225843, "grad_norm": 0.6501333117485046, "learning_rate": 0.0001992488554155135, "loss": 0.8663, "step": 200 }, { "epoch": 0.19008825526137135, "grad_norm": 0.59181809425354, "learning_rate": 0.0001991699490338681, "loss": 0.7611, "step": 210 }, { "epoch": 0.19914007694048427, "grad_norm": 0.6414404511451721, "learning_rate": 0.00019908711975058637, "loss": 0.7952, "step": 220 }, { "epoch": 0.2081918986195972, "grad_norm": 0.5972408056259155, "learning_rate": 0.00019900037084217637, "loss": 0.7415, "step": 230 }, { "epoch": 0.2172437202987101, "grad_norm": 0.6564370393753052, "learning_rate": 0.00019890970574019617, "loss": 0.7951, "step": 240 }, { "epoch": 0.22629554197782303, "grad_norm": 0.7463496327400208, "learning_rate": 0.00019881512803111796, "loss": 0.7421, "step": 250 }, { "epoch": 0.23534736365693595, "grad_norm": 0.6890644431114197, "learning_rate": 0.00019871664145618657, "loss": 0.7929, "step": 260 }, { "epoch": 0.24439918533604887, "grad_norm": 0.5937248468399048, "learning_rate": 0.00019861424991127115, "loss": 0.7801, "step": 270 }, { "epoch": 0.2534510070151618, "grad_norm": 0.7962441444396973, "learning_rate": 0.00019850795744671116, "loss": 0.7889, "step": 280 }, { "epoch": 0.2625028286942747, "grad_norm": 0.5598044991493225, "learning_rate": 0.00019839776826715614, "loss": 0.773, "step": 290 }, { "epoch": 0.27155465037338766, "grad_norm": 0.743926465511322, "learning_rate": 0.00019828368673139947, "loss": 0.7739, "step": 300 }, { "epoch": 0.28060647205250056, "grad_norm": 0.5907182693481445, "learning_rate": 0.00019816571735220583, "loss": 0.7472, "step": 310 }, { "epoch": 0.2896582937316135, "grad_norm": 0.7143293619155884, "learning_rate": 0.0001980438647961327, "loss": 0.723, "step": 320 }, { "epoch": 0.2987101154107264, "grad_norm": 0.6375603079795837, "learning_rate": 0.00019791813388334581, "loss": 0.7454, "step": 330 }, { "epoch": 0.30776193708983934, "grad_norm": 0.7762064933776855, "learning_rate": 0.00019778852958742853, "loss": 0.6882, "step": 340 }, { "epoch": 0.31681375876895224, "grad_norm": 0.5525858998298645, "learning_rate": 0.00019765505703518496, "loss": 0.6642, "step": 350 }, { "epoch": 0.3258655804480652, "grad_norm": 0.6317685842514038, "learning_rate": 0.00019751772150643722, "loss": 0.6965, "step": 360 }, { "epoch": 0.3349174021271781, "grad_norm": 0.664470374584198, "learning_rate": 0.0001973765284338167, "loss": 0.7218, "step": 370 }, { "epoch": 0.343969223806291, "grad_norm": 0.587965190410614, "learning_rate": 0.00019723148340254892, "loss": 0.7628, "step": 380 }, { "epoch": 0.3530210454854039, "grad_norm": 0.6755478978157043, "learning_rate": 0.0001970825921502328, "loss": 0.6277, "step": 390 }, { "epoch": 0.36207286716451687, "grad_norm": 0.6129139065742493, "learning_rate": 0.00019692986056661356, "loss": 0.7238, "step": 400 }, { "epoch": 0.37112468884362976, "grad_norm": 0.6757403612136841, "learning_rate": 0.0001967732946933499, "loss": 0.6568, "step": 410 }, { "epoch": 0.3801765105227427, "grad_norm": 0.7164187431335449, "learning_rate": 0.00019661290072377482, "loss": 0.6963, "step": 420 }, { "epoch": 0.3892283322018556, "grad_norm": 0.6669524908065796, "learning_rate": 0.0001964486850026507, "loss": 0.6931, "step": 430 }, { "epoch": 0.39828015388096855, "grad_norm": 0.6036201119422913, "learning_rate": 0.00019628065402591845, "loss": 0.6989, "step": 440 }, { "epoch": 0.4073319755600815, "grad_norm": 0.6022247076034546, "learning_rate": 0.0001961088144404403, "loss": 0.7669, "step": 450 }, { "epoch": 0.4163837972391944, "grad_norm": 0.6327004432678223, "learning_rate": 0.00019593317304373705, "loss": 0.6673, "step": 460 }, { "epoch": 0.42543561891830733, "grad_norm": 0.6826880574226379, "learning_rate": 0.00019575373678371909, "loss": 0.6965, "step": 470 }, { "epoch": 0.4344874405974202, "grad_norm": 0.6268287897109985, "learning_rate": 0.0001955705127584117, "loss": 0.6468, "step": 480 }, { "epoch": 0.4435392622765332, "grad_norm": 0.6102762818336487, "learning_rate": 0.00019538350821567404, "loss": 0.6302, "step": 490 }, { "epoch": 0.45259108395564607, "grad_norm": 0.672012448310852, "learning_rate": 0.00019519273055291266, "loss": 0.6529, "step": 500 }, { "epoch": 0.45259108395564607, "eval_loss": 11.249969482421875, "eval_runtime": 42.473, "eval_samples_per_second": 7.464, "eval_steps_per_second": 3.744, "step": 500 }, { "epoch": 0.461642905634759, "grad_norm": 0.6281114220619202, "learning_rate": 0.00019499818731678873, "loss": 0.705, "step": 510 }, { "epoch": 0.4706947273138719, "grad_norm": 0.6248394250869751, "learning_rate": 0.00019479988620291956, "loss": 0.6999, "step": 520 }, { "epoch": 0.47974654899298486, "grad_norm": 0.5792989134788513, "learning_rate": 0.00019459783505557424, "loss": 0.6166, "step": 530 }, { "epoch": 0.48879837067209775, "grad_norm": 0.6465005278587341, "learning_rate": 0.0001943920418673633, "loss": 0.5985, "step": 540 }, { "epoch": 0.4978501923512107, "grad_norm": 0.7002309560775757, "learning_rate": 0.0001941825147789225, "loss": 0.7327, "step": 550 }, { "epoch": 0.5069020140303236, "grad_norm": 0.6550771594047546, "learning_rate": 0.00019396926207859084, "loss": 0.6097, "step": 560 }, { "epoch": 0.5159538357094365, "grad_norm": 0.6645638942718506, "learning_rate": 0.00019375229220208276, "loss": 0.6032, "step": 570 }, { "epoch": 0.5250056573885494, "grad_norm": 0.6332526803016663, "learning_rate": 0.0001935316137321543, "loss": 0.647, "step": 580 }, { "epoch": 0.5340574790676623, "grad_norm": 0.6539055705070496, "learning_rate": 0.00019330723539826375, "loss": 0.5964, "step": 590 }, { "epoch": 0.5431093007467753, "grad_norm": 0.616939902305603, "learning_rate": 0.0001930791660762262, "loss": 0.6331, "step": 600 }, { "epoch": 0.5521611224258882, "grad_norm": 0.6726529598236084, "learning_rate": 0.0001928474147878626, "loss": 0.6395, "step": 610 }, { "epoch": 0.5612129441050011, "grad_norm": 0.6520217061042786, "learning_rate": 0.0001926119907006426, "loss": 0.5835, "step": 620 }, { "epoch": 0.570264765784114, "grad_norm": 0.6048600077629089, "learning_rate": 0.00019237290312732226, "loss": 0.5824, "step": 630 }, { "epoch": 0.579316587463227, "grad_norm": 0.6220108270645142, "learning_rate": 0.0001921301615255754, "loss": 0.638, "step": 640 }, { "epoch": 0.5883684091423399, "grad_norm": 0.6199471354484558, "learning_rate": 0.00019188377549761963, "loss": 0.591, "step": 650 }, { "epoch": 0.5974202308214528, "grad_norm": 0.6387749910354614, "learning_rate": 0.00019163375478983632, "loss": 0.5934, "step": 660 }, { "epoch": 0.6064720525005657, "grad_norm": 0.6121918559074402, "learning_rate": 0.00019138010929238534, "loss": 0.5164, "step": 670 }, { "epoch": 0.6155238741796787, "grad_norm": 0.5336770415306091, "learning_rate": 0.0001911228490388136, "loss": 0.5644, "step": 680 }, { "epoch": 0.6245756958587916, "grad_norm": 0.6676989793777466, "learning_rate": 0.00019086198420565823, "loss": 0.6162, "step": 690 }, { "epoch": 0.6336275175379045, "grad_norm": 0.7787268161773682, "learning_rate": 0.000190597525112044, "loss": 0.5458, "step": 700 }, { "epoch": 0.6426793392170175, "grad_norm": 0.5786814093589783, "learning_rate": 0.00019032948221927524, "loss": 0.6159, "step": 710 }, { "epoch": 0.6517311608961304, "grad_norm": 0.5182745456695557, "learning_rate": 0.00019005786613042185, "loss": 0.5528, "step": 720 }, { "epoch": 0.6607829825752433, "grad_norm": 0.6106360554695129, "learning_rate": 0.00018978268758989991, "loss": 0.5792, "step": 730 }, { "epoch": 0.6698348042543562, "grad_norm": 0.6614301800727844, "learning_rate": 0.00018950395748304678, "loss": 0.5251, "step": 740 }, { "epoch": 0.6788866259334692, "grad_norm": 0.4879720211029053, "learning_rate": 0.0001892216868356904, "loss": 0.5596, "step": 750 }, { "epoch": 0.687938447612582, "grad_norm": 0.7061878442764282, "learning_rate": 0.00018893588681371303, "loss": 0.5517, "step": 760 }, { "epoch": 0.6969902692916949, "grad_norm": 0.7787359952926636, "learning_rate": 0.00018864656872260985, "loss": 0.5813, "step": 770 }, { "epoch": 0.7060420909708078, "grad_norm": 0.5743793845176697, "learning_rate": 0.00018835374400704154, "loss": 0.56, "step": 780 }, { "epoch": 0.7150939126499208, "grad_norm": 0.5919637680053711, "learning_rate": 0.00018805742425038145, "loss": 0.5429, "step": 790 }, { "epoch": 0.7241457343290337, "grad_norm": 0.5750169157981873, "learning_rate": 0.00018775762117425777, "loss": 0.5303, "step": 800 }, { "epoch": 0.7331975560081466, "grad_norm": 0.5034388899803162, "learning_rate": 0.00018745434663808942, "loss": 0.554, "step": 810 }, { "epoch": 0.7422493776872595, "grad_norm": 0.6673147678375244, "learning_rate": 0.00018714761263861728, "loss": 0.5769, "step": 820 }, { "epoch": 0.7513011993663725, "grad_norm": 0.6599574089050293, "learning_rate": 0.00018683743130942928, "loss": 0.5316, "step": 830 }, { "epoch": 0.7603530210454854, "grad_norm": 0.6396362781524658, "learning_rate": 0.00018652381492048083, "loss": 0.5061, "step": 840 }, { "epoch": 0.7694048427245983, "grad_norm": 0.5517466068267822, "learning_rate": 0.00018620677587760916, "loss": 0.5951, "step": 850 }, { "epoch": 0.7784566644037112, "grad_norm": 0.5412144064903259, "learning_rate": 0.00018588632672204264, "loss": 0.4929, "step": 860 }, { "epoch": 0.7875084860828242, "grad_norm": 0.5431721210479736, "learning_rate": 0.00018556248012990468, "loss": 0.5358, "step": 870 }, { "epoch": 0.7965603077619371, "grad_norm": 0.5506939888000488, "learning_rate": 0.0001852352489117124, "loss": 0.4487, "step": 880 }, { "epoch": 0.80561212944105, "grad_norm": 0.5743805766105652, "learning_rate": 0.0001849046460118698, "loss": 0.5725, "step": 890 }, { "epoch": 0.814663951120163, "grad_norm": 0.573427677154541, "learning_rate": 0.00018457068450815562, "loss": 0.5078, "step": 900 }, { "epoch": 0.8237157727992759, "grad_norm": 0.6854224801063538, "learning_rate": 0.00018423337761120618, "loss": 0.4688, "step": 910 }, { "epoch": 0.8327675944783888, "grad_norm": 0.6372252702713013, "learning_rate": 0.00018389273866399275, "loss": 0.5169, "step": 920 }, { "epoch": 0.8418194161575017, "grad_norm": 0.6130049228668213, "learning_rate": 0.00018354878114129367, "loss": 0.5089, "step": 930 }, { "epoch": 0.8508712378366147, "grad_norm": 0.5654692649841309, "learning_rate": 0.00018320151864916135, "loss": 0.5492, "step": 940 }, { "epoch": 0.8599230595157276, "grad_norm": 0.548478901386261, "learning_rate": 0.00018285096492438424, "loss": 0.5218, "step": 950 }, { "epoch": 0.8689748811948405, "grad_norm": 0.5212107300758362, "learning_rate": 0.00018249713383394303, "loss": 0.5803, "step": 960 }, { "epoch": 0.8780267028739533, "grad_norm": 0.5414544939994812, "learning_rate": 0.00018214003937446253, "loss": 0.5677, "step": 970 }, { "epoch": 0.8870785245530663, "grad_norm": 0.58649742603302, "learning_rate": 0.0001817796956716578, "loss": 0.5331, "step": 980 }, { "epoch": 0.8961303462321792, "grad_norm": 0.5166681408882141, "learning_rate": 0.00018141611697977529, "loss": 0.5032, "step": 990 }, { "epoch": 0.9051821679112921, "grad_norm": 0.6174758076667786, "learning_rate": 0.0001810493176810292, "loss": 0.5148, "step": 1000 }, { "epoch": 0.9051821679112921, "eval_loss": 11.208358764648438, "eval_runtime": 42.0152, "eval_samples_per_second": 7.545, "eval_steps_per_second": 3.784, "step": 1000 }, { "epoch": 0.914233989590405, "grad_norm": 0.6219255924224854, "learning_rate": 0.00018067931228503246, "loss": 0.5486, "step": 1010 }, { "epoch": 0.923285811269518, "grad_norm": 0.5557869672775269, "learning_rate": 0.00018030611542822257, "loss": 0.4645, "step": 1020 }, { "epoch": 0.9323376329486309, "grad_norm": 0.6376787424087524, "learning_rate": 0.00017992974187328305, "loss": 0.5582, "step": 1030 }, { "epoch": 0.9413894546277438, "grad_norm": 0.6127185821533203, "learning_rate": 0.000179550206508559, "loss": 0.5878, "step": 1040 }, { "epoch": 0.9504412763068567, "grad_norm": 1.0581947565078735, "learning_rate": 0.00017916752434746856, "loss": 0.5113, "step": 1050 }, { "epoch": 0.9594930979859697, "grad_norm": 0.6159986257553101, "learning_rate": 0.00017878171052790868, "loss": 0.5226, "step": 1060 }, { "epoch": 0.9685449196650826, "grad_norm": 0.6697051525115967, "learning_rate": 0.00017839278031165658, "loss": 0.5011, "step": 1070 }, { "epoch": 0.9775967413441955, "grad_norm": 0.6310513019561768, "learning_rate": 0.00017800074908376584, "loss": 0.4599, "step": 1080 }, { "epoch": 0.9866485630233084, "grad_norm": 0.5671334266662598, "learning_rate": 0.0001776056323519579, "loss": 0.4455, "step": 1090 }, { "epoch": 0.9957003847024214, "grad_norm": 0.5935373902320862, "learning_rate": 0.00017720744574600863, "loss": 0.472, "step": 1100 }, { "epoch": 1.0054310930074677, "grad_norm": 0.4842943251132965, "learning_rate": 0.00017680620501712996, "loss": 0.4619, "step": 1110 }, { "epoch": 1.0144829146865806, "grad_norm": 0.5976426601409912, "learning_rate": 0.00017640192603734692, "loss": 0.4535, "step": 1120 }, { "epoch": 1.0235347363656937, "grad_norm": 0.5157873630523682, "learning_rate": 0.00017599462479886974, "loss": 0.3832, "step": 1130 }, { "epoch": 1.0325865580448066, "grad_norm": 0.6044872403144836, "learning_rate": 0.00017558431741346122, "loss": 0.4133, "step": 1140 }, { "epoch": 1.0416383797239195, "grad_norm": 0.6488142013549805, "learning_rate": 0.00017517102011179933, "loss": 0.4391, "step": 1150 }, { "epoch": 1.0506902014030324, "grad_norm": 0.5446122884750366, "learning_rate": 0.00017475474924283536, "loss": 0.3626, "step": 1160 }, { "epoch": 1.0597420230821453, "grad_norm": 0.4834959805011749, "learning_rate": 0.000174335521273147, "loss": 0.3712, "step": 1170 }, { "epoch": 1.0687938447612582, "grad_norm": 0.506747305393219, "learning_rate": 0.00017391335278628712, "loss": 0.3985, "step": 1180 }, { "epoch": 1.077845666440371, "grad_norm": 0.5550024509429932, "learning_rate": 0.0001734882604821276, "loss": 0.4201, "step": 1190 }, { "epoch": 1.086897488119484, "grad_norm": 0.5207591652870178, "learning_rate": 0.00017306026117619889, "loss": 0.3782, "step": 1200 }, { "epoch": 1.095949309798597, "grad_norm": 0.6619957089424133, "learning_rate": 0.00017262937179902472, "loss": 0.4203, "step": 1210 }, { "epoch": 1.10500113147771, "grad_norm": 0.6261160969734192, "learning_rate": 0.00017219560939545246, "loss": 0.3877, "step": 1220 }, { "epoch": 1.1140529531568228, "grad_norm": 0.6254423260688782, "learning_rate": 0.0001717589911239788, "loss": 0.4245, "step": 1230 }, { "epoch": 1.1231047748359357, "grad_norm": 0.5006200671195984, "learning_rate": 0.00017131953425607104, "loss": 0.399, "step": 1240 }, { "epoch": 1.1321565965150486, "grad_norm": 0.5162988305091858, "learning_rate": 0.00017087725617548385, "loss": 0.4325, "step": 1250 }, { "epoch": 1.1412084181941615, "grad_norm": 0.5567941665649414, "learning_rate": 0.00017043217437757164, "loss": 0.4391, "step": 1260 }, { "epoch": 1.1502602398732744, "grad_norm": 0.6780401468276978, "learning_rate": 0.00016998430646859654, "loss": 0.4388, "step": 1270 }, { "epoch": 1.1593120615523875, "grad_norm": 0.6121058464050293, "learning_rate": 0.00016953367016503182, "loss": 0.4216, "step": 1280 }, { "epoch": 1.1683638832315004, "grad_norm": 0.6004344820976257, "learning_rate": 0.00016908028329286112, "loss": 0.4171, "step": 1290 }, { "epoch": 1.1774157049106133, "grad_norm": 0.46892327070236206, "learning_rate": 0.0001686241637868734, "loss": 0.3628, "step": 1300 }, { "epoch": 1.1864675265897262, "grad_norm": 0.6038176417350769, "learning_rate": 0.00016816532968995328, "loss": 0.3771, "step": 1310 }, { "epoch": 1.195519348268839, "grad_norm": 0.5433237552642822, "learning_rate": 0.00016770379915236766, "loss": 0.4029, "step": 1320 }, { "epoch": 1.204571169947952, "grad_norm": 0.6447364091873169, "learning_rate": 0.00016723959043104728, "loss": 0.4154, "step": 1330 }, { "epoch": 1.2136229916270649, "grad_norm": 0.45961570739746094, "learning_rate": 0.00016677272188886483, "loss": 0.412, "step": 1340 }, { "epoch": 1.2226748133061778, "grad_norm": 0.5007642507553101, "learning_rate": 0.00016630321199390867, "loss": 0.4137, "step": 1350 }, { "epoch": 1.2317266349852907, "grad_norm": 0.4964968264102936, "learning_rate": 0.00016583107931875192, "loss": 0.3922, "step": 1360 }, { "epoch": 1.2407784566644038, "grad_norm": 0.573707640171051, "learning_rate": 0.00016535634253971794, "loss": 0.3778, "step": 1370 }, { "epoch": 1.2498302783435167, "grad_norm": 0.5992773771286011, "learning_rate": 0.00016487902043614173, "loss": 0.4528, "step": 1380 }, { "epoch": 1.2588821000226296, "grad_norm": 0.650651216506958, "learning_rate": 0.00016439913188962685, "loss": 0.3915, "step": 1390 }, { "epoch": 1.2679339217017425, "grad_norm": 0.5636417865753174, "learning_rate": 0.0001639166958832985, "loss": 0.3947, "step": 1400 }, { "epoch": 1.2769857433808554, "grad_norm": 0.5544995665550232, "learning_rate": 0.00016343173150105278, "loss": 0.4252, "step": 1410 }, { "epoch": 1.2860375650599682, "grad_norm": 0.5746600031852722, "learning_rate": 0.0001629442579268016, "loss": 0.4127, "step": 1420 }, { "epoch": 1.2950893867390811, "grad_norm": 0.5972291231155396, "learning_rate": 0.0001624542944437139, "loss": 0.3918, "step": 1430 }, { "epoch": 1.3041412084181943, "grad_norm": 0.6181298494338989, "learning_rate": 0.00016196186043345288, "loss": 0.3615, "step": 1440 }, { "epoch": 1.3131930300973071, "grad_norm": 0.5114730000495911, "learning_rate": 0.00016146697537540924, "loss": 0.3966, "step": 1450 }, { "epoch": 1.32224485177642, "grad_norm": 0.5740589499473572, "learning_rate": 0.0001609696588459307, "loss": 0.4061, "step": 1460 }, { "epoch": 1.331296673455533, "grad_norm": 0.6494064927101135, "learning_rate": 0.00016046993051754756, "loss": 0.3979, "step": 1470 }, { "epoch": 1.3403484951346458, "grad_norm": 0.6532962322235107, "learning_rate": 0.0001599678101581945, "loss": 0.4531, "step": 1480 }, { "epoch": 1.3494003168137587, "grad_norm": 0.7263498306274414, "learning_rate": 0.00015946331763042867, "loss": 0.3788, "step": 1490 }, { "epoch": 1.3584521384928716, "grad_norm": 0.8316817879676819, "learning_rate": 0.00015895647289064396, "loss": 0.4298, "step": 1500 }, { "epoch": 1.3584521384928716, "eval_loss": 11.852625846862793, "eval_runtime": 42.2271, "eval_samples_per_second": 7.507, "eval_steps_per_second": 3.765, "step": 1500 }, { "epoch": 1.3675039601719847, "grad_norm": 0.4995158314704895, "learning_rate": 0.0001584472959882815, "loss": 0.4021, "step": 1510 }, { "epoch": 1.3765557818510976, "grad_norm": 0.6390697360038757, "learning_rate": 0.0001579358070650367, "loss": 0.3633, "step": 1520 }, { "epoch": 1.3856076035302105, "grad_norm": 0.5589749217033386, "learning_rate": 0.00015742202635406235, "loss": 0.3524, "step": 1530 }, { "epoch": 1.3946594252093234, "grad_norm": 0.5728365778923035, "learning_rate": 0.0001569059741791684, "loss": 0.3456, "step": 1540 }, { "epoch": 1.4037112468884363, "grad_norm": 0.45577678084373474, "learning_rate": 0.0001563876709540178, "loss": 0.3706, "step": 1550 }, { "epoch": 1.4127630685675492, "grad_norm": 0.7161752581596375, "learning_rate": 0.00015586713718131922, "loss": 0.362, "step": 1560 }, { "epoch": 1.421814890246662, "grad_norm": 0.530432403087616, "learning_rate": 0.0001553443934520159, "loss": 0.3798, "step": 1570 }, { "epoch": 1.4308667119257752, "grad_norm": 0.5991781949996948, "learning_rate": 0.00015481946044447099, "loss": 0.3965, "step": 1580 }, { "epoch": 1.4399185336048879, "grad_norm": 0.5994631052017212, "learning_rate": 0.00015429235892364994, "loss": 0.3855, "step": 1590 }, { "epoch": 1.448970355284001, "grad_norm": 0.5677673816680908, "learning_rate": 0.00015376310974029873, "loss": 0.3773, "step": 1600 }, { "epoch": 1.4580221769631139, "grad_norm": 0.4799586236476898, "learning_rate": 0.0001532317338301192, "loss": 0.3466, "step": 1610 }, { "epoch": 1.4670739986422268, "grad_norm": 0.5448312163352966, "learning_rate": 0.00015269825221294098, "loss": 0.3664, "step": 1620 }, { "epoch": 1.4761258203213397, "grad_norm": 0.5199277400970459, "learning_rate": 0.0001521626859918898, "loss": 0.3838, "step": 1630 }, { "epoch": 1.4851776420004525, "grad_norm": 0.4976363778114319, "learning_rate": 0.00015162505635255287, "loss": 0.3811, "step": 1640 }, { "epoch": 1.4942294636795654, "grad_norm": 0.6075541377067566, "learning_rate": 0.0001510853845621409, "loss": 0.3756, "step": 1650 }, { "epoch": 1.5032812853586783, "grad_norm": 0.5979759097099304, "learning_rate": 0.00015054369196864644, "loss": 0.3973, "step": 1660 }, { "epoch": 1.5123331070377914, "grad_norm": 0.9770258665084839, "learning_rate": 0.00015000000000000001, "loss": 0.4124, "step": 1670 }, { "epoch": 1.5213849287169041, "grad_norm": 0.580491840839386, "learning_rate": 0.0001494543301632219, "loss": 0.3827, "step": 1680 }, { "epoch": 1.5304367503960172, "grad_norm": 0.6543178558349609, "learning_rate": 0.0001489067040435717, "loss": 0.4009, "step": 1690 }, { "epoch": 1.5394885720751301, "grad_norm": 0.5909605622291565, "learning_rate": 0.00014835714330369446, "loss": 0.4272, "step": 1700 }, { "epoch": 1.548540393754243, "grad_norm": 0.4959102272987366, "learning_rate": 0.0001478056696827636, "loss": 0.3536, "step": 1710 }, { "epoch": 1.557592215433356, "grad_norm": 0.5864228010177612, "learning_rate": 0.00014725230499562119, "loss": 0.3657, "step": 1720 }, { "epoch": 1.5666440371124688, "grad_norm": 0.6094350814819336, "learning_rate": 0.00014669707113191483, "loss": 0.3889, "step": 1730 }, { "epoch": 1.575695858791582, "grad_norm": 0.45741668343544006, "learning_rate": 0.00014613999005523174, "loss": 0.37, "step": 1740 }, { "epoch": 1.5847476804706946, "grad_norm": 0.5069059729576111, "learning_rate": 0.00014558108380223012, "loss": 0.3992, "step": 1750 }, { "epoch": 1.5937995021498077, "grad_norm": 0.535524845123291, "learning_rate": 0.00014502037448176734, "loss": 0.3763, "step": 1760 }, { "epoch": 1.6028513238289206, "grad_norm": 0.5067459940910339, "learning_rate": 0.00014445788427402528, "loss": 0.3409, "step": 1770 }, { "epoch": 1.6119031455080335, "grad_norm": 0.5375223159790039, "learning_rate": 0.00014389363542963306, "loss": 0.3778, "step": 1780 }, { "epoch": 1.6209549671871464, "grad_norm": 0.5057679414749146, "learning_rate": 0.00014332765026878687, "loss": 0.362, "step": 1790 }, { "epoch": 1.6300067888662593, "grad_norm": 0.5510945320129395, "learning_rate": 0.00014275995118036693, "loss": 0.374, "step": 1800 }, { "epoch": 1.6390586105453724, "grad_norm": 0.619204580783844, "learning_rate": 0.00014219056062105193, "loss": 0.4279, "step": 1810 }, { "epoch": 1.648110432224485, "grad_norm": 0.5340638160705566, "learning_rate": 0.00014161950111443077, "loss": 0.3675, "step": 1820 }, { "epoch": 1.6571622539035982, "grad_norm": 0.6182312965393066, "learning_rate": 0.0001410467952501114, "loss": 0.3686, "step": 1830 }, { "epoch": 1.666214075582711, "grad_norm": 0.5214246511459351, "learning_rate": 0.00014047246568282736, "loss": 0.4001, "step": 1840 }, { "epoch": 1.675265897261824, "grad_norm": 0.4987814128398895, "learning_rate": 0.00013989653513154165, "loss": 0.357, "step": 1850 }, { "epoch": 1.6843177189409368, "grad_norm": 0.6497951149940491, "learning_rate": 0.0001393190263785479, "loss": 0.3722, "step": 1860 }, { "epoch": 1.6933695406200497, "grad_norm": 0.4140976667404175, "learning_rate": 0.00013873996226856933, "loss": 0.316, "step": 1870 }, { "epoch": 1.7024213622991629, "grad_norm": 0.5191941857337952, "learning_rate": 0.00013815936570785487, "loss": 0.3865, "step": 1880 }, { "epoch": 1.7114731839782755, "grad_norm": 0.5386790037155151, "learning_rate": 0.00013757725966327322, "loss": 0.3499, "step": 1890 }, { "epoch": 1.7205250056573886, "grad_norm": 0.5060738921165466, "learning_rate": 0.00013699366716140435, "loss": 0.3481, "step": 1900 }, { "epoch": 1.7295768273365013, "grad_norm": 0.5718989968299866, "learning_rate": 0.0001364086112876284, "loss": 0.3713, "step": 1910 }, { "epoch": 1.7386286490156144, "grad_norm": 0.5812616348266602, "learning_rate": 0.00013582211518521273, "loss": 0.3651, "step": 1920 }, { "epoch": 1.7476804706947273, "grad_norm": 0.42051294445991516, "learning_rate": 0.00013523420205439646, "loss": 0.3581, "step": 1930 }, { "epoch": 1.7567322923738402, "grad_norm": 0.6224139928817749, "learning_rate": 0.00013464489515147238, "loss": 0.3663, "step": 1940 }, { "epoch": 1.765784114052953, "grad_norm": 0.5418313145637512, "learning_rate": 0.00013405421778786737, "loss": 0.3521, "step": 1950 }, { "epoch": 1.774835935732066, "grad_norm": 0.5429657101631165, "learning_rate": 0.00013346219332922016, "loss": 0.3946, "step": 1960 }, { "epoch": 1.783887757411179, "grad_norm": 0.5962104797363281, "learning_rate": 0.0001328688451944569, "loss": 0.4121, "step": 1970 }, { "epoch": 1.7929395790902918, "grad_norm": 0.5818737745285034, "learning_rate": 0.00013227419685486492, "loss": 0.3209, "step": 1980 }, { "epoch": 1.801991400769405, "grad_norm": 0.4606368839740753, "learning_rate": 0.0001316782718331643, "loss": 0.316, "step": 1990 }, { "epoch": 1.8110432224485178, "grad_norm": 0.49896928668022156, "learning_rate": 0.00013108109370257712, "loss": 0.3538, "step": 2000 }, { "epoch": 1.8110432224485178, "eval_loss": 11.89608383178711, "eval_runtime": 41.6812, "eval_samples_per_second": 7.605, "eval_steps_per_second": 3.815, "step": 2000 }, { "epoch": 1.8200950441276307, "grad_norm": 0.4855138659477234, "learning_rate": 0.00013048268608589533, "loss": 0.3578, "step": 2010 }, { "epoch": 1.8291468658067436, "grad_norm": 0.577150285243988, "learning_rate": 0.00012988307265454597, "loss": 0.3644, "step": 2020 }, { "epoch": 1.8381986874858565, "grad_norm": 0.5089125037193298, "learning_rate": 0.00012928227712765504, "loss": 0.398, "step": 2030 }, { "epoch": 1.8472505091649696, "grad_norm": 0.44641053676605225, "learning_rate": 0.00012868032327110904, "loss": 0.3656, "step": 2040 }, { "epoch": 1.8563023308440822, "grad_norm": 0.5212085247039795, "learning_rate": 0.00012807723489661495, "loss": 0.3396, "step": 2050 }, { "epoch": 1.8653541525231954, "grad_norm": 0.48269546031951904, "learning_rate": 0.0001274730358607583, "loss": 0.3649, "step": 2060 }, { "epoch": 1.8744059742023083, "grad_norm": 0.5938083529472351, "learning_rate": 0.00012686775006405946, "loss": 0.3318, "step": 2070 }, { "epoch": 1.8834577958814211, "grad_norm": 0.46941766142845154, "learning_rate": 0.0001262614014500282, "loss": 0.3625, "step": 2080 }, { "epoch": 1.892509617560534, "grad_norm": 0.6015097498893738, "learning_rate": 0.00012565401400421651, "loss": 0.4174, "step": 2090 }, { "epoch": 1.901561439239647, "grad_norm": 0.6819751858711243, "learning_rate": 0.00012504561175326985, "loss": 0.351, "step": 2100 }, { "epoch": 1.91061326091876, "grad_norm": 0.4607352018356323, "learning_rate": 0.0001244362187639767, "loss": 0.3488, "step": 2110 }, { "epoch": 1.9196650825978727, "grad_norm": 0.7872518301010132, "learning_rate": 0.0001238258591423165, "loss": 0.3719, "step": 2120 }, { "epoch": 1.9287169042769858, "grad_norm": 0.49948009848594666, "learning_rate": 0.00012321455703250616, "loss": 0.3415, "step": 2130 }, { "epoch": 1.9377687259560985, "grad_norm": 0.513294517993927, "learning_rate": 0.0001226023366160449, "loss": 0.3575, "step": 2140 }, { "epoch": 1.9468205476352116, "grad_norm": 0.7481987476348877, "learning_rate": 0.00012198922211075778, "loss": 0.3465, "step": 2150 }, { "epoch": 1.9558723693143245, "grad_norm": 0.4865647852420807, "learning_rate": 0.00012137523776983757, "loss": 0.3246, "step": 2160 }, { "epoch": 1.9649241909934374, "grad_norm": 0.35988450050354004, "learning_rate": 0.00012076040788088554, "loss": 0.3506, "step": 2170 }, { "epoch": 1.9739760126725503, "grad_norm": 0.46516749262809753, "learning_rate": 0.00012014475676495052, "loss": 0.3077, "step": 2180 }, { "epoch": 1.9830278343516632, "grad_norm": 0.5179293751716614, "learning_rate": 0.000119528308775567, "loss": 0.375, "step": 2190 }, { "epoch": 1.9920796560307763, "grad_norm": 0.5721428394317627, "learning_rate": 0.00011891108829779165, "loss": 0.3503, "step": 2200 }, { "epoch": 2.0018103643358227, "grad_norm": 0.4291672706604004, "learning_rate": 0.00011829311974723867, "loss": 0.3745, "step": 2210 }, { "epoch": 2.0108621860149354, "grad_norm": 0.4879133701324463, "learning_rate": 0.00011767442756911417, "loss": 0.2479, "step": 2220 }, { "epoch": 2.0199140076940485, "grad_norm": 0.5393707752227783, "learning_rate": 0.00011705503623724898, "loss": 0.222, "step": 2230 }, { "epoch": 2.028965829373161, "grad_norm": 0.5438636541366577, "learning_rate": 0.00011643497025313061, "loss": 0.2726, "step": 2240 }, { "epoch": 2.0380176510522743, "grad_norm": 0.5499648451805115, "learning_rate": 0.0001158142541449341, "loss": 0.2583, "step": 2250 }, { "epoch": 2.0470694727313874, "grad_norm": 0.5354553461074829, "learning_rate": 0.0001151929124665516, "loss": 0.2365, "step": 2260 }, { "epoch": 2.0561212944105, "grad_norm": 0.586413562297821, "learning_rate": 0.00011457096979662114, "loss": 0.243, "step": 2270 }, { "epoch": 2.065173116089613, "grad_norm": 0.48880791664123535, "learning_rate": 0.00011394845073755455, "loss": 0.2534, "step": 2280 }, { "epoch": 2.074224937768726, "grad_norm": 0.5162414908409119, "learning_rate": 0.00011332537991456398, "loss": 0.2279, "step": 2290 }, { "epoch": 2.083276759447839, "grad_norm": 0.6083577871322632, "learning_rate": 0.00011270178197468789, "loss": 0.2353, "step": 2300 }, { "epoch": 2.0923285811269516, "grad_norm": 0.5396526455879211, "learning_rate": 0.00011207768158581613, "loss": 0.2396, "step": 2310 }, { "epoch": 2.1013804028060648, "grad_norm": 0.5191901326179504, "learning_rate": 0.00011145310343571411, "loss": 0.2557, "step": 2320 }, { "epoch": 2.1104322244851774, "grad_norm": 0.4944523572921753, "learning_rate": 0.0001108280722310462, "loss": 0.2332, "step": 2330 }, { "epoch": 2.1194840461642905, "grad_norm": 0.49368587136268616, "learning_rate": 0.00011020261269639842, "loss": 0.2395, "step": 2340 }, { "epoch": 2.1285358678434037, "grad_norm": 0.5858927369117737, "learning_rate": 0.00010957674957330042, "loss": 0.27, "step": 2350 }, { "epoch": 2.1375876895225163, "grad_norm": 0.5289851427078247, "learning_rate": 0.00010895050761924668, "loss": 0.2777, "step": 2360 }, { "epoch": 2.1466395112016294, "grad_norm": 0.5512372851371765, "learning_rate": 0.00010832391160671729, "loss": 0.257, "step": 2370 }, { "epoch": 2.155691332880742, "grad_norm": 0.48882365226745605, "learning_rate": 0.00010769698632219794, "loss": 0.2513, "step": 2380 }, { "epoch": 2.164743154559855, "grad_norm": 0.4982316792011261, "learning_rate": 0.00010706975656519946, "loss": 0.2265, "step": 2390 }, { "epoch": 2.173794976238968, "grad_norm": 0.5512099266052246, "learning_rate": 0.00010644224714727681, "loss": 0.2586, "step": 2400 }, { "epoch": 2.182846797918081, "grad_norm": 0.5546220541000366, "learning_rate": 0.00010581448289104758, "loss": 0.2494, "step": 2410 }, { "epoch": 2.191898619597194, "grad_norm": 0.6402847170829773, "learning_rate": 0.00010518648862921012, "loss": 0.2467, "step": 2420 }, { "epoch": 2.200950441276307, "grad_norm": 0.4461919665336609, "learning_rate": 0.00010455828920356115, "loss": 0.2248, "step": 2430 }, { "epoch": 2.21000226295542, "grad_norm": 0.562235951423645, "learning_rate": 0.00010392990946401313, "loss": 0.2196, "step": 2440 }, { "epoch": 2.2190540846345326, "grad_norm": 0.49623265862464905, "learning_rate": 0.00010330137426761135, "loss": 0.2572, "step": 2450 }, { "epoch": 2.2281059063136457, "grad_norm": 0.5660530924797058, "learning_rate": 0.00010267270847755048, "loss": 0.2523, "step": 2460 }, { "epoch": 2.2371577279927584, "grad_norm": 0.5315167307853699, "learning_rate": 0.00010204393696219117, "loss": 0.2489, "step": 2470 }, { "epoch": 2.2462095496718715, "grad_norm": 0.530099093914032, "learning_rate": 0.00010141508459407623, "loss": 0.2603, "step": 2480 }, { "epoch": 2.2552613713509846, "grad_norm": 0.6275547742843628, "learning_rate": 0.00010078617624894684, "loss": 0.2338, "step": 2490 }, { "epoch": 2.2643131930300973, "grad_norm": 0.5999734997749329, "learning_rate": 0.00010015723680475846, "loss": 0.2418, "step": 2500 }, { "epoch": 2.2643131930300973, "eval_loss": 12.012301445007324, "eval_runtime": 42.3048, "eval_samples_per_second": 7.493, "eval_steps_per_second": 3.758, "step": 2500 }, { "epoch": 2.2733650147092104, "grad_norm": 0.5600557327270508, "learning_rate": 9.95282911406968e-05, "loss": 0.2394, "step": 2510 }, { "epoch": 2.282416836388323, "grad_norm": 0.383777379989624, "learning_rate": 9.889936413619356e-05, "loss": 0.2331, "step": 2520 }, { "epoch": 2.291468658067436, "grad_norm": 0.5634308457374573, "learning_rate": 9.827048066994225e-05, "loss": 0.2492, "step": 2530 }, { "epoch": 2.300520479746549, "grad_norm": 0.5783660411834717, "learning_rate": 9.764166561891432e-05, "loss": 0.2524, "step": 2540 }, { "epoch": 2.309572301425662, "grad_norm": 0.6156514286994934, "learning_rate": 9.70129438573747e-05, "loss": 0.2655, "step": 2550 }, { "epoch": 2.318624123104775, "grad_norm": 0.3963329493999481, "learning_rate": 9.63843402558981e-05, "loss": 0.2459, "step": 2560 }, { "epoch": 2.3276759447838877, "grad_norm": 0.4392152726650238, "learning_rate": 9.57558796803852e-05, "loss": 0.2579, "step": 2570 }, { "epoch": 2.336727766463001, "grad_norm": 0.5545589327812195, "learning_rate": 9.512758699107879e-05, "loss": 0.2716, "step": 2580 }, { "epoch": 2.3457795881421135, "grad_norm": 0.6256219744682312, "learning_rate": 9.449948704158071e-05, "loss": 0.2644, "step": 2590 }, { "epoch": 2.3548314098212266, "grad_norm": 0.5015664100646973, "learning_rate": 9.38716046778684e-05, "loss": 0.2603, "step": 2600 }, { "epoch": 2.3638832315003393, "grad_norm": 0.5921751260757446, "learning_rate": 9.324396473731217e-05, "loss": 0.2713, "step": 2610 }, { "epoch": 2.3729350531794524, "grad_norm": 0.4421112835407257, "learning_rate": 9.261659204769284e-05, "loss": 0.2663, "step": 2620 }, { "epoch": 2.3819868748585655, "grad_norm": 0.5723668336868286, "learning_rate": 9.198951142621929e-05, "loss": 0.2757, "step": 2630 }, { "epoch": 2.391038696537678, "grad_norm": 0.5875786542892456, "learning_rate": 9.136274767854716e-05, "loss": 0.2764, "step": 2640 }, { "epoch": 2.4000905182167913, "grad_norm": 0.6496031284332275, "learning_rate": 9.07363255977973e-05, "loss": 0.273, "step": 2650 }, { "epoch": 2.409142339895904, "grad_norm": 0.39376866817474365, "learning_rate": 9.011026996357503e-05, "loss": 0.23, "step": 2660 }, { "epoch": 2.418194161575017, "grad_norm": 0.6112829446792603, "learning_rate": 8.948460554099018e-05, "loss": 0.2605, "step": 2670 }, { "epoch": 2.4272459832541298, "grad_norm": 0.519059419631958, "learning_rate": 8.885935707967716e-05, "loss": 0.2501, "step": 2680 }, { "epoch": 2.436297804933243, "grad_norm": 0.8110079765319824, "learning_rate": 8.823454931281616e-05, "loss": 0.2387, "step": 2690 }, { "epoch": 2.4453496266123556, "grad_norm": 0.7474923133850098, "learning_rate": 8.76102069561545e-05, "loss": 0.2655, "step": 2700 }, { "epoch": 2.4544014482914687, "grad_norm": 0.44778284430503845, "learning_rate": 8.698635470702923e-05, "loss": 0.2285, "step": 2710 }, { "epoch": 2.4634532699705813, "grad_norm": 0.5677033066749573, "learning_rate": 8.636301724339004e-05, "loss": 0.2433, "step": 2720 }, { "epoch": 2.4725050916496945, "grad_norm": 0.5416902303695679, "learning_rate": 8.574021922282292e-05, "loss": 0.238, "step": 2730 }, { "epoch": 2.4815569133288076, "grad_norm": 0.5404213666915894, "learning_rate": 8.511798528157512e-05, "loss": 0.2682, "step": 2740 }, { "epoch": 2.4906087350079202, "grad_norm": 0.4115196168422699, "learning_rate": 8.449634003358022e-05, "loss": 0.2486, "step": 2750 }, { "epoch": 2.4996605566870334, "grad_norm": 0.40759924054145813, "learning_rate": 8.387530806948476e-05, "loss": 0.2381, "step": 2760 }, { "epoch": 2.508712378366146, "grad_norm": 0.5902087092399597, "learning_rate": 8.325491395567541e-05, "loss": 0.2364, "step": 2770 }, { "epoch": 2.517764200045259, "grad_norm": 0.5643983483314514, "learning_rate": 8.263518223330697e-05, "loss": 0.2322, "step": 2780 }, { "epoch": 2.526816021724372, "grad_norm": 0.55036860704422, "learning_rate": 8.201613741733203e-05, "loss": 0.2697, "step": 2790 }, { "epoch": 2.535867843403485, "grad_norm": 0.5846819877624512, "learning_rate": 8.13978039955308e-05, "loss": 0.2648, "step": 2800 }, { "epoch": 2.544919665082598, "grad_norm": 0.6239003539085388, "learning_rate": 8.078020642754274e-05, "loss": 0.244, "step": 2810 }, { "epoch": 2.5539714867617107, "grad_norm": 0.49254196882247925, "learning_rate": 8.016336914389874e-05, "loss": 0.2475, "step": 2820 }, { "epoch": 2.563023308440824, "grad_norm": 0.737869381904602, "learning_rate": 7.954731654505491e-05, "loss": 0.2694, "step": 2830 }, { "epoch": 2.5720751301199365, "grad_norm": 0.5183305144309998, "learning_rate": 7.89320730004274e-05, "loss": 0.234, "step": 2840 }, { "epoch": 2.5811269517990496, "grad_norm": 0.4777659773826599, "learning_rate": 7.831766284742807e-05, "loss": 0.2552, "step": 2850 }, { "epoch": 2.5901787734781623, "grad_norm": 0.553449809551239, "learning_rate": 7.77041103905023e-05, "loss": 0.2275, "step": 2860 }, { "epoch": 2.5992305951572754, "grad_norm": 0.542242169380188, "learning_rate": 7.709143990016702e-05, "loss": 0.2736, "step": 2870 }, { "epoch": 2.6082824168363885, "grad_norm": 0.564593493938446, "learning_rate": 7.6479675612051e-05, "loss": 0.2469, "step": 2880 }, { "epoch": 2.617334238515501, "grad_norm": 0.37938451766967773, "learning_rate": 7.586884172593609e-05, "loss": 0.2362, "step": 2890 }, { "epoch": 2.6263860601946143, "grad_norm": 0.6128523945808411, "learning_rate": 7.525896240479976e-05, "loss": 0.2456, "step": 2900 }, { "epoch": 2.635437881873727, "grad_norm": 0.6073201894760132, "learning_rate": 7.465006177385953e-05, "loss": 0.2413, "step": 2910 }, { "epoch": 2.64448970355284, "grad_norm": 0.4320588707923889, "learning_rate": 7.404216391961847e-05, "loss": 0.243, "step": 2920 }, { "epoch": 2.6535415252319527, "grad_norm": 0.40451350808143616, "learning_rate": 7.343529288891239e-05, "loss": 0.265, "step": 2930 }, { "epoch": 2.662593346911066, "grad_norm": 0.43473196029663086, "learning_rate": 7.282947268795877e-05, "loss": 0.2267, "step": 2940 }, { "epoch": 2.671645168590179, "grad_norm": 0.6388351917266846, "learning_rate": 7.222472728140695e-05, "loss": 0.2745, "step": 2950 }, { "epoch": 2.6806969902692916, "grad_norm": 0.5615083575248718, "learning_rate": 7.162108059139032e-05, "loss": 0.25, "step": 2960 }, { "epoch": 2.6897488119484048, "grad_norm": 0.6018364429473877, "learning_rate": 7.101855649657991e-05, "loss": 0.2601, "step": 2970 }, { "epoch": 2.6988006336275174, "grad_norm": 0.5931141376495361, "learning_rate": 7.041717883123977e-05, "loss": 0.2455, "step": 2980 }, { "epoch": 2.7078524553066305, "grad_norm": 0.4409467875957489, "learning_rate": 6.981697138428434e-05, "loss": 0.2584, "step": 2990 }, { "epoch": 2.716904276985743, "grad_norm": 0.6161777377128601, "learning_rate": 6.921795789833723e-05, "loss": 0.2364, "step": 3000 }, { "epoch": 2.716904276985743, "eval_loss": 11.997015953063965, "eval_runtime": 42.4537, "eval_samples_per_second": 7.467, "eval_steps_per_second": 3.745, "step": 3000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.6779958650393395e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }