diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21978 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 31359, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003188877196339169, + "grad_norm": 2.7591781616210938, + "learning_rate": 1.5944385981695843e-05, + "loss": 9.7119, + "step": 10 + }, + { + "epoch": 0.0006377754392678338, + "grad_norm": 2.3757214546203613, + "learning_rate": 3.1888771963391685e-05, + "loss": 8.9972, + "step": 20 + }, + { + "epoch": 0.0009566631589017507, + "grad_norm": 1.9443352222442627, + "learning_rate": 4.7833157945087525e-05, + "loss": 8.234, + "step": 30 + }, + { + "epoch": 0.0012755508785356677, + "grad_norm": 0.9958082437515259, + "learning_rate": 6.377754392678337e-05, + "loss": 7.4942, + "step": 40 + }, + { + "epoch": 0.0015944385981695845, + "grad_norm": 0.7051369547843933, + "learning_rate": 7.972192990847922e-05, + "loss": 6.9288, + "step": 50 + }, + { + "epoch": 0.0019133263178035013, + "grad_norm": 0.5652105808258057, + "learning_rate": 9.566631589017505e-05, + "loss": 6.5521, + "step": 60 + }, + { + "epoch": 0.0022322140374374183, + "grad_norm": 0.8407195806503296, + "learning_rate": 0.00011161070187187091, + "loss": 6.2748, + "step": 70 + }, + { + "epoch": 0.0025511017570713354, + "grad_norm": 0.7927109003067017, + "learning_rate": 0.00012755508785356674, + "loss": 6.0919, + "step": 80 + }, + { + "epoch": 0.002869989476705252, + "grad_norm": 0.647208571434021, + "learning_rate": 0.0001434994738352626, + "loss": 5.9766, + "step": 90 + }, + { + "epoch": 0.003188877196339169, + "grad_norm": 0.6491755843162537, + "learning_rate": 0.00015944385981695843, + "loss": 5.8506, + "step": 100 + }, + { + "epoch": 0.003507764915973086, + "grad_norm": 0.9509009718894958, + "learning_rate": 0.00017538824579865428, + "loss": 5.7138, + "step": 110 + }, + { + "epoch": 0.0038266526356070026, + "grad_norm": 0.7599376440048218, + "learning_rate": 0.0001913326317803501, + "loss": 5.5894, + "step": 120 + }, + { + "epoch": 0.00414554035524092, + "grad_norm": 0.8345359563827515, + "learning_rate": 0.00020727701776204597, + "loss": 5.4928, + "step": 130 + }, + { + "epoch": 0.004464428074874837, + "grad_norm": 0.9486031532287598, + "learning_rate": 0.00022322140374374182, + "loss": 5.3421, + "step": 140 + }, + { + "epoch": 0.004783315794508754, + "grad_norm": 0.7641623616218567, + "learning_rate": 0.00023916578972543764, + "loss": 5.2179, + "step": 150 + }, + { + "epoch": 0.005102203514142671, + "grad_norm": 0.8762153387069702, + "learning_rate": 0.0002551101757071335, + "loss": 5.0897, + "step": 160 + }, + { + "epoch": 0.005421091233776587, + "grad_norm": 0.9552673697471619, + "learning_rate": 0.00027105456168882936, + "loss": 5.0136, + "step": 170 + }, + { + "epoch": 0.005739978953410504, + "grad_norm": 0.8702957034111023, + "learning_rate": 0.0002869989476705252, + "loss": 4.8847, + "step": 180 + }, + { + "epoch": 0.006058866673044421, + "grad_norm": 1.1011112928390503, + "learning_rate": 0.00030294333365222105, + "loss": 4.793, + "step": 190 + }, + { + "epoch": 0.006377754392678338, + "grad_norm": 1.000261664390564, + "learning_rate": 0.00031888771963391687, + "loss": 4.6767, + "step": 200 + }, + { + "epoch": 0.006696642112312255, + "grad_norm": 0.9304866194725037, + "learning_rate": 0.0003348321056156127, + "loss": 4.5988, + "step": 210 + }, + { + "epoch": 0.007015529831946172, + "grad_norm": 1.0546783208847046, + "learning_rate": 0.00035077649159730856, + "loss": 4.4907, + "step": 220 + }, + { + "epoch": 0.007334417551580088, + "grad_norm": 0.9612847566604614, + "learning_rate": 0.00036672087757900443, + "loss": 4.4107, + "step": 230 + }, + { + "epoch": 0.007653305271214005, + "grad_norm": 0.9102572202682495, + "learning_rate": 0.0003826652635607002, + "loss": 4.3265, + "step": 240 + }, + { + "epoch": 0.007972192990847922, + "grad_norm": 0.9532923102378845, + "learning_rate": 0.00039860964954239607, + "loss": 4.2816, + "step": 250 + }, + { + "epoch": 0.00829108071048184, + "grad_norm": 1.0799651145935059, + "learning_rate": 0.00041455403552409194, + "loss": 4.1925, + "step": 260 + }, + { + "epoch": 0.008609968430115756, + "grad_norm": 0.8453513979911804, + "learning_rate": 0.0004304984215057878, + "loss": 4.1438, + "step": 270 + }, + { + "epoch": 0.008928856149749673, + "grad_norm": 0.9012154340744019, + "learning_rate": 0.00044644280748748364, + "loss": 4.0625, + "step": 280 + }, + { + "epoch": 0.00924774386938359, + "grad_norm": 0.9568570852279663, + "learning_rate": 0.0004623871934691794, + "loss": 4.0183, + "step": 290 + }, + { + "epoch": 0.009566631589017507, + "grad_norm": 0.9893571734428406, + "learning_rate": 0.0004783315794508753, + "loss": 3.9543, + "step": 300 + }, + { + "epoch": 0.009885519308651424, + "grad_norm": 1.001813530921936, + "learning_rate": 0.0004942759654325711, + "loss": 3.9078, + "step": 310 + }, + { + "epoch": 0.010204407028285341, + "grad_norm": 0.8830215930938721, + "learning_rate": 0.0005, + "loss": 3.8586, + "step": 320 + }, + { + "epoch": 0.010523294747919257, + "grad_norm": 1.0455135107040405, + "learning_rate": 0.0005, + "loss": 3.79, + "step": 330 + }, + { + "epoch": 0.010842182467553174, + "grad_norm": 0.891879677772522, + "learning_rate": 0.0005, + "loss": 3.7631, + "step": 340 + }, + { + "epoch": 0.01116107018718709, + "grad_norm": 0.8672872185707092, + "learning_rate": 0.0005, + "loss": 3.696, + "step": 350 + }, + { + "epoch": 0.011479957906821008, + "grad_norm": 0.9735280275344849, + "learning_rate": 0.0005, + "loss": 3.6841, + "step": 360 + }, + { + "epoch": 0.011798845626454925, + "grad_norm": 0.8796781301498413, + "learning_rate": 0.0005, + "loss": 3.6076, + "step": 370 + }, + { + "epoch": 0.012117733346088842, + "grad_norm": 0.7970229387283325, + "learning_rate": 0.0005, + "loss": 3.5803, + "step": 380 + }, + { + "epoch": 0.012436621065722759, + "grad_norm": 0.8341770768165588, + "learning_rate": 0.0005, + "loss": 3.5506, + "step": 390 + }, + { + "epoch": 0.012755508785356676, + "grad_norm": 1.0613290071487427, + "learning_rate": 0.0005, + "loss": 3.5163, + "step": 400 + }, + { + "epoch": 0.013074396504990593, + "grad_norm": 0.8499947786331177, + "learning_rate": 0.0005, + "loss": 3.4579, + "step": 410 + }, + { + "epoch": 0.01339328422462451, + "grad_norm": 0.868526816368103, + "learning_rate": 0.0005, + "loss": 3.4293, + "step": 420 + }, + { + "epoch": 0.013712171944258427, + "grad_norm": 1.0044291019439697, + "learning_rate": 0.0005, + "loss": 3.3861, + "step": 430 + }, + { + "epoch": 0.014031059663892344, + "grad_norm": 1.0256859064102173, + "learning_rate": 0.0005, + "loss": 3.3423, + "step": 440 + }, + { + "epoch": 0.014349947383526261, + "grad_norm": 1.0345772504806519, + "learning_rate": 0.0005, + "loss": 3.2853, + "step": 450 + }, + { + "epoch": 0.014668835103160176, + "grad_norm": 1.175614833831787, + "learning_rate": 0.0005, + "loss": 3.2852, + "step": 460 + }, + { + "epoch": 0.014987722822794093, + "grad_norm": 0.8701066970825195, + "learning_rate": 0.0005, + "loss": 3.2546, + "step": 470 + }, + { + "epoch": 0.01530661054242801, + "grad_norm": 0.8358347415924072, + "learning_rate": 0.0005, + "loss": 3.2179, + "step": 480 + }, + { + "epoch": 0.01562549826206193, + "grad_norm": 0.8820248246192932, + "learning_rate": 0.0005, + "loss": 3.1517, + "step": 490 + }, + { + "epoch": 0.015944385981695845, + "grad_norm": 0.9414588809013367, + "learning_rate": 0.0005, + "loss": 3.1477, + "step": 500 + }, + { + "epoch": 0.016263273701329763, + "grad_norm": 1.0134668350219727, + "learning_rate": 0.0005, + "loss": 3.1032, + "step": 510 + }, + { + "epoch": 0.01658216142096368, + "grad_norm": 1.2762371301651, + "learning_rate": 0.0005, + "loss": 3.077, + "step": 520 + }, + { + "epoch": 0.016901049140597594, + "grad_norm": 0.9885225892066956, + "learning_rate": 0.0005, + "loss": 3.0795, + "step": 530 + }, + { + "epoch": 0.017219936860231513, + "grad_norm": 1.1082136631011963, + "learning_rate": 0.0005, + "loss": 3.0401, + "step": 540 + }, + { + "epoch": 0.017538824579865428, + "grad_norm": 0.8814631700515747, + "learning_rate": 0.0005, + "loss": 2.9775, + "step": 550 + }, + { + "epoch": 0.017857712299499347, + "grad_norm": 0.8388587832450867, + "learning_rate": 0.0005, + "loss": 2.9665, + "step": 560 + }, + { + "epoch": 0.018176600019133262, + "grad_norm": 0.8874592781066895, + "learning_rate": 0.0005, + "loss": 2.9424, + "step": 570 + }, + { + "epoch": 0.01849548773876718, + "grad_norm": 0.8496912121772766, + "learning_rate": 0.0005, + "loss": 2.9261, + "step": 580 + }, + { + "epoch": 0.018814375458401096, + "grad_norm": 0.9394707679748535, + "learning_rate": 0.0005, + "loss": 2.8692, + "step": 590 + }, + { + "epoch": 0.019133263178035015, + "grad_norm": 0.9260649681091309, + "learning_rate": 0.0005, + "loss": 2.8651, + "step": 600 + }, + { + "epoch": 0.01945215089766893, + "grad_norm": 0.9702092409133911, + "learning_rate": 0.0005, + "loss": 2.8374, + "step": 610 + }, + { + "epoch": 0.01977103861730285, + "grad_norm": 0.9171802401542664, + "learning_rate": 0.0005, + "loss": 2.7758, + "step": 620 + }, + { + "epoch": 0.020089926336936764, + "grad_norm": 0.906156063079834, + "learning_rate": 0.0005, + "loss": 2.7732, + "step": 630 + }, + { + "epoch": 0.020408814056570683, + "grad_norm": 0.8847390413284302, + "learning_rate": 0.0005, + "loss": 2.7732, + "step": 640 + }, + { + "epoch": 0.0207277017762046, + "grad_norm": 0.8880306482315063, + "learning_rate": 0.0005, + "loss": 2.7348, + "step": 650 + }, + { + "epoch": 0.021046589495838514, + "grad_norm": 0.9086488485336304, + "learning_rate": 0.0005, + "loss": 2.7275, + "step": 660 + }, + { + "epoch": 0.021365477215472432, + "grad_norm": 0.8930760622024536, + "learning_rate": 0.0005, + "loss": 2.7, + "step": 670 + }, + { + "epoch": 0.021684364935106348, + "grad_norm": 0.8453980088233948, + "learning_rate": 0.0005, + "loss": 2.6833, + "step": 680 + }, + { + "epoch": 0.022003252654740266, + "grad_norm": 0.9567738771438599, + "learning_rate": 0.0005, + "loss": 2.6641, + "step": 690 + }, + { + "epoch": 0.02232214037437418, + "grad_norm": 0.8755207657814026, + "learning_rate": 0.0005, + "loss": 2.6219, + "step": 700 + }, + { + "epoch": 0.0226410280940081, + "grad_norm": 0.881864070892334, + "learning_rate": 0.0005, + "loss": 2.6393, + "step": 710 + }, + { + "epoch": 0.022959915813642016, + "grad_norm": 0.940329909324646, + "learning_rate": 0.0005, + "loss": 2.5864, + "step": 720 + }, + { + "epoch": 0.023278803533275935, + "grad_norm": 1.046134352684021, + "learning_rate": 0.0005, + "loss": 2.5888, + "step": 730 + }, + { + "epoch": 0.02359769125290985, + "grad_norm": 0.8772185444831848, + "learning_rate": 0.0005, + "loss": 2.5502, + "step": 740 + }, + { + "epoch": 0.02391657897254377, + "grad_norm": 0.865439236164093, + "learning_rate": 0.0005, + "loss": 2.5728, + "step": 750 + }, + { + "epoch": 0.024235466692177684, + "grad_norm": 0.8309611678123474, + "learning_rate": 0.0005, + "loss": 2.5484, + "step": 760 + }, + { + "epoch": 0.024554354411811603, + "grad_norm": 0.9572917819023132, + "learning_rate": 0.0005, + "loss": 2.5551, + "step": 770 + }, + { + "epoch": 0.024873242131445518, + "grad_norm": 0.8789076805114746, + "learning_rate": 0.0005, + "loss": 2.4962, + "step": 780 + }, + { + "epoch": 0.025192129851079433, + "grad_norm": 0.8763116002082825, + "learning_rate": 0.0005, + "loss": 2.5085, + "step": 790 + }, + { + "epoch": 0.025511017570713352, + "grad_norm": 0.8708914518356323, + "learning_rate": 0.0005, + "loss": 2.5152, + "step": 800 + }, + { + "epoch": 0.025829905290347267, + "grad_norm": 0.9939478039741516, + "learning_rate": 0.0005, + "loss": 2.4608, + "step": 810 + }, + { + "epoch": 0.026148793009981186, + "grad_norm": 0.9306032061576843, + "learning_rate": 0.0005, + "loss": 2.4971, + "step": 820 + }, + { + "epoch": 0.0264676807296151, + "grad_norm": 0.8735993504524231, + "learning_rate": 0.0005, + "loss": 2.441, + "step": 830 + }, + { + "epoch": 0.02678656844924902, + "grad_norm": 0.8575237393379211, + "learning_rate": 0.0005, + "loss": 2.4408, + "step": 840 + }, + { + "epoch": 0.027105456168882935, + "grad_norm": 0.8133852481842041, + "learning_rate": 0.0005, + "loss": 2.4189, + "step": 850 + }, + { + "epoch": 0.027424343888516854, + "grad_norm": 0.9014040231704712, + "learning_rate": 0.0005, + "loss": 2.4153, + "step": 860 + }, + { + "epoch": 0.02774323160815077, + "grad_norm": 0.801937460899353, + "learning_rate": 0.0005, + "loss": 2.3815, + "step": 870 + }, + { + "epoch": 0.028062119327784688, + "grad_norm": 0.8674513101577759, + "learning_rate": 0.0005, + "loss": 2.3742, + "step": 880 + }, + { + "epoch": 0.028381007047418604, + "grad_norm": 0.8789620399475098, + "learning_rate": 0.0005, + "loss": 2.3523, + "step": 890 + }, + { + "epoch": 0.028699894767052522, + "grad_norm": 1.5122326612472534, + "learning_rate": 0.0005, + "loss": 2.3561, + "step": 900 + }, + { + "epoch": 0.029018782486686438, + "grad_norm": 0.9573362469673157, + "learning_rate": 0.0005, + "loss": 2.3151, + "step": 910 + }, + { + "epoch": 0.029337670206320353, + "grad_norm": 0.9171894788742065, + "learning_rate": 0.0005, + "loss": 2.321, + "step": 920 + }, + { + "epoch": 0.02965655792595427, + "grad_norm": 0.9501611590385437, + "learning_rate": 0.0005, + "loss": 2.3244, + "step": 930 + }, + { + "epoch": 0.029975445645588187, + "grad_norm": 0.8088300228118896, + "learning_rate": 0.0005, + "loss": 2.3178, + "step": 940 + }, + { + "epoch": 0.030294333365222106, + "grad_norm": 0.8260691165924072, + "learning_rate": 0.0005, + "loss": 2.2892, + "step": 950 + }, + { + "epoch": 0.03061322108485602, + "grad_norm": 0.8876654505729675, + "learning_rate": 0.0005, + "loss": 2.2933, + "step": 960 + }, + { + "epoch": 0.03093210880448994, + "grad_norm": 0.8482714295387268, + "learning_rate": 0.0005, + "loss": 2.2748, + "step": 970 + }, + { + "epoch": 0.03125099652412386, + "grad_norm": 0.8355926275253296, + "learning_rate": 0.0005, + "loss": 2.286, + "step": 980 + }, + { + "epoch": 0.03156988424375777, + "grad_norm": 0.8398257493972778, + "learning_rate": 0.0005, + "loss": 2.2733, + "step": 990 + }, + { + "epoch": 0.03188877196339169, + "grad_norm": 0.9260374903678894, + "learning_rate": 0.0005, + "loss": 2.251, + "step": 1000 + }, + { + "epoch": 0.03220765968302561, + "grad_norm": 0.8768055438995361, + "learning_rate": 0.0005, + "loss": 2.2422, + "step": 1010 + }, + { + "epoch": 0.03252654740265953, + "grad_norm": 0.9418902397155762, + "learning_rate": 0.0005, + "loss": 2.254, + "step": 1020 + }, + { + "epoch": 0.03284543512229344, + "grad_norm": 0.9414481520652771, + "learning_rate": 0.0005, + "loss": 2.221, + "step": 1030 + }, + { + "epoch": 0.03316432284192736, + "grad_norm": 1.0012530088424683, + "learning_rate": 0.0005, + "loss": 2.1724, + "step": 1040 + }, + { + "epoch": 0.033483210561561276, + "grad_norm": 1.026558756828308, + "learning_rate": 0.0005, + "loss": 2.2094, + "step": 1050 + }, + { + "epoch": 0.03380209828119519, + "grad_norm": 0.8032659292221069, + "learning_rate": 0.0005, + "loss": 2.1611, + "step": 1060 + }, + { + "epoch": 0.03412098600082911, + "grad_norm": 0.8239543437957764, + "learning_rate": 0.0005, + "loss": 2.1877, + "step": 1070 + }, + { + "epoch": 0.034439873720463025, + "grad_norm": 0.8418667316436768, + "learning_rate": 0.0005, + "loss": 2.18, + "step": 1080 + }, + { + "epoch": 0.034758761440096944, + "grad_norm": 0.8027506470680237, + "learning_rate": 0.0005, + "loss": 2.177, + "step": 1090 + }, + { + "epoch": 0.035077649159730856, + "grad_norm": 0.7979521751403809, + "learning_rate": 0.0005, + "loss": 2.1828, + "step": 1100 + }, + { + "epoch": 0.035396536879364775, + "grad_norm": 0.8028553128242493, + "learning_rate": 0.0005, + "loss": 2.1322, + "step": 1110 + }, + { + "epoch": 0.035715424598998693, + "grad_norm": 0.7876245379447937, + "learning_rate": 0.0005, + "loss": 2.1393, + "step": 1120 + }, + { + "epoch": 0.03603431231863261, + "grad_norm": 0.8016355633735657, + "learning_rate": 0.0005, + "loss": 2.1355, + "step": 1130 + }, + { + "epoch": 0.036353200038266524, + "grad_norm": 0.8408504128456116, + "learning_rate": 0.0005, + "loss": 2.1464, + "step": 1140 + }, + { + "epoch": 0.03667208775790044, + "grad_norm": 0.9178126454353333, + "learning_rate": 0.0005, + "loss": 2.135, + "step": 1150 + }, + { + "epoch": 0.03699097547753436, + "grad_norm": 0.965160071849823, + "learning_rate": 0.0005, + "loss": 2.1258, + "step": 1160 + }, + { + "epoch": 0.03730986319716828, + "grad_norm": 0.7981363534927368, + "learning_rate": 0.0005, + "loss": 2.0838, + "step": 1170 + }, + { + "epoch": 0.03762875091680219, + "grad_norm": 0.894565999507904, + "learning_rate": 0.0005, + "loss": 2.0513, + "step": 1180 + }, + { + "epoch": 0.03794763863643611, + "grad_norm": 0.8362721800804138, + "learning_rate": 0.0005, + "loss": 2.0724, + "step": 1190 + }, + { + "epoch": 0.03826652635607003, + "grad_norm": 0.819324254989624, + "learning_rate": 0.0005, + "loss": 2.0788, + "step": 1200 + }, + { + "epoch": 0.03858541407570394, + "grad_norm": 0.7400639057159424, + "learning_rate": 0.0005, + "loss": 2.0486, + "step": 1210 + }, + { + "epoch": 0.03890430179533786, + "grad_norm": 0.9596002697944641, + "learning_rate": 0.0005, + "loss": 2.0635, + "step": 1220 + }, + { + "epoch": 0.03922318951497178, + "grad_norm": 0.7574199438095093, + "learning_rate": 0.0005, + "loss": 2.0509, + "step": 1230 + }, + { + "epoch": 0.0395420772346057, + "grad_norm": 0.7776308059692383, + "learning_rate": 0.0005, + "loss": 2.057, + "step": 1240 + }, + { + "epoch": 0.03986096495423961, + "grad_norm": 0.7979622483253479, + "learning_rate": 0.0005, + "loss": 2.0323, + "step": 1250 + }, + { + "epoch": 0.04017985267387353, + "grad_norm": 0.777959942817688, + "learning_rate": 0.0005, + "loss": 2.0241, + "step": 1260 + }, + { + "epoch": 0.04049874039350745, + "grad_norm": 0.7778024673461914, + "learning_rate": 0.0005, + "loss": 2.0401, + "step": 1270 + }, + { + "epoch": 0.040817628113141366, + "grad_norm": 0.7771192193031311, + "learning_rate": 0.0005, + "loss": 2.0219, + "step": 1280 + }, + { + "epoch": 0.04113651583277528, + "grad_norm": 0.7752227187156677, + "learning_rate": 0.0005, + "loss": 2.018, + "step": 1290 + }, + { + "epoch": 0.0414554035524092, + "grad_norm": 0.7638623118400574, + "learning_rate": 0.0005, + "loss": 1.986, + "step": 1300 + }, + { + "epoch": 0.041774291272043115, + "grad_norm": 0.7941219806671143, + "learning_rate": 0.0005, + "loss": 1.9811, + "step": 1310 + }, + { + "epoch": 0.04209317899167703, + "grad_norm": 0.7345162630081177, + "learning_rate": 0.0005, + "loss": 2.0212, + "step": 1320 + }, + { + "epoch": 0.042412066711310946, + "grad_norm": 0.8033922910690308, + "learning_rate": 0.0005, + "loss": 1.9976, + "step": 1330 + }, + { + "epoch": 0.042730954430944865, + "grad_norm": 0.7183955311775208, + "learning_rate": 0.0005, + "loss": 1.9659, + "step": 1340 + }, + { + "epoch": 0.04304984215057878, + "grad_norm": 0.7607952356338501, + "learning_rate": 0.0005, + "loss": 1.9921, + "step": 1350 + }, + { + "epoch": 0.043368729870212695, + "grad_norm": 0.8439708352088928, + "learning_rate": 0.0005, + "loss": 1.9723, + "step": 1360 + }, + { + "epoch": 0.043687617589846614, + "grad_norm": 0.7470654249191284, + "learning_rate": 0.0005, + "loss": 1.9738, + "step": 1370 + }, + { + "epoch": 0.04400650530948053, + "grad_norm": 0.7575402855873108, + "learning_rate": 0.0005, + "loss": 1.9791, + "step": 1380 + }, + { + "epoch": 0.04432539302911445, + "grad_norm": 0.7686700224876404, + "learning_rate": 0.0005, + "loss": 1.9262, + "step": 1390 + }, + { + "epoch": 0.04464428074874836, + "grad_norm": 0.956557035446167, + "learning_rate": 0.0005, + "loss": 1.9607, + "step": 1400 + }, + { + "epoch": 0.04496316846838228, + "grad_norm": 0.7784798741340637, + "learning_rate": 0.0005, + "loss": 1.9567, + "step": 1410 + }, + { + "epoch": 0.0452820561880162, + "grad_norm": 0.7413488626480103, + "learning_rate": 0.0005, + "loss": 1.9216, + "step": 1420 + }, + { + "epoch": 0.04560094390765012, + "grad_norm": 0.7566946148872375, + "learning_rate": 0.0005, + "loss": 1.9394, + "step": 1430 + }, + { + "epoch": 0.04591983162728403, + "grad_norm": 0.7943182587623596, + "learning_rate": 0.0005, + "loss": 1.914, + "step": 1440 + }, + { + "epoch": 0.04623871934691795, + "grad_norm": 0.8151775598526001, + "learning_rate": 0.0005, + "loss": 1.9598, + "step": 1450 + }, + { + "epoch": 0.04655760706655187, + "grad_norm": 0.7676635384559631, + "learning_rate": 0.0005, + "loss": 1.9039, + "step": 1460 + }, + { + "epoch": 0.04687649478618578, + "grad_norm": 0.7939006686210632, + "learning_rate": 0.0005, + "loss": 1.9082, + "step": 1470 + }, + { + "epoch": 0.0471953825058197, + "grad_norm": 0.7992001175880432, + "learning_rate": 0.0005, + "loss": 1.9097, + "step": 1480 + }, + { + "epoch": 0.04751427022545362, + "grad_norm": 0.8140571713447571, + "learning_rate": 0.0005, + "loss": 1.9024, + "step": 1490 + }, + { + "epoch": 0.04783315794508754, + "grad_norm": 0.717022180557251, + "learning_rate": 0.0005, + "loss": 1.8898, + "step": 1500 + }, + { + "epoch": 0.04815204566472145, + "grad_norm": 0.8296144604682922, + "learning_rate": 0.0005, + "loss": 1.8656, + "step": 1510 + }, + { + "epoch": 0.04847093338435537, + "grad_norm": 0.759627640247345, + "learning_rate": 0.0005, + "loss": 1.8984, + "step": 1520 + }, + { + "epoch": 0.048789821103989287, + "grad_norm": 0.6937538385391235, + "learning_rate": 0.0005, + "loss": 1.9031, + "step": 1530 + }, + { + "epoch": 0.049108708823623205, + "grad_norm": 0.8209205269813538, + "learning_rate": 0.0005, + "loss": 1.8583, + "step": 1540 + }, + { + "epoch": 0.04942759654325712, + "grad_norm": 0.7268592715263367, + "learning_rate": 0.0005, + "loss": 1.8808, + "step": 1550 + }, + { + "epoch": 0.049746484262891036, + "grad_norm": 0.7960999608039856, + "learning_rate": 0.0005, + "loss": 1.8662, + "step": 1560 + }, + { + "epoch": 0.050065371982524955, + "grad_norm": 0.7445297241210938, + "learning_rate": 0.0005, + "loss": 1.865, + "step": 1570 + }, + { + "epoch": 0.050384259702158866, + "grad_norm": 0.7228101491928101, + "learning_rate": 0.0005, + "loss": 1.842, + "step": 1580 + }, + { + "epoch": 0.050703147421792785, + "grad_norm": 0.7101908326148987, + "learning_rate": 0.0005, + "loss": 1.8432, + "step": 1590 + }, + { + "epoch": 0.051022035141426704, + "grad_norm": 0.817542552947998, + "learning_rate": 0.0005, + "loss": 1.8422, + "step": 1600 + }, + { + "epoch": 0.05134092286106062, + "grad_norm": 0.7786903977394104, + "learning_rate": 0.0005, + "loss": 1.8447, + "step": 1610 + }, + { + "epoch": 0.051659810580694535, + "grad_norm": 0.8136606216430664, + "learning_rate": 0.0005, + "loss": 1.8723, + "step": 1620 + }, + { + "epoch": 0.05197869830032845, + "grad_norm": 0.74000483751297, + "learning_rate": 0.0005, + "loss": 1.8278, + "step": 1630 + }, + { + "epoch": 0.05229758601996237, + "grad_norm": 0.6954797506332397, + "learning_rate": 0.0005, + "loss": 1.8355, + "step": 1640 + }, + { + "epoch": 0.05261647373959629, + "grad_norm": 0.6831514835357666, + "learning_rate": 0.0005, + "loss": 1.8538, + "step": 1650 + }, + { + "epoch": 0.0529353614592302, + "grad_norm": 0.9402697086334229, + "learning_rate": 0.0005, + "loss": 1.8373, + "step": 1660 + }, + { + "epoch": 0.05325424917886412, + "grad_norm": 0.6758236289024353, + "learning_rate": 0.0005, + "loss": 1.8319, + "step": 1670 + }, + { + "epoch": 0.05357313689849804, + "grad_norm": 0.763216495513916, + "learning_rate": 0.0005, + "loss": 1.7892, + "step": 1680 + }, + { + "epoch": 0.05389202461813196, + "grad_norm": 0.7123900055885315, + "learning_rate": 0.0005, + "loss": 1.8057, + "step": 1690 + }, + { + "epoch": 0.05421091233776587, + "grad_norm": 0.7843760848045349, + "learning_rate": 0.0005, + "loss": 1.8182, + "step": 1700 + }, + { + "epoch": 0.05452980005739979, + "grad_norm": 0.7198774218559265, + "learning_rate": 0.0005, + "loss": 1.8059, + "step": 1710 + }, + { + "epoch": 0.05484868777703371, + "grad_norm": 0.6590190529823303, + "learning_rate": 0.0005, + "loss": 1.8036, + "step": 1720 + }, + { + "epoch": 0.05516757549666762, + "grad_norm": 0.7103943228721619, + "learning_rate": 0.0005, + "loss": 1.8009, + "step": 1730 + }, + { + "epoch": 0.05548646321630154, + "grad_norm": 0.973406970500946, + "learning_rate": 0.0005, + "loss": 1.8117, + "step": 1740 + }, + { + "epoch": 0.05580535093593546, + "grad_norm": 0.7230905294418335, + "learning_rate": 0.0005, + "loss": 1.795, + "step": 1750 + }, + { + "epoch": 0.056124238655569376, + "grad_norm": 0.6838631629943848, + "learning_rate": 0.0005, + "loss": 1.7943, + "step": 1760 + }, + { + "epoch": 0.05644312637520329, + "grad_norm": 0.69809889793396, + "learning_rate": 0.0005, + "loss": 1.7839, + "step": 1770 + }, + { + "epoch": 0.05676201409483721, + "grad_norm": 0.6809301376342773, + "learning_rate": 0.0005, + "loss": 1.8128, + "step": 1780 + }, + { + "epoch": 0.057080901814471126, + "grad_norm": 0.7151408791542053, + "learning_rate": 0.0005, + "loss": 1.776, + "step": 1790 + }, + { + "epoch": 0.057399789534105045, + "grad_norm": 0.7247176766395569, + "learning_rate": 0.0005, + "loss": 1.776, + "step": 1800 + }, + { + "epoch": 0.057718677253738956, + "grad_norm": 0.7167004346847534, + "learning_rate": 0.0005, + "loss": 1.8057, + "step": 1810 + }, + { + "epoch": 0.058037564973372875, + "grad_norm": 0.6864603757858276, + "learning_rate": 0.0005, + "loss": 1.7778, + "step": 1820 + }, + { + "epoch": 0.058356452693006794, + "grad_norm": 0.7611004710197449, + "learning_rate": 0.0005, + "loss": 1.7702, + "step": 1830 + }, + { + "epoch": 0.058675340412640706, + "grad_norm": 0.7142066359519958, + "learning_rate": 0.0005, + "loss": 1.7642, + "step": 1840 + }, + { + "epoch": 0.058994228132274625, + "grad_norm": 0.6733101010322571, + "learning_rate": 0.0005, + "loss": 1.7448, + "step": 1850 + }, + { + "epoch": 0.05931311585190854, + "grad_norm": 0.6972571611404419, + "learning_rate": 0.0005, + "loss": 1.7637, + "step": 1860 + }, + { + "epoch": 0.05963200357154246, + "grad_norm": 0.6883410811424255, + "learning_rate": 0.0005, + "loss": 1.7597, + "step": 1870 + }, + { + "epoch": 0.059950891291176374, + "grad_norm": 0.6904264092445374, + "learning_rate": 0.0005, + "loss": 1.7673, + "step": 1880 + }, + { + "epoch": 0.06026977901081029, + "grad_norm": 0.7625157237052917, + "learning_rate": 0.0005, + "loss": 1.7568, + "step": 1890 + }, + { + "epoch": 0.06058866673044421, + "grad_norm": 0.7773521542549133, + "learning_rate": 0.0005, + "loss": 1.7628, + "step": 1900 + }, + { + "epoch": 0.06090755445007813, + "grad_norm": 0.6850663423538208, + "learning_rate": 0.0005, + "loss": 1.7486, + "step": 1910 + }, + { + "epoch": 0.06122644216971204, + "grad_norm": 0.829558253288269, + "learning_rate": 0.0005, + "loss": 1.7295, + "step": 1920 + }, + { + "epoch": 0.06154532988934596, + "grad_norm": 0.6841607689857483, + "learning_rate": 0.0005, + "loss": 1.7603, + "step": 1930 + }, + { + "epoch": 0.06186421760897988, + "grad_norm": 0.8563835620880127, + "learning_rate": 0.0005, + "loss": 1.7298, + "step": 1940 + }, + { + "epoch": 0.0621831053286138, + "grad_norm": 0.6699181795120239, + "learning_rate": 0.0005, + "loss": 1.7362, + "step": 1950 + }, + { + "epoch": 0.06250199304824772, + "grad_norm": 0.6846010088920593, + "learning_rate": 0.0005, + "loss": 1.7306, + "step": 1960 + }, + { + "epoch": 0.06282088076788163, + "grad_norm": 0.7223895788192749, + "learning_rate": 0.0005, + "loss": 1.7413, + "step": 1970 + }, + { + "epoch": 0.06313976848751554, + "grad_norm": 0.6938334107398987, + "learning_rate": 0.0005, + "loss": 1.7191, + "step": 1980 + }, + { + "epoch": 0.06345865620714947, + "grad_norm": 0.677183210849762, + "learning_rate": 0.0005, + "loss": 1.7135, + "step": 1990 + }, + { + "epoch": 0.06377754392678338, + "grad_norm": 0.6501774787902832, + "learning_rate": 0.0005, + "loss": 1.7226, + "step": 2000 + }, + { + "epoch": 0.06409643164641729, + "grad_norm": 0.68111252784729, + "learning_rate": 0.0005, + "loss": 1.7158, + "step": 2010 + }, + { + "epoch": 0.06441531936605122, + "grad_norm": 0.6771100163459778, + "learning_rate": 0.0005, + "loss": 1.7362, + "step": 2020 + }, + { + "epoch": 0.06473420708568513, + "grad_norm": 0.9300406575202942, + "learning_rate": 0.0005, + "loss": 1.7184, + "step": 2030 + }, + { + "epoch": 0.06505309480531905, + "grad_norm": 0.6919383406639099, + "learning_rate": 0.0005, + "loss": 1.712, + "step": 2040 + }, + { + "epoch": 0.06537198252495297, + "grad_norm": 0.6832332611083984, + "learning_rate": 0.0005, + "loss": 1.6954, + "step": 2050 + }, + { + "epoch": 0.06569087024458688, + "grad_norm": 0.6679097414016724, + "learning_rate": 0.0005, + "loss": 1.6775, + "step": 2060 + }, + { + "epoch": 0.0660097579642208, + "grad_norm": 0.6957271695137024, + "learning_rate": 0.0005, + "loss": 1.7101, + "step": 2070 + }, + { + "epoch": 0.06632864568385471, + "grad_norm": 0.6392918825149536, + "learning_rate": 0.0005, + "loss": 1.6956, + "step": 2080 + }, + { + "epoch": 0.06664753340348863, + "grad_norm": 0.6429009437561035, + "learning_rate": 0.0005, + "loss": 1.712, + "step": 2090 + }, + { + "epoch": 0.06696642112312255, + "grad_norm": 0.696384847164154, + "learning_rate": 0.0005, + "loss": 1.7035, + "step": 2100 + }, + { + "epoch": 0.06728530884275646, + "grad_norm": 0.6736230254173279, + "learning_rate": 0.0005, + "loss": 1.6815, + "step": 2110 + }, + { + "epoch": 0.06760419656239038, + "grad_norm": 0.6436002254486084, + "learning_rate": 0.0005, + "loss": 1.683, + "step": 2120 + }, + { + "epoch": 0.0679230842820243, + "grad_norm": 0.6723842620849609, + "learning_rate": 0.0005, + "loss": 1.6912, + "step": 2130 + }, + { + "epoch": 0.06824197200165821, + "grad_norm": 0.6679061055183411, + "learning_rate": 0.0005, + "loss": 1.6814, + "step": 2140 + }, + { + "epoch": 0.06856085972129214, + "grad_norm": 0.6640651822090149, + "learning_rate": 0.0005, + "loss": 1.6863, + "step": 2150 + }, + { + "epoch": 0.06887974744092605, + "grad_norm": 0.6838980317115784, + "learning_rate": 0.0005, + "loss": 1.6938, + "step": 2160 + }, + { + "epoch": 0.06919863516055996, + "grad_norm": 0.6549293994903564, + "learning_rate": 0.0005, + "loss": 1.6792, + "step": 2170 + }, + { + "epoch": 0.06951752288019389, + "grad_norm": 0.6822001338005066, + "learning_rate": 0.0005, + "loss": 1.6701, + "step": 2180 + }, + { + "epoch": 0.0698364105998278, + "grad_norm": 0.7280387282371521, + "learning_rate": 0.0005, + "loss": 1.6589, + "step": 2190 + }, + { + "epoch": 0.07015529831946171, + "grad_norm": 0.6739123463630676, + "learning_rate": 0.0005, + "loss": 1.6739, + "step": 2200 + }, + { + "epoch": 0.07047418603909564, + "grad_norm": 0.6753026843070984, + "learning_rate": 0.0005, + "loss": 1.6546, + "step": 2210 + }, + { + "epoch": 0.07079307375872955, + "grad_norm": 0.7010917663574219, + "learning_rate": 0.0005, + "loss": 1.6658, + "step": 2220 + }, + { + "epoch": 0.07111196147836348, + "grad_norm": 0.6419041752815247, + "learning_rate": 0.0005, + "loss": 1.6547, + "step": 2230 + }, + { + "epoch": 0.07143084919799739, + "grad_norm": 0.7313352823257446, + "learning_rate": 0.0005, + "loss": 1.6504, + "step": 2240 + }, + { + "epoch": 0.0717497369176313, + "grad_norm": 0.6374030709266663, + "learning_rate": 0.0005, + "loss": 1.6372, + "step": 2250 + }, + { + "epoch": 0.07206862463726522, + "grad_norm": 0.6675490140914917, + "learning_rate": 0.0005, + "loss": 1.6238, + "step": 2260 + }, + { + "epoch": 0.07238751235689914, + "grad_norm": 0.7854442000389099, + "learning_rate": 0.0005, + "loss": 1.6574, + "step": 2270 + }, + { + "epoch": 0.07270640007653305, + "grad_norm": 0.6852051615715027, + "learning_rate": 0.0005, + "loss": 1.647, + "step": 2280 + }, + { + "epoch": 0.07302528779616697, + "grad_norm": 0.7064688205718994, + "learning_rate": 0.0005, + "loss": 1.6258, + "step": 2290 + }, + { + "epoch": 0.07334417551580089, + "grad_norm": 0.6328973770141602, + "learning_rate": 0.0005, + "loss": 1.6332, + "step": 2300 + }, + { + "epoch": 0.0736630632354348, + "grad_norm": 0.6078288555145264, + "learning_rate": 0.0005, + "loss": 1.6222, + "step": 2310 + }, + { + "epoch": 0.07398195095506872, + "grad_norm": 0.6709187626838684, + "learning_rate": 0.0005, + "loss": 1.6442, + "step": 2320 + }, + { + "epoch": 0.07430083867470264, + "grad_norm": 0.6568571925163269, + "learning_rate": 0.0005, + "loss": 1.6406, + "step": 2330 + }, + { + "epoch": 0.07461972639433656, + "grad_norm": 0.6580377817153931, + "learning_rate": 0.0005, + "loss": 1.6228, + "step": 2340 + }, + { + "epoch": 0.07493861411397047, + "grad_norm": 0.6596248745918274, + "learning_rate": 0.0005, + "loss": 1.6444, + "step": 2350 + }, + { + "epoch": 0.07525750183360438, + "grad_norm": 0.6527488231658936, + "learning_rate": 0.0005, + "loss": 1.6392, + "step": 2360 + }, + { + "epoch": 0.07557638955323831, + "grad_norm": 0.6278038024902344, + "learning_rate": 0.0005, + "loss": 1.5949, + "step": 2370 + }, + { + "epoch": 0.07589527727287222, + "grad_norm": 0.6202778816223145, + "learning_rate": 0.0005, + "loss": 1.603, + "step": 2380 + }, + { + "epoch": 0.07621416499250613, + "grad_norm": 0.739552915096283, + "learning_rate": 0.0005, + "loss": 1.6397, + "step": 2390 + }, + { + "epoch": 0.07653305271214006, + "grad_norm": 0.6613728404045105, + "learning_rate": 0.0005, + "loss": 1.6224, + "step": 2400 + }, + { + "epoch": 0.07685194043177397, + "grad_norm": 0.6121648550033569, + "learning_rate": 0.0005, + "loss": 1.6353, + "step": 2410 + }, + { + "epoch": 0.07717082815140788, + "grad_norm": 0.6781162619590759, + "learning_rate": 0.0005, + "loss": 1.6094, + "step": 2420 + }, + { + "epoch": 0.07748971587104181, + "grad_norm": 0.6234122514724731, + "learning_rate": 0.0005, + "loss": 1.6106, + "step": 2430 + }, + { + "epoch": 0.07780860359067572, + "grad_norm": 0.6485508680343628, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 2440 + }, + { + "epoch": 0.07812749131030965, + "grad_norm": 0.7261289954185486, + "learning_rate": 0.0005, + "loss": 1.5956, + "step": 2450 + }, + { + "epoch": 0.07844637902994356, + "grad_norm": 0.6383315920829773, + "learning_rate": 0.0005, + "loss": 1.6133, + "step": 2460 + }, + { + "epoch": 0.07876526674957747, + "grad_norm": 0.6283700466156006, + "learning_rate": 0.0005, + "loss": 1.6076, + "step": 2470 + }, + { + "epoch": 0.0790841544692114, + "grad_norm": 0.6034983396530151, + "learning_rate": 0.0005, + "loss": 1.5841, + "step": 2480 + }, + { + "epoch": 0.07940304218884531, + "grad_norm": 0.6377360820770264, + "learning_rate": 0.0005, + "loss": 1.6206, + "step": 2490 + }, + { + "epoch": 0.07972192990847922, + "grad_norm": 0.631747841835022, + "learning_rate": 0.0005, + "loss": 1.6042, + "step": 2500 + }, + { + "epoch": 0.08004081762811315, + "grad_norm": 0.6026962995529175, + "learning_rate": 0.0005, + "loss": 1.5796, + "step": 2510 + }, + { + "epoch": 0.08035970534774706, + "grad_norm": 0.6390537619590759, + "learning_rate": 0.0005, + "loss": 1.6024, + "step": 2520 + }, + { + "epoch": 0.08067859306738097, + "grad_norm": 0.605514407157898, + "learning_rate": 0.0005, + "loss": 1.6008, + "step": 2530 + }, + { + "epoch": 0.0809974807870149, + "grad_norm": 0.5983066558837891, + "learning_rate": 0.0005, + "loss": 1.5949, + "step": 2540 + }, + { + "epoch": 0.0813163685066488, + "grad_norm": 0.6076896786689758, + "learning_rate": 0.0005, + "loss": 1.5942, + "step": 2550 + }, + { + "epoch": 0.08163525622628273, + "grad_norm": 0.6317102313041687, + "learning_rate": 0.0005, + "loss": 1.5887, + "step": 2560 + }, + { + "epoch": 0.08195414394591664, + "grad_norm": 0.6394833922386169, + "learning_rate": 0.0005, + "loss": 1.5628, + "step": 2570 + }, + { + "epoch": 0.08227303166555056, + "grad_norm": 0.6671807169914246, + "learning_rate": 0.0005, + "loss": 1.5965, + "step": 2580 + }, + { + "epoch": 0.08259191938518448, + "grad_norm": 0.6070294380187988, + "learning_rate": 0.0005, + "loss": 1.5699, + "step": 2590 + }, + { + "epoch": 0.0829108071048184, + "grad_norm": 0.6343675255775452, + "learning_rate": 0.0005, + "loss": 1.5775, + "step": 2600 + }, + { + "epoch": 0.0832296948244523, + "grad_norm": 0.6360995769500732, + "learning_rate": 0.0005, + "loss": 1.5782, + "step": 2610 + }, + { + "epoch": 0.08354858254408623, + "grad_norm": 0.6294621229171753, + "learning_rate": 0.0005, + "loss": 1.5628, + "step": 2620 + }, + { + "epoch": 0.08386747026372014, + "grad_norm": 0.6626117825508118, + "learning_rate": 0.0005, + "loss": 1.5692, + "step": 2630 + }, + { + "epoch": 0.08418635798335405, + "grad_norm": 0.5887053608894348, + "learning_rate": 0.0005, + "loss": 1.5651, + "step": 2640 + }, + { + "epoch": 0.08450524570298798, + "grad_norm": 0.5864725708961487, + "learning_rate": 0.0005, + "loss": 1.5682, + "step": 2650 + }, + { + "epoch": 0.08482413342262189, + "grad_norm": 0.6141831278800964, + "learning_rate": 0.0005, + "loss": 1.581, + "step": 2660 + }, + { + "epoch": 0.08514302114225582, + "grad_norm": 0.5977876782417297, + "learning_rate": 0.0005, + "loss": 1.5545, + "step": 2670 + }, + { + "epoch": 0.08546190886188973, + "grad_norm": 0.6059004068374634, + "learning_rate": 0.0005, + "loss": 1.5912, + "step": 2680 + }, + { + "epoch": 0.08578079658152364, + "grad_norm": 0.5856339931488037, + "learning_rate": 0.0005, + "loss": 1.5646, + "step": 2690 + }, + { + "epoch": 0.08609968430115757, + "grad_norm": 0.5980954766273499, + "learning_rate": 0.0005, + "loss": 1.5682, + "step": 2700 + }, + { + "epoch": 0.08641857202079148, + "grad_norm": 0.6472553014755249, + "learning_rate": 0.0005, + "loss": 1.5604, + "step": 2710 + }, + { + "epoch": 0.08673745974042539, + "grad_norm": 0.6788700222969055, + "learning_rate": 0.0005, + "loss": 1.5745, + "step": 2720 + }, + { + "epoch": 0.08705634746005932, + "grad_norm": 0.5948100686073303, + "learning_rate": 0.0005, + "loss": 1.5789, + "step": 2730 + }, + { + "epoch": 0.08737523517969323, + "grad_norm": 0.6130886673927307, + "learning_rate": 0.0005, + "loss": 1.5899, + "step": 2740 + }, + { + "epoch": 0.08769412289932715, + "grad_norm": 0.6039873361587524, + "learning_rate": 0.0005, + "loss": 1.553, + "step": 2750 + }, + { + "epoch": 0.08801301061896107, + "grad_norm": 0.6165336966514587, + "learning_rate": 0.0005, + "loss": 1.5493, + "step": 2760 + }, + { + "epoch": 0.08833189833859498, + "grad_norm": 0.6436738967895508, + "learning_rate": 0.0005, + "loss": 1.5479, + "step": 2770 + }, + { + "epoch": 0.0886507860582289, + "grad_norm": 0.6035174131393433, + "learning_rate": 0.0005, + "loss": 1.5779, + "step": 2780 + }, + { + "epoch": 0.08896967377786281, + "grad_norm": 0.5844959616661072, + "learning_rate": 0.0005, + "loss": 1.5297, + "step": 2790 + }, + { + "epoch": 0.08928856149749673, + "grad_norm": 0.5872601866722107, + "learning_rate": 0.0005, + "loss": 1.5617, + "step": 2800 + }, + { + "epoch": 0.08960744921713065, + "grad_norm": 0.6345117688179016, + "learning_rate": 0.0005, + "loss": 1.5247, + "step": 2810 + }, + { + "epoch": 0.08992633693676456, + "grad_norm": 0.6308701634407043, + "learning_rate": 0.0005, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 0.09024522465639848, + "grad_norm": 0.6402453184127808, + "learning_rate": 0.0005, + "loss": 1.5494, + "step": 2830 + }, + { + "epoch": 0.0905641123760324, + "grad_norm": 0.5856583714485168, + "learning_rate": 0.0005, + "loss": 1.5478, + "step": 2840 + }, + { + "epoch": 0.09088300009566631, + "grad_norm": 0.5717542767524719, + "learning_rate": 0.0005, + "loss": 1.5255, + "step": 2850 + }, + { + "epoch": 0.09120188781530024, + "grad_norm": 0.5698398947715759, + "learning_rate": 0.0005, + "loss": 1.5312, + "step": 2860 + }, + { + "epoch": 0.09152077553493415, + "grad_norm": 0.6083899140357971, + "learning_rate": 0.0005, + "loss": 1.5315, + "step": 2870 + }, + { + "epoch": 0.09183966325456806, + "grad_norm": 0.6005008220672607, + "learning_rate": 0.0005, + "loss": 1.5453, + "step": 2880 + }, + { + "epoch": 0.09215855097420199, + "grad_norm": 0.5992957353591919, + "learning_rate": 0.0005, + "loss": 1.5357, + "step": 2890 + }, + { + "epoch": 0.0924774386938359, + "grad_norm": 0.5881666541099548, + "learning_rate": 0.0005, + "loss": 1.5308, + "step": 2900 + }, + { + "epoch": 0.09279632641346981, + "grad_norm": 0.6062901616096497, + "learning_rate": 0.0005, + "loss": 1.5388, + "step": 2910 + }, + { + "epoch": 0.09311521413310374, + "grad_norm": 0.5917524695396423, + "learning_rate": 0.0005, + "loss": 1.5409, + "step": 2920 + }, + { + "epoch": 0.09343410185273765, + "grad_norm": 0.6036853790283203, + "learning_rate": 0.0005, + "loss": 1.5436, + "step": 2930 + }, + { + "epoch": 0.09375298957237156, + "grad_norm": 0.5984651446342468, + "learning_rate": 0.0005, + "loss": 1.5526, + "step": 2940 + }, + { + "epoch": 0.09407187729200549, + "grad_norm": 0.6358986496925354, + "learning_rate": 0.0005, + "loss": 1.5239, + "step": 2950 + }, + { + "epoch": 0.0943907650116394, + "grad_norm": 0.6184073686599731, + "learning_rate": 0.0005, + "loss": 1.5247, + "step": 2960 + }, + { + "epoch": 0.09470965273127332, + "grad_norm": 0.6121427416801453, + "learning_rate": 0.0005, + "loss": 1.5344, + "step": 2970 + }, + { + "epoch": 0.09502854045090724, + "grad_norm": 0.5836875438690186, + "learning_rate": 0.0005, + "loss": 1.5249, + "step": 2980 + }, + { + "epoch": 0.09534742817054115, + "grad_norm": 0.5729764103889465, + "learning_rate": 0.0005, + "loss": 1.5302, + "step": 2990 + }, + { + "epoch": 0.09566631589017507, + "grad_norm": 0.5892743468284607, + "learning_rate": 0.0005, + "loss": 1.5433, + "step": 3000 + }, + { + "epoch": 0.09598520360980899, + "grad_norm": 0.593152642250061, + "learning_rate": 0.0005, + "loss": 1.5232, + "step": 3010 + }, + { + "epoch": 0.0963040913294429, + "grad_norm": 0.5753911733627319, + "learning_rate": 0.0005, + "loss": 1.5133, + "step": 3020 + }, + { + "epoch": 0.09662297904907682, + "grad_norm": 0.6225734353065491, + "learning_rate": 0.0005, + "loss": 1.5195, + "step": 3030 + }, + { + "epoch": 0.09694186676871074, + "grad_norm": 0.5819618105888367, + "learning_rate": 0.0005, + "loss": 1.5238, + "step": 3040 + }, + { + "epoch": 0.09726075448834465, + "grad_norm": 0.6067165732383728, + "learning_rate": 0.0005, + "loss": 1.5342, + "step": 3050 + }, + { + "epoch": 0.09757964220797857, + "grad_norm": 0.5918219089508057, + "learning_rate": 0.0005, + "loss": 1.531, + "step": 3060 + }, + { + "epoch": 0.09789852992761248, + "grad_norm": 0.5920441150665283, + "learning_rate": 0.0005, + "loss": 1.5125, + "step": 3070 + }, + { + "epoch": 0.09821741764724641, + "grad_norm": 0.5718549489974976, + "learning_rate": 0.0005, + "loss": 1.5217, + "step": 3080 + }, + { + "epoch": 0.09853630536688032, + "grad_norm": 0.5798651576042175, + "learning_rate": 0.0005, + "loss": 1.5092, + "step": 3090 + }, + { + "epoch": 0.09885519308651423, + "grad_norm": 0.5711941123008728, + "learning_rate": 0.0005, + "loss": 1.4999, + "step": 3100 + }, + { + "epoch": 0.09917408080614816, + "grad_norm": 0.6470732688903809, + "learning_rate": 0.0005, + "loss": 1.4928, + "step": 3110 + }, + { + "epoch": 0.09949296852578207, + "grad_norm": 0.5650610327720642, + "learning_rate": 0.0005, + "loss": 1.5182, + "step": 3120 + }, + { + "epoch": 0.09981185624541598, + "grad_norm": 0.6110008358955383, + "learning_rate": 0.0005, + "loss": 1.5261, + "step": 3130 + }, + { + "epoch": 0.10013074396504991, + "grad_norm": 0.6006331443786621, + "learning_rate": 0.0005, + "loss": 1.5124, + "step": 3140 + }, + { + "epoch": 0.10044963168468382, + "grad_norm": 0.5928711295127869, + "learning_rate": 0.0005, + "loss": 1.5159, + "step": 3150 + }, + { + "epoch": 0.10076851940431773, + "grad_norm": 0.5740655064582825, + "learning_rate": 0.0005, + "loss": 1.4932, + "step": 3160 + }, + { + "epoch": 0.10108740712395166, + "grad_norm": 0.5549852252006531, + "learning_rate": 0.0005, + "loss": 1.5022, + "step": 3170 + }, + { + "epoch": 0.10140629484358557, + "grad_norm": 0.5529758930206299, + "learning_rate": 0.0005, + "loss": 1.5011, + "step": 3180 + }, + { + "epoch": 0.1017251825632195, + "grad_norm": 0.5959564447402954, + "learning_rate": 0.0005, + "loss": 1.4831, + "step": 3190 + }, + { + "epoch": 0.10204407028285341, + "grad_norm": 0.5632524490356445, + "learning_rate": 0.0005, + "loss": 1.4886, + "step": 3200 + }, + { + "epoch": 0.10236295800248732, + "grad_norm": 0.5667331218719482, + "learning_rate": 0.0005, + "loss": 1.5205, + "step": 3210 + }, + { + "epoch": 0.10268184572212125, + "grad_norm": 0.561392605304718, + "learning_rate": 0.0005, + "loss": 1.508, + "step": 3220 + }, + { + "epoch": 0.10300073344175516, + "grad_norm": 0.5860125422477722, + "learning_rate": 0.0005, + "loss": 1.4788, + "step": 3230 + }, + { + "epoch": 0.10331962116138907, + "grad_norm": 0.5863109230995178, + "learning_rate": 0.0005, + "loss": 1.461, + "step": 3240 + }, + { + "epoch": 0.103638508881023, + "grad_norm": 0.5454897880554199, + "learning_rate": 0.0005, + "loss": 1.491, + "step": 3250 + }, + { + "epoch": 0.1039573966006569, + "grad_norm": 0.5635510087013245, + "learning_rate": 0.0005, + "loss": 1.5087, + "step": 3260 + }, + { + "epoch": 0.10427628432029083, + "grad_norm": 0.5486317276954651, + "learning_rate": 0.0005, + "loss": 1.4587, + "step": 3270 + }, + { + "epoch": 0.10459517203992474, + "grad_norm": 0.620488166809082, + "learning_rate": 0.0005, + "loss": 1.5002, + "step": 3280 + }, + { + "epoch": 0.10491405975955866, + "grad_norm": 0.5548288226127625, + "learning_rate": 0.0005, + "loss": 1.464, + "step": 3290 + }, + { + "epoch": 0.10523294747919258, + "grad_norm": 0.5835471749305725, + "learning_rate": 0.0005, + "loss": 1.492, + "step": 3300 + }, + { + "epoch": 0.1055518351988265, + "grad_norm": 0.585202693939209, + "learning_rate": 0.0005, + "loss": 1.48, + "step": 3310 + }, + { + "epoch": 0.1058707229184604, + "grad_norm": 0.5698010325431824, + "learning_rate": 0.0005, + "loss": 1.4633, + "step": 3320 + }, + { + "epoch": 0.10618961063809433, + "grad_norm": 0.5747951865196228, + "learning_rate": 0.0005, + "loss": 1.531, + "step": 3330 + }, + { + "epoch": 0.10650849835772824, + "grad_norm": 0.5524262189865112, + "learning_rate": 0.0005, + "loss": 1.4961, + "step": 3340 + }, + { + "epoch": 0.10682738607736215, + "grad_norm": 0.5779598355293274, + "learning_rate": 0.0005, + "loss": 1.5045, + "step": 3350 + }, + { + "epoch": 0.10714627379699608, + "grad_norm": 0.6228440999984741, + "learning_rate": 0.0005, + "loss": 1.4862, + "step": 3360 + }, + { + "epoch": 0.10746516151662999, + "grad_norm": 0.5359426140785217, + "learning_rate": 0.0005, + "loss": 1.4967, + "step": 3370 + }, + { + "epoch": 0.10778404923626392, + "grad_norm": 0.6059668660163879, + "learning_rate": 0.0005, + "loss": 1.4805, + "step": 3380 + }, + { + "epoch": 0.10810293695589783, + "grad_norm": 0.5769729018211365, + "learning_rate": 0.0005, + "loss": 1.4502, + "step": 3390 + }, + { + "epoch": 0.10842182467553174, + "grad_norm": 0.5658615231513977, + "learning_rate": 0.0005, + "loss": 1.4685, + "step": 3400 + }, + { + "epoch": 0.10874071239516567, + "grad_norm": 0.5631580352783203, + "learning_rate": 0.0005, + "loss": 1.4805, + "step": 3410 + }, + { + "epoch": 0.10905960011479958, + "grad_norm": 0.554395854473114, + "learning_rate": 0.0005, + "loss": 1.487, + "step": 3420 + }, + { + "epoch": 0.10937848783443349, + "grad_norm": 0.5861183404922485, + "learning_rate": 0.0005, + "loss": 1.4826, + "step": 3430 + }, + { + "epoch": 0.10969737555406742, + "grad_norm": 0.5401641726493835, + "learning_rate": 0.0005, + "loss": 1.4861, + "step": 3440 + }, + { + "epoch": 0.11001626327370133, + "grad_norm": 0.5490972399711609, + "learning_rate": 0.0005, + "loss": 1.4907, + "step": 3450 + }, + { + "epoch": 0.11033515099333524, + "grad_norm": 0.5460464954376221, + "learning_rate": 0.0005, + "loss": 1.4537, + "step": 3460 + }, + { + "epoch": 0.11065403871296917, + "grad_norm": 0.5482505559921265, + "learning_rate": 0.0005, + "loss": 1.477, + "step": 3470 + }, + { + "epoch": 0.11097292643260308, + "grad_norm": 0.5561589598655701, + "learning_rate": 0.0005, + "loss": 1.4591, + "step": 3480 + }, + { + "epoch": 0.111291814152237, + "grad_norm": 0.5551013350486755, + "learning_rate": 0.0005, + "loss": 1.4544, + "step": 3490 + }, + { + "epoch": 0.11161070187187092, + "grad_norm": 0.5580571293830872, + "learning_rate": 0.0005, + "loss": 1.468, + "step": 3500 + }, + { + "epoch": 0.11192958959150483, + "grad_norm": 0.583152174949646, + "learning_rate": 0.0005, + "loss": 1.4704, + "step": 3510 + }, + { + "epoch": 0.11224847731113875, + "grad_norm": 0.5443835258483887, + "learning_rate": 0.0005, + "loss": 1.462, + "step": 3520 + }, + { + "epoch": 0.11256736503077266, + "grad_norm": 0.5560067296028137, + "learning_rate": 0.0005, + "loss": 1.4542, + "step": 3530 + }, + { + "epoch": 0.11288625275040658, + "grad_norm": 0.5389274954795837, + "learning_rate": 0.0005, + "loss": 1.4681, + "step": 3540 + }, + { + "epoch": 0.1132051404700405, + "grad_norm": 0.5621799826622009, + "learning_rate": 0.0005, + "loss": 1.4688, + "step": 3550 + }, + { + "epoch": 0.11352402818967441, + "grad_norm": 0.5864140391349792, + "learning_rate": 0.0005, + "loss": 1.4607, + "step": 3560 + }, + { + "epoch": 0.11384291590930833, + "grad_norm": 0.5543456077575684, + "learning_rate": 0.0005, + "loss": 1.4383, + "step": 3570 + }, + { + "epoch": 0.11416180362894225, + "grad_norm": 0.556922972202301, + "learning_rate": 0.0005, + "loss": 1.4362, + "step": 3580 + }, + { + "epoch": 0.11448069134857616, + "grad_norm": 0.5521618127822876, + "learning_rate": 0.0005, + "loss": 1.4752, + "step": 3590 + }, + { + "epoch": 0.11479957906821009, + "grad_norm": 0.5324291586875916, + "learning_rate": 0.0005, + "loss": 1.4639, + "step": 3600 + }, + { + "epoch": 0.115118466787844, + "grad_norm": 0.5443735122680664, + "learning_rate": 0.0005, + "loss": 1.4357, + "step": 3610 + }, + { + "epoch": 0.11543735450747791, + "grad_norm": 0.5763177275657654, + "learning_rate": 0.0005, + "loss": 1.4309, + "step": 3620 + }, + { + "epoch": 0.11575624222711184, + "grad_norm": 0.5425290465354919, + "learning_rate": 0.0005, + "loss": 1.4659, + "step": 3630 + }, + { + "epoch": 0.11607512994674575, + "grad_norm": 0.598491907119751, + "learning_rate": 0.0005, + "loss": 1.477, + "step": 3640 + }, + { + "epoch": 0.11639401766637966, + "grad_norm": 0.5747283101081848, + "learning_rate": 0.0005, + "loss": 1.4482, + "step": 3650 + }, + { + "epoch": 0.11671290538601359, + "grad_norm": 0.5188102722167969, + "learning_rate": 0.0005, + "loss": 1.4331, + "step": 3660 + }, + { + "epoch": 0.1170317931056475, + "grad_norm": 0.5427939295768738, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 3670 + }, + { + "epoch": 0.11735068082528141, + "grad_norm": 0.5608155131340027, + "learning_rate": 0.0005, + "loss": 1.4812, + "step": 3680 + }, + { + "epoch": 0.11766956854491534, + "grad_norm": 0.5341671705245972, + "learning_rate": 0.0005, + "loss": 1.4434, + "step": 3690 + }, + { + "epoch": 0.11798845626454925, + "grad_norm": 0.5669570565223694, + "learning_rate": 0.0005, + "loss": 1.4547, + "step": 3700 + }, + { + "epoch": 0.11830734398418317, + "grad_norm": 0.5232808589935303, + "learning_rate": 0.0005, + "loss": 1.4501, + "step": 3710 + }, + { + "epoch": 0.11862623170381709, + "grad_norm": 0.5229708552360535, + "learning_rate": 0.0005, + "loss": 1.4325, + "step": 3720 + }, + { + "epoch": 0.118945119423451, + "grad_norm": 0.5569901466369629, + "learning_rate": 0.0005, + "loss": 1.4504, + "step": 3730 + }, + { + "epoch": 0.11926400714308492, + "grad_norm": 0.5216270685195923, + "learning_rate": 0.0005, + "loss": 1.4587, + "step": 3740 + }, + { + "epoch": 0.11958289486271884, + "grad_norm": 0.5247334837913513, + "learning_rate": 0.0005, + "loss": 1.4313, + "step": 3750 + }, + { + "epoch": 0.11990178258235275, + "grad_norm": 0.5420272946357727, + "learning_rate": 0.0005, + "loss": 1.439, + "step": 3760 + }, + { + "epoch": 0.12022067030198667, + "grad_norm": 0.5527462363243103, + "learning_rate": 0.0005, + "loss": 1.4538, + "step": 3770 + }, + { + "epoch": 0.12053955802162059, + "grad_norm": 0.5519984364509583, + "learning_rate": 0.0005, + "loss": 1.4697, + "step": 3780 + }, + { + "epoch": 0.12085844574125451, + "grad_norm": 0.5728175044059753, + "learning_rate": 0.0005, + "loss": 1.4228, + "step": 3790 + }, + { + "epoch": 0.12117733346088842, + "grad_norm": 0.5295974016189575, + "learning_rate": 0.0005, + "loss": 1.4217, + "step": 3800 + }, + { + "epoch": 0.12149622118052233, + "grad_norm": 0.5916241407394409, + "learning_rate": 0.0005, + "loss": 1.426, + "step": 3810 + }, + { + "epoch": 0.12181510890015626, + "grad_norm": 0.5520297288894653, + "learning_rate": 0.0005, + "loss": 1.4343, + "step": 3820 + }, + { + "epoch": 0.12213399661979017, + "grad_norm": 0.547126054763794, + "learning_rate": 0.0005, + "loss": 1.4223, + "step": 3830 + }, + { + "epoch": 0.12245288433942408, + "grad_norm": 0.5996837019920349, + "learning_rate": 0.0005, + "loss": 1.4485, + "step": 3840 + }, + { + "epoch": 0.12277177205905801, + "grad_norm": 0.5237704515457153, + "learning_rate": 0.0005, + "loss": 1.4456, + "step": 3850 + }, + { + "epoch": 0.12309065977869192, + "grad_norm": 0.5590718984603882, + "learning_rate": 0.0005, + "loss": 1.4472, + "step": 3860 + }, + { + "epoch": 0.12340954749832583, + "grad_norm": 0.5487239360809326, + "learning_rate": 0.0005, + "loss": 1.457, + "step": 3870 + }, + { + "epoch": 0.12372843521795976, + "grad_norm": 0.5274037718772888, + "learning_rate": 0.0005, + "loss": 1.4231, + "step": 3880 + }, + { + "epoch": 0.12404732293759367, + "grad_norm": 0.5649983882904053, + "learning_rate": 0.0005, + "loss": 1.4431, + "step": 3890 + }, + { + "epoch": 0.1243662106572276, + "grad_norm": 0.5578463077545166, + "learning_rate": 0.0005, + "loss": 1.4381, + "step": 3900 + }, + { + "epoch": 0.12468509837686151, + "grad_norm": 0.531040608882904, + "learning_rate": 0.0005, + "loss": 1.4313, + "step": 3910 + }, + { + "epoch": 0.12500398609649543, + "grad_norm": 0.5263800024986267, + "learning_rate": 0.0005, + "loss": 1.4373, + "step": 3920 + }, + { + "epoch": 0.12532287381612933, + "grad_norm": 0.5514016151428223, + "learning_rate": 0.0005, + "loss": 1.4105, + "step": 3930 + }, + { + "epoch": 0.12564176153576326, + "grad_norm": 0.5430245399475098, + "learning_rate": 0.0005, + "loss": 1.4072, + "step": 3940 + }, + { + "epoch": 0.12596064925539718, + "grad_norm": 0.49897560477256775, + "learning_rate": 0.0005, + "loss": 1.4286, + "step": 3950 + }, + { + "epoch": 0.12627953697503108, + "grad_norm": 0.5384120941162109, + "learning_rate": 0.0005, + "loss": 1.411, + "step": 3960 + }, + { + "epoch": 0.126598424694665, + "grad_norm": 0.511913537979126, + "learning_rate": 0.0005, + "loss": 1.4332, + "step": 3970 + }, + { + "epoch": 0.12691731241429893, + "grad_norm": 0.5535169839859009, + "learning_rate": 0.0005, + "loss": 1.4108, + "step": 3980 + }, + { + "epoch": 0.12723620013393283, + "grad_norm": 0.5501640439033508, + "learning_rate": 0.0005, + "loss": 1.4184, + "step": 3990 + }, + { + "epoch": 0.12755508785356676, + "grad_norm": 0.525273859500885, + "learning_rate": 0.0005, + "loss": 1.4155, + "step": 4000 + }, + { + "epoch": 0.12787397557320068, + "grad_norm": 0.5309306383132935, + "learning_rate": 0.0005, + "loss": 1.43, + "step": 4010 + }, + { + "epoch": 0.12819286329283458, + "grad_norm": 0.590304970741272, + "learning_rate": 0.0005, + "loss": 1.421, + "step": 4020 + }, + { + "epoch": 0.1285117510124685, + "grad_norm": 0.5559628009796143, + "learning_rate": 0.0005, + "loss": 1.428, + "step": 4030 + }, + { + "epoch": 0.12883063873210243, + "grad_norm": 0.523764967918396, + "learning_rate": 0.0005, + "loss": 1.4189, + "step": 4040 + }, + { + "epoch": 0.12914952645173636, + "grad_norm": 0.5560243129730225, + "learning_rate": 0.0005, + "loss": 1.4279, + "step": 4050 + }, + { + "epoch": 0.12946841417137026, + "grad_norm": 0.5219590663909912, + "learning_rate": 0.0005, + "loss": 1.4343, + "step": 4060 + }, + { + "epoch": 0.12978730189100418, + "grad_norm": 0.5206665396690369, + "learning_rate": 0.0005, + "loss": 1.4299, + "step": 4070 + }, + { + "epoch": 0.1301061896106381, + "grad_norm": 0.5236951112747192, + "learning_rate": 0.0005, + "loss": 1.3991, + "step": 4080 + }, + { + "epoch": 0.130425077330272, + "grad_norm": 0.5283119678497314, + "learning_rate": 0.0005, + "loss": 1.4192, + "step": 4090 + }, + { + "epoch": 0.13074396504990593, + "grad_norm": 0.5292713642120361, + "learning_rate": 0.0005, + "loss": 1.4188, + "step": 4100 + }, + { + "epoch": 0.13106285276953986, + "grad_norm": 0.5801336169242859, + "learning_rate": 0.0005, + "loss": 1.4115, + "step": 4110 + }, + { + "epoch": 0.13138174048917375, + "grad_norm": 0.5187575817108154, + "learning_rate": 0.0005, + "loss": 1.3969, + "step": 4120 + }, + { + "epoch": 0.13170062820880768, + "grad_norm": 0.541106641292572, + "learning_rate": 0.0005, + "loss": 1.4133, + "step": 4130 + }, + { + "epoch": 0.1320195159284416, + "grad_norm": 0.557482898235321, + "learning_rate": 0.0005, + "loss": 1.4106, + "step": 4140 + }, + { + "epoch": 0.1323384036480755, + "grad_norm": 0.5167677402496338, + "learning_rate": 0.0005, + "loss": 1.3988, + "step": 4150 + }, + { + "epoch": 0.13265729136770943, + "grad_norm": 0.5109487175941467, + "learning_rate": 0.0005, + "loss": 1.4044, + "step": 4160 + }, + { + "epoch": 0.13297617908734335, + "grad_norm": 0.5093071460723877, + "learning_rate": 0.0005, + "loss": 1.4067, + "step": 4170 + }, + { + "epoch": 0.13329506680697725, + "grad_norm": 0.5161386728286743, + "learning_rate": 0.0005, + "loss": 1.3862, + "step": 4180 + }, + { + "epoch": 0.13361395452661118, + "grad_norm": 0.5066111087799072, + "learning_rate": 0.0005, + "loss": 1.3957, + "step": 4190 + }, + { + "epoch": 0.1339328422462451, + "grad_norm": 0.5192795991897583, + "learning_rate": 0.0005, + "loss": 1.4062, + "step": 4200 + }, + { + "epoch": 0.134251729965879, + "grad_norm": 0.504766047000885, + "learning_rate": 0.0005, + "loss": 1.4001, + "step": 4210 + }, + { + "epoch": 0.13457061768551293, + "grad_norm": 0.5010153651237488, + "learning_rate": 0.0005, + "loss": 1.4002, + "step": 4220 + }, + { + "epoch": 0.13488950540514685, + "grad_norm": 0.5043392777442932, + "learning_rate": 0.0005, + "loss": 1.3885, + "step": 4230 + }, + { + "epoch": 0.13520839312478075, + "grad_norm": 0.5130932927131653, + "learning_rate": 0.0005, + "loss": 1.4271, + "step": 4240 + }, + { + "epoch": 0.13552728084441468, + "grad_norm": 0.4850212633609772, + "learning_rate": 0.0005, + "loss": 1.4028, + "step": 4250 + }, + { + "epoch": 0.1358461685640486, + "grad_norm": 0.5103309750556946, + "learning_rate": 0.0005, + "loss": 1.4137, + "step": 4260 + }, + { + "epoch": 0.13616505628368253, + "grad_norm": 0.5363282561302185, + "learning_rate": 0.0005, + "loss": 1.3568, + "step": 4270 + }, + { + "epoch": 0.13648394400331643, + "grad_norm": 0.5205022692680359, + "learning_rate": 0.0005, + "loss": 1.3999, + "step": 4280 + }, + { + "epoch": 0.13680283172295035, + "grad_norm": 0.5299255847930908, + "learning_rate": 0.0005, + "loss": 1.3793, + "step": 4290 + }, + { + "epoch": 0.13712171944258428, + "grad_norm": 0.5065796971321106, + "learning_rate": 0.0005, + "loss": 1.3932, + "step": 4300 + }, + { + "epoch": 0.13744060716221818, + "grad_norm": 0.5404181480407715, + "learning_rate": 0.0005, + "loss": 1.4033, + "step": 4310 + }, + { + "epoch": 0.1377594948818521, + "grad_norm": 0.495960533618927, + "learning_rate": 0.0005, + "loss": 1.3815, + "step": 4320 + }, + { + "epoch": 0.13807838260148603, + "grad_norm": 0.4996244013309479, + "learning_rate": 0.0005, + "loss": 1.3929, + "step": 4330 + }, + { + "epoch": 0.13839727032111993, + "grad_norm": 0.496735543012619, + "learning_rate": 0.0005, + "loss": 1.3939, + "step": 4340 + }, + { + "epoch": 0.13871615804075385, + "grad_norm": 0.5102179646492004, + "learning_rate": 0.0005, + "loss": 1.3932, + "step": 4350 + }, + { + "epoch": 0.13903504576038778, + "grad_norm": 0.515885055065155, + "learning_rate": 0.0005, + "loss": 1.3931, + "step": 4360 + }, + { + "epoch": 0.13935393348002167, + "grad_norm": 0.5257781744003296, + "learning_rate": 0.0005, + "loss": 1.4075, + "step": 4370 + }, + { + "epoch": 0.1396728211996556, + "grad_norm": 0.5537557601928711, + "learning_rate": 0.0005, + "loss": 1.3857, + "step": 4380 + }, + { + "epoch": 0.13999170891928953, + "grad_norm": 0.5099586844444275, + "learning_rate": 0.0005, + "loss": 1.3906, + "step": 4390 + }, + { + "epoch": 0.14031059663892342, + "grad_norm": 0.5115305185317993, + "learning_rate": 0.0005, + "loss": 1.4028, + "step": 4400 + }, + { + "epoch": 0.14062948435855735, + "grad_norm": 0.5236173868179321, + "learning_rate": 0.0005, + "loss": 1.4064, + "step": 4410 + }, + { + "epoch": 0.14094837207819128, + "grad_norm": 0.49574723839759827, + "learning_rate": 0.0005, + "loss": 1.3687, + "step": 4420 + }, + { + "epoch": 0.14126725979782517, + "grad_norm": 0.512898862361908, + "learning_rate": 0.0005, + "loss": 1.385, + "step": 4430 + }, + { + "epoch": 0.1415861475174591, + "grad_norm": 0.49689918756484985, + "learning_rate": 0.0005, + "loss": 1.3695, + "step": 4440 + }, + { + "epoch": 0.14190503523709302, + "grad_norm": 0.5022764801979065, + "learning_rate": 0.0005, + "loss": 1.3606, + "step": 4450 + }, + { + "epoch": 0.14222392295672695, + "grad_norm": 0.5280899405479431, + "learning_rate": 0.0005, + "loss": 1.3651, + "step": 4460 + }, + { + "epoch": 0.14254281067636085, + "grad_norm": 0.5257755517959595, + "learning_rate": 0.0005, + "loss": 1.3761, + "step": 4470 + }, + { + "epoch": 0.14286169839599477, + "grad_norm": 0.5031253099441528, + "learning_rate": 0.0005, + "loss": 1.3864, + "step": 4480 + }, + { + "epoch": 0.1431805861156287, + "grad_norm": 0.5129061341285706, + "learning_rate": 0.0005, + "loss": 1.3759, + "step": 4490 + }, + { + "epoch": 0.1434994738352626, + "grad_norm": 0.5187103152275085, + "learning_rate": 0.0005, + "loss": 1.4037, + "step": 4500 + }, + { + "epoch": 0.14381836155489652, + "grad_norm": 0.5009415745735168, + "learning_rate": 0.0005, + "loss": 1.4097, + "step": 4510 + }, + { + "epoch": 0.14413724927453045, + "grad_norm": 0.5013829469680786, + "learning_rate": 0.0005, + "loss": 1.3811, + "step": 4520 + }, + { + "epoch": 0.14445613699416435, + "grad_norm": 0.4731661081314087, + "learning_rate": 0.0005, + "loss": 1.3755, + "step": 4530 + }, + { + "epoch": 0.14477502471379827, + "grad_norm": 0.5249818563461304, + "learning_rate": 0.0005, + "loss": 1.3786, + "step": 4540 + }, + { + "epoch": 0.1450939124334322, + "grad_norm": 0.49896419048309326, + "learning_rate": 0.0005, + "loss": 1.38, + "step": 4550 + }, + { + "epoch": 0.1454128001530661, + "grad_norm": 0.5124298930168152, + "learning_rate": 0.0005, + "loss": 1.3999, + "step": 4560 + }, + { + "epoch": 0.14573168787270002, + "grad_norm": 0.4988538324832916, + "learning_rate": 0.0005, + "loss": 1.3761, + "step": 4570 + }, + { + "epoch": 0.14605057559233395, + "grad_norm": 0.49036043882369995, + "learning_rate": 0.0005, + "loss": 1.3684, + "step": 4580 + }, + { + "epoch": 0.14636946331196785, + "grad_norm": 0.5022983551025391, + "learning_rate": 0.0005, + "loss": 1.3694, + "step": 4590 + }, + { + "epoch": 0.14668835103160177, + "grad_norm": 0.5071865320205688, + "learning_rate": 0.0005, + "loss": 1.3891, + "step": 4600 + }, + { + "epoch": 0.1470072387512357, + "grad_norm": 0.4856301546096802, + "learning_rate": 0.0005, + "loss": 1.3796, + "step": 4610 + }, + { + "epoch": 0.1473261264708696, + "grad_norm": 0.4836377501487732, + "learning_rate": 0.0005, + "loss": 1.3829, + "step": 4620 + }, + { + "epoch": 0.14764501419050352, + "grad_norm": 0.4946225583553314, + "learning_rate": 0.0005, + "loss": 1.3766, + "step": 4630 + }, + { + "epoch": 0.14796390191013745, + "grad_norm": 0.4711849093437195, + "learning_rate": 0.0005, + "loss": 1.3866, + "step": 4640 + }, + { + "epoch": 0.14828278962977134, + "grad_norm": 0.5586421489715576, + "learning_rate": 0.0005, + "loss": 1.383, + "step": 4650 + }, + { + "epoch": 0.14860167734940527, + "grad_norm": 0.5114756226539612, + "learning_rate": 0.0005, + "loss": 1.381, + "step": 4660 + }, + { + "epoch": 0.1489205650690392, + "grad_norm": 0.526611864566803, + "learning_rate": 0.0005, + "loss": 1.3658, + "step": 4670 + }, + { + "epoch": 0.14923945278867312, + "grad_norm": 0.4793586730957031, + "learning_rate": 0.0005, + "loss": 1.3788, + "step": 4680 + }, + { + "epoch": 0.14955834050830702, + "grad_norm": 0.5218145251274109, + "learning_rate": 0.0005, + "loss": 1.3764, + "step": 4690 + }, + { + "epoch": 0.14987722822794095, + "grad_norm": 0.4978809058666229, + "learning_rate": 0.0005, + "loss": 1.3516, + "step": 4700 + }, + { + "epoch": 0.15019611594757487, + "grad_norm": 0.4808703362941742, + "learning_rate": 0.0005, + "loss": 1.3671, + "step": 4710 + }, + { + "epoch": 0.15051500366720877, + "grad_norm": 0.5049744248390198, + "learning_rate": 0.0005, + "loss": 1.3551, + "step": 4720 + }, + { + "epoch": 0.1508338913868427, + "grad_norm": 0.4735998213291168, + "learning_rate": 0.0005, + "loss": 1.3686, + "step": 4730 + }, + { + "epoch": 0.15115277910647662, + "grad_norm": 0.5061535835266113, + "learning_rate": 0.0005, + "loss": 1.3678, + "step": 4740 + }, + { + "epoch": 0.15147166682611052, + "grad_norm": 0.5199115872383118, + "learning_rate": 0.0005, + "loss": 1.3485, + "step": 4750 + }, + { + "epoch": 0.15179055454574444, + "grad_norm": 0.484674334526062, + "learning_rate": 0.0005, + "loss": 1.3521, + "step": 4760 + }, + { + "epoch": 0.15210944226537837, + "grad_norm": 0.5155425071716309, + "learning_rate": 0.0005, + "loss": 1.3646, + "step": 4770 + }, + { + "epoch": 0.15242832998501227, + "grad_norm": 0.514416515827179, + "learning_rate": 0.0005, + "loss": 1.3557, + "step": 4780 + }, + { + "epoch": 0.1527472177046462, + "grad_norm": 0.5200762152671814, + "learning_rate": 0.0005, + "loss": 1.3765, + "step": 4790 + }, + { + "epoch": 0.15306610542428012, + "grad_norm": 0.507409393787384, + "learning_rate": 0.0005, + "loss": 1.3558, + "step": 4800 + }, + { + "epoch": 0.15338499314391402, + "grad_norm": 0.47993403673171997, + "learning_rate": 0.0005, + "loss": 1.3741, + "step": 4810 + }, + { + "epoch": 0.15370388086354794, + "grad_norm": 0.48537927865982056, + "learning_rate": 0.0005, + "loss": 1.3639, + "step": 4820 + }, + { + "epoch": 0.15402276858318187, + "grad_norm": 0.47542354464530945, + "learning_rate": 0.0005, + "loss": 1.3495, + "step": 4830 + }, + { + "epoch": 0.15434165630281577, + "grad_norm": 0.4669359624385834, + "learning_rate": 0.0005, + "loss": 1.3683, + "step": 4840 + }, + { + "epoch": 0.1546605440224497, + "grad_norm": 0.4673759937286377, + "learning_rate": 0.0005, + "loss": 1.3621, + "step": 4850 + }, + { + "epoch": 0.15497943174208362, + "grad_norm": 0.47257837653160095, + "learning_rate": 0.0005, + "loss": 1.3612, + "step": 4860 + }, + { + "epoch": 0.15529831946171752, + "grad_norm": 0.5429351925849915, + "learning_rate": 0.0005, + "loss": 1.3419, + "step": 4870 + }, + { + "epoch": 0.15561720718135144, + "grad_norm": 0.48105567693710327, + "learning_rate": 0.0005, + "loss": 1.3404, + "step": 4880 + }, + { + "epoch": 0.15593609490098537, + "grad_norm": 0.49502962827682495, + "learning_rate": 0.0005, + "loss": 1.3621, + "step": 4890 + }, + { + "epoch": 0.1562549826206193, + "grad_norm": 0.4787370562553406, + "learning_rate": 0.0005, + "loss": 1.3659, + "step": 4900 + }, + { + "epoch": 0.1565738703402532, + "grad_norm": 0.5000290870666504, + "learning_rate": 0.0005, + "loss": 1.3689, + "step": 4910 + }, + { + "epoch": 0.15689275805988712, + "grad_norm": 0.48688530921936035, + "learning_rate": 0.0005, + "loss": 1.3592, + "step": 4920 + }, + { + "epoch": 0.15721164577952104, + "grad_norm": 0.49029314517974854, + "learning_rate": 0.0005, + "loss": 1.3672, + "step": 4930 + }, + { + "epoch": 0.15753053349915494, + "grad_norm": 0.4770297408103943, + "learning_rate": 0.0005, + "loss": 1.3401, + "step": 4940 + }, + { + "epoch": 0.15784942121878887, + "grad_norm": 0.485786497592926, + "learning_rate": 0.0005, + "loss": 1.3362, + "step": 4950 + }, + { + "epoch": 0.1581683089384228, + "grad_norm": 0.51774001121521, + "learning_rate": 0.0005, + "loss": 1.3643, + "step": 4960 + }, + { + "epoch": 0.1584871966580567, + "grad_norm": 0.4883527159690857, + "learning_rate": 0.0005, + "loss": 1.3578, + "step": 4970 + }, + { + "epoch": 0.15880608437769062, + "grad_norm": 0.4808884859085083, + "learning_rate": 0.0005, + "loss": 1.3452, + "step": 4980 + }, + { + "epoch": 0.15912497209732454, + "grad_norm": 0.48574575781822205, + "learning_rate": 0.0005, + "loss": 1.3456, + "step": 4990 + }, + { + "epoch": 0.15944385981695844, + "grad_norm": 0.4806765615940094, + "learning_rate": 0.0005, + "loss": 1.3183, + "step": 5000 + }, + { + "epoch": 0.15976274753659236, + "grad_norm": 0.47405245900154114, + "learning_rate": 0.0005, + "loss": 1.3571, + "step": 5010 + }, + { + "epoch": 0.1600816352562263, + "grad_norm": 0.5027293562889099, + "learning_rate": 0.0005, + "loss": 1.3413, + "step": 5020 + }, + { + "epoch": 0.1604005229758602, + "grad_norm": 0.5056706666946411, + "learning_rate": 0.0005, + "loss": 1.3807, + "step": 5030 + }, + { + "epoch": 0.1607194106954941, + "grad_norm": 0.47428926825523376, + "learning_rate": 0.0005, + "loss": 1.3603, + "step": 5040 + }, + { + "epoch": 0.16103829841512804, + "grad_norm": 0.49223700165748596, + "learning_rate": 0.0005, + "loss": 1.343, + "step": 5050 + }, + { + "epoch": 0.16135718613476194, + "grad_norm": 0.4843858778476715, + "learning_rate": 0.0005, + "loss": 1.3467, + "step": 5060 + }, + { + "epoch": 0.16167607385439586, + "grad_norm": 0.4874281585216522, + "learning_rate": 0.0005, + "loss": 1.3374, + "step": 5070 + }, + { + "epoch": 0.1619949615740298, + "grad_norm": 0.5039829611778259, + "learning_rate": 0.0005, + "loss": 1.3454, + "step": 5080 + }, + { + "epoch": 0.16231384929366371, + "grad_norm": 0.4736151099205017, + "learning_rate": 0.0005, + "loss": 1.366, + "step": 5090 + }, + { + "epoch": 0.1626327370132976, + "grad_norm": 0.4735645651817322, + "learning_rate": 0.0005, + "loss": 1.3539, + "step": 5100 + }, + { + "epoch": 0.16295162473293154, + "grad_norm": 0.4740479588508606, + "learning_rate": 0.0005, + "loss": 1.3478, + "step": 5110 + }, + { + "epoch": 0.16327051245256546, + "grad_norm": 0.4920395314693451, + "learning_rate": 0.0005, + "loss": 1.3257, + "step": 5120 + }, + { + "epoch": 0.16358940017219936, + "grad_norm": 0.4727405607700348, + "learning_rate": 0.0005, + "loss": 1.346, + "step": 5130 + }, + { + "epoch": 0.1639082878918333, + "grad_norm": 0.4696497619152069, + "learning_rate": 0.0005, + "loss": 1.34, + "step": 5140 + }, + { + "epoch": 0.1642271756114672, + "grad_norm": 0.48133373260498047, + "learning_rate": 0.0005, + "loss": 1.3443, + "step": 5150 + }, + { + "epoch": 0.1645460633311011, + "grad_norm": 0.5079322457313538, + "learning_rate": 0.0005, + "loss": 1.3423, + "step": 5160 + }, + { + "epoch": 0.16486495105073504, + "grad_norm": 0.4736444056034088, + "learning_rate": 0.0005, + "loss": 1.3488, + "step": 5170 + }, + { + "epoch": 0.16518383877036896, + "grad_norm": 0.4801396131515503, + "learning_rate": 0.0005, + "loss": 1.3514, + "step": 5180 + }, + { + "epoch": 0.16550272649000286, + "grad_norm": 0.4806341230869293, + "learning_rate": 0.0005, + "loss": 1.3682, + "step": 5190 + }, + { + "epoch": 0.1658216142096368, + "grad_norm": 0.46117836236953735, + "learning_rate": 0.0005, + "loss": 1.3248, + "step": 5200 + }, + { + "epoch": 0.1661405019292707, + "grad_norm": 0.47865962982177734, + "learning_rate": 0.0005, + "loss": 1.3268, + "step": 5210 + }, + { + "epoch": 0.1664593896489046, + "grad_norm": 0.47703343629837036, + "learning_rate": 0.0005, + "loss": 1.3303, + "step": 5220 + }, + { + "epoch": 0.16677827736853854, + "grad_norm": 0.5019536018371582, + "learning_rate": 0.0005, + "loss": 1.3262, + "step": 5230 + }, + { + "epoch": 0.16709716508817246, + "grad_norm": 0.4716322124004364, + "learning_rate": 0.0005, + "loss": 1.3411, + "step": 5240 + }, + { + "epoch": 0.16741605280780636, + "grad_norm": 0.4730423092842102, + "learning_rate": 0.0005, + "loss": 1.3355, + "step": 5250 + }, + { + "epoch": 0.16773494052744028, + "grad_norm": 0.4925325810909271, + "learning_rate": 0.0005, + "loss": 1.3315, + "step": 5260 + }, + { + "epoch": 0.1680538282470742, + "grad_norm": 0.48779094219207764, + "learning_rate": 0.0005, + "loss": 1.3214, + "step": 5270 + }, + { + "epoch": 0.1683727159667081, + "grad_norm": 0.4637846350669861, + "learning_rate": 0.0005, + "loss": 1.3366, + "step": 5280 + }, + { + "epoch": 0.16869160368634203, + "grad_norm": 0.48783788084983826, + "learning_rate": 0.0005, + "loss": 1.3301, + "step": 5290 + }, + { + "epoch": 0.16901049140597596, + "grad_norm": 0.47115302085876465, + "learning_rate": 0.0005, + "loss": 1.3293, + "step": 5300 + }, + { + "epoch": 0.16932937912560989, + "grad_norm": 0.4882822334766388, + "learning_rate": 0.0005, + "loss": 1.3382, + "step": 5310 + }, + { + "epoch": 0.16964826684524378, + "grad_norm": 0.4872843027114868, + "learning_rate": 0.0005, + "loss": 1.331, + "step": 5320 + }, + { + "epoch": 0.1699671545648777, + "grad_norm": 0.46673768758773804, + "learning_rate": 0.0005, + "loss": 1.3439, + "step": 5330 + }, + { + "epoch": 0.17028604228451164, + "grad_norm": 0.4746752083301544, + "learning_rate": 0.0005, + "loss": 1.3559, + "step": 5340 + }, + { + "epoch": 0.17060493000414553, + "grad_norm": 0.48173803091049194, + "learning_rate": 0.0005, + "loss": 1.3302, + "step": 5350 + }, + { + "epoch": 0.17092381772377946, + "grad_norm": 0.4860802888870239, + "learning_rate": 0.0005, + "loss": 1.3315, + "step": 5360 + }, + { + "epoch": 0.17124270544341338, + "grad_norm": 0.4615638256072998, + "learning_rate": 0.0005, + "loss": 1.3175, + "step": 5370 + }, + { + "epoch": 0.17156159316304728, + "grad_norm": 0.4676917791366577, + "learning_rate": 0.0005, + "loss": 1.3037, + "step": 5380 + }, + { + "epoch": 0.1718804808826812, + "grad_norm": 0.4855146110057831, + "learning_rate": 0.0005, + "loss": 1.3231, + "step": 5390 + }, + { + "epoch": 0.17219936860231513, + "grad_norm": 0.48758402466773987, + "learning_rate": 0.0005, + "loss": 1.3519, + "step": 5400 + }, + { + "epoch": 0.17251825632194903, + "grad_norm": 0.4420989751815796, + "learning_rate": 0.0005, + "loss": 1.317, + "step": 5410 + }, + { + "epoch": 0.17283714404158296, + "grad_norm": 0.4432215690612793, + "learning_rate": 0.0005, + "loss": 1.3363, + "step": 5420 + }, + { + "epoch": 0.17315603176121688, + "grad_norm": 0.5140141844749451, + "learning_rate": 0.0005, + "loss": 1.3301, + "step": 5430 + }, + { + "epoch": 0.17347491948085078, + "grad_norm": 0.4605754613876343, + "learning_rate": 0.0005, + "loss": 1.3343, + "step": 5440 + }, + { + "epoch": 0.1737938072004847, + "grad_norm": 0.46923375129699707, + "learning_rate": 0.0005, + "loss": 1.3341, + "step": 5450 + }, + { + "epoch": 0.17411269492011863, + "grad_norm": 0.4824751019477844, + "learning_rate": 0.0005, + "loss": 1.3169, + "step": 5460 + }, + { + "epoch": 0.17443158263975253, + "grad_norm": 0.4664471745491028, + "learning_rate": 0.0005, + "loss": 1.3375, + "step": 5470 + }, + { + "epoch": 0.17475047035938646, + "grad_norm": 0.48443225026130676, + "learning_rate": 0.0005, + "loss": 1.3362, + "step": 5480 + }, + { + "epoch": 0.17506935807902038, + "grad_norm": 0.46908897161483765, + "learning_rate": 0.0005, + "loss": 1.3, + "step": 5490 + }, + { + "epoch": 0.1753882457986543, + "grad_norm": 0.4562680125236511, + "learning_rate": 0.0005, + "loss": 1.2999, + "step": 5500 + }, + { + "epoch": 0.1757071335182882, + "grad_norm": 0.46892642974853516, + "learning_rate": 0.0005, + "loss": 1.3357, + "step": 5510 + }, + { + "epoch": 0.17602602123792213, + "grad_norm": 0.5013911724090576, + "learning_rate": 0.0005, + "loss": 1.3326, + "step": 5520 + }, + { + "epoch": 0.17634490895755606, + "grad_norm": 0.45573681592941284, + "learning_rate": 0.0005, + "loss": 1.3149, + "step": 5530 + }, + { + "epoch": 0.17666379667718995, + "grad_norm": 0.4677056074142456, + "learning_rate": 0.0005, + "loss": 1.3216, + "step": 5540 + }, + { + "epoch": 0.17698268439682388, + "grad_norm": 0.4879295527935028, + "learning_rate": 0.0005, + "loss": 1.3337, + "step": 5550 + }, + { + "epoch": 0.1773015721164578, + "grad_norm": 0.4852617084980011, + "learning_rate": 0.0005, + "loss": 1.3217, + "step": 5560 + }, + { + "epoch": 0.1776204598360917, + "grad_norm": 0.4770294725894928, + "learning_rate": 0.0005, + "loss": 1.3143, + "step": 5570 + }, + { + "epoch": 0.17793934755572563, + "grad_norm": 0.4696557819843292, + "learning_rate": 0.0005, + "loss": 1.3031, + "step": 5580 + }, + { + "epoch": 0.17825823527535956, + "grad_norm": 0.4751479923725128, + "learning_rate": 0.0005, + "loss": 1.3371, + "step": 5590 + }, + { + "epoch": 0.17857712299499345, + "grad_norm": 0.4551768898963928, + "learning_rate": 0.0005, + "loss": 1.3293, + "step": 5600 + }, + { + "epoch": 0.17889601071462738, + "grad_norm": 0.4669037461280823, + "learning_rate": 0.0005, + "loss": 1.3207, + "step": 5610 + }, + { + "epoch": 0.1792148984342613, + "grad_norm": 0.4847627878189087, + "learning_rate": 0.0005, + "loss": 1.3393, + "step": 5620 + }, + { + "epoch": 0.1795337861538952, + "grad_norm": 0.4655326008796692, + "learning_rate": 0.0005, + "loss": 1.3244, + "step": 5630 + }, + { + "epoch": 0.17985267387352913, + "grad_norm": 0.447279691696167, + "learning_rate": 0.0005, + "loss": 1.3149, + "step": 5640 + }, + { + "epoch": 0.18017156159316305, + "grad_norm": 0.45796069502830505, + "learning_rate": 0.0005, + "loss": 1.3346, + "step": 5650 + }, + { + "epoch": 0.18049044931279695, + "grad_norm": 0.4523908197879791, + "learning_rate": 0.0005, + "loss": 1.3209, + "step": 5660 + }, + { + "epoch": 0.18080933703243088, + "grad_norm": 0.46900004148483276, + "learning_rate": 0.0005, + "loss": 1.325, + "step": 5670 + }, + { + "epoch": 0.1811282247520648, + "grad_norm": 0.457965612411499, + "learning_rate": 0.0005, + "loss": 1.3031, + "step": 5680 + }, + { + "epoch": 0.1814471124716987, + "grad_norm": 0.4391346871852875, + "learning_rate": 0.0005, + "loss": 1.3066, + "step": 5690 + }, + { + "epoch": 0.18176600019133263, + "grad_norm": 0.44087544083595276, + "learning_rate": 0.0005, + "loss": 1.3129, + "step": 5700 + }, + { + "epoch": 0.18208488791096655, + "grad_norm": 0.46599528193473816, + "learning_rate": 0.0005, + "loss": 1.3201, + "step": 5710 + }, + { + "epoch": 0.18240377563060048, + "grad_norm": 0.44314372539520264, + "learning_rate": 0.0005, + "loss": 1.3357, + "step": 5720 + }, + { + "epoch": 0.18272266335023438, + "grad_norm": 0.4564199447631836, + "learning_rate": 0.0005, + "loss": 1.2977, + "step": 5730 + }, + { + "epoch": 0.1830415510698683, + "grad_norm": 0.45973026752471924, + "learning_rate": 0.0005, + "loss": 1.2927, + "step": 5740 + }, + { + "epoch": 0.18336043878950223, + "grad_norm": 0.4481637477874756, + "learning_rate": 0.0005, + "loss": 1.33, + "step": 5750 + }, + { + "epoch": 0.18367932650913613, + "grad_norm": 0.4800594747066498, + "learning_rate": 0.0005, + "loss": 1.313, + "step": 5760 + }, + { + "epoch": 0.18399821422877005, + "grad_norm": 0.49500760436058044, + "learning_rate": 0.0005, + "loss": 1.3104, + "step": 5770 + }, + { + "epoch": 0.18431710194840398, + "grad_norm": 0.4503342807292938, + "learning_rate": 0.0005, + "loss": 1.3013, + "step": 5780 + }, + { + "epoch": 0.18463598966803788, + "grad_norm": 0.451404869556427, + "learning_rate": 0.0005, + "loss": 1.3019, + "step": 5790 + }, + { + "epoch": 0.1849548773876718, + "grad_norm": 0.4840630888938904, + "learning_rate": 0.0005, + "loss": 1.3129, + "step": 5800 + }, + { + "epoch": 0.18527376510730573, + "grad_norm": 0.45992785692214966, + "learning_rate": 0.0005, + "loss": 1.3124, + "step": 5810 + }, + { + "epoch": 0.18559265282693962, + "grad_norm": 0.44602009654045105, + "learning_rate": 0.0005, + "loss": 1.3026, + "step": 5820 + }, + { + "epoch": 0.18591154054657355, + "grad_norm": 0.44860562682151794, + "learning_rate": 0.0005, + "loss": 1.3054, + "step": 5830 + }, + { + "epoch": 0.18623042826620748, + "grad_norm": 0.5289995074272156, + "learning_rate": 0.0005, + "loss": 1.306, + "step": 5840 + }, + { + "epoch": 0.18654931598584137, + "grad_norm": 0.4493541717529297, + "learning_rate": 0.0005, + "loss": 1.3108, + "step": 5850 + }, + { + "epoch": 0.1868682037054753, + "grad_norm": 0.454628586769104, + "learning_rate": 0.0005, + "loss": 1.3098, + "step": 5860 + }, + { + "epoch": 0.18718709142510923, + "grad_norm": 0.46552345156669617, + "learning_rate": 0.0005, + "loss": 1.2977, + "step": 5870 + }, + { + "epoch": 0.18750597914474312, + "grad_norm": 0.45460376143455505, + "learning_rate": 0.0005, + "loss": 1.3215, + "step": 5880 + }, + { + "epoch": 0.18782486686437705, + "grad_norm": 0.46688956022262573, + "learning_rate": 0.0005, + "loss": 1.3116, + "step": 5890 + }, + { + "epoch": 0.18814375458401097, + "grad_norm": 0.45595839619636536, + "learning_rate": 0.0005, + "loss": 1.3025, + "step": 5900 + }, + { + "epoch": 0.18846264230364487, + "grad_norm": 0.457356333732605, + "learning_rate": 0.0005, + "loss": 1.2995, + "step": 5910 + }, + { + "epoch": 0.1887815300232788, + "grad_norm": 0.44302088022232056, + "learning_rate": 0.0005, + "loss": 1.3031, + "step": 5920 + }, + { + "epoch": 0.18910041774291272, + "grad_norm": 0.46125978231430054, + "learning_rate": 0.0005, + "loss": 1.3006, + "step": 5930 + }, + { + "epoch": 0.18941930546254665, + "grad_norm": 0.45474714040756226, + "learning_rate": 0.0005, + "loss": 1.3132, + "step": 5940 + }, + { + "epoch": 0.18973819318218055, + "grad_norm": 0.4790920913219452, + "learning_rate": 0.0005, + "loss": 1.2941, + "step": 5950 + }, + { + "epoch": 0.19005708090181447, + "grad_norm": 0.4711848795413971, + "learning_rate": 0.0005, + "loss": 1.3296, + "step": 5960 + }, + { + "epoch": 0.1903759686214484, + "grad_norm": 0.45707839727401733, + "learning_rate": 0.0005, + "loss": 1.3104, + "step": 5970 + }, + { + "epoch": 0.1906948563410823, + "grad_norm": 0.47939345240592957, + "learning_rate": 0.0005, + "loss": 1.2997, + "step": 5980 + }, + { + "epoch": 0.19101374406071622, + "grad_norm": 0.4554172158241272, + "learning_rate": 0.0005, + "loss": 1.3091, + "step": 5990 + }, + { + "epoch": 0.19133263178035015, + "grad_norm": 0.45858779549598694, + "learning_rate": 0.0005, + "loss": 1.2891, + "step": 6000 + }, + { + "epoch": 0.19165151949998405, + "grad_norm": 0.4406243860721588, + "learning_rate": 0.0005, + "loss": 1.3123, + "step": 6010 + }, + { + "epoch": 0.19197040721961797, + "grad_norm": 0.4473991096019745, + "learning_rate": 0.0005, + "loss": 1.2913, + "step": 6020 + }, + { + "epoch": 0.1922892949392519, + "grad_norm": 0.4342772960662842, + "learning_rate": 0.0005, + "loss": 1.2756, + "step": 6030 + }, + { + "epoch": 0.1926081826588858, + "grad_norm": 0.4456859827041626, + "learning_rate": 0.0005, + "loss": 1.2947, + "step": 6040 + }, + { + "epoch": 0.19292707037851972, + "grad_norm": 0.443399578332901, + "learning_rate": 0.0005, + "loss": 1.2933, + "step": 6050 + }, + { + "epoch": 0.19324595809815365, + "grad_norm": 0.45340052247047424, + "learning_rate": 0.0005, + "loss": 1.2876, + "step": 6060 + }, + { + "epoch": 0.19356484581778755, + "grad_norm": 0.48477765917778015, + "learning_rate": 0.0005, + "loss": 1.2951, + "step": 6070 + }, + { + "epoch": 0.19388373353742147, + "grad_norm": 0.4576895534992218, + "learning_rate": 0.0005, + "loss": 1.2933, + "step": 6080 + }, + { + "epoch": 0.1942026212570554, + "grad_norm": 0.46342259645462036, + "learning_rate": 0.0005, + "loss": 1.2931, + "step": 6090 + }, + { + "epoch": 0.1945215089766893, + "grad_norm": 0.4377509355545044, + "learning_rate": 0.0005, + "loss": 1.3003, + "step": 6100 + }, + { + "epoch": 0.19484039669632322, + "grad_norm": 0.47648733854293823, + "learning_rate": 0.0005, + "loss": 1.3055, + "step": 6110 + }, + { + "epoch": 0.19515928441595715, + "grad_norm": 0.47392523288726807, + "learning_rate": 0.0005, + "loss": 1.2955, + "step": 6120 + }, + { + "epoch": 0.19547817213559107, + "grad_norm": 0.44861629605293274, + "learning_rate": 0.0005, + "loss": 1.2736, + "step": 6130 + }, + { + "epoch": 0.19579705985522497, + "grad_norm": 0.45019152760505676, + "learning_rate": 0.0005, + "loss": 1.3001, + "step": 6140 + }, + { + "epoch": 0.1961159475748589, + "grad_norm": 0.43096092343330383, + "learning_rate": 0.0005, + "loss": 1.2678, + "step": 6150 + }, + { + "epoch": 0.19643483529449282, + "grad_norm": 0.4468885064125061, + "learning_rate": 0.0005, + "loss": 1.2742, + "step": 6160 + }, + { + "epoch": 0.19675372301412672, + "grad_norm": 0.44142505526542664, + "learning_rate": 0.0005, + "loss": 1.3079, + "step": 6170 + }, + { + "epoch": 0.19707261073376064, + "grad_norm": 0.4425607919692993, + "learning_rate": 0.0005, + "loss": 1.2885, + "step": 6180 + }, + { + "epoch": 0.19739149845339457, + "grad_norm": 0.45576775074005127, + "learning_rate": 0.0005, + "loss": 1.3119, + "step": 6190 + }, + { + "epoch": 0.19771038617302847, + "grad_norm": 0.44537460803985596, + "learning_rate": 0.0005, + "loss": 1.2937, + "step": 6200 + }, + { + "epoch": 0.1980292738926624, + "grad_norm": 0.46090060472488403, + "learning_rate": 0.0005, + "loss": 1.2971, + "step": 6210 + }, + { + "epoch": 0.19834816161229632, + "grad_norm": 0.44849535822868347, + "learning_rate": 0.0005, + "loss": 1.2699, + "step": 6220 + }, + { + "epoch": 0.19866704933193022, + "grad_norm": 0.4581367075443268, + "learning_rate": 0.0005, + "loss": 1.276, + "step": 6230 + }, + { + "epoch": 0.19898593705156414, + "grad_norm": 0.4342295825481415, + "learning_rate": 0.0005, + "loss": 1.284, + "step": 6240 + }, + { + "epoch": 0.19930482477119807, + "grad_norm": 0.4359763264656067, + "learning_rate": 0.0005, + "loss": 1.3001, + "step": 6250 + }, + { + "epoch": 0.19962371249083197, + "grad_norm": 0.4430510103702545, + "learning_rate": 0.0005, + "loss": 1.2748, + "step": 6260 + }, + { + "epoch": 0.1999426002104659, + "grad_norm": 0.4533330202102661, + "learning_rate": 0.0005, + "loss": 1.2926, + "step": 6270 + }, + { + "epoch": 0.20026148793009982, + "grad_norm": 0.45962098240852356, + "learning_rate": 0.0005, + "loss": 1.2872, + "step": 6280 + }, + { + "epoch": 0.20058037564973372, + "grad_norm": 0.4355621039867401, + "learning_rate": 0.0005, + "loss": 1.3021, + "step": 6290 + }, + { + "epoch": 0.20089926336936764, + "grad_norm": 0.4176916182041168, + "learning_rate": 0.0005, + "loss": 1.2885, + "step": 6300 + }, + { + "epoch": 0.20121815108900157, + "grad_norm": 0.4410490393638611, + "learning_rate": 0.0005, + "loss": 1.2827, + "step": 6310 + }, + { + "epoch": 0.20153703880863547, + "grad_norm": 0.45107796788215637, + "learning_rate": 0.0005, + "loss": 1.2835, + "step": 6320 + }, + { + "epoch": 0.2018559265282694, + "grad_norm": 0.44999945163726807, + "learning_rate": 0.0005, + "loss": 1.2805, + "step": 6330 + }, + { + "epoch": 0.20217481424790332, + "grad_norm": 0.46005144715309143, + "learning_rate": 0.0005, + "loss": 1.2857, + "step": 6340 + }, + { + "epoch": 0.20249370196753724, + "grad_norm": 0.44353771209716797, + "learning_rate": 0.0005, + "loss": 1.2774, + "step": 6350 + }, + { + "epoch": 0.20281258968717114, + "grad_norm": 0.4384526014328003, + "learning_rate": 0.0005, + "loss": 1.2871, + "step": 6360 + }, + { + "epoch": 0.20313147740680507, + "grad_norm": 0.43561914563179016, + "learning_rate": 0.0005, + "loss": 1.2985, + "step": 6370 + }, + { + "epoch": 0.203450365126439, + "grad_norm": 0.4324412941932678, + "learning_rate": 0.0005, + "loss": 1.2948, + "step": 6380 + }, + { + "epoch": 0.2037692528460729, + "grad_norm": 0.4382753372192383, + "learning_rate": 0.0005, + "loss": 1.2745, + "step": 6390 + }, + { + "epoch": 0.20408814056570682, + "grad_norm": 0.4647562503814697, + "learning_rate": 0.0005, + "loss": 1.2706, + "step": 6400 + }, + { + "epoch": 0.20440702828534074, + "grad_norm": 0.4480821490287781, + "learning_rate": 0.0005, + "loss": 1.2923, + "step": 6410 + }, + { + "epoch": 0.20472591600497464, + "grad_norm": 0.4176822304725647, + "learning_rate": 0.0005, + "loss": 1.2749, + "step": 6420 + }, + { + "epoch": 0.20504480372460857, + "grad_norm": 0.4394923448562622, + "learning_rate": 0.0005, + "loss": 1.301, + "step": 6430 + }, + { + "epoch": 0.2053636914442425, + "grad_norm": 0.43086984753608704, + "learning_rate": 0.0005, + "loss": 1.2803, + "step": 6440 + }, + { + "epoch": 0.2056825791638764, + "grad_norm": 0.4294069707393646, + "learning_rate": 0.0005, + "loss": 1.2733, + "step": 6450 + }, + { + "epoch": 0.20600146688351031, + "grad_norm": 0.42091456055641174, + "learning_rate": 0.0005, + "loss": 1.2691, + "step": 6460 + }, + { + "epoch": 0.20632035460314424, + "grad_norm": 0.45179837942123413, + "learning_rate": 0.0005, + "loss": 1.2874, + "step": 6470 + }, + { + "epoch": 0.20663924232277814, + "grad_norm": 0.4361703395843506, + "learning_rate": 0.0005, + "loss": 1.284, + "step": 6480 + }, + { + "epoch": 0.20695813004241206, + "grad_norm": 0.45856398344039917, + "learning_rate": 0.0005, + "loss": 1.2815, + "step": 6490 + }, + { + "epoch": 0.207277017762046, + "grad_norm": 0.44898521900177, + "learning_rate": 0.0005, + "loss": 1.2737, + "step": 6500 + }, + { + "epoch": 0.2075959054816799, + "grad_norm": 0.44692954421043396, + "learning_rate": 0.0005, + "loss": 1.2723, + "step": 6510 + }, + { + "epoch": 0.2079147932013138, + "grad_norm": 0.43304726481437683, + "learning_rate": 0.0005, + "loss": 1.2761, + "step": 6520 + }, + { + "epoch": 0.20823368092094774, + "grad_norm": 0.4356973469257355, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 6530 + }, + { + "epoch": 0.20855256864058166, + "grad_norm": 0.4488913416862488, + "learning_rate": 0.0005, + "loss": 1.3023, + "step": 6540 + }, + { + "epoch": 0.20887145636021556, + "grad_norm": 0.43722909688949585, + "learning_rate": 0.0005, + "loss": 1.2822, + "step": 6550 + }, + { + "epoch": 0.2091903440798495, + "grad_norm": 0.4295201301574707, + "learning_rate": 0.0005, + "loss": 1.2736, + "step": 6560 + }, + { + "epoch": 0.20950923179948341, + "grad_norm": 0.44216859340667725, + "learning_rate": 0.0005, + "loss": 1.2815, + "step": 6570 + }, + { + "epoch": 0.2098281195191173, + "grad_norm": 0.4219614267349243, + "learning_rate": 0.0005, + "loss": 1.2776, + "step": 6580 + }, + { + "epoch": 0.21014700723875124, + "grad_norm": 0.4117146134376526, + "learning_rate": 0.0005, + "loss": 1.2856, + "step": 6590 + }, + { + "epoch": 0.21046589495838516, + "grad_norm": 0.4342212975025177, + "learning_rate": 0.0005, + "loss": 1.2647, + "step": 6600 + }, + { + "epoch": 0.21078478267801906, + "grad_norm": 0.4356221556663513, + "learning_rate": 0.0005, + "loss": 1.2861, + "step": 6610 + }, + { + "epoch": 0.211103670397653, + "grad_norm": 0.4763599932193756, + "learning_rate": 0.0005, + "loss": 1.2701, + "step": 6620 + }, + { + "epoch": 0.2114225581172869, + "grad_norm": 0.44057396054267883, + "learning_rate": 0.0005, + "loss": 1.2817, + "step": 6630 + }, + { + "epoch": 0.2117414458369208, + "grad_norm": 0.42277610301971436, + "learning_rate": 0.0005, + "loss": 1.2917, + "step": 6640 + }, + { + "epoch": 0.21206033355655474, + "grad_norm": 0.42706525325775146, + "learning_rate": 0.0005, + "loss": 1.2664, + "step": 6650 + }, + { + "epoch": 0.21237922127618866, + "grad_norm": 0.43516626954078674, + "learning_rate": 0.0005, + "loss": 1.2785, + "step": 6660 + }, + { + "epoch": 0.21269810899582256, + "grad_norm": 0.4840092957019806, + "learning_rate": 0.0005, + "loss": 1.2756, + "step": 6670 + }, + { + "epoch": 0.21301699671545649, + "grad_norm": 0.44978827238082886, + "learning_rate": 0.0005, + "loss": 1.2698, + "step": 6680 + }, + { + "epoch": 0.2133358844350904, + "grad_norm": 0.45999205112457275, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 6690 + }, + { + "epoch": 0.2136547721547243, + "grad_norm": 0.45006537437438965, + "learning_rate": 0.0005, + "loss": 1.2754, + "step": 6700 + }, + { + "epoch": 0.21397365987435824, + "grad_norm": 0.4400864541530609, + "learning_rate": 0.0005, + "loss": 1.2611, + "step": 6710 + }, + { + "epoch": 0.21429254759399216, + "grad_norm": 0.44918856024742126, + "learning_rate": 0.0005, + "loss": 1.2746, + "step": 6720 + }, + { + "epoch": 0.21461143531362606, + "grad_norm": 0.4447942078113556, + "learning_rate": 0.0005, + "loss": 1.2667, + "step": 6730 + }, + { + "epoch": 0.21493032303325998, + "grad_norm": 0.41364383697509766, + "learning_rate": 0.0005, + "loss": 1.2737, + "step": 6740 + }, + { + "epoch": 0.2152492107528939, + "grad_norm": 0.43288302421569824, + "learning_rate": 0.0005, + "loss": 1.2889, + "step": 6750 + }, + { + "epoch": 0.21556809847252784, + "grad_norm": 0.4335830509662628, + "learning_rate": 0.0005, + "loss": 1.2803, + "step": 6760 + }, + { + "epoch": 0.21588698619216173, + "grad_norm": 0.4247308373451233, + "learning_rate": 0.0005, + "loss": 1.2649, + "step": 6770 + }, + { + "epoch": 0.21620587391179566, + "grad_norm": 0.4426180422306061, + "learning_rate": 0.0005, + "loss": 1.2554, + "step": 6780 + }, + { + "epoch": 0.21652476163142959, + "grad_norm": 0.43544715642929077, + "learning_rate": 0.0005, + "loss": 1.2535, + "step": 6790 + }, + { + "epoch": 0.21684364935106348, + "grad_norm": 0.46235576272010803, + "learning_rate": 0.0005, + "loss": 1.2466, + "step": 6800 + }, + { + "epoch": 0.2171625370706974, + "grad_norm": 0.41557857394218445, + "learning_rate": 0.0005, + "loss": 1.2806, + "step": 6810 + }, + { + "epoch": 0.21748142479033133, + "grad_norm": 0.4564860463142395, + "learning_rate": 0.0005, + "loss": 1.2688, + "step": 6820 + }, + { + "epoch": 0.21780031250996523, + "grad_norm": 0.4496130645275116, + "learning_rate": 0.0005, + "loss": 1.2671, + "step": 6830 + }, + { + "epoch": 0.21811920022959916, + "grad_norm": 0.4237511157989502, + "learning_rate": 0.0005, + "loss": 1.2684, + "step": 6840 + }, + { + "epoch": 0.21843808794923308, + "grad_norm": 0.4281541109085083, + "learning_rate": 0.0005, + "loss": 1.2486, + "step": 6850 + }, + { + "epoch": 0.21875697566886698, + "grad_norm": 0.41945523023605347, + "learning_rate": 0.0005, + "loss": 1.2485, + "step": 6860 + }, + { + "epoch": 0.2190758633885009, + "grad_norm": 0.4274255037307739, + "learning_rate": 0.0005, + "loss": 1.2698, + "step": 6870 + }, + { + "epoch": 0.21939475110813483, + "grad_norm": 0.4264500141143799, + "learning_rate": 0.0005, + "loss": 1.2686, + "step": 6880 + }, + { + "epoch": 0.21971363882776873, + "grad_norm": 0.41923102736473083, + "learning_rate": 0.0005, + "loss": 1.2623, + "step": 6890 + }, + { + "epoch": 0.22003252654740266, + "grad_norm": 0.438001424074173, + "learning_rate": 0.0005, + "loss": 1.2769, + "step": 6900 + }, + { + "epoch": 0.22035141426703658, + "grad_norm": 0.4238506853580475, + "learning_rate": 0.0005, + "loss": 1.2699, + "step": 6910 + }, + { + "epoch": 0.22067030198667048, + "grad_norm": 0.4327387511730194, + "learning_rate": 0.0005, + "loss": 1.2808, + "step": 6920 + }, + { + "epoch": 0.2209891897063044, + "grad_norm": 0.429081529378891, + "learning_rate": 0.0005, + "loss": 1.2726, + "step": 6930 + }, + { + "epoch": 0.22130807742593833, + "grad_norm": 0.4404042363166809, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 6940 + }, + { + "epoch": 0.22162696514557226, + "grad_norm": 0.43843477964401245, + "learning_rate": 0.0005, + "loss": 1.2564, + "step": 6950 + }, + { + "epoch": 0.22194585286520616, + "grad_norm": 0.42280009388923645, + "learning_rate": 0.0005, + "loss": 1.249, + "step": 6960 + }, + { + "epoch": 0.22226474058484008, + "grad_norm": 0.4233909547328949, + "learning_rate": 0.0005, + "loss": 1.2641, + "step": 6970 + }, + { + "epoch": 0.222583628304474, + "grad_norm": 0.4561818540096283, + "learning_rate": 0.0005, + "loss": 1.2642, + "step": 6980 + }, + { + "epoch": 0.2229025160241079, + "grad_norm": 0.4259558320045471, + "learning_rate": 0.0005, + "loss": 1.2718, + "step": 6990 + }, + { + "epoch": 0.22322140374374183, + "grad_norm": 0.42075279355049133, + "learning_rate": 0.0005, + "loss": 1.2893, + "step": 7000 + }, + { + "epoch": 0.22354029146337576, + "grad_norm": 0.43573033809661865, + "learning_rate": 0.0005, + "loss": 1.2479, + "step": 7010 + }, + { + "epoch": 0.22385917918300965, + "grad_norm": 0.4520786702632904, + "learning_rate": 0.0005, + "loss": 1.2683, + "step": 7020 + }, + { + "epoch": 0.22417806690264358, + "grad_norm": 0.4258021116256714, + "learning_rate": 0.0005, + "loss": 1.2382, + "step": 7030 + }, + { + "epoch": 0.2244969546222775, + "grad_norm": 0.4456654489040375, + "learning_rate": 0.0005, + "loss": 1.2436, + "step": 7040 + }, + { + "epoch": 0.2248158423419114, + "grad_norm": 0.432917058467865, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 7050 + }, + { + "epoch": 0.22513473006154533, + "grad_norm": 0.41865724325180054, + "learning_rate": 0.0005, + "loss": 1.253, + "step": 7060 + }, + { + "epoch": 0.22545361778117926, + "grad_norm": 0.4204801321029663, + "learning_rate": 0.0005, + "loss": 1.2539, + "step": 7070 + }, + { + "epoch": 0.22577250550081315, + "grad_norm": 0.41760504245758057, + "learning_rate": 0.0005, + "loss": 1.2786, + "step": 7080 + }, + { + "epoch": 0.22609139322044708, + "grad_norm": 0.421063631772995, + "learning_rate": 0.0005, + "loss": 1.2795, + "step": 7090 + }, + { + "epoch": 0.226410280940081, + "grad_norm": 0.46195104718208313, + "learning_rate": 0.0005, + "loss": 1.2607, + "step": 7100 + }, + { + "epoch": 0.2267291686597149, + "grad_norm": 0.43076223134994507, + "learning_rate": 0.0005, + "loss": 1.2782, + "step": 7110 + }, + { + "epoch": 0.22704805637934883, + "grad_norm": 0.40276387333869934, + "learning_rate": 0.0005, + "loss": 1.2509, + "step": 7120 + }, + { + "epoch": 0.22736694409898275, + "grad_norm": 0.41084590554237366, + "learning_rate": 0.0005, + "loss": 1.2662, + "step": 7130 + }, + { + "epoch": 0.22768583181861665, + "grad_norm": 0.4378090500831604, + "learning_rate": 0.0005, + "loss": 1.2637, + "step": 7140 + }, + { + "epoch": 0.22800471953825058, + "grad_norm": 0.42991676926612854, + "learning_rate": 0.0005, + "loss": 1.2598, + "step": 7150 + }, + { + "epoch": 0.2283236072578845, + "grad_norm": 0.4566963016986847, + "learning_rate": 0.0005, + "loss": 1.2652, + "step": 7160 + }, + { + "epoch": 0.22864249497751843, + "grad_norm": 0.4291926920413971, + "learning_rate": 0.0005, + "loss": 1.2762, + "step": 7170 + }, + { + "epoch": 0.22896138269715233, + "grad_norm": 0.4488089084625244, + "learning_rate": 0.0005, + "loss": 1.2427, + "step": 7180 + }, + { + "epoch": 0.22928027041678625, + "grad_norm": 0.4147830903530121, + "learning_rate": 0.0005, + "loss": 1.2498, + "step": 7190 + }, + { + "epoch": 0.22959915813642018, + "grad_norm": 0.44582098722457886, + "learning_rate": 0.0005, + "loss": 1.2518, + "step": 7200 + }, + { + "epoch": 0.22991804585605408, + "grad_norm": 0.41427379846572876, + "learning_rate": 0.0005, + "loss": 1.2484, + "step": 7210 + }, + { + "epoch": 0.230236933575688, + "grad_norm": 0.41920924186706543, + "learning_rate": 0.0005, + "loss": 1.2295, + "step": 7220 + }, + { + "epoch": 0.23055582129532193, + "grad_norm": 0.4119439721107483, + "learning_rate": 0.0005, + "loss": 1.2607, + "step": 7230 + }, + { + "epoch": 0.23087470901495583, + "grad_norm": 0.42621639370918274, + "learning_rate": 0.0005, + "loss": 1.269, + "step": 7240 + }, + { + "epoch": 0.23119359673458975, + "grad_norm": 0.42913585901260376, + "learning_rate": 0.0005, + "loss": 1.2534, + "step": 7250 + }, + { + "epoch": 0.23151248445422368, + "grad_norm": 0.43496614694595337, + "learning_rate": 0.0005, + "loss": 1.2442, + "step": 7260 + }, + { + "epoch": 0.23183137217385758, + "grad_norm": 0.4075222909450531, + "learning_rate": 0.0005, + "loss": 1.2465, + "step": 7270 + }, + { + "epoch": 0.2321502598934915, + "grad_norm": 0.4576127231121063, + "learning_rate": 0.0005, + "loss": 1.269, + "step": 7280 + }, + { + "epoch": 0.23246914761312543, + "grad_norm": 0.42538854479789734, + "learning_rate": 0.0005, + "loss": 1.2565, + "step": 7290 + }, + { + "epoch": 0.23278803533275932, + "grad_norm": 0.42042431235313416, + "learning_rate": 0.0005, + "loss": 1.251, + "step": 7300 + }, + { + "epoch": 0.23310692305239325, + "grad_norm": 0.4098883271217346, + "learning_rate": 0.0005, + "loss": 1.2665, + "step": 7310 + }, + { + "epoch": 0.23342581077202718, + "grad_norm": 0.43415921926498413, + "learning_rate": 0.0005, + "loss": 1.251, + "step": 7320 + }, + { + "epoch": 0.23374469849166107, + "grad_norm": 0.4317355155944824, + "learning_rate": 0.0005, + "loss": 1.2345, + "step": 7330 + }, + { + "epoch": 0.234063586211295, + "grad_norm": 0.4150109589099884, + "learning_rate": 0.0005, + "loss": 1.2605, + "step": 7340 + }, + { + "epoch": 0.23438247393092893, + "grad_norm": 0.39927196502685547, + "learning_rate": 0.0005, + "loss": 1.2532, + "step": 7350 + }, + { + "epoch": 0.23470136165056282, + "grad_norm": 0.41622141003608704, + "learning_rate": 0.0005, + "loss": 1.244, + "step": 7360 + }, + { + "epoch": 0.23502024937019675, + "grad_norm": 0.40569183230400085, + "learning_rate": 0.0005, + "loss": 1.2433, + "step": 7370 + }, + { + "epoch": 0.23533913708983067, + "grad_norm": 0.4543701410293579, + "learning_rate": 0.0005, + "loss": 1.2545, + "step": 7380 + }, + { + "epoch": 0.2356580248094646, + "grad_norm": 0.41043969988822937, + "learning_rate": 0.0005, + "loss": 1.254, + "step": 7390 + }, + { + "epoch": 0.2359769125290985, + "grad_norm": 0.4187447726726532, + "learning_rate": 0.0005, + "loss": 1.2401, + "step": 7400 + }, + { + "epoch": 0.23629580024873242, + "grad_norm": 0.4198475778102875, + "learning_rate": 0.0005, + "loss": 1.245, + "step": 7410 + }, + { + "epoch": 0.23661468796836635, + "grad_norm": 0.42908045649528503, + "learning_rate": 0.0005, + "loss": 1.2683, + "step": 7420 + }, + { + "epoch": 0.23693357568800025, + "grad_norm": 0.4434865117073059, + "learning_rate": 0.0005, + "loss": 1.229, + "step": 7430 + }, + { + "epoch": 0.23725246340763417, + "grad_norm": 0.4296889901161194, + "learning_rate": 0.0005, + "loss": 1.2417, + "step": 7440 + }, + { + "epoch": 0.2375713511272681, + "grad_norm": 0.42803293466567993, + "learning_rate": 0.0005, + "loss": 1.2622, + "step": 7450 + }, + { + "epoch": 0.237890238846902, + "grad_norm": 0.4073934853076935, + "learning_rate": 0.0005, + "loss": 1.2612, + "step": 7460 + }, + { + "epoch": 0.23820912656653592, + "grad_norm": 0.4153025448322296, + "learning_rate": 0.0005, + "loss": 1.2498, + "step": 7470 + }, + { + "epoch": 0.23852801428616985, + "grad_norm": 0.42079266905784607, + "learning_rate": 0.0005, + "loss": 1.2667, + "step": 7480 + }, + { + "epoch": 0.23884690200580375, + "grad_norm": 0.400680810213089, + "learning_rate": 0.0005, + "loss": 1.2381, + "step": 7490 + }, + { + "epoch": 0.23916578972543767, + "grad_norm": 0.41702139377593994, + "learning_rate": 0.0005, + "loss": 1.2692, + "step": 7500 + }, + { + "epoch": 0.2394846774450716, + "grad_norm": 0.4298720955848694, + "learning_rate": 0.0005, + "loss": 1.2466, + "step": 7510 + }, + { + "epoch": 0.2398035651647055, + "grad_norm": 0.3950182795524597, + "learning_rate": 0.0005, + "loss": 1.2521, + "step": 7520 + }, + { + "epoch": 0.24012245288433942, + "grad_norm": 0.3977687954902649, + "learning_rate": 0.0005, + "loss": 1.2533, + "step": 7530 + }, + { + "epoch": 0.24044134060397335, + "grad_norm": 0.40387430787086487, + "learning_rate": 0.0005, + "loss": 1.2465, + "step": 7540 + }, + { + "epoch": 0.24076022832360724, + "grad_norm": 0.4266010522842407, + "learning_rate": 0.0005, + "loss": 1.2316, + "step": 7550 + }, + { + "epoch": 0.24107911604324117, + "grad_norm": 0.3930842876434326, + "learning_rate": 0.0005, + "loss": 1.2391, + "step": 7560 + }, + { + "epoch": 0.2413980037628751, + "grad_norm": 0.41269761323928833, + "learning_rate": 0.0005, + "loss": 1.2631, + "step": 7570 + }, + { + "epoch": 0.24171689148250902, + "grad_norm": 0.4143700897693634, + "learning_rate": 0.0005, + "loss": 1.2409, + "step": 7580 + }, + { + "epoch": 0.24203577920214292, + "grad_norm": 0.40925079584121704, + "learning_rate": 0.0005, + "loss": 1.248, + "step": 7590 + }, + { + "epoch": 0.24235466692177685, + "grad_norm": 0.4155504107475281, + "learning_rate": 0.0005, + "loss": 1.2536, + "step": 7600 + }, + { + "epoch": 0.24267355464141077, + "grad_norm": 0.41415005922317505, + "learning_rate": 0.0005, + "loss": 1.2212, + "step": 7610 + }, + { + "epoch": 0.24299244236104467, + "grad_norm": 0.4322291612625122, + "learning_rate": 0.0005, + "loss": 1.2285, + "step": 7620 + }, + { + "epoch": 0.2433113300806786, + "grad_norm": 0.4126437306404114, + "learning_rate": 0.0005, + "loss": 1.2521, + "step": 7630 + }, + { + "epoch": 0.24363021780031252, + "grad_norm": 0.42201003432273865, + "learning_rate": 0.0005, + "loss": 1.2684, + "step": 7640 + }, + { + "epoch": 0.24394910551994642, + "grad_norm": 0.4242708086967468, + "learning_rate": 0.0005, + "loss": 1.2413, + "step": 7650 + }, + { + "epoch": 0.24426799323958034, + "grad_norm": 0.4040601849555969, + "learning_rate": 0.0005, + "loss": 1.2337, + "step": 7660 + }, + { + "epoch": 0.24458688095921427, + "grad_norm": 0.4113594889640808, + "learning_rate": 0.0005, + "loss": 1.2357, + "step": 7670 + }, + { + "epoch": 0.24490576867884817, + "grad_norm": 0.39296871423721313, + "learning_rate": 0.0005, + "loss": 1.2275, + "step": 7680 + }, + { + "epoch": 0.2452246563984821, + "grad_norm": 0.42583292722702026, + "learning_rate": 0.0005, + "loss": 1.2515, + "step": 7690 + }, + { + "epoch": 0.24554354411811602, + "grad_norm": 0.40814805030822754, + "learning_rate": 0.0005, + "loss": 1.2325, + "step": 7700 + }, + { + "epoch": 0.24586243183774992, + "grad_norm": 0.40266889333724976, + "learning_rate": 0.0005, + "loss": 1.2177, + "step": 7710 + }, + { + "epoch": 0.24618131955738384, + "grad_norm": 0.4071027636528015, + "learning_rate": 0.0005, + "loss": 1.2353, + "step": 7720 + }, + { + "epoch": 0.24650020727701777, + "grad_norm": 0.4114187955856323, + "learning_rate": 0.0005, + "loss": 1.2439, + "step": 7730 + }, + { + "epoch": 0.24681909499665167, + "grad_norm": 0.41335058212280273, + "learning_rate": 0.0005, + "loss": 1.2357, + "step": 7740 + }, + { + "epoch": 0.2471379827162856, + "grad_norm": 0.41387081146240234, + "learning_rate": 0.0005, + "loss": 1.2565, + "step": 7750 + }, + { + "epoch": 0.24745687043591952, + "grad_norm": 0.40741318464279175, + "learning_rate": 0.0005, + "loss": 1.2342, + "step": 7760 + }, + { + "epoch": 0.24777575815555342, + "grad_norm": 0.4092518091201782, + "learning_rate": 0.0005, + "loss": 1.2448, + "step": 7770 + }, + { + "epoch": 0.24809464587518734, + "grad_norm": 0.40899863839149475, + "learning_rate": 0.0005, + "loss": 1.2185, + "step": 7780 + }, + { + "epoch": 0.24841353359482127, + "grad_norm": 0.42302390933036804, + "learning_rate": 0.0005, + "loss": 1.2528, + "step": 7790 + }, + { + "epoch": 0.2487324213144552, + "grad_norm": 0.41404077410697937, + "learning_rate": 0.0005, + "loss": 1.2351, + "step": 7800 + }, + { + "epoch": 0.2490513090340891, + "grad_norm": 0.4217240512371063, + "learning_rate": 0.0005, + "loss": 1.2472, + "step": 7810 + }, + { + "epoch": 0.24937019675372302, + "grad_norm": 0.4106791019439697, + "learning_rate": 0.0005, + "loss": 1.2335, + "step": 7820 + }, + { + "epoch": 0.24968908447335694, + "grad_norm": 0.39354920387268066, + "learning_rate": 0.0005, + "loss": 1.2208, + "step": 7830 + }, + { + "epoch": 0.25000797219299087, + "grad_norm": 0.4039877951145172, + "learning_rate": 0.0005, + "loss": 1.2217, + "step": 7840 + }, + { + "epoch": 0.25032685991262477, + "grad_norm": 0.4242667853832245, + "learning_rate": 0.0005, + "loss": 1.2093, + "step": 7850 + }, + { + "epoch": 0.25064574763225866, + "grad_norm": 0.4013097584247589, + "learning_rate": 0.0005, + "loss": 1.2115, + "step": 7860 + }, + { + "epoch": 0.2509646353518926, + "grad_norm": 0.4191020727157593, + "learning_rate": 0.0005, + "loss": 1.22, + "step": 7870 + }, + { + "epoch": 0.2512835230715265, + "grad_norm": 0.39803019165992737, + "learning_rate": 0.0005, + "loss": 1.2185, + "step": 7880 + }, + { + "epoch": 0.2516024107911604, + "grad_norm": 0.41739556193351746, + "learning_rate": 0.0005, + "loss": 1.2619, + "step": 7890 + }, + { + "epoch": 0.25192129851079437, + "grad_norm": 0.3922768533229828, + "learning_rate": 0.0005, + "loss": 1.2372, + "step": 7900 + }, + { + "epoch": 0.25224018623042826, + "grad_norm": 0.4070080816745758, + "learning_rate": 0.0005, + "loss": 1.2311, + "step": 7910 + }, + { + "epoch": 0.25255907395006216, + "grad_norm": 0.4060456454753876, + "learning_rate": 0.0005, + "loss": 1.2383, + "step": 7920 + }, + { + "epoch": 0.2528779616696961, + "grad_norm": 0.42530322074890137, + "learning_rate": 0.0005, + "loss": 1.2259, + "step": 7930 + }, + { + "epoch": 0.25319684938933, + "grad_norm": 0.41797924041748047, + "learning_rate": 0.0005, + "loss": 1.228, + "step": 7940 + }, + { + "epoch": 0.2535157371089639, + "grad_norm": 0.3849104940891266, + "learning_rate": 0.0005, + "loss": 1.2296, + "step": 7950 + }, + { + "epoch": 0.25383462482859787, + "grad_norm": 0.41749095916748047, + "learning_rate": 0.0005, + "loss": 1.2441, + "step": 7960 + }, + { + "epoch": 0.25415351254823176, + "grad_norm": 0.40891966223716736, + "learning_rate": 0.0005, + "loss": 1.2323, + "step": 7970 + }, + { + "epoch": 0.25447240026786566, + "grad_norm": 0.42078697681427, + "learning_rate": 0.0005, + "loss": 1.2602, + "step": 7980 + }, + { + "epoch": 0.2547912879874996, + "grad_norm": 0.4090920686721802, + "learning_rate": 0.0005, + "loss": 1.2506, + "step": 7990 + }, + { + "epoch": 0.2551101757071335, + "grad_norm": 0.41705599427223206, + "learning_rate": 0.0005, + "loss": 1.2238, + "step": 8000 + }, + { + "epoch": 0.2554290634267674, + "grad_norm": 0.40100833773612976, + "learning_rate": 0.0005, + "loss": 1.2228, + "step": 8010 + }, + { + "epoch": 0.25574795114640136, + "grad_norm": 0.396261602640152, + "learning_rate": 0.0005, + "loss": 1.2328, + "step": 8020 + }, + { + "epoch": 0.25606683886603526, + "grad_norm": 0.3865589499473572, + "learning_rate": 0.0005, + "loss": 1.2191, + "step": 8030 + }, + { + "epoch": 0.25638572658566916, + "grad_norm": 0.4185122549533844, + "learning_rate": 0.0005, + "loss": 1.2351, + "step": 8040 + }, + { + "epoch": 0.2567046143053031, + "grad_norm": 0.40291154384613037, + "learning_rate": 0.0005, + "loss": 1.2455, + "step": 8050 + }, + { + "epoch": 0.257023502024937, + "grad_norm": 0.4228131175041199, + "learning_rate": 0.0005, + "loss": 1.2309, + "step": 8060 + }, + { + "epoch": 0.2573423897445709, + "grad_norm": 0.4108589291572571, + "learning_rate": 0.0005, + "loss": 1.2433, + "step": 8070 + }, + { + "epoch": 0.25766127746420486, + "grad_norm": 0.4041846692562103, + "learning_rate": 0.0005, + "loss": 1.2269, + "step": 8080 + }, + { + "epoch": 0.25798016518383876, + "grad_norm": 0.40096405148506165, + "learning_rate": 0.0005, + "loss": 1.2629, + "step": 8090 + }, + { + "epoch": 0.2582990529034727, + "grad_norm": 0.39418691396713257, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 8100 + }, + { + "epoch": 0.2586179406231066, + "grad_norm": 0.42769312858581543, + "learning_rate": 0.0005, + "loss": 1.2407, + "step": 8110 + }, + { + "epoch": 0.2589368283427405, + "grad_norm": 0.40227988362312317, + "learning_rate": 0.0005, + "loss": 1.2128, + "step": 8120 + }, + { + "epoch": 0.25925571606237446, + "grad_norm": 0.388641893863678, + "learning_rate": 0.0005, + "loss": 1.2208, + "step": 8130 + }, + { + "epoch": 0.25957460378200836, + "grad_norm": 0.39878180623054504, + "learning_rate": 0.0005, + "loss": 1.1998, + "step": 8140 + }, + { + "epoch": 0.25989349150164226, + "grad_norm": 0.3936919569969177, + "learning_rate": 0.0005, + "loss": 1.2275, + "step": 8150 + }, + { + "epoch": 0.2602123792212762, + "grad_norm": 0.40643203258514404, + "learning_rate": 0.0005, + "loss": 1.2206, + "step": 8160 + }, + { + "epoch": 0.2605312669409101, + "grad_norm": 0.40189582109451294, + "learning_rate": 0.0005, + "loss": 1.212, + "step": 8170 + }, + { + "epoch": 0.260850154660544, + "grad_norm": 0.4173663556575775, + "learning_rate": 0.0005, + "loss": 1.2346, + "step": 8180 + }, + { + "epoch": 0.26116904238017796, + "grad_norm": 0.39820632338523865, + "learning_rate": 0.0005, + "loss": 1.2181, + "step": 8190 + }, + { + "epoch": 0.26148793009981186, + "grad_norm": 0.40434184670448303, + "learning_rate": 0.0005, + "loss": 1.2237, + "step": 8200 + }, + { + "epoch": 0.26180681781944576, + "grad_norm": 0.3958496153354645, + "learning_rate": 0.0005, + "loss": 1.2447, + "step": 8210 + }, + { + "epoch": 0.2621257055390797, + "grad_norm": 0.40260443091392517, + "learning_rate": 0.0005, + "loss": 1.2424, + "step": 8220 + }, + { + "epoch": 0.2624445932587136, + "grad_norm": 0.40898144245147705, + "learning_rate": 0.0005, + "loss": 1.2212, + "step": 8230 + }, + { + "epoch": 0.2627634809783475, + "grad_norm": 0.43052124977111816, + "learning_rate": 0.0005, + "loss": 1.2143, + "step": 8240 + }, + { + "epoch": 0.26308236869798146, + "grad_norm": 0.3874588906764984, + "learning_rate": 0.0005, + "loss": 1.2101, + "step": 8250 + }, + { + "epoch": 0.26340125641761536, + "grad_norm": 0.3905537724494934, + "learning_rate": 0.0005, + "loss": 1.2357, + "step": 8260 + }, + { + "epoch": 0.26372014413724926, + "grad_norm": 0.3907853662967682, + "learning_rate": 0.0005, + "loss": 1.2286, + "step": 8270 + }, + { + "epoch": 0.2640390318568832, + "grad_norm": 0.3982638716697693, + "learning_rate": 0.0005, + "loss": 1.2296, + "step": 8280 + }, + { + "epoch": 0.2643579195765171, + "grad_norm": 0.4627874195575714, + "learning_rate": 0.0005, + "loss": 1.2305, + "step": 8290 + }, + { + "epoch": 0.264676807296151, + "grad_norm": 0.40264928340911865, + "learning_rate": 0.0005, + "loss": 1.2188, + "step": 8300 + }, + { + "epoch": 0.26499569501578496, + "grad_norm": 0.3911851942539215, + "learning_rate": 0.0005, + "loss": 1.2174, + "step": 8310 + }, + { + "epoch": 0.26531458273541886, + "grad_norm": 0.37752363085746765, + "learning_rate": 0.0005, + "loss": 1.21, + "step": 8320 + }, + { + "epoch": 0.26563347045505276, + "grad_norm": 0.42199021577835083, + "learning_rate": 0.0005, + "loss": 1.2042, + "step": 8330 + }, + { + "epoch": 0.2659523581746867, + "grad_norm": 0.41406044363975525, + "learning_rate": 0.0005, + "loss": 1.2338, + "step": 8340 + }, + { + "epoch": 0.2662712458943206, + "grad_norm": 0.42696836590766907, + "learning_rate": 0.0005, + "loss": 1.2209, + "step": 8350 + }, + { + "epoch": 0.2665901336139545, + "grad_norm": 0.39001408219337463, + "learning_rate": 0.0005, + "loss": 1.2496, + "step": 8360 + }, + { + "epoch": 0.26690902133358846, + "grad_norm": 0.4086891710758209, + "learning_rate": 0.0005, + "loss": 1.2394, + "step": 8370 + }, + { + "epoch": 0.26722790905322236, + "grad_norm": 0.3911346197128296, + "learning_rate": 0.0005, + "loss": 1.2266, + "step": 8380 + }, + { + "epoch": 0.26754679677285625, + "grad_norm": 0.39909061789512634, + "learning_rate": 0.0005, + "loss": 1.2261, + "step": 8390 + }, + { + "epoch": 0.2678656844924902, + "grad_norm": 0.4086940288543701, + "learning_rate": 0.0005, + "loss": 1.2203, + "step": 8400 + }, + { + "epoch": 0.2681845722121241, + "grad_norm": 0.40768542885780334, + "learning_rate": 0.0005, + "loss": 1.215, + "step": 8410 + }, + { + "epoch": 0.268503459931758, + "grad_norm": 0.39183276891708374, + "learning_rate": 0.0005, + "loss": 1.2101, + "step": 8420 + }, + { + "epoch": 0.26882234765139196, + "grad_norm": 0.383931040763855, + "learning_rate": 0.0005, + "loss": 1.2115, + "step": 8430 + }, + { + "epoch": 0.26914123537102586, + "grad_norm": 0.39705631136894226, + "learning_rate": 0.0005, + "loss": 1.2093, + "step": 8440 + }, + { + "epoch": 0.26946012309065975, + "grad_norm": 0.39195719361305237, + "learning_rate": 0.0005, + "loss": 1.2219, + "step": 8450 + }, + { + "epoch": 0.2697790108102937, + "grad_norm": 0.40420153737068176, + "learning_rate": 0.0005, + "loss": 1.2152, + "step": 8460 + }, + { + "epoch": 0.2700978985299276, + "grad_norm": 0.4257016181945801, + "learning_rate": 0.0005, + "loss": 1.2303, + "step": 8470 + }, + { + "epoch": 0.2704167862495615, + "grad_norm": 0.40116095542907715, + "learning_rate": 0.0005, + "loss": 1.2122, + "step": 8480 + }, + { + "epoch": 0.27073567396919546, + "grad_norm": 0.42069900035858154, + "learning_rate": 0.0005, + "loss": 1.2106, + "step": 8490 + }, + { + "epoch": 0.27105456168882935, + "grad_norm": 0.38822034001350403, + "learning_rate": 0.0005, + "loss": 1.2378, + "step": 8500 + }, + { + "epoch": 0.2713734494084633, + "grad_norm": 0.38975998759269714, + "learning_rate": 0.0005, + "loss": 1.2138, + "step": 8510 + }, + { + "epoch": 0.2716923371280972, + "grad_norm": 0.4007030129432678, + "learning_rate": 0.0005, + "loss": 1.2051, + "step": 8520 + }, + { + "epoch": 0.2720112248477311, + "grad_norm": 0.39984530210494995, + "learning_rate": 0.0005, + "loss": 1.2053, + "step": 8530 + }, + { + "epoch": 0.27233011256736506, + "grad_norm": 0.3883681893348694, + "learning_rate": 0.0005, + "loss": 1.2196, + "step": 8540 + }, + { + "epoch": 0.27264900028699895, + "grad_norm": 0.40165334939956665, + "learning_rate": 0.0005, + "loss": 1.2413, + "step": 8550 + }, + { + "epoch": 0.27296788800663285, + "grad_norm": 0.3896324634552002, + "learning_rate": 0.0005, + "loss": 1.2273, + "step": 8560 + }, + { + "epoch": 0.2732867757262668, + "grad_norm": 0.3914828896522522, + "learning_rate": 0.0005, + "loss": 1.2383, + "step": 8570 + }, + { + "epoch": 0.2736056634459007, + "grad_norm": 0.39543914794921875, + "learning_rate": 0.0005, + "loss": 1.2028, + "step": 8580 + }, + { + "epoch": 0.2739245511655346, + "grad_norm": 0.38763898611068726, + "learning_rate": 0.0005, + "loss": 1.2109, + "step": 8590 + }, + { + "epoch": 0.27424343888516856, + "grad_norm": 0.37673720717430115, + "learning_rate": 0.0005, + "loss": 1.2265, + "step": 8600 + }, + { + "epoch": 0.27456232660480245, + "grad_norm": 0.3957275152206421, + "learning_rate": 0.0005, + "loss": 1.2341, + "step": 8610 + }, + { + "epoch": 0.27488121432443635, + "grad_norm": 0.4029931426048279, + "learning_rate": 0.0005, + "loss": 1.1993, + "step": 8620 + }, + { + "epoch": 0.2752001020440703, + "grad_norm": 0.3820151686668396, + "learning_rate": 0.0005, + "loss": 1.2055, + "step": 8630 + }, + { + "epoch": 0.2755189897637042, + "grad_norm": 0.39909201860427856, + "learning_rate": 0.0005, + "loss": 1.2057, + "step": 8640 + }, + { + "epoch": 0.2758378774833381, + "grad_norm": 0.3830402195453644, + "learning_rate": 0.0005, + "loss": 1.2292, + "step": 8650 + }, + { + "epoch": 0.27615676520297205, + "grad_norm": 0.38620173931121826, + "learning_rate": 0.0005, + "loss": 1.2111, + "step": 8660 + }, + { + "epoch": 0.27647565292260595, + "grad_norm": 0.39842554926872253, + "learning_rate": 0.0005, + "loss": 1.2183, + "step": 8670 + }, + { + "epoch": 0.27679454064223985, + "grad_norm": 0.3806487023830414, + "learning_rate": 0.0005, + "loss": 1.2022, + "step": 8680 + }, + { + "epoch": 0.2771134283618738, + "grad_norm": 0.4076787829399109, + "learning_rate": 0.0005, + "loss": 1.2007, + "step": 8690 + }, + { + "epoch": 0.2774323160815077, + "grad_norm": 0.3915601968765259, + "learning_rate": 0.0005, + "loss": 1.2146, + "step": 8700 + }, + { + "epoch": 0.2777512038011416, + "grad_norm": 0.38058754801750183, + "learning_rate": 0.0005, + "loss": 1.2189, + "step": 8710 + }, + { + "epoch": 0.27807009152077555, + "grad_norm": 0.39291903376579285, + "learning_rate": 0.0005, + "loss": 1.2177, + "step": 8720 + }, + { + "epoch": 0.27838897924040945, + "grad_norm": 0.3893909454345703, + "learning_rate": 0.0005, + "loss": 1.1909, + "step": 8730 + }, + { + "epoch": 0.27870786696004335, + "grad_norm": 0.400829017162323, + "learning_rate": 0.0005, + "loss": 1.212, + "step": 8740 + }, + { + "epoch": 0.2790267546796773, + "grad_norm": 0.40721598267555237, + "learning_rate": 0.0005, + "loss": 1.2168, + "step": 8750 + }, + { + "epoch": 0.2793456423993112, + "grad_norm": 0.4186878204345703, + "learning_rate": 0.0005, + "loss": 1.2119, + "step": 8760 + }, + { + "epoch": 0.2796645301189451, + "grad_norm": 0.3906959593296051, + "learning_rate": 0.0005, + "loss": 1.217, + "step": 8770 + }, + { + "epoch": 0.27998341783857905, + "grad_norm": 0.3925018906593323, + "learning_rate": 0.0005, + "loss": 1.221, + "step": 8780 + }, + { + "epoch": 0.28030230555821295, + "grad_norm": 0.39244839549064636, + "learning_rate": 0.0005, + "loss": 1.2088, + "step": 8790 + }, + { + "epoch": 0.28062119327784685, + "grad_norm": 0.3988844156265259, + "learning_rate": 0.0005, + "loss": 1.1995, + "step": 8800 + }, + { + "epoch": 0.2809400809974808, + "grad_norm": 0.38632071018218994, + "learning_rate": 0.0005, + "loss": 1.2249, + "step": 8810 + }, + { + "epoch": 0.2812589687171147, + "grad_norm": 0.3827994167804718, + "learning_rate": 0.0005, + "loss": 1.2106, + "step": 8820 + }, + { + "epoch": 0.2815778564367486, + "grad_norm": 0.4094688594341278, + "learning_rate": 0.0005, + "loss": 1.2355, + "step": 8830 + }, + { + "epoch": 0.28189674415638255, + "grad_norm": 0.3843954801559448, + "learning_rate": 0.0005, + "loss": 1.2099, + "step": 8840 + }, + { + "epoch": 0.28221563187601645, + "grad_norm": 0.3869307041168213, + "learning_rate": 0.0005, + "loss": 1.2141, + "step": 8850 + }, + { + "epoch": 0.28253451959565035, + "grad_norm": 0.38374051451683044, + "learning_rate": 0.0005, + "loss": 1.2162, + "step": 8860 + }, + { + "epoch": 0.2828534073152843, + "grad_norm": 0.39685219526290894, + "learning_rate": 0.0005, + "loss": 1.2, + "step": 8870 + }, + { + "epoch": 0.2831722950349182, + "grad_norm": 0.3965778052806854, + "learning_rate": 0.0005, + "loss": 1.2194, + "step": 8880 + }, + { + "epoch": 0.2834911827545521, + "grad_norm": 0.390626460313797, + "learning_rate": 0.0005, + "loss": 1.1979, + "step": 8890 + }, + { + "epoch": 0.28381007047418605, + "grad_norm": 0.40536853671073914, + "learning_rate": 0.0005, + "loss": 1.2207, + "step": 8900 + }, + { + "epoch": 0.28412895819381995, + "grad_norm": 0.40149563550949097, + "learning_rate": 0.0005, + "loss": 1.2049, + "step": 8910 + }, + { + "epoch": 0.2844478459134539, + "grad_norm": 0.40702196955680847, + "learning_rate": 0.0005, + "loss": 1.205, + "step": 8920 + }, + { + "epoch": 0.2847667336330878, + "grad_norm": 0.3878912925720215, + "learning_rate": 0.0005, + "loss": 1.2151, + "step": 8930 + }, + { + "epoch": 0.2850856213527217, + "grad_norm": 0.407080739736557, + "learning_rate": 0.0005, + "loss": 1.2083, + "step": 8940 + }, + { + "epoch": 0.28540450907235565, + "grad_norm": 0.40740883350372314, + "learning_rate": 0.0005, + "loss": 1.2198, + "step": 8950 + }, + { + "epoch": 0.28572339679198955, + "grad_norm": 0.4337395131587982, + "learning_rate": 0.0005, + "loss": 1.2008, + "step": 8960 + }, + { + "epoch": 0.28604228451162345, + "grad_norm": 0.3903014063835144, + "learning_rate": 0.0005, + "loss": 1.2072, + "step": 8970 + }, + { + "epoch": 0.2863611722312574, + "grad_norm": 0.38654598593711853, + "learning_rate": 0.0005, + "loss": 1.2098, + "step": 8980 + }, + { + "epoch": 0.2866800599508913, + "grad_norm": 0.3934592306613922, + "learning_rate": 0.0005, + "loss": 1.2082, + "step": 8990 + }, + { + "epoch": 0.2869989476705252, + "grad_norm": 0.3974146246910095, + "learning_rate": 0.0005, + "loss": 1.234, + "step": 9000 + }, + { + "epoch": 0.28731783539015915, + "grad_norm": 0.39898407459259033, + "learning_rate": 0.0005, + "loss": 1.1914, + "step": 9010 + }, + { + "epoch": 0.28763672310979305, + "grad_norm": 0.39268526434898376, + "learning_rate": 0.0005, + "loss": 1.2091, + "step": 9020 + }, + { + "epoch": 0.28795561082942694, + "grad_norm": 0.3900041878223419, + "learning_rate": 0.0005, + "loss": 1.207, + "step": 9030 + }, + { + "epoch": 0.2882744985490609, + "grad_norm": 0.3990797996520996, + "learning_rate": 0.0005, + "loss": 1.2143, + "step": 9040 + }, + { + "epoch": 0.2885933862686948, + "grad_norm": 0.37181395292282104, + "learning_rate": 0.0005, + "loss": 1.1998, + "step": 9050 + }, + { + "epoch": 0.2889122739883287, + "grad_norm": 0.3809095323085785, + "learning_rate": 0.0005, + "loss": 1.1981, + "step": 9060 + }, + { + "epoch": 0.28923116170796265, + "grad_norm": 0.3911319673061371, + "learning_rate": 0.0005, + "loss": 1.2012, + "step": 9070 + }, + { + "epoch": 0.28955004942759655, + "grad_norm": 0.40503358840942383, + "learning_rate": 0.0005, + "loss": 1.2069, + "step": 9080 + }, + { + "epoch": 0.28986893714723044, + "grad_norm": 0.40217137336730957, + "learning_rate": 0.0005, + "loss": 1.2307, + "step": 9090 + }, + { + "epoch": 0.2901878248668644, + "grad_norm": 0.39365431666374207, + "learning_rate": 0.0005, + "loss": 1.2027, + "step": 9100 + }, + { + "epoch": 0.2905067125864983, + "grad_norm": 0.3819471597671509, + "learning_rate": 0.0005, + "loss": 1.2018, + "step": 9110 + }, + { + "epoch": 0.2908256003061322, + "grad_norm": 0.41711509227752686, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 9120 + }, + { + "epoch": 0.29114448802576615, + "grad_norm": 0.38931921124458313, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 9130 + }, + { + "epoch": 0.29146337574540004, + "grad_norm": 0.4078792929649353, + "learning_rate": 0.0005, + "loss": 1.1975, + "step": 9140 + }, + { + "epoch": 0.29178226346503394, + "grad_norm": 0.3806973397731781, + "learning_rate": 0.0005, + "loss": 1.193, + "step": 9150 + }, + { + "epoch": 0.2921011511846679, + "grad_norm": 0.37954655289649963, + "learning_rate": 0.0005, + "loss": 1.2152, + "step": 9160 + }, + { + "epoch": 0.2924200389043018, + "grad_norm": 0.38385963439941406, + "learning_rate": 0.0005, + "loss": 1.2195, + "step": 9170 + }, + { + "epoch": 0.2927389266239357, + "grad_norm": 0.3808298408985138, + "learning_rate": 0.0005, + "loss": 1.2128, + "step": 9180 + }, + { + "epoch": 0.29305781434356964, + "grad_norm": 0.3786800503730774, + "learning_rate": 0.0005, + "loss": 1.1958, + "step": 9190 + }, + { + "epoch": 0.29337670206320354, + "grad_norm": 0.3799290359020233, + "learning_rate": 0.0005, + "loss": 1.2006, + "step": 9200 + }, + { + "epoch": 0.29369558978283744, + "grad_norm": 0.37467071413993835, + "learning_rate": 0.0005, + "loss": 1.1909, + "step": 9210 + }, + { + "epoch": 0.2940144775024714, + "grad_norm": 0.3991237282752991, + "learning_rate": 0.0005, + "loss": 1.1974, + "step": 9220 + }, + { + "epoch": 0.2943333652221053, + "grad_norm": 0.38379865884780884, + "learning_rate": 0.0005, + "loss": 1.2086, + "step": 9230 + }, + { + "epoch": 0.2946522529417392, + "grad_norm": 0.3732675313949585, + "learning_rate": 0.0005, + "loss": 1.1895, + "step": 9240 + }, + { + "epoch": 0.29497114066137314, + "grad_norm": 0.3875043988227844, + "learning_rate": 0.0005, + "loss": 1.2025, + "step": 9250 + }, + { + "epoch": 0.29529002838100704, + "grad_norm": 0.41846537590026855, + "learning_rate": 0.0005, + "loss": 1.1986, + "step": 9260 + }, + { + "epoch": 0.29560891610064094, + "grad_norm": 0.37385421991348267, + "learning_rate": 0.0005, + "loss": 1.1868, + "step": 9270 + }, + { + "epoch": 0.2959278038202749, + "grad_norm": 0.3830406069755554, + "learning_rate": 0.0005, + "loss": 1.198, + "step": 9280 + }, + { + "epoch": 0.2962466915399088, + "grad_norm": 0.3768352270126343, + "learning_rate": 0.0005, + "loss": 1.1808, + "step": 9290 + }, + { + "epoch": 0.2965655792595427, + "grad_norm": 0.37796854972839355, + "learning_rate": 0.0005, + "loss": 1.2005, + "step": 9300 + }, + { + "epoch": 0.29688446697917664, + "grad_norm": 0.3770412802696228, + "learning_rate": 0.0005, + "loss": 1.2082, + "step": 9310 + }, + { + "epoch": 0.29720335469881054, + "grad_norm": 0.37880739569664, + "learning_rate": 0.0005, + "loss": 1.1779, + "step": 9320 + }, + { + "epoch": 0.29752224241844444, + "grad_norm": 0.38096004724502563, + "learning_rate": 0.0005, + "loss": 1.2064, + "step": 9330 + }, + { + "epoch": 0.2978411301380784, + "grad_norm": 0.39088889956474304, + "learning_rate": 0.0005, + "loss": 1.1833, + "step": 9340 + }, + { + "epoch": 0.2981600178577123, + "grad_norm": 0.3733189105987549, + "learning_rate": 0.0005, + "loss": 1.1898, + "step": 9350 + }, + { + "epoch": 0.29847890557734624, + "grad_norm": 0.3921329081058502, + "learning_rate": 0.0005, + "loss": 1.1951, + "step": 9360 + }, + { + "epoch": 0.29879779329698014, + "grad_norm": 0.38713183999061584, + "learning_rate": 0.0005, + "loss": 1.2039, + "step": 9370 + }, + { + "epoch": 0.29911668101661404, + "grad_norm": 0.3787401616573334, + "learning_rate": 0.0005, + "loss": 1.2016, + "step": 9380 + }, + { + "epoch": 0.299435568736248, + "grad_norm": 0.36601531505584717, + "learning_rate": 0.0005, + "loss": 1.1808, + "step": 9390 + }, + { + "epoch": 0.2997544564558819, + "grad_norm": 0.383737176656723, + "learning_rate": 0.0005, + "loss": 1.2, + "step": 9400 + }, + { + "epoch": 0.3000733441755158, + "grad_norm": 0.3672783076763153, + "learning_rate": 0.0005, + "loss": 1.2034, + "step": 9410 + }, + { + "epoch": 0.30039223189514974, + "grad_norm": 0.37648218870162964, + "learning_rate": 0.0005, + "loss": 1.1984, + "step": 9420 + }, + { + "epoch": 0.30071111961478364, + "grad_norm": 0.39211979508399963, + "learning_rate": 0.0005, + "loss": 1.1931, + "step": 9430 + }, + { + "epoch": 0.30103000733441754, + "grad_norm": 0.4010789394378662, + "learning_rate": 0.0005, + "loss": 1.2021, + "step": 9440 + }, + { + "epoch": 0.3013488950540515, + "grad_norm": 0.37380537390708923, + "learning_rate": 0.0005, + "loss": 1.2043, + "step": 9450 + }, + { + "epoch": 0.3016677827736854, + "grad_norm": 0.399393230676651, + "learning_rate": 0.0005, + "loss": 1.2053, + "step": 9460 + }, + { + "epoch": 0.3019866704933193, + "grad_norm": 0.3876519203186035, + "learning_rate": 0.0005, + "loss": 1.2181, + "step": 9470 + }, + { + "epoch": 0.30230555821295324, + "grad_norm": 0.3972403109073639, + "learning_rate": 0.0005, + "loss": 1.1972, + "step": 9480 + }, + { + "epoch": 0.30262444593258714, + "grad_norm": 0.383587121963501, + "learning_rate": 0.0005, + "loss": 1.1982, + "step": 9490 + }, + { + "epoch": 0.30294333365222104, + "grad_norm": 0.38184329867362976, + "learning_rate": 0.0005, + "loss": 1.2009, + "step": 9500 + }, + { + "epoch": 0.303262221371855, + "grad_norm": 0.3812735378742218, + "learning_rate": 0.0005, + "loss": 1.1845, + "step": 9510 + }, + { + "epoch": 0.3035811090914889, + "grad_norm": 0.3952953815460205, + "learning_rate": 0.0005, + "loss": 1.193, + "step": 9520 + }, + { + "epoch": 0.3038999968111228, + "grad_norm": 0.39169639348983765, + "learning_rate": 0.0005, + "loss": 1.1801, + "step": 9530 + }, + { + "epoch": 0.30421888453075674, + "grad_norm": 0.3926263749599457, + "learning_rate": 0.0005, + "loss": 1.1917, + "step": 9540 + }, + { + "epoch": 0.30453777225039064, + "grad_norm": 0.38617074489593506, + "learning_rate": 0.0005, + "loss": 1.1705, + "step": 9550 + }, + { + "epoch": 0.30485665997002453, + "grad_norm": 0.37101539969444275, + "learning_rate": 0.0005, + "loss": 1.1953, + "step": 9560 + }, + { + "epoch": 0.3051755476896585, + "grad_norm": 0.3760269582271576, + "learning_rate": 0.0005, + "loss": 1.1884, + "step": 9570 + }, + { + "epoch": 0.3054944354092924, + "grad_norm": 0.37347906827926636, + "learning_rate": 0.0005, + "loss": 1.2076, + "step": 9580 + }, + { + "epoch": 0.3058133231289263, + "grad_norm": 0.3571949601173401, + "learning_rate": 0.0005, + "loss": 1.1914, + "step": 9590 + }, + { + "epoch": 0.30613221084856024, + "grad_norm": 0.3888949751853943, + "learning_rate": 0.0005, + "loss": 1.1841, + "step": 9600 + }, + { + "epoch": 0.30645109856819414, + "grad_norm": 0.3792676627635956, + "learning_rate": 0.0005, + "loss": 1.1905, + "step": 9610 + }, + { + "epoch": 0.30676998628782803, + "grad_norm": 0.38213565945625305, + "learning_rate": 0.0005, + "loss": 1.1943, + "step": 9620 + }, + { + "epoch": 0.307088874007462, + "grad_norm": 0.38365045189857483, + "learning_rate": 0.0005, + "loss": 1.1645, + "step": 9630 + }, + { + "epoch": 0.3074077617270959, + "grad_norm": 0.37328922748565674, + "learning_rate": 0.0005, + "loss": 1.198, + "step": 9640 + }, + { + "epoch": 0.3077266494467298, + "grad_norm": 0.3832704424858093, + "learning_rate": 0.0005, + "loss": 1.1884, + "step": 9650 + }, + { + "epoch": 0.30804553716636374, + "grad_norm": 0.3734091818332672, + "learning_rate": 0.0005, + "loss": 1.1795, + "step": 9660 + }, + { + "epoch": 0.30836442488599763, + "grad_norm": 0.4023909568786621, + "learning_rate": 0.0005, + "loss": 1.1808, + "step": 9670 + }, + { + "epoch": 0.30868331260563153, + "grad_norm": 0.3570982813835144, + "learning_rate": 0.0005, + "loss": 1.1923, + "step": 9680 + }, + { + "epoch": 0.3090022003252655, + "grad_norm": 0.3908848166465759, + "learning_rate": 0.0005, + "loss": 1.2007, + "step": 9690 + }, + { + "epoch": 0.3093210880448994, + "grad_norm": 0.36542585492134094, + "learning_rate": 0.0005, + "loss": 1.1934, + "step": 9700 + }, + { + "epoch": 0.3096399757645333, + "grad_norm": 0.3709806799888611, + "learning_rate": 0.0005, + "loss": 1.2026, + "step": 9710 + }, + { + "epoch": 0.30995886348416724, + "grad_norm": 0.41271722316741943, + "learning_rate": 0.0005, + "loss": 1.2015, + "step": 9720 + }, + { + "epoch": 0.31027775120380113, + "grad_norm": 0.40473729372024536, + "learning_rate": 0.0005, + "loss": 1.2006, + "step": 9730 + }, + { + "epoch": 0.31059663892343503, + "grad_norm": 0.38929319381713867, + "learning_rate": 0.0005, + "loss": 1.1976, + "step": 9740 + }, + { + "epoch": 0.310915526643069, + "grad_norm": 0.3830064833164215, + "learning_rate": 0.0005, + "loss": 1.1798, + "step": 9750 + }, + { + "epoch": 0.3112344143627029, + "grad_norm": 0.3693712651729584, + "learning_rate": 0.0005, + "loss": 1.1867, + "step": 9760 + }, + { + "epoch": 0.31155330208233684, + "grad_norm": 0.3764219880104065, + "learning_rate": 0.0005, + "loss": 1.2072, + "step": 9770 + }, + { + "epoch": 0.31187218980197073, + "grad_norm": 0.3684931993484497, + "learning_rate": 0.0005, + "loss": 1.175, + "step": 9780 + }, + { + "epoch": 0.31219107752160463, + "grad_norm": 0.3662542402744293, + "learning_rate": 0.0005, + "loss": 1.1829, + "step": 9790 + }, + { + "epoch": 0.3125099652412386, + "grad_norm": 0.3679318428039551, + "learning_rate": 0.0005, + "loss": 1.1931, + "step": 9800 + }, + { + "epoch": 0.3128288529608725, + "grad_norm": 0.36969873309135437, + "learning_rate": 0.0005, + "loss": 1.1973, + "step": 9810 + }, + { + "epoch": 0.3131477406805064, + "grad_norm": 0.3816441595554352, + "learning_rate": 0.0005, + "loss": 1.1991, + "step": 9820 + }, + { + "epoch": 0.31346662840014033, + "grad_norm": 0.3715498149394989, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 9830 + }, + { + "epoch": 0.31378551611977423, + "grad_norm": 0.3715684413909912, + "learning_rate": 0.0005, + "loss": 1.1899, + "step": 9840 + }, + { + "epoch": 0.31410440383940813, + "grad_norm": 0.3811728358268738, + "learning_rate": 0.0005, + "loss": 1.1711, + "step": 9850 + }, + { + "epoch": 0.3144232915590421, + "grad_norm": 0.3746051490306854, + "learning_rate": 0.0005, + "loss": 1.1945, + "step": 9860 + }, + { + "epoch": 0.314742179278676, + "grad_norm": 0.36689209938049316, + "learning_rate": 0.0005, + "loss": 1.1963, + "step": 9870 + }, + { + "epoch": 0.3150610669983099, + "grad_norm": 0.3704276382923126, + "learning_rate": 0.0005, + "loss": 1.1974, + "step": 9880 + }, + { + "epoch": 0.31537995471794383, + "grad_norm": 0.3665519952774048, + "learning_rate": 0.0005, + "loss": 1.1768, + "step": 9890 + }, + { + "epoch": 0.31569884243757773, + "grad_norm": 0.3684837222099304, + "learning_rate": 0.0005, + "loss": 1.1824, + "step": 9900 + }, + { + "epoch": 0.31601773015721163, + "grad_norm": 0.37614014744758606, + "learning_rate": 0.0005, + "loss": 1.1725, + "step": 9910 + }, + { + "epoch": 0.3163366178768456, + "grad_norm": 0.3925752341747284, + "learning_rate": 0.0005, + "loss": 1.1831, + "step": 9920 + }, + { + "epoch": 0.3166555055964795, + "grad_norm": 0.3735015094280243, + "learning_rate": 0.0005, + "loss": 1.1932, + "step": 9930 + }, + { + "epoch": 0.3169743933161134, + "grad_norm": 0.37925341725349426, + "learning_rate": 0.0005, + "loss": 1.1925, + "step": 9940 + }, + { + "epoch": 0.31729328103574733, + "grad_norm": 0.3753606677055359, + "learning_rate": 0.0005, + "loss": 1.1986, + "step": 9950 + }, + { + "epoch": 0.31761216875538123, + "grad_norm": 0.3785763680934906, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 9960 + }, + { + "epoch": 0.31793105647501513, + "grad_norm": 0.37484627962112427, + "learning_rate": 0.0005, + "loss": 1.2061, + "step": 9970 + }, + { + "epoch": 0.3182499441946491, + "grad_norm": 0.371409147977829, + "learning_rate": 0.0005, + "loss": 1.1843, + "step": 9980 + }, + { + "epoch": 0.318568831914283, + "grad_norm": 0.37367263436317444, + "learning_rate": 0.0005, + "loss": 1.1872, + "step": 9990 + }, + { + "epoch": 0.3188877196339169, + "grad_norm": 0.38979968428611755, + "learning_rate": 0.0005, + "loss": 1.1923, + "step": 10000 + }, + { + "epoch": 0.31920660735355083, + "grad_norm": 0.3798271119594574, + "learning_rate": 0.0005, + "loss": 1.194, + "step": 10010 + }, + { + "epoch": 0.31952549507318473, + "grad_norm": 0.38313472270965576, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 10020 + }, + { + "epoch": 0.3198443827928186, + "grad_norm": 0.372326135635376, + "learning_rate": 0.0005, + "loss": 1.1815, + "step": 10030 + }, + { + "epoch": 0.3201632705124526, + "grad_norm": 0.3908335566520691, + "learning_rate": 0.0005, + "loss": 1.1869, + "step": 10040 + }, + { + "epoch": 0.3204821582320865, + "grad_norm": 0.40376147627830505, + "learning_rate": 0.0005, + "loss": 1.2042, + "step": 10050 + }, + { + "epoch": 0.3208010459517204, + "grad_norm": 0.3750164806842804, + "learning_rate": 0.0005, + "loss": 1.1887, + "step": 10060 + }, + { + "epoch": 0.32111993367135433, + "grad_norm": 0.37391674518585205, + "learning_rate": 0.0005, + "loss": 1.1732, + "step": 10070 + }, + { + "epoch": 0.3214388213909882, + "grad_norm": 0.37153011560440063, + "learning_rate": 0.0005, + "loss": 1.1878, + "step": 10080 + }, + { + "epoch": 0.3217577091106221, + "grad_norm": 0.3877953886985779, + "learning_rate": 0.0005, + "loss": 1.1831, + "step": 10090 + }, + { + "epoch": 0.3220765968302561, + "grad_norm": 0.3789294958114624, + "learning_rate": 0.0005, + "loss": 1.1929, + "step": 10100 + }, + { + "epoch": 0.32239548454989, + "grad_norm": 0.3732856810092926, + "learning_rate": 0.0005, + "loss": 1.1915, + "step": 10110 + }, + { + "epoch": 0.3227143722695239, + "grad_norm": 0.35892724990844727, + "learning_rate": 0.0005, + "loss": 1.1653, + "step": 10120 + }, + { + "epoch": 0.32303325998915783, + "grad_norm": 0.39610254764556885, + "learning_rate": 0.0005, + "loss": 1.1752, + "step": 10130 + }, + { + "epoch": 0.3233521477087917, + "grad_norm": 0.3713800311088562, + "learning_rate": 0.0005, + "loss": 1.1784, + "step": 10140 + }, + { + "epoch": 0.3236710354284256, + "grad_norm": 0.36443793773651123, + "learning_rate": 0.0005, + "loss": 1.1908, + "step": 10150 + }, + { + "epoch": 0.3239899231480596, + "grad_norm": 0.3666205108165741, + "learning_rate": 0.0005, + "loss": 1.1808, + "step": 10160 + }, + { + "epoch": 0.3243088108676935, + "grad_norm": 0.3741157650947571, + "learning_rate": 0.0005, + "loss": 1.1761, + "step": 10170 + }, + { + "epoch": 0.32462769858732743, + "grad_norm": 0.37408700585365295, + "learning_rate": 0.0005, + "loss": 1.1757, + "step": 10180 + }, + { + "epoch": 0.3249465863069613, + "grad_norm": 0.3955936133861542, + "learning_rate": 0.0005, + "loss": 1.1673, + "step": 10190 + }, + { + "epoch": 0.3252654740265952, + "grad_norm": 0.36712345480918884, + "learning_rate": 0.0005, + "loss": 1.1777, + "step": 10200 + }, + { + "epoch": 0.3255843617462292, + "grad_norm": 0.37262213230133057, + "learning_rate": 0.0005, + "loss": 1.1777, + "step": 10210 + }, + { + "epoch": 0.3259032494658631, + "grad_norm": 0.37740543484687805, + "learning_rate": 0.0005, + "loss": 1.1778, + "step": 10220 + }, + { + "epoch": 0.326222137185497, + "grad_norm": 0.37197840213775635, + "learning_rate": 0.0005, + "loss": 1.2135, + "step": 10230 + }, + { + "epoch": 0.3265410249051309, + "grad_norm": 0.3649875521659851, + "learning_rate": 0.0005, + "loss": 1.1988, + "step": 10240 + }, + { + "epoch": 0.3268599126247648, + "grad_norm": 0.37759947776794434, + "learning_rate": 0.0005, + "loss": 1.1572, + "step": 10250 + }, + { + "epoch": 0.3271788003443987, + "grad_norm": 0.3873971104621887, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 10260 + }, + { + "epoch": 0.3274976880640327, + "grad_norm": 0.37649694085121155, + "learning_rate": 0.0005, + "loss": 1.1821, + "step": 10270 + }, + { + "epoch": 0.3278165757836666, + "grad_norm": 0.3752191662788391, + "learning_rate": 0.0005, + "loss": 1.1841, + "step": 10280 + }, + { + "epoch": 0.3281354635033005, + "grad_norm": 0.36976438760757446, + "learning_rate": 0.0005, + "loss": 1.1849, + "step": 10290 + }, + { + "epoch": 0.3284543512229344, + "grad_norm": 0.3746798038482666, + "learning_rate": 0.0005, + "loss": 1.1784, + "step": 10300 + }, + { + "epoch": 0.3287732389425683, + "grad_norm": 0.39386677742004395, + "learning_rate": 0.0005, + "loss": 1.2019, + "step": 10310 + }, + { + "epoch": 0.3290921266622022, + "grad_norm": 0.39480507373809814, + "learning_rate": 0.0005, + "loss": 1.1712, + "step": 10320 + }, + { + "epoch": 0.3294110143818362, + "grad_norm": 0.3793785572052002, + "learning_rate": 0.0005, + "loss": 1.2034, + "step": 10330 + }, + { + "epoch": 0.3297299021014701, + "grad_norm": 0.3793615698814392, + "learning_rate": 0.0005, + "loss": 1.1909, + "step": 10340 + }, + { + "epoch": 0.33004878982110397, + "grad_norm": 0.3702448308467865, + "learning_rate": 0.0005, + "loss": 1.1725, + "step": 10350 + }, + { + "epoch": 0.3303676775407379, + "grad_norm": 0.3776017725467682, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 10360 + }, + { + "epoch": 0.3306865652603718, + "grad_norm": 0.3725394606590271, + "learning_rate": 0.0005, + "loss": 1.2005, + "step": 10370 + }, + { + "epoch": 0.3310054529800057, + "grad_norm": 0.3663187623023987, + "learning_rate": 0.0005, + "loss": 1.1725, + "step": 10380 + }, + { + "epoch": 0.3313243406996397, + "grad_norm": 0.38458192348480225, + "learning_rate": 0.0005, + "loss": 1.1662, + "step": 10390 + }, + { + "epoch": 0.3316432284192736, + "grad_norm": 0.35204780101776123, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 10400 + }, + { + "epoch": 0.33196211613890747, + "grad_norm": 0.35831499099731445, + "learning_rate": 0.0005, + "loss": 1.1646, + "step": 10410 + }, + { + "epoch": 0.3322810038585414, + "grad_norm": 0.36791619658470154, + "learning_rate": 0.0005, + "loss": 1.1792, + "step": 10420 + }, + { + "epoch": 0.3325998915781753, + "grad_norm": 0.3646932542324066, + "learning_rate": 0.0005, + "loss": 1.1671, + "step": 10430 + }, + { + "epoch": 0.3329187792978092, + "grad_norm": 0.36495405435562134, + "learning_rate": 0.0005, + "loss": 1.2007, + "step": 10440 + }, + { + "epoch": 0.3332376670174432, + "grad_norm": 0.3621331751346588, + "learning_rate": 0.0005, + "loss": 1.171, + "step": 10450 + }, + { + "epoch": 0.33355655473707707, + "grad_norm": 0.3788112998008728, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 10460 + }, + { + "epoch": 0.33387544245671097, + "grad_norm": 0.38030800223350525, + "learning_rate": 0.0005, + "loss": 1.1813, + "step": 10470 + }, + { + "epoch": 0.3341943301763449, + "grad_norm": 0.3575531840324402, + "learning_rate": 0.0005, + "loss": 1.1739, + "step": 10480 + }, + { + "epoch": 0.3345132178959788, + "grad_norm": 0.3673001527786255, + "learning_rate": 0.0005, + "loss": 1.1621, + "step": 10490 + }, + { + "epoch": 0.3348321056156127, + "grad_norm": 0.3933727443218231, + "learning_rate": 0.0005, + "loss": 1.1723, + "step": 10500 + }, + { + "epoch": 0.33515099333524667, + "grad_norm": 0.3708011209964752, + "learning_rate": 0.0005, + "loss": 1.1836, + "step": 10510 + }, + { + "epoch": 0.33546988105488057, + "grad_norm": 0.3957275450229645, + "learning_rate": 0.0005, + "loss": 1.1937, + "step": 10520 + }, + { + "epoch": 0.33578876877451447, + "grad_norm": 0.37122806906700134, + "learning_rate": 0.0005, + "loss": 1.1811, + "step": 10530 + }, + { + "epoch": 0.3361076564941484, + "grad_norm": 0.3752969205379486, + "learning_rate": 0.0005, + "loss": 1.1578, + "step": 10540 + }, + { + "epoch": 0.3364265442137823, + "grad_norm": 0.3687181770801544, + "learning_rate": 0.0005, + "loss": 1.1646, + "step": 10550 + }, + { + "epoch": 0.3367454319334162, + "grad_norm": 0.36669522523880005, + "learning_rate": 0.0005, + "loss": 1.1773, + "step": 10560 + }, + { + "epoch": 0.33706431965305017, + "grad_norm": 0.37474456429481506, + "learning_rate": 0.0005, + "loss": 1.1659, + "step": 10570 + }, + { + "epoch": 0.33738320737268407, + "grad_norm": 0.39351922273635864, + "learning_rate": 0.0005, + "loss": 1.1668, + "step": 10580 + }, + { + "epoch": 0.337702095092318, + "grad_norm": 0.3685235381126404, + "learning_rate": 0.0005, + "loss": 1.1431, + "step": 10590 + }, + { + "epoch": 0.3380209828119519, + "grad_norm": 0.3805023431777954, + "learning_rate": 0.0005, + "loss": 1.165, + "step": 10600 + }, + { + "epoch": 0.3383398705315858, + "grad_norm": 0.38222357630729675, + "learning_rate": 0.0005, + "loss": 1.1732, + "step": 10610 + }, + { + "epoch": 0.33865875825121977, + "grad_norm": 0.36740052700042725, + "learning_rate": 0.0005, + "loss": 1.1966, + "step": 10620 + }, + { + "epoch": 0.33897764597085367, + "grad_norm": 0.3713676929473877, + "learning_rate": 0.0005, + "loss": 1.1754, + "step": 10630 + }, + { + "epoch": 0.33929653369048757, + "grad_norm": 0.3694130480289459, + "learning_rate": 0.0005, + "loss": 1.1816, + "step": 10640 + }, + { + "epoch": 0.3396154214101215, + "grad_norm": 0.36890512704849243, + "learning_rate": 0.0005, + "loss": 1.1737, + "step": 10650 + }, + { + "epoch": 0.3399343091297554, + "grad_norm": 0.3650340735912323, + "learning_rate": 0.0005, + "loss": 1.1748, + "step": 10660 + }, + { + "epoch": 0.3402531968493893, + "grad_norm": 0.3503739833831787, + "learning_rate": 0.0005, + "loss": 1.1781, + "step": 10670 + }, + { + "epoch": 0.34057208456902327, + "grad_norm": 0.3604639172554016, + "learning_rate": 0.0005, + "loss": 1.1828, + "step": 10680 + }, + { + "epoch": 0.34089097228865717, + "grad_norm": 0.3638792932033539, + "learning_rate": 0.0005, + "loss": 1.175, + "step": 10690 + }, + { + "epoch": 0.34120986000829107, + "grad_norm": 0.36150917410850525, + "learning_rate": 0.0005, + "loss": 1.176, + "step": 10700 + }, + { + "epoch": 0.341528747727925, + "grad_norm": 0.35987234115600586, + "learning_rate": 0.0005, + "loss": 1.1682, + "step": 10710 + }, + { + "epoch": 0.3418476354475589, + "grad_norm": 0.36292168498039246, + "learning_rate": 0.0005, + "loss": 1.1788, + "step": 10720 + }, + { + "epoch": 0.3421665231671928, + "grad_norm": 0.38154059648513794, + "learning_rate": 0.0005, + "loss": 1.1536, + "step": 10730 + }, + { + "epoch": 0.34248541088682677, + "grad_norm": 0.37422358989715576, + "learning_rate": 0.0005, + "loss": 1.1845, + "step": 10740 + }, + { + "epoch": 0.34280429860646067, + "grad_norm": 0.35607483983039856, + "learning_rate": 0.0005, + "loss": 1.1679, + "step": 10750 + }, + { + "epoch": 0.34312318632609456, + "grad_norm": 0.3682827651500702, + "learning_rate": 0.0005, + "loss": 1.1579, + "step": 10760 + }, + { + "epoch": 0.3434420740457285, + "grad_norm": 0.3609450161457062, + "learning_rate": 0.0005, + "loss": 1.1527, + "step": 10770 + }, + { + "epoch": 0.3437609617653624, + "grad_norm": 0.3711645305156708, + "learning_rate": 0.0005, + "loss": 1.1739, + "step": 10780 + }, + { + "epoch": 0.3440798494849963, + "grad_norm": 0.3735518157482147, + "learning_rate": 0.0005, + "loss": 1.1652, + "step": 10790 + }, + { + "epoch": 0.34439873720463027, + "grad_norm": 0.3735860586166382, + "learning_rate": 0.0005, + "loss": 1.1748, + "step": 10800 + }, + { + "epoch": 0.34471762492426417, + "grad_norm": 0.3775559663772583, + "learning_rate": 0.0005, + "loss": 1.1929, + "step": 10810 + }, + { + "epoch": 0.34503651264389806, + "grad_norm": 0.3657887279987335, + "learning_rate": 0.0005, + "loss": 1.1805, + "step": 10820 + }, + { + "epoch": 0.345355400363532, + "grad_norm": 0.3744265139102936, + "learning_rate": 0.0005, + "loss": 1.1694, + "step": 10830 + }, + { + "epoch": 0.3456742880831659, + "grad_norm": 0.36719611287117004, + "learning_rate": 0.0005, + "loss": 1.1576, + "step": 10840 + }, + { + "epoch": 0.3459931758027998, + "grad_norm": 0.35522088408470154, + "learning_rate": 0.0005, + "loss": 1.1597, + "step": 10850 + }, + { + "epoch": 0.34631206352243377, + "grad_norm": 0.34695446491241455, + "learning_rate": 0.0005, + "loss": 1.1587, + "step": 10860 + }, + { + "epoch": 0.34663095124206766, + "grad_norm": 0.35940998792648315, + "learning_rate": 0.0005, + "loss": 1.1787, + "step": 10870 + }, + { + "epoch": 0.34694983896170156, + "grad_norm": 0.3649153411388397, + "learning_rate": 0.0005, + "loss": 1.1883, + "step": 10880 + }, + { + "epoch": 0.3472687266813355, + "grad_norm": 0.373657763004303, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 10890 + }, + { + "epoch": 0.3475876144009694, + "grad_norm": 0.38196080923080444, + "learning_rate": 0.0005, + "loss": 1.1819, + "step": 10900 + }, + { + "epoch": 0.3479065021206033, + "grad_norm": 0.3717299997806549, + "learning_rate": 0.0005, + "loss": 1.1535, + "step": 10910 + }, + { + "epoch": 0.34822538984023726, + "grad_norm": 0.3629402816295624, + "learning_rate": 0.0005, + "loss": 1.1781, + "step": 10920 + }, + { + "epoch": 0.34854427755987116, + "grad_norm": 0.3839849829673767, + "learning_rate": 0.0005, + "loss": 1.1799, + "step": 10930 + }, + { + "epoch": 0.34886316527950506, + "grad_norm": 0.3889947831630707, + "learning_rate": 0.0005, + "loss": 1.1562, + "step": 10940 + }, + { + "epoch": 0.349182052999139, + "grad_norm": 0.3629694879055023, + "learning_rate": 0.0005, + "loss": 1.1859, + "step": 10950 + }, + { + "epoch": 0.3495009407187729, + "grad_norm": 0.3563177287578583, + "learning_rate": 0.0005, + "loss": 1.1712, + "step": 10960 + }, + { + "epoch": 0.3498198284384068, + "grad_norm": 0.3640787601470947, + "learning_rate": 0.0005, + "loss": 1.1764, + "step": 10970 + }, + { + "epoch": 0.35013871615804076, + "grad_norm": 0.36180850863456726, + "learning_rate": 0.0005, + "loss": 1.1752, + "step": 10980 + }, + { + "epoch": 0.35045760387767466, + "grad_norm": 0.36016371846199036, + "learning_rate": 0.0005, + "loss": 1.1529, + "step": 10990 + }, + { + "epoch": 0.3507764915973086, + "grad_norm": 0.37246090173721313, + "learning_rate": 0.0005, + "loss": 1.1658, + "step": 11000 + }, + { + "epoch": 0.3510953793169425, + "grad_norm": 0.38452988862991333, + "learning_rate": 0.0005, + "loss": 1.1807, + "step": 11010 + }, + { + "epoch": 0.3514142670365764, + "grad_norm": 0.3714504539966583, + "learning_rate": 0.0005, + "loss": 1.1618, + "step": 11020 + }, + { + "epoch": 0.35173315475621036, + "grad_norm": 0.3711185157299042, + "learning_rate": 0.0005, + "loss": 1.1423, + "step": 11030 + }, + { + "epoch": 0.35205204247584426, + "grad_norm": 0.36155474185943604, + "learning_rate": 0.0005, + "loss": 1.1618, + "step": 11040 + }, + { + "epoch": 0.35237093019547816, + "grad_norm": 0.36459237337112427, + "learning_rate": 0.0005, + "loss": 1.1434, + "step": 11050 + }, + { + "epoch": 0.3526898179151121, + "grad_norm": 0.3625276982784271, + "learning_rate": 0.0005, + "loss": 1.1632, + "step": 11060 + }, + { + "epoch": 0.353008705634746, + "grad_norm": 0.35428375005722046, + "learning_rate": 0.0005, + "loss": 1.1604, + "step": 11070 + }, + { + "epoch": 0.3533275933543799, + "grad_norm": 0.36311075091362, + "learning_rate": 0.0005, + "loss": 1.1588, + "step": 11080 + }, + { + "epoch": 0.35364648107401386, + "grad_norm": 0.357806921005249, + "learning_rate": 0.0005, + "loss": 1.1489, + "step": 11090 + }, + { + "epoch": 0.35396536879364776, + "grad_norm": 0.3589811623096466, + "learning_rate": 0.0005, + "loss": 1.1464, + "step": 11100 + }, + { + "epoch": 0.35428425651328166, + "grad_norm": 0.3645656704902649, + "learning_rate": 0.0005, + "loss": 1.1691, + "step": 11110 + }, + { + "epoch": 0.3546031442329156, + "grad_norm": 0.37154215574264526, + "learning_rate": 0.0005, + "loss": 1.1766, + "step": 11120 + }, + { + "epoch": 0.3549220319525495, + "grad_norm": 0.36165502667427063, + "learning_rate": 0.0005, + "loss": 1.1706, + "step": 11130 + }, + { + "epoch": 0.3552409196721834, + "grad_norm": 0.36878177523612976, + "learning_rate": 0.0005, + "loss": 1.1641, + "step": 11140 + }, + { + "epoch": 0.35555980739181736, + "grad_norm": 0.3841949701309204, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 11150 + }, + { + "epoch": 0.35587869511145126, + "grad_norm": 0.36055484414100647, + "learning_rate": 0.0005, + "loss": 1.1699, + "step": 11160 + }, + { + "epoch": 0.35619758283108516, + "grad_norm": 0.3562716841697693, + "learning_rate": 0.0005, + "loss": 1.1683, + "step": 11170 + }, + { + "epoch": 0.3565164705507191, + "grad_norm": 0.37487974762916565, + "learning_rate": 0.0005, + "loss": 1.1617, + "step": 11180 + }, + { + "epoch": 0.356835358270353, + "grad_norm": 0.37108105421066284, + "learning_rate": 0.0005, + "loss": 1.1582, + "step": 11190 + }, + { + "epoch": 0.3571542459899869, + "grad_norm": 0.35356369614601135, + "learning_rate": 0.0005, + "loss": 1.1382, + "step": 11200 + }, + { + "epoch": 0.35747313370962086, + "grad_norm": 0.3768700361251831, + "learning_rate": 0.0005, + "loss": 1.1722, + "step": 11210 + }, + { + "epoch": 0.35779202142925476, + "grad_norm": 0.3485049605369568, + "learning_rate": 0.0005, + "loss": 1.1617, + "step": 11220 + }, + { + "epoch": 0.35811090914888866, + "grad_norm": 0.36197659373283386, + "learning_rate": 0.0005, + "loss": 1.181, + "step": 11230 + }, + { + "epoch": 0.3584297968685226, + "grad_norm": 0.37221449613571167, + "learning_rate": 0.0005, + "loss": 1.1772, + "step": 11240 + }, + { + "epoch": 0.3587486845881565, + "grad_norm": 0.36957746744155884, + "learning_rate": 0.0005, + "loss": 1.154, + "step": 11250 + }, + { + "epoch": 0.3590675723077904, + "grad_norm": 0.35753747820854187, + "learning_rate": 0.0005, + "loss": 1.1469, + "step": 11260 + }, + { + "epoch": 0.35938646002742436, + "grad_norm": 0.36781930923461914, + "learning_rate": 0.0005, + "loss": 1.1735, + "step": 11270 + }, + { + "epoch": 0.35970534774705826, + "grad_norm": 0.3567204177379608, + "learning_rate": 0.0005, + "loss": 1.1451, + "step": 11280 + }, + { + "epoch": 0.36002423546669216, + "grad_norm": 0.3888096809387207, + "learning_rate": 0.0005, + "loss": 1.1547, + "step": 11290 + }, + { + "epoch": 0.3603431231863261, + "grad_norm": 0.3750171959400177, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 11300 + }, + { + "epoch": 0.36066201090596, + "grad_norm": 0.3716561496257782, + "learning_rate": 0.0005, + "loss": 1.1548, + "step": 11310 + }, + { + "epoch": 0.3609808986255939, + "grad_norm": 0.3608260452747345, + "learning_rate": 0.0005, + "loss": 1.1478, + "step": 11320 + }, + { + "epoch": 0.36129978634522786, + "grad_norm": 0.3671262264251709, + "learning_rate": 0.0005, + "loss": 1.149, + "step": 11330 + }, + { + "epoch": 0.36161867406486176, + "grad_norm": 0.3494800627231598, + "learning_rate": 0.0005, + "loss": 1.1707, + "step": 11340 + }, + { + "epoch": 0.36193756178449565, + "grad_norm": 0.35910022258758545, + "learning_rate": 0.0005, + "loss": 1.1607, + "step": 11350 + }, + { + "epoch": 0.3622564495041296, + "grad_norm": 0.35987815260887146, + "learning_rate": 0.0005, + "loss": 1.17, + "step": 11360 + }, + { + "epoch": 0.3625753372237635, + "grad_norm": 0.36094966530799866, + "learning_rate": 0.0005, + "loss": 1.157, + "step": 11370 + }, + { + "epoch": 0.3628942249433974, + "grad_norm": 0.3670837879180908, + "learning_rate": 0.0005, + "loss": 1.1536, + "step": 11380 + }, + { + "epoch": 0.36321311266303136, + "grad_norm": 0.3617798984050751, + "learning_rate": 0.0005, + "loss": 1.1596, + "step": 11390 + }, + { + "epoch": 0.36353200038266525, + "grad_norm": 0.3594600260257721, + "learning_rate": 0.0005, + "loss": 1.1657, + "step": 11400 + }, + { + "epoch": 0.3638508881022992, + "grad_norm": 0.36290115118026733, + "learning_rate": 0.0005, + "loss": 1.1482, + "step": 11410 + }, + { + "epoch": 0.3641697758219331, + "grad_norm": 0.3666008412837982, + "learning_rate": 0.0005, + "loss": 1.1577, + "step": 11420 + }, + { + "epoch": 0.364488663541567, + "grad_norm": 0.36086878180503845, + "learning_rate": 0.0005, + "loss": 1.1377, + "step": 11430 + }, + { + "epoch": 0.36480755126120096, + "grad_norm": 0.36125829815864563, + "learning_rate": 0.0005, + "loss": 1.1607, + "step": 11440 + }, + { + "epoch": 0.36512643898083486, + "grad_norm": 0.35635900497436523, + "learning_rate": 0.0005, + "loss": 1.1602, + "step": 11450 + }, + { + "epoch": 0.36544532670046875, + "grad_norm": 0.3567068874835968, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 11460 + }, + { + "epoch": 0.3657642144201027, + "grad_norm": 0.37255966663360596, + "learning_rate": 0.0005, + "loss": 1.1605, + "step": 11470 + }, + { + "epoch": 0.3660831021397366, + "grad_norm": 0.36005812883377075, + "learning_rate": 0.0005, + "loss": 1.1489, + "step": 11480 + }, + { + "epoch": 0.3664019898593705, + "grad_norm": 0.36554327607154846, + "learning_rate": 0.0005, + "loss": 1.1594, + "step": 11490 + }, + { + "epoch": 0.36672087757900446, + "grad_norm": 0.3808325231075287, + "learning_rate": 0.0005, + "loss": 1.1783, + "step": 11500 + }, + { + "epoch": 0.36703976529863835, + "grad_norm": 0.35380932688713074, + "learning_rate": 0.0005, + "loss": 1.1485, + "step": 11510 + }, + { + "epoch": 0.36735865301827225, + "grad_norm": 0.3594566881656647, + "learning_rate": 0.0005, + "loss": 1.1728, + "step": 11520 + }, + { + "epoch": 0.3676775407379062, + "grad_norm": 0.36721861362457275, + "learning_rate": 0.0005, + "loss": 1.1462, + "step": 11530 + }, + { + "epoch": 0.3679964284575401, + "grad_norm": 0.3698878586292267, + "learning_rate": 0.0005, + "loss": 1.1349, + "step": 11540 + }, + { + "epoch": 0.368315316177174, + "grad_norm": 0.35534027218818665, + "learning_rate": 0.0005, + "loss": 1.1605, + "step": 11550 + }, + { + "epoch": 0.36863420389680795, + "grad_norm": 0.36262908577919006, + "learning_rate": 0.0005, + "loss": 1.1527, + "step": 11560 + }, + { + "epoch": 0.36895309161644185, + "grad_norm": 0.3791918158531189, + "learning_rate": 0.0005, + "loss": 1.1658, + "step": 11570 + }, + { + "epoch": 0.36927197933607575, + "grad_norm": 0.413955956697464, + "learning_rate": 0.0005, + "loss": 1.1337, + "step": 11580 + }, + { + "epoch": 0.3695908670557097, + "grad_norm": 0.3605935275554657, + "learning_rate": 0.0005, + "loss": 1.1422, + "step": 11590 + }, + { + "epoch": 0.3699097547753436, + "grad_norm": 0.36769089102745056, + "learning_rate": 0.0005, + "loss": 1.1756, + "step": 11600 + }, + { + "epoch": 0.3702286424949775, + "grad_norm": 0.35838136076927185, + "learning_rate": 0.0005, + "loss": 1.1598, + "step": 11610 + }, + { + "epoch": 0.37054753021461145, + "grad_norm": 0.3628617227077484, + "learning_rate": 0.0005, + "loss": 1.1712, + "step": 11620 + }, + { + "epoch": 0.37086641793424535, + "grad_norm": 0.35749825835227966, + "learning_rate": 0.0005, + "loss": 1.1641, + "step": 11630 + }, + { + "epoch": 0.37118530565387925, + "grad_norm": 0.3627100884914398, + "learning_rate": 0.0005, + "loss": 1.1821, + "step": 11640 + }, + { + "epoch": 0.3715041933735132, + "grad_norm": 0.3494734764099121, + "learning_rate": 0.0005, + "loss": 1.1492, + "step": 11650 + }, + { + "epoch": 0.3718230810931471, + "grad_norm": 0.3884241580963135, + "learning_rate": 0.0005, + "loss": 1.1744, + "step": 11660 + }, + { + "epoch": 0.372141968812781, + "grad_norm": 0.3681718111038208, + "learning_rate": 0.0005, + "loss": 1.1778, + "step": 11670 + }, + { + "epoch": 0.37246085653241495, + "grad_norm": 0.36322730779647827, + "learning_rate": 0.0005, + "loss": 1.1552, + "step": 11680 + }, + { + "epoch": 0.37277974425204885, + "grad_norm": 0.35371363162994385, + "learning_rate": 0.0005, + "loss": 1.1498, + "step": 11690 + }, + { + "epoch": 0.37309863197168275, + "grad_norm": 0.34495317935943604, + "learning_rate": 0.0005, + "loss": 1.1393, + "step": 11700 + }, + { + "epoch": 0.3734175196913167, + "grad_norm": 0.34707680344581604, + "learning_rate": 0.0005, + "loss": 1.1549, + "step": 11710 + }, + { + "epoch": 0.3737364074109506, + "grad_norm": 0.3565426766872406, + "learning_rate": 0.0005, + "loss": 1.1512, + "step": 11720 + }, + { + "epoch": 0.3740552951305845, + "grad_norm": 0.3783700466156006, + "learning_rate": 0.0005, + "loss": 1.1614, + "step": 11730 + }, + { + "epoch": 0.37437418285021845, + "grad_norm": 0.35486361384391785, + "learning_rate": 0.0005, + "loss": 1.1666, + "step": 11740 + }, + { + "epoch": 0.37469307056985235, + "grad_norm": 0.3659943640232086, + "learning_rate": 0.0005, + "loss": 1.1722, + "step": 11750 + }, + { + "epoch": 0.37501195828948625, + "grad_norm": 0.35426226258277893, + "learning_rate": 0.0005, + "loss": 1.1636, + "step": 11760 + }, + { + "epoch": 0.3753308460091202, + "grad_norm": 0.358793169260025, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 11770 + }, + { + "epoch": 0.3756497337287541, + "grad_norm": 0.35559338331222534, + "learning_rate": 0.0005, + "loss": 1.1591, + "step": 11780 + }, + { + "epoch": 0.375968621448388, + "grad_norm": 0.3519185781478882, + "learning_rate": 0.0005, + "loss": 1.1787, + "step": 11790 + }, + { + "epoch": 0.37628750916802195, + "grad_norm": 0.3474023640155792, + "learning_rate": 0.0005, + "loss": 1.1565, + "step": 11800 + }, + { + "epoch": 0.37660639688765585, + "grad_norm": 0.3500244915485382, + "learning_rate": 0.0005, + "loss": 1.1622, + "step": 11810 + }, + { + "epoch": 0.37692528460728975, + "grad_norm": 0.3565733730792999, + "learning_rate": 0.0005, + "loss": 1.164, + "step": 11820 + }, + { + "epoch": 0.3772441723269237, + "grad_norm": 0.36334407329559326, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 11830 + }, + { + "epoch": 0.3775630600465576, + "grad_norm": 0.3599485158920288, + "learning_rate": 0.0005, + "loss": 1.1615, + "step": 11840 + }, + { + "epoch": 0.37788194776619155, + "grad_norm": 0.3623838722705841, + "learning_rate": 0.0005, + "loss": 1.1589, + "step": 11850 + }, + { + "epoch": 0.37820083548582545, + "grad_norm": 0.3498895764350891, + "learning_rate": 0.0005, + "loss": 1.1644, + "step": 11860 + }, + { + "epoch": 0.37851972320545935, + "grad_norm": 0.36630934476852417, + "learning_rate": 0.0005, + "loss": 1.1632, + "step": 11870 + }, + { + "epoch": 0.3788386109250933, + "grad_norm": 0.3576071858406067, + "learning_rate": 0.0005, + "loss": 1.1638, + "step": 11880 + }, + { + "epoch": 0.3791574986447272, + "grad_norm": 0.34983667731285095, + "learning_rate": 0.0005, + "loss": 1.1516, + "step": 11890 + }, + { + "epoch": 0.3794763863643611, + "grad_norm": 0.35930803418159485, + "learning_rate": 0.0005, + "loss": 1.1447, + "step": 11900 + }, + { + "epoch": 0.37979527408399505, + "grad_norm": 0.3543728291988373, + "learning_rate": 0.0005, + "loss": 1.1642, + "step": 11910 + }, + { + "epoch": 0.38011416180362895, + "grad_norm": 0.35220077633857727, + "learning_rate": 0.0005, + "loss": 1.1647, + "step": 11920 + }, + { + "epoch": 0.38043304952326285, + "grad_norm": 0.3517967462539673, + "learning_rate": 0.0005, + "loss": 1.1344, + "step": 11930 + }, + { + "epoch": 0.3807519372428968, + "grad_norm": 0.34194129705429077, + "learning_rate": 0.0005, + "loss": 1.1277, + "step": 11940 + }, + { + "epoch": 0.3810708249625307, + "grad_norm": 0.35568177700042725, + "learning_rate": 0.0005, + "loss": 1.1492, + "step": 11950 + }, + { + "epoch": 0.3813897126821646, + "grad_norm": 0.36249402165412903, + "learning_rate": 0.0005, + "loss": 1.1629, + "step": 11960 + }, + { + "epoch": 0.38170860040179855, + "grad_norm": 0.3520861566066742, + "learning_rate": 0.0005, + "loss": 1.1556, + "step": 11970 + }, + { + "epoch": 0.38202748812143245, + "grad_norm": 0.3475063741207123, + "learning_rate": 0.0005, + "loss": 1.137, + "step": 11980 + }, + { + "epoch": 0.38234637584106634, + "grad_norm": 0.3599000573158264, + "learning_rate": 0.0005, + "loss": 1.147, + "step": 11990 + }, + { + "epoch": 0.3826652635607003, + "grad_norm": 0.38423648476600647, + "learning_rate": 0.0005, + "loss": 1.1504, + "step": 12000 + }, + { + "epoch": 0.3829841512803342, + "grad_norm": 0.3562023937702179, + "learning_rate": 0.0005, + "loss": 1.1617, + "step": 12010 + }, + { + "epoch": 0.3833030389999681, + "grad_norm": 0.3565387725830078, + "learning_rate": 0.0005, + "loss": 1.1744, + "step": 12020 + }, + { + "epoch": 0.38362192671960205, + "grad_norm": 0.34507426619529724, + "learning_rate": 0.0005, + "loss": 1.1502, + "step": 12030 + }, + { + "epoch": 0.38394081443923594, + "grad_norm": 0.36354437470436096, + "learning_rate": 0.0005, + "loss": 1.1442, + "step": 12040 + }, + { + "epoch": 0.38425970215886984, + "grad_norm": 0.34563297033309937, + "learning_rate": 0.0005, + "loss": 1.1528, + "step": 12050 + }, + { + "epoch": 0.3845785898785038, + "grad_norm": 0.3544037640094757, + "learning_rate": 0.0005, + "loss": 1.1394, + "step": 12060 + }, + { + "epoch": 0.3848974775981377, + "grad_norm": 0.34711146354675293, + "learning_rate": 0.0005, + "loss": 1.1584, + "step": 12070 + }, + { + "epoch": 0.3852163653177716, + "grad_norm": 0.353064626455307, + "learning_rate": 0.0005, + "loss": 1.1553, + "step": 12080 + }, + { + "epoch": 0.38553525303740555, + "grad_norm": 0.35694098472595215, + "learning_rate": 0.0005, + "loss": 1.1472, + "step": 12090 + }, + { + "epoch": 0.38585414075703944, + "grad_norm": 0.3677344024181366, + "learning_rate": 0.0005, + "loss": 1.1561, + "step": 12100 + }, + { + "epoch": 0.38617302847667334, + "grad_norm": 0.3596011698246002, + "learning_rate": 0.0005, + "loss": 1.1589, + "step": 12110 + }, + { + "epoch": 0.3864919161963073, + "grad_norm": 0.3607899248600006, + "learning_rate": 0.0005, + "loss": 1.1641, + "step": 12120 + }, + { + "epoch": 0.3868108039159412, + "grad_norm": 0.37189728021621704, + "learning_rate": 0.0005, + "loss": 1.1612, + "step": 12130 + }, + { + "epoch": 0.3871296916355751, + "grad_norm": 0.34136825799942017, + "learning_rate": 0.0005, + "loss": 1.1266, + "step": 12140 + }, + { + "epoch": 0.38744857935520904, + "grad_norm": 0.34620654582977295, + "learning_rate": 0.0005, + "loss": 1.1665, + "step": 12150 + }, + { + "epoch": 0.38776746707484294, + "grad_norm": 0.34414157271385193, + "learning_rate": 0.0005, + "loss": 1.1644, + "step": 12160 + }, + { + "epoch": 0.38808635479447684, + "grad_norm": 0.34810659289360046, + "learning_rate": 0.0005, + "loss": 1.1253, + "step": 12170 + }, + { + "epoch": 0.3884052425141108, + "grad_norm": 0.34679773449897766, + "learning_rate": 0.0005, + "loss": 1.1718, + "step": 12180 + }, + { + "epoch": 0.3887241302337447, + "grad_norm": 0.3541298806667328, + "learning_rate": 0.0005, + "loss": 1.1441, + "step": 12190 + }, + { + "epoch": 0.3890430179533786, + "grad_norm": 0.35697078704833984, + "learning_rate": 0.0005, + "loss": 1.1455, + "step": 12200 + }, + { + "epoch": 0.38936190567301254, + "grad_norm": 0.34624364972114563, + "learning_rate": 0.0005, + "loss": 1.1545, + "step": 12210 + }, + { + "epoch": 0.38968079339264644, + "grad_norm": 0.3443002998828888, + "learning_rate": 0.0005, + "loss": 1.1549, + "step": 12220 + }, + { + "epoch": 0.38999968111228034, + "grad_norm": 0.34411001205444336, + "learning_rate": 0.0005, + "loss": 1.1437, + "step": 12230 + }, + { + "epoch": 0.3903185688319143, + "grad_norm": 0.38664522767066956, + "learning_rate": 0.0005, + "loss": 1.1393, + "step": 12240 + }, + { + "epoch": 0.3906374565515482, + "grad_norm": 0.36069822311401367, + "learning_rate": 0.0005, + "loss": 1.1302, + "step": 12250 + }, + { + "epoch": 0.39095634427118214, + "grad_norm": 0.35054412484169006, + "learning_rate": 0.0005, + "loss": 1.1452, + "step": 12260 + }, + { + "epoch": 0.39127523199081604, + "grad_norm": 0.3593745827674866, + "learning_rate": 0.0005, + "loss": 1.1593, + "step": 12270 + }, + { + "epoch": 0.39159411971044994, + "grad_norm": 0.36931827664375305, + "learning_rate": 0.0005, + "loss": 1.1431, + "step": 12280 + }, + { + "epoch": 0.3919130074300839, + "grad_norm": 0.3705731928348541, + "learning_rate": 0.0005, + "loss": 1.1467, + "step": 12290 + }, + { + "epoch": 0.3922318951497178, + "grad_norm": 0.35201239585876465, + "learning_rate": 0.0005, + "loss": 1.1658, + "step": 12300 + }, + { + "epoch": 0.3925507828693517, + "grad_norm": 0.33797210454940796, + "learning_rate": 0.0005, + "loss": 1.1491, + "step": 12310 + }, + { + "epoch": 0.39286967058898564, + "grad_norm": 0.3512865900993347, + "learning_rate": 0.0005, + "loss": 1.1571, + "step": 12320 + }, + { + "epoch": 0.39318855830861954, + "grad_norm": 0.35392460227012634, + "learning_rate": 0.0005, + "loss": 1.1542, + "step": 12330 + }, + { + "epoch": 0.39350744602825344, + "grad_norm": 0.34811460971832275, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 12340 + }, + { + "epoch": 0.3938263337478874, + "grad_norm": 0.3421926200389862, + "learning_rate": 0.0005, + "loss": 1.1344, + "step": 12350 + }, + { + "epoch": 0.3941452214675213, + "grad_norm": 0.3451977074146271, + "learning_rate": 0.0005, + "loss": 1.176, + "step": 12360 + }, + { + "epoch": 0.3944641091871552, + "grad_norm": 0.33949071168899536, + "learning_rate": 0.0005, + "loss": 1.1455, + "step": 12370 + }, + { + "epoch": 0.39478299690678914, + "grad_norm": 0.35217025876045227, + "learning_rate": 0.0005, + "loss": 1.1395, + "step": 12380 + }, + { + "epoch": 0.39510188462642304, + "grad_norm": 0.35102054476737976, + "learning_rate": 0.0005, + "loss": 1.1419, + "step": 12390 + }, + { + "epoch": 0.39542077234605694, + "grad_norm": 0.3483518362045288, + "learning_rate": 0.0005, + "loss": 1.1571, + "step": 12400 + }, + { + "epoch": 0.3957396600656909, + "grad_norm": 0.35172149538993835, + "learning_rate": 0.0005, + "loss": 1.1394, + "step": 12410 + }, + { + "epoch": 0.3960585477853248, + "grad_norm": 0.3434382379055023, + "learning_rate": 0.0005, + "loss": 1.1504, + "step": 12420 + }, + { + "epoch": 0.3963774355049587, + "grad_norm": 0.3492888808250427, + "learning_rate": 0.0005, + "loss": 1.1534, + "step": 12430 + }, + { + "epoch": 0.39669632322459264, + "grad_norm": 0.34085673093795776, + "learning_rate": 0.0005, + "loss": 1.1288, + "step": 12440 + }, + { + "epoch": 0.39701521094422654, + "grad_norm": 0.35518237948417664, + "learning_rate": 0.0005, + "loss": 1.1519, + "step": 12450 + }, + { + "epoch": 0.39733409866386044, + "grad_norm": 0.3471197187900543, + "learning_rate": 0.0005, + "loss": 1.1358, + "step": 12460 + }, + { + "epoch": 0.3976529863834944, + "grad_norm": 0.36180561780929565, + "learning_rate": 0.0005, + "loss": 1.1609, + "step": 12470 + }, + { + "epoch": 0.3979718741031283, + "grad_norm": 0.3461403250694275, + "learning_rate": 0.0005, + "loss": 1.158, + "step": 12480 + }, + { + "epoch": 0.3982907618227622, + "grad_norm": 0.342271089553833, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 12490 + }, + { + "epoch": 0.39860964954239614, + "grad_norm": 0.3423519730567932, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 12500 + }, + { + "epoch": 0.39892853726203004, + "grad_norm": 0.365598201751709, + "learning_rate": 0.0005, + "loss": 1.1384, + "step": 12510 + }, + { + "epoch": 0.39924742498166393, + "grad_norm": 0.3581569492816925, + "learning_rate": 0.0005, + "loss": 1.1419, + "step": 12520 + }, + { + "epoch": 0.3995663127012979, + "grad_norm": 0.3360135853290558, + "learning_rate": 0.0005, + "loss": 1.1578, + "step": 12530 + }, + { + "epoch": 0.3998852004209318, + "grad_norm": 0.3541604280471802, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 12540 + }, + { + "epoch": 0.4002040881405657, + "grad_norm": 0.3437075912952423, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 12550 + }, + { + "epoch": 0.40052297586019964, + "grad_norm": 0.3445467948913574, + "learning_rate": 0.0005, + "loss": 1.1386, + "step": 12560 + }, + { + "epoch": 0.40084186357983353, + "grad_norm": 0.35498735308647156, + "learning_rate": 0.0005, + "loss": 1.1668, + "step": 12570 + }, + { + "epoch": 0.40116075129946743, + "grad_norm": 0.35733136534690857, + "learning_rate": 0.0005, + "loss": 1.1215, + "step": 12580 + }, + { + "epoch": 0.4014796390191014, + "grad_norm": 0.3459886312484741, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 12590 + }, + { + "epoch": 0.4017985267387353, + "grad_norm": 0.3461032509803772, + "learning_rate": 0.0005, + "loss": 1.1407, + "step": 12600 + }, + { + "epoch": 0.4021174144583692, + "grad_norm": 0.3554673492908478, + "learning_rate": 0.0005, + "loss": 1.1458, + "step": 12610 + }, + { + "epoch": 0.40243630217800314, + "grad_norm": 0.3475288450717926, + "learning_rate": 0.0005, + "loss": 1.1496, + "step": 12620 + }, + { + "epoch": 0.40275518989763703, + "grad_norm": 0.348635196685791, + "learning_rate": 0.0005, + "loss": 1.1268, + "step": 12630 + }, + { + "epoch": 0.40307407761727093, + "grad_norm": 0.3567737638950348, + "learning_rate": 0.0005, + "loss": 1.1291, + "step": 12640 + }, + { + "epoch": 0.4033929653369049, + "grad_norm": 0.3558914363384247, + "learning_rate": 0.0005, + "loss": 1.1519, + "step": 12650 + }, + { + "epoch": 0.4037118530565388, + "grad_norm": 0.35246145725250244, + "learning_rate": 0.0005, + "loss": 1.1425, + "step": 12660 + }, + { + "epoch": 0.40403074077617274, + "grad_norm": 0.3489568531513214, + "learning_rate": 0.0005, + "loss": 1.1396, + "step": 12670 + }, + { + "epoch": 0.40434962849580663, + "grad_norm": 0.361677348613739, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 12680 + }, + { + "epoch": 0.40466851621544053, + "grad_norm": 0.34218665957450867, + "learning_rate": 0.0005, + "loss": 1.1329, + "step": 12690 + }, + { + "epoch": 0.4049874039350745, + "grad_norm": 0.3400156795978546, + "learning_rate": 0.0005, + "loss": 1.1464, + "step": 12700 + }, + { + "epoch": 0.4053062916547084, + "grad_norm": 0.35337018966674805, + "learning_rate": 0.0005, + "loss": 1.131, + "step": 12710 + }, + { + "epoch": 0.4056251793743423, + "grad_norm": 0.3584569990634918, + "learning_rate": 0.0005, + "loss": 1.1508, + "step": 12720 + }, + { + "epoch": 0.40594406709397624, + "grad_norm": 0.3542274236679077, + "learning_rate": 0.0005, + "loss": 1.1348, + "step": 12730 + }, + { + "epoch": 0.40626295481361013, + "grad_norm": 0.3413817882537842, + "learning_rate": 0.0005, + "loss": 1.148, + "step": 12740 + }, + { + "epoch": 0.40658184253324403, + "grad_norm": 0.3511422872543335, + "learning_rate": 0.0005, + "loss": 1.1378, + "step": 12750 + }, + { + "epoch": 0.406900730252878, + "grad_norm": 0.3467022478580475, + "learning_rate": 0.0005, + "loss": 1.1405, + "step": 12760 + }, + { + "epoch": 0.4072196179725119, + "grad_norm": 0.35287347435951233, + "learning_rate": 0.0005, + "loss": 1.1562, + "step": 12770 + }, + { + "epoch": 0.4075385056921458, + "grad_norm": 0.35349148511886597, + "learning_rate": 0.0005, + "loss": 1.1361, + "step": 12780 + }, + { + "epoch": 0.40785739341177973, + "grad_norm": 0.35082828998565674, + "learning_rate": 0.0005, + "loss": 1.1388, + "step": 12790 + }, + { + "epoch": 0.40817628113141363, + "grad_norm": 0.34728944301605225, + "learning_rate": 0.0005, + "loss": 1.1577, + "step": 12800 + }, + { + "epoch": 0.40849516885104753, + "grad_norm": 0.35114023089408875, + "learning_rate": 0.0005, + "loss": 1.1352, + "step": 12810 + }, + { + "epoch": 0.4088140565706815, + "grad_norm": 0.34104374051094055, + "learning_rate": 0.0005, + "loss": 1.1343, + "step": 12820 + }, + { + "epoch": 0.4091329442903154, + "grad_norm": 0.33483564853668213, + "learning_rate": 0.0005, + "loss": 1.1331, + "step": 12830 + }, + { + "epoch": 0.4094518320099493, + "grad_norm": 0.3484364449977875, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 12840 + }, + { + "epoch": 0.40977071972958323, + "grad_norm": 0.3626136779785156, + "learning_rate": 0.0005, + "loss": 1.147, + "step": 12850 + }, + { + "epoch": 0.41008960744921713, + "grad_norm": 0.3401944935321808, + "learning_rate": 0.0005, + "loss": 1.1235, + "step": 12860 + }, + { + "epoch": 0.41040849516885103, + "grad_norm": 0.33367717266082764, + "learning_rate": 0.0005, + "loss": 1.1324, + "step": 12870 + }, + { + "epoch": 0.410727382888485, + "grad_norm": 0.3408673405647278, + "learning_rate": 0.0005, + "loss": 1.1421, + "step": 12880 + }, + { + "epoch": 0.4110462706081189, + "grad_norm": 0.37122371792793274, + "learning_rate": 0.0005, + "loss": 1.1486, + "step": 12890 + }, + { + "epoch": 0.4113651583277528, + "grad_norm": 0.3381376564502716, + "learning_rate": 0.0005, + "loss": 1.1311, + "step": 12900 + }, + { + "epoch": 0.41168404604738673, + "grad_norm": 0.3362289369106293, + "learning_rate": 0.0005, + "loss": 1.1246, + "step": 12910 + }, + { + "epoch": 0.41200293376702063, + "grad_norm": 0.355113685131073, + "learning_rate": 0.0005, + "loss": 1.1365, + "step": 12920 + }, + { + "epoch": 0.4123218214866545, + "grad_norm": 0.3402760624885559, + "learning_rate": 0.0005, + "loss": 1.1308, + "step": 12930 + }, + { + "epoch": 0.4126407092062885, + "grad_norm": 0.34513506293296814, + "learning_rate": 0.0005, + "loss": 1.1456, + "step": 12940 + }, + { + "epoch": 0.4129595969259224, + "grad_norm": 0.3438687026500702, + "learning_rate": 0.0005, + "loss": 1.1437, + "step": 12950 + }, + { + "epoch": 0.4132784846455563, + "grad_norm": 0.33485645055770874, + "learning_rate": 0.0005, + "loss": 1.1441, + "step": 12960 + }, + { + "epoch": 0.41359737236519023, + "grad_norm": 0.35648801922798157, + "learning_rate": 0.0005, + "loss": 1.1465, + "step": 12970 + }, + { + "epoch": 0.41391626008482413, + "grad_norm": 0.35223162174224854, + "learning_rate": 0.0005, + "loss": 1.1502, + "step": 12980 + }, + { + "epoch": 0.414235147804458, + "grad_norm": 0.3574160039424896, + "learning_rate": 0.0005, + "loss": 1.1337, + "step": 12990 + }, + { + "epoch": 0.414554035524092, + "grad_norm": 0.3410550057888031, + "learning_rate": 0.0005, + "loss": 1.1398, + "step": 13000 + }, + { + "epoch": 0.4148729232437259, + "grad_norm": 0.3610963523387909, + "learning_rate": 0.0005, + "loss": 1.1207, + "step": 13010 + }, + { + "epoch": 0.4151918109633598, + "grad_norm": 0.36963796615600586, + "learning_rate": 0.0005, + "loss": 1.1547, + "step": 13020 + }, + { + "epoch": 0.41551069868299373, + "grad_norm": 0.36384981870651245, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 13030 + }, + { + "epoch": 0.4158295864026276, + "grad_norm": 0.3315349817276001, + "learning_rate": 0.0005, + "loss": 1.1315, + "step": 13040 + }, + { + "epoch": 0.4161484741222615, + "grad_norm": 0.33881279826164246, + "learning_rate": 0.0005, + "loss": 1.1367, + "step": 13050 + }, + { + "epoch": 0.4164673618418955, + "grad_norm": 0.35474470257759094, + "learning_rate": 0.0005, + "loss": 1.1565, + "step": 13060 + }, + { + "epoch": 0.4167862495615294, + "grad_norm": 0.3389039635658264, + "learning_rate": 0.0005, + "loss": 1.1348, + "step": 13070 + }, + { + "epoch": 0.41710513728116333, + "grad_norm": 0.33887627720832825, + "learning_rate": 0.0005, + "loss": 1.1439, + "step": 13080 + }, + { + "epoch": 0.4174240250007972, + "grad_norm": 0.35486432909965515, + "learning_rate": 0.0005, + "loss": 1.1419, + "step": 13090 + }, + { + "epoch": 0.4177429127204311, + "grad_norm": 0.33916404843330383, + "learning_rate": 0.0005, + "loss": 1.1335, + "step": 13100 + }, + { + "epoch": 0.4180618004400651, + "grad_norm": 0.3448198437690735, + "learning_rate": 0.0005, + "loss": 1.1418, + "step": 13110 + }, + { + "epoch": 0.418380688159699, + "grad_norm": 0.364322692155838, + "learning_rate": 0.0005, + "loss": 1.1499, + "step": 13120 + }, + { + "epoch": 0.4186995758793329, + "grad_norm": 0.34698808193206787, + "learning_rate": 0.0005, + "loss": 1.1451, + "step": 13130 + }, + { + "epoch": 0.41901846359896683, + "grad_norm": 0.3550267517566681, + "learning_rate": 0.0005, + "loss": 1.1343, + "step": 13140 + }, + { + "epoch": 0.4193373513186007, + "grad_norm": 0.34121349453926086, + "learning_rate": 0.0005, + "loss": 1.1459, + "step": 13150 + }, + { + "epoch": 0.4196562390382346, + "grad_norm": 0.3349049687385559, + "learning_rate": 0.0005, + "loss": 1.1293, + "step": 13160 + }, + { + "epoch": 0.4199751267578686, + "grad_norm": 0.34862592816352844, + "learning_rate": 0.0005, + "loss": 1.137, + "step": 13170 + }, + { + "epoch": 0.4202940144775025, + "grad_norm": 0.3531326651573181, + "learning_rate": 0.0005, + "loss": 1.1218, + "step": 13180 + }, + { + "epoch": 0.4206129021971364, + "grad_norm": 0.34977883100509644, + "learning_rate": 0.0005, + "loss": 1.1375, + "step": 13190 + }, + { + "epoch": 0.4209317899167703, + "grad_norm": 0.35819658637046814, + "learning_rate": 0.0005, + "loss": 1.1542, + "step": 13200 + }, + { + "epoch": 0.4212506776364042, + "grad_norm": 0.3428193926811218, + "learning_rate": 0.0005, + "loss": 1.142, + "step": 13210 + }, + { + "epoch": 0.4215695653560381, + "grad_norm": 0.3502006530761719, + "learning_rate": 0.0005, + "loss": 1.1359, + "step": 13220 + }, + { + "epoch": 0.4218884530756721, + "grad_norm": 0.33796781301498413, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 13230 + }, + { + "epoch": 0.422207340795306, + "grad_norm": 0.35478195548057556, + "learning_rate": 0.0005, + "loss": 1.1256, + "step": 13240 + }, + { + "epoch": 0.42252622851493987, + "grad_norm": 0.33419090509414673, + "learning_rate": 0.0005, + "loss": 1.1144, + "step": 13250 + }, + { + "epoch": 0.4228451162345738, + "grad_norm": 0.3418004810810089, + "learning_rate": 0.0005, + "loss": 1.1271, + "step": 13260 + }, + { + "epoch": 0.4231640039542077, + "grad_norm": 0.3498900532722473, + "learning_rate": 0.0005, + "loss": 1.1429, + "step": 13270 + }, + { + "epoch": 0.4234828916738416, + "grad_norm": 0.36065348982810974, + "learning_rate": 0.0005, + "loss": 1.1184, + "step": 13280 + }, + { + "epoch": 0.4238017793934756, + "grad_norm": 0.3300412595272064, + "learning_rate": 0.0005, + "loss": 1.1276, + "step": 13290 + }, + { + "epoch": 0.4241206671131095, + "grad_norm": 0.3515334725379944, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 13300 + }, + { + "epoch": 0.42443955483274337, + "grad_norm": 0.33978763222694397, + "learning_rate": 0.0005, + "loss": 1.1498, + "step": 13310 + }, + { + "epoch": 0.4247584425523773, + "grad_norm": 0.32748571038246155, + "learning_rate": 0.0005, + "loss": 1.1273, + "step": 13320 + }, + { + "epoch": 0.4250773302720112, + "grad_norm": 0.34806540608406067, + "learning_rate": 0.0005, + "loss": 1.1401, + "step": 13330 + }, + { + "epoch": 0.4253962179916451, + "grad_norm": 0.3434291183948517, + "learning_rate": 0.0005, + "loss": 1.1291, + "step": 13340 + }, + { + "epoch": 0.4257151057112791, + "grad_norm": 0.3458356559276581, + "learning_rate": 0.0005, + "loss": 1.1373, + "step": 13350 + }, + { + "epoch": 0.42603399343091297, + "grad_norm": 0.3550995886325836, + "learning_rate": 0.0005, + "loss": 1.1265, + "step": 13360 + }, + { + "epoch": 0.42635288115054687, + "grad_norm": 0.3559800088405609, + "learning_rate": 0.0005, + "loss": 1.1417, + "step": 13370 + }, + { + "epoch": 0.4266717688701808, + "grad_norm": 0.3434266746044159, + "learning_rate": 0.0005, + "loss": 1.1189, + "step": 13380 + }, + { + "epoch": 0.4269906565898147, + "grad_norm": 0.35063812136650085, + "learning_rate": 0.0005, + "loss": 1.1267, + "step": 13390 + }, + { + "epoch": 0.4273095443094486, + "grad_norm": 0.34082141518592834, + "learning_rate": 0.0005, + "loss": 1.1221, + "step": 13400 + }, + { + "epoch": 0.4276284320290826, + "grad_norm": 0.3359578251838684, + "learning_rate": 0.0005, + "loss": 1.1136, + "step": 13410 + }, + { + "epoch": 0.42794731974871647, + "grad_norm": 0.3475954532623291, + "learning_rate": 0.0005, + "loss": 1.146, + "step": 13420 + }, + { + "epoch": 0.42826620746835037, + "grad_norm": 0.339847207069397, + "learning_rate": 0.0005, + "loss": 1.1373, + "step": 13430 + }, + { + "epoch": 0.4285850951879843, + "grad_norm": 0.3585573434829712, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 13440 + }, + { + "epoch": 0.4289039829076182, + "grad_norm": 0.3391568958759308, + "learning_rate": 0.0005, + "loss": 1.131, + "step": 13450 + }, + { + "epoch": 0.4292228706272521, + "grad_norm": 0.34588027000427246, + "learning_rate": 0.0005, + "loss": 1.1336, + "step": 13460 + }, + { + "epoch": 0.42954175834688607, + "grad_norm": 0.3400319516658783, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 13470 + }, + { + "epoch": 0.42986064606651997, + "grad_norm": 0.35178929567337036, + "learning_rate": 0.0005, + "loss": 1.1506, + "step": 13480 + }, + { + "epoch": 0.4301795337861539, + "grad_norm": 0.3204183876514435, + "learning_rate": 0.0005, + "loss": 1.1251, + "step": 13490 + }, + { + "epoch": 0.4304984215057878, + "grad_norm": 0.3399496376514435, + "learning_rate": 0.0005, + "loss": 1.1321, + "step": 13500 + }, + { + "epoch": 0.4308173092254217, + "grad_norm": 0.3532831072807312, + "learning_rate": 0.0005, + "loss": 1.1276, + "step": 13510 + }, + { + "epoch": 0.43113619694505567, + "grad_norm": 0.3372787833213806, + "learning_rate": 0.0005, + "loss": 1.1514, + "step": 13520 + }, + { + "epoch": 0.43145508466468957, + "grad_norm": 0.34379592537879944, + "learning_rate": 0.0005, + "loss": 1.1256, + "step": 13530 + }, + { + "epoch": 0.43177397238432347, + "grad_norm": 0.3416652977466583, + "learning_rate": 0.0005, + "loss": 1.1366, + "step": 13540 + }, + { + "epoch": 0.4320928601039574, + "grad_norm": 0.3551793098449707, + "learning_rate": 0.0005, + "loss": 1.1432, + "step": 13550 + }, + { + "epoch": 0.4324117478235913, + "grad_norm": 0.33919355273246765, + "learning_rate": 0.0005, + "loss": 1.1323, + "step": 13560 + }, + { + "epoch": 0.4327306355432252, + "grad_norm": 0.3380427360534668, + "learning_rate": 0.0005, + "loss": 1.1333, + "step": 13570 + }, + { + "epoch": 0.43304952326285917, + "grad_norm": 0.3556479215621948, + "learning_rate": 0.0005, + "loss": 1.1445, + "step": 13580 + }, + { + "epoch": 0.43336841098249307, + "grad_norm": 0.33824047446250916, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 13590 + }, + { + "epoch": 0.43368729870212697, + "grad_norm": 0.3541043996810913, + "learning_rate": 0.0005, + "loss": 1.1281, + "step": 13600 + }, + { + "epoch": 0.4340061864217609, + "grad_norm": 0.35882213711738586, + "learning_rate": 0.0005, + "loss": 1.1314, + "step": 13610 + }, + { + "epoch": 0.4343250741413948, + "grad_norm": 0.3262694478034973, + "learning_rate": 0.0005, + "loss": 1.1505, + "step": 13620 + }, + { + "epoch": 0.4346439618610287, + "grad_norm": 0.33700451254844666, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 13630 + }, + { + "epoch": 0.43496284958066267, + "grad_norm": 0.3422270119190216, + "learning_rate": 0.0005, + "loss": 1.1396, + "step": 13640 + }, + { + "epoch": 0.43528173730029657, + "grad_norm": 0.3300841450691223, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 13650 + }, + { + "epoch": 0.43560062501993047, + "grad_norm": 0.3336879014968872, + "learning_rate": 0.0005, + "loss": 1.1198, + "step": 13660 + }, + { + "epoch": 0.4359195127395644, + "grad_norm": 0.33946508169174194, + "learning_rate": 0.0005, + "loss": 1.1379, + "step": 13670 + }, + { + "epoch": 0.4362384004591983, + "grad_norm": 0.3411202132701874, + "learning_rate": 0.0005, + "loss": 1.1249, + "step": 13680 + }, + { + "epoch": 0.4365572881788322, + "grad_norm": 0.3332071304321289, + "learning_rate": 0.0005, + "loss": 1.1257, + "step": 13690 + }, + { + "epoch": 0.43687617589846617, + "grad_norm": 0.35336050391197205, + "learning_rate": 0.0005, + "loss": 1.1404, + "step": 13700 + }, + { + "epoch": 0.43719506361810007, + "grad_norm": 0.34526899456977844, + "learning_rate": 0.0005, + "loss": 1.126, + "step": 13710 + }, + { + "epoch": 0.43751395133773396, + "grad_norm": 0.34036850929260254, + "learning_rate": 0.0005, + "loss": 1.1086, + "step": 13720 + }, + { + "epoch": 0.4378328390573679, + "grad_norm": 0.34457531571388245, + "learning_rate": 0.0005, + "loss": 1.1478, + "step": 13730 + }, + { + "epoch": 0.4381517267770018, + "grad_norm": 0.34224215149879456, + "learning_rate": 0.0005, + "loss": 1.1355, + "step": 13740 + }, + { + "epoch": 0.4384706144966357, + "grad_norm": 0.34279027581214905, + "learning_rate": 0.0005, + "loss": 1.1249, + "step": 13750 + }, + { + "epoch": 0.43878950221626967, + "grad_norm": 0.3422126770019531, + "learning_rate": 0.0005, + "loss": 1.1215, + "step": 13760 + }, + { + "epoch": 0.43910838993590356, + "grad_norm": 0.3363499641418457, + "learning_rate": 0.0005, + "loss": 1.1344, + "step": 13770 + }, + { + "epoch": 0.43942727765553746, + "grad_norm": 0.3555680811405182, + "learning_rate": 0.0005, + "loss": 1.1377, + "step": 13780 + }, + { + "epoch": 0.4397461653751714, + "grad_norm": 0.3454040586948395, + "learning_rate": 0.0005, + "loss": 1.1365, + "step": 13790 + }, + { + "epoch": 0.4400650530948053, + "grad_norm": 0.34297218918800354, + "learning_rate": 0.0005, + "loss": 1.1291, + "step": 13800 + }, + { + "epoch": 0.4403839408144392, + "grad_norm": 0.34979772567749023, + "learning_rate": 0.0005, + "loss": 1.1187, + "step": 13810 + }, + { + "epoch": 0.44070282853407317, + "grad_norm": 0.3685065805912018, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 13820 + }, + { + "epoch": 0.44102171625370706, + "grad_norm": 0.33668696880340576, + "learning_rate": 0.0005, + "loss": 1.1245, + "step": 13830 + }, + { + "epoch": 0.44134060397334096, + "grad_norm": 0.3478093147277832, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 13840 + }, + { + "epoch": 0.4416594916929749, + "grad_norm": 0.3335479199886322, + "learning_rate": 0.0005, + "loss": 1.1241, + "step": 13850 + }, + { + "epoch": 0.4419783794126088, + "grad_norm": 0.3367275893688202, + "learning_rate": 0.0005, + "loss": 1.1258, + "step": 13860 + }, + { + "epoch": 0.4422972671322427, + "grad_norm": 0.3387754261493683, + "learning_rate": 0.0005, + "loss": 1.1147, + "step": 13870 + }, + { + "epoch": 0.44261615485187666, + "grad_norm": 0.33694881200790405, + "learning_rate": 0.0005, + "loss": 1.1038, + "step": 13880 + }, + { + "epoch": 0.44293504257151056, + "grad_norm": 0.326871782541275, + "learning_rate": 0.0005, + "loss": 1.1115, + "step": 13890 + }, + { + "epoch": 0.4432539302911445, + "grad_norm": 0.3470039367675781, + "learning_rate": 0.0005, + "loss": 1.1341, + "step": 13900 + }, + { + "epoch": 0.4435728180107784, + "grad_norm": 0.32792967557907104, + "learning_rate": 0.0005, + "loss": 1.1149, + "step": 13910 + }, + { + "epoch": 0.4438917057304123, + "grad_norm": 0.3368305265903473, + "learning_rate": 0.0005, + "loss": 1.1328, + "step": 13920 + }, + { + "epoch": 0.44421059345004626, + "grad_norm": 0.3338005840778351, + "learning_rate": 0.0005, + "loss": 1.1386, + "step": 13930 + }, + { + "epoch": 0.44452948116968016, + "grad_norm": 0.33262819051742554, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 13940 + }, + { + "epoch": 0.44484836888931406, + "grad_norm": 0.34088337421417236, + "learning_rate": 0.0005, + "loss": 1.1255, + "step": 13950 + }, + { + "epoch": 0.445167256608948, + "grad_norm": 0.33699917793273926, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 13960 + }, + { + "epoch": 0.4454861443285819, + "grad_norm": 0.3505221903324127, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 13970 + }, + { + "epoch": 0.4458050320482158, + "grad_norm": 0.3351021707057953, + "learning_rate": 0.0005, + "loss": 1.1242, + "step": 13980 + }, + { + "epoch": 0.44612391976784976, + "grad_norm": 0.3325427770614624, + "learning_rate": 0.0005, + "loss": 1.121, + "step": 13990 + }, + { + "epoch": 0.44644280748748366, + "grad_norm": 0.34713199734687805, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 14000 + }, + { + "epoch": 0.44676169520711756, + "grad_norm": 0.3391982614994049, + "learning_rate": 0.0005, + "loss": 1.1378, + "step": 14010 + }, + { + "epoch": 0.4470805829267515, + "grad_norm": 0.3458082675933838, + "learning_rate": 0.0005, + "loss": 1.1308, + "step": 14020 + }, + { + "epoch": 0.4473994706463854, + "grad_norm": 0.3400178849697113, + "learning_rate": 0.0005, + "loss": 1.1312, + "step": 14030 + }, + { + "epoch": 0.4477183583660193, + "grad_norm": 0.33684873580932617, + "learning_rate": 0.0005, + "loss": 1.116, + "step": 14040 + }, + { + "epoch": 0.44803724608565326, + "grad_norm": 0.3401698172092438, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 14050 + }, + { + "epoch": 0.44835613380528716, + "grad_norm": 0.34139543771743774, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 14060 + }, + { + "epoch": 0.44867502152492106, + "grad_norm": 0.3483545780181885, + "learning_rate": 0.0005, + "loss": 1.1096, + "step": 14070 + }, + { + "epoch": 0.448993909244555, + "grad_norm": 0.3576740026473999, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 14080 + }, + { + "epoch": 0.4493127969641889, + "grad_norm": 0.3445344865322113, + "learning_rate": 0.0005, + "loss": 1.1173, + "step": 14090 + }, + { + "epoch": 0.4496316846838228, + "grad_norm": 0.32825151085853577, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 14100 + }, + { + "epoch": 0.44995057240345676, + "grad_norm": 0.3463006317615509, + "learning_rate": 0.0005, + "loss": 1.1239, + "step": 14110 + }, + { + "epoch": 0.45026946012309066, + "grad_norm": 0.3307591676712036, + "learning_rate": 0.0005, + "loss": 1.112, + "step": 14120 + }, + { + "epoch": 0.45058834784272456, + "grad_norm": 0.33084917068481445, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 14130 + }, + { + "epoch": 0.4509072355623585, + "grad_norm": 0.3341514468193054, + "learning_rate": 0.0005, + "loss": 1.1146, + "step": 14140 + }, + { + "epoch": 0.4512261232819924, + "grad_norm": 0.3277684450149536, + "learning_rate": 0.0005, + "loss": 1.144, + "step": 14150 + }, + { + "epoch": 0.4515450110016263, + "grad_norm": 0.34516265988349915, + "learning_rate": 0.0005, + "loss": 1.1197, + "step": 14160 + }, + { + "epoch": 0.45186389872126026, + "grad_norm": 0.32295408844947815, + "learning_rate": 0.0005, + "loss": 1.1117, + "step": 14170 + }, + { + "epoch": 0.45218278644089416, + "grad_norm": 0.3291379511356354, + "learning_rate": 0.0005, + "loss": 1.1206, + "step": 14180 + }, + { + "epoch": 0.45250167416052806, + "grad_norm": 0.3296593129634857, + "learning_rate": 0.0005, + "loss": 1.1071, + "step": 14190 + }, + { + "epoch": 0.452820561880162, + "grad_norm": 0.3422677516937256, + "learning_rate": 0.0005, + "loss": 1.1355, + "step": 14200 + }, + { + "epoch": 0.4531394495997959, + "grad_norm": 0.3407352566719055, + "learning_rate": 0.0005, + "loss": 1.1322, + "step": 14210 + }, + { + "epoch": 0.4534583373194298, + "grad_norm": 0.3275332748889923, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 14220 + }, + { + "epoch": 0.45377722503906376, + "grad_norm": 0.33814772963523865, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 14230 + }, + { + "epoch": 0.45409611275869766, + "grad_norm": 0.3463352918624878, + "learning_rate": 0.0005, + "loss": 1.1066, + "step": 14240 + }, + { + "epoch": 0.45441500047833155, + "grad_norm": 0.3440384864807129, + "learning_rate": 0.0005, + "loss": 1.1264, + "step": 14250 + }, + { + "epoch": 0.4547338881979655, + "grad_norm": 0.3311120271682739, + "learning_rate": 0.0005, + "loss": 1.1377, + "step": 14260 + }, + { + "epoch": 0.4550527759175994, + "grad_norm": 0.3387615978717804, + "learning_rate": 0.0005, + "loss": 1.1166, + "step": 14270 + }, + { + "epoch": 0.4553716636372333, + "grad_norm": 0.33410459756851196, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 14280 + }, + { + "epoch": 0.45569055135686726, + "grad_norm": 0.34034672379493713, + "learning_rate": 0.0005, + "loss": 1.134, + "step": 14290 + }, + { + "epoch": 0.45600943907650116, + "grad_norm": 0.33677056431770325, + "learning_rate": 0.0005, + "loss": 1.1329, + "step": 14300 + }, + { + "epoch": 0.45632832679613505, + "grad_norm": 0.3402319550514221, + "learning_rate": 0.0005, + "loss": 1.1325, + "step": 14310 + }, + { + "epoch": 0.456647214515769, + "grad_norm": 0.3517909646034241, + "learning_rate": 0.0005, + "loss": 1.1216, + "step": 14320 + }, + { + "epoch": 0.4569661022354029, + "grad_norm": 0.3427649438381195, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 14330 + }, + { + "epoch": 0.45728498995503686, + "grad_norm": 0.339320570230484, + "learning_rate": 0.0005, + "loss": 1.1099, + "step": 14340 + }, + { + "epoch": 0.45760387767467076, + "grad_norm": 0.3295920193195343, + "learning_rate": 0.0005, + "loss": 1.12, + "step": 14350 + }, + { + "epoch": 0.45792276539430465, + "grad_norm": 0.331986665725708, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 14360 + }, + { + "epoch": 0.4582416531139386, + "grad_norm": 0.3381638526916504, + "learning_rate": 0.0005, + "loss": 1.1441, + "step": 14370 + }, + { + "epoch": 0.4585605408335725, + "grad_norm": 0.3370720446109772, + "learning_rate": 0.0005, + "loss": 1.1319, + "step": 14380 + }, + { + "epoch": 0.4588794285532064, + "grad_norm": 0.3332827091217041, + "learning_rate": 0.0005, + "loss": 1.1128, + "step": 14390 + }, + { + "epoch": 0.45919831627284036, + "grad_norm": 0.33136168122291565, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 14400 + }, + { + "epoch": 0.45951720399247425, + "grad_norm": 0.32325100898742676, + "learning_rate": 0.0005, + "loss": 1.0948, + "step": 14410 + }, + { + "epoch": 0.45983609171210815, + "grad_norm": 0.3420909643173218, + "learning_rate": 0.0005, + "loss": 1.1158, + "step": 14420 + }, + { + "epoch": 0.4601549794317421, + "grad_norm": 0.35374853014945984, + "learning_rate": 0.0005, + "loss": 1.1158, + "step": 14430 + }, + { + "epoch": 0.460473867151376, + "grad_norm": 0.338075190782547, + "learning_rate": 0.0005, + "loss": 1.1328, + "step": 14440 + }, + { + "epoch": 0.4607927548710099, + "grad_norm": 0.3302494287490845, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 14450 + }, + { + "epoch": 0.46111164259064386, + "grad_norm": 0.33893561363220215, + "learning_rate": 0.0005, + "loss": 1.1255, + "step": 14460 + }, + { + "epoch": 0.46143053031027775, + "grad_norm": 0.3488835394382477, + "learning_rate": 0.0005, + "loss": 1.114, + "step": 14470 + }, + { + "epoch": 0.46174941802991165, + "grad_norm": 0.3383651077747345, + "learning_rate": 0.0005, + "loss": 1.1055, + "step": 14480 + }, + { + "epoch": 0.4620683057495456, + "grad_norm": 0.3396001160144806, + "learning_rate": 0.0005, + "loss": 1.1138, + "step": 14490 + }, + { + "epoch": 0.4623871934691795, + "grad_norm": 0.3394538164138794, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 14500 + }, + { + "epoch": 0.4627060811888134, + "grad_norm": 0.34485116600990295, + "learning_rate": 0.0005, + "loss": 1.1136, + "step": 14510 + }, + { + "epoch": 0.46302496890844735, + "grad_norm": 0.3296126425266266, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 14520 + }, + { + "epoch": 0.46334385662808125, + "grad_norm": 0.34639716148376465, + "learning_rate": 0.0005, + "loss": 1.138, + "step": 14530 + }, + { + "epoch": 0.46366274434771515, + "grad_norm": 0.32055237889289856, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 14540 + }, + { + "epoch": 0.4639816320673491, + "grad_norm": 0.3470534682273865, + "learning_rate": 0.0005, + "loss": 1.1337, + "step": 14550 + }, + { + "epoch": 0.464300519786983, + "grad_norm": 0.3348836898803711, + "learning_rate": 0.0005, + "loss": 1.1207, + "step": 14560 + }, + { + "epoch": 0.4646194075066169, + "grad_norm": 0.34529122710227966, + "learning_rate": 0.0005, + "loss": 1.1196, + "step": 14570 + }, + { + "epoch": 0.46493829522625085, + "grad_norm": 0.34491243958473206, + "learning_rate": 0.0005, + "loss": 1.1148, + "step": 14580 + }, + { + "epoch": 0.46525718294588475, + "grad_norm": 0.32686635851860046, + "learning_rate": 0.0005, + "loss": 1.1122, + "step": 14590 + }, + { + "epoch": 0.46557607066551865, + "grad_norm": 0.33966755867004395, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 14600 + }, + { + "epoch": 0.4658949583851526, + "grad_norm": 0.3253386914730072, + "learning_rate": 0.0005, + "loss": 1.1237, + "step": 14610 + }, + { + "epoch": 0.4662138461047865, + "grad_norm": 0.3355180621147156, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 14620 + }, + { + "epoch": 0.4665327338244204, + "grad_norm": 0.3447900414466858, + "learning_rate": 0.0005, + "loss": 1.1223, + "step": 14630 + }, + { + "epoch": 0.46685162154405435, + "grad_norm": 0.3398261070251465, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 14640 + }, + { + "epoch": 0.46717050926368825, + "grad_norm": 0.32602953910827637, + "learning_rate": 0.0005, + "loss": 1.1209, + "step": 14650 + }, + { + "epoch": 0.46748939698332215, + "grad_norm": 0.3350808322429657, + "learning_rate": 0.0005, + "loss": 1.1139, + "step": 14660 + }, + { + "epoch": 0.4678082847029561, + "grad_norm": 0.3368006944656372, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 14670 + }, + { + "epoch": 0.46812717242259, + "grad_norm": 0.33167609572410583, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 14680 + }, + { + "epoch": 0.4684460601422239, + "grad_norm": 0.350157767534256, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 14690 + }, + { + "epoch": 0.46876494786185785, + "grad_norm": 0.339697003364563, + "learning_rate": 0.0005, + "loss": 1.1128, + "step": 14700 + }, + { + "epoch": 0.46908383558149175, + "grad_norm": 0.3364179730415344, + "learning_rate": 0.0005, + "loss": 1.1288, + "step": 14710 + }, + { + "epoch": 0.46940272330112565, + "grad_norm": 0.33690929412841797, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 14720 + }, + { + "epoch": 0.4697216110207596, + "grad_norm": 0.334957480430603, + "learning_rate": 0.0005, + "loss": 1.1152, + "step": 14730 + }, + { + "epoch": 0.4700404987403935, + "grad_norm": 0.343614786863327, + "learning_rate": 0.0005, + "loss": 1.129, + "step": 14740 + }, + { + "epoch": 0.47035938646002745, + "grad_norm": 0.3393288254737854, + "learning_rate": 0.0005, + "loss": 1.1313, + "step": 14750 + }, + { + "epoch": 0.47067827417966135, + "grad_norm": 0.3249128460884094, + "learning_rate": 0.0005, + "loss": 1.1023, + "step": 14760 + }, + { + "epoch": 0.47099716189929525, + "grad_norm": 0.3428761065006256, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 14770 + }, + { + "epoch": 0.4713160496189292, + "grad_norm": 0.3276873826980591, + "learning_rate": 0.0005, + "loss": 1.1204, + "step": 14780 + }, + { + "epoch": 0.4716349373385631, + "grad_norm": 0.3779163062572479, + "learning_rate": 0.0005, + "loss": 1.1314, + "step": 14790 + }, + { + "epoch": 0.471953825058197, + "grad_norm": 0.334412157535553, + "learning_rate": 0.0005, + "loss": 1.1271, + "step": 14800 + }, + { + "epoch": 0.47227271277783095, + "grad_norm": 0.3291500508785248, + "learning_rate": 0.0005, + "loss": 1.115, + "step": 14810 + }, + { + "epoch": 0.47259160049746485, + "grad_norm": 0.34810540080070496, + "learning_rate": 0.0005, + "loss": 1.1239, + "step": 14820 + }, + { + "epoch": 0.47291048821709875, + "grad_norm": 0.3348764479160309, + "learning_rate": 0.0005, + "loss": 1.1267, + "step": 14830 + }, + { + "epoch": 0.4732293759367327, + "grad_norm": 0.3247257173061371, + "learning_rate": 0.0005, + "loss": 1.1244, + "step": 14840 + }, + { + "epoch": 0.4735482636563666, + "grad_norm": 0.32146748900413513, + "learning_rate": 0.0005, + "loss": 1.1196, + "step": 14850 + }, + { + "epoch": 0.4738671513760005, + "grad_norm": 0.32966938614845276, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 14860 + }, + { + "epoch": 0.47418603909563445, + "grad_norm": 0.3306904435157776, + "learning_rate": 0.0005, + "loss": 1.1064, + "step": 14870 + }, + { + "epoch": 0.47450492681526835, + "grad_norm": 0.3338717222213745, + "learning_rate": 0.0005, + "loss": 1.1099, + "step": 14880 + }, + { + "epoch": 0.47482381453490224, + "grad_norm": 0.3372955322265625, + "learning_rate": 0.0005, + "loss": 1.117, + "step": 14890 + }, + { + "epoch": 0.4751427022545362, + "grad_norm": 0.34013262391090393, + "learning_rate": 0.0005, + "loss": 1.119, + "step": 14900 + }, + { + "epoch": 0.4754615899741701, + "grad_norm": 0.33529746532440186, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 14910 + }, + { + "epoch": 0.475780477693804, + "grad_norm": 0.3290479779243469, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 14920 + }, + { + "epoch": 0.47609936541343795, + "grad_norm": 0.333850234746933, + "learning_rate": 0.0005, + "loss": 1.124, + "step": 14930 + }, + { + "epoch": 0.47641825313307185, + "grad_norm": 0.32731056213378906, + "learning_rate": 0.0005, + "loss": 1.1333, + "step": 14940 + }, + { + "epoch": 0.47673714085270574, + "grad_norm": 0.3414525091648102, + "learning_rate": 0.0005, + "loss": 1.1063, + "step": 14950 + }, + { + "epoch": 0.4770560285723397, + "grad_norm": 0.3164287507534027, + "learning_rate": 0.0005, + "loss": 1.1181, + "step": 14960 + }, + { + "epoch": 0.4773749162919736, + "grad_norm": 0.33766913414001465, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 14970 + }, + { + "epoch": 0.4776938040116075, + "grad_norm": 0.33459576964378357, + "learning_rate": 0.0005, + "loss": 1.1084, + "step": 14980 + }, + { + "epoch": 0.47801269173124145, + "grad_norm": 0.3171371519565582, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 14990 + }, + { + "epoch": 0.47833157945087534, + "grad_norm": 0.3457358479499817, + "learning_rate": 0.0005, + "loss": 1.1307, + "step": 15000 + }, + { + "epoch": 0.47865046717050924, + "grad_norm": 0.329092800617218, + "learning_rate": 0.0005, + "loss": 1.1265, + "step": 15010 + }, + { + "epoch": 0.4789693548901432, + "grad_norm": 0.34381380677223206, + "learning_rate": 0.0005, + "loss": 1.1096, + "step": 15020 + }, + { + "epoch": 0.4792882426097771, + "grad_norm": 0.3248065710067749, + "learning_rate": 0.0005, + "loss": 1.1278, + "step": 15030 + }, + { + "epoch": 0.479607130329411, + "grad_norm": 0.3197561800479889, + "learning_rate": 0.0005, + "loss": 1.1023, + "step": 15040 + }, + { + "epoch": 0.47992601804904494, + "grad_norm": 0.3313983082771301, + "learning_rate": 0.0005, + "loss": 1.1176, + "step": 15050 + }, + { + "epoch": 0.48024490576867884, + "grad_norm": 0.3262551724910736, + "learning_rate": 0.0005, + "loss": 1.109, + "step": 15060 + }, + { + "epoch": 0.48056379348831274, + "grad_norm": 0.3366710841655731, + "learning_rate": 0.0005, + "loss": 1.1223, + "step": 15070 + }, + { + "epoch": 0.4808826812079467, + "grad_norm": 0.32736074924468994, + "learning_rate": 0.0005, + "loss": 1.1187, + "step": 15080 + }, + { + "epoch": 0.4812015689275806, + "grad_norm": 0.33949264883995056, + "learning_rate": 0.0005, + "loss": 1.1115, + "step": 15090 + }, + { + "epoch": 0.4815204566472145, + "grad_norm": 0.3322353661060333, + "learning_rate": 0.0005, + "loss": 1.1256, + "step": 15100 + }, + { + "epoch": 0.48183934436684844, + "grad_norm": 0.3546300530433655, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 15110 + }, + { + "epoch": 0.48215823208648234, + "grad_norm": 0.31901800632476807, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 15120 + }, + { + "epoch": 0.48247711980611624, + "grad_norm": 0.3382745087146759, + "learning_rate": 0.0005, + "loss": 1.1004, + "step": 15130 + }, + { + "epoch": 0.4827960075257502, + "grad_norm": 0.3378961980342865, + "learning_rate": 0.0005, + "loss": 1.1076, + "step": 15140 + }, + { + "epoch": 0.4831148952453841, + "grad_norm": 0.3255893290042877, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 15150 + }, + { + "epoch": 0.48343378296501804, + "grad_norm": 0.324762225151062, + "learning_rate": 0.0005, + "loss": 1.1073, + "step": 15160 + }, + { + "epoch": 0.48375267068465194, + "grad_norm": 0.3625001907348633, + "learning_rate": 0.0005, + "loss": 1.1125, + "step": 15170 + }, + { + "epoch": 0.48407155840428584, + "grad_norm": 0.35341137647628784, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 15180 + }, + { + "epoch": 0.4843904461239198, + "grad_norm": 0.3476621210575104, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 15190 + }, + { + "epoch": 0.4847093338435537, + "grad_norm": 0.339345246553421, + "learning_rate": 0.0005, + "loss": 1.1371, + "step": 15200 + }, + { + "epoch": 0.4850282215631876, + "grad_norm": 0.3191971182823181, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 15210 + }, + { + "epoch": 0.48534710928282154, + "grad_norm": 0.3185611665248871, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 15220 + }, + { + "epoch": 0.48566599700245544, + "grad_norm": 0.33163318037986755, + "learning_rate": 0.0005, + "loss": 1.1218, + "step": 15230 + }, + { + "epoch": 0.48598488472208934, + "grad_norm": 0.3259880840778351, + "learning_rate": 0.0005, + "loss": 1.121, + "step": 15240 + }, + { + "epoch": 0.4863037724417233, + "grad_norm": 0.32738441228866577, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 15250 + }, + { + "epoch": 0.4866226601613572, + "grad_norm": 0.3310328423976898, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 15260 + }, + { + "epoch": 0.4869415478809911, + "grad_norm": 0.32167747616767883, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 15270 + }, + { + "epoch": 0.48726043560062504, + "grad_norm": 0.32243403792381287, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 15280 + }, + { + "epoch": 0.48757932332025894, + "grad_norm": 0.3249606788158417, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 15290 + }, + { + "epoch": 0.48789821103989284, + "grad_norm": 0.32354891300201416, + "learning_rate": 0.0005, + "loss": 1.1013, + "step": 15300 + }, + { + "epoch": 0.4882170987595268, + "grad_norm": 0.3281603157520294, + "learning_rate": 0.0005, + "loss": 1.0913, + "step": 15310 + }, + { + "epoch": 0.4885359864791607, + "grad_norm": 0.3279193639755249, + "learning_rate": 0.0005, + "loss": 1.1088, + "step": 15320 + }, + { + "epoch": 0.4888548741987946, + "grad_norm": 0.34850239753723145, + "learning_rate": 0.0005, + "loss": 1.1121, + "step": 15330 + }, + { + "epoch": 0.48917376191842854, + "grad_norm": 0.34309151768684387, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 15340 + }, + { + "epoch": 0.48949264963806244, + "grad_norm": 0.3212507665157318, + "learning_rate": 0.0005, + "loss": 1.1025, + "step": 15350 + }, + { + "epoch": 0.48981153735769634, + "grad_norm": 0.329383909702301, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 15360 + }, + { + "epoch": 0.4901304250773303, + "grad_norm": 0.32786908745765686, + "learning_rate": 0.0005, + "loss": 1.1184, + "step": 15370 + }, + { + "epoch": 0.4904493127969642, + "grad_norm": 0.3314971327781677, + "learning_rate": 0.0005, + "loss": 1.1109, + "step": 15380 + }, + { + "epoch": 0.4907682005165981, + "grad_norm": 0.3312240540981293, + "learning_rate": 0.0005, + "loss": 1.1164, + "step": 15390 + }, + { + "epoch": 0.49108708823623204, + "grad_norm": 0.33236539363861084, + "learning_rate": 0.0005, + "loss": 1.0962, + "step": 15400 + }, + { + "epoch": 0.49140597595586594, + "grad_norm": 0.33051598072052, + "learning_rate": 0.0005, + "loss": 1.1167, + "step": 15410 + }, + { + "epoch": 0.49172486367549983, + "grad_norm": 0.33629119396209717, + "learning_rate": 0.0005, + "loss": 1.1157, + "step": 15420 + }, + { + "epoch": 0.4920437513951338, + "grad_norm": 0.33263641595840454, + "learning_rate": 0.0005, + "loss": 1.1083, + "step": 15430 + }, + { + "epoch": 0.4923626391147677, + "grad_norm": 0.32079949975013733, + "learning_rate": 0.0005, + "loss": 1.1283, + "step": 15440 + }, + { + "epoch": 0.4926815268344016, + "grad_norm": 0.34400150179862976, + "learning_rate": 0.0005, + "loss": 1.1332, + "step": 15450 + }, + { + "epoch": 0.49300041455403554, + "grad_norm": 0.32528164982795715, + "learning_rate": 0.0005, + "loss": 1.1287, + "step": 15460 + }, + { + "epoch": 0.49331930227366944, + "grad_norm": 0.34158089756965637, + "learning_rate": 0.0005, + "loss": 1.1292, + "step": 15470 + }, + { + "epoch": 0.49363818999330333, + "grad_norm": 0.3264855444431305, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 15480 + }, + { + "epoch": 0.4939570777129373, + "grad_norm": 0.3351936638355255, + "learning_rate": 0.0005, + "loss": 1.1266, + "step": 15490 + }, + { + "epoch": 0.4942759654325712, + "grad_norm": 0.3334040939807892, + "learning_rate": 0.0005, + "loss": 1.102, + "step": 15500 + }, + { + "epoch": 0.4945948531522051, + "grad_norm": 0.34181714057922363, + "learning_rate": 0.0005, + "loss": 1.1126, + "step": 15510 + }, + { + "epoch": 0.49491374087183904, + "grad_norm": 0.32649973034858704, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 15520 + }, + { + "epoch": 0.49523262859147293, + "grad_norm": 0.33181843161582947, + "learning_rate": 0.0005, + "loss": 1.1101, + "step": 15530 + }, + { + "epoch": 0.49555151631110683, + "grad_norm": 0.3195984661579132, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 15540 + }, + { + "epoch": 0.4958704040307408, + "grad_norm": 0.3320752680301666, + "learning_rate": 0.0005, + "loss": 1.1157, + "step": 15550 + }, + { + "epoch": 0.4961892917503747, + "grad_norm": 0.3402622640132904, + "learning_rate": 0.0005, + "loss": 1.1054, + "step": 15560 + }, + { + "epoch": 0.49650817947000864, + "grad_norm": 0.32147037982940674, + "learning_rate": 0.0005, + "loss": 1.1161, + "step": 15570 + }, + { + "epoch": 0.49682706718964254, + "grad_norm": 0.32301703095436096, + "learning_rate": 0.0005, + "loss": 1.1137, + "step": 15580 + }, + { + "epoch": 0.49714595490927643, + "grad_norm": 0.3312668800354004, + "learning_rate": 0.0005, + "loss": 1.1193, + "step": 15590 + }, + { + "epoch": 0.4974648426289104, + "grad_norm": 0.3265191316604614, + "learning_rate": 0.0005, + "loss": 1.1028, + "step": 15600 + }, + { + "epoch": 0.4977837303485443, + "grad_norm": 0.3307774066925049, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 15610 + }, + { + "epoch": 0.4981026180681782, + "grad_norm": 0.3307434320449829, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 15620 + }, + { + "epoch": 0.49842150578781214, + "grad_norm": 0.3392607867717743, + "learning_rate": 0.0005, + "loss": 1.0973, + "step": 15630 + }, + { + "epoch": 0.49874039350744603, + "grad_norm": 0.33843058347702026, + "learning_rate": 0.0005, + "loss": 1.1075, + "step": 15640 + }, + { + "epoch": 0.49905928122707993, + "grad_norm": 0.3343130946159363, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 15650 + }, + { + "epoch": 0.4993781689467139, + "grad_norm": 0.3436983823776245, + "learning_rate": 0.0005, + "loss": 1.1301, + "step": 15660 + }, + { + "epoch": 0.4996970566663478, + "grad_norm": 0.33115842938423157, + "learning_rate": 0.0005, + "loss": 1.1082, + "step": 15670 + }, + { + "epoch": 0.5000159443859817, + "grad_norm": 0.3339487612247467, + "learning_rate": 0.0005, + "loss": 1.1121, + "step": 15680 + }, + { + "epoch": 0.5003348321056156, + "grad_norm": 0.32088544964790344, + "learning_rate": 0.0005, + "loss": 1.1045, + "step": 15690 + }, + { + "epoch": 0.5006537198252495, + "grad_norm": 0.32219377160072327, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 15700 + }, + { + "epoch": 0.5009726075448835, + "grad_norm": 0.32998543977737427, + "learning_rate": 0.0005, + "loss": 1.1107, + "step": 15710 + }, + { + "epoch": 0.5012914952645173, + "grad_norm": 0.32973790168762207, + "learning_rate": 0.0005, + "loss": 1.1204, + "step": 15720 + }, + { + "epoch": 0.5016103829841513, + "grad_norm": 0.32395172119140625, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 15730 + }, + { + "epoch": 0.5019292707037852, + "grad_norm": 0.328885942697525, + "learning_rate": 0.0005, + "loss": 1.1309, + "step": 15740 + }, + { + "epoch": 0.5022481584234191, + "grad_norm": 0.33086928725242615, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 15750 + }, + { + "epoch": 0.502567046143053, + "grad_norm": 0.3351631760597229, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 15760 + }, + { + "epoch": 0.502885933862687, + "grad_norm": 0.33762550354003906, + "learning_rate": 0.0005, + "loss": 1.1138, + "step": 15770 + }, + { + "epoch": 0.5032048215823208, + "grad_norm": 0.35110947489738464, + "learning_rate": 0.0005, + "loss": 1.1083, + "step": 15780 + }, + { + "epoch": 0.5035237093019548, + "grad_norm": 0.3249277174472809, + "learning_rate": 0.0005, + "loss": 1.1036, + "step": 15790 + }, + { + "epoch": 0.5038425970215887, + "grad_norm": 0.3232463300228119, + "learning_rate": 0.0005, + "loss": 1.1012, + "step": 15800 + }, + { + "epoch": 0.5041614847412226, + "grad_norm": 0.33399292826652527, + "learning_rate": 0.0005, + "loss": 1.1157, + "step": 15810 + }, + { + "epoch": 0.5044803724608565, + "grad_norm": 0.3410165309906006, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 15820 + }, + { + "epoch": 0.5047992601804905, + "grad_norm": 0.32605990767478943, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 15830 + }, + { + "epoch": 0.5051181479001243, + "grad_norm": 0.3320138156414032, + "learning_rate": 0.0005, + "loss": 1.1122, + "step": 15840 + }, + { + "epoch": 0.5054370356197583, + "grad_norm": 0.3410665690898895, + "learning_rate": 0.0005, + "loss": 1.1164, + "step": 15850 + }, + { + "epoch": 0.5057559233393922, + "grad_norm": 0.34686121344566345, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 15860 + }, + { + "epoch": 0.5060748110590261, + "grad_norm": 0.34597936272621155, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 15870 + }, + { + "epoch": 0.50639369877866, + "grad_norm": 0.3277563750743866, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 15880 + }, + { + "epoch": 0.506712586498294, + "grad_norm": 0.31833529472351074, + "learning_rate": 0.0005, + "loss": 1.0957, + "step": 15890 + }, + { + "epoch": 0.5070314742179278, + "grad_norm": 0.33636802434921265, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 15900 + }, + { + "epoch": 0.5073503619375618, + "grad_norm": 0.32934021949768066, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 15910 + }, + { + "epoch": 0.5076692496571957, + "grad_norm": 0.33739903569221497, + "learning_rate": 0.0005, + "loss": 1.099, + "step": 15920 + }, + { + "epoch": 0.5079881373768296, + "grad_norm": 0.32094883918762207, + "learning_rate": 0.0005, + "loss": 1.1056, + "step": 15930 + }, + { + "epoch": 0.5083070250964635, + "grad_norm": 0.31300342082977295, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 15940 + }, + { + "epoch": 0.5086259128160975, + "grad_norm": 0.3224886655807495, + "learning_rate": 0.0005, + "loss": 1.1031, + "step": 15950 + }, + { + "epoch": 0.5089448005357313, + "grad_norm": 0.33514395356178284, + "learning_rate": 0.0005, + "loss": 1.11, + "step": 15960 + }, + { + "epoch": 0.5092636882553653, + "grad_norm": 0.320422887802124, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 15970 + }, + { + "epoch": 0.5095825759749992, + "grad_norm": 0.3200617730617523, + "learning_rate": 0.0005, + "loss": 1.0999, + "step": 15980 + }, + { + "epoch": 0.5099014636946331, + "grad_norm": 0.32363903522491455, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 15990 + }, + { + "epoch": 0.510220351414267, + "grad_norm": 0.32799628376960754, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 16000 + }, + { + "epoch": 0.510539239133901, + "grad_norm": 0.33881473541259766, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 16010 + }, + { + "epoch": 0.5108581268535348, + "grad_norm": 0.32602325081825256, + "learning_rate": 0.0005, + "loss": 1.1111, + "step": 16020 + }, + { + "epoch": 0.5111770145731688, + "grad_norm": 0.33234670758247375, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 16030 + }, + { + "epoch": 0.5114959022928027, + "grad_norm": 0.3310084640979767, + "learning_rate": 0.0005, + "loss": 1.1222, + "step": 16040 + }, + { + "epoch": 0.5118147900124366, + "grad_norm": 0.3287167549133301, + "learning_rate": 0.0005, + "loss": 1.0986, + "step": 16050 + }, + { + "epoch": 0.5121336777320705, + "grad_norm": 0.33373862504959106, + "learning_rate": 0.0005, + "loss": 1.1214, + "step": 16060 + }, + { + "epoch": 0.5124525654517045, + "grad_norm": 0.3199145793914795, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 16070 + }, + { + "epoch": 0.5127714531713383, + "grad_norm": 0.33313313126564026, + "learning_rate": 0.0005, + "loss": 1.0977, + "step": 16080 + }, + { + "epoch": 0.5130903408909723, + "grad_norm": 0.31770849227905273, + "learning_rate": 0.0005, + "loss": 1.1104, + "step": 16090 + }, + { + "epoch": 0.5134092286106062, + "grad_norm": 0.32164034247398376, + "learning_rate": 0.0005, + "loss": 1.1103, + "step": 16100 + }, + { + "epoch": 0.5137281163302401, + "grad_norm": 0.3254896104335785, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 16110 + }, + { + "epoch": 0.514047004049874, + "grad_norm": 0.3215978443622589, + "learning_rate": 0.0005, + "loss": 1.0996, + "step": 16120 + }, + { + "epoch": 0.514365891769508, + "grad_norm": 0.31912076473236084, + "learning_rate": 0.0005, + "loss": 1.1018, + "step": 16130 + }, + { + "epoch": 0.5146847794891418, + "grad_norm": 0.31868767738342285, + "learning_rate": 0.0005, + "loss": 1.1013, + "step": 16140 + }, + { + "epoch": 0.5150036672087758, + "grad_norm": 0.3318631649017334, + "learning_rate": 0.0005, + "loss": 1.1092, + "step": 16150 + }, + { + "epoch": 0.5153225549284097, + "grad_norm": 0.3318668603897095, + "learning_rate": 0.0005, + "loss": 1.1043, + "step": 16160 + }, + { + "epoch": 0.5156414426480436, + "grad_norm": 0.3503141403198242, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 16170 + }, + { + "epoch": 0.5159603303676775, + "grad_norm": 0.33148205280303955, + "learning_rate": 0.0005, + "loss": 1.0893, + "step": 16180 + }, + { + "epoch": 0.5162792180873115, + "grad_norm": 0.320092111825943, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 16190 + }, + { + "epoch": 0.5165981058069454, + "grad_norm": 0.3361297845840454, + "learning_rate": 0.0005, + "loss": 1.109, + "step": 16200 + }, + { + "epoch": 0.5169169935265793, + "grad_norm": 0.3223029673099518, + "learning_rate": 0.0005, + "loss": 1.1007, + "step": 16210 + }, + { + "epoch": 0.5172358812462132, + "grad_norm": 0.3160358667373657, + "learning_rate": 0.0005, + "loss": 1.1045, + "step": 16220 + }, + { + "epoch": 0.5175547689658472, + "grad_norm": 0.3212055563926697, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 16230 + }, + { + "epoch": 0.517873656685481, + "grad_norm": 0.33216482400894165, + "learning_rate": 0.0005, + "loss": 1.1005, + "step": 16240 + }, + { + "epoch": 0.518192544405115, + "grad_norm": 0.32473310828208923, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 16250 + }, + { + "epoch": 0.5185114321247489, + "grad_norm": 0.3209489583969116, + "learning_rate": 0.0005, + "loss": 1.1025, + "step": 16260 + }, + { + "epoch": 0.5188303198443828, + "grad_norm": 0.31811580061912537, + "learning_rate": 0.0005, + "loss": 1.0915, + "step": 16270 + }, + { + "epoch": 0.5191492075640167, + "grad_norm": 0.32100510597229004, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 16280 + }, + { + "epoch": 0.5194680952836507, + "grad_norm": 0.3171219229698181, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 16290 + }, + { + "epoch": 0.5197869830032845, + "grad_norm": 0.3136873245239258, + "learning_rate": 0.0005, + "loss": 1.1141, + "step": 16300 + }, + { + "epoch": 0.5201058707229185, + "grad_norm": 0.3253128230571747, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 16310 + }, + { + "epoch": 0.5204247584425524, + "grad_norm": 0.3093385696411133, + "learning_rate": 0.0005, + "loss": 1.0969, + "step": 16320 + }, + { + "epoch": 0.5207436461621863, + "grad_norm": 0.33357712626457214, + "learning_rate": 0.0005, + "loss": 1.1024, + "step": 16330 + }, + { + "epoch": 0.5210625338818202, + "grad_norm": 0.32705700397491455, + "learning_rate": 0.0005, + "loss": 1.1166, + "step": 16340 + }, + { + "epoch": 0.5213814216014542, + "grad_norm": 0.3455062508583069, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 16350 + }, + { + "epoch": 0.521700309321088, + "grad_norm": 0.3206673264503479, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 16360 + }, + { + "epoch": 0.522019197040722, + "grad_norm": 0.32268789410591125, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 16370 + }, + { + "epoch": 0.5223380847603559, + "grad_norm": 0.3400406241416931, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 16380 + }, + { + "epoch": 0.5226569724799898, + "grad_norm": 0.32846105098724365, + "learning_rate": 0.0005, + "loss": 1.1309, + "step": 16390 + }, + { + "epoch": 0.5229758601996237, + "grad_norm": 0.3352746069431305, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 16400 + }, + { + "epoch": 0.5232947479192577, + "grad_norm": 0.335052490234375, + "learning_rate": 0.0005, + "loss": 1.0957, + "step": 16410 + }, + { + "epoch": 0.5236136356388915, + "grad_norm": 0.3198140859603882, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 16420 + }, + { + "epoch": 0.5239325233585255, + "grad_norm": 0.33067774772644043, + "learning_rate": 0.0005, + "loss": 1.1192, + "step": 16430 + }, + { + "epoch": 0.5242514110781594, + "grad_norm": 0.3276583254337311, + "learning_rate": 0.0005, + "loss": 1.1113, + "step": 16440 + }, + { + "epoch": 0.5245702987977933, + "grad_norm": 0.3308953046798706, + "learning_rate": 0.0005, + "loss": 1.113, + "step": 16450 + }, + { + "epoch": 0.5248891865174272, + "grad_norm": 0.32457342743873596, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 16460 + }, + { + "epoch": 0.5252080742370612, + "grad_norm": 0.3347114324569702, + "learning_rate": 0.0005, + "loss": 1.1081, + "step": 16470 + }, + { + "epoch": 0.525526961956695, + "grad_norm": 0.3195747435092926, + "learning_rate": 0.0005, + "loss": 1.1221, + "step": 16480 + }, + { + "epoch": 0.525845849676329, + "grad_norm": 0.3215080499649048, + "learning_rate": 0.0005, + "loss": 1.1107, + "step": 16490 + }, + { + "epoch": 0.5261647373959629, + "grad_norm": 0.32074692845344543, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 16500 + }, + { + "epoch": 0.5264836251155968, + "grad_norm": 0.3176482617855072, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 16510 + }, + { + "epoch": 0.5268025128352307, + "grad_norm": 0.3347056806087494, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 16520 + }, + { + "epoch": 0.5271214005548647, + "grad_norm": 0.31140273809432983, + "learning_rate": 0.0005, + "loss": 1.0929, + "step": 16530 + }, + { + "epoch": 0.5274402882744985, + "grad_norm": 0.33070501685142517, + "learning_rate": 0.0005, + "loss": 1.1311, + "step": 16540 + }, + { + "epoch": 0.5277591759941325, + "grad_norm": 0.31990689039230347, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 16550 + }, + { + "epoch": 0.5280780637137664, + "grad_norm": 0.3268377482891083, + "learning_rate": 0.0005, + "loss": 1.1039, + "step": 16560 + }, + { + "epoch": 0.5283969514334003, + "grad_norm": 0.33449193835258484, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 16570 + }, + { + "epoch": 0.5287158391530342, + "grad_norm": 0.3253713548183441, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 16580 + }, + { + "epoch": 0.5290347268726682, + "grad_norm": 0.32997438311576843, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 16590 + }, + { + "epoch": 0.529353614592302, + "grad_norm": 0.30654647946357727, + "learning_rate": 0.0005, + "loss": 1.0962, + "step": 16600 + }, + { + "epoch": 0.529672502311936, + "grad_norm": 0.3221551775932312, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 16610 + }, + { + "epoch": 0.5299913900315699, + "grad_norm": 0.3215653896331787, + "learning_rate": 0.0005, + "loss": 1.1215, + "step": 16620 + }, + { + "epoch": 0.5303102777512038, + "grad_norm": 0.3248690962791443, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 16630 + }, + { + "epoch": 0.5306291654708377, + "grad_norm": 0.3199111223220825, + "learning_rate": 0.0005, + "loss": 1.1017, + "step": 16640 + }, + { + "epoch": 0.5309480531904717, + "grad_norm": 0.3270610570907593, + "learning_rate": 0.0005, + "loss": 1.1033, + "step": 16650 + }, + { + "epoch": 0.5312669409101055, + "grad_norm": 0.31784605979919434, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 16660 + }, + { + "epoch": 0.5315858286297395, + "grad_norm": 0.31250184774398804, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 16670 + }, + { + "epoch": 0.5319047163493734, + "grad_norm": 0.31594663858413696, + "learning_rate": 0.0005, + "loss": 1.1303, + "step": 16680 + }, + { + "epoch": 0.5322236040690073, + "grad_norm": 0.31342098116874695, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 16690 + }, + { + "epoch": 0.5325424917886412, + "grad_norm": 0.33595171570777893, + "learning_rate": 0.0005, + "loss": 1.0904, + "step": 16700 + }, + { + "epoch": 0.5328613795082752, + "grad_norm": 0.32776495814323425, + "learning_rate": 0.0005, + "loss": 1.0886, + "step": 16710 + }, + { + "epoch": 0.533180267227909, + "grad_norm": 0.324022114276886, + "learning_rate": 0.0005, + "loss": 1.1122, + "step": 16720 + }, + { + "epoch": 0.533499154947543, + "grad_norm": 0.32876861095428467, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 16730 + }, + { + "epoch": 0.5338180426671769, + "grad_norm": 0.3247472643852234, + "learning_rate": 0.0005, + "loss": 1.0947, + "step": 16740 + }, + { + "epoch": 0.5341369303868108, + "grad_norm": 0.3151411712169647, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 16750 + }, + { + "epoch": 0.5344558181064447, + "grad_norm": 0.3146439790725708, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 16760 + }, + { + "epoch": 0.5347747058260787, + "grad_norm": 0.3136953115463257, + "learning_rate": 0.0005, + "loss": 1.0998, + "step": 16770 + }, + { + "epoch": 0.5350935935457125, + "grad_norm": 0.3351533114910126, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 16780 + }, + { + "epoch": 0.5354124812653465, + "grad_norm": 0.3329567611217499, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 16790 + }, + { + "epoch": 0.5357313689849804, + "grad_norm": 0.3350735306739807, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 16800 + }, + { + "epoch": 0.5360502567046143, + "grad_norm": 0.3343573212623596, + "learning_rate": 0.0005, + "loss": 1.1199, + "step": 16810 + }, + { + "epoch": 0.5363691444242482, + "grad_norm": 0.3330814242362976, + "learning_rate": 0.0005, + "loss": 1.0983, + "step": 16820 + }, + { + "epoch": 0.5366880321438822, + "grad_norm": 0.3203672766685486, + "learning_rate": 0.0005, + "loss": 1.0973, + "step": 16830 + }, + { + "epoch": 0.537006919863516, + "grad_norm": 0.3277007043361664, + "learning_rate": 0.0005, + "loss": 1.1185, + "step": 16840 + }, + { + "epoch": 0.53732580758315, + "grad_norm": 0.3185708820819855, + "learning_rate": 0.0005, + "loss": 1.1009, + "step": 16850 + }, + { + "epoch": 0.5376446953027839, + "grad_norm": 0.3192184567451477, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 16860 + }, + { + "epoch": 0.5379635830224178, + "grad_norm": 0.33029595017433167, + "learning_rate": 0.0005, + "loss": 1.0923, + "step": 16870 + }, + { + "epoch": 0.5382824707420517, + "grad_norm": 0.3171393871307373, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 16880 + }, + { + "epoch": 0.5386013584616857, + "grad_norm": 0.30802884697914124, + "learning_rate": 0.0005, + "loss": 1.1076, + "step": 16890 + }, + { + "epoch": 0.5389202461813195, + "grad_norm": 0.31417855620384216, + "learning_rate": 0.0005, + "loss": 1.1079, + "step": 16900 + }, + { + "epoch": 0.5392391339009535, + "grad_norm": 0.3120649456977844, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 16910 + }, + { + "epoch": 0.5395580216205874, + "grad_norm": 0.3179928958415985, + "learning_rate": 0.0005, + "loss": 1.1141, + "step": 16920 + }, + { + "epoch": 0.5398769093402213, + "grad_norm": 0.31776371598243713, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 16930 + }, + { + "epoch": 0.5401957970598552, + "grad_norm": 0.31800293922424316, + "learning_rate": 0.0005, + "loss": 1.0993, + "step": 16940 + }, + { + "epoch": 0.5405146847794892, + "grad_norm": 0.3233039081096649, + "learning_rate": 0.0005, + "loss": 1.0912, + "step": 16950 + }, + { + "epoch": 0.540833572499123, + "grad_norm": 0.34119749069213867, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 16960 + }, + { + "epoch": 0.541152460218757, + "grad_norm": 0.3114102780818939, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 16970 + }, + { + "epoch": 0.5414713479383909, + "grad_norm": 0.32388511300086975, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 16980 + }, + { + "epoch": 0.5417902356580248, + "grad_norm": 0.3139544129371643, + "learning_rate": 0.0005, + "loss": 1.1136, + "step": 16990 + }, + { + "epoch": 0.5421091233776587, + "grad_norm": 0.31297817826271057, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 17000 + }, + { + "epoch": 0.5424280110972927, + "grad_norm": 0.3118950128555298, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 17010 + }, + { + "epoch": 0.5427468988169266, + "grad_norm": 0.3142285943031311, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 17020 + }, + { + "epoch": 0.5430657865365605, + "grad_norm": 0.320333868265152, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 17030 + }, + { + "epoch": 0.5433846742561944, + "grad_norm": 0.3135775327682495, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 17040 + }, + { + "epoch": 0.5437035619758284, + "grad_norm": 0.3219400644302368, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 17050 + }, + { + "epoch": 0.5440224496954622, + "grad_norm": 0.3215932250022888, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 17060 + }, + { + "epoch": 0.5443413374150962, + "grad_norm": 0.3155306279659271, + "learning_rate": 0.0005, + "loss": 1.1091, + "step": 17070 + }, + { + "epoch": 0.5446602251347301, + "grad_norm": 0.3236227333545685, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 17080 + }, + { + "epoch": 0.544979112854364, + "grad_norm": 0.333713561296463, + "learning_rate": 0.0005, + "loss": 1.1022, + "step": 17090 + }, + { + "epoch": 0.5452980005739979, + "grad_norm": 0.3219466805458069, + "learning_rate": 0.0005, + "loss": 1.0927, + "step": 17100 + }, + { + "epoch": 0.5456168882936319, + "grad_norm": 0.3192996680736542, + "learning_rate": 0.0005, + "loss": 1.1043, + "step": 17110 + }, + { + "epoch": 0.5459357760132657, + "grad_norm": 0.32369479537010193, + "learning_rate": 0.0005, + "loss": 1.1067, + "step": 17120 + }, + { + "epoch": 0.5462546637328997, + "grad_norm": 0.32851865887641907, + "learning_rate": 0.0005, + "loss": 1.0983, + "step": 17130 + }, + { + "epoch": 0.5465735514525336, + "grad_norm": 0.3223077952861786, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 17140 + }, + { + "epoch": 0.5468924391721675, + "grad_norm": 0.3217220902442932, + "learning_rate": 0.0005, + "loss": 1.0957, + "step": 17150 + }, + { + "epoch": 0.5472113268918014, + "grad_norm": 0.31770792603492737, + "learning_rate": 0.0005, + "loss": 1.0947, + "step": 17160 + }, + { + "epoch": 0.5475302146114354, + "grad_norm": 0.3142428398132324, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 17170 + }, + { + "epoch": 0.5478491023310692, + "grad_norm": 0.3312866687774658, + "learning_rate": 0.0005, + "loss": 1.107, + "step": 17180 + }, + { + "epoch": 0.5481679900507032, + "grad_norm": 0.32655006647109985, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 17190 + }, + { + "epoch": 0.5484868777703371, + "grad_norm": 0.3185037076473236, + "learning_rate": 0.0005, + "loss": 1.1043, + "step": 17200 + }, + { + "epoch": 0.548805765489971, + "grad_norm": 0.3207053542137146, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 17210 + }, + { + "epoch": 0.5491246532096049, + "grad_norm": 0.31583043932914734, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 17220 + }, + { + "epoch": 0.5494435409292389, + "grad_norm": 0.3145662248134613, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 17230 + }, + { + "epoch": 0.5497624286488727, + "grad_norm": 0.3155752718448639, + "learning_rate": 0.0005, + "loss": 1.1009, + "step": 17240 + }, + { + "epoch": 0.5500813163685067, + "grad_norm": 0.32154303789138794, + "learning_rate": 0.0005, + "loss": 1.0996, + "step": 17250 + }, + { + "epoch": 0.5504002040881406, + "grad_norm": 0.3479996621608734, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 17260 + }, + { + "epoch": 0.5507190918077745, + "grad_norm": 0.33058908581733704, + "learning_rate": 0.0005, + "loss": 1.1195, + "step": 17270 + }, + { + "epoch": 0.5510379795274084, + "grad_norm": 0.31440943479537964, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 17280 + }, + { + "epoch": 0.5513568672470424, + "grad_norm": 0.32479768991470337, + "learning_rate": 0.0005, + "loss": 1.1164, + "step": 17290 + }, + { + "epoch": 0.5516757549666762, + "grad_norm": 0.3268716037273407, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 17300 + }, + { + "epoch": 0.5519946426863102, + "grad_norm": 0.3237885534763336, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 17310 + }, + { + "epoch": 0.5523135304059441, + "grad_norm": 0.31847426295280457, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 17320 + }, + { + "epoch": 0.552632418125578, + "grad_norm": 0.3174494802951813, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 17330 + }, + { + "epoch": 0.5529513058452119, + "grad_norm": 0.3171641528606415, + "learning_rate": 0.0005, + "loss": 1.0966, + "step": 17340 + }, + { + "epoch": 0.5532701935648459, + "grad_norm": 0.31176289916038513, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 17350 + }, + { + "epoch": 0.5535890812844797, + "grad_norm": 0.31715038418769836, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 17360 + }, + { + "epoch": 0.5539079690041137, + "grad_norm": 0.3252870440483093, + "learning_rate": 0.0005, + "loss": 1.0895, + "step": 17370 + }, + { + "epoch": 0.5542268567237476, + "grad_norm": 0.3222629725933075, + "learning_rate": 0.0005, + "loss": 1.1029, + "step": 17380 + }, + { + "epoch": 0.5545457444433814, + "grad_norm": 0.3321414291858673, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 17390 + }, + { + "epoch": 0.5548646321630154, + "grad_norm": 0.31622350215911865, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 17400 + }, + { + "epoch": 0.5551835198826494, + "grad_norm": 0.33685582876205444, + "learning_rate": 0.0005, + "loss": 1.0993, + "step": 17410 + }, + { + "epoch": 0.5555024076022832, + "grad_norm": 0.3243965208530426, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 17420 + }, + { + "epoch": 0.5558212953219172, + "grad_norm": 0.331458181142807, + "learning_rate": 0.0005, + "loss": 1.0939, + "step": 17430 + }, + { + "epoch": 0.5561401830415511, + "grad_norm": 0.32177624106407166, + "learning_rate": 0.0005, + "loss": 1.0987, + "step": 17440 + }, + { + "epoch": 0.556459070761185, + "grad_norm": 0.31778931617736816, + "learning_rate": 0.0005, + "loss": 1.0989, + "step": 17450 + }, + { + "epoch": 0.5567779584808189, + "grad_norm": 0.3121318519115448, + "learning_rate": 0.0005, + "loss": 1.0955, + "step": 17460 + }, + { + "epoch": 0.5570968462004529, + "grad_norm": 0.3124259114265442, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 17470 + }, + { + "epoch": 0.5574157339200867, + "grad_norm": 0.3164832890033722, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 17480 + }, + { + "epoch": 0.5577346216397207, + "grad_norm": 0.31521815061569214, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 17490 + }, + { + "epoch": 0.5580535093593546, + "grad_norm": 0.3184082508087158, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 17500 + }, + { + "epoch": 0.5583723970789884, + "grad_norm": 0.33521050214767456, + "learning_rate": 0.0005, + "loss": 1.0955, + "step": 17510 + }, + { + "epoch": 0.5586912847986224, + "grad_norm": 0.3602006733417511, + "learning_rate": 0.0005, + "loss": 1.0959, + "step": 17520 + }, + { + "epoch": 0.5590101725182564, + "grad_norm": 0.33869659900665283, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 17530 + }, + { + "epoch": 0.5593290602378902, + "grad_norm": 0.30730921030044556, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 17540 + }, + { + "epoch": 0.5596479479575242, + "grad_norm": 0.32410943508148193, + "learning_rate": 0.0005, + "loss": 1.1208, + "step": 17550 + }, + { + "epoch": 0.5599668356771581, + "grad_norm": 0.323948472738266, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 17560 + }, + { + "epoch": 0.560285723396792, + "grad_norm": 0.35217639803886414, + "learning_rate": 0.0005, + "loss": 1.0914, + "step": 17570 + }, + { + "epoch": 0.5606046111164259, + "grad_norm": 0.32672759890556335, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 17580 + }, + { + "epoch": 0.5609234988360599, + "grad_norm": 0.3202705979347229, + "learning_rate": 0.0005, + "loss": 1.0918, + "step": 17590 + }, + { + "epoch": 0.5612423865556937, + "grad_norm": 0.31115081906318665, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 17600 + }, + { + "epoch": 0.5615612742753276, + "grad_norm": 0.3230699598789215, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 17610 + }, + { + "epoch": 0.5618801619949616, + "grad_norm": 0.31278496980667114, + "learning_rate": 0.0005, + "loss": 1.093, + "step": 17620 + }, + { + "epoch": 0.5621990497145954, + "grad_norm": 0.32693833112716675, + "learning_rate": 0.0005, + "loss": 1.098, + "step": 17630 + }, + { + "epoch": 0.5625179374342294, + "grad_norm": 0.33379238843917847, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 17640 + }, + { + "epoch": 0.5628368251538634, + "grad_norm": 0.3170439600944519, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 17650 + }, + { + "epoch": 0.5631557128734972, + "grad_norm": 0.3304067850112915, + "learning_rate": 0.0005, + "loss": 1.0921, + "step": 17660 + }, + { + "epoch": 0.5634746005931311, + "grad_norm": 0.3296552300453186, + "learning_rate": 0.0005, + "loss": 1.0899, + "step": 17670 + }, + { + "epoch": 0.5637934883127651, + "grad_norm": 0.3277881145477295, + "learning_rate": 0.0005, + "loss": 1.0861, + "step": 17680 + }, + { + "epoch": 0.5641123760323989, + "grad_norm": 0.3206186592578888, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 17690 + }, + { + "epoch": 0.5644312637520329, + "grad_norm": 0.3092396557331085, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 17700 + }, + { + "epoch": 0.5647501514716669, + "grad_norm": 0.32194575667381287, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 17710 + }, + { + "epoch": 0.5650690391913007, + "grad_norm": 0.32105258107185364, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 17720 + }, + { + "epoch": 0.5653879269109346, + "grad_norm": 0.30944812297821045, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 17730 + }, + { + "epoch": 0.5657068146305686, + "grad_norm": 0.31964802742004395, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 17740 + }, + { + "epoch": 0.5660257023502024, + "grad_norm": 0.317140132188797, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 17750 + }, + { + "epoch": 0.5663445900698364, + "grad_norm": 0.31247347593307495, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 17760 + }, + { + "epoch": 0.5666634777894703, + "grad_norm": 0.317495733499527, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 17770 + }, + { + "epoch": 0.5669823655091042, + "grad_norm": 0.324897438287735, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 17780 + }, + { + "epoch": 0.5673012532287381, + "grad_norm": 0.3336643874645233, + "learning_rate": 0.0005, + "loss": 1.1004, + "step": 17790 + }, + { + "epoch": 0.5676201409483721, + "grad_norm": 0.3118143081665039, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 17800 + }, + { + "epoch": 0.5679390286680059, + "grad_norm": 0.3250415623188019, + "learning_rate": 0.0005, + "loss": 1.0936, + "step": 17810 + }, + { + "epoch": 0.5682579163876399, + "grad_norm": 0.32718950510025024, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 17820 + }, + { + "epoch": 0.5685768041072738, + "grad_norm": 0.3129884898662567, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 17830 + }, + { + "epoch": 0.5688956918269078, + "grad_norm": 0.3273843228816986, + "learning_rate": 0.0005, + "loss": 1.0775, + "step": 17840 + }, + { + "epoch": 0.5692145795465416, + "grad_norm": 0.3269795775413513, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 17850 + }, + { + "epoch": 0.5695334672661756, + "grad_norm": 0.3237743079662323, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 17860 + }, + { + "epoch": 0.5698523549858096, + "grad_norm": 0.31734833121299744, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 17870 + }, + { + "epoch": 0.5701712427054434, + "grad_norm": 0.33148670196533203, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 17880 + }, + { + "epoch": 0.5704901304250773, + "grad_norm": 0.31050851941108704, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 17890 + }, + { + "epoch": 0.5708090181447113, + "grad_norm": 0.327975869178772, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 17900 + }, + { + "epoch": 0.5711279058643451, + "grad_norm": 0.30302131175994873, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 17910 + }, + { + "epoch": 0.5714467935839791, + "grad_norm": 0.30800721049308777, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 17920 + }, + { + "epoch": 0.571765681303613, + "grad_norm": 0.3125106394290924, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 17930 + }, + { + "epoch": 0.5720845690232469, + "grad_norm": 0.3220685124397278, + "learning_rate": 0.0005, + "loss": 1.1014, + "step": 17940 + }, + { + "epoch": 0.5724034567428808, + "grad_norm": 0.3280767798423767, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 17950 + }, + { + "epoch": 0.5727223444625148, + "grad_norm": 0.3060448467731476, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 17960 + }, + { + "epoch": 0.5730412321821486, + "grad_norm": 0.30415078997612, + "learning_rate": 0.0005, + "loss": 1.0799, + "step": 17970 + }, + { + "epoch": 0.5733601199017826, + "grad_norm": 0.32798802852630615, + "learning_rate": 0.0005, + "loss": 1.0938, + "step": 17980 + }, + { + "epoch": 0.5736790076214165, + "grad_norm": 0.31149980425834656, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 17990 + }, + { + "epoch": 0.5739978953410504, + "grad_norm": 0.3258190155029297, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 18000 + }, + { + "epoch": 0.5743167830606843, + "grad_norm": 0.3247692883014679, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 18010 + }, + { + "epoch": 0.5746356707803183, + "grad_norm": 0.3152218759059906, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 18020 + }, + { + "epoch": 0.5749545584999521, + "grad_norm": 0.32625725865364075, + "learning_rate": 0.0005, + "loss": 1.0934, + "step": 18030 + }, + { + "epoch": 0.5752734462195861, + "grad_norm": 0.3017418682575226, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 18040 + }, + { + "epoch": 0.57559233393922, + "grad_norm": 0.3239911198616028, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 18050 + }, + { + "epoch": 0.5759112216588539, + "grad_norm": 0.3368164002895355, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 18060 + }, + { + "epoch": 0.5762301093784878, + "grad_norm": 0.3111822307109833, + "learning_rate": 0.0005, + "loss": 1.0862, + "step": 18070 + }, + { + "epoch": 0.5765489970981218, + "grad_norm": 0.3116544485092163, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 18080 + }, + { + "epoch": 0.5768678848177556, + "grad_norm": 0.3096770942211151, + "learning_rate": 0.0005, + "loss": 1.1032, + "step": 18090 + }, + { + "epoch": 0.5771867725373896, + "grad_norm": 0.3190501630306244, + "learning_rate": 0.0005, + "loss": 1.0994, + "step": 18100 + }, + { + "epoch": 0.5775056602570235, + "grad_norm": 0.31052494049072266, + "learning_rate": 0.0005, + "loss": 1.0836, + "step": 18110 + }, + { + "epoch": 0.5778245479766574, + "grad_norm": 0.33043143153190613, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 18120 + }, + { + "epoch": 0.5781434356962913, + "grad_norm": 0.30940407514572144, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 18130 + }, + { + "epoch": 0.5784623234159253, + "grad_norm": 0.3174685537815094, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 18140 + }, + { + "epoch": 0.5787812111355591, + "grad_norm": 0.30869245529174805, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 18150 + }, + { + "epoch": 0.5791000988551931, + "grad_norm": 0.32156649231910706, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 18160 + }, + { + "epoch": 0.579418986574827, + "grad_norm": 0.3175433278083801, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 18170 + }, + { + "epoch": 0.5797378742944609, + "grad_norm": 0.3061865270137787, + "learning_rate": 0.0005, + "loss": 1.0898, + "step": 18180 + }, + { + "epoch": 0.5800567620140948, + "grad_norm": 0.31781259179115295, + "learning_rate": 0.0005, + "loss": 1.0893, + "step": 18190 + }, + { + "epoch": 0.5803756497337288, + "grad_norm": 0.32358020544052124, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 18200 + }, + { + "epoch": 0.5806945374533626, + "grad_norm": 0.3164525330066681, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 18210 + }, + { + "epoch": 0.5810134251729966, + "grad_norm": 0.3080329895019531, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 18220 + }, + { + "epoch": 0.5813323128926305, + "grad_norm": 0.30992263555526733, + "learning_rate": 0.0005, + "loss": 1.1065, + "step": 18230 + }, + { + "epoch": 0.5816512006122644, + "grad_norm": 0.31690654158592224, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 18240 + }, + { + "epoch": 0.5819700883318983, + "grad_norm": 0.3134094774723053, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 18250 + }, + { + "epoch": 0.5822889760515323, + "grad_norm": 0.3203316032886505, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 18260 + }, + { + "epoch": 0.5826078637711661, + "grad_norm": 0.3246511220932007, + "learning_rate": 0.0005, + "loss": 1.0917, + "step": 18270 + }, + { + "epoch": 0.5829267514908001, + "grad_norm": 0.3299003541469574, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 18280 + }, + { + "epoch": 0.583245639210434, + "grad_norm": 0.3138013184070587, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 18290 + }, + { + "epoch": 0.5835645269300679, + "grad_norm": 0.3196541666984558, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 18300 + }, + { + "epoch": 0.5838834146497018, + "grad_norm": 0.31367239356040955, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 18310 + }, + { + "epoch": 0.5842023023693358, + "grad_norm": 0.31627488136291504, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 18320 + }, + { + "epoch": 0.5845211900889696, + "grad_norm": 0.32422131299972534, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 18330 + }, + { + "epoch": 0.5848400778086036, + "grad_norm": 0.3215131163597107, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 18340 + }, + { + "epoch": 0.5851589655282375, + "grad_norm": 0.29539674520492554, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 18350 + }, + { + "epoch": 0.5854778532478714, + "grad_norm": 0.3192177712917328, + "learning_rate": 0.0005, + "loss": 1.1016, + "step": 18360 + }, + { + "epoch": 0.5857967409675053, + "grad_norm": 0.32366615533828735, + "learning_rate": 0.0005, + "loss": 1.1019, + "step": 18370 + }, + { + "epoch": 0.5861156286871393, + "grad_norm": 0.34069007635116577, + "learning_rate": 0.0005, + "loss": 1.1053, + "step": 18380 + }, + { + "epoch": 0.5864345164067731, + "grad_norm": 0.3245450556278229, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 18390 + }, + { + "epoch": 0.5867534041264071, + "grad_norm": 0.31849008798599243, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 18400 + }, + { + "epoch": 0.587072291846041, + "grad_norm": 0.3136410713195801, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 18410 + }, + { + "epoch": 0.5873911795656749, + "grad_norm": 0.32406342029571533, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 18420 + }, + { + "epoch": 0.5877100672853088, + "grad_norm": 0.3156557083129883, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 18430 + }, + { + "epoch": 0.5880289550049428, + "grad_norm": 0.31240981817245483, + "learning_rate": 0.0005, + "loss": 1.0888, + "step": 18440 + }, + { + "epoch": 0.5883478427245766, + "grad_norm": 0.3174673914909363, + "learning_rate": 0.0005, + "loss": 1.0876, + "step": 18450 + }, + { + "epoch": 0.5886667304442106, + "grad_norm": 0.30896031856536865, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 18460 + }, + { + "epoch": 0.5889856181638445, + "grad_norm": 0.3110184967517853, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 18470 + }, + { + "epoch": 0.5893045058834784, + "grad_norm": 0.3145858645439148, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 18480 + }, + { + "epoch": 0.5896233936031123, + "grad_norm": 0.30939409136772156, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 18490 + }, + { + "epoch": 0.5899422813227463, + "grad_norm": 0.30431243777275085, + "learning_rate": 0.0005, + "loss": 1.087, + "step": 18500 + }, + { + "epoch": 0.5902611690423801, + "grad_norm": 0.31970348954200745, + "learning_rate": 0.0005, + "loss": 1.0854, + "step": 18510 + }, + { + "epoch": 0.5905800567620141, + "grad_norm": 0.3159744441509247, + "learning_rate": 0.0005, + "loss": 1.0932, + "step": 18520 + }, + { + "epoch": 0.590898944481648, + "grad_norm": 0.32231128215789795, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 18530 + }, + { + "epoch": 0.5912178322012819, + "grad_norm": 0.31338217854499817, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 18540 + }, + { + "epoch": 0.5915367199209158, + "grad_norm": 0.31467384099960327, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 18550 + }, + { + "epoch": 0.5918556076405498, + "grad_norm": 0.3193754255771637, + "learning_rate": 0.0005, + "loss": 1.1047, + "step": 18560 + }, + { + "epoch": 0.5921744953601836, + "grad_norm": 0.3180255591869354, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 18570 + }, + { + "epoch": 0.5924933830798176, + "grad_norm": 0.33617210388183594, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 18580 + }, + { + "epoch": 0.5928122707994515, + "grad_norm": 0.31531816720962524, + "learning_rate": 0.0005, + "loss": 1.0864, + "step": 18590 + }, + { + "epoch": 0.5931311585190854, + "grad_norm": 0.31398406624794006, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 18600 + }, + { + "epoch": 0.5934500462387193, + "grad_norm": 0.3130438029766083, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 18610 + }, + { + "epoch": 0.5937689339583533, + "grad_norm": 0.31848466396331787, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 18620 + }, + { + "epoch": 0.5940878216779871, + "grad_norm": 0.3218449652194977, + "learning_rate": 0.0005, + "loss": 1.0859, + "step": 18630 + }, + { + "epoch": 0.5944067093976211, + "grad_norm": 0.3119860589504242, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 18640 + }, + { + "epoch": 0.594725597117255, + "grad_norm": 0.3109588325023651, + "learning_rate": 0.0005, + "loss": 1.0874, + "step": 18650 + }, + { + "epoch": 0.5950444848368889, + "grad_norm": 0.320926696062088, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 18660 + }, + { + "epoch": 0.5953633725565228, + "grad_norm": 0.3240651786327362, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 18670 + }, + { + "epoch": 0.5956822602761568, + "grad_norm": 0.31916317343711853, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 18680 + }, + { + "epoch": 0.5960011479957907, + "grad_norm": 0.3060884177684784, + "learning_rate": 0.0005, + "loss": 1.0777, + "step": 18690 + }, + { + "epoch": 0.5963200357154246, + "grad_norm": 0.3231722414493561, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 18700 + }, + { + "epoch": 0.5966389234350585, + "grad_norm": 0.31437253952026367, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 18710 + }, + { + "epoch": 0.5969578111546925, + "grad_norm": 0.3079948425292969, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 18720 + }, + { + "epoch": 0.5972766988743263, + "grad_norm": 0.3151026964187622, + "learning_rate": 0.0005, + "loss": 1.0825, + "step": 18730 + }, + { + "epoch": 0.5975955865939603, + "grad_norm": 0.30802813172340393, + "learning_rate": 0.0005, + "loss": 1.0981, + "step": 18740 + }, + { + "epoch": 0.5979144743135942, + "grad_norm": 0.305775910615921, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 18750 + }, + { + "epoch": 0.5982333620332281, + "grad_norm": 0.325735479593277, + "learning_rate": 0.0005, + "loss": 1.0901, + "step": 18760 + }, + { + "epoch": 0.598552249752862, + "grad_norm": 0.3160100281238556, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 18770 + }, + { + "epoch": 0.598871137472496, + "grad_norm": 0.31380367279052734, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 18780 + }, + { + "epoch": 0.5991900251921298, + "grad_norm": 0.3192298114299774, + "learning_rate": 0.0005, + "loss": 1.1119, + "step": 18790 + }, + { + "epoch": 0.5995089129117638, + "grad_norm": 0.3143872618675232, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 18800 + }, + { + "epoch": 0.5998278006313977, + "grad_norm": 0.31682097911834717, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 18810 + }, + { + "epoch": 0.6001466883510316, + "grad_norm": 0.31039610505104065, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 18820 + }, + { + "epoch": 0.6004655760706655, + "grad_norm": 0.32007476687431335, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 18830 + }, + { + "epoch": 0.6007844637902995, + "grad_norm": 0.3164651393890381, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 18840 + }, + { + "epoch": 0.6011033515099333, + "grad_norm": 0.32360073924064636, + "learning_rate": 0.0005, + "loss": 1.1139, + "step": 18850 + }, + { + "epoch": 0.6014222392295673, + "grad_norm": 0.3149234354496002, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 18860 + }, + { + "epoch": 0.6017411269492012, + "grad_norm": 0.3178025484085083, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 18870 + }, + { + "epoch": 0.6020600146688351, + "grad_norm": 0.324581116437912, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 18880 + }, + { + "epoch": 0.602378902388469, + "grad_norm": 0.3058132231235504, + "learning_rate": 0.0005, + "loss": 1.0791, + "step": 18890 + }, + { + "epoch": 0.602697790108103, + "grad_norm": 0.30854079127311707, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 18900 + }, + { + "epoch": 0.6030166778277368, + "grad_norm": 0.31456077098846436, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 18910 + }, + { + "epoch": 0.6033355655473708, + "grad_norm": 0.31017518043518066, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 18920 + }, + { + "epoch": 0.6036544532670047, + "grad_norm": 0.3030060827732086, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 18930 + }, + { + "epoch": 0.6039733409866386, + "grad_norm": 0.34125179052352905, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 18940 + }, + { + "epoch": 0.6042922287062725, + "grad_norm": 0.3149068355560303, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 18950 + }, + { + "epoch": 0.6046111164259065, + "grad_norm": 0.32189345359802246, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 18960 + }, + { + "epoch": 0.6049300041455403, + "grad_norm": 0.32223328948020935, + "learning_rate": 0.0005, + "loss": 1.0847, + "step": 18970 + }, + { + "epoch": 0.6052488918651743, + "grad_norm": 0.3165595829486847, + "learning_rate": 0.0005, + "loss": 1.0891, + "step": 18980 + }, + { + "epoch": 0.6055677795848082, + "grad_norm": 0.32246798276901245, + "learning_rate": 0.0005, + "loss": 1.0869, + "step": 18990 + }, + { + "epoch": 0.6058866673044421, + "grad_norm": 0.3145750164985657, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 19000 + }, + { + "epoch": 0.606205555024076, + "grad_norm": 0.3327442705631256, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 19010 + }, + { + "epoch": 0.60652444274371, + "grad_norm": 0.3172445595264435, + "learning_rate": 0.0005, + "loss": 1.0942, + "step": 19020 + }, + { + "epoch": 0.6068433304633438, + "grad_norm": 0.30171164870262146, + "learning_rate": 0.0005, + "loss": 1.0749, + "step": 19030 + }, + { + "epoch": 0.6071622181829778, + "grad_norm": 0.3355121314525604, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 19040 + }, + { + "epoch": 0.6074811059026117, + "grad_norm": 0.3107321262359619, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 19050 + }, + { + "epoch": 0.6077999936222456, + "grad_norm": 0.2994323968887329, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 19060 + }, + { + "epoch": 0.6081188813418795, + "grad_norm": 0.3183419108390808, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 19070 + }, + { + "epoch": 0.6084377690615135, + "grad_norm": 0.30589038133621216, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 19080 + }, + { + "epoch": 0.6087566567811473, + "grad_norm": 0.3241669833660126, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 19090 + }, + { + "epoch": 0.6090755445007813, + "grad_norm": 0.32239434123039246, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 19100 + }, + { + "epoch": 0.6093944322204152, + "grad_norm": 0.30899012088775635, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 19110 + }, + { + "epoch": 0.6097133199400491, + "grad_norm": 0.32763203978538513, + "learning_rate": 0.0005, + "loss": 1.0819, + "step": 19120 + }, + { + "epoch": 0.610032207659683, + "grad_norm": 0.3291943371295929, + "learning_rate": 0.0005, + "loss": 1.0924, + "step": 19130 + }, + { + "epoch": 0.610351095379317, + "grad_norm": 0.3210895359516144, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 19140 + }, + { + "epoch": 0.6106699830989508, + "grad_norm": 0.3067147433757782, + "learning_rate": 0.0005, + "loss": 1.0917, + "step": 19150 + }, + { + "epoch": 0.6109888708185848, + "grad_norm": 0.31053534150123596, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 19160 + }, + { + "epoch": 0.6113077585382187, + "grad_norm": 0.3062504827976227, + "learning_rate": 0.0005, + "loss": 1.0925, + "step": 19170 + }, + { + "epoch": 0.6116266462578526, + "grad_norm": 0.3294965326786041, + "learning_rate": 0.0005, + "loss": 1.09, + "step": 19180 + }, + { + "epoch": 0.6119455339774865, + "grad_norm": 0.3160441219806671, + "learning_rate": 0.0005, + "loss": 1.0911, + "step": 19190 + }, + { + "epoch": 0.6122644216971205, + "grad_norm": 0.31811705231666565, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 19200 + }, + { + "epoch": 0.6125833094167543, + "grad_norm": 0.3058662414550781, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 19210 + }, + { + "epoch": 0.6129021971363883, + "grad_norm": 0.31138697266578674, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 19220 + }, + { + "epoch": 0.6132210848560222, + "grad_norm": 0.30498701333999634, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 19230 + }, + { + "epoch": 0.6135399725756561, + "grad_norm": 0.3057716488838196, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 19240 + }, + { + "epoch": 0.61385886029529, + "grad_norm": 0.31259825825691223, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 19250 + }, + { + "epoch": 0.614177748014924, + "grad_norm": 0.32048723101615906, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 19260 + }, + { + "epoch": 0.6144966357345578, + "grad_norm": 0.3062502443790436, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 19270 + }, + { + "epoch": 0.6148155234541918, + "grad_norm": 0.30581751465797424, + "learning_rate": 0.0005, + "loss": 1.0821, + "step": 19280 + }, + { + "epoch": 0.6151344111738257, + "grad_norm": 0.30549851059913635, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 19290 + }, + { + "epoch": 0.6154532988934596, + "grad_norm": 0.3184187710285187, + "learning_rate": 0.0005, + "loss": 1.0855, + "step": 19300 + }, + { + "epoch": 0.6157721866130935, + "grad_norm": 0.3128301799297333, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 19310 + }, + { + "epoch": 0.6160910743327275, + "grad_norm": 0.3164614737033844, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 19320 + }, + { + "epoch": 0.6164099620523613, + "grad_norm": 0.30828580260276794, + "learning_rate": 0.0005, + "loss": 1.0978, + "step": 19330 + }, + { + "epoch": 0.6167288497719953, + "grad_norm": 0.30957871675491333, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 19340 + }, + { + "epoch": 0.6170477374916292, + "grad_norm": 0.31249192357063293, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 19350 + }, + { + "epoch": 0.6173666252112631, + "grad_norm": 0.32036566734313965, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 19360 + }, + { + "epoch": 0.617685512930897, + "grad_norm": 0.3068339228630066, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 19370 + }, + { + "epoch": 0.618004400650531, + "grad_norm": 0.3214913010597229, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 19380 + }, + { + "epoch": 0.6183232883701648, + "grad_norm": 0.3247632682323456, + "learning_rate": 0.0005, + "loss": 1.095, + "step": 19390 + }, + { + "epoch": 0.6186421760897988, + "grad_norm": 0.3041056990623474, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 19400 + }, + { + "epoch": 0.6189610638094327, + "grad_norm": 0.3186788856983185, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 19410 + }, + { + "epoch": 0.6192799515290666, + "grad_norm": 0.3210011124610901, + "learning_rate": 0.0005, + "loss": 1.0952, + "step": 19420 + }, + { + "epoch": 0.6195988392487005, + "grad_norm": 0.32189202308654785, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 19430 + }, + { + "epoch": 0.6199177269683345, + "grad_norm": 0.30417484045028687, + "learning_rate": 0.0005, + "loss": 1.0834, + "step": 19440 + }, + { + "epoch": 0.6202366146879683, + "grad_norm": 0.31225481629371643, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 19450 + }, + { + "epoch": 0.6205555024076023, + "grad_norm": 0.30717405676841736, + "learning_rate": 0.0005, + "loss": 1.0985, + "step": 19460 + }, + { + "epoch": 0.6208743901272362, + "grad_norm": 0.31118044257164, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 19470 + }, + { + "epoch": 0.6211932778468701, + "grad_norm": 0.31739023327827454, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 19480 + }, + { + "epoch": 0.621512165566504, + "grad_norm": 0.3207088112831116, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 19490 + }, + { + "epoch": 0.621831053286138, + "grad_norm": 0.3196359872817993, + "learning_rate": 0.0005, + "loss": 1.0844, + "step": 19500 + }, + { + "epoch": 0.6221499410057719, + "grad_norm": 0.30430689454078674, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 19510 + }, + { + "epoch": 0.6224688287254058, + "grad_norm": 0.3219655752182007, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 19520 + }, + { + "epoch": 0.6227877164450397, + "grad_norm": 0.31854191422462463, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 19530 + }, + { + "epoch": 0.6231066041646737, + "grad_norm": 0.3191505968570709, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 19540 + }, + { + "epoch": 0.6234254918843075, + "grad_norm": 0.3143983781337738, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 19550 + }, + { + "epoch": 0.6237443796039415, + "grad_norm": 0.31715285778045654, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 19560 + }, + { + "epoch": 0.6240632673235754, + "grad_norm": 0.31628987193107605, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 19570 + }, + { + "epoch": 0.6243821550432093, + "grad_norm": 0.35582295060157776, + "learning_rate": 0.0005, + "loss": 1.0843, + "step": 19580 + }, + { + "epoch": 0.6247010427628432, + "grad_norm": 0.30786216259002686, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 19590 + }, + { + "epoch": 0.6250199304824772, + "grad_norm": 0.31700748205184937, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 19600 + }, + { + "epoch": 0.625338818202111, + "grad_norm": 0.31084170937538147, + "learning_rate": 0.0005, + "loss": 1.0858, + "step": 19610 + }, + { + "epoch": 0.625657705921745, + "grad_norm": 0.3071608245372772, + "learning_rate": 0.0005, + "loss": 1.0952, + "step": 19620 + }, + { + "epoch": 0.6259765936413789, + "grad_norm": 0.3199949562549591, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 19630 + }, + { + "epoch": 0.6262954813610128, + "grad_norm": 0.3328166902065277, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 19640 + }, + { + "epoch": 0.6266143690806467, + "grad_norm": 0.31577205657958984, + "learning_rate": 0.0005, + "loss": 1.082, + "step": 19650 + }, + { + "epoch": 0.6269332568002807, + "grad_norm": 0.3245375454425812, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 19660 + }, + { + "epoch": 0.6272521445199145, + "grad_norm": 0.3172302544116974, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 19670 + }, + { + "epoch": 0.6275710322395485, + "grad_norm": 0.31262245774269104, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 19680 + }, + { + "epoch": 0.6278899199591824, + "grad_norm": 0.30418139696121216, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 19690 + }, + { + "epoch": 0.6282088076788163, + "grad_norm": 0.31656044721603394, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 19700 + }, + { + "epoch": 0.6285276953984502, + "grad_norm": 0.3122449517250061, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 19710 + }, + { + "epoch": 0.6288465831180842, + "grad_norm": 0.3014175593852997, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 19720 + }, + { + "epoch": 0.629165470837718, + "grad_norm": 0.3063112199306488, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 19730 + }, + { + "epoch": 0.629484358557352, + "grad_norm": 0.3160080313682556, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 19740 + }, + { + "epoch": 0.6298032462769859, + "grad_norm": 0.3040753901004791, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 19750 + }, + { + "epoch": 0.6301221339966198, + "grad_norm": 0.31027182936668396, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 19760 + }, + { + "epoch": 0.6304410217162537, + "grad_norm": 0.32170340418815613, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 19770 + }, + { + "epoch": 0.6307599094358877, + "grad_norm": 0.3243429660797119, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 19780 + }, + { + "epoch": 0.6310787971555215, + "grad_norm": 0.3192366361618042, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 19790 + }, + { + "epoch": 0.6313976848751555, + "grad_norm": 0.32173165678977966, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 19800 + }, + { + "epoch": 0.6317165725947894, + "grad_norm": 0.3074743449687958, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 19810 + }, + { + "epoch": 0.6320354603144233, + "grad_norm": 0.3193652927875519, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 19820 + }, + { + "epoch": 0.6323543480340572, + "grad_norm": 0.3196190297603607, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 19830 + }, + { + "epoch": 0.6326732357536912, + "grad_norm": 0.3098837435245514, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 19840 + }, + { + "epoch": 0.632992123473325, + "grad_norm": 0.31178563833236694, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 19850 + }, + { + "epoch": 0.633311011192959, + "grad_norm": 0.31588777899742126, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 19860 + }, + { + "epoch": 0.6336298989125929, + "grad_norm": 0.3056369125843048, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 19870 + }, + { + "epoch": 0.6339487866322268, + "grad_norm": 0.30744126439094543, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 19880 + }, + { + "epoch": 0.6342676743518607, + "grad_norm": 0.3057192265987396, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 19890 + }, + { + "epoch": 0.6345865620714947, + "grad_norm": 0.3178814947605133, + "learning_rate": 0.0005, + "loss": 1.0742, + "step": 19900 + }, + { + "epoch": 0.6349054497911285, + "grad_norm": 0.328322172164917, + "learning_rate": 0.0005, + "loss": 1.0863, + "step": 19910 + }, + { + "epoch": 0.6352243375107625, + "grad_norm": 0.32027745246887207, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 19920 + }, + { + "epoch": 0.6355432252303964, + "grad_norm": 0.3261570334434509, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 19930 + }, + { + "epoch": 0.6358621129500303, + "grad_norm": 0.321636825799942, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 19940 + }, + { + "epoch": 0.6361810006696642, + "grad_norm": 0.2975068986415863, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 19950 + }, + { + "epoch": 0.6364998883892982, + "grad_norm": 0.30960360169410706, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 19960 + }, + { + "epoch": 0.636818776108932, + "grad_norm": 0.31270766258239746, + "learning_rate": 0.0005, + "loss": 1.0889, + "step": 19970 + }, + { + "epoch": 0.637137663828566, + "grad_norm": 0.3046696186065674, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 19980 + }, + { + "epoch": 0.6374565515481999, + "grad_norm": 0.30438730120658875, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 19990 + }, + { + "epoch": 0.6377754392678338, + "grad_norm": 0.3202705383300781, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 20000 + }, + { + "epoch": 0.6380943269874677, + "grad_norm": 0.30801746249198914, + "learning_rate": 0.0005, + "loss": 1.0845, + "step": 20010 + }, + { + "epoch": 0.6384132147071017, + "grad_norm": 0.3115306794643402, + "learning_rate": 0.0005, + "loss": 1.0866, + "step": 20020 + }, + { + "epoch": 0.6387321024267355, + "grad_norm": 0.316930890083313, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 20030 + }, + { + "epoch": 0.6390509901463695, + "grad_norm": 0.2994682192802429, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 20040 + }, + { + "epoch": 0.6393698778660034, + "grad_norm": 0.29843565821647644, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 20050 + }, + { + "epoch": 0.6396887655856373, + "grad_norm": 0.3006264567375183, + "learning_rate": 0.0005, + "loss": 1.0761, + "step": 20060 + }, + { + "epoch": 0.6400076533052712, + "grad_norm": 0.30730554461479187, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 20070 + }, + { + "epoch": 0.6403265410249052, + "grad_norm": 0.3036216199398041, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 20080 + }, + { + "epoch": 0.640645428744539, + "grad_norm": 0.29510772228240967, + "learning_rate": 0.0005, + "loss": 1.0851, + "step": 20090 + }, + { + "epoch": 0.640964316464173, + "grad_norm": 0.3131539821624756, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 20100 + }, + { + "epoch": 0.6412832041838069, + "grad_norm": 0.3116014301776886, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 20110 + }, + { + "epoch": 0.6416020919034408, + "grad_norm": 0.30821144580841064, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 20120 + }, + { + "epoch": 0.6419209796230747, + "grad_norm": 0.3133336007595062, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 20130 + }, + { + "epoch": 0.6422398673427087, + "grad_norm": 0.30954423546791077, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 20140 + }, + { + "epoch": 0.6425587550623425, + "grad_norm": 0.31236380338668823, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 20150 + }, + { + "epoch": 0.6428776427819765, + "grad_norm": 0.3041278123855591, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 20160 + }, + { + "epoch": 0.6431965305016104, + "grad_norm": 0.31309154629707336, + "learning_rate": 0.0005, + "loss": 1.0839, + "step": 20170 + }, + { + "epoch": 0.6435154182212443, + "grad_norm": 0.30441009998321533, + "learning_rate": 0.0005, + "loss": 1.0804, + "step": 20180 + }, + { + "epoch": 0.6438343059408782, + "grad_norm": 0.31017962098121643, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 20190 + }, + { + "epoch": 0.6441531936605122, + "grad_norm": 0.3073233664035797, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 20200 + }, + { + "epoch": 0.644472081380146, + "grad_norm": 0.3151942491531372, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 20210 + }, + { + "epoch": 0.64479096909978, + "grad_norm": 0.31230977177619934, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 20220 + }, + { + "epoch": 0.6451098568194139, + "grad_norm": 0.3134220838546753, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 20230 + }, + { + "epoch": 0.6454287445390477, + "grad_norm": 0.31314539909362793, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 20240 + }, + { + "epoch": 0.6457476322586817, + "grad_norm": 0.3104364573955536, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 20250 + }, + { + "epoch": 0.6460665199783157, + "grad_norm": 0.3331508934497833, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 20260 + }, + { + "epoch": 0.6463854076979495, + "grad_norm": 0.30866751074790955, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 20270 + }, + { + "epoch": 0.6467042954175835, + "grad_norm": 0.30994173884391785, + "learning_rate": 0.0005, + "loss": 1.0934, + "step": 20280 + }, + { + "epoch": 0.6470231831372174, + "grad_norm": 0.3175150454044342, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 20290 + }, + { + "epoch": 0.6473420708568512, + "grad_norm": 0.32161569595336914, + "learning_rate": 0.0005, + "loss": 1.0798, + "step": 20300 + }, + { + "epoch": 0.6476609585764852, + "grad_norm": 0.31296443939208984, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 20310 + }, + { + "epoch": 0.6479798462961192, + "grad_norm": 0.3004898428916931, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 20320 + }, + { + "epoch": 0.6482987340157531, + "grad_norm": 0.30378371477127075, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 20330 + }, + { + "epoch": 0.648617621735387, + "grad_norm": 0.3166843056678772, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 20340 + }, + { + "epoch": 0.6489365094550209, + "grad_norm": 0.31301385164260864, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 20350 + }, + { + "epoch": 0.6492553971746549, + "grad_norm": 0.30465880036354065, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 20360 + }, + { + "epoch": 0.6495742848942887, + "grad_norm": 0.3161429166793823, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 20370 + }, + { + "epoch": 0.6498931726139227, + "grad_norm": 0.30736565589904785, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 20380 + }, + { + "epoch": 0.6502120603335566, + "grad_norm": 0.3056853413581848, + "learning_rate": 0.0004987293446490035, + "loss": 1.047, + "step": 20390 + }, + { + "epoch": 0.6505309480531904, + "grad_norm": 0.32306766510009766, + "learning_rate": 0.000496824661048599, + "loss": 1.0778, + "step": 20400 + }, + { + "epoch": 0.6508498357728244, + "grad_norm": 0.3094659745693207, + "learning_rate": 0.0004949272515732417, + "loss": 1.0593, + "step": 20410 + }, + { + "epoch": 0.6511687234924584, + "grad_norm": 0.3049302101135254, + "learning_rate": 0.0004930370884425194, + "loss": 1.0757, + "step": 20420 + }, + { + "epoch": 0.6514876112120922, + "grad_norm": 0.3081049919128418, + "learning_rate": 0.0004911541439821156, + "loss": 1.0774, + "step": 20430 + }, + { + "epoch": 0.6518064989317262, + "grad_norm": 0.31899651885032654, + "learning_rate": 0.000489278390623404, + "loss": 1.0995, + "step": 20440 + }, + { + "epoch": 0.6521253866513601, + "grad_norm": 0.30184099078178406, + "learning_rate": 0.0004874098009030445, + "loss": 1.0715, + "step": 20450 + }, + { + "epoch": 0.652444274370994, + "grad_norm": 0.31541329622268677, + "learning_rate": 0.0004855483474625822, + "loss": 1.0667, + "step": 20460 + }, + { + "epoch": 0.6527631620906279, + "grad_norm": 0.3150116801261902, + "learning_rate": 0.0004836940030480454, + "loss": 1.0679, + "step": 20470 + }, + { + "epoch": 0.6530820498102619, + "grad_norm": 0.3065341114997864, + "learning_rate": 0.0004818467405095477, + "loss": 1.04, + "step": 20480 + }, + { + "epoch": 0.6534009375298957, + "grad_norm": 0.3076440691947937, + "learning_rate": 0.00048000653280089, + "loss": 1.0669, + "step": 20490 + }, + { + "epoch": 0.6537198252495297, + "grad_norm": 0.31127315759658813, + "learning_rate": 0.00047817335297916465, + "loss": 1.0612, + "step": 20500 + }, + { + "epoch": 0.6540387129691636, + "grad_norm": 0.30329588055610657, + "learning_rate": 0.0004763471742043608, + "loss": 1.0636, + "step": 20510 + }, + { + "epoch": 0.6543576006887974, + "grad_norm": 0.31193721294403076, + "learning_rate": 0.00047452796973897166, + "loss": 1.0676, + "step": 20520 + }, + { + "epoch": 0.6546764884084314, + "grad_norm": 0.30703896284103394, + "learning_rate": 0.0004727157129476028, + "loss": 1.0552, + "step": 20530 + }, + { + "epoch": 0.6549953761280654, + "grad_norm": 0.31341981887817383, + "learning_rate": 0.0004709103772965826, + "loss": 1.074, + "step": 20540 + }, + { + "epoch": 0.6553142638476992, + "grad_norm": 0.3001408874988556, + "learning_rate": 0.0004691119363535731, + "loss": 1.0641, + "step": 20550 + }, + { + "epoch": 0.6556331515673332, + "grad_norm": 0.2930058240890503, + "learning_rate": 0.0004673203637871834, + "loss": 1.0539, + "step": 20560 + }, + { + "epoch": 0.6559520392869671, + "grad_norm": 0.309577614068985, + "learning_rate": 0.0004655356333665843, + "loss": 1.0751, + "step": 20570 + }, + { + "epoch": 0.656270927006601, + "grad_norm": 0.3096447288990021, + "learning_rate": 0.000463757718961124, + "loss": 1.0723, + "step": 20580 + }, + { + "epoch": 0.6565898147262349, + "grad_norm": 0.308167427778244, + "learning_rate": 0.00046198659453994544, + "loss": 1.0593, + "step": 20590 + }, + { + "epoch": 0.6569087024458689, + "grad_norm": 0.3029198944568634, + "learning_rate": 0.0004602222341716053, + "loss": 1.0637, + "step": 20600 + }, + { + "epoch": 0.6572275901655027, + "grad_norm": 0.3003123104572296, + "learning_rate": 0.0004584646120236944, + "loss": 1.0652, + "step": 20610 + }, + { + "epoch": 0.6575464778851366, + "grad_norm": 0.30862095952033997, + "learning_rate": 0.0004567137023624594, + "loss": 1.0593, + "step": 20620 + }, + { + "epoch": 0.6578653656047706, + "grad_norm": 0.29509639739990234, + "learning_rate": 0.0004549694795524261, + "loss": 1.0419, + "step": 20630 + }, + { + "epoch": 0.6581842533244044, + "grad_norm": 0.3133094608783722, + "learning_rate": 0.0004532319180560237, + "loss": 1.1013, + "step": 20640 + }, + { + "epoch": 0.6585031410440384, + "grad_norm": 0.3254597783088684, + "learning_rate": 0.00045150099243321175, + "loss": 1.0659, + "step": 20650 + }, + { + "epoch": 0.6588220287636724, + "grad_norm": 0.3049736022949219, + "learning_rate": 0.00044977667734110685, + "loss": 1.044, + "step": 20660 + }, + { + "epoch": 0.6591409164833062, + "grad_norm": 0.30839237570762634, + "learning_rate": 0.00044805894753361183, + "loss": 1.061, + "step": 20670 + }, + { + "epoch": 0.6594598042029401, + "grad_norm": 0.2982349991798401, + "learning_rate": 0.0004463477778610465, + "loss": 1.0611, + "step": 20680 + }, + { + "epoch": 0.6597786919225741, + "grad_norm": 0.30832356214523315, + "learning_rate": 0.0004446431432697789, + "loss": 1.055, + "step": 20690 + }, + { + "epoch": 0.6600975796422079, + "grad_norm": 0.2984870970249176, + "learning_rate": 0.0004429450188018587, + "loss": 1.0611, + "step": 20700 + }, + { + "epoch": 0.6604164673618419, + "grad_norm": 0.3130272924900055, + "learning_rate": 0.00044125337959465206, + "loss": 1.0632, + "step": 20710 + }, + { + "epoch": 0.6607353550814759, + "grad_norm": 0.3037204146385193, + "learning_rate": 0.00043956820088047696, + "loss": 1.0297, + "step": 20720 + }, + { + "epoch": 0.6610542428011097, + "grad_norm": 0.30242738127708435, + "learning_rate": 0.0004378894579862412, + "loss": 1.0539, + "step": 20730 + }, + { + "epoch": 0.6613731305207436, + "grad_norm": 0.3087151050567627, + "learning_rate": 0.00043621712633308096, + "loss": 1.0575, + "step": 20740 + }, + { + "epoch": 0.6616920182403776, + "grad_norm": 0.3058840036392212, + "learning_rate": 0.00043455118143600075, + "loss": 1.0649, + "step": 20750 + }, + { + "epoch": 0.6620109059600114, + "grad_norm": 0.2988451421260834, + "learning_rate": 0.00043289159890351497, + "loss": 1.0449, + "step": 20760 + }, + { + "epoch": 0.6623297936796454, + "grad_norm": 0.31583669781684875, + "learning_rate": 0.00043123835443729117, + "loss": 1.0668, + "step": 20770 + }, + { + "epoch": 0.6626486813992793, + "grad_norm": 0.3084590435028076, + "learning_rate": 0.0004295914238317937, + "loss": 1.0578, + "step": 20780 + }, + { + "epoch": 0.6629675691189132, + "grad_norm": 0.3043973445892334, + "learning_rate": 0.00042795078297392985, + "loss": 1.0539, + "step": 20790 + }, + { + "epoch": 0.6632864568385471, + "grad_norm": 0.3038099706172943, + "learning_rate": 0.0004263164078426964, + "loss": 1.0719, + "step": 20800 + }, + { + "epoch": 0.6636053445581811, + "grad_norm": 0.3076210021972656, + "learning_rate": 0.0004246882745088281, + "loss": 1.0376, + "step": 20810 + }, + { + "epoch": 0.6639242322778149, + "grad_norm": 0.30958908796310425, + "learning_rate": 0.0004230663591344474, + "loss": 1.0645, + "step": 20820 + }, + { + "epoch": 0.6642431199974489, + "grad_norm": 0.3039079010486603, + "learning_rate": 0.00042145063797271515, + "loss": 1.0444, + "step": 20830 + }, + { + "epoch": 0.6645620077170828, + "grad_norm": 0.30851301550865173, + "learning_rate": 0.00041984108736748337, + "loss": 1.0453, + "step": 20840 + }, + { + "epoch": 0.6648808954367167, + "grad_norm": 0.300443172454834, + "learning_rate": 0.00041823768375294853, + "loss": 1.0507, + "step": 20850 + }, + { + "epoch": 0.6651997831563506, + "grad_norm": 0.30188149213790894, + "learning_rate": 0.0004166404036533064, + "loss": 1.0443, + "step": 20860 + }, + { + "epoch": 0.6655186708759846, + "grad_norm": 0.30435100197792053, + "learning_rate": 0.0004150492236824088, + "loss": 1.0586, + "step": 20870 + }, + { + "epoch": 0.6658375585956184, + "grad_norm": 0.31833893060684204, + "learning_rate": 0.000413464120543421, + "loss": 1.0498, + "step": 20880 + }, + { + "epoch": 0.6661564463152524, + "grad_norm": 0.3018447756767273, + "learning_rate": 0.00041188507102848026, + "loss": 1.0474, + "step": 20890 + }, + { + "epoch": 0.6664753340348863, + "grad_norm": 0.30486419796943665, + "learning_rate": 0.00041031205201835666, + "loss": 1.0525, + "step": 20900 + }, + { + "epoch": 0.6667942217545202, + "grad_norm": 0.3087126910686493, + "learning_rate": 0.0004087450404821141, + "loss": 1.0435, + "step": 20910 + }, + { + "epoch": 0.6671131094741541, + "grad_norm": 0.2946838140487671, + "learning_rate": 0.0004071840134767734, + "loss": 1.0272, + "step": 20920 + }, + { + "epoch": 0.6674319971937881, + "grad_norm": 0.28987836837768555, + "learning_rate": 0.0004056289481469763, + "loss": 1.0473, + "step": 20930 + }, + { + "epoch": 0.6677508849134219, + "grad_norm": 0.3040097951889038, + "learning_rate": 0.00040407982172465073, + "loss": 1.0351, + "step": 20940 + }, + { + "epoch": 0.6680697726330559, + "grad_norm": 0.31401920318603516, + "learning_rate": 0.00040253661152867774, + "loss": 1.0547, + "step": 20950 + }, + { + "epoch": 0.6683886603526898, + "grad_norm": 0.30797773599624634, + "learning_rate": 0.00040099929496455883, + "loss": 1.0548, + "step": 20960 + }, + { + "epoch": 0.6687075480723237, + "grad_norm": 0.2939365804195404, + "learning_rate": 0.0003994678495240859, + "loss": 1.0335, + "step": 20970 + }, + { + "epoch": 0.6690264357919576, + "grad_norm": 0.29866331815719604, + "learning_rate": 0.0003979422527850112, + "loss": 1.0369, + "step": 20980 + }, + { + "epoch": 0.6693453235115916, + "grad_norm": 0.30503347516059875, + "learning_rate": 0.00039642248241071923, + "loss": 1.0408, + "step": 20990 + }, + { + "epoch": 0.6696642112312254, + "grad_norm": 0.29835569858551025, + "learning_rate": 0.0003949085161498995, + "loss": 1.0557, + "step": 21000 + }, + { + "epoch": 0.6699830989508594, + "grad_norm": 0.2976466417312622, + "learning_rate": 0.00039340033183622096, + "loss": 1.0394, + "step": 21010 + }, + { + "epoch": 0.6703019866704933, + "grad_norm": 0.3077746629714966, + "learning_rate": 0.00039189790738800755, + "loss": 1.0403, + "step": 21020 + }, + { + "epoch": 0.6706208743901272, + "grad_norm": 0.32718950510025024, + "learning_rate": 0.00039040122080791445, + "loss": 1.0463, + "step": 21030 + }, + { + "epoch": 0.6709397621097611, + "grad_norm": 0.30107590556144714, + "learning_rate": 0.0003889102501826066, + "loss": 1.043, + "step": 21040 + }, + { + "epoch": 0.6712586498293951, + "grad_norm": 0.2878934144973755, + "learning_rate": 0.00038742497368243723, + "loss": 1.0468, + "step": 21050 + }, + { + "epoch": 0.6715775375490289, + "grad_norm": 0.3092693090438843, + "learning_rate": 0.00038594536956112894, + "loss": 1.0431, + "step": 21060 + }, + { + "epoch": 0.6718964252686629, + "grad_norm": 0.3057478964328766, + "learning_rate": 0.00038447141615545455, + "loss": 1.0429, + "step": 21070 + }, + { + "epoch": 0.6722153129882968, + "grad_norm": 0.3098829388618469, + "learning_rate": 0.00038300309188492066, + "loss": 1.0655, + "step": 21080 + }, + { + "epoch": 0.6725342007079307, + "grad_norm": 0.30573147535324097, + "learning_rate": 0.00038154037525145126, + "loss": 1.0372, + "step": 21090 + }, + { + "epoch": 0.6728530884275646, + "grad_norm": 0.29434239864349365, + "learning_rate": 0.0003800832448390728, + "loss": 1.0116, + "step": 21100 + }, + { + "epoch": 0.6731719761471986, + "grad_norm": 0.3012748658657074, + "learning_rate": 0.00037863167931360134, + "loss": 1.06, + "step": 21110 + }, + { + "epoch": 0.6734908638668324, + "grad_norm": 0.3058761656284332, + "learning_rate": 0.00037718565742232924, + "loss": 1.0392, + "step": 21120 + }, + { + "epoch": 0.6738097515864664, + "grad_norm": 0.30524468421936035, + "learning_rate": 0.0003757451579937149, + "loss": 1.024, + "step": 21130 + }, + { + "epoch": 0.6741286393061003, + "grad_norm": 0.3032868206501007, + "learning_rate": 0.00037431015993707214, + "loss": 1.0362, + "step": 21140 + }, + { + "epoch": 0.6744475270257342, + "grad_norm": 0.30639830231666565, + "learning_rate": 0.00037288064224226184, + "loss": 1.0327, + "step": 21150 + }, + { + "epoch": 0.6747664147453681, + "grad_norm": 0.29574814438819885, + "learning_rate": 0.00037145658397938383, + "loss": 1.0589, + "step": 21160 + }, + { + "epoch": 0.6750853024650021, + "grad_norm": 0.3002104163169861, + "learning_rate": 0.0003700379642984711, + "loss": 1.0235, + "step": 21170 + }, + { + "epoch": 0.675404190184636, + "grad_norm": 0.303792268037796, + "learning_rate": 0.0003686247624291839, + "loss": 1.0553, + "step": 21180 + }, + { + "epoch": 0.6757230779042699, + "grad_norm": 0.30552342534065247, + "learning_rate": 0.000367216957680506, + "loss": 1.0126, + "step": 21190 + }, + { + "epoch": 0.6760419656239038, + "grad_norm": 0.29988184571266174, + "learning_rate": 0.00036581452944044174, + "loss": 1.0194, + "step": 21200 + }, + { + "epoch": 0.6763608533435378, + "grad_norm": 0.29784929752349854, + "learning_rate": 0.0003644174571757142, + "loss": 1.0048, + "step": 21210 + }, + { + "epoch": 0.6766797410631716, + "grad_norm": 0.291316956281662, + "learning_rate": 0.00036302572043146427, + "loss": 1.0252, + "step": 21220 + }, + { + "epoch": 0.6769986287828056, + "grad_norm": 0.2924594283103943, + "learning_rate": 0.00036163929883095176, + "loss": 1.0144, + "step": 21230 + }, + { + "epoch": 0.6773175165024395, + "grad_norm": 0.30861392617225647, + "learning_rate": 0.0003602581720752565, + "loss": 1.0258, + "step": 21240 + }, + { + "epoch": 0.6776364042220734, + "grad_norm": 0.29646366834640503, + "learning_rate": 0.00035888231994298154, + "loss": 1.0436, + "step": 21250 + }, + { + "epoch": 0.6779552919417073, + "grad_norm": 0.299898236989975, + "learning_rate": 0.00035751172228995677, + "loss": 1.0235, + "step": 21260 + }, + { + "epoch": 0.6782741796613413, + "grad_norm": 0.3147020936012268, + "learning_rate": 0.00035614635904894417, + "loss": 1.0335, + "step": 21270 + }, + { + "epoch": 0.6785930673809751, + "grad_norm": 0.299283504486084, + "learning_rate": 0.000354786210229344, + "loss": 1.044, + "step": 21280 + }, + { + "epoch": 0.6789119551006091, + "grad_norm": 0.2966497242450714, + "learning_rate": 0.0003534312559169021, + "loss": 1.0162, + "step": 21290 + }, + { + "epoch": 0.679230842820243, + "grad_norm": 0.29665714502334595, + "learning_rate": 0.00035208147627341824, + "loss": 1.0322, + "step": 21300 + }, + { + "epoch": 0.6795497305398769, + "grad_norm": 0.29577651619911194, + "learning_rate": 0.0003507368515364557, + "loss": 1.0338, + "step": 21310 + }, + { + "epoch": 0.6798686182595108, + "grad_norm": 0.3065530061721802, + "learning_rate": 0.00034939736201905193, + "loss": 1.0233, + "step": 21320 + }, + { + "epoch": 0.6801875059791448, + "grad_norm": 0.29793888330459595, + "learning_rate": 0.0003480629881094304, + "loss": 1.0468, + "step": 21330 + }, + { + "epoch": 0.6805063936987786, + "grad_norm": 0.3028135895729065, + "learning_rate": 0.0003467337102707133, + "loss": 1.0133, + "step": 21340 + }, + { + "epoch": 0.6808252814184126, + "grad_norm": 0.3011240065097809, + "learning_rate": 0.0003454095090406356, + "loss": 1.0428, + "step": 21350 + }, + { + "epoch": 0.6811441691380465, + "grad_norm": 0.3000783622264862, + "learning_rate": 0.00034409036503126023, + "loss": 1.0193, + "step": 21360 + }, + { + "epoch": 0.6814630568576804, + "grad_norm": 0.3108273446559906, + "learning_rate": 0.00034277625892869374, + "loss": 1.0302, + "step": 21370 + }, + { + "epoch": 0.6817819445773143, + "grad_norm": 0.2920081615447998, + "learning_rate": 0.0003414671714928041, + "loss": 1.0516, + "step": 21380 + }, + { + "epoch": 0.6821008322969483, + "grad_norm": 0.30891862511634827, + "learning_rate": 0.00034016308355693865, + "loss": 1.0285, + "step": 21390 + }, + { + "epoch": 0.6824197200165821, + "grad_norm": 0.308363676071167, + "learning_rate": 0.0003388639760276436, + "loss": 1.03, + "step": 21400 + }, + { + "epoch": 0.6827386077362161, + "grad_norm": 0.2917180359363556, + "learning_rate": 0.0003375698298843843, + "loss": 1.0295, + "step": 21410 + }, + { + "epoch": 0.68305749545585, + "grad_norm": 0.2991756796836853, + "learning_rate": 0.00033628062617926724, + "loss": 1.0186, + "step": 21420 + }, + { + "epoch": 0.6833763831754839, + "grad_norm": 0.3052626848220825, + "learning_rate": 0.0003349963460367619, + "loss": 1.02, + "step": 21430 + }, + { + "epoch": 0.6836952708951178, + "grad_norm": 0.29897376894950867, + "learning_rate": 0.0003337169706534251, + "loss": 1.0281, + "step": 21440 + }, + { + "epoch": 0.6840141586147518, + "grad_norm": 0.29893383383750916, + "learning_rate": 0.00033244248129762514, + "loss": 1.0148, + "step": 21450 + }, + { + "epoch": 0.6843330463343856, + "grad_norm": 0.2953765392303467, + "learning_rate": 0.00033117285930926786, + "loss": 1.0206, + "step": 21460 + }, + { + "epoch": 0.6846519340540196, + "grad_norm": 0.3016309142112732, + "learning_rate": 0.0003299080860995236, + "loss": 1.0222, + "step": 21470 + }, + { + "epoch": 0.6849708217736535, + "grad_norm": 0.28592649102211, + "learning_rate": 0.00032864814315055425, + "loss": 1.0199, + "step": 21480 + }, + { + "epoch": 0.6852897094932874, + "grad_norm": 0.3022935688495636, + "learning_rate": 0.00032739301201524317, + "loss": 1.0286, + "step": 21490 + }, + { + "epoch": 0.6856085972129213, + "grad_norm": 0.30078959465026855, + "learning_rate": 0.0003261426743169244, + "loss": 1.0419, + "step": 21500 + }, + { + "epoch": 0.6859274849325553, + "grad_norm": 0.2940540611743927, + "learning_rate": 0.0003248971117491138, + "loss": 1.0184, + "step": 21510 + }, + { + "epoch": 0.6862463726521891, + "grad_norm": 0.2969309389591217, + "learning_rate": 0.00032365630607524107, + "loss": 1.0269, + "step": 21520 + }, + { + "epoch": 0.6865652603718231, + "grad_norm": 0.28925058245658875, + "learning_rate": 0.00032242023912838264, + "loss": 1.0162, + "step": 21530 + }, + { + "epoch": 0.686884148091457, + "grad_norm": 0.2998692989349365, + "learning_rate": 0.00032118889281099574, + "loss": 1.0226, + "step": 21540 + }, + { + "epoch": 0.6872030358110909, + "grad_norm": 0.2965996265411377, + "learning_rate": 0.0003199622490946535, + "loss": 1.0165, + "step": 21550 + }, + { + "epoch": 0.6875219235307248, + "grad_norm": 0.3009476363658905, + "learning_rate": 0.0003187402900197809, + "loss": 1.0253, + "step": 21560 + }, + { + "epoch": 0.6878408112503588, + "grad_norm": 0.30146294832229614, + "learning_rate": 0.0003175229976953918, + "loss": 1.0075, + "step": 21570 + }, + { + "epoch": 0.6881596989699926, + "grad_norm": 0.3076365292072296, + "learning_rate": 0.000316310354298827, + "loss": 0.9962, + "step": 21580 + }, + { + "epoch": 0.6884785866896266, + "grad_norm": 0.3112267255783081, + "learning_rate": 0.00031510234207549347, + "loss": 1.0198, + "step": 21590 + }, + { + "epoch": 0.6887974744092605, + "grad_norm": 0.29509472846984863, + "learning_rate": 0.0003138989433386042, + "loss": 1.0115, + "step": 21600 + }, + { + "epoch": 0.6891163621288944, + "grad_norm": 0.2932247221469879, + "learning_rate": 0.0003127001404689193, + "loss": 1.0168, + "step": 21610 + }, + { + "epoch": 0.6894352498485283, + "grad_norm": 0.29951944947242737, + "learning_rate": 0.0003115059159144878, + "loss": 1.0189, + "step": 21620 + }, + { + "epoch": 0.6897541375681623, + "grad_norm": 0.29388749599456787, + "learning_rate": 0.0003103162521903914, + "loss": 1.0033, + "step": 21630 + }, + { + "epoch": 0.6900730252877961, + "grad_norm": 0.3006175458431244, + "learning_rate": 0.00030913113187848755, + "loss": 1.0147, + "step": 21640 + }, + { + "epoch": 0.6903919130074301, + "grad_norm": 0.29276975989341736, + "learning_rate": 0.00030795053762715507, + "loss": 1.0185, + "step": 21650 + }, + { + "epoch": 0.690710800727064, + "grad_norm": 0.2958676517009735, + "learning_rate": 0.00030677445215103975, + "loss": 1.0211, + "step": 21660 + }, + { + "epoch": 0.6910296884466979, + "grad_norm": 0.30803918838500977, + "learning_rate": 0.00030560285823080156, + "loss": 1.008, + "step": 21670 + }, + { + "epoch": 0.6913485761663318, + "grad_norm": 0.30229854583740234, + "learning_rate": 0.00030443573871286224, + "loss": 1.0236, + "step": 21680 + }, + { + "epoch": 0.6916674638859658, + "grad_norm": 0.30095160007476807, + "learning_rate": 0.0003032730765091543, + "loss": 1.0158, + "step": 21690 + }, + { + "epoch": 0.6919863516055996, + "grad_norm": 0.295236736536026, + "learning_rate": 0.000302114854596871, + "loss": 1.0213, + "step": 21700 + }, + { + "epoch": 0.6923052393252336, + "grad_norm": 0.2985754609107971, + "learning_rate": 0.00030096105601821663, + "loss": 1.0131, + "step": 21710 + }, + { + "epoch": 0.6926241270448675, + "grad_norm": 0.30802398920059204, + "learning_rate": 0.0002998116638801587, + "loss": 1.009, + "step": 21720 + }, + { + "epoch": 0.6929430147645014, + "grad_norm": 0.3082250952720642, + "learning_rate": 0.0002986666613541806, + "loss": 1.0245, + "step": 21730 + }, + { + "epoch": 0.6932619024841353, + "grad_norm": 0.29745855927467346, + "learning_rate": 0.00029752603167603484, + "loss": 1.0063, + "step": 21740 + }, + { + "epoch": 0.6935807902037693, + "grad_norm": 0.29345378279685974, + "learning_rate": 0.00029638975814549775, + "loss": 1.0124, + "step": 21750 + }, + { + "epoch": 0.6938996779234031, + "grad_norm": 0.2981776297092438, + "learning_rate": 0.0002952578241261252, + "loss": 1.0184, + "step": 21760 + }, + { + "epoch": 0.6942185656430371, + "grad_norm": 0.30407872796058655, + "learning_rate": 0.00029413021304500875, + "loss": 1.0117, + "step": 21770 + }, + { + "epoch": 0.694537453362671, + "grad_norm": 0.29665490984916687, + "learning_rate": 0.0002930069083925332, + "loss": 1.0199, + "step": 21780 + }, + { + "epoch": 0.6948563410823049, + "grad_norm": 0.29667824506759644, + "learning_rate": 0.0002918878937221347, + "loss": 1.0138, + "step": 21790 + }, + { + "epoch": 0.6951752288019388, + "grad_norm": 0.2990490794181824, + "learning_rate": 0.00029077315265005994, + "loss": 1.0146, + "step": 21800 + }, + { + "epoch": 0.6954941165215728, + "grad_norm": 0.3064717948436737, + "learning_rate": 0.00028966266885512655, + "loss": 1.0067, + "step": 21810 + }, + { + "epoch": 0.6958130042412066, + "grad_norm": 0.3038541376590729, + "learning_rate": 0.0002885564260784838, + "loss": 1.0198, + "step": 21820 + }, + { + "epoch": 0.6961318919608406, + "grad_norm": 0.30450543761253357, + "learning_rate": 0.00028745440812337464, + "loss": 0.9972, + "step": 21830 + }, + { + "epoch": 0.6964507796804745, + "grad_norm": 0.2951158285140991, + "learning_rate": 0.00028635659885489876, + "loss": 0.9993, + "step": 21840 + }, + { + "epoch": 0.6967696674001084, + "grad_norm": 0.30965596437454224, + "learning_rate": 0.00028526298219977615, + "loss": 1.0153, + "step": 21850 + }, + { + "epoch": 0.6970885551197423, + "grad_norm": 0.3073263168334961, + "learning_rate": 0.0002841735421461118, + "loss": 1.0039, + "step": 21860 + }, + { + "epoch": 0.6974074428393763, + "grad_norm": 0.2883033752441406, + "learning_rate": 0.00028308826274316135, + "loss": 1.0199, + "step": 21870 + }, + { + "epoch": 0.6977263305590101, + "grad_norm": 0.3025035262107849, + "learning_rate": 0.00028200712810109736, + "loss": 1.0074, + "step": 21880 + }, + { + "epoch": 0.6980452182786441, + "grad_norm": 0.29780346155166626, + "learning_rate": 0.00028093012239077697, + "loss": 1.0152, + "step": 21890 + }, + { + "epoch": 0.698364105998278, + "grad_norm": 0.3000519871711731, + "learning_rate": 0.0002798572298435098, + "loss": 1.0103, + "step": 21900 + }, + { + "epoch": 0.6986829937179119, + "grad_norm": 0.28480494022369385, + "learning_rate": 0.00027878843475082743, + "loss": 0.9929, + "step": 21910 + }, + { + "epoch": 0.6990018814375458, + "grad_norm": 0.29734286665916443, + "learning_rate": 0.00027772372146425304, + "loss": 0.9953, + "step": 21920 + }, + { + "epoch": 0.6993207691571798, + "grad_norm": 0.3029066026210785, + "learning_rate": 0.00027666307439507253, + "loss": 1.008, + "step": 21930 + }, + { + "epoch": 0.6996396568768136, + "grad_norm": 0.3026646673679352, + "learning_rate": 0.0002756064780141064, + "loss": 0.9897, + "step": 21940 + }, + { + "epoch": 0.6999585445964476, + "grad_norm": 0.2930282652378082, + "learning_rate": 0.00027455391685148186, + "loss": 0.9982, + "step": 21950 + }, + { + "epoch": 0.7002774323160815, + "grad_norm": 0.30149659514427185, + "learning_rate": 0.0002735053754964071, + "loss": 1.0087, + "step": 21960 + }, + { + "epoch": 0.7005963200357154, + "grad_norm": 0.2959401309490204, + "learning_rate": 0.00027246083859694485, + "loss": 0.9837, + "step": 21970 + }, + { + "epoch": 0.7009152077553493, + "grad_norm": 0.29587122797966003, + "learning_rate": 0.0002714202908597884, + "loss": 1.0099, + "step": 21980 + }, + { + "epoch": 0.7012340954749833, + "grad_norm": 0.2973346412181854, + "learning_rate": 0.00027038371705003693, + "loss": 1.0117, + "step": 21990 + }, + { + "epoch": 0.7015529831946172, + "grad_norm": 0.30666384100914, + "learning_rate": 0.0002693511019909731, + "loss": 1.0027, + "step": 22000 + }, + { + "epoch": 0.7018718709142511, + "grad_norm": 0.2986239194869995, + "learning_rate": 0.0002683224305638403, + "loss": 1.0127, + "step": 22010 + }, + { + "epoch": 0.702190758633885, + "grad_norm": 0.30097588896751404, + "learning_rate": 0.0002672976877076218, + "loss": 0.9949, + "step": 22020 + }, + { + "epoch": 0.702509646353519, + "grad_norm": 0.3022991120815277, + "learning_rate": 0.0002662768584188198, + "loss": 1.009, + "step": 22030 + }, + { + "epoch": 0.7028285340731528, + "grad_norm": 0.29063522815704346, + "learning_rate": 0.0002652599277512359, + "loss": 0.9988, + "step": 22040 + }, + { + "epoch": 0.7031474217927868, + "grad_norm": 0.3014470636844635, + "learning_rate": 0.00026424688081575263, + "loss": 1.0068, + "step": 22050 + }, + { + "epoch": 0.7034663095124207, + "grad_norm": 0.2925295829772949, + "learning_rate": 0.0002632377027801149, + "loss": 0.9807, + "step": 22060 + }, + { + "epoch": 0.7037851972320546, + "grad_norm": 0.3046445846557617, + "learning_rate": 0.000262232378868713, + "loss": 1.0161, + "step": 22070 + }, + { + "epoch": 0.7041040849516885, + "grad_norm": 0.30122342705726624, + "learning_rate": 0.00026123089436236655, + "loss": 1.0032, + "step": 22080 + }, + { + "epoch": 0.7044229726713225, + "grad_norm": 0.29070591926574707, + "learning_rate": 0.0002602332345981087, + "loss": 0.9921, + "step": 22090 + }, + { + "epoch": 0.7047418603909563, + "grad_norm": 0.29186105728149414, + "learning_rate": 0.00025923938496897155, + "loss": 1.0109, + "step": 22100 + }, + { + "epoch": 0.7050607481105903, + "grad_norm": 0.3077830970287323, + "learning_rate": 0.0002582493309237722, + "loss": 0.9994, + "step": 22110 + }, + { + "epoch": 0.7053796358302242, + "grad_norm": 0.2919236719608307, + "learning_rate": 0.0002572630579669, + "loss": 0.9873, + "step": 22120 + }, + { + "epoch": 0.7056985235498581, + "grad_norm": 0.2887571454048157, + "learning_rate": 0.0002562805516581037, + "loss": 1.0074, + "step": 22130 + }, + { + "epoch": 0.706017411269492, + "grad_norm": 0.28118544816970825, + "learning_rate": 0.00025530179761228084, + "loss": 0.9953, + "step": 22140 + }, + { + "epoch": 0.706336298989126, + "grad_norm": 0.2915787100791931, + "learning_rate": 0.00025432678149926643, + "loss": 0.9959, + "step": 22150 + }, + { + "epoch": 0.7066551867087598, + "grad_norm": 0.2891466021537781, + "learning_rate": 0.00025335548904362355, + "loss": 1.01, + "step": 22160 + }, + { + "epoch": 0.7069740744283938, + "grad_norm": 0.29659250378608704, + "learning_rate": 0.0002523879060244341, + "loss": 0.9755, + "step": 22170 + }, + { + "epoch": 0.7072929621480277, + "grad_norm": 0.30222389101982117, + "learning_rate": 0.00025142401827509084, + "loss": 0.9955, + "step": 22180 + }, + { + "epoch": 0.7076118498676616, + "grad_norm": 0.2955167293548584, + "learning_rate": 0.00025046381168308975, + "loss": 1.005, + "step": 22190 + }, + { + "epoch": 0.7079307375872955, + "grad_norm": 0.2983476221561432, + "learning_rate": 0.00024950727218982346, + "loss": 0.9933, + "step": 22200 + }, + { + "epoch": 0.7082496253069295, + "grad_norm": 0.29820889234542847, + "learning_rate": 0.00024855438579037556, + "loss": 0.9969, + "step": 22210 + }, + { + "epoch": 0.7085685130265633, + "grad_norm": 0.2924354076385498, + "learning_rate": 0.0002476051385333152, + "loss": 0.9908, + "step": 22220 + }, + { + "epoch": 0.7088874007461973, + "grad_norm": 0.28841105103492737, + "learning_rate": 0.0002466595165204934, + "loss": 1.0096, + "step": 22230 + }, + { + "epoch": 0.7092062884658312, + "grad_norm": 0.30202123522758484, + "learning_rate": 0.000245717505906839, + "loss": 1.0128, + "step": 22240 + }, + { + "epoch": 0.7095251761854651, + "grad_norm": 0.28892645239830017, + "learning_rate": 0.00024477909290015614, + "loss": 0.9837, + "step": 22250 + }, + { + "epoch": 0.709844063905099, + "grad_norm": 0.2854585349559784, + "learning_rate": 0.00024384426376092245, + "loss": 0.987, + "step": 22260 + }, + { + "epoch": 0.710162951624733, + "grad_norm": 0.2966926395893097, + "learning_rate": 0.00024291300480208775, + "loss": 0.9888, + "step": 22270 + }, + { + "epoch": 0.7104818393443668, + "grad_norm": 0.2984680235385895, + "learning_rate": 0.00024198530238887377, + "loss": 0.9936, + "step": 22280 + }, + { + "epoch": 0.7108007270640008, + "grad_norm": 0.29596757888793945, + "learning_rate": 0.00024106114293857438, + "loss": 1.0019, + "step": 22290 + }, + { + "epoch": 0.7111196147836347, + "grad_norm": 0.2984013557434082, + "learning_rate": 0.00024014051292035676, + "loss": 1.0001, + "step": 22300 + }, + { + "epoch": 0.7114385025032686, + "grad_norm": 0.2908388376235962, + "learning_rate": 0.00023922339885506342, + "loss": 0.996, + "step": 22310 + }, + { + "epoch": 0.7117573902229025, + "grad_norm": 0.2907799780368805, + "learning_rate": 0.00023830978731501472, + "loss": 0.997, + "step": 22320 + }, + { + "epoch": 0.7120762779425365, + "grad_norm": 0.30171695351600647, + "learning_rate": 0.00023739966492381225, + "loss": 0.9943, + "step": 22330 + }, + { + "epoch": 0.7123951656621703, + "grad_norm": 0.2960367500782013, + "learning_rate": 0.0002364930183561431, + "loss": 0.9891, + "step": 22340 + }, + { + "epoch": 0.7127140533818043, + "grad_norm": 0.2947923243045807, + "learning_rate": 0.00023558983433758467, + "loss": 0.9766, + "step": 22350 + }, + { + "epoch": 0.7130329411014382, + "grad_norm": 0.28949424624443054, + "learning_rate": 0.0002346900996444103, + "loss": 0.9916, + "step": 22360 + }, + { + "epoch": 0.7133518288210721, + "grad_norm": 0.29011088609695435, + "learning_rate": 0.00023379380110339585, + "loss": 0.9879, + "step": 22370 + }, + { + "epoch": 0.713670716540706, + "grad_norm": 0.28879514336586, + "learning_rate": 0.00023290092559162653, + "loss": 0.9758, + "step": 22380 + }, + { + "epoch": 0.71398960426034, + "grad_norm": 0.288805216550827, + "learning_rate": 0.0002320114600363049, + "loss": 0.9858, + "step": 22390 + }, + { + "epoch": 0.7143084919799738, + "grad_norm": 0.2958844006061554, + "learning_rate": 0.0002311253914145597, + "loss": 0.9706, + "step": 22400 + }, + { + "epoch": 0.7146273796996078, + "grad_norm": 0.3001909852027893, + "learning_rate": 0.00023024270675325468, + "loss": 1.0048, + "step": 22410 + }, + { + "epoch": 0.7149462674192417, + "grad_norm": 0.29309847950935364, + "learning_rate": 0.0002293633931287991, + "loss": 0.9978, + "step": 22420 + }, + { + "epoch": 0.7152651551388756, + "grad_norm": 0.28979870676994324, + "learning_rate": 0.00022848743766695835, + "loss": 0.9768, + "step": 22430 + }, + { + "epoch": 0.7155840428585095, + "grad_norm": 0.2978128492832184, + "learning_rate": 0.00022761482754266545, + "loss": 0.981, + "step": 22440 + }, + { + "epoch": 0.7159029305781435, + "grad_norm": 0.29963478446006775, + "learning_rate": 0.00022674554997983317, + "loss": 0.9862, + "step": 22450 + }, + { + "epoch": 0.7162218182977773, + "grad_norm": 0.2936873733997345, + "learning_rate": 0.0002258795922511673, + "loss": 0.9954, + "step": 22460 + }, + { + "epoch": 0.7165407060174113, + "grad_norm": 0.2975631356239319, + "learning_rate": 0.00022501694167797987, + "loss": 0.983, + "step": 22470 + }, + { + "epoch": 0.7168595937370452, + "grad_norm": 0.2997070550918579, + "learning_rate": 0.00022415758563000388, + "loss": 0.9788, + "step": 22480 + }, + { + "epoch": 0.7171784814566791, + "grad_norm": 0.2969716787338257, + "learning_rate": 0.0002233015115252082, + "loss": 0.9866, + "step": 22490 + }, + { + "epoch": 0.717497369176313, + "grad_norm": 0.29699286818504333, + "learning_rate": 0.00022244870682961338, + "loss": 0.99, + "step": 22500 + }, + { + "epoch": 0.717816256895947, + "grad_norm": 0.3004414737224579, + "learning_rate": 0.00022159915905710817, + "loss": 0.9883, + "step": 22510 + }, + { + "epoch": 0.7181351446155808, + "grad_norm": 0.29157665371894836, + "learning_rate": 0.00022075285576926662, + "loss": 0.9781, + "step": 22520 + }, + { + "epoch": 0.7184540323352148, + "grad_norm": 0.3094356656074524, + "learning_rate": 0.00021990978457516616, + "loss": 0.9807, + "step": 22530 + }, + { + "epoch": 0.7187729200548487, + "grad_norm": 0.29095223546028137, + "learning_rate": 0.00021906993313120594, + "loss": 0.9825, + "step": 22540 + }, + { + "epoch": 0.7190918077744826, + "grad_norm": 0.29207363724708557, + "learning_rate": 0.00021823328914092629, + "loss": 0.9792, + "step": 22550 + }, + { + "epoch": 0.7194106954941165, + "grad_norm": 0.2970081865787506, + "learning_rate": 0.00021739984035482846, + "loss": 0.9807, + "step": 22560 + }, + { + "epoch": 0.7197295832137505, + "grad_norm": 0.2927705943584442, + "learning_rate": 0.00021656957457019568, + "loss": 0.9855, + "step": 22570 + }, + { + "epoch": 0.7200484709333843, + "grad_norm": 0.29955190420150757, + "learning_rate": 0.00021574247963091408, + "loss": 0.9804, + "step": 22580 + }, + { + "epoch": 0.7203673586530183, + "grad_norm": 0.3011538088321686, + "learning_rate": 0.00021491854342729487, + "loss": 0.9766, + "step": 22590 + }, + { + "epoch": 0.7206862463726522, + "grad_norm": 0.2975776195526123, + "learning_rate": 0.00021409775389589705, + "loss": 0.9849, + "step": 22600 + }, + { + "epoch": 0.7210051340922861, + "grad_norm": 0.29041531682014465, + "learning_rate": 0.00021328009901935087, + "loss": 0.9671, + "step": 22610 + }, + { + "epoch": 0.72132402181192, + "grad_norm": 0.2931303083896637, + "learning_rate": 0.00021246556682618162, + "loss": 0.9847, + "step": 22620 + }, + { + "epoch": 0.721642909531554, + "grad_norm": 0.2981482744216919, + "learning_rate": 0.0002116541453906347, + "loss": 0.9809, + "step": 22630 + }, + { + "epoch": 0.7219617972511878, + "grad_norm": 0.28368112444877625, + "learning_rate": 0.0002108458228325007, + "loss": 0.9686, + "step": 22640 + }, + { + "epoch": 0.7222806849708218, + "grad_norm": 0.2948022782802582, + "learning_rate": 0.0002100405873169417, + "loss": 0.9818, + "step": 22650 + }, + { + "epoch": 0.7225995726904557, + "grad_norm": 0.3083368241786957, + "learning_rate": 0.0002092384270543178, + "loss": 0.9774, + "step": 22660 + }, + { + "epoch": 0.7229184604100896, + "grad_norm": 0.2966402769088745, + "learning_rate": 0.00020843933030001454, + "loss": 0.9715, + "step": 22670 + }, + { + "epoch": 0.7232373481297235, + "grad_norm": 0.2903192937374115, + "learning_rate": 0.00020764328535427122, + "loss": 0.977, + "step": 22680 + }, + { + "epoch": 0.7235562358493575, + "grad_norm": 0.2952418327331543, + "learning_rate": 0.0002068502805620091, + "loss": 0.9908, + "step": 22690 + }, + { + "epoch": 0.7238751235689913, + "grad_norm": 0.29269060492515564, + "learning_rate": 0.0002060603043126613, + "loss": 0.9872, + "step": 22700 + }, + { + "epoch": 0.7241940112886253, + "grad_norm": 0.30094051361083984, + "learning_rate": 0.00020527334504000228, + "loss": 0.9802, + "step": 22710 + }, + { + "epoch": 0.7245128990082592, + "grad_norm": 0.2994566261768341, + "learning_rate": 0.00020448939122197894, + "loss": 0.9669, + "step": 22720 + }, + { + "epoch": 0.7248317867278931, + "grad_norm": 0.2904956042766571, + "learning_rate": 0.00020370843138054165, + "loss": 0.9748, + "step": 22730 + }, + { + "epoch": 0.725150674447527, + "grad_norm": 0.308773010969162, + "learning_rate": 0.00020293045408147637, + "loss": 0.9635, + "step": 22740 + }, + { + "epoch": 0.725469562167161, + "grad_norm": 0.3033744990825653, + "learning_rate": 0.00020215544793423702, + "loss": 0.9724, + "step": 22750 + }, + { + "epoch": 0.7257884498867948, + "grad_norm": 0.30875164270401, + "learning_rate": 0.00020138340159177898, + "loss": 0.9818, + "step": 22760 + }, + { + "epoch": 0.7261073376064288, + "grad_norm": 0.3124573230743408, + "learning_rate": 0.00020061430375039268, + "loss": 0.9786, + "step": 22770 + }, + { + "epoch": 0.7264262253260627, + "grad_norm": 0.29787442088127136, + "learning_rate": 0.00019984814314953845, + "loss": 0.9688, + "step": 22780 + }, + { + "epoch": 0.7267451130456966, + "grad_norm": 0.29551681876182556, + "learning_rate": 0.0001990849085716812, + "loss": 0.9926, + "step": 22790 + }, + { + "epoch": 0.7270640007653305, + "grad_norm": 0.2911129593849182, + "learning_rate": 0.0001983245888421265, + "loss": 0.972, + "step": 22800 + }, + { + "epoch": 0.7273828884849645, + "grad_norm": 0.2950027287006378, + "learning_rate": 0.000197567172828857, + "loss": 0.9871, + "step": 22810 + }, + { + "epoch": 0.7277017762045984, + "grad_norm": 0.30667251348495483, + "learning_rate": 0.0001968126494423692, + "loss": 0.9906, + "step": 22820 + }, + { + "epoch": 0.7280206639242323, + "grad_norm": 0.29852592945098877, + "learning_rate": 0.0001960610076355113, + "loss": 0.9819, + "step": 22830 + }, + { + "epoch": 0.7283395516438662, + "grad_norm": 0.3074007034301758, + "learning_rate": 0.00019531223640332132, + "loss": 0.9905, + "step": 22840 + }, + { + "epoch": 0.7286584393635002, + "grad_norm": 0.2953300178050995, + "learning_rate": 0.00019456632478286607, + "loss": 0.9926, + "step": 22850 + }, + { + "epoch": 0.728977327083134, + "grad_norm": 0.29797452688217163, + "learning_rate": 0.00019382326185308054, + "loss": 0.9698, + "step": 22860 + }, + { + "epoch": 0.729296214802768, + "grad_norm": 0.2951212525367737, + "learning_rate": 0.00019308303673460815, + "loss": 0.9786, + "step": 22870 + }, + { + "epoch": 0.7296151025224019, + "grad_norm": 0.3035532236099243, + "learning_rate": 0.0001923456385896413, + "loss": 0.9626, + "step": 22880 + }, + { + "epoch": 0.7299339902420358, + "grad_norm": 0.3000469207763672, + "learning_rate": 0.0001916110566217628, + "loss": 0.971, + "step": 22890 + }, + { + "epoch": 0.7302528779616697, + "grad_norm": 0.29106569290161133, + "learning_rate": 0.00019087928007578776, + "loss": 0.987, + "step": 22900 + }, + { + "epoch": 0.7305717656813037, + "grad_norm": 0.28790828585624695, + "learning_rate": 0.0001901502982376061, + "loss": 0.9702, + "step": 22910 + }, + { + "epoch": 0.7308906534009375, + "grad_norm": 0.2918194532394409, + "learning_rate": 0.00018942410043402573, + "loss": 0.9686, + "step": 22920 + }, + { + "epoch": 0.7312095411205715, + "grad_norm": 0.299245148897171, + "learning_rate": 0.0001887006760326162, + "loss": 0.984, + "step": 22930 + }, + { + "epoch": 0.7315284288402054, + "grad_norm": 0.29804402589797974, + "learning_rate": 0.00018798001444155318, + "loss": 0.9651, + "step": 22940 + }, + { + "epoch": 0.7318473165598393, + "grad_norm": 0.29632264375686646, + "learning_rate": 0.00018726210510946323, + "loss": 0.967, + "step": 22950 + }, + { + "epoch": 0.7321662042794732, + "grad_norm": 0.29573965072631836, + "learning_rate": 0.00018654693752526939, + "loss": 0.9698, + "step": 22960 + }, + { + "epoch": 0.7324850919991072, + "grad_norm": 0.29792520403862, + "learning_rate": 0.00018583450121803722, + "loss": 0.9609, + "step": 22970 + }, + { + "epoch": 0.732803979718741, + "grad_norm": 0.2980731427669525, + "learning_rate": 0.00018512478575682153, + "loss": 0.9676, + "step": 22980 + }, + { + "epoch": 0.733122867438375, + "grad_norm": 0.2982822060585022, + "learning_rate": 0.00018441778075051386, + "loss": 0.9586, + "step": 22990 + }, + { + "epoch": 0.7334417551580089, + "grad_norm": 0.31218892335891724, + "learning_rate": 0.00018371347584768985, + "loss": 0.9756, + "step": 23000 + }, + { + "epoch": 0.7337606428776428, + "grad_norm": 0.2934929430484772, + "learning_rate": 0.0001830118607364582, + "loss": 0.9658, + "step": 23010 + }, + { + "epoch": 0.7340795305972767, + "grad_norm": 0.3041274845600128, + "learning_rate": 0.00018231292514430937, + "loss": 0.9774, + "step": 23020 + }, + { + "epoch": 0.7343984183169107, + "grad_norm": 0.29541394114494324, + "learning_rate": 0.00018161665883796536, + "loss": 0.9876, + "step": 23030 + }, + { + "epoch": 0.7347173060365445, + "grad_norm": 0.28990721702575684, + "learning_rate": 0.0001809230516232297, + "loss": 0.9596, + "step": 23040 + }, + { + "epoch": 0.7350361937561785, + "grad_norm": 0.29748308658599854, + "learning_rate": 0.0001802320933448384, + "loss": 0.9863, + "step": 23050 + }, + { + "epoch": 0.7353550814758124, + "grad_norm": 0.2978643476963043, + "learning_rate": 0.0001795437738863111, + "loss": 0.9724, + "step": 23060 + }, + { + "epoch": 0.7356739691954463, + "grad_norm": 0.3006666898727417, + "learning_rate": 0.000178858083169803, + "loss": 0.9686, + "step": 23070 + }, + { + "epoch": 0.7359928569150802, + "grad_norm": 0.2904208302497864, + "learning_rate": 0.00017817501115595735, + "loss": 0.9763, + "step": 23080 + }, + { + "epoch": 0.7363117446347142, + "grad_norm": 0.2943444848060608, + "learning_rate": 0.0001774945478437585, + "loss": 0.9506, + "step": 23090 + }, + { + "epoch": 0.736630632354348, + "grad_norm": 0.2955239415168762, + "learning_rate": 0.00017681668327038525, + "loss": 0.9692, + "step": 23100 + }, + { + "epoch": 0.736949520073982, + "grad_norm": 0.29572662711143494, + "learning_rate": 0.00017614140751106525, + "loss": 0.954, + "step": 23110 + }, + { + "epoch": 0.7372684077936159, + "grad_norm": 0.298018217086792, + "learning_rate": 0.00017546871067892955, + "loss": 0.9701, + "step": 23120 + }, + { + "epoch": 0.7375872955132498, + "grad_norm": 0.30332425236701965, + "learning_rate": 0.00017479858292486792, + "loss": 0.9623, + "step": 23130 + }, + { + "epoch": 0.7379061832328837, + "grad_norm": 0.3037513196468353, + "learning_rate": 0.0001741310144373845, + "loss": 0.9921, + "step": 23140 + }, + { + "epoch": 0.7382250709525177, + "grad_norm": 0.2905147969722748, + "learning_rate": 0.00017346599544245428, + "loss": 0.983, + "step": 23150 + }, + { + "epoch": 0.7385439586721515, + "grad_norm": 0.2991820275783539, + "learning_rate": 0.00017280351620338, + "loss": 0.9676, + "step": 23160 + }, + { + "epoch": 0.7388628463917855, + "grad_norm": 0.29384827613830566, + "learning_rate": 0.00017214356702064952, + "loss": 0.9778, + "step": 23170 + }, + { + "epoch": 0.7391817341114194, + "grad_norm": 0.29975441098213196, + "learning_rate": 0.00017148613823179387, + "loss": 0.9632, + "step": 23180 + }, + { + "epoch": 0.7395006218310533, + "grad_norm": 0.2996976375579834, + "learning_rate": 0.00017083122021124575, + "loss": 0.9638, + "step": 23190 + }, + { + "epoch": 0.7398195095506872, + "grad_norm": 0.29453301429748535, + "learning_rate": 0.00017017880337019854, + "loss": 0.9618, + "step": 23200 + }, + { + "epoch": 0.7401383972703212, + "grad_norm": 0.29755935072898865, + "learning_rate": 0.00016952887815646604, + "loss": 0.9716, + "step": 23210 + }, + { + "epoch": 0.740457284989955, + "grad_norm": 0.2969668209552765, + "learning_rate": 0.0001688814350543425, + "loss": 0.9548, + "step": 23220 + }, + { + "epoch": 0.740776172709589, + "grad_norm": 0.2998278737068176, + "learning_rate": 0.00016823646458446337, + "loss": 0.9516, + "step": 23230 + }, + { + "epoch": 0.7410950604292229, + "grad_norm": 0.30308350920677185, + "learning_rate": 0.00016759395730366647, + "loss": 0.9581, + "step": 23240 + }, + { + "epoch": 0.7414139481488567, + "grad_norm": 0.30527162551879883, + "learning_rate": 0.00016695390380485372, + "loss": 0.9601, + "step": 23250 + }, + { + "epoch": 0.7417328358684907, + "grad_norm": 0.2971056401729584, + "learning_rate": 0.00016631629471685345, + "loss": 0.961, + "step": 23260 + }, + { + "epoch": 0.7420517235881247, + "grad_norm": 0.29665809869766235, + "learning_rate": 0.0001656811207042832, + "loss": 0.9591, + "step": 23270 + }, + { + "epoch": 0.7423706113077585, + "grad_norm": 0.30231061577796936, + "learning_rate": 0.00016504837246741303, + "loss": 0.9541, + "step": 23280 + }, + { + "epoch": 0.7426894990273925, + "grad_norm": 0.29970023036003113, + "learning_rate": 0.00016441804074202924, + "loss": 0.9554, + "step": 23290 + }, + { + "epoch": 0.7430083867470264, + "grad_norm": 0.3048689663410187, + "learning_rate": 0.00016379011629929894, + "loss": 0.9787, + "step": 23300 + }, + { + "epoch": 0.7433272744666602, + "grad_norm": 0.2947291433811188, + "learning_rate": 0.00016316458994563475, + "loss": 0.9644, + "step": 23310 + }, + { + "epoch": 0.7436461621862942, + "grad_norm": 0.30350422859191895, + "learning_rate": 0.00016254145252256035, + "loss": 0.9737, + "step": 23320 + }, + { + "epoch": 0.7439650499059282, + "grad_norm": 0.29172733426094055, + "learning_rate": 0.0001619206949065762, + "loss": 0.9717, + "step": 23330 + }, + { + "epoch": 0.744283937625562, + "grad_norm": 0.29668304324150085, + "learning_rate": 0.00016130230800902616, + "loss": 0.9663, + "step": 23340 + }, + { + "epoch": 0.744602825345196, + "grad_norm": 0.2947177588939667, + "learning_rate": 0.00016068628277596432, + "loss": 0.9654, + "step": 23350 + }, + { + "epoch": 0.7449217130648299, + "grad_norm": 0.29447776079177856, + "learning_rate": 0.00016007261018802238, + "loss": 0.9474, + "step": 23360 + }, + { + "epoch": 0.7452406007844637, + "grad_norm": 0.28898507356643677, + "learning_rate": 0.00015946128126027772, + "loss": 0.9569, + "step": 23370 + }, + { + "epoch": 0.7455594885040977, + "grad_norm": 0.30178964138031006, + "learning_rate": 0.0001588522870421218, + "loss": 0.9586, + "step": 23380 + }, + { + "epoch": 0.7458783762237317, + "grad_norm": 0.313784122467041, + "learning_rate": 0.00015824561861712907, + "loss": 0.9641, + "step": 23390 + }, + { + "epoch": 0.7461972639433655, + "grad_norm": 0.3013688325881958, + "learning_rate": 0.00015764126710292645, + "loss": 0.9634, + "step": 23400 + }, + { + "epoch": 0.7465161516629994, + "grad_norm": 0.29923033714294434, + "learning_rate": 0.0001570392236510633, + "loss": 0.975, + "step": 23410 + }, + { + "epoch": 0.7468350393826334, + "grad_norm": 0.3004629611968994, + "learning_rate": 0.00015643947944688188, + "loss": 0.9622, + "step": 23420 + }, + { + "epoch": 0.7471539271022672, + "grad_norm": 0.2845831513404846, + "learning_rate": 0.00015584202570938826, + "loss": 0.954, + "step": 23430 + }, + { + "epoch": 0.7474728148219012, + "grad_norm": 0.29445093870162964, + "learning_rate": 0.00015524685369112376, + "loss": 0.9471, + "step": 23440 + }, + { + "epoch": 0.7477917025415352, + "grad_norm": 0.2833983898162842, + "learning_rate": 0.00015465395467803683, + "loss": 0.9486, + "step": 23450 + }, + { + "epoch": 0.748110590261169, + "grad_norm": 0.30625349283218384, + "learning_rate": 0.00015406331998935564, + "loss": 0.9747, + "step": 23460 + }, + { + "epoch": 0.748429477980803, + "grad_norm": 0.29102441668510437, + "learning_rate": 0.00015347494097746068, + "loss": 0.9649, + "step": 23470 + }, + { + "epoch": 0.7487483657004369, + "grad_norm": 0.3019132614135742, + "learning_rate": 0.0001528888090277585, + "loss": 0.9533, + "step": 23480 + }, + { + "epoch": 0.7490672534200707, + "grad_norm": 0.30258646607398987, + "learning_rate": 0.0001523049155585553, + "loss": 0.9544, + "step": 23490 + }, + { + "epoch": 0.7493861411397047, + "grad_norm": 0.3062969744205475, + "learning_rate": 0.00015172325202093142, + "loss": 0.9646, + "step": 23500 + }, + { + "epoch": 0.7497050288593387, + "grad_norm": 0.28206557035446167, + "learning_rate": 0.00015114380989861618, + "loss": 0.9502, + "step": 23510 + }, + { + "epoch": 0.7500239165789725, + "grad_norm": 0.29351454973220825, + "learning_rate": 0.0001505665807078631, + "loss": 0.9459, + "step": 23520 + }, + { + "epoch": 0.7503428042986064, + "grad_norm": 0.30770137906074524, + "learning_rate": 0.00014999155599732582, + "loss": 0.9596, + "step": 23530 + }, + { + "epoch": 0.7506616920182404, + "grad_norm": 0.28911292552948, + "learning_rate": 0.00014941872734793423, + "loss": 0.9553, + "step": 23540 + }, + { + "epoch": 0.7509805797378742, + "grad_norm": 0.29347413778305054, + "learning_rate": 0.0001488480863727712, + "loss": 0.9649, + "step": 23550 + }, + { + "epoch": 0.7512994674575082, + "grad_norm": 0.2877577245235443, + "learning_rate": 0.00014827962471694999, + "loss": 0.9516, + "step": 23560 + }, + { + "epoch": 0.7516183551771422, + "grad_norm": 0.2972458302974701, + "learning_rate": 0.00014771333405749166, + "loss": 0.9637, + "step": 23570 + }, + { + "epoch": 0.751937242896776, + "grad_norm": 0.2873701751232147, + "learning_rate": 0.00014714920610320336, + "loss": 0.9532, + "step": 23580 + }, + { + "epoch": 0.75225613061641, + "grad_norm": 0.2910812795162201, + "learning_rate": 0.00014658723259455698, + "loss": 0.9506, + "step": 23590 + }, + { + "epoch": 0.7525750183360439, + "grad_norm": 0.29596349596977234, + "learning_rate": 0.00014602740530356807, + "loss": 0.9468, + "step": 23600 + }, + { + "epoch": 0.7528939060556777, + "grad_norm": 0.298368364572525, + "learning_rate": 0.00014546971603367545, + "loss": 0.9631, + "step": 23610 + }, + { + "epoch": 0.7532127937753117, + "grad_norm": 0.29450052976608276, + "learning_rate": 0.00014491415661962125, + "loss": 0.9641, + "step": 23620 + }, + { + "epoch": 0.7535316814949456, + "grad_norm": 0.2979263663291931, + "learning_rate": 0.00014436071892733128, + "loss": 0.9673, + "step": 23630 + }, + { + "epoch": 0.7538505692145795, + "grad_norm": 0.2885546088218689, + "learning_rate": 0.00014380939485379596, + "loss": 0.9397, + "step": 23640 + }, + { + "epoch": 0.7541694569342134, + "grad_norm": 0.30781450867652893, + "learning_rate": 0.00014326017632695172, + "loss": 0.9604, + "step": 23650 + }, + { + "epoch": 0.7544883446538474, + "grad_norm": 0.30378997325897217, + "learning_rate": 0.00014271305530556277, + "loss": 0.9551, + "step": 23660 + }, + { + "epoch": 0.7548072323734814, + "grad_norm": 0.29330143332481384, + "learning_rate": 0.00014216802377910344, + "loss": 0.9503, + "step": 23670 + }, + { + "epoch": 0.7551261200931152, + "grad_norm": 0.2918219268321991, + "learning_rate": 0.00014162507376764066, + "loss": 0.9498, + "step": 23680 + }, + { + "epoch": 0.7554450078127491, + "grad_norm": 0.30905428528785706, + "learning_rate": 0.0001410841973217176, + "loss": 0.9724, + "step": 23690 + }, + { + "epoch": 0.7557638955323831, + "grad_norm": 0.29354947805404663, + "learning_rate": 0.0001405453865222367, + "loss": 0.9546, + "step": 23700 + }, + { + "epoch": 0.7560827832520169, + "grad_norm": 0.2877028286457062, + "learning_rate": 0.0001400086334803442, + "loss": 0.9587, + "step": 23710 + }, + { + "epoch": 0.7564016709716509, + "grad_norm": 0.3082736134529114, + "learning_rate": 0.0001394739303373143, + "loss": 0.9457, + "step": 23720 + }, + { + "epoch": 0.7567205586912849, + "grad_norm": 0.3034181594848633, + "learning_rate": 0.0001389412692644344, + "loss": 0.9521, + "step": 23730 + }, + { + "epoch": 0.7570394464109187, + "grad_norm": 0.2895594835281372, + "learning_rate": 0.00013841064246289018, + "loss": 0.9519, + "step": 23740 + }, + { + "epoch": 0.7573583341305526, + "grad_norm": 0.2918117642402649, + "learning_rate": 0.00013788204216365162, + "loss": 0.946, + "step": 23750 + }, + { + "epoch": 0.7576772218501866, + "grad_norm": 0.30359402298927307, + "learning_rate": 0.00013735546062735915, + "loss": 0.938, + "step": 23760 + }, + { + "epoch": 0.7579961095698204, + "grad_norm": 0.30913326144218445, + "learning_rate": 0.00013683089014421051, + "loss": 0.9617, + "step": 23770 + }, + { + "epoch": 0.7583149972894544, + "grad_norm": 0.2882547676563263, + "learning_rate": 0.0001363083230338476, + "loss": 0.9395, + "step": 23780 + }, + { + "epoch": 0.7586338850090883, + "grad_norm": 0.29031381011009216, + "learning_rate": 0.00013578775164524418, + "loss": 0.9587, + "step": 23790 + }, + { + "epoch": 0.7589527727287222, + "grad_norm": 0.30304664373397827, + "learning_rate": 0.00013526916835659387, + "loss": 0.9743, + "step": 23800 + }, + { + "epoch": 0.7592716604483561, + "grad_norm": 0.3043603003025055, + "learning_rate": 0.00013475256557519852, + "loss": 0.9396, + "step": 23810 + }, + { + "epoch": 0.7595905481679901, + "grad_norm": 0.30265742540359497, + "learning_rate": 0.00013423793573735703, + "loss": 0.9545, + "step": 23820 + }, + { + "epoch": 0.7599094358876239, + "grad_norm": 0.29440733790397644, + "learning_rate": 0.00013372527130825463, + "loss": 0.9535, + "step": 23830 + }, + { + "epoch": 0.7602283236072579, + "grad_norm": 0.2963486313819885, + "learning_rate": 0.00013321456478185254, + "loss": 0.9536, + "step": 23840 + }, + { + "epoch": 0.7605472113268918, + "grad_norm": 0.2968265116214752, + "learning_rate": 0.00013270580868077813, + "loss": 0.9552, + "step": 23850 + }, + { + "epoch": 0.7608660990465257, + "grad_norm": 0.303342342376709, + "learning_rate": 0.00013219899555621538, + "loss": 0.9403, + "step": 23860 + }, + { + "epoch": 0.7611849867661596, + "grad_norm": 0.29479485750198364, + "learning_rate": 0.00013169411798779585, + "loss": 0.9686, + "step": 23870 + }, + { + "epoch": 0.7615038744857936, + "grad_norm": 0.2946151793003082, + "learning_rate": 0.00013119116858348995, + "loss": 0.9551, + "step": 23880 + }, + { + "epoch": 0.7618227622054274, + "grad_norm": 0.29338327050209045, + "learning_rate": 0.0001306901399794989, + "loss": 0.9594, + "step": 23890 + }, + { + "epoch": 0.7621416499250614, + "grad_norm": 0.3011675179004669, + "learning_rate": 0.0001301910248401467, + "loss": 0.942, + "step": 23900 + }, + { + "epoch": 0.7624605376446953, + "grad_norm": 0.30116263031959534, + "learning_rate": 0.0001296938158577729, + "loss": 0.9561, + "step": 23910 + }, + { + "epoch": 0.7627794253643292, + "grad_norm": 0.2906716465950012, + "learning_rate": 0.0001291985057526255, + "loss": 0.9438, + "step": 23920 + }, + { + "epoch": 0.7630983130839631, + "grad_norm": 0.29096871614456177, + "learning_rate": 0.00012870508727275446, + "loss": 0.9429, + "step": 23930 + }, + { + "epoch": 0.7634172008035971, + "grad_norm": 0.30688127875328064, + "learning_rate": 0.0001282135531939054, + "loss": 0.9586, + "step": 23940 + }, + { + "epoch": 0.7637360885232309, + "grad_norm": 0.29994523525238037, + "learning_rate": 0.00012772389631941392, + "loss": 0.9515, + "step": 23950 + }, + { + "epoch": 0.7640549762428649, + "grad_norm": 0.302988737821579, + "learning_rate": 0.00012723610948010017, + "loss": 0.9508, + "step": 23960 + }, + { + "epoch": 0.7643738639624988, + "grad_norm": 0.29745492339134216, + "learning_rate": 0.00012675018553416398, + "loss": 0.9381, + "step": 23970 + }, + { + "epoch": 0.7646927516821327, + "grad_norm": 0.294744074344635, + "learning_rate": 0.00012626611736708018, + "loss": 0.9523, + "step": 23980 + }, + { + "epoch": 0.7650116394017666, + "grad_norm": 0.3051568269729614, + "learning_rate": 0.00012578389789149453, + "loss": 0.9519, + "step": 23990 + }, + { + "epoch": 0.7653305271214006, + "grad_norm": 0.2979709208011627, + "learning_rate": 0.00012530352004711987, + "loss": 0.9629, + "step": 24000 + }, + { + "epoch": 0.7656494148410344, + "grad_norm": 0.2889893054962158, + "learning_rate": 0.00012482497680063275, + "loss": 0.9319, + "step": 24010 + }, + { + "epoch": 0.7659683025606684, + "grad_norm": 0.28444361686706543, + "learning_rate": 0.00012434826114557067, + "loss": 0.9473, + "step": 24020 + }, + { + "epoch": 0.7662871902803023, + "grad_norm": 0.3054760992527008, + "learning_rate": 0.00012387336610222914, + "loss": 0.9636, + "step": 24030 + }, + { + "epoch": 0.7666060779999362, + "grad_norm": 0.29499685764312744, + "learning_rate": 0.00012340028471755974, + "loss": 0.954, + "step": 24040 + }, + { + "epoch": 0.7669249657195701, + "grad_norm": 0.294760525226593, + "learning_rate": 0.00012292901006506822, + "loss": 0.9515, + "step": 24050 + }, + { + "epoch": 0.7672438534392041, + "grad_norm": 0.2936845123767853, + "learning_rate": 0.00012245953524471316, + "loss": 0.957, + "step": 24060 + }, + { + "epoch": 0.7675627411588379, + "grad_norm": 0.292159765958786, + "learning_rate": 0.00012199185338280484, + "loss": 0.9705, + "step": 24070 + }, + { + "epoch": 0.7678816288784719, + "grad_norm": 0.2962867319583893, + "learning_rate": 0.00012152595763190471, + "loss": 0.9582, + "step": 24080 + }, + { + "epoch": 0.7682005165981058, + "grad_norm": 0.295032799243927, + "learning_rate": 0.00012106184117072507, + "loss": 0.9469, + "step": 24090 + }, + { + "epoch": 0.7685194043177397, + "grad_norm": 0.2952403426170349, + "learning_rate": 0.00012059949720402918, + "loss": 0.9507, + "step": 24100 + }, + { + "epoch": 0.7688382920373736, + "grad_norm": 0.30328744649887085, + "learning_rate": 0.00012013891896253189, + "loss": 0.9545, + "step": 24110 + }, + { + "epoch": 0.7691571797570076, + "grad_norm": 0.292563796043396, + "learning_rate": 0.00011968009970280033, + "loss": 0.945, + "step": 24120 + }, + { + "epoch": 0.7694760674766414, + "grad_norm": 0.30118969082832336, + "learning_rate": 0.00011922303270715539, + "loss": 0.9483, + "step": 24130 + }, + { + "epoch": 0.7697949551962754, + "grad_norm": 0.2996894419193268, + "learning_rate": 0.00011876771128357317, + "loss": 0.9486, + "step": 24140 + }, + { + "epoch": 0.7701138429159093, + "grad_norm": 0.29649966955184937, + "learning_rate": 0.00011831412876558719, + "loss": 0.9512, + "step": 24150 + }, + { + "epoch": 0.7704327306355432, + "grad_norm": 0.29212257266044617, + "learning_rate": 0.00011786227851219062, + "loss": 0.935, + "step": 24160 + }, + { + "epoch": 0.7707516183551771, + "grad_norm": 0.29390260577201843, + "learning_rate": 0.00011741215390773915, + "loss": 0.9384, + "step": 24170 + }, + { + "epoch": 0.7710705060748111, + "grad_norm": 0.29803451895713806, + "learning_rate": 0.00011696374836185405, + "loss": 0.9535, + "step": 24180 + }, + { + "epoch": 0.7713893937944449, + "grad_norm": 0.30166253447532654, + "learning_rate": 0.00011651705530932584, + "loss": 0.9464, + "step": 24190 + }, + { + "epoch": 0.7717082815140789, + "grad_norm": 0.29554587602615356, + "learning_rate": 0.00011607206821001794, + "loss": 0.9386, + "step": 24200 + }, + { + "epoch": 0.7720271692337128, + "grad_norm": 0.29633378982543945, + "learning_rate": 0.00011562878054877104, + "loss": 0.9422, + "step": 24210 + }, + { + "epoch": 0.7723460569533467, + "grad_norm": 0.29746270179748535, + "learning_rate": 0.00011518718583530775, + "loss": 0.9347, + "step": 24220 + }, + { + "epoch": 0.7726649446729806, + "grad_norm": 0.30118975043296814, + "learning_rate": 0.00011474727760413749, + "loss": 0.9196, + "step": 24230 + }, + { + "epoch": 0.7729838323926146, + "grad_norm": 0.29916834831237793, + "learning_rate": 0.00011430904941446183, + "loss": 0.9439, + "step": 24240 + }, + { + "epoch": 0.7733027201122484, + "grad_norm": 0.2930695414543152, + "learning_rate": 0.00011387249485008026, + "loss": 0.9367, + "step": 24250 + }, + { + "epoch": 0.7736216078318824, + "grad_norm": 0.3068337142467499, + "learning_rate": 0.0001134376075192962, + "loss": 0.9356, + "step": 24260 + }, + { + "epoch": 0.7739404955515163, + "grad_norm": 0.29748132824897766, + "learning_rate": 0.00011300438105482337, + "loss": 0.9471, + "step": 24270 + }, + { + "epoch": 0.7742593832711502, + "grad_norm": 0.30024072527885437, + "learning_rate": 0.00011257280911369273, + "loss": 0.9564, + "step": 24280 + }, + { + "epoch": 0.7745782709907841, + "grad_norm": 0.3009824752807617, + "learning_rate": 0.00011214288537715942, + "loss": 0.9544, + "step": 24290 + }, + { + "epoch": 0.7748971587104181, + "grad_norm": 0.28870582580566406, + "learning_rate": 0.00011171460355061031, + "loss": 0.9459, + "step": 24300 + }, + { + "epoch": 0.7752160464300519, + "grad_norm": 0.29776766896247864, + "learning_rate": 0.0001112879573634719, + "loss": 0.9492, + "step": 24310 + }, + { + "epoch": 0.7755349341496859, + "grad_norm": 0.3070945143699646, + "learning_rate": 0.00011086294056911845, + "loss": 0.9562, + "step": 24320 + }, + { + "epoch": 0.7758538218693198, + "grad_norm": 0.2858162522315979, + "learning_rate": 0.00011043954694478053, + "loss": 0.9385, + "step": 24330 + }, + { + "epoch": 0.7761727095889537, + "grad_norm": 0.30373987555503845, + "learning_rate": 0.00011001777029145393, + "loss": 0.9516, + "step": 24340 + }, + { + "epoch": 0.7764915973085876, + "grad_norm": 0.2940308153629303, + "learning_rate": 0.00010959760443380887, + "loss": 0.9504, + "step": 24350 + }, + { + "epoch": 0.7768104850282216, + "grad_norm": 0.2989041805267334, + "learning_rate": 0.00010917904322009965, + "loss": 0.9447, + "step": 24360 + }, + { + "epoch": 0.7771293727478554, + "grad_norm": 0.294559508562088, + "learning_rate": 0.00010876208052207448, + "loss": 0.9328, + "step": 24370 + }, + { + "epoch": 0.7774482604674894, + "grad_norm": 0.29151490330696106, + "learning_rate": 0.00010834671023488582, + "loss": 0.925, + "step": 24380 + }, + { + "epoch": 0.7777671481871233, + "grad_norm": 0.29237502813339233, + "learning_rate": 0.000107932926277001, + "loss": 0.9441, + "step": 24390 + }, + { + "epoch": 0.7780860359067572, + "grad_norm": 0.3009505271911621, + "learning_rate": 0.00010752072259011318, + "loss": 0.9462, + "step": 24400 + }, + { + "epoch": 0.7784049236263911, + "grad_norm": 0.2872629463672638, + "learning_rate": 0.00010711009313905259, + "loss": 0.9472, + "step": 24410 + }, + { + "epoch": 0.7787238113460251, + "grad_norm": 0.2956571578979492, + "learning_rate": 0.00010670103191169825, + "loss": 0.9524, + "step": 24420 + }, + { + "epoch": 0.7790426990656589, + "grad_norm": 0.2941124439239502, + "learning_rate": 0.00010629353291888986, + "loss": 0.9457, + "step": 24430 + }, + { + "epoch": 0.7793615867852929, + "grad_norm": 0.29650887846946716, + "learning_rate": 0.00010588759019434024, + "loss": 0.9447, + "step": 24440 + }, + { + "epoch": 0.7796804745049268, + "grad_norm": 0.28828340768814087, + "learning_rate": 0.00010548319779454782, + "loss": 0.933, + "step": 24450 + }, + { + "epoch": 0.7799993622245607, + "grad_norm": 0.29822880029678345, + "learning_rate": 0.00010508034979870972, + "loss": 0.9441, + "step": 24460 + }, + { + "epoch": 0.7803182499441946, + "grad_norm": 0.2889900505542755, + "learning_rate": 0.00010467904030863507, + "loss": 0.938, + "step": 24470 + }, + { + "epoch": 0.7806371376638286, + "grad_norm": 0.2908421754837036, + "learning_rate": 0.00010427926344865853, + "loss": 0.9256, + "step": 24480 + }, + { + "epoch": 0.7809560253834625, + "grad_norm": 0.29898807406425476, + "learning_rate": 0.00010388101336555442, + "loss": 0.9449, + "step": 24490 + }, + { + "epoch": 0.7812749131030964, + "grad_norm": 0.30551448464393616, + "learning_rate": 0.00010348428422845097, + "loss": 0.947, + "step": 24500 + }, + { + "epoch": 0.7815938008227303, + "grad_norm": 0.3013573884963989, + "learning_rate": 0.00010308907022874489, + "loss": 0.9266, + "step": 24510 + }, + { + "epoch": 0.7819126885423643, + "grad_norm": 0.29684606194496155, + "learning_rate": 0.00010269536558001635, + "loss": 0.9466, + "step": 24520 + }, + { + "epoch": 0.7822315762619981, + "grad_norm": 0.29465001821517944, + "learning_rate": 0.00010230316451794439, + "loss": 0.9542, + "step": 24530 + }, + { + "epoch": 0.7825504639816321, + "grad_norm": 0.2919856905937195, + "learning_rate": 0.00010191246130022226, + "loss": 0.9328, + "step": 24540 + }, + { + "epoch": 0.782869351701266, + "grad_norm": 0.29406240582466125, + "learning_rate": 0.00010152325020647362, + "loss": 0.9406, + "step": 24550 + }, + { + "epoch": 0.7831882394208999, + "grad_norm": 0.3012252151966095, + "learning_rate": 0.00010113552553816855, + "loss": 0.9365, + "step": 24560 + }, + { + "epoch": 0.7835071271405338, + "grad_norm": 0.3084176182746887, + "learning_rate": 0.00010074928161854036, + "loss": 0.9421, + "step": 24570 + }, + { + "epoch": 0.7838260148601678, + "grad_norm": 0.2966445982456207, + "learning_rate": 0.00010036451279250222, + "loss": 0.9372, + "step": 24580 + }, + { + "epoch": 0.7841449025798016, + "grad_norm": 0.28686439990997314, + "learning_rate": 9.998121342656457e-05, + "loss": 0.9429, + "step": 24590 + }, + { + "epoch": 0.7844637902994356, + "grad_norm": 0.2919677197933197, + "learning_rate": 9.959937790875249e-05, + "loss": 0.9405, + "step": 24600 + }, + { + "epoch": 0.7847826780190695, + "grad_norm": 0.2990611791610718, + "learning_rate": 9.92190006485237e-05, + "loss": 0.9352, + "step": 24610 + }, + { + "epoch": 0.7851015657387034, + "grad_norm": 0.29399555921554565, + "learning_rate": 9.884007607668652e-05, + "loss": 0.9236, + "step": 24620 + }, + { + "epoch": 0.7854204534583373, + "grad_norm": 0.3035459518432617, + "learning_rate": 9.846259864531842e-05, + "loss": 0.9397, + "step": 24630 + }, + { + "epoch": 0.7857393411779713, + "grad_norm": 0.30628782510757446, + "learning_rate": 9.808656282768486e-05, + "loss": 0.9341, + "step": 24640 + }, + { + "epoch": 0.7860582288976051, + "grad_norm": 0.3014283776283264, + "learning_rate": 9.77119631181582e-05, + "loss": 0.9338, + "step": 24650 + }, + { + "epoch": 0.7863771166172391, + "grad_norm": 0.29893139004707336, + "learning_rate": 9.733879403213728e-05, + "loss": 0.9387, + "step": 24660 + }, + { + "epoch": 0.786696004336873, + "grad_norm": 0.287754625082016, + "learning_rate": 9.696705010596698e-05, + "loss": 0.9269, + "step": 24670 + }, + { + "epoch": 0.7870148920565069, + "grad_norm": 0.2931799590587616, + "learning_rate": 9.65967258968583e-05, + "loss": 0.9466, + "step": 24680 + }, + { + "epoch": 0.7873337797761408, + "grad_norm": 0.3002002537250519, + "learning_rate": 9.622781598280861e-05, + "loss": 0.9433, + "step": 24690 + }, + { + "epoch": 0.7876526674957748, + "grad_norm": 0.2931079566478729, + "learning_rate": 9.586031496252237e-05, + "loss": 0.9258, + "step": 24700 + }, + { + "epoch": 0.7879715552154086, + "grad_norm": 0.3031649887561798, + "learning_rate": 9.549421745533192e-05, + "loss": 0.9424, + "step": 24710 + }, + { + "epoch": 0.7882904429350426, + "grad_norm": 0.29497942328453064, + "learning_rate": 9.512951810111879e-05, + "loss": 0.9243, + "step": 24720 + }, + { + "epoch": 0.7886093306546765, + "grad_norm": 0.29368606209754944, + "learning_rate": 9.476621156023519e-05, + "loss": 0.9426, + "step": 24730 + }, + { + "epoch": 0.7889282183743104, + "grad_norm": 0.3123050630092621, + "learning_rate": 9.44042925134258e-05, + "loss": 0.9408, + "step": 24740 + }, + { + "epoch": 0.7892471060939443, + "grad_norm": 0.28893402218818665, + "learning_rate": 9.404375566174994e-05, + "loss": 0.9569, + "step": 24750 + }, + { + "epoch": 0.7895659938135783, + "grad_norm": 0.2901870608329773, + "learning_rate": 9.368459572650401e-05, + "loss": 0.9323, + "step": 24760 + }, + { + "epoch": 0.7898848815332121, + "grad_norm": 0.2980130612850189, + "learning_rate": 9.33268074491441e-05, + "loss": 0.9224, + "step": 24770 + }, + { + "epoch": 0.7902037692528461, + "grad_norm": 0.2842077314853668, + "learning_rate": 9.297038559120912e-05, + "loss": 0.9443, + "step": 24780 + }, + { + "epoch": 0.79052265697248, + "grad_norm": 0.2993188798427582, + "learning_rate": 9.2615324934244e-05, + "loss": 0.9566, + "step": 24790 + }, + { + "epoch": 0.7908415446921139, + "grad_norm": 0.2908771336078644, + "learning_rate": 9.226162027972337e-05, + "loss": 0.9255, + "step": 24800 + }, + { + "epoch": 0.7911604324117478, + "grad_norm": 0.28995591402053833, + "learning_rate": 9.190926644897531e-05, + "loss": 0.9434, + "step": 24810 + }, + { + "epoch": 0.7914793201313818, + "grad_norm": 0.30371615290641785, + "learning_rate": 9.155825828310578e-05, + "loss": 0.9343, + "step": 24820 + }, + { + "epoch": 0.7917982078510156, + "grad_norm": 0.30678293108940125, + "learning_rate": 9.120859064292278e-05, + "loss": 0.9372, + "step": 24830 + }, + { + "epoch": 0.7921170955706496, + "grad_norm": 0.2907579839229584, + "learning_rate": 9.086025840886135e-05, + "loss": 0.93, + "step": 24840 + }, + { + "epoch": 0.7924359832902835, + "grad_norm": 0.30206260085105896, + "learning_rate": 9.05132564809085e-05, + "loss": 0.94, + "step": 24850 + }, + { + "epoch": 0.7927548710099174, + "grad_norm": 0.3013043999671936, + "learning_rate": 9.016757977852851e-05, + "loss": 0.931, + "step": 24860 + }, + { + "epoch": 0.7930737587295513, + "grad_norm": 0.29542598128318787, + "learning_rate": 8.982322324058868e-05, + "loss": 0.9251, + "step": 24870 + }, + { + "epoch": 0.7933926464491853, + "grad_norm": 0.2961618900299072, + "learning_rate": 8.948018182528511e-05, + "loss": 0.9327, + "step": 24880 + }, + { + "epoch": 0.7937115341688191, + "grad_norm": 0.2946419417858124, + "learning_rate": 8.913845051006889e-05, + "loss": 0.9261, + "step": 24890 + }, + { + "epoch": 0.7940304218884531, + "grad_norm": 0.2816670835018158, + "learning_rate": 8.879802429157259e-05, + "loss": 0.9261, + "step": 24900 + }, + { + "epoch": 0.794349309608087, + "grad_norm": 0.30013415217399597, + "learning_rate": 8.845889818553704e-05, + "loss": 0.9395, + "step": 24910 + }, + { + "epoch": 0.7946681973277209, + "grad_norm": 0.2884807288646698, + "learning_rate": 8.81210672267383e-05, + "loss": 0.9348, + "step": 24920 + }, + { + "epoch": 0.7949870850473548, + "grad_norm": 0.296625554561615, + "learning_rate": 8.778452646891497e-05, + "loss": 0.9279, + "step": 24930 + }, + { + "epoch": 0.7953059727669888, + "grad_norm": 0.2994830906391144, + "learning_rate": 8.744927098469577e-05, + "loss": 0.932, + "step": 24940 + }, + { + "epoch": 0.7956248604866226, + "grad_norm": 0.29800304770469666, + "learning_rate": 8.711529586552748e-05, + "loss": 0.9428, + "step": 24950 + }, + { + "epoch": 0.7959437482062566, + "grad_norm": 0.2980853319168091, + "learning_rate": 8.678259622160293e-05, + "loss": 0.9293, + "step": 24960 + }, + { + "epoch": 0.7962626359258905, + "grad_norm": 0.29429447650909424, + "learning_rate": 8.645116718178947e-05, + "loss": 0.9274, + "step": 24970 + }, + { + "epoch": 0.7965815236455244, + "grad_norm": 0.29675599932670593, + "learning_rate": 8.612100389355772e-05, + "loss": 0.9342, + "step": 24980 + }, + { + "epoch": 0.7969004113651583, + "grad_norm": 0.29180872440338135, + "learning_rate": 8.579210152291042e-05, + "loss": 0.9107, + "step": 24990 + }, + { + "epoch": 0.7972192990847923, + "grad_norm": 0.29538294672966003, + "learning_rate": 8.546445525431173e-05, + "loss": 0.9268, + "step": 25000 + }, + { + "epoch": 0.7975381868044261, + "grad_norm": 0.30392956733703613, + "learning_rate": 8.513806029061663e-05, + "loss": 0.9478, + "step": 25010 + }, + { + "epoch": 0.7978570745240601, + "grad_norm": 0.30718958377838135, + "learning_rate": 8.481291185300078e-05, + "loss": 0.9272, + "step": 25020 + }, + { + "epoch": 0.798175962243694, + "grad_norm": 0.31373822689056396, + "learning_rate": 8.448900518089058e-05, + "loss": 0.9256, + "step": 25030 + }, + { + "epoch": 0.7984948499633279, + "grad_norm": 0.31905749440193176, + "learning_rate": 8.416633553189332e-05, + "loss": 0.9426, + "step": 25040 + }, + { + "epoch": 0.7988137376829618, + "grad_norm": 0.288647323846817, + "learning_rate": 8.384489818172784e-05, + "loss": 0.9257, + "step": 25050 + }, + { + "epoch": 0.7991326254025958, + "grad_norm": 0.2976011037826538, + "learning_rate": 8.35246884241554e-05, + "loss": 0.9251, + "step": 25060 + }, + { + "epoch": 0.7994515131222296, + "grad_norm": 0.298535019159317, + "learning_rate": 8.32057015709107e-05, + "loss": 0.9444, + "step": 25070 + }, + { + "epoch": 0.7997704008418636, + "grad_norm": 0.3076132833957672, + "learning_rate": 8.288793295163325e-05, + "loss": 0.9376, + "step": 25080 + }, + { + "epoch": 0.8000892885614975, + "grad_norm": 0.29532676935195923, + "learning_rate": 8.257137791379903e-05, + "loss": 0.9432, + "step": 25090 + }, + { + "epoch": 0.8004081762811314, + "grad_norm": 0.29447340965270996, + "learning_rate": 8.225603182265234e-05, + "loss": 0.9187, + "step": 25100 + }, + { + "epoch": 0.8007270640007653, + "grad_norm": 0.29353076219558716, + "learning_rate": 8.194189006113793e-05, + "loss": 0.9317, + "step": 25110 + }, + { + "epoch": 0.8010459517203993, + "grad_norm": 0.2983883023262024, + "learning_rate": 8.162894802983348e-05, + "loss": 0.9314, + "step": 25120 + }, + { + "epoch": 0.8013648394400331, + "grad_norm": 0.3010222613811493, + "learning_rate": 8.131720114688214e-05, + "loss": 0.9255, + "step": 25130 + }, + { + "epoch": 0.8016837271596671, + "grad_norm": 0.3008621335029602, + "learning_rate": 8.100664484792551e-05, + "loss": 0.9321, + "step": 25140 + }, + { + "epoch": 0.802002614879301, + "grad_norm": 0.3051190972328186, + "learning_rate": 8.069727458603683e-05, + "loss": 0.9206, + "step": 25150 + }, + { + "epoch": 0.8023215025989349, + "grad_norm": 0.2958706021308899, + "learning_rate": 8.038908583165436e-05, + "loss": 0.9458, + "step": 25160 + }, + { + "epoch": 0.8026403903185688, + "grad_norm": 0.30285048484802246, + "learning_rate": 8.008207407251511e-05, + "loss": 0.9097, + "step": 25170 + }, + { + "epoch": 0.8029592780382028, + "grad_norm": 0.29344502091407776, + "learning_rate": 7.977623481358872e-05, + "loss": 0.9226, + "step": 25180 + }, + { + "epoch": 0.8032781657578366, + "grad_norm": 0.2978428304195404, + "learning_rate": 7.947156357701168e-05, + "loss": 0.9196, + "step": 25190 + }, + { + "epoch": 0.8035970534774706, + "grad_norm": 0.29757606983184814, + "learning_rate": 7.916805590202182e-05, + "loss": 0.9299, + "step": 25200 + }, + { + "epoch": 0.8039159411971045, + "grad_norm": 0.2951332926750183, + "learning_rate": 7.886570734489289e-05, + "loss": 0.9392, + "step": 25210 + }, + { + "epoch": 0.8042348289167384, + "grad_norm": 0.29570409655570984, + "learning_rate": 7.856451347886955e-05, + "loss": 0.9252, + "step": 25220 + }, + { + "epoch": 0.8045537166363723, + "grad_norm": 0.2930358350276947, + "learning_rate": 7.82644698941026e-05, + "loss": 0.9182, + "step": 25230 + }, + { + "epoch": 0.8048726043560063, + "grad_norm": 0.30780941247940063, + "learning_rate": 7.796557219758433e-05, + "loss": 0.9442, + "step": 25240 + }, + { + "epoch": 0.8051914920756401, + "grad_norm": 0.28759080171585083, + "learning_rate": 7.766781601308432e-05, + "loss": 0.9377, + "step": 25250 + }, + { + "epoch": 0.8055103797952741, + "grad_norm": 0.3078532814979553, + "learning_rate": 7.737119698108516e-05, + "loss": 0.9119, + "step": 25260 + }, + { + "epoch": 0.805829267514908, + "grad_norm": 0.2813483774662018, + "learning_rate": 7.70757107587189e-05, + "loss": 0.9083, + "step": 25270 + }, + { + "epoch": 0.8061481552345419, + "grad_norm": 0.29622021317481995, + "learning_rate": 7.678135301970321e-05, + "loss": 0.9323, + "step": 25280 + }, + { + "epoch": 0.8064670429541758, + "grad_norm": 0.294413298368454, + "learning_rate": 7.648811945427822e-05, + "loss": 0.9312, + "step": 25290 + }, + { + "epoch": 0.8067859306738098, + "grad_norm": 0.2925923466682434, + "learning_rate": 7.61960057691433e-05, + "loss": 0.9284, + "step": 25300 + }, + { + "epoch": 0.8071048183934437, + "grad_norm": 0.302916020154953, + "learning_rate": 7.590500768739427e-05, + "loss": 0.9162, + "step": 25310 + }, + { + "epoch": 0.8074237061130776, + "grad_norm": 0.2981068789958954, + "learning_rate": 7.561512094846075e-05, + "loss": 0.9496, + "step": 25320 + }, + { + "epoch": 0.8077425938327115, + "grad_norm": 0.2955712080001831, + "learning_rate": 7.532634130804376e-05, + "loss": 0.9274, + "step": 25330 + }, + { + "epoch": 0.8080614815523455, + "grad_norm": 0.29238399863243103, + "learning_rate": 7.503866453805368e-05, + "loss": 0.9292, + "step": 25340 + }, + { + "epoch": 0.8083803692719793, + "grad_norm": 0.2969522476196289, + "learning_rate": 7.475208642654819e-05, + "loss": 0.9279, + "step": 25350 + }, + { + "epoch": 0.8086992569916133, + "grad_norm": 0.29493582248687744, + "learning_rate": 7.44666027776707e-05, + "loss": 0.9236, + "step": 25360 + }, + { + "epoch": 0.8090181447112472, + "grad_norm": 0.29591089487075806, + "learning_rate": 7.418220941158897e-05, + "loss": 0.9205, + "step": 25370 + }, + { + "epoch": 0.8093370324308811, + "grad_norm": 0.2962036430835724, + "learning_rate": 7.389890216443378e-05, + "loss": 0.9343, + "step": 25380 + }, + { + "epoch": 0.809655920150515, + "grad_norm": 0.29654377698898315, + "learning_rate": 7.361667688823803e-05, + "loss": 0.9211, + "step": 25390 + }, + { + "epoch": 0.809974807870149, + "grad_norm": 0.29060810804367065, + "learning_rate": 7.333552945087602e-05, + "loss": 0.9202, + "step": 25400 + }, + { + "epoch": 0.8102936955897828, + "grad_norm": 0.2932351231575012, + "learning_rate": 7.305545573600293e-05, + "loss": 0.9223, + "step": 25410 + }, + { + "epoch": 0.8106125833094168, + "grad_norm": 0.30790457129478455, + "learning_rate": 7.277645164299459e-05, + "loss": 0.9295, + "step": 25420 + }, + { + "epoch": 0.8109314710290507, + "grad_norm": 0.30155086517333984, + "learning_rate": 7.249851308688739e-05, + "loss": 0.9283, + "step": 25430 + }, + { + "epoch": 0.8112503587486846, + "grad_norm": 0.3012137711048126, + "learning_rate": 7.222163599831843e-05, + "loss": 0.9385, + "step": 25440 + }, + { + "epoch": 0.8115692464683185, + "grad_norm": 0.30086320638656616, + "learning_rate": 7.194581632346617e-05, + "loss": 0.9277, + "step": 25450 + }, + { + "epoch": 0.8118881341879525, + "grad_norm": 0.29009512066841125, + "learning_rate": 7.167105002399073e-05, + "loss": 0.9119, + "step": 25460 + }, + { + "epoch": 0.8122070219075863, + "grad_norm": 0.30384042859077454, + "learning_rate": 7.139733307697503e-05, + "loss": 0.9378, + "step": 25470 + }, + { + "epoch": 0.8125259096272203, + "grad_norm": 0.28765690326690674, + "learning_rate": 7.112466147486579e-05, + "loss": 0.9416, + "step": 25480 + }, + { + "epoch": 0.8128447973468542, + "grad_norm": 0.3002883791923523, + "learning_rate": 7.08530312254149e-05, + "loss": 0.9368, + "step": 25490 + }, + { + "epoch": 0.8131636850664881, + "grad_norm": 0.3096667230129242, + "learning_rate": 7.058243835162084e-05, + "loss": 0.9346, + "step": 25500 + }, + { + "epoch": 0.813482572786122, + "grad_norm": 0.29634806513786316, + "learning_rate": 7.031287889167066e-05, + "loss": 0.9294, + "step": 25510 + }, + { + "epoch": 0.813801460505756, + "grad_norm": 0.3080945909023285, + "learning_rate": 7.004434889888181e-05, + "loss": 0.9251, + "step": 25520 + }, + { + "epoch": 0.8141203482253898, + "grad_norm": 0.2975804805755615, + "learning_rate": 6.977684444164437e-05, + "loss": 0.9313, + "step": 25530 + }, + { + "epoch": 0.8144392359450238, + "grad_norm": 0.2953619062900543, + "learning_rate": 6.951036160336359e-05, + "loss": 0.9318, + "step": 25540 + }, + { + "epoch": 0.8147581236646577, + "grad_norm": 0.30130645632743835, + "learning_rate": 6.924489648240246e-05, + "loss": 0.9289, + "step": 25550 + }, + { + "epoch": 0.8150770113842916, + "grad_norm": 0.29083284735679626, + "learning_rate": 6.89804451920246e-05, + "loss": 0.9212, + "step": 25560 + }, + { + "epoch": 0.8153958991039255, + "grad_norm": 0.2993018329143524, + "learning_rate": 6.871700386033734e-05, + "loss": 0.9303, + "step": 25570 + }, + { + "epoch": 0.8157147868235595, + "grad_norm": 0.29969847202301025, + "learning_rate": 6.845456863023507e-05, + "loss": 0.927, + "step": 25580 + }, + { + "epoch": 0.8160336745431933, + "grad_norm": 0.3024032711982727, + "learning_rate": 6.819313565934276e-05, + "loss": 0.9385, + "step": 25590 + }, + { + "epoch": 0.8163525622628273, + "grad_norm": 0.2944593131542206, + "learning_rate": 6.793270111995963e-05, + "loss": 0.9268, + "step": 25600 + }, + { + "epoch": 0.8166714499824612, + "grad_norm": 0.29249638319015503, + "learning_rate": 6.767326119900324e-05, + "loss": 0.9226, + "step": 25610 + }, + { + "epoch": 0.8169903377020951, + "grad_norm": 0.2940239906311035, + "learning_rate": 6.741481209795356e-05, + "loss": 0.9279, + "step": 25620 + }, + { + "epoch": 0.817309225421729, + "grad_norm": 0.30038586258888245, + "learning_rate": 6.715735003279736e-05, + "loss": 0.935, + "step": 25630 + }, + { + "epoch": 0.817628113141363, + "grad_norm": 0.294821172952652, + "learning_rate": 6.69008712339729e-05, + "loss": 0.9313, + "step": 25640 + }, + { + "epoch": 0.8179470008609968, + "grad_norm": 0.2999110519886017, + "learning_rate": 6.664537194631458e-05, + "loss": 0.9118, + "step": 25650 + }, + { + "epoch": 0.8182658885806308, + "grad_norm": 0.30320724844932556, + "learning_rate": 6.639084842899812e-05, + "loss": 0.9299, + "step": 25660 + }, + { + "epoch": 0.8185847763002647, + "grad_norm": 0.3023329973220825, + "learning_rate": 6.613729695548567e-05, + "loss": 0.9234, + "step": 25670 + }, + { + "epoch": 0.8189036640198986, + "grad_norm": 0.3076138198375702, + "learning_rate": 6.588471381347133e-05, + "loss": 0.9323, + "step": 25680 + }, + { + "epoch": 0.8192225517395325, + "grad_norm": 0.3051240146160126, + "learning_rate": 6.563309530482677e-05, + "loss": 0.9331, + "step": 25690 + }, + { + "epoch": 0.8195414394591665, + "grad_norm": 0.31167781352996826, + "learning_rate": 6.538243774554706e-05, + "loss": 0.937, + "step": 25700 + }, + { + "epoch": 0.8198603271788003, + "grad_norm": 0.30333802103996277, + "learning_rate": 6.513273746569676e-05, + "loss": 0.9235, + "step": 25710 + }, + { + "epoch": 0.8201792148984343, + "grad_norm": 0.2919188439846039, + "learning_rate": 6.48839908093562e-05, + "loss": 0.9078, + "step": 25720 + }, + { + "epoch": 0.8204981026180682, + "grad_norm": 0.30451634526252747, + "learning_rate": 6.463619413456787e-05, + "loss": 0.9274, + "step": 25730 + }, + { + "epoch": 0.8208169903377021, + "grad_norm": 0.30046597123146057, + "learning_rate": 6.438934381328326e-05, + "loss": 0.9228, + "step": 25740 + }, + { + "epoch": 0.821135878057336, + "grad_norm": 0.29581958055496216, + "learning_rate": 6.414343623130956e-05, + "loss": 0.9238, + "step": 25750 + }, + { + "epoch": 0.82145476577697, + "grad_norm": 0.29217979311943054, + "learning_rate": 6.389846778825685e-05, + "loss": 0.9004, + "step": 25760 + }, + { + "epoch": 0.8217736534966038, + "grad_norm": 0.29643428325653076, + "learning_rate": 6.365443489748536e-05, + "loss": 0.9239, + "step": 25770 + }, + { + "epoch": 0.8220925412162378, + "grad_norm": 0.30537211894989014, + "learning_rate": 6.341133398605295e-05, + "loss": 0.9208, + "step": 25780 + }, + { + "epoch": 0.8224114289358717, + "grad_norm": 0.30653655529022217, + "learning_rate": 6.316916149466283e-05, + "loss": 0.9282, + "step": 25790 + }, + { + "epoch": 0.8227303166555056, + "grad_norm": 0.3039204180240631, + "learning_rate": 6.292791387761143e-05, + "loss": 0.905, + "step": 25800 + }, + { + "epoch": 0.8230492043751395, + "grad_norm": 0.2952761650085449, + "learning_rate": 6.268758760273642e-05, + "loss": 0.9261, + "step": 25810 + }, + { + "epoch": 0.8233680920947735, + "grad_norm": 0.29730528593063354, + "learning_rate": 6.24481791513651e-05, + "loss": 0.9288, + "step": 25820 + }, + { + "epoch": 0.8236869798144073, + "grad_norm": 0.3040090799331665, + "learning_rate": 6.220968501826283e-05, + "loss": 0.9172, + "step": 25830 + }, + { + "epoch": 0.8240058675340413, + "grad_norm": 0.2928590476512909, + "learning_rate": 6.197210171158173e-05, + "loss": 0.9149, + "step": 25840 + }, + { + "epoch": 0.8243247552536752, + "grad_norm": 0.3051118850708008, + "learning_rate": 6.173542575280949e-05, + "loss": 0.9294, + "step": 25850 + }, + { + "epoch": 0.824643642973309, + "grad_norm": 0.3019244074821472, + "learning_rate": 6.149965367671856e-05, + "loss": 0.9181, + "step": 25860 + }, + { + "epoch": 0.824962530692943, + "grad_norm": 0.2956985831260681, + "learning_rate": 6.126478203131529e-05, + "loss": 0.906, + "step": 25870 + }, + { + "epoch": 0.825281418412577, + "grad_norm": 0.29620710015296936, + "learning_rate": 6.1030807377789486e-05, + "loss": 0.9138, + "step": 25880 + }, + { + "epoch": 0.8256003061322108, + "grad_norm": 0.30234384536743164, + "learning_rate": 6.0797726290463996e-05, + "loss": 0.927, + "step": 25890 + }, + { + "epoch": 0.8259191938518448, + "grad_norm": 0.30620619654655457, + "learning_rate": 6.056553535674458e-05, + "loss": 0.9098, + "step": 25900 + }, + { + "epoch": 0.8262380815714787, + "grad_norm": 0.3129229247570038, + "learning_rate": 6.033423117706994e-05, + "loss": 0.9256, + "step": 25910 + }, + { + "epoch": 0.8265569692911126, + "grad_norm": 0.29301196336746216, + "learning_rate": 6.0103810364861955e-05, + "loss": 0.9254, + "step": 25920 + }, + { + "epoch": 0.8268758570107465, + "grad_norm": 0.30466556549072266, + "learning_rate": 5.9874269546476105e-05, + "loss": 0.9175, + "step": 25930 + }, + { + "epoch": 0.8271947447303805, + "grad_norm": 0.2885163724422455, + "learning_rate": 5.964560536115204e-05, + "loss": 0.9272, + "step": 25940 + }, + { + "epoch": 0.8275136324500143, + "grad_norm": 0.2929341495037079, + "learning_rate": 5.941781446096441e-05, + "loss": 0.8968, + "step": 25950 + }, + { + "epoch": 0.8278325201696483, + "grad_norm": 0.3047708570957184, + "learning_rate": 5.9190893510773834e-05, + "loss": 0.9265, + "step": 25960 + }, + { + "epoch": 0.8281514078892822, + "grad_norm": 0.3061845600605011, + "learning_rate": 5.896483918817807e-05, + "loss": 0.9216, + "step": 25970 + }, + { + "epoch": 0.828470295608916, + "grad_norm": 0.31115177273750305, + "learning_rate": 5.873964818346338e-05, + "loss": 0.9316, + "step": 25980 + }, + { + "epoch": 0.82878918332855, + "grad_norm": 0.31330665946006775, + "learning_rate": 5.8515317199556014e-05, + "loss": 0.9172, + "step": 25990 + }, + { + "epoch": 0.829108071048184, + "grad_norm": 0.3075343072414398, + "learning_rate": 5.829184295197409e-05, + "loss": 0.9142, + "step": 26000 + }, + { + "epoch": 0.8294269587678178, + "grad_norm": 0.2841055691242218, + "learning_rate": 5.806922216877932e-05, + "loss": 0.912, + "step": 26010 + }, + { + "epoch": 0.8297458464874518, + "grad_norm": 0.29682737588882446, + "learning_rate": 5.784745159052919e-05, + "loss": 0.9113, + "step": 26020 + }, + { + "epoch": 0.8300647342070857, + "grad_norm": 0.29654332995414734, + "learning_rate": 5.7626527970229256e-05, + "loss": 0.9276, + "step": 26030 + }, + { + "epoch": 0.8303836219267196, + "grad_norm": 0.3100140690803528, + "learning_rate": 5.7406448073285566e-05, + "loss": 0.9193, + "step": 26040 + }, + { + "epoch": 0.8307025096463535, + "grad_norm": 0.3016508221626282, + "learning_rate": 5.718720867745734e-05, + "loss": 0.9318, + "step": 26050 + }, + { + "epoch": 0.8310213973659875, + "grad_norm": 0.2981314957141876, + "learning_rate": 5.6968806572809736e-05, + "loss": 0.9169, + "step": 26060 + }, + { + "epoch": 0.8313402850856213, + "grad_norm": 0.29724085330963135, + "learning_rate": 5.675123856166692e-05, + "loss": 0.9076, + "step": 26070 + }, + { + "epoch": 0.8316591728052553, + "grad_norm": 0.30090001225471497, + "learning_rate": 5.653450145856519e-05, + "loss": 0.9193, + "step": 26080 + }, + { + "epoch": 0.8319780605248892, + "grad_norm": 0.3024527430534363, + "learning_rate": 5.631859209020637e-05, + "loss": 0.9148, + "step": 26090 + }, + { + "epoch": 0.832296948244523, + "grad_norm": 0.3030632436275482, + "learning_rate": 5.6103507295411355e-05, + "loss": 0.9211, + "step": 26100 + }, + { + "epoch": 0.832615835964157, + "grad_norm": 0.29895177483558655, + "learning_rate": 5.5889243925073783e-05, + "loss": 0.952, + "step": 26110 + }, + { + "epoch": 0.832934723683791, + "grad_norm": 0.3039778172969818, + "learning_rate": 5.5675798842113984e-05, + "loss": 0.9227, + "step": 26120 + }, + { + "epoch": 0.8332536114034248, + "grad_norm": 0.29388052225112915, + "learning_rate": 5.546316892143301e-05, + "loss": 0.9155, + "step": 26130 + }, + { + "epoch": 0.8335724991230588, + "grad_norm": 0.3008153736591339, + "learning_rate": 5.525135104986689e-05, + "loss": 0.91, + "step": 26140 + }, + { + "epoch": 0.8338913868426927, + "grad_norm": 0.2974579930305481, + "learning_rate": 5.5040342126141065e-05, + "loss": 0.92, + "step": 26150 + }, + { + "epoch": 0.8342102745623267, + "grad_norm": 0.3058636486530304, + "learning_rate": 5.483013906082494e-05, + "loss": 0.9288, + "step": 26160 + }, + { + "epoch": 0.8345291622819605, + "grad_norm": 0.29824039340019226, + "learning_rate": 5.462073877628674e-05, + "loss": 0.8927, + "step": 26170 + }, + { + "epoch": 0.8348480500015945, + "grad_norm": 0.31605926156044006, + "learning_rate": 5.4412138206648335e-05, + "loss": 0.9409, + "step": 26180 + }, + { + "epoch": 0.8351669377212284, + "grad_norm": 0.3000921308994293, + "learning_rate": 5.420433429774042e-05, + "loss": 0.9149, + "step": 26190 + }, + { + "epoch": 0.8354858254408623, + "grad_norm": 0.300586462020874, + "learning_rate": 5.3997324007057795e-05, + "loss": 0.9032, + "step": 26200 + }, + { + "epoch": 0.8358047131604962, + "grad_norm": 0.3025199770927429, + "learning_rate": 5.379110430371478e-05, + "loss": 0.9097, + "step": 26210 + }, + { + "epoch": 0.8361236008801302, + "grad_norm": 0.29137352108955383, + "learning_rate": 5.358567216840091e-05, + "loss": 0.9149, + "step": 26220 + }, + { + "epoch": 0.836442488599764, + "grad_norm": 0.30015018582344055, + "learning_rate": 5.338102459333666e-05, + "loss": 0.9139, + "step": 26230 + }, + { + "epoch": 0.836761376319398, + "grad_norm": 0.2905537188053131, + "learning_rate": 5.317715858222943e-05, + "loss": 0.9161, + "step": 26240 + }, + { + "epoch": 0.8370802640390319, + "grad_norm": 0.2965790927410126, + "learning_rate": 5.297407115022969e-05, + "loss": 0.9137, + "step": 26250 + }, + { + "epoch": 0.8373991517586657, + "grad_norm": 0.29615485668182373, + "learning_rate": 5.277175932388726e-05, + "loss": 0.8942, + "step": 26260 + }, + { + "epoch": 0.8377180394782997, + "grad_norm": 0.30239808559417725, + "learning_rate": 5.2570220141107796e-05, + "loss": 0.9362, + "step": 26270 + }, + { + "epoch": 0.8380369271979337, + "grad_norm": 0.30178868770599365, + "learning_rate": 5.236945065110938e-05, + "loss": 0.9336, + "step": 26280 + }, + { + "epoch": 0.8383558149175675, + "grad_norm": 0.2944106161594391, + "learning_rate": 5.216944791437939e-05, + "loss": 0.9131, + "step": 26290 + }, + { + "epoch": 0.8386747026372015, + "grad_norm": 0.2962855100631714, + "learning_rate": 5.1970209002631356e-05, + "loss": 0.9228, + "step": 26300 + }, + { + "epoch": 0.8389935903568354, + "grad_norm": 0.30278846621513367, + "learning_rate": 5.177173099876221e-05, + "loss": 0.9166, + "step": 26310 + }, + { + "epoch": 0.8393124780764692, + "grad_norm": 0.2937107980251312, + "learning_rate": 5.157401099680946e-05, + "loss": 0.9147, + "step": 26320 + }, + { + "epoch": 0.8396313657961032, + "grad_norm": 0.303463876247406, + "learning_rate": 5.13770461019087e-05, + "loss": 0.9206, + "step": 26330 + }, + { + "epoch": 0.8399502535157372, + "grad_norm": 0.2950541377067566, + "learning_rate": 5.118083343025129e-05, + "loss": 0.9133, + "step": 26340 + }, + { + "epoch": 0.840269141235371, + "grad_norm": 0.2916299104690552, + "learning_rate": 5.0985370109041986e-05, + "loss": 0.9022, + "step": 26350 + }, + { + "epoch": 0.840588028955005, + "grad_norm": 0.29524877667427063, + "learning_rate": 5.079065327645699e-05, + "loss": 0.9277, + "step": 26360 + }, + { + "epoch": 0.8409069166746389, + "grad_norm": 0.30931761860847473, + "learning_rate": 5.059668008160201e-05, + "loss": 0.9224, + "step": 26370 + }, + { + "epoch": 0.8412258043942727, + "grad_norm": 0.3061833679676056, + "learning_rate": 5.0403447684470534e-05, + "loss": 0.9148, + "step": 26380 + }, + { + "epoch": 0.8415446921139067, + "grad_norm": 0.2954571843147278, + "learning_rate": 5.021095325590223e-05, + "loss": 0.9069, + "step": 26390 + }, + { + "epoch": 0.8418635798335407, + "grad_norm": 0.31159186363220215, + "learning_rate": 5.001919397754154e-05, + "loss": 0.9337, + "step": 26400 + }, + { + "epoch": 0.8421824675531745, + "grad_norm": 0.29583415389060974, + "learning_rate": 4.9828167041796415e-05, + "loss": 0.9238, + "step": 26410 + }, + { + "epoch": 0.8425013552728084, + "grad_norm": 0.3016620874404907, + "learning_rate": 4.9637869651797196e-05, + "loss": 0.8986, + "step": 26420 + }, + { + "epoch": 0.8428202429924424, + "grad_norm": 0.303551584482193, + "learning_rate": 4.944829902135568e-05, + "loss": 0.918, + "step": 26430 + }, + { + "epoch": 0.8431391307120762, + "grad_norm": 0.29466649889945984, + "learning_rate": 4.925945237492433e-05, + "loss": 0.9219, + "step": 26440 + }, + { + "epoch": 0.8434580184317102, + "grad_norm": 0.30812641978263855, + "learning_rate": 4.9071326947555595e-05, + "loss": 0.9345, + "step": 26450 + }, + { + "epoch": 0.8437769061513442, + "grad_norm": 0.29291924834251404, + "learning_rate": 4.8883919984861484e-05, + "loss": 0.9256, + "step": 26460 + }, + { + "epoch": 0.844095793870978, + "grad_norm": 0.3033911883831024, + "learning_rate": 4.86972287429732e-05, + "loss": 0.9032, + "step": 26470 + }, + { + "epoch": 0.844414681590612, + "grad_norm": 0.2836741805076599, + "learning_rate": 4.8511250488501005e-05, + "loss": 0.9112, + "step": 26480 + }, + { + "epoch": 0.8447335693102459, + "grad_norm": 0.3043772876262665, + "learning_rate": 4.832598249849414e-05, + "loss": 0.9109, + "step": 26490 + }, + { + "epoch": 0.8450524570298797, + "grad_norm": 0.2880835235118866, + "learning_rate": 4.814142206040099e-05, + "loss": 0.9149, + "step": 26500 + }, + { + "epoch": 0.8453713447495137, + "grad_norm": 0.2935115098953247, + "learning_rate": 4.795756647202945e-05, + "loss": 0.9118, + "step": 26510 + }, + { + "epoch": 0.8456902324691477, + "grad_norm": 0.2969006597995758, + "learning_rate": 4.777441304150719e-05, + "loss": 0.906, + "step": 26520 + }, + { + "epoch": 0.8460091201887815, + "grad_norm": 0.30467066168785095, + "learning_rate": 4.759195908724238e-05, + "loss": 0.9099, + "step": 26530 + }, + { + "epoch": 0.8463280079084154, + "grad_norm": 0.29597529768943787, + "learning_rate": 4.741020193788437e-05, + "loss": 0.906, + "step": 26540 + }, + { + "epoch": 0.8466468956280494, + "grad_norm": 0.3062756359577179, + "learning_rate": 4.72291389322846e-05, + "loss": 0.9162, + "step": 26550 + }, + { + "epoch": 0.8469657833476832, + "grad_norm": 0.2979463040828705, + "learning_rate": 4.7048767419457626e-05, + "loss": 0.91, + "step": 26560 + }, + { + "epoch": 0.8472846710673172, + "grad_norm": 0.2843947112560272, + "learning_rate": 4.686908475854231e-05, + "loss": 0.9273, + "step": 26570 + }, + { + "epoch": 0.8476035587869512, + "grad_norm": 0.2960202395915985, + "learning_rate": 4.669008831876315e-05, + "loss": 0.909, + "step": 26580 + }, + { + "epoch": 0.847922446506585, + "grad_norm": 0.2985907196998596, + "learning_rate": 4.651177547939179e-05, + "loss": 0.9071, + "step": 26590 + }, + { + "epoch": 0.848241334226219, + "grad_norm": 0.3096676170825958, + "learning_rate": 4.633414362970859e-05, + "loss": 0.9192, + "step": 26600 + }, + { + "epoch": 0.8485602219458529, + "grad_norm": 0.31174537539482117, + "learning_rate": 4.6157190168964464e-05, + "loss": 0.9073, + "step": 26610 + }, + { + "epoch": 0.8488791096654867, + "grad_norm": 0.3036581575870514, + "learning_rate": 4.598091250634277e-05, + "loss": 0.9155, + "step": 26620 + }, + { + "epoch": 0.8491979973851207, + "grad_norm": 0.30269771814346313, + "learning_rate": 4.580530806092137e-05, + "loss": 0.9152, + "step": 26630 + }, + { + "epoch": 0.8495168851047546, + "grad_norm": 0.29733002185821533, + "learning_rate": 4.563037426163488e-05, + "loss": 0.9315, + "step": 26640 + }, + { + "epoch": 0.8498357728243885, + "grad_norm": 0.2943744659423828, + "learning_rate": 4.545610854723698e-05, + "loss": 0.9136, + "step": 26650 + }, + { + "epoch": 0.8501546605440224, + "grad_norm": 0.2963501513004303, + "learning_rate": 4.528250836626295e-05, + "loss": 0.9026, + "step": 26660 + }, + { + "epoch": 0.8504735482636564, + "grad_norm": 0.30102425813674927, + "learning_rate": 4.5109571176992264e-05, + "loss": 0.916, + "step": 26670 + }, + { + "epoch": 0.8507924359832902, + "grad_norm": 0.2941626012325287, + "learning_rate": 4.493729444741149e-05, + "loss": 0.9177, + "step": 26680 + }, + { + "epoch": 0.8511113237029242, + "grad_norm": 0.2860785126686096, + "learning_rate": 4.476567565517706e-05, + "loss": 0.9068, + "step": 26690 + }, + { + "epoch": 0.8514302114225581, + "grad_norm": 0.2924879789352417, + "learning_rate": 4.459471228757844e-05, + "loss": 0.9065, + "step": 26700 + }, + { + "epoch": 0.851749099142192, + "grad_norm": 0.2924700677394867, + "learning_rate": 4.442440184150135e-05, + "loss": 0.9184, + "step": 26710 + }, + { + "epoch": 0.8520679868618259, + "grad_norm": 0.2944236695766449, + "learning_rate": 4.425474182339106e-05, + "loss": 0.9126, + "step": 26720 + }, + { + "epoch": 0.8523868745814599, + "grad_norm": 0.2980422079563141, + "learning_rate": 4.40857297492159e-05, + "loss": 0.9211, + "step": 26730 + }, + { + "epoch": 0.8527057623010937, + "grad_norm": 0.3008560836315155, + "learning_rate": 4.391736314443091e-05, + "loss": 0.9262, + "step": 26740 + }, + { + "epoch": 0.8530246500207277, + "grad_norm": 0.2976393699645996, + "learning_rate": 4.37496395439416e-05, + "loss": 0.9227, + "step": 26750 + }, + { + "epoch": 0.8533435377403616, + "grad_norm": 0.31023627519607544, + "learning_rate": 4.3582556492067844e-05, + "loss": 0.9081, + "step": 26760 + }, + { + "epoch": 0.8536624254599955, + "grad_norm": 0.3027059733867645, + "learning_rate": 4.341611154250795e-05, + "loss": 0.9044, + "step": 26770 + }, + { + "epoch": 0.8539813131796294, + "grad_norm": 0.3005632758140564, + "learning_rate": 4.325030225830281e-05, + "loss": 0.9241, + "step": 26780 + }, + { + "epoch": 0.8543002008992634, + "grad_norm": 0.3049420118331909, + "learning_rate": 4.308512621180027e-05, + "loss": 0.9203, + "step": 26790 + }, + { + "epoch": 0.8546190886188972, + "grad_norm": 0.29897886514663696, + "learning_rate": 4.2920580984619533e-05, + "loss": 0.8969, + "step": 26800 + }, + { + "epoch": 0.8549379763385312, + "grad_norm": 0.2905649244785309, + "learning_rate": 4.275666416761577e-05, + "loss": 0.9149, + "step": 26810 + }, + { + "epoch": 0.8552568640581651, + "grad_norm": 0.30739012360572815, + "learning_rate": 4.259337336084486e-05, + "loss": 0.9262, + "step": 26820 + }, + { + "epoch": 0.855575751777799, + "grad_norm": 0.30363893508911133, + "learning_rate": 4.243070617352825e-05, + "loss": 0.9272, + "step": 26830 + }, + { + "epoch": 0.8558946394974329, + "grad_norm": 0.3002462089061737, + "learning_rate": 4.226866022401794e-05, + "loss": 0.9178, + "step": 26840 + }, + { + "epoch": 0.8562135272170669, + "grad_norm": 0.31431305408477783, + "learning_rate": 4.2107233139761615e-05, + "loss": 0.9074, + "step": 26850 + }, + { + "epoch": 0.8565324149367007, + "grad_norm": 0.3078189492225647, + "learning_rate": 4.194642255726791e-05, + "loss": 0.9078, + "step": 26860 + }, + { + "epoch": 0.8568513026563347, + "grad_norm": 0.30262264609336853, + "learning_rate": 4.1786226122071794e-05, + "loss": 0.9189, + "step": 26870 + }, + { + "epoch": 0.8571701903759686, + "grad_norm": 0.30763888359069824, + "learning_rate": 4.162664148870013e-05, + "loss": 0.9119, + "step": 26880 + }, + { + "epoch": 0.8574890780956025, + "grad_norm": 0.3012956976890564, + "learning_rate": 4.146766632063729e-05, + "loss": 0.9212, + "step": 26890 + }, + { + "epoch": 0.8578079658152364, + "grad_norm": 0.3108784556388855, + "learning_rate": 4.1309298290290994e-05, + "loss": 0.9221, + "step": 26900 + }, + { + "epoch": 0.8581268535348704, + "grad_norm": 0.3018873631954193, + "learning_rate": 4.1151535078958185e-05, + "loss": 0.9244, + "step": 26910 + }, + { + "epoch": 0.8584457412545042, + "grad_norm": 0.3096584975719452, + "learning_rate": 4.099437437679111e-05, + "loss": 0.9098, + "step": 26920 + }, + { + "epoch": 0.8587646289741382, + "grad_norm": 0.2895481288433075, + "learning_rate": 4.08378138827635e-05, + "loss": 0.9285, + "step": 26930 + }, + { + "epoch": 0.8590835166937721, + "grad_norm": 0.30135300755500793, + "learning_rate": 4.0681851304636857e-05, + "loss": 0.9091, + "step": 26940 + }, + { + "epoch": 0.859402404413406, + "grad_norm": 0.30391889810562134, + "learning_rate": 4.052648435892692e-05, + "loss": 0.9058, + "step": 26950 + }, + { + "epoch": 0.8597212921330399, + "grad_norm": 0.29739269614219666, + "learning_rate": 4.037171077087022e-05, + "loss": 0.9178, + "step": 26960 + }, + { + "epoch": 0.8600401798526739, + "grad_norm": 0.29668211936950684, + "learning_rate": 4.021752827439075e-05, + "loss": 0.8903, + "step": 26970 + }, + { + "epoch": 0.8603590675723078, + "grad_norm": 0.30262407660484314, + "learning_rate": 4.0063934612066855e-05, + "loss": 0.9003, + "step": 26980 + }, + { + "epoch": 0.8606779552919417, + "grad_norm": 0.3077700436115265, + "learning_rate": 3.991092753509812e-05, + "loss": 0.9104, + "step": 26990 + }, + { + "epoch": 0.8609968430115756, + "grad_norm": 0.29334044456481934, + "learning_rate": 3.975850480327241e-05, + "loss": 0.8973, + "step": 27000 + }, + { + "epoch": 0.8613157307312096, + "grad_norm": 0.2982562780380249, + "learning_rate": 3.960666418493324e-05, + "loss": 0.8958, + "step": 27010 + }, + { + "epoch": 0.8616346184508434, + "grad_norm": 0.3008376955986023, + "learning_rate": 3.9455403456946875e-05, + "loss": 0.9145, + "step": 27020 + }, + { + "epoch": 0.8619535061704774, + "grad_norm": 0.2966604232788086, + "learning_rate": 3.930472040466995e-05, + "loss": 0.9166, + "step": 27030 + }, + { + "epoch": 0.8622723938901113, + "grad_norm": 0.290862113237381, + "learning_rate": 3.915461282191693e-05, + "loss": 0.9264, + "step": 27040 + }, + { + "epoch": 0.8625912816097452, + "grad_norm": 0.3028498888015747, + "learning_rate": 3.900507851092791e-05, + "loss": 0.9287, + "step": 27050 + }, + { + "epoch": 0.8629101693293791, + "grad_norm": 0.3000180125236511, + "learning_rate": 3.885611528233638e-05, + "loss": 0.9223, + "step": 27060 + }, + { + "epoch": 0.8632290570490131, + "grad_norm": 0.3000931143760681, + "learning_rate": 3.870772095513717e-05, + "loss": 0.9042, + "step": 27070 + }, + { + "epoch": 0.8635479447686469, + "grad_norm": 0.2879917323589325, + "learning_rate": 3.855989335665453e-05, + "loss": 0.9207, + "step": 27080 + }, + { + "epoch": 0.8638668324882809, + "grad_norm": 0.30853670835494995, + "learning_rate": 3.841263032251032e-05, + "loss": 0.9243, + "step": 27090 + }, + { + "epoch": 0.8641857202079148, + "grad_norm": 0.2945482134819031, + "learning_rate": 3.8265929696592315e-05, + "loss": 0.9161, + "step": 27100 + }, + { + "epoch": 0.8645046079275487, + "grad_norm": 0.29502424597740173, + "learning_rate": 3.811978933102264e-05, + "loss": 0.9068, + "step": 27110 + }, + { + "epoch": 0.8648234956471826, + "grad_norm": 0.29816558957099915, + "learning_rate": 3.797420708612632e-05, + "loss": 0.9172, + "step": 27120 + }, + { + "epoch": 0.8651423833668166, + "grad_norm": 0.29900944232940674, + "learning_rate": 3.7829180830399963e-05, + "loss": 0.9034, + "step": 27130 + }, + { + "epoch": 0.8654612710864504, + "grad_norm": 0.29612571001052856, + "learning_rate": 3.768470844048052e-05, + "loss": 0.8965, + "step": 27140 + }, + { + "epoch": 0.8657801588060844, + "grad_norm": 0.29050344228744507, + "learning_rate": 3.7540787801114243e-05, + "loss": 0.9144, + "step": 27150 + }, + { + "epoch": 0.8660990465257183, + "grad_norm": 0.2979932725429535, + "learning_rate": 3.739741680512569e-05, + "loss": 0.9189, + "step": 27160 + }, + { + "epoch": 0.8664179342453522, + "grad_norm": 0.3041857182979584, + "learning_rate": 3.725459335338685e-05, + "loss": 0.9034, + "step": 27170 + }, + { + "epoch": 0.8667368219649861, + "grad_norm": 0.2961612343788147, + "learning_rate": 3.711231535478648e-05, + "loss": 0.9151, + "step": 27180 + }, + { + "epoch": 0.8670557096846201, + "grad_norm": 0.2977130711078644, + "learning_rate": 3.697058072619941e-05, + "loss": 0.912, + "step": 27190 + }, + { + "epoch": 0.8673745974042539, + "grad_norm": 0.2953384518623352, + "learning_rate": 3.6829387392456075e-05, + "loss": 0.9095, + "step": 27200 + }, + { + "epoch": 0.8676934851238879, + "grad_norm": 0.30030712485313416, + "learning_rate": 3.668873328631214e-05, + "loss": 0.908, + "step": 27210 + }, + { + "epoch": 0.8680123728435218, + "grad_norm": 0.29483762383461, + "learning_rate": 3.6548616348418236e-05, + "loss": 0.9091, + "step": 27220 + }, + { + "epoch": 0.8683312605631557, + "grad_norm": 0.2982817590236664, + "learning_rate": 3.640903452728978e-05, + "loss": 0.9046, + "step": 27230 + }, + { + "epoch": 0.8686501482827896, + "grad_norm": 0.3042912483215332, + "learning_rate": 3.626998577927698e-05, + "loss": 0.9053, + "step": 27240 + }, + { + "epoch": 0.8689690360024236, + "grad_norm": 0.30695784091949463, + "learning_rate": 3.6131468068534876e-05, + "loss": 0.9161, + "step": 27250 + }, + { + "epoch": 0.8692879237220574, + "grad_norm": 0.29794302582740784, + "learning_rate": 3.599347936699354e-05, + "loss": 0.9065, + "step": 27260 + }, + { + "epoch": 0.8696068114416914, + "grad_norm": 0.29503190517425537, + "learning_rate": 3.585601765432841e-05, + "loss": 0.9098, + "step": 27270 + }, + { + "epoch": 0.8699256991613253, + "grad_norm": 0.30191126465797424, + "learning_rate": 3.571908091793068e-05, + "loss": 0.9056, + "step": 27280 + }, + { + "epoch": 0.8702445868809592, + "grad_norm": 0.29695814847946167, + "learning_rate": 3.558266715287785e-05, + "loss": 0.9103, + "step": 27290 + }, + { + "epoch": 0.8705634746005931, + "grad_norm": 0.3085416555404663, + "learning_rate": 3.544677436190435e-05, + "loss": 0.9198, + "step": 27300 + }, + { + "epoch": 0.8708823623202271, + "grad_norm": 0.30874142050743103, + "learning_rate": 3.5311400555372326e-05, + "loss": 0.929, + "step": 27310 + }, + { + "epoch": 0.8712012500398609, + "grad_norm": 0.2925911545753479, + "learning_rate": 3.517654375124249e-05, + "loss": 0.9045, + "step": 27320 + }, + { + "epoch": 0.8715201377594949, + "grad_norm": 0.2910749018192291, + "learning_rate": 3.5042201975045115e-05, + "loss": 0.9206, + "step": 27330 + }, + { + "epoch": 0.8718390254791288, + "grad_norm": 0.29372018575668335, + "learning_rate": 3.490837325985108e-05, + "loss": 0.9189, + "step": 27340 + }, + { + "epoch": 0.8721579131987627, + "grad_norm": 0.29507362842559814, + "learning_rate": 3.4775055646243186e-05, + "loss": 0.9068, + "step": 27350 + }, + { + "epoch": 0.8724768009183966, + "grad_norm": 0.3026776611804962, + "learning_rate": 3.464224718228731e-05, + "loss": 0.9182, + "step": 27360 + }, + { + "epoch": 0.8727956886380306, + "grad_norm": 0.2964503765106201, + "learning_rate": 3.450994592350395e-05, + "loss": 0.9027, + "step": 27370 + }, + { + "epoch": 0.8731145763576644, + "grad_norm": 0.29183635115623474, + "learning_rate": 3.437814993283972e-05, + "loss": 0.9101, + "step": 27380 + }, + { + "epoch": 0.8734334640772984, + "grad_norm": 0.3055545687675476, + "learning_rate": 3.424685728063894e-05, + "loss": 0.9126, + "step": 27390 + }, + { + "epoch": 0.8737523517969323, + "grad_norm": 0.29662030935287476, + "learning_rate": 3.411606604461545e-05, + "loss": 0.9079, + "step": 27400 + }, + { + "epoch": 0.8740712395165662, + "grad_norm": 0.2959885895252228, + "learning_rate": 3.398577430982446e-05, + "loss": 0.8983, + "step": 27410 + }, + { + "epoch": 0.8743901272362001, + "grad_norm": 0.298469603061676, + "learning_rate": 3.385598016863445e-05, + "loss": 0.9029, + "step": 27420 + }, + { + "epoch": 0.8747090149558341, + "grad_norm": 0.2938048839569092, + "learning_rate": 3.372668172069933e-05, + "loss": 0.9121, + "step": 27430 + }, + { + "epoch": 0.8750279026754679, + "grad_norm": 0.3020491898059845, + "learning_rate": 3.3597877072930536e-05, + "loss": 0.9042, + "step": 27440 + }, + { + "epoch": 0.8753467903951019, + "grad_norm": 0.30387693643569946, + "learning_rate": 3.3469564339469355e-05, + "loss": 0.9113, + "step": 27450 + }, + { + "epoch": 0.8756656781147358, + "grad_norm": 0.2981345057487488, + "learning_rate": 3.334174164165931e-05, + "loss": 0.906, + "step": 27460 + }, + { + "epoch": 0.8759845658343697, + "grad_norm": 0.2940691113471985, + "learning_rate": 3.321440710801865e-05, + "loss": 0.9156, + "step": 27470 + }, + { + "epoch": 0.8763034535540036, + "grad_norm": 0.3027077317237854, + "learning_rate": 3.308755887421296e-05, + "loss": 0.9064, + "step": 27480 + }, + { + "epoch": 0.8766223412736376, + "grad_norm": 0.2986266613006592, + "learning_rate": 3.296119508302781e-05, + "loss": 0.8992, + "step": 27490 + }, + { + "epoch": 0.8769412289932714, + "grad_norm": 0.3028486669063568, + "learning_rate": 3.2835313884341655e-05, + "loss": 0.9183, + "step": 27500 + }, + { + "epoch": 0.8772601167129054, + "grad_norm": 0.3042612075805664, + "learning_rate": 3.2709913435098666e-05, + "loss": 0.9037, + "step": 27510 + }, + { + "epoch": 0.8775790044325393, + "grad_norm": 0.3072284758090973, + "learning_rate": 3.2584991899281825e-05, + "loss": 0.9171, + "step": 27520 + }, + { + "epoch": 0.8778978921521732, + "grad_norm": 0.29810744524002075, + "learning_rate": 3.246054744788594e-05, + "loss": 0.8848, + "step": 27530 + }, + { + "epoch": 0.8782167798718071, + "grad_norm": 0.2982387840747833, + "learning_rate": 3.233657825889095e-05, + "loss": 0.9106, + "step": 27540 + }, + { + "epoch": 0.8785356675914411, + "grad_norm": 0.3004058003425598, + "learning_rate": 3.221308251723522e-05, + "loss": 0.9152, + "step": 27550 + }, + { + "epoch": 0.8788545553110749, + "grad_norm": 0.2940619885921478, + "learning_rate": 3.2090058414788956e-05, + "loss": 0.9059, + "step": 27560 + }, + { + "epoch": 0.8791734430307089, + "grad_norm": 0.29345858097076416, + "learning_rate": 3.196750415032777e-05, + "loss": 0.9055, + "step": 27570 + }, + { + "epoch": 0.8794923307503428, + "grad_norm": 0.3052271902561188, + "learning_rate": 3.1845417929506246e-05, + "loss": 0.9127, + "step": 27580 + }, + { + "epoch": 0.8798112184699767, + "grad_norm": 0.30265307426452637, + "learning_rate": 3.1723797964831725e-05, + "loss": 0.9087, + "step": 27590 + }, + { + "epoch": 0.8801301061896106, + "grad_norm": 0.3081713318824768, + "learning_rate": 3.160264247563812e-05, + "loss": 0.908, + "step": 27600 + }, + { + "epoch": 0.8804489939092446, + "grad_norm": 0.302143931388855, + "learning_rate": 3.1481949688059806e-05, + "loss": 0.9051, + "step": 27610 + }, + { + "epoch": 0.8807678816288784, + "grad_norm": 0.3035591244697571, + "learning_rate": 3.1361717835005704e-05, + "loss": 0.9119, + "step": 27620 + }, + { + "epoch": 0.8810867693485124, + "grad_norm": 0.30098700523376465, + "learning_rate": 3.1241945156133386e-05, + "loss": 0.9125, + "step": 27630 + }, + { + "epoch": 0.8814056570681463, + "grad_norm": 0.3099716305732727, + "learning_rate": 3.1122629897823284e-05, + "loss": 0.906, + "step": 27640 + }, + { + "epoch": 0.8817245447877802, + "grad_norm": 0.2987038791179657, + "learning_rate": 3.100377031315304e-05, + "loss": 0.9013, + "step": 27650 + }, + { + "epoch": 0.8820434325074141, + "grad_norm": 0.30601197481155396, + "learning_rate": 3.088536466187193e-05, + "loss": 0.9189, + "step": 27660 + }, + { + "epoch": 0.8823623202270481, + "grad_norm": 0.29450488090515137, + "learning_rate": 3.076741121037534e-05, + "loss": 0.8981, + "step": 27670 + }, + { + "epoch": 0.8826812079466819, + "grad_norm": 0.3021726608276367, + "learning_rate": 3.064990823167945e-05, + "loss": 0.91, + "step": 27680 + }, + { + "epoch": 0.8830000956663159, + "grad_norm": 0.30047470331192017, + "learning_rate": 3.053285400539591e-05, + "loss": 0.9283, + "step": 27690 + }, + { + "epoch": 0.8833189833859498, + "grad_norm": 0.3061404526233673, + "learning_rate": 3.041624681770667e-05, + "loss": 0.9088, + "step": 27700 + }, + { + "epoch": 0.8836378711055837, + "grad_norm": 0.3043230175971985, + "learning_rate": 3.030008496133884e-05, + "loss": 0.8921, + "step": 27710 + }, + { + "epoch": 0.8839567588252176, + "grad_norm": 0.2976597845554352, + "learning_rate": 3.0184366735539748e-05, + "loss": 0.9011, + "step": 27720 + }, + { + "epoch": 0.8842756465448516, + "grad_norm": 0.29181239008903503, + "learning_rate": 3.006909044605203e-05, + "loss": 0.9071, + "step": 27730 + }, + { + "epoch": 0.8845945342644854, + "grad_norm": 0.30034399032592773, + "learning_rate": 2.995425440508881e-05, + "loss": 0.8866, + "step": 27740 + }, + { + "epoch": 0.8849134219841194, + "grad_norm": 0.29232257604599, + "learning_rate": 2.983985693130898e-05, + "loss": 0.9205, + "step": 27750 + }, + { + "epoch": 0.8852323097037533, + "grad_norm": 0.3077869713306427, + "learning_rate": 2.9725896349792608e-05, + "loss": 0.9076, + "step": 27760 + }, + { + "epoch": 0.8855511974233872, + "grad_norm": 0.2921328544616699, + "learning_rate": 2.9612370992016398e-05, + "loss": 0.908, + "step": 27770 + }, + { + "epoch": 0.8858700851430211, + "grad_norm": 0.3000868260860443, + "learning_rate": 2.9499279195829267e-05, + "loss": 0.8937, + "step": 27780 + }, + { + "epoch": 0.8861889728626551, + "grad_norm": 0.29569077491760254, + "learning_rate": 2.9386619305428005e-05, + "loss": 0.9016, + "step": 27790 + }, + { + "epoch": 0.886507860582289, + "grad_norm": 0.3077255189418793, + "learning_rate": 2.9274389671333044e-05, + "loss": 0.9156, + "step": 27800 + }, + { + "epoch": 0.8868267483019229, + "grad_norm": 0.3058718144893646, + "learning_rate": 2.9162588650364285e-05, + "loss": 0.9102, + "step": 27810 + }, + { + "epoch": 0.8871456360215568, + "grad_norm": 0.2974928021430969, + "learning_rate": 2.9051214605617067e-05, + "loss": 0.8916, + "step": 27820 + }, + { + "epoch": 0.8874645237411908, + "grad_norm": 0.2975204885005951, + "learning_rate": 2.8940265906438172e-05, + "loss": 0.9004, + "step": 27830 + }, + { + "epoch": 0.8877834114608246, + "grad_norm": 0.301376074552536, + "learning_rate": 2.882974092840196e-05, + "loss": 0.9053, + "step": 27840 + }, + { + "epoch": 0.8881022991804586, + "grad_norm": 0.29913774132728577, + "learning_rate": 2.871963805328664e-05, + "loss": 0.9204, + "step": 27850 + }, + { + "epoch": 0.8884211869000925, + "grad_norm": 0.298863023519516, + "learning_rate": 2.860995566905046e-05, + "loss": 0.9073, + "step": 27860 + }, + { + "epoch": 0.8887400746197264, + "grad_norm": 0.2911647856235504, + "learning_rate": 2.850069216980822e-05, + "loss": 0.9052, + "step": 27870 + }, + { + "epoch": 0.8890589623393603, + "grad_norm": 0.30974212288856506, + "learning_rate": 2.8391845955807693e-05, + "loss": 0.9118, + "step": 27880 + }, + { + "epoch": 0.8893778500589943, + "grad_norm": 0.3009479343891144, + "learning_rate": 2.8283415433406215e-05, + "loss": 0.8883, + "step": 27890 + }, + { + "epoch": 0.8896967377786281, + "grad_norm": 0.29356539249420166, + "learning_rate": 2.8175399015047376e-05, + "loss": 0.914, + "step": 27900 + }, + { + "epoch": 0.8900156254982621, + "grad_norm": 0.29100754857063293, + "learning_rate": 2.8067795119237755e-05, + "loss": 0.9132, + "step": 27910 + }, + { + "epoch": 0.890334513217896, + "grad_norm": 0.29773107171058655, + "learning_rate": 2.796060217052376e-05, + "loss": 0.9217, + "step": 27920 + }, + { + "epoch": 0.8906534009375299, + "grad_norm": 0.3045399487018585, + "learning_rate": 2.785381859946858e-05, + "loss": 0.9037, + "step": 27930 + }, + { + "epoch": 0.8909722886571638, + "grad_norm": 0.29568803310394287, + "learning_rate": 2.7747442842629192e-05, + "loss": 0.9042, + "step": 27940 + }, + { + "epoch": 0.8912911763767978, + "grad_norm": 0.2984471023082733, + "learning_rate": 2.7641473342533486e-05, + "loss": 0.9092, + "step": 27950 + }, + { + "epoch": 0.8916100640964316, + "grad_norm": 0.31042030453681946, + "learning_rate": 2.753590854765743e-05, + "loss": 0.9193, + "step": 27960 + }, + { + "epoch": 0.8919289518160656, + "grad_norm": 0.30285584926605225, + "learning_rate": 2.7430746912402397e-05, + "loss": 0.9192, + "step": 27970 + }, + { + "epoch": 0.8922478395356995, + "grad_norm": 0.30536022782325745, + "learning_rate": 2.73259868970725e-05, + "loss": 0.9267, + "step": 27980 + }, + { + "epoch": 0.8925667272553334, + "grad_norm": 0.3031696677207947, + "learning_rate": 2.722162696785207e-05, + "loss": 0.9038, + "step": 27990 + }, + { + "epoch": 0.8928856149749673, + "grad_norm": 0.2872668504714966, + "learning_rate": 2.711766559678319e-05, + "loss": 0.9067, + "step": 28000 + }, + { + "epoch": 0.8932045026946013, + "grad_norm": 0.30583876371383667, + "learning_rate": 2.7014101261743303e-05, + "loss": 0.9048, + "step": 28010 + }, + { + "epoch": 0.8935233904142351, + "grad_norm": 0.2939108610153198, + "learning_rate": 2.6910932446423003e-05, + "loss": 0.9092, + "step": 28020 + }, + { + "epoch": 0.8938422781338691, + "grad_norm": 0.29195529222488403, + "learning_rate": 2.6808157640303726e-05, + "loss": 0.9137, + "step": 28030 + }, + { + "epoch": 0.894161165853503, + "grad_norm": 0.29214227199554443, + "learning_rate": 2.670577533863571e-05, + "loss": 0.9005, + "step": 28040 + }, + { + "epoch": 0.8944800535731369, + "grad_norm": 0.3046804666519165, + "learning_rate": 2.6603784042415948e-05, + "loss": 0.8878, + "step": 28050 + }, + { + "epoch": 0.8947989412927708, + "grad_norm": 0.3029037117958069, + "learning_rate": 2.6502182258366217e-05, + "loss": 0.9043, + "step": 28060 + }, + { + "epoch": 0.8951178290124048, + "grad_norm": 0.3057042062282562, + "learning_rate": 2.640096849891124e-05, + "loss": 0.9032, + "step": 28070 + }, + { + "epoch": 0.8954367167320386, + "grad_norm": 0.2865537106990814, + "learning_rate": 2.630014128215691e-05, + "loss": 0.9048, + "step": 28080 + }, + { + "epoch": 0.8957556044516726, + "grad_norm": 0.3084664046764374, + "learning_rate": 2.6199699131868562e-05, + "loss": 0.9025, + "step": 28090 + }, + { + "epoch": 0.8960744921713065, + "grad_norm": 0.3067476749420166, + "learning_rate": 2.6099640577449387e-05, + "loss": 0.9095, + "step": 28100 + }, + { + "epoch": 0.8963933798909404, + "grad_norm": 0.2988032400608063, + "learning_rate": 2.5999964153918897e-05, + "loss": 0.9005, + "step": 28110 + }, + { + "epoch": 0.8967122676105743, + "grad_norm": 0.30241847038269043, + "learning_rate": 2.5900668401891458e-05, + "loss": 0.9179, + "step": 28120 + }, + { + "epoch": 0.8970311553302083, + "grad_norm": 0.2994360029697418, + "learning_rate": 2.5801751867554953e-05, + "loss": 0.8999, + "step": 28130 + }, + { + "epoch": 0.8973500430498421, + "grad_norm": 0.3065774440765381, + "learning_rate": 2.570321310264946e-05, + "loss": 0.9246, + "step": 28140 + }, + { + "epoch": 0.8976689307694761, + "grad_norm": 0.30343982577323914, + "learning_rate": 2.5605050664446088e-05, + "loss": 0.9103, + "step": 28150 + }, + { + "epoch": 0.89798781848911, + "grad_norm": 0.2930835485458374, + "learning_rate": 2.550726311572581e-05, + "loss": 0.9086, + "step": 28160 + }, + { + "epoch": 0.8983067062087439, + "grad_norm": 0.30507856607437134, + "learning_rate": 2.540984902475846e-05, + "loss": 0.8979, + "step": 28170 + }, + { + "epoch": 0.8986255939283778, + "grad_norm": 0.2992618680000305, + "learning_rate": 2.5312806965281734e-05, + "loss": 0.9004, + "step": 28180 + }, + { + "epoch": 0.8989444816480118, + "grad_norm": 0.29800593852996826, + "learning_rate": 2.5216135516480365e-05, + "loss": 0.9016, + "step": 28190 + }, + { + "epoch": 0.8992633693676456, + "grad_norm": 0.2914922833442688, + "learning_rate": 2.5119833262965248e-05, + "loss": 0.9049, + "step": 28200 + }, + { + "epoch": 0.8995822570872796, + "grad_norm": 0.303462952375412, + "learning_rate": 2.502389879475276e-05, + "loss": 0.91, + "step": 28210 + }, + { + "epoch": 0.8999011448069135, + "grad_norm": 0.3019823431968689, + "learning_rate": 2.4928330707244114e-05, + "loss": 0.9172, + "step": 28220 + }, + { + "epoch": 0.9002200325265474, + "grad_norm": 0.2922445833683014, + "learning_rate": 2.4833127601204777e-05, + "loss": 0.8916, + "step": 28230 + }, + { + "epoch": 0.9005389202461813, + "grad_norm": 0.30021733045578003, + "learning_rate": 2.4738288082744006e-05, + "loss": 0.9124, + "step": 28240 + }, + { + "epoch": 0.9008578079658153, + "grad_norm": 0.2954046130180359, + "learning_rate": 2.4643810763294427e-05, + "loss": 0.9114, + "step": 28250 + }, + { + "epoch": 0.9011766956854491, + "grad_norm": 0.29407021403312683, + "learning_rate": 2.4549694259591695e-05, + "loss": 0.8875, + "step": 28260 + }, + { + "epoch": 0.9014955834050831, + "grad_norm": 0.30686795711517334, + "learning_rate": 2.4455937193654268e-05, + "loss": 0.9125, + "step": 28270 + }, + { + "epoch": 0.901814471124717, + "grad_norm": 0.29066452383995056, + "learning_rate": 2.4362538192763198e-05, + "loss": 0.8989, + "step": 28280 + }, + { + "epoch": 0.9021333588443509, + "grad_norm": 0.2918499708175659, + "learning_rate": 2.4269495889442062e-05, + "loss": 0.9129, + "step": 28290 + }, + { + "epoch": 0.9024522465639848, + "grad_norm": 0.30791276693344116, + "learning_rate": 2.417680892143693e-05, + "loss": 0.9288, + "step": 28300 + }, + { + "epoch": 0.9027711342836188, + "grad_norm": 0.2873190939426422, + "learning_rate": 2.4084475931696406e-05, + "loss": 0.9119, + "step": 28310 + }, + { + "epoch": 0.9030900220032526, + "grad_norm": 0.3017880916595459, + "learning_rate": 2.399249556835179e-05, + "loss": 0.8997, + "step": 28320 + }, + { + "epoch": 0.9034089097228866, + "grad_norm": 0.2940405309200287, + "learning_rate": 2.390086648469725e-05, + "loss": 0.9052, + "step": 28330 + }, + { + "epoch": 0.9037277974425205, + "grad_norm": 0.2958362400531769, + "learning_rate": 2.3809587339170133e-05, + "loss": 0.8872, + "step": 28340 + }, + { + "epoch": 0.9040466851621544, + "grad_norm": 0.304292768239975, + "learning_rate": 2.3718656795331296e-05, + "loss": 0.9111, + "step": 28350 + }, + { + "epoch": 0.9043655728817883, + "grad_norm": 0.2933245599269867, + "learning_rate": 2.362807352184559e-05, + "loss": 0.9089, + "step": 28360 + }, + { + "epoch": 0.9046844606014223, + "grad_norm": 0.29516878724098206, + "learning_rate": 2.3537836192462286e-05, + "loss": 0.91, + "step": 28370 + }, + { + "epoch": 0.9050033483210561, + "grad_norm": 0.2948715388774872, + "learning_rate": 2.344794348599573e-05, + "loss": 0.8964, + "step": 28380 + }, + { + "epoch": 0.9053222360406901, + "grad_norm": 0.2943824231624603, + "learning_rate": 2.3358394086305966e-05, + "loss": 0.8966, + "step": 28390 + }, + { + "epoch": 0.905641123760324, + "grad_norm": 0.30269181728363037, + "learning_rate": 2.3269186682279475e-05, + "loss": 0.9035, + "step": 28400 + }, + { + "epoch": 0.9059600114799579, + "grad_norm": 0.30818644165992737, + "learning_rate": 2.3180319967809967e-05, + "loss": 0.9079, + "step": 28410 + }, + { + "epoch": 0.9062788991995918, + "grad_norm": 0.29804039001464844, + "learning_rate": 2.3091792641779272e-05, + "loss": 0.9004, + "step": 28420 + }, + { + "epoch": 0.9065977869192258, + "grad_norm": 0.2957077622413635, + "learning_rate": 2.300360340803829e-05, + "loss": 0.9246, + "step": 28430 + }, + { + "epoch": 0.9069166746388596, + "grad_norm": 0.30506861209869385, + "learning_rate": 2.2915750975388005e-05, + "loss": 0.9161, + "step": 28440 + }, + { + "epoch": 0.9072355623584936, + "grad_norm": 0.29112300276756287, + "learning_rate": 2.2828234057560574e-05, + "loss": 0.9136, + "step": 28450 + }, + { + "epoch": 0.9075544500781275, + "grad_norm": 0.30500057339668274, + "learning_rate": 2.2741051373200522e-05, + "loss": 0.8983, + "step": 28460 + }, + { + "epoch": 0.9078733377977614, + "grad_norm": 0.2991545796394348, + "learning_rate": 2.265420164584595e-05, + "loss": 0.9081, + "step": 28470 + }, + { + "epoch": 0.9081922255173953, + "grad_norm": 0.2967453896999359, + "learning_rate": 2.2567683603909864e-05, + "loss": 0.9075, + "step": 28480 + }, + { + "epoch": 0.9085111132370293, + "grad_norm": 0.3122044503688812, + "learning_rate": 2.2481495980661557e-05, + "loss": 0.9134, + "step": 28490 + }, + { + "epoch": 0.9088300009566631, + "grad_norm": 0.30028966069221497, + "learning_rate": 2.239563751420805e-05, + "loss": 0.9082, + "step": 28500 + }, + { + "epoch": 0.9091488886762971, + "grad_norm": 0.2991483807563782, + "learning_rate": 2.2310106947475637e-05, + "loss": 0.9027, + "step": 28510 + }, + { + "epoch": 0.909467776395931, + "grad_norm": 0.2894665598869324, + "learning_rate": 2.2224903028191445e-05, + "loss": 0.913, + "step": 28520 + }, + { + "epoch": 0.9097866641155649, + "grad_norm": 0.295378178358078, + "learning_rate": 2.2140024508865157e-05, + "loss": 0.8794, + "step": 28530 + }, + { + "epoch": 0.9101055518351988, + "grad_norm": 0.2910739481449127, + "learning_rate": 2.205547014677069e-05, + "loss": 0.9054, + "step": 28540 + }, + { + "epoch": 0.9104244395548328, + "grad_norm": 0.29597222805023193, + "learning_rate": 2.197123870392802e-05, + "loss": 0.8965, + "step": 28550 + }, + { + "epoch": 0.9107433272744666, + "grad_norm": 0.30073556303977966, + "learning_rate": 2.1887328947085065e-05, + "loss": 0.9167, + "step": 28560 + }, + { + "epoch": 0.9110622149941006, + "grad_norm": 0.2994864583015442, + "learning_rate": 2.1803739647699623e-05, + "loss": 0.9093, + "step": 28570 + }, + { + "epoch": 0.9113811027137345, + "grad_norm": 0.2985394597053528, + "learning_rate": 2.172046958192138e-05, + "loss": 0.9105, + "step": 28580 + }, + { + "epoch": 0.9116999904333684, + "grad_norm": 0.2986948490142822, + "learning_rate": 2.1637517530574002e-05, + "loss": 0.9166, + "step": 28590 + }, + { + "epoch": 0.9120188781530023, + "grad_norm": 0.3221426010131836, + "learning_rate": 2.155488227913727e-05, + "loss": 0.9234, + "step": 28600 + }, + { + "epoch": 0.9123377658726363, + "grad_norm": 0.31154584884643555, + "learning_rate": 2.14725626177293e-05, + "loss": 0.9182, + "step": 28610 + }, + { + "epoch": 0.9126566535922701, + "grad_norm": 0.2910720705986023, + "learning_rate": 2.139055734108886e-05, + "loss": 0.9048, + "step": 28620 + }, + { + "epoch": 0.9129755413119041, + "grad_norm": 0.29777389764785767, + "learning_rate": 2.1308865248557674e-05, + "loss": 0.8945, + "step": 28630 + }, + { + "epoch": 0.913294429031538, + "grad_norm": 0.2937983572483063, + "learning_rate": 2.122748514406288e-05, + "loss": 0.9024, + "step": 28640 + }, + { + "epoch": 0.913613316751172, + "grad_norm": 0.3081187605857849, + "learning_rate": 2.1146415836099496e-05, + "loss": 0.8989, + "step": 28650 + }, + { + "epoch": 0.9139322044708058, + "grad_norm": 0.3129999339580536, + "learning_rate": 2.1065656137712995e-05, + "loss": 0.905, + "step": 28660 + }, + { + "epoch": 0.9142510921904398, + "grad_norm": 0.29838865995407104, + "learning_rate": 2.0985204866481902e-05, + "loss": 0.9085, + "step": 28670 + }, + { + "epoch": 0.9145699799100737, + "grad_norm": 0.3048774302005768, + "learning_rate": 2.0905060844500506e-05, + "loss": 0.9148, + "step": 28680 + }, + { + "epoch": 0.9148888676297076, + "grad_norm": 0.30030128359794617, + "learning_rate": 2.0825222898361594e-05, + "loss": 0.9164, + "step": 28690 + }, + { + "epoch": 0.9152077553493415, + "grad_norm": 0.30484727025032043, + "learning_rate": 2.07456898591393e-05, + "loss": 0.9038, + "step": 28700 + }, + { + "epoch": 0.9155266430689755, + "grad_norm": 0.3036182224750519, + "learning_rate": 2.0666460562371957e-05, + "loss": 0.9036, + "step": 28710 + }, + { + "epoch": 0.9158455307886093, + "grad_norm": 0.30354034900665283, + "learning_rate": 2.0587533848045053e-05, + "loss": 0.9091, + "step": 28720 + }, + { + "epoch": 0.9161644185082433, + "grad_norm": 0.29555216431617737, + "learning_rate": 2.050890856057427e-05, + "loss": 0.9229, + "step": 28730 + }, + { + "epoch": 0.9164833062278772, + "grad_norm": 0.3115293085575104, + "learning_rate": 2.0430583548788544e-05, + "loss": 0.9274, + "step": 28740 + }, + { + "epoch": 0.9168021939475111, + "grad_norm": 0.29061922430992126, + "learning_rate": 2.0352557665913218e-05, + "loss": 0.915, + "step": 28750 + }, + { + "epoch": 0.917121081667145, + "grad_norm": 0.2968040704727173, + "learning_rate": 2.027482976955325e-05, + "loss": 0.9089, + "step": 28760 + }, + { + "epoch": 0.917439969386779, + "grad_norm": 0.3121855556964874, + "learning_rate": 2.0197398721676495e-05, + "loss": 0.9059, + "step": 28770 + }, + { + "epoch": 0.9177588571064128, + "grad_norm": 0.3086029589176178, + "learning_rate": 2.0120263388597025e-05, + "loss": 0.9078, + "step": 28780 + }, + { + "epoch": 0.9180777448260468, + "grad_norm": 0.29410770535469055, + "learning_rate": 2.004342264095854e-05, + "loss": 0.9042, + "step": 28790 + }, + { + "epoch": 0.9183966325456807, + "grad_norm": 0.2909134030342102, + "learning_rate": 1.9966875353717854e-05, + "loss": 0.9014, + "step": 28800 + }, + { + "epoch": 0.9187155202653146, + "grad_norm": 0.305385023355484, + "learning_rate": 1.9890620406128368e-05, + "loss": 0.9003, + "step": 28810 + }, + { + "epoch": 0.9190344079849485, + "grad_norm": 0.3027626574039459, + "learning_rate": 1.981465668172373e-05, + "loss": 0.886, + "step": 28820 + }, + { + "epoch": 0.9193532957045825, + "grad_norm": 0.29952168464660645, + "learning_rate": 1.9738983068301432e-05, + "loss": 0.9006, + "step": 28830 + }, + { + "epoch": 0.9196721834242163, + "grad_norm": 0.2944650948047638, + "learning_rate": 1.966359845790656e-05, + "loss": 0.904, + "step": 28840 + }, + { + "epoch": 0.9199910711438503, + "grad_norm": 0.2949775457382202, + "learning_rate": 1.9588501746815556e-05, + "loss": 0.9068, + "step": 28850 + }, + { + "epoch": 0.9203099588634842, + "grad_norm": 0.29960328340530396, + "learning_rate": 1.9513691835520046e-05, + "loss": 0.9094, + "step": 28860 + }, + { + "epoch": 0.920628846583118, + "grad_norm": 0.3011898398399353, + "learning_rate": 1.9439167628710803e-05, + "loss": 0.8934, + "step": 28870 + }, + { + "epoch": 0.920947734302752, + "grad_norm": 0.3027596175670624, + "learning_rate": 1.936492803526162e-05, + "loss": 0.902, + "step": 28880 + }, + { + "epoch": 0.921266622022386, + "grad_norm": 0.3028440773487091, + "learning_rate": 1.9290971968213404e-05, + "loss": 0.9131, + "step": 28890 + }, + { + "epoch": 0.9215855097420198, + "grad_norm": 0.3038038909435272, + "learning_rate": 1.9217298344758223e-05, + "loss": 0.9135, + "step": 28900 + }, + { + "epoch": 0.9219043974616538, + "grad_norm": 0.30496832728385925, + "learning_rate": 1.9143906086223483e-05, + "loss": 0.9035, + "step": 28910 + }, + { + "epoch": 0.9222232851812877, + "grad_norm": 0.2991454601287842, + "learning_rate": 1.907079411805611e-05, + "loss": 0.912, + "step": 28920 + }, + { + "epoch": 0.9225421729009216, + "grad_norm": 0.3076494038105011, + "learning_rate": 1.8997961369806826e-05, + "loss": 0.9167, + "step": 28930 + }, + { + "epoch": 0.9228610606205555, + "grad_norm": 0.3043328821659088, + "learning_rate": 1.8925406775114476e-05, + "loss": 0.9028, + "step": 28940 + }, + { + "epoch": 0.9231799483401895, + "grad_norm": 0.293874591588974, + "learning_rate": 1.8853129271690422e-05, + "loss": 0.9062, + "step": 28950 + }, + { + "epoch": 0.9234988360598233, + "grad_norm": 0.2940611243247986, + "learning_rate": 1.878112780130298e-05, + "loss": 0.905, + "step": 28960 + }, + { + "epoch": 0.9238177237794573, + "grad_norm": 0.30397751927375793, + "learning_rate": 1.8709401309761924e-05, + "loss": 0.9002, + "step": 28970 + }, + { + "epoch": 0.9241366114990912, + "grad_norm": 0.3081769049167633, + "learning_rate": 1.863794874690307e-05, + "loss": 0.8932, + "step": 28980 + }, + { + "epoch": 0.924455499218725, + "grad_norm": 0.2985953092575073, + "learning_rate": 1.8566769066572868e-05, + "loss": 0.9033, + "step": 28990 + }, + { + "epoch": 0.924774386938359, + "grad_norm": 0.31341174244880676, + "learning_rate": 1.849586122661313e-05, + "loss": 0.8932, + "step": 29000 + }, + { + "epoch": 0.925093274657993, + "grad_norm": 0.30072101950645447, + "learning_rate": 1.842522418884572e-05, + "loss": 0.9038, + "step": 29010 + }, + { + "epoch": 0.9254121623776268, + "grad_norm": 0.29787227511405945, + "learning_rate": 1.8354856919057388e-05, + "loss": 0.9065, + "step": 29020 + }, + { + "epoch": 0.9257310500972608, + "grad_norm": 0.297067254781723, + "learning_rate": 1.8284758386984637e-05, + "loss": 0.8989, + "step": 29030 + }, + { + "epoch": 0.9260499378168947, + "grad_norm": 0.29680055379867554, + "learning_rate": 1.8214927566298603e-05, + "loss": 0.8947, + "step": 29040 + }, + { + "epoch": 0.9263688255365286, + "grad_norm": 0.29479241371154785, + "learning_rate": 1.814536343459005e-05, + "loss": 0.9007, + "step": 29050 + }, + { + "epoch": 0.9266877132561625, + "grad_norm": 0.2918523848056793, + "learning_rate": 1.8076064973354396e-05, + "loss": 0.912, + "step": 29060 + }, + { + "epoch": 0.9270066009757965, + "grad_norm": 0.296645849943161, + "learning_rate": 1.80070311679768e-05, + "loss": 0.9015, + "step": 29070 + }, + { + "epoch": 0.9273254886954303, + "grad_norm": 0.2984611690044403, + "learning_rate": 1.793826100771732e-05, + "loss": 0.886, + "step": 29080 + }, + { + "epoch": 0.9276443764150643, + "grad_norm": 0.2978875935077667, + "learning_rate": 1.7869753485696093e-05, + "loss": 0.8939, + "step": 29090 + }, + { + "epoch": 0.9279632641346982, + "grad_norm": 0.2979802191257477, + "learning_rate": 1.7801507598878604e-05, + "loss": 0.9128, + "step": 29100 + }, + { + "epoch": 0.928282151854332, + "grad_norm": 0.30805638432502747, + "learning_rate": 1.7733522348061006e-05, + "loss": 0.9297, + "step": 29110 + }, + { + "epoch": 0.928601039573966, + "grad_norm": 0.3010908365249634, + "learning_rate": 1.7665796737855474e-05, + "loss": 0.9166, + "step": 29120 + }, + { + "epoch": 0.9289199272936, + "grad_norm": 0.3014841079711914, + "learning_rate": 1.7598329776675653e-05, + "loss": 0.8999, + "step": 29130 + }, + { + "epoch": 0.9292388150132338, + "grad_norm": 0.303438663482666, + "learning_rate": 1.753112047672212e-05, + "loss": 0.9029, + "step": 29140 + }, + { + "epoch": 0.9295577027328678, + "grad_norm": 0.2883744239807129, + "learning_rate": 1.7464167853967936e-05, + "loss": 0.8886, + "step": 29150 + }, + { + "epoch": 0.9298765904525017, + "grad_norm": 0.29751503467559814, + "learning_rate": 1.739747092814422e-05, + "loss": 0.8856, + "step": 29160 + }, + { + "epoch": 0.9301954781721355, + "grad_norm": 0.2995840907096863, + "learning_rate": 1.7331028722725825e-05, + "loss": 0.9102, + "step": 29170 + }, + { + "epoch": 0.9305143658917695, + "grad_norm": 0.30100563168525696, + "learning_rate": 1.726484026491702e-05, + "loss": 0.9028, + "step": 29180 + }, + { + "epoch": 0.9308332536114035, + "grad_norm": 0.2966121733188629, + "learning_rate": 1.7198904585637235e-05, + "loss": 0.8812, + "step": 29190 + }, + { + "epoch": 0.9311521413310373, + "grad_norm": 0.30071768164634705, + "learning_rate": 1.713322071950692e-05, + "loss": 0.906, + "step": 29200 + }, + { + "epoch": 0.9314710290506713, + "grad_norm": 0.30014944076538086, + "learning_rate": 1.7067787704833355e-05, + "loss": 0.9213, + "step": 29210 + }, + { + "epoch": 0.9317899167703052, + "grad_norm": 0.31035640835762024, + "learning_rate": 1.7002604583596607e-05, + "loss": 0.9063, + "step": 29220 + }, + { + "epoch": 0.932108804489939, + "grad_norm": 0.2940179705619812, + "learning_rate": 1.6937670401435477e-05, + "loss": 0.8975, + "step": 29230 + }, + { + "epoch": 0.932427692209573, + "grad_norm": 0.31050607562065125, + "learning_rate": 1.6872984207633558e-05, + "loss": 0.8947, + "step": 29240 + }, + { + "epoch": 0.932746579929207, + "grad_norm": 0.3037688136100769, + "learning_rate": 1.6808545055105283e-05, + "loss": 0.9016, + "step": 29250 + }, + { + "epoch": 0.9330654676488408, + "grad_norm": 0.3068945109844208, + "learning_rate": 1.6744352000382084e-05, + "loss": 0.903, + "step": 29260 + }, + { + "epoch": 0.9333843553684747, + "grad_norm": 0.3037906289100647, + "learning_rate": 1.6680404103598565e-05, + "loss": 0.8951, + "step": 29270 + }, + { + "epoch": 0.9337032430881087, + "grad_norm": 0.3063509166240692, + "learning_rate": 1.6616700428478738e-05, + "loss": 0.8969, + "step": 29280 + }, + { + "epoch": 0.9340221308077425, + "grad_norm": 0.3033375144004822, + "learning_rate": 1.6553240042322332e-05, + "loss": 0.8981, + "step": 29290 + }, + { + "epoch": 0.9343410185273765, + "grad_norm": 0.29124125838279724, + "learning_rate": 1.6490022015991115e-05, + "loss": 0.8943, + "step": 29300 + }, + { + "epoch": 0.9346599062470105, + "grad_norm": 0.297909140586853, + "learning_rate": 1.6427045423895318e-05, + "loss": 0.9071, + "step": 29310 + }, + { + "epoch": 0.9349787939666443, + "grad_norm": 0.29066595435142517, + "learning_rate": 1.636430934398004e-05, + "loss": 0.9035, + "step": 29320 + }, + { + "epoch": 0.9352976816862782, + "grad_norm": 0.3028257191181183, + "learning_rate": 1.63018128577118e-05, + "loss": 0.9195, + "step": 29330 + }, + { + "epoch": 0.9356165694059122, + "grad_norm": 0.30161142349243164, + "learning_rate": 1.623955505006505e-05, + "loss": 0.8899, + "step": 29340 + }, + { + "epoch": 0.935935457125546, + "grad_norm": 0.30503955483436584, + "learning_rate": 1.61775350095088e-05, + "loss": 0.8928, + "step": 29350 + }, + { + "epoch": 0.93625434484518, + "grad_norm": 0.3002879023551941, + "learning_rate": 1.611575182799324e-05, + "loss": 0.8799, + "step": 29360 + }, + { + "epoch": 0.936573232564814, + "grad_norm": 0.31323641538619995, + "learning_rate": 1.605420460093651e-05, + "loss": 0.9107, + "step": 29370 + }, + { + "epoch": 0.9368921202844478, + "grad_norm": 0.29981517791748047, + "learning_rate": 1.5992892427211376e-05, + "loss": 0.9071, + "step": 29380 + }, + { + "epoch": 0.9372110080040817, + "grad_norm": 0.3079449534416199, + "learning_rate": 1.5931814409132093e-05, + "loss": 0.9044, + "step": 29390 + }, + { + "epoch": 0.9375298957237157, + "grad_norm": 0.2934107184410095, + "learning_rate": 1.587096965244123e-05, + "loss": 0.9099, + "step": 29400 + }, + { + "epoch": 0.9378487834433495, + "grad_norm": 0.28969040513038635, + "learning_rate": 1.5810357266296604e-05, + "loss": 0.9033, + "step": 29410 + }, + { + "epoch": 0.9381676711629835, + "grad_norm": 0.30076563358306885, + "learning_rate": 1.5749976363258215e-05, + "loss": 0.9081, + "step": 29420 + }, + { + "epoch": 0.9384865588826174, + "grad_norm": 0.30778390169143677, + "learning_rate": 1.5689826059275262e-05, + "loss": 0.8972, + "step": 29430 + }, + { + "epoch": 0.9388054466022513, + "grad_norm": 0.30305200815200806, + "learning_rate": 1.5629905473673202e-05, + "loss": 0.8905, + "step": 29440 + }, + { + "epoch": 0.9391243343218852, + "grad_norm": 0.2984049916267395, + "learning_rate": 1.5570213729140846e-05, + "loss": 0.8958, + "step": 29450 + }, + { + "epoch": 0.9394432220415192, + "grad_norm": 0.297616571187973, + "learning_rate": 1.5510749951717523e-05, + "loss": 0.9065, + "step": 29460 + }, + { + "epoch": 0.9397621097611532, + "grad_norm": 0.29675933718681335, + "learning_rate": 1.5451513270780287e-05, + "loss": 0.9012, + "step": 29470 + }, + { + "epoch": 0.940080997480787, + "grad_norm": 0.2959672510623932, + "learning_rate": 1.539250281903115e-05, + "loss": 0.8988, + "step": 29480 + }, + { + "epoch": 0.940399885200421, + "grad_norm": 0.29955780506134033, + "learning_rate": 1.5333717732484415e-05, + "loss": 0.9001, + "step": 29490 + }, + { + "epoch": 0.9407187729200549, + "grad_norm": 0.3054618239402771, + "learning_rate": 1.5275157150454e-05, + "loss": 0.9055, + "step": 29500 + }, + { + "epoch": 0.9410376606396887, + "grad_norm": 0.293949156999588, + "learning_rate": 1.5216820215540841e-05, + "loss": 0.8992, + "step": 29510 + }, + { + "epoch": 0.9413565483593227, + "grad_norm": 0.3134571611881256, + "learning_rate": 1.5158706073620354e-05, + "loss": 0.8935, + "step": 29520 + }, + { + "epoch": 0.9416754360789567, + "grad_norm": 0.30413368344306946, + "learning_rate": 1.5100813873829904e-05, + "loss": 0.8834, + "step": 29530 + }, + { + "epoch": 0.9419943237985905, + "grad_norm": 0.3008265495300293, + "learning_rate": 1.5043142768556388e-05, + "loss": 0.9111, + "step": 29540 + }, + { + "epoch": 0.9423132115182244, + "grad_norm": 0.3124244213104248, + "learning_rate": 1.4985691913423778e-05, + "loss": 0.907, + "step": 29550 + }, + { + "epoch": 0.9426320992378584, + "grad_norm": 0.3013421595096588, + "learning_rate": 1.4928460467280777e-05, + "loss": 0.893, + "step": 29560 + }, + { + "epoch": 0.9429509869574922, + "grad_norm": 0.30568447709083557, + "learning_rate": 1.4871447592188524e-05, + "loss": 0.9057, + "step": 29570 + }, + { + "epoch": 0.9432698746771262, + "grad_norm": 0.2980208098888397, + "learning_rate": 1.4814652453408288e-05, + "loss": 0.8886, + "step": 29580 + }, + { + "epoch": 0.9435887623967602, + "grad_norm": 0.30180996656417847, + "learning_rate": 1.4758074219389278e-05, + "loss": 0.8927, + "step": 29590 + }, + { + "epoch": 0.943907650116394, + "grad_norm": 0.3070039451122284, + "learning_rate": 1.4701712061756454e-05, + "loss": 0.9086, + "step": 29600 + }, + { + "epoch": 0.944226537836028, + "grad_norm": 0.3042294383049011, + "learning_rate": 1.4645565155298395e-05, + "loss": 0.9102, + "step": 29610 + }, + { + "epoch": 0.9445454255556619, + "grad_norm": 0.28559359908103943, + "learning_rate": 1.458963267795523e-05, + "loss": 0.9113, + "step": 29620 + }, + { + "epoch": 0.9448643132752957, + "grad_norm": 0.30219465494155884, + "learning_rate": 1.4533913810806589e-05, + "loss": 0.9138, + "step": 29630 + }, + { + "epoch": 0.9451832009949297, + "grad_norm": 0.3139053285121918, + "learning_rate": 1.4478407738059622e-05, + "loss": 0.8951, + "step": 29640 + }, + { + "epoch": 0.9455020887145636, + "grad_norm": 0.2970397472381592, + "learning_rate": 1.4423113647037045e-05, + "loss": 0.8881, + "step": 29650 + }, + { + "epoch": 0.9458209764341975, + "grad_norm": 0.2954052686691284, + "learning_rate": 1.4368030728165257e-05, + "loss": 0.8962, + "step": 29660 + }, + { + "epoch": 0.9461398641538314, + "grad_norm": 0.30081456899642944, + "learning_rate": 1.4313158174962467e-05, + "loss": 0.9137, + "step": 29670 + }, + { + "epoch": 0.9464587518734654, + "grad_norm": 0.29948690533638, + "learning_rate": 1.4258495184026909e-05, + "loss": 0.9047, + "step": 29680 + }, + { + "epoch": 0.9467776395930992, + "grad_norm": 0.2990148067474365, + "learning_rate": 1.4204040955025053e-05, + "loss": 0.9023, + "step": 29690 + }, + { + "epoch": 0.9470965273127332, + "grad_norm": 0.3068961501121521, + "learning_rate": 1.4149794690679904e-05, + "loss": 0.898, + "step": 29700 + }, + { + "epoch": 0.9474154150323671, + "grad_norm": 0.2919014096260071, + "learning_rate": 1.409575559675934e-05, + "loss": 0.879, + "step": 29710 + }, + { + "epoch": 0.947734302752001, + "grad_norm": 0.30254507064819336, + "learning_rate": 1.4041922882064455e-05, + "loss": 0.8929, + "step": 29720 + }, + { + "epoch": 0.9480531904716349, + "grad_norm": 0.3149068057537079, + "learning_rate": 1.398829575841799e-05, + "loss": 0.9133, + "step": 29730 + }, + { + "epoch": 0.9483720781912689, + "grad_norm": 0.30008769035339355, + "learning_rate": 1.3934873440652796e-05, + "loss": 0.8871, + "step": 29740 + }, + { + "epoch": 0.9486909659109027, + "grad_norm": 0.2980942726135254, + "learning_rate": 1.3881655146600332e-05, + "loss": 0.9036, + "step": 29750 + }, + { + "epoch": 0.9490098536305367, + "grad_norm": 0.2967151999473572, + "learning_rate": 1.3828640097079218e-05, + "loss": 0.9042, + "step": 29760 + }, + { + "epoch": 0.9493287413501706, + "grad_norm": 0.306095153093338, + "learning_rate": 1.377582751588382e-05, + "loss": 0.8936, + "step": 29770 + }, + { + "epoch": 0.9496476290698045, + "grad_norm": 0.30544817447662354, + "learning_rate": 1.3723216629772897e-05, + "loss": 0.8961, + "step": 29780 + }, + { + "epoch": 0.9499665167894384, + "grad_norm": 0.3010407090187073, + "learning_rate": 1.3670806668458264e-05, + "loss": 0.9043, + "step": 29790 + }, + { + "epoch": 0.9502854045090724, + "grad_norm": 0.3018208146095276, + "learning_rate": 1.3618596864593529e-05, + "loss": 0.8995, + "step": 29800 + }, + { + "epoch": 0.9506042922287062, + "grad_norm": 0.29537490010261536, + "learning_rate": 1.3566586453762848e-05, + "loss": 0.8846, + "step": 29810 + }, + { + "epoch": 0.9509231799483402, + "grad_norm": 0.29739969968795776, + "learning_rate": 1.3514774674469737e-05, + "loss": 0.9081, + "step": 29820 + }, + { + "epoch": 0.9512420676679741, + "grad_norm": 0.3044932186603546, + "learning_rate": 1.3463160768125927e-05, + "loss": 0.914, + "step": 29830 + }, + { + "epoch": 0.951560955387608, + "grad_norm": 0.30703359842300415, + "learning_rate": 1.3411743979040244e-05, + "loss": 0.9054, + "step": 29840 + }, + { + "epoch": 0.9518798431072419, + "grad_norm": 0.3007422685623169, + "learning_rate": 1.3360523554407562e-05, + "loss": 0.9064, + "step": 29850 + }, + { + "epoch": 0.9521987308268759, + "grad_norm": 0.3024388253688812, + "learning_rate": 1.3309498744297768e-05, + "loss": 0.8915, + "step": 29860 + }, + { + "epoch": 0.9525176185465097, + "grad_norm": 0.30477872490882874, + "learning_rate": 1.3258668801644778e-05, + "loss": 0.9077, + "step": 29870 + }, + { + "epoch": 0.9528365062661437, + "grad_norm": 0.29049551486968994, + "learning_rate": 1.3208032982235637e-05, + "loss": 0.8962, + "step": 29880 + }, + { + "epoch": 0.9531553939857776, + "grad_norm": 0.2952941954135895, + "learning_rate": 1.315759054469956e-05, + "loss": 0.8911, + "step": 29890 + }, + { + "epoch": 0.9534742817054115, + "grad_norm": 0.30060091614723206, + "learning_rate": 1.3107340750497128e-05, + "loss": 0.8945, + "step": 29900 + }, + { + "epoch": 0.9537931694250454, + "grad_norm": 0.29785463213920593, + "learning_rate": 1.3057282863909452e-05, + "loss": 0.9044, + "step": 29910 + }, + { + "epoch": 0.9541120571446794, + "grad_norm": 0.29402804374694824, + "learning_rate": 1.3007416152027412e-05, + "loss": 0.9048, + "step": 29920 + }, + { + "epoch": 0.9544309448643132, + "grad_norm": 0.29524412751197815, + "learning_rate": 1.2957739884740917e-05, + "loss": 0.8943, + "step": 29930 + }, + { + "epoch": 0.9547498325839472, + "grad_norm": 0.31547248363494873, + "learning_rate": 1.290825333472822e-05, + "loss": 0.9, + "step": 29940 + }, + { + "epoch": 0.9550687203035811, + "grad_norm": 0.30710914731025696, + "learning_rate": 1.2858955777445266e-05, + "loss": 0.9259, + "step": 29950 + }, + { + "epoch": 0.955387608023215, + "grad_norm": 0.3070859909057617, + "learning_rate": 1.2809846491115096e-05, + "loss": 0.8958, + "step": 29960 + }, + { + "epoch": 0.9557064957428489, + "grad_norm": 0.2996968924999237, + "learning_rate": 1.276092475671726e-05, + "loss": 0.908, + "step": 29970 + }, + { + "epoch": 0.9560253834624829, + "grad_norm": 0.3038216531276703, + "learning_rate": 1.27121898579773e-05, + "loss": 0.896, + "step": 29980 + }, + { + "epoch": 0.9563442711821167, + "grad_norm": 0.30883684754371643, + "learning_rate": 1.266364108135627e-05, + "loss": 0.9177, + "step": 29990 + }, + { + "epoch": 0.9566631589017507, + "grad_norm": 0.31187060475349426, + "learning_rate": 1.2615277716040274e-05, + "loss": 0.9133, + "step": 30000 + }, + { + "epoch": 0.9569820466213846, + "grad_norm": 0.3062373697757721, + "learning_rate": 1.2567099053930065e-05, + "loss": 0.9176, + "step": 30010 + }, + { + "epoch": 0.9573009343410185, + "grad_norm": 0.31157636642456055, + "learning_rate": 1.2519104389630684e-05, + "loss": 0.8886, + "step": 30020 + }, + { + "epoch": 0.9576198220606524, + "grad_norm": 0.2978048622608185, + "learning_rate": 1.2471293020441117e-05, + "loss": 0.8989, + "step": 30030 + }, + { + "epoch": 0.9579387097802864, + "grad_norm": 0.2935866117477417, + "learning_rate": 1.2423664246344036e-05, + "loss": 0.8981, + "step": 30040 + }, + { + "epoch": 0.9582575974999202, + "grad_norm": 0.30189335346221924, + "learning_rate": 1.2376217369995511e-05, + "loss": 0.8878, + "step": 30050 + }, + { + "epoch": 0.9585764852195542, + "grad_norm": 0.30390262603759766, + "learning_rate": 1.2328951696714822e-05, + "loss": 0.9185, + "step": 30060 + }, + { + "epoch": 0.9588953729391881, + "grad_norm": 0.2978833019733429, + "learning_rate": 1.2281866534474292e-05, + "loss": 0.9151, + "step": 30070 + }, + { + "epoch": 0.959214260658822, + "grad_norm": 0.3033181130886078, + "learning_rate": 1.2234961193889144e-05, + "loss": 0.8983, + "step": 30080 + }, + { + "epoch": 0.9595331483784559, + "grad_norm": 0.2994658648967743, + "learning_rate": 1.218823498820741e-05, + "loss": 0.9006, + "step": 30090 + }, + { + "epoch": 0.9598520360980899, + "grad_norm": 0.30001530051231384, + "learning_rate": 1.214168723329988e-05, + "loss": 0.9078, + "step": 30100 + }, + { + "epoch": 0.9601709238177237, + "grad_norm": 0.2979891896247864, + "learning_rate": 1.2095317247650083e-05, + "loss": 0.9055, + "step": 30110 + }, + { + "epoch": 0.9604898115373577, + "grad_norm": 0.3081294298171997, + "learning_rate": 1.204912435234431e-05, + "loss": 0.9004, + "step": 30120 + }, + { + "epoch": 0.9608086992569916, + "grad_norm": 0.2981680929660797, + "learning_rate": 1.200310787106167e-05, + "loss": 0.8889, + "step": 30130 + }, + { + "epoch": 0.9611275869766255, + "grad_norm": 0.31134116649627686, + "learning_rate": 1.195726713006419e-05, + "loss": 0.9044, + "step": 30140 + }, + { + "epoch": 0.9614464746962594, + "grad_norm": 0.30942800641059875, + "learning_rate": 1.1911601458186958e-05, + "loss": 0.9033, + "step": 30150 + }, + { + "epoch": 0.9617653624158934, + "grad_norm": 0.30746373534202576, + "learning_rate": 1.186611018682828e-05, + "loss": 0.8926, + "step": 30160 + }, + { + "epoch": 0.9620842501355272, + "grad_norm": 0.30481740832328796, + "learning_rate": 1.1820792649939912e-05, + "loss": 0.8913, + "step": 30170 + }, + { + "epoch": 0.9624031378551612, + "grad_norm": 0.2997003495693207, + "learning_rate": 1.1775648184017282e-05, + "loss": 0.9051, + "step": 30180 + }, + { + "epoch": 0.9627220255747951, + "grad_norm": 0.3015143871307373, + "learning_rate": 1.1730676128089802e-05, + "loss": 0.9117, + "step": 30190 + }, + { + "epoch": 0.963040913294429, + "grad_norm": 0.3021433651447296, + "learning_rate": 1.1685875823711168e-05, + "loss": 0.8969, + "step": 30200 + }, + { + "epoch": 0.9633598010140629, + "grad_norm": 0.3064768314361572, + "learning_rate": 1.164124661494975e-05, + "loss": 0.9047, + "step": 30210 + }, + { + "epoch": 0.9636786887336969, + "grad_norm": 0.30161044001579285, + "learning_rate": 1.1596787848378949e-05, + "loss": 0.9124, + "step": 30220 + }, + { + "epoch": 0.9639975764533307, + "grad_norm": 0.30823206901550293, + "learning_rate": 1.1552498873067655e-05, + "loss": 0.9101, + "step": 30230 + }, + { + "epoch": 0.9643164641729647, + "grad_norm": 0.3106628358364105, + "learning_rate": 1.1508379040570714e-05, + "loss": 0.9075, + "step": 30240 + }, + { + "epoch": 0.9646353518925986, + "grad_norm": 0.29570499062538147, + "learning_rate": 1.146442770491943e-05, + "loss": 0.878, + "step": 30250 + }, + { + "epoch": 0.9649542396122325, + "grad_norm": 0.30375078320503235, + "learning_rate": 1.1420644222612106e-05, + "loss": 0.8909, + "step": 30260 + }, + { + "epoch": 0.9652731273318664, + "grad_norm": 0.2965579330921173, + "learning_rate": 1.1377027952604628e-05, + "loss": 0.8977, + "step": 30270 + }, + { + "epoch": 0.9655920150515004, + "grad_norm": 0.30961713194847107, + "learning_rate": 1.1333578256301075e-05, + "loss": 0.8859, + "step": 30280 + }, + { + "epoch": 0.9659109027711343, + "grad_norm": 0.2947857081890106, + "learning_rate": 1.129029449754437e-05, + "loss": 0.8957, + "step": 30290 + }, + { + "epoch": 0.9662297904907682, + "grad_norm": 0.2956250011920929, + "learning_rate": 1.1247176042606964e-05, + "loss": 0.9079, + "step": 30300 + }, + { + "epoch": 0.9665486782104021, + "grad_norm": 0.306384801864624, + "learning_rate": 1.1204222260181564e-05, + "loss": 0.8966, + "step": 30310 + }, + { + "epoch": 0.9668675659300361, + "grad_norm": 0.2974628210067749, + "learning_rate": 1.1161432521371883e-05, + "loss": 0.8998, + "step": 30320 + }, + { + "epoch": 0.9671864536496699, + "grad_norm": 0.3047086298465729, + "learning_rate": 1.1118806199683434e-05, + "loss": 0.8988, + "step": 30330 + }, + { + "epoch": 0.9675053413693039, + "grad_norm": 0.3030594289302826, + "learning_rate": 1.1076342671014357e-05, + "loss": 0.9229, + "step": 30340 + }, + { + "epoch": 0.9678242290889378, + "grad_norm": 0.32489851117134094, + "learning_rate": 1.1034041313646285e-05, + "loss": 0.9173, + "step": 30350 + }, + { + "epoch": 0.9681431168085717, + "grad_norm": 0.2998977601528168, + "learning_rate": 1.099190150823524e-05, + "loss": 0.9021, + "step": 30360 + }, + { + "epoch": 0.9684620045282056, + "grad_norm": 0.29817602038383484, + "learning_rate": 1.0949922637802553e-05, + "loss": 0.9189, + "step": 30370 + }, + { + "epoch": 0.9687808922478396, + "grad_norm": 0.30538299679756165, + "learning_rate": 1.0908104087725861e-05, + "loss": 0.8946, + "step": 30380 + }, + { + "epoch": 0.9690997799674734, + "grad_norm": 0.3044208884239197, + "learning_rate": 1.0866445245730072e-05, + "loss": 0.8986, + "step": 30390 + }, + { + "epoch": 0.9694186676871074, + "grad_norm": 0.29799777269363403, + "learning_rate": 1.0824945501878422e-05, + "loss": 0.9099, + "step": 30400 + }, + { + "epoch": 0.9697375554067413, + "grad_norm": 0.2967960238456726, + "learning_rate": 1.0783604248563538e-05, + "loss": 0.8932, + "step": 30410 + }, + { + "epoch": 0.9700564431263752, + "grad_norm": 0.30792462825775146, + "learning_rate": 1.0742420880498549e-05, + "loss": 0.9087, + "step": 30420 + }, + { + "epoch": 0.9703753308460091, + "grad_norm": 0.297896146774292, + "learning_rate": 1.0701394794708213e-05, + "loss": 0.9056, + "step": 30430 + }, + { + "epoch": 0.9706942185656431, + "grad_norm": 0.2979851961135864, + "learning_rate": 1.0660525390520096e-05, + "loss": 0.8883, + "step": 30440 + }, + { + "epoch": 0.9710131062852769, + "grad_norm": 0.2927461266517639, + "learning_rate": 1.0619812069555778e-05, + "loss": 0.906, + "step": 30450 + }, + { + "epoch": 0.9713319940049109, + "grad_norm": 0.2955889403820038, + "learning_rate": 1.0579254235722086e-05, + "loss": 0.9076, + "step": 30460 + }, + { + "epoch": 0.9716508817245448, + "grad_norm": 0.2992059290409088, + "learning_rate": 1.0538851295202372e-05, + "loss": 0.9145, + "step": 30470 + }, + { + "epoch": 0.9719697694441787, + "grad_norm": 0.30370664596557617, + "learning_rate": 1.0498602656447817e-05, + "loss": 0.9125, + "step": 30480 + }, + { + "epoch": 0.9722886571638126, + "grad_norm": 0.30064305663108826, + "learning_rate": 1.0458507730168771e-05, + "loss": 0.9096, + "step": 30490 + }, + { + "epoch": 0.9726075448834466, + "grad_norm": 0.28889572620391846, + "learning_rate": 1.0418565929326121e-05, + "loss": 0.9028, + "step": 30500 + }, + { + "epoch": 0.9729264326030804, + "grad_norm": 0.30345210433006287, + "learning_rate": 1.0378776669122702e-05, + "loss": 0.904, + "step": 30510 + }, + { + "epoch": 0.9732453203227144, + "grad_norm": 0.30365508794784546, + "learning_rate": 1.0339139366994728e-05, + "loss": 0.9115, + "step": 30520 + }, + { + "epoch": 0.9735642080423483, + "grad_norm": 0.3063391149044037, + "learning_rate": 1.0299653442603272e-05, + "loss": 0.9011, + "step": 30530 + }, + { + "epoch": 0.9738830957619822, + "grad_norm": 0.2992682456970215, + "learning_rate": 1.0260318317825752e-05, + "loss": 0.8967, + "step": 30540 + }, + { + "epoch": 0.9742019834816161, + "grad_norm": 0.31188657879829407, + "learning_rate": 1.0221133416747503e-05, + "loss": 0.9105, + "step": 30550 + }, + { + "epoch": 0.9745208712012501, + "grad_norm": 0.29710718989372253, + "learning_rate": 1.0182098165653291e-05, + "loss": 0.9099, + "step": 30560 + }, + { + "epoch": 0.9748397589208839, + "grad_norm": 0.30273959040641785, + "learning_rate": 1.014321199301896e-05, + "loss": 0.9047, + "step": 30570 + }, + { + "epoch": 0.9751586466405179, + "grad_norm": 0.3000819683074951, + "learning_rate": 1.0104474329503038e-05, + "loss": 0.9013, + "step": 30580 + }, + { + "epoch": 0.9754775343601518, + "grad_norm": 0.299858033657074, + "learning_rate": 1.0065884607938414e-05, + "loss": 0.9122, + "step": 30590 + }, + { + "epoch": 0.9757964220797857, + "grad_norm": 0.31173914670944214, + "learning_rate": 1.0027442263324029e-05, + "loss": 0.927, + "step": 30600 + }, + { + "epoch": 0.9761153097994196, + "grad_norm": 0.2916642427444458, + "learning_rate": 9.989146732816599e-06, + "loss": 0.9044, + "step": 30610 + }, + { + "epoch": 0.9764341975190536, + "grad_norm": 0.29491642117500305, + "learning_rate": 9.950997455722386e-06, + "loss": 0.902, + "step": 30620 + }, + { + "epoch": 0.9767530852386874, + "grad_norm": 0.29398080706596375, + "learning_rate": 9.912993873488982e-06, + "loss": 0.8944, + "step": 30630 + }, + { + "epoch": 0.9770719729583214, + "grad_norm": 0.3057700991630554, + "learning_rate": 9.875135429697123e-06, + "loss": 0.9103, + "step": 30640 + }, + { + "epoch": 0.9773908606779553, + "grad_norm": 0.3077187240123749, + "learning_rate": 9.83742157005256e-06, + "loss": 0.9091, + "step": 30650 + }, + { + "epoch": 0.9777097483975892, + "grad_norm": 0.30761876702308655, + "learning_rate": 9.79985174237793e-06, + "loss": 0.8964, + "step": 30660 + }, + { + "epoch": 0.9780286361172231, + "grad_norm": 0.3030130863189697, + "learning_rate": 9.762425396604675e-06, + "loss": 0.899, + "step": 30670 + }, + { + "epoch": 0.9783475238368571, + "grad_norm": 0.30985233187675476, + "learning_rate": 9.72514198476499e-06, + "loss": 0.9015, + "step": 30680 + }, + { + "epoch": 0.9786664115564909, + "grad_norm": 0.305759072303772, + "learning_rate": 9.688000960983798e-06, + "loss": 0.9086, + "step": 30690 + }, + { + "epoch": 0.9789852992761249, + "grad_norm": 0.2983715236186981, + "learning_rate": 9.65100178147076e-06, + "loss": 0.9056, + "step": 30700 + }, + { + "epoch": 0.9793041869957588, + "grad_norm": 0.30125829577445984, + "learning_rate": 9.614143904512304e-06, + "loss": 0.912, + "step": 30710 + }, + { + "epoch": 0.9796230747153927, + "grad_norm": 0.29553210735321045, + "learning_rate": 9.577426790463718e-06, + "loss": 0.8988, + "step": 30720 + }, + { + "epoch": 0.9799419624350266, + "grad_norm": 0.2971706688404083, + "learning_rate": 9.540849901741222e-06, + "loss": 0.8887, + "step": 30730 + }, + { + "epoch": 0.9802608501546606, + "grad_norm": 0.2950115501880646, + "learning_rate": 9.504412702814105e-06, + "loss": 0.8844, + "step": 30740 + }, + { + "epoch": 0.9805797378742944, + "grad_norm": 0.3017497956752777, + "learning_rate": 9.468114660196888e-06, + "loss": 0.9029, + "step": 30750 + }, + { + "epoch": 0.9808986255939284, + "grad_norm": 0.30914148688316345, + "learning_rate": 9.431955242441515e-06, + "loss": 0.9113, + "step": 30760 + }, + { + "epoch": 0.9812175133135623, + "grad_norm": 0.3126702308654785, + "learning_rate": 9.395933920129571e-06, + "loss": 0.8945, + "step": 30770 + }, + { + "epoch": 0.9815364010331962, + "grad_norm": 0.29880809783935547, + "learning_rate": 9.360050165864518e-06, + "loss": 0.9026, + "step": 30780 + }, + { + "epoch": 0.9818552887528301, + "grad_norm": 0.3105475604534149, + "learning_rate": 9.324303454263998e-06, + "loss": 0.8953, + "step": 30790 + }, + { + "epoch": 0.9821741764724641, + "grad_norm": 0.30883198976516724, + "learning_rate": 9.288693261952112e-06, + "loss": 0.8923, + "step": 30800 + }, + { + "epoch": 0.9824930641920979, + "grad_norm": 0.3052988350391388, + "learning_rate": 9.253219067551782e-06, + "loss": 0.8886, + "step": 30810 + }, + { + "epoch": 0.9828119519117319, + "grad_norm": 0.2951613664627075, + "learning_rate": 9.217880351677104e-06, + "loss": 0.891, + "step": 30820 + }, + { + "epoch": 0.9831308396313658, + "grad_norm": 0.29393279552459717, + "learning_rate": 9.182676596925743e-06, + "loss": 0.8915, + "step": 30830 + }, + { + "epoch": 0.9834497273509997, + "grad_norm": 0.2976394295692444, + "learning_rate": 9.147607287871367e-06, + "loss": 0.8896, + "step": 30840 + }, + { + "epoch": 0.9837686150706336, + "grad_norm": 0.30196166038513184, + "learning_rate": 9.112671911056089e-06, + "loss": 0.9013, + "step": 30850 + }, + { + "epoch": 0.9840875027902676, + "grad_norm": 0.29965201020240784, + "learning_rate": 9.077869954982961e-06, + "loss": 0.8945, + "step": 30860 + }, + { + "epoch": 0.9844063905099014, + "grad_norm": 0.2930929660797119, + "learning_rate": 9.043200910108472e-06, + "loss": 0.8928, + "step": 30870 + }, + { + "epoch": 0.9847252782295354, + "grad_norm": 0.3056459128856659, + "learning_rate": 9.008664268835097e-06, + "loss": 0.894, + "step": 30880 + }, + { + "epoch": 0.9850441659491693, + "grad_norm": 0.3035235106945038, + "learning_rate": 8.97425952550387e-06, + "loss": 0.9012, + "step": 30890 + }, + { + "epoch": 0.9853630536688032, + "grad_norm": 0.2955482602119446, + "learning_rate": 8.939986176386965e-06, + "loss": 0.9011, + "step": 30900 + }, + { + "epoch": 0.9856819413884371, + "grad_norm": 0.3025868535041809, + "learning_rate": 8.90584371968033e-06, + "loss": 0.8887, + "step": 30910 + }, + { + "epoch": 0.9860008291080711, + "grad_norm": 0.2991355061531067, + "learning_rate": 8.871831655496341e-06, + "loss": 0.9031, + "step": 30920 + }, + { + "epoch": 0.9863197168277049, + "grad_norm": 0.3063535690307617, + "learning_rate": 8.837949485856484e-06, + "loss": 0.8958, + "step": 30930 + }, + { + "epoch": 0.9866386045473389, + "grad_norm": 0.29752489924430847, + "learning_rate": 8.804196714684051e-06, + "loss": 0.8858, + "step": 30940 + }, + { + "epoch": 0.9869574922669728, + "grad_norm": 0.30858591198921204, + "learning_rate": 8.770572847796902e-06, + "loss": 0.8985, + "step": 30950 + }, + { + "epoch": 0.9872763799866067, + "grad_norm": 0.3076741099357605, + "learning_rate": 8.737077392900202e-06, + "loss": 0.9049, + "step": 30960 + }, + { + "epoch": 0.9875952677062406, + "grad_norm": 0.3044413924217224, + "learning_rate": 8.703709859579232e-06, + "loss": 0.9002, + "step": 30970 + }, + { + "epoch": 0.9879141554258746, + "grad_norm": 0.309249609708786, + "learning_rate": 8.670469759292197e-06, + "loss": 0.9028, + "step": 30980 + }, + { + "epoch": 0.9882330431455084, + "grad_norm": 0.3098140060901642, + "learning_rate": 8.637356605363086e-06, + "loss": 0.92, + "step": 30990 + }, + { + "epoch": 0.9885519308651424, + "grad_norm": 0.29409000277519226, + "learning_rate": 8.604369912974537e-06, + "loss": 0.895, + "step": 31000 + }, + { + "epoch": 0.9888708185847763, + "grad_norm": 0.3101252615451813, + "learning_rate": 8.571509199160733e-06, + "loss": 0.9301, + "step": 31010 + }, + { + "epoch": 0.9891897063044102, + "grad_norm": 0.316646009683609, + "learning_rate": 8.53877398280035e-06, + "loss": 0.8972, + "step": 31020 + }, + { + "epoch": 0.9895085940240441, + "grad_norm": 0.294486403465271, + "learning_rate": 8.506163784609493e-06, + "loss": 0.8991, + "step": 31030 + }, + { + "epoch": 0.9898274817436781, + "grad_norm": 0.29972952604293823, + "learning_rate": 8.47367812713469e-06, + "loss": 0.913, + "step": 31040 + }, + { + "epoch": 0.9901463694633119, + "grad_norm": 0.30866244435310364, + "learning_rate": 8.441316534745896e-06, + "loss": 0.9004, + "step": 31050 + }, + { + "epoch": 0.9904652571829459, + "grad_norm": 0.29569417238235474, + "learning_rate": 8.409078533629542e-06, + "loss": 0.8924, + "step": 31060 + }, + { + "epoch": 0.9907841449025798, + "grad_norm": 0.3136916160583496, + "learning_rate": 8.376963651781573e-06, + "loss": 0.8984, + "step": 31070 + }, + { + "epoch": 0.9911030326222137, + "grad_norm": 0.3088937997817993, + "learning_rate": 8.34497141900056e-06, + "loss": 0.8946, + "step": 31080 + }, + { + "epoch": 0.9914219203418476, + "grad_norm": 0.3017382025718689, + "learning_rate": 8.313101366880808e-06, + "loss": 0.895, + "step": 31090 + }, + { + "epoch": 0.9917408080614816, + "grad_norm": 0.30170178413391113, + "learning_rate": 8.28135302880549e-06, + "loss": 0.8961, + "step": 31100 + }, + { + "epoch": 0.9920596957811154, + "grad_norm": 0.2898147702217102, + "learning_rate": 8.24972593993983e-06, + "loss": 0.884, + "step": 31110 + }, + { + "epoch": 0.9923785835007494, + "grad_norm": 0.29774415493011475, + "learning_rate": 8.21821963722429e-06, + "loss": 0.8996, + "step": 31120 + }, + { + "epoch": 0.9926974712203833, + "grad_norm": 0.30368444323539734, + "learning_rate": 8.186833659367786e-06, + "loss": 0.8988, + "step": 31130 + }, + { + "epoch": 0.9930163589400173, + "grad_norm": 0.3021566569805145, + "learning_rate": 8.155567546840937e-06, + "loss": 0.9072, + "step": 31140 + }, + { + "epoch": 0.9933352466596511, + "grad_norm": 0.30956295132637024, + "learning_rate": 8.124420841869342e-06, + "loss": 0.8995, + "step": 31150 + }, + { + "epoch": 0.9936541343792851, + "grad_norm": 0.3030258119106293, + "learning_rate": 8.093393088426874e-06, + "loss": 0.894, + "step": 31160 + }, + { + "epoch": 0.993973022098919, + "grad_norm": 0.2955738306045532, + "learning_rate": 8.062483832229001e-06, + "loss": 0.8893, + "step": 31170 + }, + { + "epoch": 0.9942919098185529, + "grad_norm": 0.29659897089004517, + "learning_rate": 8.031692620726136e-06, + "loss": 0.8884, + "step": 31180 + }, + { + "epoch": 0.9946107975381868, + "grad_norm": 0.3124343156814575, + "learning_rate": 8.001019003097014e-06, + "loss": 0.8917, + "step": 31190 + }, + { + "epoch": 0.9949296852578208, + "grad_norm": 0.3097270727157593, + "learning_rate": 7.970462530242083e-06, + "loss": 0.894, + "step": 31200 + }, + { + "epoch": 0.9952485729774546, + "grad_norm": 0.31330564618110657, + "learning_rate": 7.940022754776945e-06, + "loss": 0.9184, + "step": 31210 + }, + { + "epoch": 0.9955674606970886, + "grad_norm": 0.296106219291687, + "learning_rate": 7.90969923102579e-06, + "loss": 0.8857, + "step": 31220 + }, + { + "epoch": 0.9958863484167225, + "grad_norm": 0.28833168745040894, + "learning_rate": 7.879491515014875e-06, + "loss": 0.8868, + "step": 31230 + }, + { + "epoch": 0.9962052361363564, + "grad_norm": 0.30367282032966614, + "learning_rate": 7.849399164466028e-06, + "loss": 0.8864, + "step": 31240 + }, + { + "epoch": 0.9965241238559903, + "grad_norm": 0.30643120408058167, + "learning_rate": 7.819421738790164e-06, + "loss": 0.9024, + "step": 31250 + }, + { + "epoch": 0.9968430115756243, + "grad_norm": 0.2968752086162567, + "learning_rate": 7.78955879908085e-06, + "loss": 0.8872, + "step": 31260 + }, + { + "epoch": 0.9971618992952581, + "grad_norm": 0.31228750944137573, + "learning_rate": 7.759809908107856e-06, + "loss": 0.9089, + "step": 31270 + }, + { + "epoch": 0.9974807870148921, + "grad_norm": 0.3094320595264435, + "learning_rate": 7.730174630310774e-06, + "loss": 0.8925, + "step": 31280 + }, + { + "epoch": 0.997799674734526, + "grad_norm": 0.2979593873023987, + "learning_rate": 7.700652531792634e-06, + "loss": 0.886, + "step": 31290 + }, + { + "epoch": 0.9981185624541599, + "grad_norm": 0.3050536513328552, + "learning_rate": 7.671243180313544e-06, + "loss": 0.9029, + "step": 31300 + }, + { + "epoch": 0.9984374501737938, + "grad_norm": 0.30217939615249634, + "learning_rate": 7.641946145284373e-06, + "loss": 0.8811, + "step": 31310 + }, + { + "epoch": 0.9987563378934278, + "grad_norm": 0.30297768115997314, + "learning_rate": 7.612760997760436e-06, + "loss": 0.8971, + "step": 31320 + }, + { + "epoch": 0.9990752256130616, + "grad_norm": 0.30359673500061035, + "learning_rate": 7.583687310435224e-06, + "loss": 0.9074, + "step": 31330 + }, + { + "epoch": 0.9993941133326956, + "grad_norm": 0.29697471857070923, + "learning_rate": 7.5547246576341395e-06, + "loss": 0.8844, + "step": 31340 + }, + { + "epoch": 0.9997130010523295, + "grad_norm": 0.30605244636535645, + "learning_rate": 7.525872615308268e-06, + "loss": 0.908, + "step": 31350 + } + ], + "logging_steps": 10, + "max_steps": 31359, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.828103613319793e+17, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null + }