diff --git "a/checkpoint-76500/trainer_state.json" "b/checkpoint-76500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-76500/trainer_state.json" @@ -0,0 +1,53571 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.16253434600375316, + "eval_steps": 500, + "global_step": 76500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.12463197390527e-05, + "grad_norm": 5.126564979553223, + "learning_rate": 2.1244954323348205e-07, + "loss": 2.7657, + "step": 10 + }, + { + "epoch": 4.24926394781054e-05, + "grad_norm": 5.615072727203369, + "learning_rate": 4.248990864669641e-07, + "loss": 2.7002, + "step": 20 + }, + { + "epoch": 6.37389592171581e-05, + "grad_norm": 3.057600498199463, + "learning_rate": 6.373486297004462e-07, + "loss": 2.6641, + "step": 30 + }, + { + "epoch": 8.49852789562108e-05, + "grad_norm": 3.074167251586914, + "learning_rate": 8.497981729339282e-07, + "loss": 2.8052, + "step": 40 + }, + { + "epoch": 0.00010623159869526351, + "grad_norm": 4.98861837387085, + "learning_rate": 1.0622477161674104e-06, + "loss": 2.7431, + "step": 50 + }, + { + "epoch": 0.0001274779184343162, + "grad_norm": 3.4498507976531982, + "learning_rate": 1.2746972594008924e-06, + "loss": 2.9199, + "step": 60 + }, + { + "epoch": 0.00014872423817336892, + "grad_norm": 11.24447250366211, + "learning_rate": 1.4871468026343744e-06, + "loss": 2.8124, + "step": 70 + }, + { + "epoch": 0.0001699705579124216, + "grad_norm": 4.361234188079834, + "learning_rate": 1.6995963458678564e-06, + "loss": 2.8042, + "step": 80 + }, + { + "epoch": 0.0001912168776514743, + "grad_norm": 2.065486431121826, + "learning_rate": 1.9120458891013386e-06, + "loss": 2.5817, + "step": 90 + }, + { + "epoch": 0.00021246319739052702, + "grad_norm": 1.7600457668304443, + "learning_rate": 2.124495432334821e-06, + "loss": 2.6734, + "step": 100 + }, + { + "epoch": 0.0002337095171295797, + "grad_norm": 3.210998773574829, + "learning_rate": 2.3369449755683026e-06, + "loss": 2.6951, + "step": 110 + }, + { + "epoch": 0.0002549558368686324, + "grad_norm": 6.885165214538574, + "learning_rate": 2.5493945188017848e-06, + "loss": 2.6578, + "step": 120 + }, + { + "epoch": 0.0002762021566076851, + "grad_norm": 4.454357624053955, + "learning_rate": 2.7618440620352666e-06, + "loss": 2.6411, + "step": 130 + }, + { + "epoch": 0.00029744847634673784, + "grad_norm": 2.535442590713501, + "learning_rate": 2.9742936052687488e-06, + "loss": 2.5801, + "step": 140 + }, + { + "epoch": 0.0003186947960857905, + "grad_norm": 1.3518927097320557, + "learning_rate": 3.186743148502231e-06, + "loss": 2.4699, + "step": 150 + }, + { + "epoch": 0.0003399411158248432, + "grad_norm": 1.2880351543426514, + "learning_rate": 3.3991926917357128e-06, + "loss": 2.3834, + "step": 160 + }, + { + "epoch": 0.00036118743556389594, + "grad_norm": 1.4115750789642334, + "learning_rate": 3.6116422349691954e-06, + "loss": 2.4117, + "step": 170 + }, + { + "epoch": 0.0003824337553029486, + "grad_norm": 11.405415534973145, + "learning_rate": 3.824091778202677e-06, + "loss": 2.4498, + "step": 180 + }, + { + "epoch": 0.0004036800750420013, + "grad_norm": 2.3859736919403076, + "learning_rate": 4.036541321436159e-06, + "loss": 2.4686, + "step": 190 + }, + { + "epoch": 0.00042492639478105404, + "grad_norm": 20.802696228027344, + "learning_rate": 4.248990864669642e-06, + "loss": 2.4032, + "step": 200 + }, + { + "epoch": 0.0004461727145201067, + "grad_norm": 1.9198743104934692, + "learning_rate": 4.461440407903123e-06, + "loss": 2.4303, + "step": 210 + }, + { + "epoch": 0.0004674190342591594, + "grad_norm": 0.7131376266479492, + "learning_rate": 4.673889951136605e-06, + "loss": 2.3437, + "step": 220 + }, + { + "epoch": 0.0004886653539982121, + "grad_norm": 1.3298033475875854, + "learning_rate": 4.886339494370088e-06, + "loss": 2.3295, + "step": 230 + }, + { + "epoch": 0.0005099116737372648, + "grad_norm": 2.0641324520111084, + "learning_rate": 5.0987890376035696e-06, + "loss": 2.3342, + "step": 240 + }, + { + "epoch": 0.0005311579934763175, + "grad_norm": 0.7873539328575134, + "learning_rate": 5.311238580837051e-06, + "loss": 2.2129, + "step": 250 + }, + { + "epoch": 0.0005524043132153702, + "grad_norm": 0.5126165747642517, + "learning_rate": 5.523688124070533e-06, + "loss": 2.2246, + "step": 260 + }, + { + "epoch": 0.0005736506329544229, + "grad_norm": 0.8253518342971802, + "learning_rate": 5.736137667304015e-06, + "loss": 2.2639, + "step": 270 + }, + { + "epoch": 0.0005948969526934757, + "grad_norm": 3.702577590942383, + "learning_rate": 5.9485872105374975e-06, + "loss": 2.265, + "step": 280 + }, + { + "epoch": 0.0006161432724325283, + "grad_norm": 0.7893111109733582, + "learning_rate": 6.161036753770979e-06, + "loss": 2.2702, + "step": 290 + }, + { + "epoch": 0.000637389592171581, + "grad_norm": 0.6148609519004822, + "learning_rate": 6.373486297004462e-06, + "loss": 2.2515, + "step": 300 + }, + { + "epoch": 0.0006586359119106338, + "grad_norm": 3.5288407802581787, + "learning_rate": 6.585935840237943e-06, + "loss": 2.1762, + "step": 310 + }, + { + "epoch": 0.0006798822316496864, + "grad_norm": 1.703134536743164, + "learning_rate": 6.7983853834714255e-06, + "loss": 2.2031, + "step": 320 + }, + { + "epoch": 0.0007011285513887391, + "grad_norm": 1.4556134939193726, + "learning_rate": 7.010834926704908e-06, + "loss": 2.2607, + "step": 330 + }, + { + "epoch": 0.0007223748711277919, + "grad_norm": 1.0154985189437866, + "learning_rate": 7.223284469938391e-06, + "loss": 2.2004, + "step": 340 + }, + { + "epoch": 0.0007436211908668445, + "grad_norm": 1.2762023210525513, + "learning_rate": 7.435734013171872e-06, + "loss": 2.1921, + "step": 350 + }, + { + "epoch": 0.0007648675106058972, + "grad_norm": 0.46400967240333557, + "learning_rate": 7.648183556405354e-06, + "loss": 2.1798, + "step": 360 + }, + { + "epoch": 0.00078611383034495, + "grad_norm": 13.1392822265625, + "learning_rate": 7.860633099638836e-06, + "loss": 2.0537, + "step": 370 + }, + { + "epoch": 0.0008073601500840026, + "grad_norm": 0.6416741609573364, + "learning_rate": 8.073082642872318e-06, + "loss": 2.122, + "step": 380 + }, + { + "epoch": 0.0008286064698230553, + "grad_norm": 0.926095724105835, + "learning_rate": 8.2855321861058e-06, + "loss": 2.0868, + "step": 390 + }, + { + "epoch": 0.0008498527895621081, + "grad_norm": 0.9541590213775635, + "learning_rate": 8.497981729339283e-06, + "loss": 2.1184, + "step": 400 + }, + { + "epoch": 0.0008710991093011607, + "grad_norm": 1.3921229839324951, + "learning_rate": 8.710431272572763e-06, + "loss": 1.9946, + "step": 410 + }, + { + "epoch": 0.0008923454290402134, + "grad_norm": 1.4127691984176636, + "learning_rate": 8.922880815806247e-06, + "loss": 2.0973, + "step": 420 + }, + { + "epoch": 0.0009135917487792662, + "grad_norm": 1.690733790397644, + "learning_rate": 9.135330359039729e-06, + "loss": 2.1253, + "step": 430 + }, + { + "epoch": 0.0009348380685183188, + "grad_norm": 0.9957524538040161, + "learning_rate": 9.34777990227321e-06, + "loss": 2.0787, + "step": 440 + }, + { + "epoch": 0.0009560843882573715, + "grad_norm": 1.0490230321884155, + "learning_rate": 9.560229445506692e-06, + "loss": 2.1351, + "step": 450 + }, + { + "epoch": 0.0009773307079964243, + "grad_norm": 0.45668384432792664, + "learning_rate": 9.772678988740176e-06, + "loss": 2.0879, + "step": 460 + }, + { + "epoch": 0.000998577027735477, + "grad_norm": 0.7475268244743347, + "learning_rate": 9.985128531973657e-06, + "loss": 2.0981, + "step": 470 + }, + { + "epoch": 0.0010198233474745296, + "grad_norm": 0.6253747344017029, + "learning_rate": 1.0197578075207139e-05, + "loss": 2.0244, + "step": 480 + }, + { + "epoch": 0.0010410696672135823, + "grad_norm": 0.7328705787658691, + "learning_rate": 1.0410027618440621e-05, + "loss": 2.0227, + "step": 490 + }, + { + "epoch": 0.001062315986952635, + "grad_norm": 0.5342766046524048, + "learning_rate": 1.0622477161674103e-05, + "loss": 2.0476, + "step": 500 + }, + { + "epoch": 0.0010835623066916878, + "grad_norm": 2.2934184074401855, + "learning_rate": 1.0834926704907584e-05, + "loss": 1.9868, + "step": 510 + }, + { + "epoch": 0.0011048086264307405, + "grad_norm": 0.4630323648452759, + "learning_rate": 1.1047376248141066e-05, + "loss": 2.0251, + "step": 520 + }, + { + "epoch": 0.0011260549461697932, + "grad_norm": 0.4629395008087158, + "learning_rate": 1.125982579137455e-05, + "loss": 1.9645, + "step": 530 + }, + { + "epoch": 0.0011473012659088458, + "grad_norm": 0.557029664516449, + "learning_rate": 1.147227533460803e-05, + "loss": 2.021, + "step": 540 + }, + { + "epoch": 0.0011685475856478985, + "grad_norm": 0.6987115740776062, + "learning_rate": 1.1684724877841513e-05, + "loss": 2.0179, + "step": 550 + }, + { + "epoch": 0.0011897939053869514, + "grad_norm": 0.42350128293037415, + "learning_rate": 1.1897174421074995e-05, + "loss": 1.9506, + "step": 560 + }, + { + "epoch": 0.001211040225126004, + "grad_norm": 0.48130160570144653, + "learning_rate": 1.2109623964308479e-05, + "loss": 1.964, + "step": 570 + }, + { + "epoch": 0.0012322865448650567, + "grad_norm": 0.7689043879508972, + "learning_rate": 1.2322073507541959e-05, + "loss": 2.0156, + "step": 580 + }, + { + "epoch": 0.0012535328646041094, + "grad_norm": 0.49595606327056885, + "learning_rate": 1.253452305077544e-05, + "loss": 1.8801, + "step": 590 + }, + { + "epoch": 0.001274779184343162, + "grad_norm": 0.5113564133644104, + "learning_rate": 1.2746972594008924e-05, + "loss": 1.9157, + "step": 600 + }, + { + "epoch": 0.0012960255040822147, + "grad_norm": 0.6487011909484863, + "learning_rate": 1.2959422137242406e-05, + "loss": 1.9332, + "step": 610 + }, + { + "epoch": 0.0013172718238212676, + "grad_norm": 0.5928911566734314, + "learning_rate": 1.3171871680475886e-05, + "loss": 1.9283, + "step": 620 + }, + { + "epoch": 0.0013385181435603202, + "grad_norm": 0.5895127654075623, + "learning_rate": 1.3384321223709371e-05, + "loss": 1.9431, + "step": 630 + }, + { + "epoch": 0.001359764463299373, + "grad_norm": 0.5481201410293579, + "learning_rate": 1.3596770766942851e-05, + "loss": 1.93, + "step": 640 + }, + { + "epoch": 0.0013810107830384256, + "grad_norm": 0.9899961948394775, + "learning_rate": 1.3809220310176335e-05, + "loss": 1.9267, + "step": 650 + }, + { + "epoch": 0.0014022571027774782, + "grad_norm": 0.6837050914764404, + "learning_rate": 1.4021669853409816e-05, + "loss": 1.9151, + "step": 660 + }, + { + "epoch": 0.0014235034225165309, + "grad_norm": 0.5140948295593262, + "learning_rate": 1.4234119396643298e-05, + "loss": 1.9076, + "step": 670 + }, + { + "epoch": 0.0014447497422555838, + "grad_norm": 0.9297535419464111, + "learning_rate": 1.4446568939876782e-05, + "loss": 1.9197, + "step": 680 + }, + { + "epoch": 0.0014659960619946364, + "grad_norm": 0.5466400980949402, + "learning_rate": 1.4659018483110262e-05, + "loss": 1.8364, + "step": 690 + }, + { + "epoch": 0.001487242381733689, + "grad_norm": 0.8906233310699463, + "learning_rate": 1.4871468026343743e-05, + "loss": 1.8941, + "step": 700 + }, + { + "epoch": 0.0015084887014727418, + "grad_norm": 1.1113930940628052, + "learning_rate": 1.5083917569577227e-05, + "loss": 1.886, + "step": 710 + }, + { + "epoch": 0.0015297350212117944, + "grad_norm": 1.0322551727294922, + "learning_rate": 1.529636711281071e-05, + "loss": 1.9178, + "step": 720 + }, + { + "epoch": 0.001550981340950847, + "grad_norm": 0.7651371359825134, + "learning_rate": 1.550881665604419e-05, + "loss": 1.8575, + "step": 730 + }, + { + "epoch": 0.0015722276606899, + "grad_norm": 0.6901090145111084, + "learning_rate": 1.5721266199277672e-05, + "loss": 1.8752, + "step": 740 + }, + { + "epoch": 0.0015934739804289526, + "grad_norm": 0.6597158908843994, + "learning_rate": 1.5933715742511156e-05, + "loss": 1.8289, + "step": 750 + }, + { + "epoch": 0.0016147203001680053, + "grad_norm": 0.641681432723999, + "learning_rate": 1.6146165285744636e-05, + "loss": 1.8524, + "step": 760 + }, + { + "epoch": 0.001635966619907058, + "grad_norm": 0.7931463718414307, + "learning_rate": 1.635861482897812e-05, + "loss": 1.8655, + "step": 770 + }, + { + "epoch": 0.0016572129396461106, + "grad_norm": 0.6841832995414734, + "learning_rate": 1.65710643722116e-05, + "loss": 1.8041, + "step": 780 + }, + { + "epoch": 0.0016784592593851633, + "grad_norm": 1.2679414749145508, + "learning_rate": 1.6783513915445083e-05, + "loss": 1.8491, + "step": 790 + }, + { + "epoch": 0.0016997055791242162, + "grad_norm": 0.6396725177764893, + "learning_rate": 1.6995963458678566e-05, + "loss": 1.7926, + "step": 800 + }, + { + "epoch": 0.0017209518988632688, + "grad_norm": 0.7741534113883972, + "learning_rate": 1.7208413001912046e-05, + "loss": 1.8503, + "step": 810 + }, + { + "epoch": 0.0017421982186023215, + "grad_norm": 0.9238278865814209, + "learning_rate": 1.7420862545145527e-05, + "loss": 1.8379, + "step": 820 + }, + { + "epoch": 0.0017634445383413742, + "grad_norm": 1.3347398042678833, + "learning_rate": 1.763331208837901e-05, + "loss": 1.8837, + "step": 830 + }, + { + "epoch": 0.0017846908580804268, + "grad_norm": 0.6294313669204712, + "learning_rate": 1.7845761631612493e-05, + "loss": 1.8298, + "step": 840 + }, + { + "epoch": 0.0018059371778194795, + "grad_norm": 0.7493991255760193, + "learning_rate": 1.8058211174845974e-05, + "loss": 1.8081, + "step": 850 + }, + { + "epoch": 0.0018271834975585324, + "grad_norm": 0.6665028929710388, + "learning_rate": 1.8270660718079457e-05, + "loss": 1.8021, + "step": 860 + }, + { + "epoch": 0.001848429817297585, + "grad_norm": 0.7960326075553894, + "learning_rate": 1.8483110261312937e-05, + "loss": 1.8354, + "step": 870 + }, + { + "epoch": 0.0018696761370366377, + "grad_norm": 1.2572838068008423, + "learning_rate": 1.869555980454642e-05, + "loss": 1.8135, + "step": 880 + }, + { + "epoch": 0.0018909224567756904, + "grad_norm": 1.5625611543655396, + "learning_rate": 1.8908009347779904e-05, + "loss": 1.8464, + "step": 890 + }, + { + "epoch": 0.001912168776514743, + "grad_norm": 1.0861330032348633, + "learning_rate": 1.9120458891013384e-05, + "loss": 1.8046, + "step": 900 + }, + { + "epoch": 0.0019334150962537957, + "grad_norm": 0.6853525638580322, + "learning_rate": 1.9332908434246868e-05, + "loss": 1.8054, + "step": 910 + }, + { + "epoch": 0.0019546614159928486, + "grad_norm": 2.140418767929077, + "learning_rate": 1.954535797748035e-05, + "loss": 1.7692, + "step": 920 + }, + { + "epoch": 0.001975907735731901, + "grad_norm": 0.6957737803459167, + "learning_rate": 1.975780752071383e-05, + "loss": 1.8178, + "step": 930 + }, + { + "epoch": 0.001997154055470954, + "grad_norm": 0.6889773607254028, + "learning_rate": 1.9970257063947315e-05, + "loss": 1.7725, + "step": 940 + }, + { + "epoch": 0.0020184003752100068, + "grad_norm": 0.8981391191482544, + "learning_rate": 2.0182706607180795e-05, + "loss": 1.8371, + "step": 950 + }, + { + "epoch": 0.0020396466949490592, + "grad_norm": 1.3700798749923706, + "learning_rate": 2.0395156150414278e-05, + "loss": 1.7892, + "step": 960 + }, + { + "epoch": 0.002060893014688112, + "grad_norm": 1.5715270042419434, + "learning_rate": 2.0607605693647762e-05, + "loss": 1.8233, + "step": 970 + }, + { + "epoch": 0.0020821393344271646, + "grad_norm": 1.1580621004104614, + "learning_rate": 2.0820055236881242e-05, + "loss": 1.7519, + "step": 980 + }, + { + "epoch": 0.0021033856541662174, + "grad_norm": 0.738719642162323, + "learning_rate": 2.1032504780114722e-05, + "loss": 1.7582, + "step": 990 + }, + { + "epoch": 0.00212463197390527, + "grad_norm": 1.3361958265304565, + "learning_rate": 2.1244954323348205e-05, + "loss": 1.791, + "step": 1000 + }, + { + "epoch": 0.0021458782936443228, + "grad_norm": 1.2333531379699707, + "learning_rate": 2.145740386658169e-05, + "loss": 1.8098, + "step": 1010 + }, + { + "epoch": 0.0021671246133833756, + "grad_norm": 0.6487088203430176, + "learning_rate": 2.166985340981517e-05, + "loss": 1.7916, + "step": 1020 + }, + { + "epoch": 0.002188370933122428, + "grad_norm": 1.1477603912353516, + "learning_rate": 2.1882302953048652e-05, + "loss": 1.7488, + "step": 1030 + }, + { + "epoch": 0.002209617252861481, + "grad_norm": 0.7726391553878784, + "learning_rate": 2.2094752496282133e-05, + "loss": 1.7596, + "step": 1040 + }, + { + "epoch": 0.0022308635726005334, + "grad_norm": 1.6083648204803467, + "learning_rate": 2.2307202039515616e-05, + "loss": 1.7923, + "step": 1050 + }, + { + "epoch": 0.0022521098923395863, + "grad_norm": 1.3215006589889526, + "learning_rate": 2.25196515827491e-05, + "loss": 1.7839, + "step": 1060 + }, + { + "epoch": 0.002273356212078639, + "grad_norm": 0.8354921340942383, + "learning_rate": 2.273210112598258e-05, + "loss": 1.7818, + "step": 1070 + }, + { + "epoch": 0.0022946025318176916, + "grad_norm": 0.9046885967254639, + "learning_rate": 2.294455066921606e-05, + "loss": 1.7658, + "step": 1080 + }, + { + "epoch": 0.0023158488515567445, + "grad_norm": 0.9762981534004211, + "learning_rate": 2.3157000212449547e-05, + "loss": 1.763, + "step": 1090 + }, + { + "epoch": 0.002337095171295797, + "grad_norm": 3.1376545429229736, + "learning_rate": 2.3369449755683027e-05, + "loss": 1.8302, + "step": 1100 + }, + { + "epoch": 0.00235834149103485, + "grad_norm": 1.9120508432388306, + "learning_rate": 2.3581899298916507e-05, + "loss": 1.7852, + "step": 1110 + }, + { + "epoch": 0.0023795878107739027, + "grad_norm": 0.7516286969184875, + "learning_rate": 2.379434884214999e-05, + "loss": 1.7764, + "step": 1120 + }, + { + "epoch": 0.002400834130512955, + "grad_norm": 0.8718913793563843, + "learning_rate": 2.4006798385383474e-05, + "loss": 1.7885, + "step": 1130 + }, + { + "epoch": 0.002422080450252008, + "grad_norm": 0.8511870503425598, + "learning_rate": 2.4219247928616957e-05, + "loss": 1.7398, + "step": 1140 + }, + { + "epoch": 0.0024433267699910605, + "grad_norm": 0.8492255806922913, + "learning_rate": 2.4431697471850437e-05, + "loss": 1.7867, + "step": 1150 + }, + { + "epoch": 0.0024645730897301134, + "grad_norm": 1.1522537469863892, + "learning_rate": 2.4644147015083917e-05, + "loss": 1.7195, + "step": 1160 + }, + { + "epoch": 0.002485819409469166, + "grad_norm": 0.9927936792373657, + "learning_rate": 2.48565965583174e-05, + "loss": 1.7516, + "step": 1170 + }, + { + "epoch": 0.0025070657292082187, + "grad_norm": 0.7463197112083435, + "learning_rate": 2.506904610155088e-05, + "loss": 1.7119, + "step": 1180 + }, + { + "epoch": 0.0025283120489472716, + "grad_norm": 1.1197805404663086, + "learning_rate": 2.5281495644784364e-05, + "loss": 1.7251, + "step": 1190 + }, + { + "epoch": 0.002549558368686324, + "grad_norm": 2.0068371295928955, + "learning_rate": 2.5493945188017848e-05, + "loss": 1.7451, + "step": 1200 + }, + { + "epoch": 0.002570804688425377, + "grad_norm": 0.8472912311553955, + "learning_rate": 2.5706394731251328e-05, + "loss": 1.7385, + "step": 1210 + }, + { + "epoch": 0.0025920510081644294, + "grad_norm": 0.8593330979347229, + "learning_rate": 2.591884427448481e-05, + "loss": 1.7619, + "step": 1220 + }, + { + "epoch": 0.0026132973279034822, + "grad_norm": 1.2452706098556519, + "learning_rate": 2.6131293817718295e-05, + "loss": 1.7113, + "step": 1230 + }, + { + "epoch": 0.002634543647642535, + "grad_norm": 0.7820441722869873, + "learning_rate": 2.634374336095177e-05, + "loss": 1.7338, + "step": 1240 + }, + { + "epoch": 0.0026557899673815876, + "grad_norm": 0.9799504280090332, + "learning_rate": 2.6556192904185255e-05, + "loss": 1.6964, + "step": 1250 + }, + { + "epoch": 0.0026770362871206405, + "grad_norm": 0.9213830828666687, + "learning_rate": 2.6768642447418742e-05, + "loss": 1.7501, + "step": 1260 + }, + { + "epoch": 0.002698282606859693, + "grad_norm": 0.8344391584396362, + "learning_rate": 2.6981091990652225e-05, + "loss": 1.7527, + "step": 1270 + }, + { + "epoch": 0.002719528926598746, + "grad_norm": 0.8629624247550964, + "learning_rate": 2.7193541533885702e-05, + "loss": 1.7267, + "step": 1280 + }, + { + "epoch": 0.0027407752463377982, + "grad_norm": 1.0476369857788086, + "learning_rate": 2.7405991077119186e-05, + "loss": 1.802, + "step": 1290 + }, + { + "epoch": 0.002762021566076851, + "grad_norm": 0.9389401078224182, + "learning_rate": 2.761844062035267e-05, + "loss": 1.6892, + "step": 1300 + }, + { + "epoch": 0.002783267885815904, + "grad_norm": 0.750410795211792, + "learning_rate": 2.783089016358615e-05, + "loss": 1.6972, + "step": 1310 + }, + { + "epoch": 0.0028045142055549564, + "grad_norm": 1.202742338180542, + "learning_rate": 2.8043339706819633e-05, + "loss": 1.7339, + "step": 1320 + }, + { + "epoch": 0.0028257605252940093, + "grad_norm": 1.282296895980835, + "learning_rate": 2.8255789250053116e-05, + "loss": 1.7138, + "step": 1330 + }, + { + "epoch": 0.0028470068450330618, + "grad_norm": 1.6468796730041504, + "learning_rate": 2.8468238793286596e-05, + "loss": 1.6946, + "step": 1340 + }, + { + "epoch": 0.0028682531647721147, + "grad_norm": 2.280949354171753, + "learning_rate": 2.868068833652008e-05, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 0.0028894994845111675, + "grad_norm": 0.7980895638465881, + "learning_rate": 2.8893137879753563e-05, + "loss": 1.7361, + "step": 1360 + }, + { + "epoch": 0.00291074580425022, + "grad_norm": 0.8235357403755188, + "learning_rate": 2.910558742298704e-05, + "loss": 1.7062, + "step": 1370 + }, + { + "epoch": 0.002931992123989273, + "grad_norm": 1.3008054494857788, + "learning_rate": 2.9318036966220523e-05, + "loss": 1.74, + "step": 1380 + }, + { + "epoch": 0.0029532384437283253, + "grad_norm": 2.110330104827881, + "learning_rate": 2.9530486509454007e-05, + "loss": 1.7116, + "step": 1390 + }, + { + "epoch": 0.002974484763467378, + "grad_norm": 1.468788981437683, + "learning_rate": 2.9742936052687487e-05, + "loss": 1.6613, + "step": 1400 + }, + { + "epoch": 0.0029957310832064306, + "grad_norm": 0.9747596979141235, + "learning_rate": 2.995538559592097e-05, + "loss": 1.7168, + "step": 1410 + }, + { + "epoch": 0.0030169774029454835, + "grad_norm": 1.1915490627288818, + "learning_rate": 3.0167835139154454e-05, + "loss": 1.6702, + "step": 1420 + }, + { + "epoch": 0.0030382237226845364, + "grad_norm": 1.434261441230774, + "learning_rate": 3.0380284682387934e-05, + "loss": 1.6916, + "step": 1430 + }, + { + "epoch": 0.003059470042423589, + "grad_norm": 1.6392701864242554, + "learning_rate": 3.059273422562142e-05, + "loss": 1.7109, + "step": 1440 + }, + { + "epoch": 0.0030807163621626417, + "grad_norm": 1.096319556236267, + "learning_rate": 3.08051837688549e-05, + "loss": 1.7278, + "step": 1450 + }, + { + "epoch": 0.003101962681901694, + "grad_norm": 0.8512833714485168, + "learning_rate": 3.101763331208838e-05, + "loss": 1.7079, + "step": 1460 + }, + { + "epoch": 0.003123209001640747, + "grad_norm": 1.0432088375091553, + "learning_rate": 3.1230082855321864e-05, + "loss": 1.7229, + "step": 1470 + }, + { + "epoch": 0.0031444553213798, + "grad_norm": 0.7871159911155701, + "learning_rate": 3.1442532398555345e-05, + "loss": 1.6707, + "step": 1480 + }, + { + "epoch": 0.0031657016411188524, + "grad_norm": 0.7996189594268799, + "learning_rate": 3.1654981941788825e-05, + "loss": 1.6789, + "step": 1490 + }, + { + "epoch": 0.0031869479608579053, + "grad_norm": 1.0813953876495361, + "learning_rate": 3.186743148502231e-05, + "loss": 1.68, + "step": 1500 + }, + { + "epoch": 0.0032081942805969577, + "grad_norm": 1.054692268371582, + "learning_rate": 3.207988102825579e-05, + "loss": 1.6919, + "step": 1510 + }, + { + "epoch": 0.0032294406003360106, + "grad_norm": 1.0501441955566406, + "learning_rate": 3.229233057148927e-05, + "loss": 1.6954, + "step": 1520 + }, + { + "epoch": 0.003250686920075063, + "grad_norm": 0.7806438207626343, + "learning_rate": 3.250478011472275e-05, + "loss": 1.7228, + "step": 1530 + }, + { + "epoch": 0.003271933239814116, + "grad_norm": 1.4988213777542114, + "learning_rate": 3.271722965795624e-05, + "loss": 1.7193, + "step": 1540 + }, + { + "epoch": 0.003293179559553169, + "grad_norm": 1.2674223184585571, + "learning_rate": 3.292967920118972e-05, + "loss": 1.7316, + "step": 1550 + }, + { + "epoch": 0.0033144258792922212, + "grad_norm": 0.8796882033348083, + "learning_rate": 3.31421287444232e-05, + "loss": 1.7139, + "step": 1560 + }, + { + "epoch": 0.003335672199031274, + "grad_norm": 1.512904405593872, + "learning_rate": 3.3354578287656686e-05, + "loss": 1.691, + "step": 1570 + }, + { + "epoch": 0.0033569185187703266, + "grad_norm": 1.3386465311050415, + "learning_rate": 3.3567027830890166e-05, + "loss": 1.6686, + "step": 1580 + }, + { + "epoch": 0.0033781648385093795, + "grad_norm": 1.0487931966781616, + "learning_rate": 3.3779477374123646e-05, + "loss": 1.6343, + "step": 1590 + }, + { + "epoch": 0.0033994111582484323, + "grad_norm": 0.9472367763519287, + "learning_rate": 3.399192691735713e-05, + "loss": 1.663, + "step": 1600 + }, + { + "epoch": 0.003420657477987485, + "grad_norm": 0.9561249613761902, + "learning_rate": 3.4204376460590606e-05, + "loss": 1.6634, + "step": 1610 + }, + { + "epoch": 0.0034419037977265377, + "grad_norm": 0.79259192943573, + "learning_rate": 3.441682600382409e-05, + "loss": 1.6865, + "step": 1620 + }, + { + "epoch": 0.00346315011746559, + "grad_norm": 0.8566011786460876, + "learning_rate": 3.462927554705758e-05, + "loss": 1.6455, + "step": 1630 + }, + { + "epoch": 0.003484396437204643, + "grad_norm": 0.9912124872207642, + "learning_rate": 3.484172509029105e-05, + "loss": 1.7166, + "step": 1640 + }, + { + "epoch": 0.0035056427569436954, + "grad_norm": 1.0376073122024536, + "learning_rate": 3.505417463352454e-05, + "loss": 1.7161, + "step": 1650 + }, + { + "epoch": 0.0035268890766827483, + "grad_norm": 0.9525182247161865, + "learning_rate": 3.526662417675802e-05, + "loss": 1.6967, + "step": 1660 + }, + { + "epoch": 0.003548135396421801, + "grad_norm": 0.8527894020080566, + "learning_rate": 3.54790737199915e-05, + "loss": 1.6937, + "step": 1670 + }, + { + "epoch": 0.0035693817161608537, + "grad_norm": 1.1493324041366577, + "learning_rate": 3.569152326322499e-05, + "loss": 1.6653, + "step": 1680 + }, + { + "epoch": 0.0035906280358999065, + "grad_norm": 0.9014407396316528, + "learning_rate": 3.590397280645847e-05, + "loss": 1.6753, + "step": 1690 + }, + { + "epoch": 0.003611874355638959, + "grad_norm": 0.928118109703064, + "learning_rate": 3.611642234969195e-05, + "loss": 1.709, + "step": 1700 + }, + { + "epoch": 0.003633120675378012, + "grad_norm": 1.5548665523529053, + "learning_rate": 3.6328871892925434e-05, + "loss": 1.6566, + "step": 1710 + }, + { + "epoch": 0.0036543669951170647, + "grad_norm": 1.049028754234314, + "learning_rate": 3.6541321436158914e-05, + "loss": 1.6622, + "step": 1720 + }, + { + "epoch": 0.003675613314856117, + "grad_norm": 0.8538280129432678, + "learning_rate": 3.6753770979392394e-05, + "loss": 1.7, + "step": 1730 + }, + { + "epoch": 0.00369685963459517, + "grad_norm": 1.0827655792236328, + "learning_rate": 3.6966220522625874e-05, + "loss": 1.6794, + "step": 1740 + }, + { + "epoch": 0.0037181059543342225, + "grad_norm": 1.0013504028320312, + "learning_rate": 3.717867006585936e-05, + "loss": 1.6742, + "step": 1750 + }, + { + "epoch": 0.0037393522740732754, + "grad_norm": 0.921330451965332, + "learning_rate": 3.739111960909284e-05, + "loss": 1.6464, + "step": 1760 + }, + { + "epoch": 0.0037605985938123283, + "grad_norm": 1.0056246519088745, + "learning_rate": 3.760356915232632e-05, + "loss": 1.6511, + "step": 1770 + }, + { + "epoch": 0.0037818449135513807, + "grad_norm": 0.7334320545196533, + "learning_rate": 3.781601869555981e-05, + "loss": 1.6395, + "step": 1780 + }, + { + "epoch": 0.0038030912332904336, + "grad_norm": 0.9269031882286072, + "learning_rate": 3.802846823879329e-05, + "loss": 1.6523, + "step": 1790 + }, + { + "epoch": 0.003824337553029486, + "grad_norm": 1.1196471452713013, + "learning_rate": 3.824091778202677e-05, + "loss": 1.6579, + "step": 1800 + }, + { + "epoch": 0.003845583872768539, + "grad_norm": 0.7869518399238586, + "learning_rate": 3.8453367325260255e-05, + "loss": 1.654, + "step": 1810 + }, + { + "epoch": 0.0038668301925075914, + "grad_norm": 0.9754536151885986, + "learning_rate": 3.8665816868493735e-05, + "loss": 1.646, + "step": 1820 + }, + { + "epoch": 0.0038880765122466443, + "grad_norm": 1.1568890810012817, + "learning_rate": 3.8878266411727215e-05, + "loss": 1.6575, + "step": 1830 + }, + { + "epoch": 0.003909322831985697, + "grad_norm": 1.117510437965393, + "learning_rate": 3.90907159549607e-05, + "loss": 1.6326, + "step": 1840 + }, + { + "epoch": 0.00393056915172475, + "grad_norm": 1.0115188360214233, + "learning_rate": 3.930316549819418e-05, + "loss": 1.6668, + "step": 1850 + }, + { + "epoch": 0.003951815471463802, + "grad_norm": 0.8613663911819458, + "learning_rate": 3.951561504142766e-05, + "loss": 1.6695, + "step": 1860 + }, + { + "epoch": 0.003973061791202855, + "grad_norm": 0.7858186960220337, + "learning_rate": 3.972806458466114e-05, + "loss": 1.6433, + "step": 1870 + }, + { + "epoch": 0.003994308110941908, + "grad_norm": 0.9840145707130432, + "learning_rate": 3.994051412789463e-05, + "loss": 1.6522, + "step": 1880 + }, + { + "epoch": 0.004015554430680961, + "grad_norm": 1.4670604467391968, + "learning_rate": 4.015296367112811e-05, + "loss": 1.6056, + "step": 1890 + }, + { + "epoch": 0.0040368007504200136, + "grad_norm": 0.9330235719680786, + "learning_rate": 4.036541321436159e-05, + "loss": 1.6182, + "step": 1900 + }, + { + "epoch": 0.004058047070159066, + "grad_norm": 0.8840314149856567, + "learning_rate": 4.0577862757595076e-05, + "loss": 1.6283, + "step": 1910 + }, + { + "epoch": 0.0040792933898981185, + "grad_norm": 0.912764310836792, + "learning_rate": 4.0790312300828557e-05, + "loss": 1.6587, + "step": 1920 + }, + { + "epoch": 0.004100539709637171, + "grad_norm": 1.192991852760315, + "learning_rate": 4.1002761844062037e-05, + "loss": 1.65, + "step": 1930 + }, + { + "epoch": 0.004121786029376224, + "grad_norm": 0.8729370832443237, + "learning_rate": 4.1215211387295523e-05, + "loss": 1.6542, + "step": 1940 + }, + { + "epoch": 0.004143032349115277, + "grad_norm": 1.2994292974472046, + "learning_rate": 4.1427660930529e-05, + "loss": 1.6347, + "step": 1950 + }, + { + "epoch": 0.004164278668854329, + "grad_norm": 1.0736210346221924, + "learning_rate": 4.1640110473762484e-05, + "loss": 1.7099, + "step": 1960 + }, + { + "epoch": 0.004185524988593382, + "grad_norm": 0.780449628829956, + "learning_rate": 4.185256001699597e-05, + "loss": 1.6502, + "step": 1970 + }, + { + "epoch": 0.004206771308332435, + "grad_norm": 0.9329586625099182, + "learning_rate": 4.2065009560229444e-05, + "loss": 1.6669, + "step": 1980 + }, + { + "epoch": 0.004228017628071488, + "grad_norm": 1.054909348487854, + "learning_rate": 4.227745910346293e-05, + "loss": 1.6195, + "step": 1990 + }, + { + "epoch": 0.00424926394781054, + "grad_norm": 0.7247118353843689, + "learning_rate": 4.248990864669641e-05, + "loss": 1.598, + "step": 2000 + }, + { + "epoch": 0.004270510267549593, + "grad_norm": 1.6144375801086426, + "learning_rate": 4.270235818992989e-05, + "loss": 1.6632, + "step": 2010 + }, + { + "epoch": 0.0042917565872886455, + "grad_norm": 1.1903817653656006, + "learning_rate": 4.291480773316338e-05, + "loss": 1.5649, + "step": 2020 + }, + { + "epoch": 0.004313002907027698, + "grad_norm": 1.7288334369659424, + "learning_rate": 4.312725727639686e-05, + "loss": 1.6322, + "step": 2030 + }, + { + "epoch": 0.004334249226766751, + "grad_norm": 1.0450774431228638, + "learning_rate": 4.333970681963034e-05, + "loss": 1.5958, + "step": 2040 + }, + { + "epoch": 0.004355495546505803, + "grad_norm": 1.926113486289978, + "learning_rate": 4.3552156362863825e-05, + "loss": 1.578, + "step": 2050 + }, + { + "epoch": 0.004376741866244856, + "grad_norm": 0.8286908864974976, + "learning_rate": 4.3764605906097305e-05, + "loss": 1.6697, + "step": 2060 + }, + { + "epoch": 0.004397988185983909, + "grad_norm": 1.0654383897781372, + "learning_rate": 4.3977055449330785e-05, + "loss": 1.6436, + "step": 2070 + }, + { + "epoch": 0.004419234505722962, + "grad_norm": 0.9462096691131592, + "learning_rate": 4.4189504992564265e-05, + "loss": 1.5845, + "step": 2080 + }, + { + "epoch": 0.004440480825462015, + "grad_norm": 1.9075877666473389, + "learning_rate": 4.440195453579775e-05, + "loss": 1.5917, + "step": 2090 + }, + { + "epoch": 0.004461727145201067, + "grad_norm": 1.3778623342514038, + "learning_rate": 4.461440407903123e-05, + "loss": 1.6351, + "step": 2100 + }, + { + "epoch": 0.00448297346494012, + "grad_norm": 1.9135502576828003, + "learning_rate": 4.482685362226471e-05, + "loss": 1.68, + "step": 2110 + }, + { + "epoch": 0.004504219784679173, + "grad_norm": 1.743018627166748, + "learning_rate": 4.50393031654982e-05, + "loss": 1.5906, + "step": 2120 + }, + { + "epoch": 0.0045254661044182255, + "grad_norm": 1.9830175638198853, + "learning_rate": 4.525175270873168e-05, + "loss": 1.604, + "step": 2130 + }, + { + "epoch": 0.004546712424157278, + "grad_norm": 0.9616091251373291, + "learning_rate": 4.546420225196516e-05, + "loss": 1.6113, + "step": 2140 + }, + { + "epoch": 0.00456795874389633, + "grad_norm": 1.3615095615386963, + "learning_rate": 4.5676651795198646e-05, + "loss": 1.605, + "step": 2150 + }, + { + "epoch": 0.004589205063635383, + "grad_norm": 1.2730841636657715, + "learning_rate": 4.588910133843212e-05, + "loss": 1.6453, + "step": 2160 + }, + { + "epoch": 0.004610451383374436, + "grad_norm": 0.8637818098068237, + "learning_rate": 4.6101550881665606e-05, + "loss": 1.5901, + "step": 2170 + }, + { + "epoch": 0.004631697703113489, + "grad_norm": 0.953292191028595, + "learning_rate": 4.631400042489909e-05, + "loss": 1.591, + "step": 2180 + }, + { + "epoch": 0.004652944022852542, + "grad_norm": 1.776922583580017, + "learning_rate": 4.6526449968132566e-05, + "loss": 1.5954, + "step": 2190 + }, + { + "epoch": 0.004674190342591594, + "grad_norm": 1.3939787149429321, + "learning_rate": 4.673889951136605e-05, + "loss": 1.632, + "step": 2200 + }, + { + "epoch": 0.004695436662330647, + "grad_norm": 1.0919915437698364, + "learning_rate": 4.695134905459953e-05, + "loss": 1.6687, + "step": 2210 + }, + { + "epoch": 0.0047166829820697, + "grad_norm": 1.9136030673980713, + "learning_rate": 4.7163798597833013e-05, + "loss": 1.5969, + "step": 2220 + }, + { + "epoch": 0.004737929301808753, + "grad_norm": 1.491115927696228, + "learning_rate": 4.73762481410665e-05, + "loss": 1.5659, + "step": 2230 + }, + { + "epoch": 0.0047591756215478054, + "grad_norm": 1.3262012004852295, + "learning_rate": 4.758869768429998e-05, + "loss": 1.6468, + "step": 2240 + }, + { + "epoch": 0.0047804219412868575, + "grad_norm": 1.1200557947158813, + "learning_rate": 4.780114722753346e-05, + "loss": 1.5953, + "step": 2250 + }, + { + "epoch": 0.00480166826102591, + "grad_norm": 1.7130342721939087, + "learning_rate": 4.801359677076695e-05, + "loss": 1.5877, + "step": 2260 + }, + { + "epoch": 0.004822914580764963, + "grad_norm": 1.233087182044983, + "learning_rate": 4.822604631400043e-05, + "loss": 1.5746, + "step": 2270 + }, + { + "epoch": 0.004844160900504016, + "grad_norm": 1.2473499774932861, + "learning_rate": 4.8438495857233914e-05, + "loss": 1.6325, + "step": 2280 + }, + { + "epoch": 0.004865407220243068, + "grad_norm": 0.8840552568435669, + "learning_rate": 4.865094540046739e-05, + "loss": 1.6374, + "step": 2290 + }, + { + "epoch": 0.004886653539982121, + "grad_norm": 1.1558473110198975, + "learning_rate": 4.8863394943700874e-05, + "loss": 1.5882, + "step": 2300 + }, + { + "epoch": 0.004907899859721174, + "grad_norm": 1.1258968114852905, + "learning_rate": 4.907584448693436e-05, + "loss": 1.6372, + "step": 2310 + }, + { + "epoch": 0.004929146179460227, + "grad_norm": 1.4454100131988525, + "learning_rate": 4.9288294030167835e-05, + "loss": 1.6207, + "step": 2320 + }, + { + "epoch": 0.00495039249919928, + "grad_norm": 1.375669002532959, + "learning_rate": 4.950074357340132e-05, + "loss": 1.5838, + "step": 2330 + }, + { + "epoch": 0.004971638818938332, + "grad_norm": 1.3189280033111572, + "learning_rate": 4.97131931166348e-05, + "loss": 1.6131, + "step": 2340 + }, + { + "epoch": 0.0049928851386773845, + "grad_norm": 0.8942054510116577, + "learning_rate": 4.992564265986828e-05, + "loss": 1.6202, + "step": 2350 + }, + { + "epoch": 0.005014131458416437, + "grad_norm": 1.3580176830291748, + "learning_rate": 5.013809220310176e-05, + "loss": 1.5862, + "step": 2360 + }, + { + "epoch": 0.00503537777815549, + "grad_norm": 1.4574054479599, + "learning_rate": 5.035054174633524e-05, + "loss": 1.6138, + "step": 2370 + }, + { + "epoch": 0.005056624097894543, + "grad_norm": 0.941665768623352, + "learning_rate": 5.056299128956873e-05, + "loss": 1.5902, + "step": 2380 + }, + { + "epoch": 0.005077870417633595, + "grad_norm": 1.0588945150375366, + "learning_rate": 5.0775440832802216e-05, + "loss": 1.6348, + "step": 2390 + }, + { + "epoch": 0.005099116737372648, + "grad_norm": 0.9442176222801208, + "learning_rate": 5.0987890376035696e-05, + "loss": 1.5921, + "step": 2400 + }, + { + "epoch": 0.005120363057111701, + "grad_norm": 1.1033592224121094, + "learning_rate": 5.120033991926918e-05, + "loss": 1.6148, + "step": 2410 + }, + { + "epoch": 0.005141609376850754, + "grad_norm": 0.976649284362793, + "learning_rate": 5.1412789462502656e-05, + "loss": 1.5745, + "step": 2420 + }, + { + "epoch": 0.005162855696589807, + "grad_norm": 1.2703427076339722, + "learning_rate": 5.1625239005736136e-05, + "loss": 1.5957, + "step": 2430 + }, + { + "epoch": 0.005184102016328859, + "grad_norm": 1.254042625427246, + "learning_rate": 5.183768854896962e-05, + "loss": 1.5891, + "step": 2440 + }, + { + "epoch": 0.005205348336067912, + "grad_norm": 1.2543736696243286, + "learning_rate": 5.20501380922031e-05, + "loss": 1.6079, + "step": 2450 + }, + { + "epoch": 0.0052265946558069645, + "grad_norm": 1.0222806930541992, + "learning_rate": 5.226258763543659e-05, + "loss": 1.5765, + "step": 2460 + }, + { + "epoch": 0.005247840975546017, + "grad_norm": 0.865307092666626, + "learning_rate": 5.247503717867007e-05, + "loss": 1.5888, + "step": 2470 + }, + { + "epoch": 0.00526908729528507, + "grad_norm": 0.9452382922172546, + "learning_rate": 5.268748672190354e-05, + "loss": 1.5473, + "step": 2480 + }, + { + "epoch": 0.005290333615024122, + "grad_norm": 0.9051697850227356, + "learning_rate": 5.289993626513703e-05, + "loss": 1.56, + "step": 2490 + }, + { + "epoch": 0.005311579934763175, + "grad_norm": 1.1898553371429443, + "learning_rate": 5.311238580837051e-05, + "loss": 1.575, + "step": 2500 + }, + { + "epoch": 0.005332826254502228, + "grad_norm": 0.9314600229263306, + "learning_rate": 5.3324835351604e-05, + "loss": 1.5504, + "step": 2510 + }, + { + "epoch": 0.005354072574241281, + "grad_norm": 0.7449698448181152, + "learning_rate": 5.3537284894837484e-05, + "loss": 1.587, + "step": 2520 + }, + { + "epoch": 0.005375318893980333, + "grad_norm": 1.0603394508361816, + "learning_rate": 5.3749734438070964e-05, + "loss": 1.5802, + "step": 2530 + }, + { + "epoch": 0.005396565213719386, + "grad_norm": 0.9971975684165955, + "learning_rate": 5.396218398130445e-05, + "loss": 1.5626, + "step": 2540 + }, + { + "epoch": 0.005417811533458439, + "grad_norm": 0.8380981683731079, + "learning_rate": 5.4174633524537924e-05, + "loss": 1.5426, + "step": 2550 + }, + { + "epoch": 0.005439057853197492, + "grad_norm": 1.500501275062561, + "learning_rate": 5.4387083067771404e-05, + "loss": 1.5838, + "step": 2560 + }, + { + "epoch": 0.0054603041729365445, + "grad_norm": 1.0054744482040405, + "learning_rate": 5.459953261100489e-05, + "loss": 1.5523, + "step": 2570 + }, + { + "epoch": 0.0054815504926755965, + "grad_norm": 1.1710342168807983, + "learning_rate": 5.481198215423837e-05, + "loss": 1.545, + "step": 2580 + }, + { + "epoch": 0.005502796812414649, + "grad_norm": 1.027153491973877, + "learning_rate": 5.502443169747186e-05, + "loss": 1.6026, + "step": 2590 + }, + { + "epoch": 0.005524043132153702, + "grad_norm": 0.9386345148086548, + "learning_rate": 5.523688124070534e-05, + "loss": 1.5635, + "step": 2600 + }, + { + "epoch": 0.005545289451892755, + "grad_norm": 1.0839945077896118, + "learning_rate": 5.544933078393881e-05, + "loss": 1.551, + "step": 2610 + }, + { + "epoch": 0.005566535771631808, + "grad_norm": 0.8766878843307495, + "learning_rate": 5.56617803271723e-05, + "loss": 1.5394, + "step": 2620 + }, + { + "epoch": 0.00558778209137086, + "grad_norm": 1.3963909149169922, + "learning_rate": 5.587422987040578e-05, + "loss": 1.5491, + "step": 2630 + }, + { + "epoch": 0.005609028411109913, + "grad_norm": 0.7619982957839966, + "learning_rate": 5.6086679413639265e-05, + "loss": 1.5876, + "step": 2640 + }, + { + "epoch": 0.005630274730848966, + "grad_norm": 1.3876686096191406, + "learning_rate": 5.6299128956872745e-05, + "loss": 1.5828, + "step": 2650 + }, + { + "epoch": 0.005651521050588019, + "grad_norm": 1.052322506904602, + "learning_rate": 5.651157850010623e-05, + "loss": 1.5474, + "step": 2660 + }, + { + "epoch": 0.0056727673703270715, + "grad_norm": 1.020403504371643, + "learning_rate": 5.6724028043339705e-05, + "loss": 1.5347, + "step": 2670 + }, + { + "epoch": 0.0056940136900661235, + "grad_norm": 1.0478382110595703, + "learning_rate": 5.693647758657319e-05, + "loss": 1.5927, + "step": 2680 + }, + { + "epoch": 0.005715260009805176, + "grad_norm": 0.8784224987030029, + "learning_rate": 5.714892712980667e-05, + "loss": 1.5699, + "step": 2690 + }, + { + "epoch": 0.005736506329544229, + "grad_norm": 0.9683448076248169, + "learning_rate": 5.736137667304016e-05, + "loss": 1.5508, + "step": 2700 + }, + { + "epoch": 0.005757752649283282, + "grad_norm": 1.5860514640808105, + "learning_rate": 5.757382621627364e-05, + "loss": 1.5507, + "step": 2710 + }, + { + "epoch": 0.005778998969022335, + "grad_norm": 1.4566774368286133, + "learning_rate": 5.7786275759507126e-05, + "loss": 1.5636, + "step": 2720 + }, + { + "epoch": 0.005800245288761387, + "grad_norm": 1.217324137687683, + "learning_rate": 5.79987253027406e-05, + "loss": 1.5611, + "step": 2730 + }, + { + "epoch": 0.00582149160850044, + "grad_norm": 0.8988829851150513, + "learning_rate": 5.821117484597408e-05, + "loss": 1.563, + "step": 2740 + }, + { + "epoch": 0.005842737928239493, + "grad_norm": 0.9736409187316895, + "learning_rate": 5.8423624389207567e-05, + "loss": 1.5456, + "step": 2750 + }, + { + "epoch": 0.005863984247978546, + "grad_norm": 0.9623783230781555, + "learning_rate": 5.8636073932441047e-05, + "loss": 1.5857, + "step": 2760 + }, + { + "epoch": 0.005885230567717598, + "grad_norm": 1.1413803100585938, + "learning_rate": 5.8848523475674533e-05, + "loss": 1.5985, + "step": 2770 + }, + { + "epoch": 0.005906476887456651, + "grad_norm": 0.9451686143875122, + "learning_rate": 5.9060973018908014e-05, + "loss": 1.553, + "step": 2780 + }, + { + "epoch": 0.0059277232071957035, + "grad_norm": 0.9815428853034973, + "learning_rate": 5.927342256214149e-05, + "loss": 1.5533, + "step": 2790 + }, + { + "epoch": 0.005948969526934756, + "grad_norm": 1.162389874458313, + "learning_rate": 5.9485872105374974e-05, + "loss": 1.5371, + "step": 2800 + }, + { + "epoch": 0.005970215846673809, + "grad_norm": 1.269148588180542, + "learning_rate": 5.969832164860846e-05, + "loss": 1.5674, + "step": 2810 + }, + { + "epoch": 0.005991462166412861, + "grad_norm": 0.770132303237915, + "learning_rate": 5.991077119184194e-05, + "loss": 1.5112, + "step": 2820 + }, + { + "epoch": 0.006012708486151914, + "grad_norm": 0.8054612278938293, + "learning_rate": 6.012322073507543e-05, + "loss": 1.5401, + "step": 2830 + }, + { + "epoch": 0.006033954805890967, + "grad_norm": 1.3314446210861206, + "learning_rate": 6.033567027830891e-05, + "loss": 1.5526, + "step": 2840 + }, + { + "epoch": 0.00605520112563002, + "grad_norm": 1.1334437131881714, + "learning_rate": 6.054811982154238e-05, + "loss": 1.5716, + "step": 2850 + }, + { + "epoch": 0.006076447445369073, + "grad_norm": 1.0587393045425415, + "learning_rate": 6.076056936477587e-05, + "loss": 1.5436, + "step": 2860 + }, + { + "epoch": 0.006097693765108125, + "grad_norm": 0.8275590538978577, + "learning_rate": 6.097301890800935e-05, + "loss": 1.5741, + "step": 2870 + }, + { + "epoch": 0.006118940084847178, + "grad_norm": 0.8488883376121521, + "learning_rate": 6.118546845124283e-05, + "loss": 1.5283, + "step": 2880 + }, + { + "epoch": 0.006140186404586231, + "grad_norm": 1.0172247886657715, + "learning_rate": 6.139791799447631e-05, + "loss": 1.5484, + "step": 2890 + }, + { + "epoch": 0.0061614327243252835, + "grad_norm": 0.969590961933136, + "learning_rate": 6.16103675377098e-05, + "loss": 1.5134, + "step": 2900 + }, + { + "epoch": 0.006182679044064336, + "grad_norm": 1.1794122457504272, + "learning_rate": 6.182281708094328e-05, + "loss": 1.5429, + "step": 2910 + }, + { + "epoch": 0.006203925363803388, + "grad_norm": 0.7906720042228699, + "learning_rate": 6.203526662417676e-05, + "loss": 1.5102, + "step": 2920 + }, + { + "epoch": 0.006225171683542441, + "grad_norm": 1.0701607465744019, + "learning_rate": 6.224771616741024e-05, + "loss": 1.5847, + "step": 2930 + }, + { + "epoch": 0.006246418003281494, + "grad_norm": 0.8435912728309631, + "learning_rate": 6.246016571064373e-05, + "loss": 1.5424, + "step": 2940 + }, + { + "epoch": 0.006267664323020547, + "grad_norm": 0.9610419273376465, + "learning_rate": 6.267261525387721e-05, + "loss": 1.5309, + "step": 2950 + }, + { + "epoch": 0.0062889106427596, + "grad_norm": 0.8362812399864197, + "learning_rate": 6.288506479711069e-05, + "loss": 1.5303, + "step": 2960 + }, + { + "epoch": 0.006310156962498652, + "grad_norm": 0.810815155506134, + "learning_rate": 6.309751434034417e-05, + "loss": 1.566, + "step": 2970 + }, + { + "epoch": 0.006331403282237705, + "grad_norm": 1.1078112125396729, + "learning_rate": 6.330996388357765e-05, + "loss": 1.6013, + "step": 2980 + }, + { + "epoch": 0.006352649601976758, + "grad_norm": 1.150909423828125, + "learning_rate": 6.352241342681113e-05, + "loss": 1.5109, + "step": 2990 + }, + { + "epoch": 0.0063738959217158105, + "grad_norm": 1.0444499254226685, + "learning_rate": 6.373486297004462e-05, + "loss": 1.5428, + "step": 3000 + }, + { + "epoch": 0.006395142241454863, + "grad_norm": 1.0519038438796997, + "learning_rate": 6.39473125132781e-05, + "loss": 1.5437, + "step": 3010 + }, + { + "epoch": 0.006416388561193915, + "grad_norm": 0.8512790203094482, + "learning_rate": 6.415976205651158e-05, + "loss": 1.5294, + "step": 3020 + }, + { + "epoch": 0.006437634880932968, + "grad_norm": 1.004648208618164, + "learning_rate": 6.437221159974506e-05, + "loss": 1.5419, + "step": 3030 + }, + { + "epoch": 0.006458881200672021, + "grad_norm": 0.7901549935340881, + "learning_rate": 6.458466114297854e-05, + "loss": 1.505, + "step": 3040 + }, + { + "epoch": 0.006480127520411074, + "grad_norm": 1.0302339792251587, + "learning_rate": 6.479711068621202e-05, + "loss": 1.5545, + "step": 3050 + }, + { + "epoch": 0.006501373840150126, + "grad_norm": 1.0161783695220947, + "learning_rate": 6.50095602294455e-05, + "loss": 1.5152, + "step": 3060 + }, + { + "epoch": 0.006522620159889179, + "grad_norm": 0.8339026570320129, + "learning_rate": 6.5222009772679e-05, + "loss": 1.5437, + "step": 3070 + }, + { + "epoch": 0.006543866479628232, + "grad_norm": 0.7774878144264221, + "learning_rate": 6.543445931591248e-05, + "loss": 1.5345, + "step": 3080 + }, + { + "epoch": 0.006565112799367285, + "grad_norm": 0.7507886290550232, + "learning_rate": 6.564690885914596e-05, + "loss": 1.5172, + "step": 3090 + }, + { + "epoch": 0.006586359119106338, + "grad_norm": 0.7967400550842285, + "learning_rate": 6.585935840237944e-05, + "loss": 1.4705, + "step": 3100 + }, + { + "epoch": 0.00660760543884539, + "grad_norm": 1.2758278846740723, + "learning_rate": 6.607180794561292e-05, + "loss": 1.5513, + "step": 3110 + }, + { + "epoch": 0.0066288517585844425, + "grad_norm": 1.880794644355774, + "learning_rate": 6.62842574888464e-05, + "loss": 1.5072, + "step": 3120 + }, + { + "epoch": 0.006650098078323495, + "grad_norm": 1.088399052619934, + "learning_rate": 6.649670703207989e-05, + "loss": 1.5148, + "step": 3130 + }, + { + "epoch": 0.006671344398062548, + "grad_norm": 0.8695627450942993, + "learning_rate": 6.670915657531337e-05, + "loss": 1.4881, + "step": 3140 + }, + { + "epoch": 0.006692590717801601, + "grad_norm": 0.7961494326591492, + "learning_rate": 6.692160611854685e-05, + "loss": 1.5239, + "step": 3150 + }, + { + "epoch": 0.006713837037540653, + "grad_norm": 0.7324531078338623, + "learning_rate": 6.713405566178033e-05, + "loss": 1.4951, + "step": 3160 + }, + { + "epoch": 0.006735083357279706, + "grad_norm": 0.9449358582496643, + "learning_rate": 6.734650520501381e-05, + "loss": 1.5085, + "step": 3170 + }, + { + "epoch": 0.006756329677018759, + "grad_norm": 1.3087553977966309, + "learning_rate": 6.755895474824729e-05, + "loss": 1.4955, + "step": 3180 + }, + { + "epoch": 0.006777575996757812, + "grad_norm": 1.2013590335845947, + "learning_rate": 6.777140429148077e-05, + "loss": 1.5341, + "step": 3190 + }, + { + "epoch": 0.006798822316496865, + "grad_norm": 1.1037042140960693, + "learning_rate": 6.798385383471427e-05, + "loss": 1.5045, + "step": 3200 + }, + { + "epoch": 0.006820068636235917, + "grad_norm": 1.107533574104309, + "learning_rate": 6.819630337794775e-05, + "loss": 1.5238, + "step": 3210 + }, + { + "epoch": 0.00684131495597497, + "grad_norm": 0.7903012633323669, + "learning_rate": 6.840875292118121e-05, + "loss": 1.5203, + "step": 3220 + }, + { + "epoch": 0.0068625612757140225, + "grad_norm": 1.1096696853637695, + "learning_rate": 6.86212024644147e-05, + "loss": 1.5217, + "step": 3230 + }, + { + "epoch": 0.006883807595453075, + "grad_norm": 1.0476487874984741, + "learning_rate": 6.883365200764819e-05, + "loss": 1.5582, + "step": 3240 + }, + { + "epoch": 0.006905053915192128, + "grad_norm": 0.8859138488769531, + "learning_rate": 6.904610155088167e-05, + "loss": 1.5158, + "step": 3250 + }, + { + "epoch": 0.00692630023493118, + "grad_norm": 1.0247817039489746, + "learning_rate": 6.925855109411516e-05, + "loss": 1.5436, + "step": 3260 + }, + { + "epoch": 0.006947546554670233, + "grad_norm": 0.8065521717071533, + "learning_rate": 6.947100063734864e-05, + "loss": 1.5227, + "step": 3270 + }, + { + "epoch": 0.006968792874409286, + "grad_norm": 0.8112274408340454, + "learning_rate": 6.96834501805821e-05, + "loss": 1.5624, + "step": 3280 + }, + { + "epoch": 0.006990039194148339, + "grad_norm": 0.8062530159950256, + "learning_rate": 6.98958997238156e-05, + "loss": 1.4827, + "step": 3290 + }, + { + "epoch": 0.007011285513887391, + "grad_norm": 0.9799218773841858, + "learning_rate": 7.010834926704908e-05, + "loss": 1.502, + "step": 3300 + }, + { + "epoch": 0.007032531833626444, + "grad_norm": 1.3122270107269287, + "learning_rate": 7.032079881028256e-05, + "loss": 1.5064, + "step": 3310 + }, + { + "epoch": 0.007053778153365497, + "grad_norm": 0.6945155262947083, + "learning_rate": 7.053324835351604e-05, + "loss": 1.5401, + "step": 3320 + }, + { + "epoch": 0.0070750244731045495, + "grad_norm": 0.9642041921615601, + "learning_rate": 7.074569789674953e-05, + "loss": 1.528, + "step": 3330 + }, + { + "epoch": 0.007096270792843602, + "grad_norm": 0.9301373362541199, + "learning_rate": 7.0958147439983e-05, + "loss": 1.4978, + "step": 3340 + }, + { + "epoch": 0.007117517112582654, + "grad_norm": 0.930833101272583, + "learning_rate": 7.117059698321648e-05, + "loss": 1.4901, + "step": 3350 + }, + { + "epoch": 0.007138763432321707, + "grad_norm": 0.968771755695343, + "learning_rate": 7.138304652644997e-05, + "loss": 1.4865, + "step": 3360 + }, + { + "epoch": 0.00716000975206076, + "grad_norm": 0.677682101726532, + "learning_rate": 7.159549606968345e-05, + "loss": 1.5176, + "step": 3370 + }, + { + "epoch": 0.007181256071799813, + "grad_norm": 1.1080256700515747, + "learning_rate": 7.180794561291693e-05, + "loss": 1.4682, + "step": 3380 + }, + { + "epoch": 0.007202502391538866, + "grad_norm": 0.6101229190826416, + "learning_rate": 7.202039515615043e-05, + "loss": 1.4798, + "step": 3390 + }, + { + "epoch": 0.007223748711277918, + "grad_norm": 0.8912479877471924, + "learning_rate": 7.22328446993839e-05, + "loss": 1.4816, + "step": 3400 + }, + { + "epoch": 0.007244995031016971, + "grad_norm": 0.7409901022911072, + "learning_rate": 7.244529424261737e-05, + "loss": 1.5381, + "step": 3410 + }, + { + "epoch": 0.007266241350756024, + "grad_norm": 0.7516536116600037, + "learning_rate": 7.265774378585087e-05, + "loss": 1.5102, + "step": 3420 + }, + { + "epoch": 0.007287487670495077, + "grad_norm": 1.5887451171875, + "learning_rate": 7.287019332908435e-05, + "loss": 1.5234, + "step": 3430 + }, + { + "epoch": 0.0073087339902341295, + "grad_norm": 0.8808043003082275, + "learning_rate": 7.308264287231783e-05, + "loss": 1.5164, + "step": 3440 + }, + { + "epoch": 0.0073299803099731815, + "grad_norm": 0.7470552921295166, + "learning_rate": 7.329509241555131e-05, + "loss": 1.5093, + "step": 3450 + }, + { + "epoch": 0.007351226629712234, + "grad_norm": 1.184937834739685, + "learning_rate": 7.350754195878479e-05, + "loss": 1.5228, + "step": 3460 + }, + { + "epoch": 0.007372472949451287, + "grad_norm": 0.7568943500518799, + "learning_rate": 7.371999150201827e-05, + "loss": 1.5025, + "step": 3470 + }, + { + "epoch": 0.00739371926919034, + "grad_norm": 0.857840895652771, + "learning_rate": 7.393244104525175e-05, + "loss": 1.4986, + "step": 3480 + }, + { + "epoch": 0.007414965588929393, + "grad_norm": 0.9660385847091675, + "learning_rate": 7.414489058848524e-05, + "loss": 1.5271, + "step": 3490 + }, + { + "epoch": 0.007436211908668445, + "grad_norm": 0.8806533217430115, + "learning_rate": 7.435734013171872e-05, + "loss": 1.5371, + "step": 3500 + }, + { + "epoch": 0.007457458228407498, + "grad_norm": 1.576085090637207, + "learning_rate": 7.45697896749522e-05, + "loss": 1.4638, + "step": 3510 + }, + { + "epoch": 0.007478704548146551, + "grad_norm": 0.8821079730987549, + "learning_rate": 7.478223921818568e-05, + "loss": 1.4852, + "step": 3520 + }, + { + "epoch": 0.007499950867885604, + "grad_norm": 0.895588219165802, + "learning_rate": 7.499468876141916e-05, + "loss": 1.5211, + "step": 3530 + }, + { + "epoch": 0.0075211971876246566, + "grad_norm": 0.7447425127029419, + "learning_rate": 7.520713830465264e-05, + "loss": 1.5032, + "step": 3540 + }, + { + "epoch": 0.007542443507363709, + "grad_norm": 1.3681302070617676, + "learning_rate": 7.541958784788614e-05, + "loss": 1.5191, + "step": 3550 + }, + { + "epoch": 0.0075636898271027615, + "grad_norm": 0.7409792542457581, + "learning_rate": 7.563203739111962e-05, + "loss": 1.5213, + "step": 3560 + }, + { + "epoch": 0.007584936146841814, + "grad_norm": 0.7510338425636292, + "learning_rate": 7.58444869343531e-05, + "loss": 1.4999, + "step": 3570 + }, + { + "epoch": 0.007606182466580867, + "grad_norm": 0.8400350213050842, + "learning_rate": 7.605693647758658e-05, + "loss": 1.5179, + "step": 3580 + }, + { + "epoch": 0.007627428786319919, + "grad_norm": 0.7481834292411804, + "learning_rate": 7.626938602082006e-05, + "loss": 1.517, + "step": 3590 + }, + { + "epoch": 0.007648675106058972, + "grad_norm": 0.8494490385055542, + "learning_rate": 7.648183556405354e-05, + "loss": 1.4683, + "step": 3600 + }, + { + "epoch": 0.007669921425798025, + "grad_norm": 0.8673060536384583, + "learning_rate": 7.669428510728702e-05, + "loss": 1.4699, + "step": 3610 + }, + { + "epoch": 0.007691167745537078, + "grad_norm": 1.0948890447616577, + "learning_rate": 7.690673465052051e-05, + "loss": 1.4658, + "step": 3620 + }, + { + "epoch": 0.007712414065276131, + "grad_norm": 0.7623769640922546, + "learning_rate": 7.711918419375399e-05, + "loss": 1.5183, + "step": 3630 + }, + { + "epoch": 0.007733660385015183, + "grad_norm": 1.101075530052185, + "learning_rate": 7.733163373698747e-05, + "loss": 1.4817, + "step": 3640 + }, + { + "epoch": 0.007754906704754236, + "grad_norm": 0.7084292769432068, + "learning_rate": 7.754408328022095e-05, + "loss": 1.4797, + "step": 3650 + }, + { + "epoch": 0.0077761530244932885, + "grad_norm": 0.7245771288871765, + "learning_rate": 7.775653282345443e-05, + "loss": 1.4797, + "step": 3660 + }, + { + "epoch": 0.007797399344232341, + "grad_norm": 1.9507850408554077, + "learning_rate": 7.796898236668791e-05, + "loss": 1.481, + "step": 3670 + }, + { + "epoch": 0.007818645663971394, + "grad_norm": 1.0518580675125122, + "learning_rate": 7.81814319099214e-05, + "loss": 1.4968, + "step": 3680 + }, + { + "epoch": 0.007839891983710446, + "grad_norm": 0.7658743858337402, + "learning_rate": 7.839388145315488e-05, + "loss": 1.4967, + "step": 3690 + }, + { + "epoch": 0.0078611383034495, + "grad_norm": 0.8128832578659058, + "learning_rate": 7.860633099638836e-05, + "loss": 1.4967, + "step": 3700 + }, + { + "epoch": 0.007882384623188552, + "grad_norm": 0.9265111088752747, + "learning_rate": 7.881878053962184e-05, + "loss": 1.4737, + "step": 3710 + }, + { + "epoch": 0.007903630942927604, + "grad_norm": 0.8029168248176575, + "learning_rate": 7.903123008285532e-05, + "loss": 1.4802, + "step": 3720 + }, + { + "epoch": 0.007924877262666658, + "grad_norm": 0.8350634574890137, + "learning_rate": 7.92436796260888e-05, + "loss": 1.4855, + "step": 3730 + }, + { + "epoch": 0.00794612358240571, + "grad_norm": 0.8015345335006714, + "learning_rate": 7.945612916932229e-05, + "loss": 1.4626, + "step": 3740 + }, + { + "epoch": 0.007967369902144764, + "grad_norm": 0.897199273109436, + "learning_rate": 7.966857871255578e-05, + "loss": 1.4756, + "step": 3750 + }, + { + "epoch": 0.007988616221883816, + "grad_norm": 0.930965781211853, + "learning_rate": 7.988102825578926e-05, + "loss": 1.4515, + "step": 3760 + }, + { + "epoch": 0.008009862541622868, + "grad_norm": 0.7563024759292603, + "learning_rate": 8.009347779902273e-05, + "loss": 1.4622, + "step": 3770 + }, + { + "epoch": 0.008031108861361921, + "grad_norm": 0.8976707458496094, + "learning_rate": 8.030592734225622e-05, + "loss": 1.4945, + "step": 3780 + }, + { + "epoch": 0.008052355181100973, + "grad_norm": 0.9463152289390564, + "learning_rate": 8.05183768854897e-05, + "loss": 1.4675, + "step": 3790 + }, + { + "epoch": 0.008073601500840027, + "grad_norm": 1.0075587034225464, + "learning_rate": 8.073082642872318e-05, + "loss": 1.4761, + "step": 3800 + }, + { + "epoch": 0.00809484782057908, + "grad_norm": 0.8393225073814392, + "learning_rate": 8.094327597195667e-05, + "loss": 1.4729, + "step": 3810 + }, + { + "epoch": 0.008116094140318131, + "grad_norm": 0.7345854043960571, + "learning_rate": 8.115572551519015e-05, + "loss": 1.4527, + "step": 3820 + }, + { + "epoch": 0.008137340460057185, + "grad_norm": 0.9295156598091125, + "learning_rate": 8.136817505842362e-05, + "loss": 1.4728, + "step": 3830 + }, + { + "epoch": 0.008158586779796237, + "grad_norm": 1.3265165090560913, + "learning_rate": 8.158062460165711e-05, + "loss": 1.4975, + "step": 3840 + }, + { + "epoch": 0.00817983309953529, + "grad_norm": 0.7967504858970642, + "learning_rate": 8.179307414489059e-05, + "loss": 1.4885, + "step": 3850 + }, + { + "epoch": 0.008201079419274343, + "grad_norm": 0.8234679102897644, + "learning_rate": 8.200552368812407e-05, + "loss": 1.4743, + "step": 3860 + }, + { + "epoch": 0.008222325739013395, + "grad_norm": 0.7211614847183228, + "learning_rate": 8.221797323135755e-05, + "loss": 1.4448, + "step": 3870 + }, + { + "epoch": 0.008243572058752448, + "grad_norm": 1.0717051029205322, + "learning_rate": 8.243042277459105e-05, + "loss": 1.4877, + "step": 3880 + }, + { + "epoch": 0.0082648183784915, + "grad_norm": 0.7813867330551147, + "learning_rate": 8.264287231782451e-05, + "loss": 1.4721, + "step": 3890 + }, + { + "epoch": 0.008286064698230554, + "grad_norm": 0.9514865279197693, + "learning_rate": 8.2855321861058e-05, + "loss": 1.4859, + "step": 3900 + }, + { + "epoch": 0.008307311017969606, + "grad_norm": 0.8033127188682556, + "learning_rate": 8.306777140429149e-05, + "loss": 1.5115, + "step": 3910 + }, + { + "epoch": 0.008328557337708658, + "grad_norm": 0.8999913930892944, + "learning_rate": 8.328022094752497e-05, + "loss": 1.4459, + "step": 3920 + }, + { + "epoch": 0.008349803657447712, + "grad_norm": 0.7814891934394836, + "learning_rate": 8.349267049075845e-05, + "loss": 1.5093, + "step": 3930 + }, + { + "epoch": 0.008371049977186764, + "grad_norm": 0.6487235426902771, + "learning_rate": 8.370512003399194e-05, + "loss": 1.4747, + "step": 3940 + }, + { + "epoch": 0.008392296296925818, + "grad_norm": 0.8092133402824402, + "learning_rate": 8.391756957722541e-05, + "loss": 1.486, + "step": 3950 + }, + { + "epoch": 0.00841354261666487, + "grad_norm": 0.9346337914466858, + "learning_rate": 8.413001912045889e-05, + "loss": 1.5125, + "step": 3960 + }, + { + "epoch": 0.008434788936403922, + "grad_norm": 0.7005777955055237, + "learning_rate": 8.434246866369238e-05, + "loss": 1.4421, + "step": 3970 + }, + { + "epoch": 0.008456035256142976, + "grad_norm": 1.0044195652008057, + "learning_rate": 8.455491820692586e-05, + "loss": 1.5036, + "step": 3980 + }, + { + "epoch": 0.008477281575882028, + "grad_norm": 0.7909959554672241, + "learning_rate": 8.476736775015934e-05, + "loss": 1.4804, + "step": 3990 + }, + { + "epoch": 0.00849852789562108, + "grad_norm": 0.8148873448371887, + "learning_rate": 8.497981729339282e-05, + "loss": 1.5334, + "step": 4000 + }, + { + "epoch": 0.008519774215360133, + "grad_norm": 0.9284026622772217, + "learning_rate": 8.51922668366263e-05, + "loss": 1.4666, + "step": 4010 + }, + { + "epoch": 0.008541020535099185, + "grad_norm": 0.763813853263855, + "learning_rate": 8.540471637985978e-05, + "loss": 1.4613, + "step": 4020 + }, + { + "epoch": 0.008562266854838239, + "grad_norm": 1.0137665271759033, + "learning_rate": 8.561716592309326e-05, + "loss": 1.4205, + "step": 4030 + }, + { + "epoch": 0.008583513174577291, + "grad_norm": 0.8735483884811401, + "learning_rate": 8.582961546632676e-05, + "loss": 1.4959, + "step": 4040 + }, + { + "epoch": 0.008604759494316343, + "grad_norm": 0.7434191703796387, + "learning_rate": 8.604206500956024e-05, + "loss": 1.4506, + "step": 4050 + }, + { + "epoch": 0.008626005814055397, + "grad_norm": 0.6415987014770508, + "learning_rate": 8.625451455279372e-05, + "loss": 1.5042, + "step": 4060 + }, + { + "epoch": 0.008647252133794449, + "grad_norm": 1.1595107316970825, + "learning_rate": 8.646696409602721e-05, + "loss": 1.4617, + "step": 4070 + }, + { + "epoch": 0.008668498453533503, + "grad_norm": 0.9103615880012512, + "learning_rate": 8.667941363926068e-05, + "loss": 1.4295, + "step": 4080 + }, + { + "epoch": 0.008689744773272555, + "grad_norm": 0.7023127675056458, + "learning_rate": 8.689186318249416e-05, + "loss": 1.4173, + "step": 4090 + }, + { + "epoch": 0.008710991093011607, + "grad_norm": 1.0021463632583618, + "learning_rate": 8.710431272572765e-05, + "loss": 1.4117, + "step": 4100 + }, + { + "epoch": 0.00873223741275066, + "grad_norm": 0.7121887803077698, + "learning_rate": 8.731676226896113e-05, + "loss": 1.4216, + "step": 4110 + }, + { + "epoch": 0.008753483732489712, + "grad_norm": 1.1200122833251953, + "learning_rate": 8.752921181219461e-05, + "loss": 1.4323, + "step": 4120 + }, + { + "epoch": 0.008774730052228766, + "grad_norm": 0.6485759615898132, + "learning_rate": 8.774166135542809e-05, + "loss": 1.4484, + "step": 4130 + }, + { + "epoch": 0.008795976371967818, + "grad_norm": 1.1413930654525757, + "learning_rate": 8.795411089866157e-05, + "loss": 1.4537, + "step": 4140 + }, + { + "epoch": 0.00881722269170687, + "grad_norm": 0.8164786696434021, + "learning_rate": 8.816656044189505e-05, + "loss": 1.4459, + "step": 4150 + }, + { + "epoch": 0.008838469011445924, + "grad_norm": 0.8486210703849792, + "learning_rate": 8.837900998512853e-05, + "loss": 1.4322, + "step": 4160 + }, + { + "epoch": 0.008859715331184976, + "grad_norm": 0.8040435314178467, + "learning_rate": 8.859145952836202e-05, + "loss": 1.4902, + "step": 4170 + }, + { + "epoch": 0.00888096165092403, + "grad_norm": 0.7916246056556702, + "learning_rate": 8.88039090715955e-05, + "loss": 1.4781, + "step": 4180 + }, + { + "epoch": 0.008902207970663082, + "grad_norm": 0.7665431499481201, + "learning_rate": 8.901635861482898e-05, + "loss": 1.4593, + "step": 4190 + }, + { + "epoch": 0.008923454290402134, + "grad_norm": 0.8341965675354004, + "learning_rate": 8.922880815806246e-05, + "loss": 1.4763, + "step": 4200 + }, + { + "epoch": 0.008944700610141187, + "grad_norm": 0.9005393385887146, + "learning_rate": 8.944125770129594e-05, + "loss": 1.4975, + "step": 4210 + }, + { + "epoch": 0.00896594692988024, + "grad_norm": 0.8118691444396973, + "learning_rate": 8.965370724452942e-05, + "loss": 1.4238, + "step": 4220 + }, + { + "epoch": 0.008987193249619293, + "grad_norm": 0.6407181620597839, + "learning_rate": 8.986615678776292e-05, + "loss": 1.4693, + "step": 4230 + }, + { + "epoch": 0.009008439569358345, + "grad_norm": 0.7410246133804321, + "learning_rate": 9.00786063309964e-05, + "loss": 1.4336, + "step": 4240 + }, + { + "epoch": 0.009029685889097397, + "grad_norm": 0.7400311231613159, + "learning_rate": 9.029105587422988e-05, + "loss": 1.4567, + "step": 4250 + }, + { + "epoch": 0.009050932208836451, + "grad_norm": 0.7124418616294861, + "learning_rate": 9.050350541746336e-05, + "loss": 1.4545, + "step": 4260 + }, + { + "epoch": 0.009072178528575503, + "grad_norm": 0.8414973616600037, + "learning_rate": 9.071595496069684e-05, + "loss": 1.4472, + "step": 4270 + }, + { + "epoch": 0.009093424848314557, + "grad_norm": 1.0789527893066406, + "learning_rate": 9.092840450393032e-05, + "loss": 1.4699, + "step": 4280 + }, + { + "epoch": 0.009114671168053609, + "grad_norm": 0.7048367261886597, + "learning_rate": 9.11408540471638e-05, + "loss": 1.4225, + "step": 4290 + }, + { + "epoch": 0.00913591748779266, + "grad_norm": 34.650386810302734, + "learning_rate": 9.135330359039729e-05, + "loss": 1.5019, + "step": 4300 + }, + { + "epoch": 0.009157163807531715, + "grad_norm": 0.626020073890686, + "learning_rate": 9.156575313363077e-05, + "loss": 1.4165, + "step": 4310 + }, + { + "epoch": 0.009178410127270767, + "grad_norm": 0.6413044929504395, + "learning_rate": 9.177820267686424e-05, + "loss": 1.4407, + "step": 4320 + }, + { + "epoch": 0.00919965644700982, + "grad_norm": 0.650858998298645, + "learning_rate": 9.199065222009773e-05, + "loss": 1.4415, + "step": 4330 + }, + { + "epoch": 0.009220902766748872, + "grad_norm": 0.7195905447006226, + "learning_rate": 9.220310176333121e-05, + "loss": 1.4431, + "step": 4340 + }, + { + "epoch": 0.009242149086487924, + "grad_norm": 1185.1309814453125, + "learning_rate": 9.241555130656469e-05, + "loss": 1.6068, + "step": 4350 + }, + { + "epoch": 0.009263395406226978, + "grad_norm": 1.443697452545166, + "learning_rate": 9.262800084979819e-05, + "loss": 3.8546, + "step": 4360 + }, + { + "epoch": 0.00928464172596603, + "grad_norm": 1.2622209787368774, + "learning_rate": 9.284045039303167e-05, + "loss": 1.4972, + "step": 4370 + }, + { + "epoch": 0.009305888045705084, + "grad_norm": 0.7756127715110779, + "learning_rate": 9.305289993626513e-05, + "loss": 1.4692, + "step": 4380 + }, + { + "epoch": 0.009327134365444136, + "grad_norm": 0.7200732231140137, + "learning_rate": 9.326534947949863e-05, + "loss": 1.4517, + "step": 4390 + }, + { + "epoch": 0.009348380685183188, + "grad_norm": 0.7031376361846924, + "learning_rate": 9.34777990227321e-05, + "loss": 1.4659, + "step": 4400 + }, + { + "epoch": 0.009369627004922242, + "grad_norm": 0.8936090469360352, + "learning_rate": 9.369024856596559e-05, + "loss": 1.4794, + "step": 4410 + }, + { + "epoch": 0.009390873324661294, + "grad_norm": 0.6548094749450684, + "learning_rate": 9.390269810919907e-05, + "loss": 1.5054, + "step": 4420 + }, + { + "epoch": 0.009412119644400347, + "grad_norm": 0.7224562168121338, + "learning_rate": 9.411514765243256e-05, + "loss": 1.4433, + "step": 4430 + }, + { + "epoch": 0.0094333659641394, + "grad_norm": 0.690342366695404, + "learning_rate": 9.432759719566603e-05, + "loss": 1.4455, + "step": 4440 + }, + { + "epoch": 0.009454612283878451, + "grad_norm": 0.7699481844902039, + "learning_rate": 9.454004673889951e-05, + "loss": 1.4463, + "step": 4450 + }, + { + "epoch": 0.009475858603617505, + "grad_norm": 0.7414612770080566, + "learning_rate": 9.4752496282133e-05, + "loss": 1.4422, + "step": 4460 + }, + { + "epoch": 0.009497104923356557, + "grad_norm": 0.6941277980804443, + "learning_rate": 9.496494582536648e-05, + "loss": 1.4354, + "step": 4470 + }, + { + "epoch": 0.009518351243095611, + "grad_norm": 0.8244491219520569, + "learning_rate": 9.517739536859996e-05, + "loss": 1.4533, + "step": 4480 + }, + { + "epoch": 0.009539597562834663, + "grad_norm": 0.6897347569465637, + "learning_rate": 9.538984491183345e-05, + "loss": 1.4477, + "step": 4490 + }, + { + "epoch": 0.009560843882573715, + "grad_norm": 1.2066535949707031, + "learning_rate": 9.560229445506692e-05, + "loss": 1.485, + "step": 4500 + }, + { + "epoch": 0.009582090202312769, + "grad_norm": 1.0045440196990967, + "learning_rate": 9.58147439983004e-05, + "loss": 1.4498, + "step": 4510 + }, + { + "epoch": 0.00960333652205182, + "grad_norm": 0.7952929735183716, + "learning_rate": 9.60271935415339e-05, + "loss": 1.4325, + "step": 4520 + }, + { + "epoch": 0.009624582841790873, + "grad_norm": 0.6014060378074646, + "learning_rate": 9.623964308476737e-05, + "loss": 1.4348, + "step": 4530 + }, + { + "epoch": 0.009645829161529926, + "grad_norm": 1.091628074645996, + "learning_rate": 9.645209262800085e-05, + "loss": 1.4613, + "step": 4540 + }, + { + "epoch": 0.009667075481268978, + "grad_norm": 0.6455795764923096, + "learning_rate": 9.666454217123433e-05, + "loss": 1.4374, + "step": 4550 + }, + { + "epoch": 0.009688321801008032, + "grad_norm": 1.2629566192626953, + "learning_rate": 9.687699171446783e-05, + "loss": 1.4382, + "step": 4560 + }, + { + "epoch": 0.009709568120747084, + "grad_norm": 0.9129518866539001, + "learning_rate": 9.70894412577013e-05, + "loss": 1.4292, + "step": 4570 + }, + { + "epoch": 0.009730814440486136, + "grad_norm": 0.5996886491775513, + "learning_rate": 9.730189080093478e-05, + "loss": 1.4572, + "step": 4580 + }, + { + "epoch": 0.00975206076022519, + "grad_norm": 0.6286561489105225, + "learning_rate": 9.751434034416827e-05, + "loss": 1.4737, + "step": 4590 + }, + { + "epoch": 0.009773307079964242, + "grad_norm": 0.8071895241737366, + "learning_rate": 9.772678988740175e-05, + "loss": 1.5068, + "step": 4600 + }, + { + "epoch": 0.009794553399703296, + "grad_norm": 0.9910821914672852, + "learning_rate": 9.793923943063523e-05, + "loss": 1.455, + "step": 4610 + }, + { + "epoch": 0.009815799719442348, + "grad_norm": 0.8075299263000488, + "learning_rate": 9.815168897386872e-05, + "loss": 1.4582, + "step": 4620 + }, + { + "epoch": 0.0098370460391814, + "grad_norm": 0.7408568859100342, + "learning_rate": 9.836413851710219e-05, + "loss": 1.3991, + "step": 4630 + }, + { + "epoch": 0.009858292358920454, + "grad_norm": 1.0674296617507935, + "learning_rate": 9.857658806033567e-05, + "loss": 1.4136, + "step": 4640 + }, + { + "epoch": 0.009879538678659506, + "grad_norm": 0.6583450436592102, + "learning_rate": 9.878903760356916e-05, + "loss": 1.4266, + "step": 4650 + }, + { + "epoch": 0.00990078499839856, + "grad_norm": 0.6935600638389587, + "learning_rate": 9.900148714680264e-05, + "loss": 1.447, + "step": 4660 + }, + { + "epoch": 0.009922031318137611, + "grad_norm": 0.6474509239196777, + "learning_rate": 9.921393669003612e-05, + "loss": 1.4001, + "step": 4670 + }, + { + "epoch": 0.009943277637876663, + "grad_norm": 0.6144103407859802, + "learning_rate": 9.94263862332696e-05, + "loss": 1.3991, + "step": 4680 + }, + { + "epoch": 0.009964523957615717, + "grad_norm": 1.0067622661590576, + "learning_rate": 9.963883577650308e-05, + "loss": 1.4288, + "step": 4690 + }, + { + "epoch": 0.009985770277354769, + "grad_norm": 0.7757844924926758, + "learning_rate": 9.985128531973656e-05, + "loss": 1.4729, + "step": 4700 + }, + { + "epoch": 0.010007016597093823, + "grad_norm": 0.6036081314086914, + "learning_rate": 0.00010006373486297004, + "loss": 1.4464, + "step": 4710 + }, + { + "epoch": 0.010028262916832875, + "grad_norm": 0.5930192470550537, + "learning_rate": 0.00010027618440620352, + "loss": 1.4216, + "step": 4720 + }, + { + "epoch": 0.010049509236571927, + "grad_norm": 0.8924044370651245, + "learning_rate": 0.00010048863394943702, + "loss": 1.4122, + "step": 4730 + }, + { + "epoch": 0.01007075555631098, + "grad_norm": 0.6390189528465271, + "learning_rate": 0.00010070108349267048, + "loss": 1.4504, + "step": 4740 + }, + { + "epoch": 0.010092001876050033, + "grad_norm": 0.8490970730781555, + "learning_rate": 0.00010091353303590399, + "loss": 1.4185, + "step": 4750 + }, + { + "epoch": 0.010113248195789086, + "grad_norm": 0.7365291118621826, + "learning_rate": 0.00010112598257913746, + "loss": 1.4661, + "step": 4760 + }, + { + "epoch": 0.010134494515528138, + "grad_norm": 1.8528046607971191, + "learning_rate": 0.00010133843212237095, + "loss": 1.444, + "step": 4770 + }, + { + "epoch": 0.01015574083526719, + "grad_norm": 0.798801839351654, + "learning_rate": 0.00010155088166560443, + "loss": 1.4401, + "step": 4780 + }, + { + "epoch": 0.010176987155006244, + "grad_norm": 1.0564903020858765, + "learning_rate": 0.0001017633312088379, + "loss": 1.4427, + "step": 4790 + }, + { + "epoch": 0.010198233474745296, + "grad_norm": 1.1877000331878662, + "learning_rate": 0.00010197578075207139, + "loss": 1.4373, + "step": 4800 + }, + { + "epoch": 0.01021947979448435, + "grad_norm": 0.6322603821754456, + "learning_rate": 0.00010218823029530487, + "loss": 1.4552, + "step": 4810 + }, + { + "epoch": 0.010240726114223402, + "grad_norm": 0.7643141150474548, + "learning_rate": 0.00010240067983853837, + "loss": 1.4051, + "step": 4820 + }, + { + "epoch": 0.010261972433962454, + "grad_norm": 0.6797776818275452, + "learning_rate": 0.00010261312938177183, + "loss": 1.4499, + "step": 4830 + }, + { + "epoch": 0.010283218753701508, + "grad_norm": 0.7800890803337097, + "learning_rate": 0.00010282557892500531, + "loss": 1.4455, + "step": 4840 + }, + { + "epoch": 0.01030446507344056, + "grad_norm": 0.5910698771476746, + "learning_rate": 0.0001030380284682388, + "loss": 1.4463, + "step": 4850 + }, + { + "epoch": 0.010325711393179613, + "grad_norm": 0.7205632328987122, + "learning_rate": 0.00010325047801147227, + "loss": 1.4326, + "step": 4860 + }, + { + "epoch": 0.010346957712918665, + "grad_norm": 0.6730954647064209, + "learning_rate": 0.00010346292755470577, + "loss": 1.4331, + "step": 4870 + }, + { + "epoch": 0.010368204032657717, + "grad_norm": 0.7388630509376526, + "learning_rate": 0.00010367537709793925, + "loss": 1.4353, + "step": 4880 + }, + { + "epoch": 0.010389450352396771, + "grad_norm": 0.7097591757774353, + "learning_rate": 0.00010388782664117274, + "loss": 1.4312, + "step": 4890 + }, + { + "epoch": 0.010410696672135823, + "grad_norm": 0.9331147074699402, + "learning_rate": 0.0001041002761844062, + "loss": 1.4539, + "step": 4900 + }, + { + "epoch": 0.010431942991874877, + "grad_norm": 0.8234098553657532, + "learning_rate": 0.00010431272572763969, + "loss": 1.4177, + "step": 4910 + }, + { + "epoch": 0.010453189311613929, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.00010452517527087318, + "loss": 1.4496, + "step": 4920 + }, + { + "epoch": 0.010474435631352981, + "grad_norm": 0.6086469888687134, + "learning_rate": 0.00010473762481410665, + "loss": 1.3787, + "step": 4930 + }, + { + "epoch": 0.010495681951092035, + "grad_norm": 0.5768498182296753, + "learning_rate": 0.00010495007435734014, + "loss": 1.3893, + "step": 4940 + }, + { + "epoch": 0.010516928270831087, + "grad_norm": 0.7388303279876709, + "learning_rate": 0.00010516252390057362, + "loss": 1.4459, + "step": 4950 + }, + { + "epoch": 0.01053817459057014, + "grad_norm": 0.7272471189498901, + "learning_rate": 0.00010537497344380709, + "loss": 1.4092, + "step": 4960 + }, + { + "epoch": 0.010559420910309193, + "grad_norm": 0.6896994709968567, + "learning_rate": 0.00010558742298704058, + "loss": 1.4442, + "step": 4970 + }, + { + "epoch": 0.010580667230048245, + "grad_norm": 0.9328058958053589, + "learning_rate": 0.00010579987253027406, + "loss": 1.3983, + "step": 4980 + }, + { + "epoch": 0.010601913549787298, + "grad_norm": 0.8187626004219055, + "learning_rate": 0.00010601232207350755, + "loss": 1.4293, + "step": 4990 + }, + { + "epoch": 0.01062315986952635, + "grad_norm": 0.8106652498245239, + "learning_rate": 0.00010622477161674102, + "loss": 1.4051, + "step": 5000 + }, + { + "epoch": 0.010644406189265404, + "grad_norm": 0.6662151217460632, + "learning_rate": 0.00010643722115997451, + "loss": 1.424, + "step": 5010 + }, + { + "epoch": 0.010665652509004456, + "grad_norm": 1.0364971160888672, + "learning_rate": 0.000106649670703208, + "loss": 1.4629, + "step": 5020 + }, + { + "epoch": 0.010686898828743508, + "grad_norm": 0.5987892150878906, + "learning_rate": 0.00010686212024644146, + "loss": 1.3849, + "step": 5030 + }, + { + "epoch": 0.010708145148482562, + "grad_norm": 0.6603736281394958, + "learning_rate": 0.00010707456978967497, + "loss": 1.4228, + "step": 5040 + }, + { + "epoch": 0.010729391468221614, + "grad_norm": 0.7147841453552246, + "learning_rate": 0.00010728701933290843, + "loss": 1.3775, + "step": 5050 + }, + { + "epoch": 0.010750637787960666, + "grad_norm": 0.6328175067901611, + "learning_rate": 0.00010749946887614193, + "loss": 1.3828, + "step": 5060 + }, + { + "epoch": 0.01077188410769972, + "grad_norm": 0.5772563219070435, + "learning_rate": 0.00010771191841937541, + "loss": 1.4258, + "step": 5070 + }, + { + "epoch": 0.010793130427438772, + "grad_norm": 0.9825975298881531, + "learning_rate": 0.0001079243679626089, + "loss": 1.4247, + "step": 5080 + }, + { + "epoch": 0.010814376747177825, + "grad_norm": 0.9413897395133972, + "learning_rate": 0.00010813681750584237, + "loss": 1.436, + "step": 5090 + }, + { + "epoch": 0.010835623066916877, + "grad_norm": 0.6210631728172302, + "learning_rate": 0.00010834926704907585, + "loss": 1.4237, + "step": 5100 + }, + { + "epoch": 0.01085686938665593, + "grad_norm": 0.6317740678787231, + "learning_rate": 0.00010856171659230934, + "loss": 1.4104, + "step": 5110 + }, + { + "epoch": 0.010878115706394983, + "grad_norm": 0.7321771383285522, + "learning_rate": 0.00010877416613554281, + "loss": 1.4042, + "step": 5120 + }, + { + "epoch": 0.010899362026134035, + "grad_norm": 0.5356608629226685, + "learning_rate": 0.0001089866156787763, + "loss": 1.4063, + "step": 5130 + }, + { + "epoch": 0.010920608345873089, + "grad_norm": 0.5906651616096497, + "learning_rate": 0.00010919906522200978, + "loss": 1.4037, + "step": 5140 + }, + { + "epoch": 0.010941854665612141, + "grad_norm": 0.5607311725616455, + "learning_rate": 0.00010941151476524325, + "loss": 1.3815, + "step": 5150 + }, + { + "epoch": 0.010963100985351193, + "grad_norm": 1.2977726459503174, + "learning_rate": 0.00010962396430847674, + "loss": 1.4263, + "step": 5160 + }, + { + "epoch": 0.010984347305090247, + "grad_norm": 0.5212831497192383, + "learning_rate": 0.00010983641385171022, + "loss": 1.4391, + "step": 5170 + }, + { + "epoch": 0.011005593624829299, + "grad_norm": 0.6604179739952087, + "learning_rate": 0.00011004886339494372, + "loss": 1.4154, + "step": 5180 + }, + { + "epoch": 0.011026839944568352, + "grad_norm": 0.5935566425323486, + "learning_rate": 0.00011026131293817718, + "loss": 1.3881, + "step": 5190 + }, + { + "epoch": 0.011048086264307404, + "grad_norm": 0.531766414642334, + "learning_rate": 0.00011047376248141068, + "loss": 1.3787, + "step": 5200 + }, + { + "epoch": 0.011069332584046456, + "grad_norm": 0.7179148197174072, + "learning_rate": 0.00011068621202464416, + "loss": 1.4403, + "step": 5210 + }, + { + "epoch": 0.01109057890378551, + "grad_norm": 0.6188239455223083, + "learning_rate": 0.00011089866156787762, + "loss": 1.3796, + "step": 5220 + }, + { + "epoch": 0.011111825223524562, + "grad_norm": 0.713153064250946, + "learning_rate": 0.00011111111111111112, + "loss": 1.4319, + "step": 5230 + }, + { + "epoch": 0.011133071543263616, + "grad_norm": 0.6363143920898438, + "learning_rate": 0.0001113235606543446, + "loss": 1.4325, + "step": 5240 + }, + { + "epoch": 0.011154317863002668, + "grad_norm": 0.8979236483573914, + "learning_rate": 0.00011153601019757809, + "loss": 1.3769, + "step": 5250 + }, + { + "epoch": 0.01117556418274172, + "grad_norm": 0.7728798985481262, + "learning_rate": 0.00011174845974081156, + "loss": 1.3889, + "step": 5260 + }, + { + "epoch": 0.011196810502480774, + "grad_norm": 0.7071273922920227, + "learning_rate": 0.00011196090928404504, + "loss": 1.4038, + "step": 5270 + }, + { + "epoch": 0.011218056822219826, + "grad_norm": 0.6715747714042664, + "learning_rate": 0.00011217335882727853, + "loss": 1.4302, + "step": 5280 + }, + { + "epoch": 0.01123930314195888, + "grad_norm": 0.5914523601531982, + "learning_rate": 0.000112385808370512, + "loss": 1.3897, + "step": 5290 + }, + { + "epoch": 0.011260549461697932, + "grad_norm": 0.7290401458740234, + "learning_rate": 0.00011259825791374549, + "loss": 1.3973, + "step": 5300 + }, + { + "epoch": 0.011281795781436984, + "grad_norm": 0.7645283937454224, + "learning_rate": 0.00011281070745697897, + "loss": 1.4137, + "step": 5310 + }, + { + "epoch": 0.011303042101176037, + "grad_norm": 0.5857424736022949, + "learning_rate": 0.00011302315700021246, + "loss": 1.4184, + "step": 5320 + }, + { + "epoch": 0.01132428842091509, + "grad_norm": 1.1197936534881592, + "learning_rate": 0.00011323560654344594, + "loss": 1.4212, + "step": 5330 + }, + { + "epoch": 0.011345534740654143, + "grad_norm": 0.9865072965621948, + "learning_rate": 0.00011344805608667941, + "loss": 1.383, + "step": 5340 + }, + { + "epoch": 0.011366781060393195, + "grad_norm": 0.9827483892440796, + "learning_rate": 0.0001136605056299129, + "loss": 1.3994, + "step": 5350 + }, + { + "epoch": 0.011388027380132247, + "grad_norm": 0.9113360643386841, + "learning_rate": 0.00011387295517314638, + "loss": 1.3905, + "step": 5360 + }, + { + "epoch": 0.0114092736998713, + "grad_norm": 0.5420899987220764, + "learning_rate": 0.00011408540471637988, + "loss": 1.3908, + "step": 5370 + }, + { + "epoch": 0.011430520019610353, + "grad_norm": 0.6199735999107361, + "learning_rate": 0.00011429785425961334, + "loss": 1.3692, + "step": 5380 + }, + { + "epoch": 0.011451766339349407, + "grad_norm": 0.6558612585067749, + "learning_rate": 0.00011451030380284682, + "loss": 1.4271, + "step": 5390 + }, + { + "epoch": 0.011473012659088459, + "grad_norm": 0.5576156377792358, + "learning_rate": 0.00011472275334608032, + "loss": 1.3779, + "step": 5400 + }, + { + "epoch": 0.01149425897882751, + "grad_norm": 0.6264739036560059, + "learning_rate": 0.00011493520288931379, + "loss": 1.3834, + "step": 5410 + }, + { + "epoch": 0.011515505298566564, + "grad_norm": 0.6076098084449768, + "learning_rate": 0.00011514765243254728, + "loss": 1.3801, + "step": 5420 + }, + { + "epoch": 0.011536751618305616, + "grad_norm": 0.6804399490356445, + "learning_rate": 0.00011536010197578076, + "loss": 1.4182, + "step": 5430 + }, + { + "epoch": 0.01155799793804467, + "grad_norm": 0.5762757658958435, + "learning_rate": 0.00011557255151901425, + "loss": 1.4148, + "step": 5440 + }, + { + "epoch": 0.011579244257783722, + "grad_norm": 0.528236448764801, + "learning_rate": 0.00011578500106224772, + "loss": 1.3792, + "step": 5450 + }, + { + "epoch": 0.011600490577522774, + "grad_norm": 0.5046994090080261, + "learning_rate": 0.0001159974506054812, + "loss": 1.4017, + "step": 5460 + }, + { + "epoch": 0.011621736897261828, + "grad_norm": 0.5642015337944031, + "learning_rate": 0.00011620990014871469, + "loss": 1.4058, + "step": 5470 + }, + { + "epoch": 0.01164298321700088, + "grad_norm": 0.6489053964614868, + "learning_rate": 0.00011642234969194816, + "loss": 1.4248, + "step": 5480 + }, + { + "epoch": 0.011664229536739934, + "grad_norm": 0.554080605506897, + "learning_rate": 0.00011663479923518165, + "loss": 1.4016, + "step": 5490 + }, + { + "epoch": 0.011685475856478986, + "grad_norm": 0.56331866979599, + "learning_rate": 0.00011684724877841513, + "loss": 1.3907, + "step": 5500 + }, + { + "epoch": 0.011706722176218038, + "grad_norm": 0.7390121817588806, + "learning_rate": 0.0001170596983216486, + "loss": 1.4139, + "step": 5510 + }, + { + "epoch": 0.011727968495957091, + "grad_norm": 0.5328644514083862, + "learning_rate": 0.00011727214786488209, + "loss": 1.3868, + "step": 5520 + }, + { + "epoch": 0.011749214815696143, + "grad_norm": 0.5733030438423157, + "learning_rate": 0.00011748459740811557, + "loss": 1.42, + "step": 5530 + }, + { + "epoch": 0.011770461135435195, + "grad_norm": 0.5360122919082642, + "learning_rate": 0.00011769704695134907, + "loss": 1.3819, + "step": 5540 + }, + { + "epoch": 0.01179170745517425, + "grad_norm": 0.5992305278778076, + "learning_rate": 0.00011790949649458253, + "loss": 1.4229, + "step": 5550 + }, + { + "epoch": 0.011812953774913301, + "grad_norm": 0.5954650044441223, + "learning_rate": 0.00011812194603781603, + "loss": 1.3776, + "step": 5560 + }, + { + "epoch": 0.011834200094652355, + "grad_norm": 0.5803873538970947, + "learning_rate": 0.00011833439558104951, + "loss": 1.3956, + "step": 5570 + }, + { + "epoch": 0.011855446414391407, + "grad_norm": 0.540205180644989, + "learning_rate": 0.00011854684512428297, + "loss": 1.3646, + "step": 5580 + }, + { + "epoch": 0.011876692734130459, + "grad_norm": 0.6005442142486572, + "learning_rate": 0.00011875929466751647, + "loss": 1.3905, + "step": 5590 + }, + { + "epoch": 0.011897939053869513, + "grad_norm": 0.8107340931892395, + "learning_rate": 0.00011897174421074995, + "loss": 1.3579, + "step": 5600 + }, + { + "epoch": 0.011919185373608565, + "grad_norm": 0.7063789367675781, + "learning_rate": 0.00011918419375398344, + "loss": 1.3697, + "step": 5610 + }, + { + "epoch": 0.011940431693347619, + "grad_norm": 0.8628270030021667, + "learning_rate": 0.00011939664329721692, + "loss": 1.4105, + "step": 5620 + }, + { + "epoch": 0.01196167801308667, + "grad_norm": 0.8255085945129395, + "learning_rate": 0.00011960909284045041, + "loss": 1.4106, + "step": 5630 + }, + { + "epoch": 0.011982924332825723, + "grad_norm": 0.6672144532203674, + "learning_rate": 0.00011982154238368388, + "loss": 1.4085, + "step": 5640 + }, + { + "epoch": 0.012004170652564776, + "grad_norm": 0.6955538392066956, + "learning_rate": 0.00012003399192691736, + "loss": 1.3858, + "step": 5650 + }, + { + "epoch": 0.012025416972303828, + "grad_norm": 0.5445026755332947, + "learning_rate": 0.00012024644147015086, + "loss": 1.372, + "step": 5660 + }, + { + "epoch": 0.012046663292042882, + "grad_norm": 0.581702470779419, + "learning_rate": 0.00012045889101338432, + "loss": 1.4117, + "step": 5670 + }, + { + "epoch": 0.012067909611781934, + "grad_norm": 0.8399189114570618, + "learning_rate": 0.00012067134055661782, + "loss": 1.4181, + "step": 5680 + }, + { + "epoch": 0.012089155931520986, + "grad_norm": 0.5659377574920654, + "learning_rate": 0.0001208837900998513, + "loss": 1.3896, + "step": 5690 + }, + { + "epoch": 0.01211040225126004, + "grad_norm": 0.5055270195007324, + "learning_rate": 0.00012109623964308476, + "loss": 1.372, + "step": 5700 + }, + { + "epoch": 0.012131648570999092, + "grad_norm": 0.635951578617096, + "learning_rate": 0.00012130868918631826, + "loss": 1.4097, + "step": 5710 + }, + { + "epoch": 0.012152894890738146, + "grad_norm": 0.5763226747512817, + "learning_rate": 0.00012152113872955174, + "loss": 1.4102, + "step": 5720 + }, + { + "epoch": 0.012174141210477198, + "grad_norm": 0.6898927092552185, + "learning_rate": 0.00012173358827278523, + "loss": 1.3852, + "step": 5730 + }, + { + "epoch": 0.01219538753021625, + "grad_norm": 0.5738946199417114, + "learning_rate": 0.0001219460378160187, + "loss": 1.4018, + "step": 5740 + }, + { + "epoch": 0.012216633849955303, + "grad_norm": 0.6912546753883362, + "learning_rate": 0.0001221584873592522, + "loss": 1.3797, + "step": 5750 + }, + { + "epoch": 0.012237880169694355, + "grad_norm": 0.6604278683662415, + "learning_rate": 0.00012237093690248567, + "loss": 1.4165, + "step": 5760 + }, + { + "epoch": 0.012259126489433409, + "grad_norm": 0.6905834078788757, + "learning_rate": 0.00012258338644571915, + "loss": 1.3617, + "step": 5770 + }, + { + "epoch": 0.012280372809172461, + "grad_norm": 0.6306886672973633, + "learning_rate": 0.00012279583598895263, + "loss": 1.3825, + "step": 5780 + }, + { + "epoch": 0.012301619128911513, + "grad_norm": 0.6372388601303101, + "learning_rate": 0.0001230082855321861, + "loss": 1.4003, + "step": 5790 + }, + { + "epoch": 0.012322865448650567, + "grad_norm": 0.653247058391571, + "learning_rate": 0.0001232207350754196, + "loss": 1.3808, + "step": 5800 + }, + { + "epoch": 0.012344111768389619, + "grad_norm": 0.8148528933525085, + "learning_rate": 0.00012343318461865307, + "loss": 1.4108, + "step": 5810 + }, + { + "epoch": 0.012365358088128673, + "grad_norm": 0.5398937463760376, + "learning_rate": 0.00012364563416188655, + "loss": 1.38, + "step": 5820 + }, + { + "epoch": 0.012386604407867725, + "grad_norm": 0.5235511660575867, + "learning_rate": 0.00012385808370512003, + "loss": 1.3758, + "step": 5830 + }, + { + "epoch": 0.012407850727606777, + "grad_norm": 0.5551550984382629, + "learning_rate": 0.0001240705332483535, + "loss": 1.4119, + "step": 5840 + }, + { + "epoch": 0.01242909704734583, + "grad_norm": 0.524071216583252, + "learning_rate": 0.00012428298279158702, + "loss": 1.3855, + "step": 5850 + }, + { + "epoch": 0.012450343367084882, + "grad_norm": 0.6189266443252563, + "learning_rate": 0.00012449543233482047, + "loss": 1.4299, + "step": 5860 + }, + { + "epoch": 0.012471589686823936, + "grad_norm": 0.6269586682319641, + "learning_rate": 0.00012470788187805398, + "loss": 1.4162, + "step": 5870 + }, + { + "epoch": 0.012492836006562988, + "grad_norm": 0.6150676012039185, + "learning_rate": 0.00012492033142128746, + "loss": 1.3849, + "step": 5880 + }, + { + "epoch": 0.01251408232630204, + "grad_norm": 0.5207821130752563, + "learning_rate": 0.0001251327809645209, + "loss": 1.4224, + "step": 5890 + }, + { + "epoch": 0.012535328646041094, + "grad_norm": 0.5386185050010681, + "learning_rate": 0.00012534523050775442, + "loss": 1.3808, + "step": 5900 + }, + { + "epoch": 0.012556574965780146, + "grad_norm": 0.6039036512374878, + "learning_rate": 0.0001255576800509879, + "loss": 1.4033, + "step": 5910 + }, + { + "epoch": 0.0125778212855192, + "grad_norm": 0.5610729455947876, + "learning_rate": 0.00012577012959422138, + "loss": 1.3892, + "step": 5920 + }, + { + "epoch": 0.012599067605258252, + "grad_norm": 0.5880275368690491, + "learning_rate": 0.00012598257913745486, + "loss": 1.3823, + "step": 5930 + }, + { + "epoch": 0.012620313924997304, + "grad_norm": 0.5574862957000732, + "learning_rate": 0.00012619502868068834, + "loss": 1.3491, + "step": 5940 + }, + { + "epoch": 0.012641560244736358, + "grad_norm": 0.7595932483673096, + "learning_rate": 0.00012640747822392182, + "loss": 1.4148, + "step": 5950 + }, + { + "epoch": 0.01266280656447541, + "grad_norm": 0.6555654406547546, + "learning_rate": 0.0001266199277671553, + "loss": 1.346, + "step": 5960 + }, + { + "epoch": 0.012684052884214463, + "grad_norm": 0.575062096118927, + "learning_rate": 0.0001268323773103888, + "loss": 1.4198, + "step": 5970 + }, + { + "epoch": 0.012705299203953515, + "grad_norm": 0.5165491104125977, + "learning_rate": 0.00012704482685362226, + "loss": 1.3765, + "step": 5980 + }, + { + "epoch": 0.012726545523692567, + "grad_norm": 0.7284215092658997, + "learning_rate": 0.00012725727639685577, + "loss": 1.3811, + "step": 5990 + }, + { + "epoch": 0.012747791843431621, + "grad_norm": 0.6870171427726746, + "learning_rate": 0.00012746972594008925, + "loss": 1.4176, + "step": 6000 + }, + { + "epoch": 0.012769038163170673, + "grad_norm": 0.8021366000175476, + "learning_rate": 0.0001276821754833227, + "loss": 1.4205, + "step": 6010 + }, + { + "epoch": 0.012790284482909727, + "grad_norm": 0.6822094917297363, + "learning_rate": 0.0001278946250265562, + "loss": 1.4423, + "step": 6020 + }, + { + "epoch": 0.012811530802648779, + "grad_norm": 0.8552868962287903, + "learning_rate": 0.00012810707456978969, + "loss": 1.3915, + "step": 6030 + }, + { + "epoch": 0.01283277712238783, + "grad_norm": 0.6564489006996155, + "learning_rate": 0.00012831952411302317, + "loss": 1.3402, + "step": 6040 + }, + { + "epoch": 0.012854023442126885, + "grad_norm": 0.5438880324363708, + "learning_rate": 0.00012853197365625665, + "loss": 1.4144, + "step": 6050 + }, + { + "epoch": 0.012875269761865937, + "grad_norm": 0.5195238590240479, + "learning_rate": 0.00012874442319949013, + "loss": 1.3925, + "step": 6060 + }, + { + "epoch": 0.012896516081604989, + "grad_norm": 0.5582408905029297, + "learning_rate": 0.0001289568727427236, + "loss": 1.3939, + "step": 6070 + }, + { + "epoch": 0.012917762401344042, + "grad_norm": 0.5838832855224609, + "learning_rate": 0.0001291693222859571, + "loss": 1.3839, + "step": 6080 + }, + { + "epoch": 0.012939008721083094, + "grad_norm": 0.5253778696060181, + "learning_rate": 0.00012938177182919057, + "loss": 1.4089, + "step": 6090 + }, + { + "epoch": 0.012960255040822148, + "grad_norm": 0.5948464274406433, + "learning_rate": 0.00012959422137242405, + "loss": 1.3845, + "step": 6100 + }, + { + "epoch": 0.0129815013605612, + "grad_norm": 0.5558990836143494, + "learning_rate": 0.00012980667091565755, + "loss": 1.3902, + "step": 6110 + }, + { + "epoch": 0.013002747680300252, + "grad_norm": 0.5359461307525635, + "learning_rate": 0.000130019120458891, + "loss": 1.4199, + "step": 6120 + }, + { + "epoch": 0.013023994000039306, + "grad_norm": 0.5273844003677368, + "learning_rate": 0.0001302315700021245, + "loss": 1.3639, + "step": 6130 + }, + { + "epoch": 0.013045240319778358, + "grad_norm": 0.4880719482898712, + "learning_rate": 0.000130444019545358, + "loss": 1.3892, + "step": 6140 + }, + { + "epoch": 0.013066486639517412, + "grad_norm": 0.5231435298919678, + "learning_rate": 0.00013065646908859145, + "loss": 1.3686, + "step": 6150 + }, + { + "epoch": 0.013087732959256464, + "grad_norm": 0.5446478724479675, + "learning_rate": 0.00013086891863182495, + "loss": 1.3891, + "step": 6160 + }, + { + "epoch": 0.013108979278995516, + "grad_norm": 0.7875513434410095, + "learning_rate": 0.00013108136817505843, + "loss": 1.379, + "step": 6170 + }, + { + "epoch": 0.01313022559873457, + "grad_norm": 0.5573650598526001, + "learning_rate": 0.00013129381771829191, + "loss": 1.3599, + "step": 6180 + }, + { + "epoch": 0.013151471918473621, + "grad_norm": 0.4949086010456085, + "learning_rate": 0.0001315062672615254, + "loss": 1.3694, + "step": 6190 + }, + { + "epoch": 0.013172718238212675, + "grad_norm": 0.5581845641136169, + "learning_rate": 0.00013171871680475887, + "loss": 1.3722, + "step": 6200 + }, + { + "epoch": 0.013193964557951727, + "grad_norm": 0.5882039070129395, + "learning_rate": 0.00013193116634799235, + "loss": 1.3966, + "step": 6210 + }, + { + "epoch": 0.01321521087769078, + "grad_norm": 0.520283043384552, + "learning_rate": 0.00013214361589122583, + "loss": 1.41, + "step": 6220 + }, + { + "epoch": 0.013236457197429833, + "grad_norm": 0.6046220660209656, + "learning_rate": 0.00013235606543445934, + "loss": 1.3845, + "step": 6230 + }, + { + "epoch": 0.013257703517168885, + "grad_norm": 0.5983796715736389, + "learning_rate": 0.0001325685149776928, + "loss": 1.3853, + "step": 6240 + }, + { + "epoch": 0.013278949836907939, + "grad_norm": 0.8147283792495728, + "learning_rate": 0.00013278096452092628, + "loss": 1.3729, + "step": 6250 + }, + { + "epoch": 0.01330019615664699, + "grad_norm": 0.6633943319320679, + "learning_rate": 0.00013299341406415978, + "loss": 1.3676, + "step": 6260 + }, + { + "epoch": 0.013321442476386043, + "grad_norm": 0.5924898982048035, + "learning_rate": 0.00013320586360739324, + "loss": 1.3603, + "step": 6270 + }, + { + "epoch": 0.013342688796125097, + "grad_norm": 0.5851970314979553, + "learning_rate": 0.00013341831315062674, + "loss": 1.4096, + "step": 6280 + }, + { + "epoch": 0.013363935115864149, + "grad_norm": 0.8361711502075195, + "learning_rate": 0.00013363076269386022, + "loss": 1.3737, + "step": 6290 + }, + { + "epoch": 0.013385181435603202, + "grad_norm": 0.673024594783783, + "learning_rate": 0.0001338432122370937, + "loss": 1.3706, + "step": 6300 + }, + { + "epoch": 0.013406427755342254, + "grad_norm": 0.4737209975719452, + "learning_rate": 0.00013405566178032718, + "loss": 1.3706, + "step": 6310 + }, + { + "epoch": 0.013427674075081306, + "grad_norm": 0.4806216061115265, + "learning_rate": 0.00013426811132356066, + "loss": 1.3681, + "step": 6320 + }, + { + "epoch": 0.01344892039482036, + "grad_norm": 0.6931087374687195, + "learning_rate": 0.00013448056086679414, + "loss": 1.3839, + "step": 6330 + }, + { + "epoch": 0.013470166714559412, + "grad_norm": 0.6292034983634949, + "learning_rate": 0.00013469301041002762, + "loss": 1.3694, + "step": 6340 + }, + { + "epoch": 0.013491413034298466, + "grad_norm": 0.628761887550354, + "learning_rate": 0.0001349054599532611, + "loss": 1.3411, + "step": 6350 + }, + { + "epoch": 0.013512659354037518, + "grad_norm": 0.7165646553039551, + "learning_rate": 0.00013511790949649458, + "loss": 1.3797, + "step": 6360 + }, + { + "epoch": 0.01353390567377657, + "grad_norm": 0.5728852152824402, + "learning_rate": 0.00013533035903972806, + "loss": 1.3635, + "step": 6370 + }, + { + "epoch": 0.013555151993515624, + "grad_norm": 0.5188817381858826, + "learning_rate": 0.00013554280858296154, + "loss": 1.4048, + "step": 6380 + }, + { + "epoch": 0.013576398313254676, + "grad_norm": 0.495222270488739, + "learning_rate": 0.00013575525812619502, + "loss": 1.364, + "step": 6390 + }, + { + "epoch": 0.01359764463299373, + "grad_norm": 0.5686549544334412, + "learning_rate": 0.00013596770766942853, + "loss": 1.4036, + "step": 6400 + }, + { + "epoch": 0.013618890952732781, + "grad_norm": 0.7718875408172607, + "learning_rate": 0.00013618015721266198, + "loss": 1.3737, + "step": 6410 + }, + { + "epoch": 0.013640137272471833, + "grad_norm": 0.4771776497364044, + "learning_rate": 0.0001363926067558955, + "loss": 1.365, + "step": 6420 + }, + { + "epoch": 0.013661383592210887, + "grad_norm": 0.5017504096031189, + "learning_rate": 0.00013660505629912897, + "loss": 1.3551, + "step": 6430 + }, + { + "epoch": 0.01368262991194994, + "grad_norm": 0.5959005951881409, + "learning_rate": 0.00013681750584236242, + "loss": 1.3822, + "step": 6440 + }, + { + "epoch": 0.013703876231688993, + "grad_norm": 0.47851037979125977, + "learning_rate": 0.00013702995538559593, + "loss": 1.3779, + "step": 6450 + }, + { + "epoch": 0.013725122551428045, + "grad_norm": 0.5450897216796875, + "learning_rate": 0.0001372424049288294, + "loss": 1.3734, + "step": 6460 + }, + { + "epoch": 0.013746368871167097, + "grad_norm": 0.6967869997024536, + "learning_rate": 0.0001374548544720629, + "loss": 1.3712, + "step": 6470 + }, + { + "epoch": 0.01376761519090615, + "grad_norm": 0.5700177550315857, + "learning_rate": 0.00013766730401529637, + "loss": 1.3761, + "step": 6480 + }, + { + "epoch": 0.013788861510645203, + "grad_norm": 0.5554580688476562, + "learning_rate": 0.00013787975355852985, + "loss": 1.3162, + "step": 6490 + }, + { + "epoch": 0.013810107830384256, + "grad_norm": 0.5608049631118774, + "learning_rate": 0.00013809220310176333, + "loss": 1.3303, + "step": 6500 + }, + { + "epoch": 0.013831354150123308, + "grad_norm": 0.5958791375160217, + "learning_rate": 0.0001383046526449968, + "loss": 1.374, + "step": 6510 + }, + { + "epoch": 0.01385260046986236, + "grad_norm": 0.6641851663589478, + "learning_rate": 0.00013851710218823032, + "loss": 1.4202, + "step": 6520 + }, + { + "epoch": 0.013873846789601414, + "grad_norm": 0.5452728867530823, + "learning_rate": 0.00013872955173146377, + "loss": 1.3641, + "step": 6530 + }, + { + "epoch": 0.013895093109340466, + "grad_norm": 0.6888812780380249, + "learning_rate": 0.00013894200127469728, + "loss": 1.3818, + "step": 6540 + }, + { + "epoch": 0.01391633942907952, + "grad_norm": 0.6580241322517395, + "learning_rate": 0.00013915445081793076, + "loss": 1.3768, + "step": 6550 + }, + { + "epoch": 0.013937585748818572, + "grad_norm": 0.5549701452255249, + "learning_rate": 0.0001393669003611642, + "loss": 1.3529, + "step": 6560 + }, + { + "epoch": 0.013958832068557624, + "grad_norm": 0.47298964858055115, + "learning_rate": 0.00013957934990439772, + "loss": 1.3918, + "step": 6570 + }, + { + "epoch": 0.013980078388296678, + "grad_norm": 0.49210458993911743, + "learning_rate": 0.0001397917994476312, + "loss": 1.3774, + "step": 6580 + }, + { + "epoch": 0.01400132470803573, + "grad_norm": 0.6028631329536438, + "learning_rate": 0.00014000424899086468, + "loss": 1.379, + "step": 6590 + }, + { + "epoch": 0.014022571027774782, + "grad_norm": 0.5093877911567688, + "learning_rate": 0.00014021669853409816, + "loss": 1.3385, + "step": 6600 + }, + { + "epoch": 0.014043817347513836, + "grad_norm": 0.5207003355026245, + "learning_rate": 0.00014042914807733164, + "loss": 1.4056, + "step": 6610 + }, + { + "epoch": 0.014065063667252888, + "grad_norm": 0.6153278946876526, + "learning_rate": 0.00014064159762056512, + "loss": 1.3815, + "step": 6620 + }, + { + "epoch": 0.014086309986991941, + "grad_norm": 0.5291756987571716, + "learning_rate": 0.0001408540471637986, + "loss": 1.3986, + "step": 6630 + }, + { + "epoch": 0.014107556306730993, + "grad_norm": 0.5250617861747742, + "learning_rate": 0.00014106649670703208, + "loss": 1.3579, + "step": 6640 + }, + { + "epoch": 0.014128802626470045, + "grad_norm": 0.47577017545700073, + "learning_rate": 0.00014127894625026556, + "loss": 1.4061, + "step": 6650 + }, + { + "epoch": 0.014150048946209099, + "grad_norm": 0.6014146208763123, + "learning_rate": 0.00014149139579349907, + "loss": 1.3718, + "step": 6660 + }, + { + "epoch": 0.014171295265948151, + "grad_norm": 0.5270480513572693, + "learning_rate": 0.00014170384533673252, + "loss": 1.3475, + "step": 6670 + }, + { + "epoch": 0.014192541585687205, + "grad_norm": 0.7320585250854492, + "learning_rate": 0.000141916294879966, + "loss": 1.3363, + "step": 6680 + }, + { + "epoch": 0.014213787905426257, + "grad_norm": 0.7235461473464966, + "learning_rate": 0.0001421287444231995, + "loss": 1.3607, + "step": 6690 + }, + { + "epoch": 0.014235034225165309, + "grad_norm": 0.5367947816848755, + "learning_rate": 0.00014234119396643296, + "loss": 1.3788, + "step": 6700 + }, + { + "epoch": 0.014256280544904363, + "grad_norm": 0.48176679015159607, + "learning_rate": 0.00014255364350966647, + "loss": 1.3809, + "step": 6710 + }, + { + "epoch": 0.014277526864643415, + "grad_norm": 0.5814561247825623, + "learning_rate": 0.00014276609305289995, + "loss": 1.3572, + "step": 6720 + }, + { + "epoch": 0.014298773184382468, + "grad_norm": 0.5337371230125427, + "learning_rate": 0.00014297854259613343, + "loss": 1.3424, + "step": 6730 + }, + { + "epoch": 0.01432001950412152, + "grad_norm": 0.5957461595535278, + "learning_rate": 0.0001431909921393669, + "loss": 1.3706, + "step": 6740 + }, + { + "epoch": 0.014341265823860572, + "grad_norm": 0.6465137600898743, + "learning_rate": 0.0001434034416826004, + "loss": 1.3696, + "step": 6750 + }, + { + "epoch": 0.014362512143599626, + "grad_norm": 0.4840565621852875, + "learning_rate": 0.00014361589122583387, + "loss": 1.3331, + "step": 6760 + }, + { + "epoch": 0.014383758463338678, + "grad_norm": 0.4731079936027527, + "learning_rate": 0.00014382834076906735, + "loss": 1.3368, + "step": 6770 + }, + { + "epoch": 0.014405004783077732, + "grad_norm": 0.6646950244903564, + "learning_rate": 0.00014404079031230086, + "loss": 1.3419, + "step": 6780 + }, + { + "epoch": 0.014426251102816784, + "grad_norm": 0.5947681665420532, + "learning_rate": 0.0001442532398555343, + "loss": 1.3996, + "step": 6790 + }, + { + "epoch": 0.014447497422555836, + "grad_norm": 0.4368617534637451, + "learning_rate": 0.0001444656893987678, + "loss": 1.3557, + "step": 6800 + }, + { + "epoch": 0.01446874374229489, + "grad_norm": 0.5751848220825195, + "learning_rate": 0.0001446781389420013, + "loss": 1.3702, + "step": 6810 + }, + { + "epoch": 0.014489990062033942, + "grad_norm": 0.511394202709198, + "learning_rate": 0.00014489058848523475, + "loss": 1.3806, + "step": 6820 + }, + { + "epoch": 0.014511236381772995, + "grad_norm": 0.586337149143219, + "learning_rate": 0.00014510303802846826, + "loss": 1.3806, + "step": 6830 + }, + { + "epoch": 0.014532482701512047, + "grad_norm": 0.4926144778728485, + "learning_rate": 0.00014531548757170174, + "loss": 1.3548, + "step": 6840 + }, + { + "epoch": 0.0145537290212511, + "grad_norm": 0.4807242453098297, + "learning_rate": 0.00014552793711493522, + "loss": 1.3862, + "step": 6850 + }, + { + "epoch": 0.014574975340990153, + "grad_norm": 0.5215096473693848, + "learning_rate": 0.0001457403866581687, + "loss": 1.364, + "step": 6860 + }, + { + "epoch": 0.014596221660729205, + "grad_norm": 0.48413315415382385, + "learning_rate": 0.00014595283620140218, + "loss": 1.331, + "step": 6870 + }, + { + "epoch": 0.014617467980468259, + "grad_norm": 0.41339004039764404, + "learning_rate": 0.00014616528574463566, + "loss": 1.3618, + "step": 6880 + }, + { + "epoch": 0.014638714300207311, + "grad_norm": 0.5330844521522522, + "learning_rate": 0.00014637773528786914, + "loss": 1.3735, + "step": 6890 + }, + { + "epoch": 0.014659960619946363, + "grad_norm": 0.6858949065208435, + "learning_rate": 0.00014659018483110262, + "loss": 1.3892, + "step": 6900 + }, + { + "epoch": 0.014681206939685417, + "grad_norm": 1.0868937969207764, + "learning_rate": 0.0001468026343743361, + "loss": 1.3532, + "step": 6910 + }, + { + "epoch": 0.014702453259424469, + "grad_norm": 0.49813032150268555, + "learning_rate": 0.00014701508391756958, + "loss": 1.3983, + "step": 6920 + }, + { + "epoch": 0.014723699579163523, + "grad_norm": 0.5788235664367676, + "learning_rate": 0.00014722753346080306, + "loss": 1.376, + "step": 6930 + }, + { + "epoch": 0.014744945898902575, + "grad_norm": 0.549350917339325, + "learning_rate": 0.00014743998300403654, + "loss": 1.3624, + "step": 6940 + }, + { + "epoch": 0.014766192218641627, + "grad_norm": 0.5756009817123413, + "learning_rate": 0.00014765243254727004, + "loss": 1.3585, + "step": 6950 + }, + { + "epoch": 0.01478743853838068, + "grad_norm": 0.5513563752174377, + "learning_rate": 0.0001478648820905035, + "loss": 1.3681, + "step": 6960 + }, + { + "epoch": 0.014808684858119732, + "grad_norm": 0.6632333993911743, + "learning_rate": 0.000148077331633737, + "loss": 1.3719, + "step": 6970 + }, + { + "epoch": 0.014829931177858786, + "grad_norm": 0.4879901111125946, + "learning_rate": 0.00014828978117697048, + "loss": 1.3555, + "step": 6980 + }, + { + "epoch": 0.014851177497597838, + "grad_norm": 0.5552974343299866, + "learning_rate": 0.00014850223072020394, + "loss": 1.3349, + "step": 6990 + }, + { + "epoch": 0.01487242381733689, + "grad_norm": 0.4820345640182495, + "learning_rate": 0.00014871468026343744, + "loss": 1.3666, + "step": 7000 + }, + { + "epoch": 0.014893670137075944, + "grad_norm": 0.4491882622241974, + "learning_rate": 0.00014892712980667092, + "loss": 1.3557, + "step": 7010 + }, + { + "epoch": 0.014914916456814996, + "grad_norm": 0.6907098293304443, + "learning_rate": 0.0001491395793499044, + "loss": 1.421, + "step": 7020 + }, + { + "epoch": 0.01493616277655405, + "grad_norm": 0.7527890205383301, + "learning_rate": 0.00014935202889313788, + "loss": 1.3223, + "step": 7030 + }, + { + "epoch": 0.014957409096293102, + "grad_norm": 0.4811762869358063, + "learning_rate": 0.00014956447843637136, + "loss": 1.3689, + "step": 7040 + }, + { + "epoch": 0.014978655416032154, + "grad_norm": 0.47868889570236206, + "learning_rate": 0.00014977692797960484, + "loss": 1.384, + "step": 7050 + }, + { + "epoch": 0.014999901735771207, + "grad_norm": 0.5008025765419006, + "learning_rate": 0.00014998937752283833, + "loss": 1.3976, + "step": 7060 + }, + { + "epoch": 0.01502114805551026, + "grad_norm": 0.517736554145813, + "learning_rate": 0.00015020182706607183, + "loss": 1.3616, + "step": 7070 + }, + { + "epoch": 0.015042394375249313, + "grad_norm": 0.45363613963127136, + "learning_rate": 0.00015041427660930529, + "loss": 1.3503, + "step": 7080 + }, + { + "epoch": 0.015063640694988365, + "grad_norm": 0.5433399081230164, + "learning_rate": 0.0001506267261525388, + "loss": 1.3566, + "step": 7090 + }, + { + "epoch": 0.015084887014727417, + "grad_norm": 0.4994738698005676, + "learning_rate": 0.00015083917569577227, + "loss": 1.3722, + "step": 7100 + }, + { + "epoch": 0.015106133334466471, + "grad_norm": 0.500108003616333, + "learning_rate": 0.00015105162523900573, + "loss": 1.3699, + "step": 7110 + }, + { + "epoch": 0.015127379654205523, + "grad_norm": 0.45867204666137695, + "learning_rate": 0.00015126407478223923, + "loss": 1.3433, + "step": 7120 + }, + { + "epoch": 0.015148625973944575, + "grad_norm": 0.8870165944099426, + "learning_rate": 0.0001514765243254727, + "loss": 1.3928, + "step": 7130 + }, + { + "epoch": 0.015169872293683629, + "grad_norm": 0.49730563163757324, + "learning_rate": 0.0001516889738687062, + "loss": 1.3568, + "step": 7140 + }, + { + "epoch": 0.01519111861342268, + "grad_norm": 0.4763539731502533, + "learning_rate": 0.00015190142341193967, + "loss": 1.3589, + "step": 7150 + }, + { + "epoch": 0.015212364933161734, + "grad_norm": 0.4863751530647278, + "learning_rate": 0.00015211387295517315, + "loss": 1.3395, + "step": 7160 + }, + { + "epoch": 0.015233611252900786, + "grad_norm": 0.4821912944316864, + "learning_rate": 0.00015232632249840663, + "loss": 1.3676, + "step": 7170 + }, + { + "epoch": 0.015254857572639838, + "grad_norm": 0.4904688596725464, + "learning_rate": 0.0001525387720416401, + "loss": 1.3396, + "step": 7180 + }, + { + "epoch": 0.015276103892378892, + "grad_norm": 0.4566408395767212, + "learning_rate": 0.0001527512215848736, + "loss": 1.3534, + "step": 7190 + }, + { + "epoch": 0.015297350212117944, + "grad_norm": 0.4547022581100464, + "learning_rate": 0.00015296367112810707, + "loss": 1.3942, + "step": 7200 + }, + { + "epoch": 0.015318596531856998, + "grad_norm": 0.5369927883148193, + "learning_rate": 0.00015317612067134058, + "loss": 1.3702, + "step": 7210 + }, + { + "epoch": 0.01533984285159605, + "grad_norm": 0.566633403301239, + "learning_rate": 0.00015338857021457403, + "loss": 1.365, + "step": 7220 + }, + { + "epoch": 0.015361089171335102, + "grad_norm": 0.5580968260765076, + "learning_rate": 0.00015360101975780751, + "loss": 1.3594, + "step": 7230 + }, + { + "epoch": 0.015382335491074156, + "grad_norm": 0.48391491174697876, + "learning_rate": 0.00015381346930104102, + "loss": 1.34, + "step": 7240 + }, + { + "epoch": 0.015403581810813208, + "grad_norm": 0.6143089532852173, + "learning_rate": 0.00015402591884427447, + "loss": 1.3808, + "step": 7250 + }, + { + "epoch": 0.015424828130552262, + "grad_norm": 0.49317747354507446, + "learning_rate": 0.00015423836838750798, + "loss": 1.3604, + "step": 7260 + }, + { + "epoch": 0.015446074450291314, + "grad_norm": 0.4435961842536926, + "learning_rate": 0.00015445081793074146, + "loss": 1.3724, + "step": 7270 + }, + { + "epoch": 0.015467320770030366, + "grad_norm": 0.5114983916282654, + "learning_rate": 0.00015466326747397494, + "loss": 1.3104, + "step": 7280 + }, + { + "epoch": 0.01548856708976942, + "grad_norm": 0.5695301294326782, + "learning_rate": 0.00015487571701720842, + "loss": 1.3261, + "step": 7290 + }, + { + "epoch": 0.015509813409508471, + "grad_norm": 0.5319899320602417, + "learning_rate": 0.0001550881665604419, + "loss": 1.3487, + "step": 7300 + }, + { + "epoch": 0.015531059729247525, + "grad_norm": 0.566006064414978, + "learning_rate": 0.00015530061610367538, + "loss": 1.323, + "step": 7310 + }, + { + "epoch": 0.015552306048986577, + "grad_norm": 0.47193530201911926, + "learning_rate": 0.00015551306564690886, + "loss": 1.3368, + "step": 7320 + }, + { + "epoch": 0.015573552368725629, + "grad_norm": 0.40182584524154663, + "learning_rate": 0.00015572551519014237, + "loss": 1.3125, + "step": 7330 + }, + { + "epoch": 0.015594798688464683, + "grad_norm": 0.5195749402046204, + "learning_rate": 0.00015593796473337582, + "loss": 1.381, + "step": 7340 + }, + { + "epoch": 0.015616045008203735, + "grad_norm": 0.5150757431983948, + "learning_rate": 0.0001561504142766093, + "loss": 1.3646, + "step": 7350 + }, + { + "epoch": 0.01563729132794279, + "grad_norm": 0.5138119459152222, + "learning_rate": 0.0001563628638198428, + "loss": 1.372, + "step": 7360 + }, + { + "epoch": 0.015658537647681842, + "grad_norm": 0.4868418872356415, + "learning_rate": 0.00015657531336307626, + "loss": 1.3739, + "step": 7370 + }, + { + "epoch": 0.015679783967420893, + "grad_norm": 0.6214230060577393, + "learning_rate": 0.00015678776290630977, + "loss": 1.3628, + "step": 7380 + }, + { + "epoch": 0.015701030287159946, + "grad_norm": 0.516598105430603, + "learning_rate": 0.00015700021244954325, + "loss": 1.3166, + "step": 7390 + }, + { + "epoch": 0.015722276606899, + "grad_norm": 0.9041601419448853, + "learning_rate": 0.00015721266199277673, + "loss": 1.3606, + "step": 7400 + }, + { + "epoch": 0.01574352292663805, + "grad_norm": 0.49733829498291016, + "learning_rate": 0.0001574251115360102, + "loss": 1.35, + "step": 7410 + }, + { + "epoch": 0.015764769246377104, + "grad_norm": 0.4387696385383606, + "learning_rate": 0.0001576375610792437, + "loss": 1.3389, + "step": 7420 + }, + { + "epoch": 0.015786015566116158, + "grad_norm": 0.5174664855003357, + "learning_rate": 0.00015785001062247717, + "loss": 1.3771, + "step": 7430 + }, + { + "epoch": 0.015807261885855208, + "grad_norm": 0.5754226446151733, + "learning_rate": 0.00015806246016571065, + "loss": 1.3255, + "step": 7440 + }, + { + "epoch": 0.015828508205594262, + "grad_norm": 0.5487271547317505, + "learning_rate": 0.00015827490970894413, + "loss": 1.3628, + "step": 7450 + }, + { + "epoch": 0.015849754525333316, + "grad_norm": 0.5043888688087463, + "learning_rate": 0.0001584873592521776, + "loss": 1.3527, + "step": 7460 + }, + { + "epoch": 0.015871000845072366, + "grad_norm": 0.4383392333984375, + "learning_rate": 0.0001586998087954111, + "loss": 1.2985, + "step": 7470 + }, + { + "epoch": 0.01589224716481142, + "grad_norm": 0.4884001612663269, + "learning_rate": 0.00015891225833864457, + "loss": 1.364, + "step": 7480 + }, + { + "epoch": 0.015913493484550473, + "grad_norm": 0.5358350276947021, + "learning_rate": 0.00015912470788187805, + "loss": 1.3608, + "step": 7490 + }, + { + "epoch": 0.015934739804289527, + "grad_norm": 0.5625265836715698, + "learning_rate": 0.00015933715742511156, + "loss": 1.3521, + "step": 7500 + }, + { + "epoch": 0.015955986124028577, + "grad_norm": 0.4471549093723297, + "learning_rate": 0.000159549606968345, + "loss": 1.3421, + "step": 7510 + }, + { + "epoch": 0.01597723244376763, + "grad_norm": 0.4622962772846222, + "learning_rate": 0.00015976205651157852, + "loss": 1.4328, + "step": 7520 + }, + { + "epoch": 0.015998478763506685, + "grad_norm": 0.5225508809089661, + "learning_rate": 0.000159974506054812, + "loss": 1.363, + "step": 7530 + }, + { + "epoch": 0.016019725083245735, + "grad_norm": 0.4757327735424042, + "learning_rate": 0.00016018695559804545, + "loss": 1.3326, + "step": 7540 + }, + { + "epoch": 0.01604097140298479, + "grad_norm": 0.5699297189712524, + "learning_rate": 0.00016039940514127896, + "loss": 1.3454, + "step": 7550 + }, + { + "epoch": 0.016062217722723843, + "grad_norm": 0.47822776436805725, + "learning_rate": 0.00016061185468451244, + "loss": 1.384, + "step": 7560 + }, + { + "epoch": 0.016083464042462893, + "grad_norm": 0.5001635551452637, + "learning_rate": 0.00016082430422774592, + "loss": 1.3756, + "step": 7570 + }, + { + "epoch": 0.016104710362201947, + "grad_norm": 0.4569293260574341, + "learning_rate": 0.0001610367537709794, + "loss": 1.3481, + "step": 7580 + }, + { + "epoch": 0.016125956681941, + "grad_norm": 0.4637899696826935, + "learning_rate": 0.00016124920331421288, + "loss": 1.3193, + "step": 7590 + }, + { + "epoch": 0.016147203001680054, + "grad_norm": 0.8080773949623108, + "learning_rate": 0.00016146165285744636, + "loss": 1.3711, + "step": 7600 + }, + { + "epoch": 0.016168449321419105, + "grad_norm": 0.6997451782226562, + "learning_rate": 0.00016167410240067984, + "loss": 1.3582, + "step": 7610 + }, + { + "epoch": 0.01618969564115816, + "grad_norm": 0.5247102379798889, + "learning_rate": 0.00016188655194391335, + "loss": 1.2926, + "step": 7620 + }, + { + "epoch": 0.016210941960897212, + "grad_norm": 0.49925464391708374, + "learning_rate": 0.0001620990014871468, + "loss": 1.3333, + "step": 7630 + }, + { + "epoch": 0.016232188280636262, + "grad_norm": 0.4469118118286133, + "learning_rate": 0.0001623114510303803, + "loss": 1.3042, + "step": 7640 + }, + { + "epoch": 0.016253434600375316, + "grad_norm": 0.473599910736084, + "learning_rate": 0.00016252390057361379, + "loss": 1.3406, + "step": 7650 + }, + { + "epoch": 0.01627468092011437, + "grad_norm": 0.43392133712768555, + "learning_rate": 0.00016273635011684724, + "loss": 1.3353, + "step": 7660 + }, + { + "epoch": 0.01629592723985342, + "grad_norm": 0.43173784017562866, + "learning_rate": 0.00016294879966008075, + "loss": 1.2962, + "step": 7670 + }, + { + "epoch": 0.016317173559592474, + "grad_norm": 0.42490261793136597, + "learning_rate": 0.00016316124920331423, + "loss": 1.3402, + "step": 7680 + }, + { + "epoch": 0.016338419879331528, + "grad_norm": 0.4721096158027649, + "learning_rate": 0.0001633736987465477, + "loss": 1.3646, + "step": 7690 + }, + { + "epoch": 0.01635966619907058, + "grad_norm": 0.4871390461921692, + "learning_rate": 0.00016358614828978119, + "loss": 1.3384, + "step": 7700 + }, + { + "epoch": 0.01638091251880963, + "grad_norm": 0.5719059705734253, + "learning_rate": 0.00016379859783301467, + "loss": 1.3303, + "step": 7710 + }, + { + "epoch": 0.016402158838548685, + "grad_norm": 0.40087199211120605, + "learning_rate": 0.00016401104737624815, + "loss": 1.3494, + "step": 7720 + }, + { + "epoch": 0.01642340515828774, + "grad_norm": 0.45157358050346375, + "learning_rate": 0.00016422349691948163, + "loss": 1.4133, + "step": 7730 + }, + { + "epoch": 0.01644465147802679, + "grad_norm": 0.44293448328971863, + "learning_rate": 0.0001644359464627151, + "loss": 1.3394, + "step": 7740 + }, + { + "epoch": 0.016465897797765843, + "grad_norm": 0.5098276734352112, + "learning_rate": 0.0001646483960059486, + "loss": 1.3528, + "step": 7750 + }, + { + "epoch": 0.016487144117504897, + "grad_norm": 0.5244005918502808, + "learning_rate": 0.0001648608455491821, + "loss": 1.3376, + "step": 7760 + }, + { + "epoch": 0.016508390437243947, + "grad_norm": 0.5179757475852966, + "learning_rate": 0.00016507329509241555, + "loss": 1.339, + "step": 7770 + }, + { + "epoch": 0.016529636756983, + "grad_norm": 0.5716722011566162, + "learning_rate": 0.00016528574463564903, + "loss": 1.3285, + "step": 7780 + }, + { + "epoch": 0.016550883076722055, + "grad_norm": 0.5262454152107239, + "learning_rate": 0.00016549819417888253, + "loss": 1.3469, + "step": 7790 + }, + { + "epoch": 0.01657212939646111, + "grad_norm": 0.47885745763778687, + "learning_rate": 0.000165710643722116, + "loss": 1.3596, + "step": 7800 + }, + { + "epoch": 0.01659337571620016, + "grad_norm": 0.4280646741390228, + "learning_rate": 0.0001659230932653495, + "loss": 1.3402, + "step": 7810 + }, + { + "epoch": 0.016614622035939212, + "grad_norm": 0.5282872319221497, + "learning_rate": 0.00016613554280858297, + "loss": 1.3477, + "step": 7820 + }, + { + "epoch": 0.016635868355678266, + "grad_norm": 0.48838913440704346, + "learning_rate": 0.00016634799235181645, + "loss": 1.3361, + "step": 7830 + }, + { + "epoch": 0.016657114675417316, + "grad_norm": 0.4329339861869812, + "learning_rate": 0.00016656044189504993, + "loss": 1.3246, + "step": 7840 + }, + { + "epoch": 0.01667836099515637, + "grad_norm": 0.4790267050266266, + "learning_rate": 0.00016677289143828341, + "loss": 1.3633, + "step": 7850 + }, + { + "epoch": 0.016699607314895424, + "grad_norm": 0.4037834405899048, + "learning_rate": 0.0001669853409815169, + "loss": 1.3084, + "step": 7860 + }, + { + "epoch": 0.016720853634634474, + "grad_norm": 0.4516991078853607, + "learning_rate": 0.00016719779052475037, + "loss": 1.3634, + "step": 7870 + }, + { + "epoch": 0.016742099954373528, + "grad_norm": 0.5640581846237183, + "learning_rate": 0.00016741024006798388, + "loss": 1.3241, + "step": 7880 + }, + { + "epoch": 0.016763346274112582, + "grad_norm": 0.4675106704235077, + "learning_rate": 0.00016762268961121734, + "loss": 1.3152, + "step": 7890 + }, + { + "epoch": 0.016784592593851635, + "grad_norm": 0.4900490641593933, + "learning_rate": 0.00016783513915445082, + "loss": 1.3588, + "step": 7900 + }, + { + "epoch": 0.016805838913590686, + "grad_norm": 0.46857184171676636, + "learning_rate": 0.00016804758869768432, + "loss": 1.3311, + "step": 7910 + }, + { + "epoch": 0.01682708523332974, + "grad_norm": 0.4191713333129883, + "learning_rate": 0.00016826003824091778, + "loss": 1.3226, + "step": 7920 + }, + { + "epoch": 0.016848331553068793, + "grad_norm": 0.47432300448417664, + "learning_rate": 0.00016847248778415128, + "loss": 1.3542, + "step": 7930 + }, + { + "epoch": 0.016869577872807844, + "grad_norm": 0.6375020742416382, + "learning_rate": 0.00016868493732738476, + "loss": 1.3658, + "step": 7940 + }, + { + "epoch": 0.016890824192546897, + "grad_norm": 0.46469977498054504, + "learning_rate": 0.00016889738687061824, + "loss": 1.3707, + "step": 7950 + }, + { + "epoch": 0.01691207051228595, + "grad_norm": 0.5327292084693909, + "learning_rate": 0.00016910983641385172, + "loss": 1.3487, + "step": 7960 + }, + { + "epoch": 0.016933316832025, + "grad_norm": 0.4102821946144104, + "learning_rate": 0.0001693222859570852, + "loss": 1.3463, + "step": 7970 + }, + { + "epoch": 0.016954563151764055, + "grad_norm": 0.48191526532173157, + "learning_rate": 0.00016953473550031868, + "loss": 1.3225, + "step": 7980 + }, + { + "epoch": 0.01697580947150311, + "grad_norm": 0.4739604592323303, + "learning_rate": 0.00016974718504355216, + "loss": 1.3573, + "step": 7990 + }, + { + "epoch": 0.01699705579124216, + "grad_norm": 0.49535757303237915, + "learning_rate": 0.00016995963458678564, + "loss": 1.3442, + "step": 8000 + }, + { + "epoch": 0.017018302110981213, + "grad_norm": 0.6563238501548767, + "learning_rate": 0.00017017208413001912, + "loss": 1.32, + "step": 8010 + }, + { + "epoch": 0.017039548430720267, + "grad_norm": 1.0233174562454224, + "learning_rate": 0.0001703845336732526, + "loss": 1.3383, + "step": 8020 + }, + { + "epoch": 0.01706079475045932, + "grad_norm": 0.7053066492080688, + "learning_rate": 0.00017059698321648608, + "loss": 1.3087, + "step": 8030 + }, + { + "epoch": 0.01708204107019837, + "grad_norm": 0.5392405986785889, + "learning_rate": 0.00017080943275971956, + "loss": 1.358, + "step": 8040 + }, + { + "epoch": 0.017103287389937424, + "grad_norm": 0.37548479437828064, + "learning_rate": 0.00017102188230295307, + "loss": 1.3342, + "step": 8050 + }, + { + "epoch": 0.017124533709676478, + "grad_norm": 0.43353673815727234, + "learning_rate": 0.00017123433184618652, + "loss": 1.3456, + "step": 8060 + }, + { + "epoch": 0.01714578002941553, + "grad_norm": 0.4818825423717499, + "learning_rate": 0.00017144678138942003, + "loss": 1.3169, + "step": 8070 + }, + { + "epoch": 0.017167026349154582, + "grad_norm": 0.5311658382415771, + "learning_rate": 0.0001716592309326535, + "loss": 1.3459, + "step": 8080 + }, + { + "epoch": 0.017188272668893636, + "grad_norm": 0.39392268657684326, + "learning_rate": 0.00017187168047588696, + "loss": 1.3274, + "step": 8090 + }, + { + "epoch": 0.017209518988632686, + "grad_norm": 0.43905967473983765, + "learning_rate": 0.00017208413001912047, + "loss": 1.2963, + "step": 8100 + }, + { + "epoch": 0.01723076530837174, + "grad_norm": 0.46422114968299866, + "learning_rate": 0.00017229657956235395, + "loss": 1.3633, + "step": 8110 + }, + { + "epoch": 0.017252011628110794, + "grad_norm": 0.4653816819190979, + "learning_rate": 0.00017250902910558743, + "loss": 1.3465, + "step": 8120 + }, + { + "epoch": 0.017273257947849847, + "grad_norm": 0.48171886801719666, + "learning_rate": 0.0001727214786488209, + "loss": 1.3234, + "step": 8130 + }, + { + "epoch": 0.017294504267588898, + "grad_norm": 0.4245544672012329, + "learning_rate": 0.00017293392819205442, + "loss": 1.3749, + "step": 8140 + }, + { + "epoch": 0.01731575058732795, + "grad_norm": 0.5271256566047668, + "learning_rate": 0.00017314637773528787, + "loss": 1.3353, + "step": 8150 + }, + { + "epoch": 0.017336996907067005, + "grad_norm": 0.45146334171295166, + "learning_rate": 0.00017335882727852135, + "loss": 1.3528, + "step": 8160 + }, + { + "epoch": 0.017358243226806055, + "grad_norm": 0.51809161901474, + "learning_rate": 0.00017357127682175486, + "loss": 1.3517, + "step": 8170 + }, + { + "epoch": 0.01737948954654511, + "grad_norm": 0.6253212690353394, + "learning_rate": 0.0001737837263649883, + "loss": 1.3483, + "step": 8180 + }, + { + "epoch": 0.017400735866284163, + "grad_norm": 0.3974685072898865, + "learning_rate": 0.00017399617590822182, + "loss": 1.3512, + "step": 8190 + }, + { + "epoch": 0.017421982186023213, + "grad_norm": 0.4440295398235321, + "learning_rate": 0.0001742086254514553, + "loss": 1.3627, + "step": 8200 + }, + { + "epoch": 0.017443228505762267, + "grad_norm": 0.4281602203845978, + "learning_rate": 0.00017442107499468875, + "loss": 1.334, + "step": 8210 + }, + { + "epoch": 0.01746447482550132, + "grad_norm": 0.4027789235115051, + "learning_rate": 0.00017463352453792226, + "loss": 1.362, + "step": 8220 + }, + { + "epoch": 0.017485721145240374, + "grad_norm": 0.720975935459137, + "learning_rate": 0.00017484597408115574, + "loss": 1.3779, + "step": 8230 + }, + { + "epoch": 0.017506967464979425, + "grad_norm": 0.5159640312194824, + "learning_rate": 0.00017505842362438922, + "loss": 1.297, + "step": 8240 + }, + { + "epoch": 0.01752821378471848, + "grad_norm": 0.5373861789703369, + "learning_rate": 0.0001752708731676227, + "loss": 1.3434, + "step": 8250 + }, + { + "epoch": 0.017549460104457532, + "grad_norm": 0.5399473309516907, + "learning_rate": 0.00017548332271085618, + "loss": 1.3463, + "step": 8260 + }, + { + "epoch": 0.017570706424196583, + "grad_norm": 0.4104929566383362, + "learning_rate": 0.00017569577225408966, + "loss": 1.338, + "step": 8270 + }, + { + "epoch": 0.017591952743935636, + "grad_norm": 0.40214282274246216, + "learning_rate": 0.00017590822179732314, + "loss": 1.372, + "step": 8280 + }, + { + "epoch": 0.01761319906367469, + "grad_norm": 0.43704092502593994, + "learning_rate": 0.00017612067134055662, + "loss": 1.3176, + "step": 8290 + }, + { + "epoch": 0.01763444538341374, + "grad_norm": 0.5160467624664307, + "learning_rate": 0.0001763331208837901, + "loss": 1.3342, + "step": 8300 + }, + { + "epoch": 0.017655691703152794, + "grad_norm": 0.42854687571525574, + "learning_rate": 0.0001765455704270236, + "loss": 1.3309, + "step": 8310 + }, + { + "epoch": 0.017676938022891848, + "grad_norm": 0.43220067024230957, + "learning_rate": 0.00017675801997025706, + "loss": 1.3538, + "step": 8320 + }, + { + "epoch": 0.0176981843426309, + "grad_norm": 0.5256612300872803, + "learning_rate": 0.00017697046951349054, + "loss": 1.3547, + "step": 8330 + }, + { + "epoch": 0.017719430662369952, + "grad_norm": 0.4027842879295349, + "learning_rate": 0.00017718291905672405, + "loss": 1.3479, + "step": 8340 + }, + { + "epoch": 0.017740676982109006, + "grad_norm": 0.44559478759765625, + "learning_rate": 0.0001773953685999575, + "loss": 1.3515, + "step": 8350 + }, + { + "epoch": 0.01776192330184806, + "grad_norm": 0.5017858147621155, + "learning_rate": 0.000177607818143191, + "loss": 1.3474, + "step": 8360 + }, + { + "epoch": 0.01778316962158711, + "grad_norm": 0.5997343063354492, + "learning_rate": 0.0001778202676864245, + "loss": 1.3638, + "step": 8370 + }, + { + "epoch": 0.017804415941326163, + "grad_norm": 0.5511430501937866, + "learning_rate": 0.00017803271722965797, + "loss": 1.2972, + "step": 8380 + }, + { + "epoch": 0.017825662261065217, + "grad_norm": 0.39995935559272766, + "learning_rate": 0.00017824516677289145, + "loss": 1.3169, + "step": 8390 + }, + { + "epoch": 0.017846908580804267, + "grad_norm": 0.5575509667396545, + "learning_rate": 0.00017845761631612493, + "loss": 1.3595, + "step": 8400 + }, + { + "epoch": 0.01786815490054332, + "grad_norm": 0.39920955896377563, + "learning_rate": 0.0001786700658593584, + "loss": 1.3448, + "step": 8410 + }, + { + "epoch": 0.017889401220282375, + "grad_norm": 0.4858992397785187, + "learning_rate": 0.0001788825154025919, + "loss": 1.3515, + "step": 8420 + }, + { + "epoch": 0.01791064754002143, + "grad_norm": 0.45782899856567383, + "learning_rate": 0.0001790949649458254, + "loss": 1.3005, + "step": 8430 + }, + { + "epoch": 0.01793189385976048, + "grad_norm": 0.46270182728767395, + "learning_rate": 0.00017930741448905885, + "loss": 1.3226, + "step": 8440 + }, + { + "epoch": 0.017953140179499533, + "grad_norm": 0.4460267126560211, + "learning_rate": 0.00017951986403229233, + "loss": 1.3416, + "step": 8450 + }, + { + "epoch": 0.017974386499238586, + "grad_norm": 0.6100211143493652, + "learning_rate": 0.00017973231357552584, + "loss": 1.334, + "step": 8460 + }, + { + "epoch": 0.017995632818977637, + "grad_norm": 0.7315779328346252, + "learning_rate": 0.0001799447631187593, + "loss": 1.3009, + "step": 8470 + }, + { + "epoch": 0.01801687913871669, + "grad_norm": 0.46650680899620056, + "learning_rate": 0.0001801572126619928, + "loss": 1.2942, + "step": 8480 + }, + { + "epoch": 0.018038125458455744, + "grad_norm": 0.6037485599517822, + "learning_rate": 0.00018036966220522628, + "loss": 1.3264, + "step": 8490 + }, + { + "epoch": 0.018059371778194794, + "grad_norm": 0.6474299430847168, + "learning_rate": 0.00018058211174845976, + "loss": 1.3214, + "step": 8500 + }, + { + "epoch": 0.018080618097933848, + "grad_norm": 0.5461345911026001, + "learning_rate": 0.00018079456129169324, + "loss": 1.3261, + "step": 8510 + }, + { + "epoch": 0.018101864417672902, + "grad_norm": 0.5145887732505798, + "learning_rate": 0.00018100701083492672, + "loss": 1.3426, + "step": 8520 + }, + { + "epoch": 0.018123110737411952, + "grad_norm": 0.5647517442703247, + "learning_rate": 0.0001812194603781602, + "loss": 1.3598, + "step": 8530 + }, + { + "epoch": 0.018144357057151006, + "grad_norm": 0.3920421004295349, + "learning_rate": 0.00018143190992139368, + "loss": 1.345, + "step": 8540 + }, + { + "epoch": 0.01816560337689006, + "grad_norm": 0.49675852060317993, + "learning_rate": 0.00018164435946462716, + "loss": 1.3446, + "step": 8550 + }, + { + "epoch": 0.018186849696629113, + "grad_norm": 0.42556846141815186, + "learning_rate": 0.00018185680900786064, + "loss": 1.3016, + "step": 8560 + }, + { + "epoch": 0.018208096016368164, + "grad_norm": 0.3992804288864136, + "learning_rate": 0.00018206925855109412, + "loss": 1.3562, + "step": 8570 + }, + { + "epoch": 0.018229342336107218, + "grad_norm": 0.4302438199520111, + "learning_rate": 0.0001822817080943276, + "loss": 1.3223, + "step": 8580 + }, + { + "epoch": 0.01825058865584627, + "grad_norm": 0.3920704126358032, + "learning_rate": 0.00018249415763756108, + "loss": 1.3447, + "step": 8590 + }, + { + "epoch": 0.01827183497558532, + "grad_norm": 0.39971745014190674, + "learning_rate": 0.00018270660718079458, + "loss": 1.3461, + "step": 8600 + }, + { + "epoch": 0.018293081295324375, + "grad_norm": 0.3880845904350281, + "learning_rate": 0.00018291905672402804, + "loss": 1.289, + "step": 8610 + }, + { + "epoch": 0.01831432761506343, + "grad_norm": 0.44876378774642944, + "learning_rate": 0.00018313150626726154, + "loss": 1.293, + "step": 8620 + }, + { + "epoch": 0.01833557393480248, + "grad_norm": 0.49290233850479126, + "learning_rate": 0.00018334395581049502, + "loss": 1.3129, + "step": 8630 + }, + { + "epoch": 0.018356820254541533, + "grad_norm": 0.43164414167404175, + "learning_rate": 0.00018355640535372848, + "loss": 1.3074, + "step": 8640 + }, + { + "epoch": 0.018378066574280587, + "grad_norm": 0.5284647345542908, + "learning_rate": 0.00018376885489696198, + "loss": 1.3278, + "step": 8650 + }, + { + "epoch": 0.01839931289401964, + "grad_norm": 0.5126187801361084, + "learning_rate": 0.00018398130444019546, + "loss": 1.3094, + "step": 8660 + }, + { + "epoch": 0.01842055921375869, + "grad_norm": 0.503959059715271, + "learning_rate": 0.00018419375398342894, + "loss": 1.3166, + "step": 8670 + }, + { + "epoch": 0.018441805533497745, + "grad_norm": 0.3981548845767975, + "learning_rate": 0.00018440620352666242, + "loss": 1.3162, + "step": 8680 + }, + { + "epoch": 0.0184630518532368, + "grad_norm": 0.5236872434616089, + "learning_rate": 0.00018461865306989593, + "loss": 1.3722, + "step": 8690 + }, + { + "epoch": 0.01848429817297585, + "grad_norm": 0.35994836688041687, + "learning_rate": 0.00018483110261312938, + "loss": 1.3186, + "step": 8700 + }, + { + "epoch": 0.018505544492714902, + "grad_norm": 0.4705830216407776, + "learning_rate": 0.00018504355215636286, + "loss": 1.3287, + "step": 8710 + }, + { + "epoch": 0.018526790812453956, + "grad_norm": 0.4087369740009308, + "learning_rate": 0.00018525600169959637, + "loss": 1.356, + "step": 8720 + }, + { + "epoch": 0.018548037132193006, + "grad_norm": 0.3820595145225525, + "learning_rate": 0.00018546845124282983, + "loss": 1.3202, + "step": 8730 + }, + { + "epoch": 0.01856928345193206, + "grad_norm": 0.40614980459213257, + "learning_rate": 0.00018568090078606333, + "loss": 1.2996, + "step": 8740 + }, + { + "epoch": 0.018590529771671114, + "grad_norm": 0.48920807242393494, + "learning_rate": 0.0001858933503292968, + "loss": 1.3259, + "step": 8750 + }, + { + "epoch": 0.018611776091410168, + "grad_norm": 0.4946304261684418, + "learning_rate": 0.00018610579987253027, + "loss": 1.3038, + "step": 8760 + }, + { + "epoch": 0.018633022411149218, + "grad_norm": 0.4733341634273529, + "learning_rate": 0.00018631824941576377, + "loss": 1.2884, + "step": 8770 + }, + { + "epoch": 0.01865426873088827, + "grad_norm": 0.40089792013168335, + "learning_rate": 0.00018653069895899725, + "loss": 1.3709, + "step": 8780 + }, + { + "epoch": 0.018675515050627325, + "grad_norm": 0.4330296218395233, + "learning_rate": 0.00018674314850223073, + "loss": 1.3383, + "step": 8790 + }, + { + "epoch": 0.018696761370366376, + "grad_norm": 0.48306480050086975, + "learning_rate": 0.0001869555980454642, + "loss": 1.365, + "step": 8800 + }, + { + "epoch": 0.01871800769010543, + "grad_norm": 0.46313342452049255, + "learning_rate": 0.0001871680475886977, + "loss": 1.3439, + "step": 8810 + }, + { + "epoch": 0.018739254009844483, + "grad_norm": 0.4928863048553467, + "learning_rate": 0.00018738049713193117, + "loss": 1.3179, + "step": 8820 + }, + { + "epoch": 0.018760500329583533, + "grad_norm": 0.3958575129508972, + "learning_rate": 0.00018759294667516465, + "loss": 1.3639, + "step": 8830 + }, + { + "epoch": 0.018781746649322587, + "grad_norm": 0.3737720549106598, + "learning_rate": 0.00018780539621839813, + "loss": 1.3142, + "step": 8840 + }, + { + "epoch": 0.01880299296906164, + "grad_norm": 0.39473819732666016, + "learning_rate": 0.0001880178457616316, + "loss": 1.3129, + "step": 8850 + }, + { + "epoch": 0.018824239288800695, + "grad_norm": 0.40131810307502747, + "learning_rate": 0.00018823029530486512, + "loss": 1.3215, + "step": 8860 + }, + { + "epoch": 0.018845485608539745, + "grad_norm": 0.38204288482666016, + "learning_rate": 0.00018844274484809857, + "loss": 1.3288, + "step": 8870 + }, + { + "epoch": 0.0188667319282788, + "grad_norm": 0.3883414566516876, + "learning_rate": 0.00018865519439133205, + "loss": 1.3253, + "step": 8880 + }, + { + "epoch": 0.018887978248017852, + "grad_norm": 0.42854729294776917, + "learning_rate": 0.00018886764393456556, + "loss": 1.2735, + "step": 8890 + }, + { + "epoch": 0.018909224567756903, + "grad_norm": 0.41609522700309753, + "learning_rate": 0.00018908009347779901, + "loss": 1.3498, + "step": 8900 + }, + { + "epoch": 0.018930470887495957, + "grad_norm": 0.4271881878376007, + "learning_rate": 0.00018929254302103252, + "loss": 1.3343, + "step": 8910 + }, + { + "epoch": 0.01895171720723501, + "grad_norm": 0.5738940238952637, + "learning_rate": 0.000189504992564266, + "loss": 1.3235, + "step": 8920 + }, + { + "epoch": 0.01897296352697406, + "grad_norm": 0.46246758103370667, + "learning_rate": 0.00018971744210749948, + "loss": 1.3166, + "step": 8930 + }, + { + "epoch": 0.018994209846713114, + "grad_norm": 0.3694075047969818, + "learning_rate": 0.00018992989165073296, + "loss": 1.3054, + "step": 8940 + }, + { + "epoch": 0.019015456166452168, + "grad_norm": 0.5443398356437683, + "learning_rate": 0.00019014234119396644, + "loss": 1.3385, + "step": 8950 + }, + { + "epoch": 0.019036702486191222, + "grad_norm": 0.46073266863822937, + "learning_rate": 0.00019035479073719992, + "loss": 1.3508, + "step": 8960 + }, + { + "epoch": 0.019057948805930272, + "grad_norm": 0.43779510259628296, + "learning_rate": 0.0001905672402804334, + "loss": 1.3177, + "step": 8970 + }, + { + "epoch": 0.019079195125669326, + "grad_norm": 0.425118625164032, + "learning_rate": 0.0001907796898236669, + "loss": 1.3228, + "step": 8980 + }, + { + "epoch": 0.01910044144540838, + "grad_norm": 0.38012003898620605, + "learning_rate": 0.00019099213936690036, + "loss": 1.3063, + "step": 8990 + }, + { + "epoch": 0.01912168776514743, + "grad_norm": 0.5601592063903809, + "learning_rate": 0.00019120458891013384, + "loss": 1.3335, + "step": 9000 + }, + { + "epoch": 0.019142934084886484, + "grad_norm": 0.40178459882736206, + "learning_rate": 0.00019141703845336735, + "loss": 1.3308, + "step": 9010 + }, + { + "epoch": 0.019164180404625537, + "grad_norm": 0.42838606238365173, + "learning_rate": 0.0001916294879966008, + "loss": 1.307, + "step": 9020 + }, + { + "epoch": 0.019185426724364588, + "grad_norm": 0.38543522357940674, + "learning_rate": 0.0001918419375398343, + "loss": 1.3195, + "step": 9030 + }, + { + "epoch": 0.01920667304410364, + "grad_norm": 0.39618176221847534, + "learning_rate": 0.0001920543870830678, + "loss": 1.3233, + "step": 9040 + }, + { + "epoch": 0.019227919363842695, + "grad_norm": 0.44110575318336487, + "learning_rate": 0.00019226683662630127, + "loss": 1.3146, + "step": 9050 + }, + { + "epoch": 0.019249165683581745, + "grad_norm": 0.5189473032951355, + "learning_rate": 0.00019247928616953475, + "loss": 1.2799, + "step": 9060 + }, + { + "epoch": 0.0192704120033208, + "grad_norm": 0.5721403360366821, + "learning_rate": 0.00019269173571276823, + "loss": 1.3195, + "step": 9070 + }, + { + "epoch": 0.019291658323059853, + "grad_norm": 0.464632123708725, + "learning_rate": 0.0001929041852560017, + "loss": 1.3258, + "step": 9080 + }, + { + "epoch": 0.019312904642798907, + "grad_norm": 0.5132027268409729, + "learning_rate": 0.0001931166347992352, + "loss": 1.3476, + "step": 9090 + }, + { + "epoch": 0.019334150962537957, + "grad_norm": 0.5532477498054504, + "learning_rate": 0.00019332908434246867, + "loss": 1.3762, + "step": 9100 + }, + { + "epoch": 0.01935539728227701, + "grad_norm": 0.7433248162269592, + "learning_rate": 0.00019354153388570215, + "loss": 1.3555, + "step": 9110 + }, + { + "epoch": 0.019376643602016064, + "grad_norm": 0.4303207993507385, + "learning_rate": 0.00019375398342893566, + "loss": 1.3624, + "step": 9120 + }, + { + "epoch": 0.019397889921755115, + "grad_norm": 0.3773690462112427, + "learning_rate": 0.0001939664329721691, + "loss": 1.2941, + "step": 9130 + }, + { + "epoch": 0.01941913624149417, + "grad_norm": 0.3837755620479584, + "learning_rate": 0.0001941788825154026, + "loss": 1.3238, + "step": 9140 + }, + { + "epoch": 0.019440382561233222, + "grad_norm": 0.38324686884880066, + "learning_rate": 0.0001943913320586361, + "loss": 1.3129, + "step": 9150 + }, + { + "epoch": 0.019461628880972272, + "grad_norm": 0.44700583815574646, + "learning_rate": 0.00019460378160186955, + "loss": 1.2652, + "step": 9160 + }, + { + "epoch": 0.019482875200711326, + "grad_norm": 0.3845692574977875, + "learning_rate": 0.00019481623114510306, + "loss": 1.307, + "step": 9170 + }, + { + "epoch": 0.01950412152045038, + "grad_norm": 0.5040942430496216, + "learning_rate": 0.00019502868068833654, + "loss": 1.311, + "step": 9180 + }, + { + "epoch": 0.019525367840189434, + "grad_norm": 0.4517185091972351, + "learning_rate": 0.00019524113023157, + "loss": 1.2885, + "step": 9190 + }, + { + "epoch": 0.019546614159928484, + "grad_norm": 0.4060044586658478, + "learning_rate": 0.0001954535797748035, + "loss": 1.3066, + "step": 9200 + }, + { + "epoch": 0.019567860479667538, + "grad_norm": 0.4376065135002136, + "learning_rate": 0.00019566602931803698, + "loss": 1.3678, + "step": 9210 + }, + { + "epoch": 0.01958910679940659, + "grad_norm": 0.49438655376434326, + "learning_rate": 0.00019587847886127046, + "loss": 1.3124, + "step": 9220 + }, + { + "epoch": 0.019610353119145642, + "grad_norm": 0.5151435732841492, + "learning_rate": 0.00019609092840450394, + "loss": 1.3352, + "step": 9230 + }, + { + "epoch": 0.019631599438884696, + "grad_norm": 0.46144798398017883, + "learning_rate": 0.00019630337794773745, + "loss": 1.3342, + "step": 9240 + }, + { + "epoch": 0.01965284575862375, + "grad_norm": 0.6166277527809143, + "learning_rate": 0.0001965158274909709, + "loss": 1.3091, + "step": 9250 + }, + { + "epoch": 0.0196740920783628, + "grad_norm": 0.4483606815338135, + "learning_rate": 0.00019672827703420438, + "loss": 1.3118, + "step": 9260 + }, + { + "epoch": 0.019695338398101853, + "grad_norm": 0.48402494192123413, + "learning_rate": 0.00019694072657743789, + "loss": 1.371, + "step": 9270 + }, + { + "epoch": 0.019716584717840907, + "grad_norm": 0.382341593503952, + "learning_rate": 0.00019715317612067134, + "loss": 1.3529, + "step": 9280 + }, + { + "epoch": 0.01973783103757996, + "grad_norm": 0.4788714647293091, + "learning_rate": 0.00019736562566390485, + "loss": 1.3471, + "step": 9290 + }, + { + "epoch": 0.01975907735731901, + "grad_norm": 0.39735954999923706, + "learning_rate": 0.00019757807520713833, + "loss": 1.324, + "step": 9300 + }, + { + "epoch": 0.019780323677058065, + "grad_norm": 0.38381752371788025, + "learning_rate": 0.00019779052475037178, + "loss": 1.3504, + "step": 9310 + }, + { + "epoch": 0.01980156999679712, + "grad_norm": 0.3590700030326843, + "learning_rate": 0.00019800297429360529, + "loss": 1.3068, + "step": 9320 + }, + { + "epoch": 0.01982281631653617, + "grad_norm": 0.3928352892398834, + "learning_rate": 0.00019821542383683877, + "loss": 1.3376, + "step": 9330 + }, + { + "epoch": 0.019844062636275223, + "grad_norm": 0.4324212968349457, + "learning_rate": 0.00019842787338007225, + "loss": 1.2994, + "step": 9340 + }, + { + "epoch": 0.019865308956014276, + "grad_norm": 0.44516658782958984, + "learning_rate": 0.00019864032292330573, + "loss": 1.3074, + "step": 9350 + }, + { + "epoch": 0.019886555275753327, + "grad_norm": 0.4035835862159729, + "learning_rate": 0.0001988527724665392, + "loss": 1.3074, + "step": 9360 + }, + { + "epoch": 0.01990780159549238, + "grad_norm": 0.3958171308040619, + "learning_rate": 0.00019906522200977269, + "loss": 1.3205, + "step": 9370 + }, + { + "epoch": 0.019929047915231434, + "grad_norm": 0.4585682451725006, + "learning_rate": 0.00019927767155300617, + "loss": 1.3011, + "step": 9380 + }, + { + "epoch": 0.019950294234970488, + "grad_norm": 0.4138244688510895, + "learning_rate": 0.00019949012109623965, + "loss": 1.3277, + "step": 9390 + }, + { + "epoch": 0.019971540554709538, + "grad_norm": 0.5346614122390747, + "learning_rate": 0.00019970257063947313, + "loss": 1.3293, + "step": 9400 + }, + { + "epoch": 0.019992786874448592, + "grad_norm": 0.4047664701938629, + "learning_rate": 0.00019991502018270663, + "loss": 1.3313, + "step": 9410 + }, + { + "epoch": 0.020014033194187646, + "grad_norm": 0.4478345513343811, + "learning_rate": 0.0001999999999164993, + "loss": 1.3325, + "step": 9420 + }, + { + "epoch": 0.020035279513926696, + "grad_norm": 0.4331769049167633, + "learning_rate": 0.0001999999994062173, + "loss": 1.3331, + "step": 9430 + }, + { + "epoch": 0.02005652583366575, + "grad_norm": 0.43624380230903625, + "learning_rate": 0.00019999999843204255, + "loss": 1.3262, + "step": 9440 + }, + { + "epoch": 0.020077772153404803, + "grad_norm": 0.37638309597969055, + "learning_rate": 0.0001999999969939751, + "loss": 1.3146, + "step": 9450 + }, + { + "epoch": 0.020099018473143854, + "grad_norm": 0.37836524844169617, + "learning_rate": 0.0001999999950920149, + "loss": 1.2824, + "step": 9460 + }, + { + "epoch": 0.020120264792882907, + "grad_norm": 0.3898759186267853, + "learning_rate": 0.000199999992726162, + "loss": 1.3555, + "step": 9470 + }, + { + "epoch": 0.02014151111262196, + "grad_norm": 0.4096510410308838, + "learning_rate": 0.0001999999898964164, + "loss": 1.3114, + "step": 9480 + }, + { + "epoch": 0.020162757432361015, + "grad_norm": 0.37356212735176086, + "learning_rate": 0.0001999999866027781, + "loss": 1.3157, + "step": 9490 + }, + { + "epoch": 0.020184003752100065, + "grad_norm": 0.38895922899246216, + "learning_rate": 0.00019999998284524716, + "loss": 1.3205, + "step": 9500 + }, + { + "epoch": 0.02020525007183912, + "grad_norm": 0.431063175201416, + "learning_rate": 0.00019999997862382355, + "loss": 1.348, + "step": 9510 + }, + { + "epoch": 0.020226496391578173, + "grad_norm": 0.3938753008842468, + "learning_rate": 0.00019999997393850726, + "loss": 1.3087, + "step": 9520 + }, + { + "epoch": 0.020247742711317223, + "grad_norm": 0.4594714641571045, + "learning_rate": 0.0001999999687892984, + "loss": 1.2918, + "step": 9530 + }, + { + "epoch": 0.020268989031056277, + "grad_norm": 0.4649589955806732, + "learning_rate": 0.00019999996317619694, + "loss": 1.3071, + "step": 9540 + }, + { + "epoch": 0.02029023535079533, + "grad_norm": 0.3947903513908386, + "learning_rate": 0.00019999995709920292, + "loss": 1.3095, + "step": 9550 + }, + { + "epoch": 0.02031148167053438, + "grad_norm": 0.4237905442714691, + "learning_rate": 0.00019999995055831633, + "loss": 1.3152, + "step": 9560 + }, + { + "epoch": 0.020332727990273435, + "grad_norm": 0.41066765785217285, + "learning_rate": 0.00019999994355353727, + "loss": 1.3063, + "step": 9570 + }, + { + "epoch": 0.02035397431001249, + "grad_norm": 0.4671986401081085, + "learning_rate": 0.00019999993608486573, + "loss": 1.3069, + "step": 9580 + }, + { + "epoch": 0.02037522062975154, + "grad_norm": 0.37012219429016113, + "learning_rate": 0.00019999992815230175, + "loss": 1.3455, + "step": 9590 + }, + { + "epoch": 0.020396466949490592, + "grad_norm": 0.4353351294994354, + "learning_rate": 0.00019999991975584535, + "loss": 1.3793, + "step": 9600 + }, + { + "epoch": 0.020417713269229646, + "grad_norm": 0.4162021279335022, + "learning_rate": 0.00019999991089549664, + "loss": 1.2932, + "step": 9610 + }, + { + "epoch": 0.0204389595889687, + "grad_norm": 0.3823590576648712, + "learning_rate": 0.00019999990157125555, + "loss": 1.3545, + "step": 9620 + }, + { + "epoch": 0.02046020590870775, + "grad_norm": 0.3807418942451477, + "learning_rate": 0.0001999998917831222, + "loss": 1.3396, + "step": 9630 + }, + { + "epoch": 0.020481452228446804, + "grad_norm": 0.38454094529151917, + "learning_rate": 0.00019999988153109662, + "loss": 1.291, + "step": 9640 + }, + { + "epoch": 0.020502698548185858, + "grad_norm": 0.5412732362747192, + "learning_rate": 0.00019999987081517886, + "loss": 1.3703, + "step": 9650 + }, + { + "epoch": 0.020523944867924908, + "grad_norm": 0.6448853015899658, + "learning_rate": 0.00019999985963536896, + "loss": 1.3749, + "step": 9660 + }, + { + "epoch": 0.02054519118766396, + "grad_norm": 0.5143558979034424, + "learning_rate": 0.00019999984799166698, + "loss": 1.2906, + "step": 9670 + }, + { + "epoch": 0.020566437507403015, + "grad_norm": 0.40538492798805237, + "learning_rate": 0.00019999983588407297, + "loss": 1.3619, + "step": 9680 + }, + { + "epoch": 0.020587683827142066, + "grad_norm": 0.3787655830383301, + "learning_rate": 0.00019999982331258696, + "loss": 1.2925, + "step": 9690 + }, + { + "epoch": 0.02060893014688112, + "grad_norm": 0.36879733204841614, + "learning_rate": 0.00019999981027720907, + "loss": 1.3554, + "step": 9700 + }, + { + "epoch": 0.020630176466620173, + "grad_norm": 0.41508162021636963, + "learning_rate": 0.00019999979677793933, + "loss": 1.2902, + "step": 9710 + }, + { + "epoch": 0.020651422786359227, + "grad_norm": 0.38242247700691223, + "learning_rate": 0.00019999978281477776, + "loss": 1.3478, + "step": 9720 + }, + { + "epoch": 0.020672669106098277, + "grad_norm": 0.3563601076602936, + "learning_rate": 0.00019999976838772448, + "loss": 1.2774, + "step": 9730 + }, + { + "epoch": 0.02069391542583733, + "grad_norm": 0.4097003638744354, + "learning_rate": 0.00019999975349677953, + "loss": 1.3209, + "step": 9740 + }, + { + "epoch": 0.020715161745576385, + "grad_norm": 0.3516746461391449, + "learning_rate": 0.000199999738141943, + "loss": 1.3416, + "step": 9750 + }, + { + "epoch": 0.020736408065315435, + "grad_norm": 0.402161568403244, + "learning_rate": 0.00019999972232321493, + "loss": 1.3393, + "step": 9760 + }, + { + "epoch": 0.02075765438505449, + "grad_norm": 0.3882290720939636, + "learning_rate": 0.00019999970604059545, + "loss": 1.3257, + "step": 9770 + }, + { + "epoch": 0.020778900704793542, + "grad_norm": 0.3787508010864258, + "learning_rate": 0.00019999968929408456, + "loss": 1.3, + "step": 9780 + }, + { + "epoch": 0.020800147024532593, + "grad_norm": 0.4051687717437744, + "learning_rate": 0.0001999996720836824, + "loss": 1.3069, + "step": 9790 + }, + { + "epoch": 0.020821393344271646, + "grad_norm": 0.48272427916526794, + "learning_rate": 0.00019999965440938903, + "loss": 1.3229, + "step": 9800 + }, + { + "epoch": 0.0208426396640107, + "grad_norm": 0.41763049364089966, + "learning_rate": 0.00019999963627120453, + "loss": 1.2824, + "step": 9810 + }, + { + "epoch": 0.020863885983749754, + "grad_norm": 0.4362330436706543, + "learning_rate": 0.00019999961766912895, + "loss": 1.3082, + "step": 9820 + }, + { + "epoch": 0.020885132303488804, + "grad_norm": 0.3657620847225189, + "learning_rate": 0.00019999959860316242, + "loss": 1.2987, + "step": 9830 + }, + { + "epoch": 0.020906378623227858, + "grad_norm": 0.39101263880729675, + "learning_rate": 0.00019999957907330503, + "loss": 1.3169, + "step": 9840 + }, + { + "epoch": 0.02092762494296691, + "grad_norm": 0.39599308371543884, + "learning_rate": 0.00019999955907955687, + "loss": 1.2885, + "step": 9850 + }, + { + "epoch": 0.020948871262705962, + "grad_norm": 0.39984455704689026, + "learning_rate": 0.000199999538621918, + "loss": 1.3512, + "step": 9860 + }, + { + "epoch": 0.020970117582445016, + "grad_norm": 0.34481069445610046, + "learning_rate": 0.00019999951770038852, + "loss": 1.2807, + "step": 9870 + }, + { + "epoch": 0.02099136390218407, + "grad_norm": 0.42392948269844055, + "learning_rate": 0.00019999949631496858, + "loss": 1.3133, + "step": 9880 + }, + { + "epoch": 0.02101261022192312, + "grad_norm": 0.3671132028102875, + "learning_rate": 0.00019999947446565822, + "loss": 1.3293, + "step": 9890 + }, + { + "epoch": 0.021033856541662174, + "grad_norm": 0.3772485554218292, + "learning_rate": 0.00019999945215245756, + "loss": 1.3192, + "step": 9900 + }, + { + "epoch": 0.021055102861401227, + "grad_norm": 0.399903267621994, + "learning_rate": 0.00019999942937536672, + "loss": 1.3253, + "step": 9910 + }, + { + "epoch": 0.02107634918114028, + "grad_norm": 0.46043267846107483, + "learning_rate": 0.00019999940613438577, + "loss": 1.2865, + "step": 9920 + }, + { + "epoch": 0.02109759550087933, + "grad_norm": 0.4129765033721924, + "learning_rate": 0.00019999938242951486, + "loss": 1.3293, + "step": 9930 + }, + { + "epoch": 0.021118841820618385, + "grad_norm": 0.431996613740921, + "learning_rate": 0.00019999935826075406, + "loss": 1.3015, + "step": 9940 + }, + { + "epoch": 0.02114008814035744, + "grad_norm": 0.468212753534317, + "learning_rate": 0.0001999993336281035, + "loss": 1.2754, + "step": 9950 + }, + { + "epoch": 0.02116133446009649, + "grad_norm": 0.3797603249549866, + "learning_rate": 0.00019999930853156333, + "loss": 1.306, + "step": 9960 + }, + { + "epoch": 0.021182580779835543, + "grad_norm": 0.3776324689388275, + "learning_rate": 0.0001999992829711336, + "loss": 1.3028, + "step": 9970 + }, + { + "epoch": 0.021203827099574597, + "grad_norm": 0.3791152238845825, + "learning_rate": 0.00019999925694681446, + "loss": 1.2961, + "step": 9980 + }, + { + "epoch": 0.021225073419313647, + "grad_norm": 0.3964308202266693, + "learning_rate": 0.00019999923045860608, + "loss": 1.3466, + "step": 9990 + }, + { + "epoch": 0.0212463197390527, + "grad_norm": 0.4859187602996826, + "learning_rate": 0.0001999992035065085, + "loss": 1.3467, + "step": 10000 + }, + { + "epoch": 0.021267566058791754, + "grad_norm": 0.35371866822242737, + "learning_rate": 0.00019999917609052185, + "loss": 1.3365, + "step": 10010 + }, + { + "epoch": 0.021288812378530808, + "grad_norm": 0.44722500443458557, + "learning_rate": 0.00019999914821064633, + "loss": 1.3725, + "step": 10020 + }, + { + "epoch": 0.02131005869826986, + "grad_norm": 0.445234477519989, + "learning_rate": 0.000199999119866882, + "loss": 1.3463, + "step": 10030 + }, + { + "epoch": 0.021331305018008912, + "grad_norm": 0.44929665327072144, + "learning_rate": 0.00019999909105922903, + "loss": 1.3151, + "step": 10040 + }, + { + "epoch": 0.021352551337747966, + "grad_norm": 0.4045484662055969, + "learning_rate": 0.00019999906178768754, + "loss": 1.3318, + "step": 10050 + }, + { + "epoch": 0.021373797657487016, + "grad_norm": 0.34963032603263855, + "learning_rate": 0.00019999903205225768, + "loss": 1.275, + "step": 10060 + }, + { + "epoch": 0.02139504397722607, + "grad_norm": 0.4751238524913788, + "learning_rate": 0.00019999900185293954, + "loss": 1.3024, + "step": 10070 + }, + { + "epoch": 0.021416290296965124, + "grad_norm": 0.4349416196346283, + "learning_rate": 0.00019999897118973335, + "loss": 1.3265, + "step": 10080 + }, + { + "epoch": 0.021437536616704174, + "grad_norm": 0.39586910605430603, + "learning_rate": 0.00019999894006263915, + "loss": 1.3228, + "step": 10090 + }, + { + "epoch": 0.021458782936443228, + "grad_norm": 0.3918954133987427, + "learning_rate": 0.00019999890847165715, + "loss": 1.3573, + "step": 10100 + }, + { + "epoch": 0.02148002925618228, + "grad_norm": 0.38766899704933167, + "learning_rate": 0.00019999887641678746, + "loss": 1.3008, + "step": 10110 + }, + { + "epoch": 0.02150127557592133, + "grad_norm": 0.4316602051258087, + "learning_rate": 0.00019999884389803026, + "loss": 1.3381, + "step": 10120 + }, + { + "epoch": 0.021522521895660385, + "grad_norm": 0.35322603583335876, + "learning_rate": 0.00019999881091538568, + "loss": 1.2971, + "step": 10130 + }, + { + "epoch": 0.02154376821539944, + "grad_norm": 0.5310440063476562, + "learning_rate": 0.00019999877746885392, + "loss": 1.3423, + "step": 10140 + }, + { + "epoch": 0.021565014535138493, + "grad_norm": 0.4586990475654602, + "learning_rate": 0.00019999874355843504, + "loss": 1.3187, + "step": 10150 + }, + { + "epoch": 0.021586260854877543, + "grad_norm": 0.5276186466217041, + "learning_rate": 0.0001999987091841293, + "loss": 1.2623, + "step": 10160 + }, + { + "epoch": 0.021607507174616597, + "grad_norm": 0.48670312762260437, + "learning_rate": 0.0001999986743459368, + "loss": 1.276, + "step": 10170 + }, + { + "epoch": 0.02162875349435565, + "grad_norm": 0.4462718665599823, + "learning_rate": 0.00019999863904385769, + "loss": 1.288, + "step": 10180 + }, + { + "epoch": 0.0216499998140947, + "grad_norm": 0.3735065758228302, + "learning_rate": 0.00019999860327789217, + "loss": 1.2881, + "step": 10190 + }, + { + "epoch": 0.021671246133833755, + "grad_norm": 0.35293012857437134, + "learning_rate": 0.0001999985670480404, + "loss": 1.3237, + "step": 10200 + }, + { + "epoch": 0.02169249245357281, + "grad_norm": 0.35089191794395447, + "learning_rate": 0.00019999853035430253, + "loss": 1.2806, + "step": 10210 + }, + { + "epoch": 0.02171373877331186, + "grad_norm": 0.4377574026584625, + "learning_rate": 0.0001999984931966788, + "loss": 1.2985, + "step": 10220 + }, + { + "epoch": 0.021734985093050913, + "grad_norm": 0.3661186993122101, + "learning_rate": 0.00019999845557516923, + "loss": 1.3152, + "step": 10230 + }, + { + "epoch": 0.021756231412789966, + "grad_norm": 0.3618735373020172, + "learning_rate": 0.00019999841748977415, + "loss": 1.3282, + "step": 10240 + }, + { + "epoch": 0.02177747773252902, + "grad_norm": 0.38630446791648865, + "learning_rate": 0.00019999837894049368, + "loss": 1.3203, + "step": 10250 + }, + { + "epoch": 0.02179872405226807, + "grad_norm": 0.5389037728309631, + "learning_rate": 0.00019999833992732796, + "loss": 1.2925, + "step": 10260 + }, + { + "epoch": 0.021819970372007124, + "grad_norm": 0.46082013845443726, + "learning_rate": 0.00019999830045027724, + "loss": 1.2959, + "step": 10270 + }, + { + "epoch": 0.021841216691746178, + "grad_norm": 0.3740817606449127, + "learning_rate": 0.00019999826050934164, + "loss": 1.3261, + "step": 10280 + }, + { + "epoch": 0.021862463011485228, + "grad_norm": 0.37227267026901245, + "learning_rate": 0.0001999982201045214, + "loss": 1.3015, + "step": 10290 + }, + { + "epoch": 0.021883709331224282, + "grad_norm": 0.4817439615726471, + "learning_rate": 0.00019999817923581665, + "loss": 1.3206, + "step": 10300 + }, + { + "epoch": 0.021904955650963336, + "grad_norm": 0.5341250896453857, + "learning_rate": 0.00019999813790322763, + "loss": 1.3095, + "step": 10310 + }, + { + "epoch": 0.021926201970702386, + "grad_norm": 0.41697561740875244, + "learning_rate": 0.0001999980961067545, + "loss": 1.3234, + "step": 10320 + }, + { + "epoch": 0.02194744829044144, + "grad_norm": 0.42047399282455444, + "learning_rate": 0.00019999805384639748, + "loss": 1.3316, + "step": 10330 + }, + { + "epoch": 0.021968694610180493, + "grad_norm": 0.40889638662338257, + "learning_rate": 0.00019999801112215675, + "loss": 1.3189, + "step": 10340 + }, + { + "epoch": 0.021989940929919547, + "grad_norm": 0.4185725450515747, + "learning_rate": 0.0001999979679340325, + "loss": 1.3352, + "step": 10350 + }, + { + "epoch": 0.022011187249658597, + "grad_norm": 0.39917564392089844, + "learning_rate": 0.00019999792428202498, + "loss": 1.2825, + "step": 10360 + }, + { + "epoch": 0.02203243356939765, + "grad_norm": 0.4190349280834198, + "learning_rate": 0.00019999788016613433, + "loss": 1.2799, + "step": 10370 + }, + { + "epoch": 0.022053679889136705, + "grad_norm": 0.4130414128303528, + "learning_rate": 0.00019999783558636075, + "loss": 1.3347, + "step": 10380 + }, + { + "epoch": 0.022074926208875755, + "grad_norm": 0.34225961565971375, + "learning_rate": 0.00019999779054270453, + "loss": 1.3096, + "step": 10390 + }, + { + "epoch": 0.02209617252861481, + "grad_norm": 0.39506465196609497, + "learning_rate": 0.0001999977450351658, + "loss": 1.2812, + "step": 10400 + }, + { + "epoch": 0.022117418848353863, + "grad_norm": 0.37534037232398987, + "learning_rate": 0.0001999976990637448, + "loss": 1.3042, + "step": 10410 + }, + { + "epoch": 0.022138665168092913, + "grad_norm": 0.37747693061828613, + "learning_rate": 0.00019999765262844173, + "loss": 1.3714, + "step": 10420 + }, + { + "epoch": 0.022159911487831967, + "grad_norm": 0.4177028238773346, + "learning_rate": 0.00019999760572925678, + "loss": 1.3017, + "step": 10430 + }, + { + "epoch": 0.02218115780757102, + "grad_norm": 0.41562479734420776, + "learning_rate": 0.00019999755836619024, + "loss": 1.287, + "step": 10440 + }, + { + "epoch": 0.022202404127310074, + "grad_norm": 0.3788703382015228, + "learning_rate": 0.0001999975105392423, + "loss": 1.2898, + "step": 10450 + }, + { + "epoch": 0.022223650447049124, + "grad_norm": 0.4929772615432739, + "learning_rate": 0.00019999746224841315, + "loss": 1.3053, + "step": 10460 + }, + { + "epoch": 0.022244896766788178, + "grad_norm": 0.36570030450820923, + "learning_rate": 0.00019999741349370305, + "loss": 1.2992, + "step": 10470 + }, + { + "epoch": 0.022266143086527232, + "grad_norm": 0.3629460632801056, + "learning_rate": 0.0001999973642751122, + "loss": 1.2928, + "step": 10480 + }, + { + "epoch": 0.022287389406266282, + "grad_norm": 0.4734468162059784, + "learning_rate": 0.00019999731459264085, + "loss": 1.3383, + "step": 10490 + }, + { + "epoch": 0.022308635726005336, + "grad_norm": 0.6972452402114868, + "learning_rate": 0.00019999726444628926, + "loss": 1.3013, + "step": 10500 + }, + { + "epoch": 0.02232988204574439, + "grad_norm": 0.3563283681869507, + "learning_rate": 0.00019999721383605757, + "loss": 1.3039, + "step": 10510 + }, + { + "epoch": 0.02235112836548344, + "grad_norm": 0.37546229362487793, + "learning_rate": 0.00019999716276194608, + "loss": 1.3282, + "step": 10520 + }, + { + "epoch": 0.022372374685222494, + "grad_norm": 0.39041754603385925, + "learning_rate": 0.00019999711122395503, + "loss": 1.3222, + "step": 10530 + }, + { + "epoch": 0.022393621004961548, + "grad_norm": 0.37354663014411926, + "learning_rate": 0.00019999705922208464, + "loss": 1.3023, + "step": 10540 + }, + { + "epoch": 0.022414867324700598, + "grad_norm": 0.44209498167037964, + "learning_rate": 0.00019999700675633517, + "loss": 1.3395, + "step": 10550 + }, + { + "epoch": 0.02243611364443965, + "grad_norm": 0.4131948947906494, + "learning_rate": 0.00019999695382670685, + "loss": 1.3268, + "step": 10560 + }, + { + "epoch": 0.022457359964178705, + "grad_norm": 0.3726363480091095, + "learning_rate": 0.0001999969004331999, + "loss": 1.3125, + "step": 10570 + }, + { + "epoch": 0.02247860628391776, + "grad_norm": 0.42983683943748474, + "learning_rate": 0.00019999684657581458, + "loss": 1.2825, + "step": 10580 + }, + { + "epoch": 0.02249985260365681, + "grad_norm": 0.38731005787849426, + "learning_rate": 0.0001999967922545512, + "loss": 1.2901, + "step": 10590 + }, + { + "epoch": 0.022521098923395863, + "grad_norm": 0.41453438997268677, + "learning_rate": 0.00019999673746940998, + "loss": 1.2884, + "step": 10600 + }, + { + "epoch": 0.022542345243134917, + "grad_norm": 0.40356945991516113, + "learning_rate": 0.0001999966822203911, + "loss": 1.2927, + "step": 10610 + }, + { + "epoch": 0.022563591562873967, + "grad_norm": 0.4271157681941986, + "learning_rate": 0.0001999966265074949, + "loss": 1.2983, + "step": 10620 + }, + { + "epoch": 0.02258483788261302, + "grad_norm": 0.4500367343425751, + "learning_rate": 0.00019999657033072162, + "loss": 1.3419, + "step": 10630 + }, + { + "epoch": 0.022606084202352075, + "grad_norm": 0.44891396164894104, + "learning_rate": 0.0001999965136900715, + "loss": 1.2811, + "step": 10640 + }, + { + "epoch": 0.022627330522091125, + "grad_norm": 0.4287288784980774, + "learning_rate": 0.00019999645658554482, + "loss": 1.3149, + "step": 10650 + }, + { + "epoch": 0.02264857684183018, + "grad_norm": 0.47665756940841675, + "learning_rate": 0.00019999639901714184, + "loss": 1.3134, + "step": 10660 + }, + { + "epoch": 0.022669823161569232, + "grad_norm": 0.4033728539943695, + "learning_rate": 0.00019999634098486285, + "loss": 1.3083, + "step": 10670 + }, + { + "epoch": 0.022691069481308286, + "grad_norm": 0.4043264389038086, + "learning_rate": 0.0001999962824887081, + "loss": 1.3219, + "step": 10680 + }, + { + "epoch": 0.022712315801047336, + "grad_norm": 0.4468698799610138, + "learning_rate": 0.00019999622352867783, + "loss": 1.3273, + "step": 10690 + }, + { + "epoch": 0.02273356212078639, + "grad_norm": 0.540629506111145, + "learning_rate": 0.0001999961641047724, + "loss": 1.3164, + "step": 10700 + }, + { + "epoch": 0.022754808440525444, + "grad_norm": 0.556750476360321, + "learning_rate": 0.00019999610421699194, + "loss": 1.2829, + "step": 10710 + }, + { + "epoch": 0.022776054760264494, + "grad_norm": 0.39703312516212463, + "learning_rate": 0.0001999960438653369, + "loss": 1.2598, + "step": 10720 + }, + { + "epoch": 0.022797301080003548, + "grad_norm": 0.3615317642688751, + "learning_rate": 0.00019999598304980745, + "loss": 1.3085, + "step": 10730 + }, + { + "epoch": 0.0228185473997426, + "grad_norm": 0.4793642461299896, + "learning_rate": 0.0001999959217704039, + "loss": 1.3122, + "step": 10740 + }, + { + "epoch": 0.022839793719481652, + "grad_norm": 0.3589838147163391, + "learning_rate": 0.0001999958600271265, + "loss": 1.2827, + "step": 10750 + }, + { + "epoch": 0.022861040039220706, + "grad_norm": 0.3739126920700073, + "learning_rate": 0.00019999579781997562, + "loss": 1.3011, + "step": 10760 + }, + { + "epoch": 0.02288228635895976, + "grad_norm": 0.40067288279533386, + "learning_rate": 0.00019999573514895146, + "loss": 1.2961, + "step": 10770 + }, + { + "epoch": 0.022903532678698813, + "grad_norm": 0.3316481411457062, + "learning_rate": 0.00019999567201405436, + "loss": 1.2882, + "step": 10780 + }, + { + "epoch": 0.022924778998437863, + "grad_norm": 0.3792462944984436, + "learning_rate": 0.0001999956084152846, + "loss": 1.2996, + "step": 10790 + }, + { + "epoch": 0.022946025318176917, + "grad_norm": 0.43986737728118896, + "learning_rate": 0.00019999554435264248, + "loss": 1.2739, + "step": 10800 + }, + { + "epoch": 0.02296727163791597, + "grad_norm": 0.316070020198822, + "learning_rate": 0.0001999954798261283, + "loss": 1.3156, + "step": 10810 + }, + { + "epoch": 0.02298851795765502, + "grad_norm": 0.4474739134311676, + "learning_rate": 0.00019999541483574235, + "loss": 1.3086, + "step": 10820 + }, + { + "epoch": 0.023009764277394075, + "grad_norm": 0.49081656336784363, + "learning_rate": 0.00019999534938148494, + "loss": 1.327, + "step": 10830 + }, + { + "epoch": 0.02303101059713313, + "grad_norm": 0.36911365389823914, + "learning_rate": 0.00019999528346335637, + "loss": 1.2585, + "step": 10840 + }, + { + "epoch": 0.02305225691687218, + "grad_norm": 0.38436999917030334, + "learning_rate": 0.00019999521708135694, + "loss": 1.2855, + "step": 10850 + }, + { + "epoch": 0.023073503236611233, + "grad_norm": 0.34169790148735046, + "learning_rate": 0.00019999515023548696, + "loss": 1.2884, + "step": 10860 + }, + { + "epoch": 0.023094749556350287, + "grad_norm": 0.4384957551956177, + "learning_rate": 0.00019999508292574674, + "loss": 1.3036, + "step": 10870 + }, + { + "epoch": 0.02311599587608934, + "grad_norm": 0.34675130248069763, + "learning_rate": 0.00019999501515213658, + "loss": 1.2765, + "step": 10880 + }, + { + "epoch": 0.02313724219582839, + "grad_norm": 0.39405888319015503, + "learning_rate": 0.00019999494691465684, + "loss": 1.2937, + "step": 10890 + }, + { + "epoch": 0.023158488515567444, + "grad_norm": 0.4217337965965271, + "learning_rate": 0.0001999948782133078, + "loss": 1.3013, + "step": 10900 + }, + { + "epoch": 0.023179734835306498, + "grad_norm": 0.3441830277442932, + "learning_rate": 0.00019999480904808978, + "loss": 1.3116, + "step": 10910 + }, + { + "epoch": 0.02320098115504555, + "grad_norm": 0.40516427159309387, + "learning_rate": 0.0001999947394190031, + "loss": 1.3143, + "step": 10920 + }, + { + "epoch": 0.023222227474784602, + "grad_norm": 0.37101906538009644, + "learning_rate": 0.0001999946693260481, + "loss": 1.2706, + "step": 10930 + }, + { + "epoch": 0.023243473794523656, + "grad_norm": 0.42191222310066223, + "learning_rate": 0.00019999459876922506, + "loss": 1.3106, + "step": 10940 + }, + { + "epoch": 0.023264720114262706, + "grad_norm": 0.4211720824241638, + "learning_rate": 0.00019999452774853438, + "loss": 1.2871, + "step": 10950 + }, + { + "epoch": 0.02328596643400176, + "grad_norm": 0.3989843428134918, + "learning_rate": 0.00019999445626397636, + "loss": 1.3255, + "step": 10960 + }, + { + "epoch": 0.023307212753740814, + "grad_norm": 0.44908440113067627, + "learning_rate": 0.00019999438431555128, + "loss": 1.3101, + "step": 10970 + }, + { + "epoch": 0.023328459073479867, + "grad_norm": 0.5045820474624634, + "learning_rate": 0.00019999431190325954, + "loss": 1.3085, + "step": 10980 + }, + { + "epoch": 0.023349705393218918, + "grad_norm": 0.37609630823135376, + "learning_rate": 0.0001999942390271014, + "loss": 1.2882, + "step": 10990 + }, + { + "epoch": 0.02337095171295797, + "grad_norm": 0.5574625730514526, + "learning_rate": 0.0001999941656870773, + "loss": 1.2976, + "step": 11000 + }, + { + "epoch": 0.023392198032697025, + "grad_norm": 0.3827749490737915, + "learning_rate": 0.00019999409188318755, + "loss": 1.3188, + "step": 11010 + }, + { + "epoch": 0.023413444352436075, + "grad_norm": 0.4523393511772156, + "learning_rate": 0.00019999401761543244, + "loss": 1.313, + "step": 11020 + }, + { + "epoch": 0.02343469067217513, + "grad_norm": 0.34186455607414246, + "learning_rate": 0.00019999394288381235, + "loss": 1.3177, + "step": 11030 + }, + { + "epoch": 0.023455936991914183, + "grad_norm": 0.39320486783981323, + "learning_rate": 0.0001999938676883276, + "loss": 1.2906, + "step": 11040 + }, + { + "epoch": 0.023477183311653233, + "grad_norm": 0.4385378062725067, + "learning_rate": 0.0001999937920289786, + "loss": 1.3169, + "step": 11050 + }, + { + "epoch": 0.023498429631392287, + "grad_norm": 0.4686935245990753, + "learning_rate": 0.00019999371590576567, + "loss": 1.2852, + "step": 11060 + }, + { + "epoch": 0.02351967595113134, + "grad_norm": 0.4951067268848419, + "learning_rate": 0.00019999363931868914, + "loss": 1.2927, + "step": 11070 + }, + { + "epoch": 0.02354092227087039, + "grad_norm": 0.3554975390434265, + "learning_rate": 0.00019999356226774938, + "loss": 1.3082, + "step": 11080 + }, + { + "epoch": 0.023562168590609445, + "grad_norm": 0.5273036956787109, + "learning_rate": 0.00019999348475294672, + "loss": 1.2981, + "step": 11090 + }, + { + "epoch": 0.0235834149103485, + "grad_norm": 0.4662444293498993, + "learning_rate": 0.0001999934067742816, + "loss": 1.3154, + "step": 11100 + }, + { + "epoch": 0.023604661230087552, + "grad_norm": 0.37750497460365295, + "learning_rate": 0.0001999933283317543, + "loss": 1.2896, + "step": 11110 + }, + { + "epoch": 0.023625907549826602, + "grad_norm": 0.3582260310649872, + "learning_rate": 0.00019999324942536522, + "loss": 1.2875, + "step": 11120 + }, + { + "epoch": 0.023647153869565656, + "grad_norm": 0.46206772327423096, + "learning_rate": 0.00019999317005511472, + "loss": 1.3132, + "step": 11130 + }, + { + "epoch": 0.02366840018930471, + "grad_norm": 0.4179605543613434, + "learning_rate": 0.00019999309022100318, + "loss": 1.2822, + "step": 11140 + }, + { + "epoch": 0.02368964650904376, + "grad_norm": 0.3835347294807434, + "learning_rate": 0.00019999300992303094, + "loss": 1.2866, + "step": 11150 + }, + { + "epoch": 0.023710892828782814, + "grad_norm": 0.33457818627357483, + "learning_rate": 0.00019999292916119839, + "loss": 1.3359, + "step": 11160 + }, + { + "epoch": 0.023732139148521868, + "grad_norm": 0.36134403944015503, + "learning_rate": 0.0001999928479355059, + "loss": 1.3114, + "step": 11170 + }, + { + "epoch": 0.023753385468260918, + "grad_norm": 0.3474771976470947, + "learning_rate": 0.00019999276624595389, + "loss": 1.3285, + "step": 11180 + }, + { + "epoch": 0.023774631787999972, + "grad_norm": 0.3687553107738495, + "learning_rate": 0.00019999268409254267, + "loss": 1.2923, + "step": 11190 + }, + { + "epoch": 0.023795878107739026, + "grad_norm": 0.33617067337036133, + "learning_rate": 0.00019999260147527266, + "loss": 1.3058, + "step": 11200 + }, + { + "epoch": 0.02381712442747808, + "grad_norm": 0.42926549911499023, + "learning_rate": 0.00019999251839414423, + "loss": 1.2864, + "step": 11210 + }, + { + "epoch": 0.02383837074721713, + "grad_norm": 0.5322855114936829, + "learning_rate": 0.00019999243484915776, + "loss": 1.2982, + "step": 11220 + }, + { + "epoch": 0.023859617066956183, + "grad_norm": 0.4355957508087158, + "learning_rate": 0.0001999923508403137, + "loss": 1.2998, + "step": 11230 + }, + { + "epoch": 0.023880863386695237, + "grad_norm": 0.3493080735206604, + "learning_rate": 0.0001999922663676123, + "loss": 1.2816, + "step": 11240 + }, + { + "epoch": 0.023902109706434287, + "grad_norm": 0.3586486279964447, + "learning_rate": 0.00019999218143105415, + "loss": 1.3135, + "step": 11250 + }, + { + "epoch": 0.02392335602617334, + "grad_norm": 0.3892957866191864, + "learning_rate": 0.00019999209603063946, + "loss": 1.3278, + "step": 11260 + }, + { + "epoch": 0.023944602345912395, + "grad_norm": 0.46234238147735596, + "learning_rate": 0.00019999201016636872, + "loss": 1.2593, + "step": 11270 + }, + { + "epoch": 0.023965848665651445, + "grad_norm": 0.4197115898132324, + "learning_rate": 0.00019999192383824232, + "loss": 1.3088, + "step": 11280 + }, + { + "epoch": 0.0239870949853905, + "grad_norm": 0.36394453048706055, + "learning_rate": 0.00019999183704626065, + "loss": 1.2469, + "step": 11290 + }, + { + "epoch": 0.024008341305129553, + "grad_norm": 0.5558617115020752, + "learning_rate": 0.0001999917497904241, + "loss": 1.3103, + "step": 11300 + }, + { + "epoch": 0.024029587624868606, + "grad_norm": 0.37398338317871094, + "learning_rate": 0.00019999166207073307, + "loss": 1.2838, + "step": 11310 + }, + { + "epoch": 0.024050833944607657, + "grad_norm": 0.4000920057296753, + "learning_rate": 0.00019999157388718802, + "loss": 1.2923, + "step": 11320 + }, + { + "epoch": 0.02407208026434671, + "grad_norm": 0.4116889536380768, + "learning_rate": 0.0001999914852397893, + "loss": 1.2557, + "step": 11330 + }, + { + "epoch": 0.024093326584085764, + "grad_norm": 0.32335156202316284, + "learning_rate": 0.00019999139612853734, + "loss": 1.2994, + "step": 11340 + }, + { + "epoch": 0.024114572903824814, + "grad_norm": 0.3910031318664551, + "learning_rate": 0.0001999913065534326, + "loss": 1.263, + "step": 11350 + }, + { + "epoch": 0.024135819223563868, + "grad_norm": 0.3742384612560272, + "learning_rate": 0.0001999912165144754, + "loss": 1.3076, + "step": 11360 + }, + { + "epoch": 0.024157065543302922, + "grad_norm": 0.39666956663131714, + "learning_rate": 0.00019999112601166622, + "loss": 1.3133, + "step": 11370 + }, + { + "epoch": 0.024178311863041972, + "grad_norm": 0.4130505919456482, + "learning_rate": 0.0001999910350450055, + "loss": 1.3273, + "step": 11380 + }, + { + "epoch": 0.024199558182781026, + "grad_norm": 0.4080508351325989, + "learning_rate": 0.0001999909436144936, + "loss": 1.2769, + "step": 11390 + }, + { + "epoch": 0.02422080450252008, + "grad_norm": 0.3642793297767639, + "learning_rate": 0.000199990851720131, + "loss": 1.2751, + "step": 11400 + }, + { + "epoch": 0.024242050822259133, + "grad_norm": 0.4025561511516571, + "learning_rate": 0.0001999907593619181, + "loss": 1.3029, + "step": 11410 + }, + { + "epoch": 0.024263297141998184, + "grad_norm": 0.35533812642097473, + "learning_rate": 0.0001999906665398553, + "loss": 1.2897, + "step": 11420 + }, + { + "epoch": 0.024284543461737237, + "grad_norm": 0.37342891097068787, + "learning_rate": 0.0001999905732539431, + "loss": 1.2752, + "step": 11430 + }, + { + "epoch": 0.02430578978147629, + "grad_norm": 0.40166446566581726, + "learning_rate": 0.00019999047950418187, + "loss": 1.2943, + "step": 11440 + }, + { + "epoch": 0.02432703610121534, + "grad_norm": 0.3380446434020996, + "learning_rate": 0.0001999903852905721, + "loss": 1.2924, + "step": 11450 + }, + { + "epoch": 0.024348282420954395, + "grad_norm": 0.3580317199230194, + "learning_rate": 0.00019999029061311418, + "loss": 1.2718, + "step": 11460 + }, + { + "epoch": 0.02436952874069345, + "grad_norm": 0.38421258330345154, + "learning_rate": 0.00019999019547180854, + "loss": 1.319, + "step": 11470 + }, + { + "epoch": 0.0243907750604325, + "grad_norm": 0.38857391476631165, + "learning_rate": 0.00019999009986665568, + "loss": 1.3025, + "step": 11480 + }, + { + "epoch": 0.024412021380171553, + "grad_norm": 0.346388578414917, + "learning_rate": 0.000199990003797656, + "loss": 1.2946, + "step": 11490 + }, + { + "epoch": 0.024433267699910607, + "grad_norm": 0.37632015347480774, + "learning_rate": 0.00019998990726480996, + "loss": 1.2851, + "step": 11500 + }, + { + "epoch": 0.02445451401964966, + "grad_norm": 0.3728422522544861, + "learning_rate": 0.000199989810268118, + "loss": 1.2835, + "step": 11510 + }, + { + "epoch": 0.02447576033938871, + "grad_norm": 0.4010753035545349, + "learning_rate": 0.0001999897128075806, + "loss": 1.3466, + "step": 11520 + }, + { + "epoch": 0.024497006659127765, + "grad_norm": 0.35096320509910583, + "learning_rate": 0.00019998961488319815, + "loss": 1.2912, + "step": 11530 + }, + { + "epoch": 0.024518252978866818, + "grad_norm": 0.4235300123691559, + "learning_rate": 0.00019998951649497114, + "loss": 1.3058, + "step": 11540 + }, + { + "epoch": 0.02453949929860587, + "grad_norm": 0.37316426634788513, + "learning_rate": 0.00019998941764290008, + "loss": 1.3324, + "step": 11550 + }, + { + "epoch": 0.024560745618344922, + "grad_norm": 0.3629796504974365, + "learning_rate": 0.00019998931832698532, + "loss": 1.3218, + "step": 11560 + }, + { + "epoch": 0.024581991938083976, + "grad_norm": 0.37516453862190247, + "learning_rate": 0.00019998921854722738, + "loss": 1.2551, + "step": 11570 + }, + { + "epoch": 0.024603238257823026, + "grad_norm": 0.34434691071510315, + "learning_rate": 0.00019998911830362674, + "loss": 1.2619, + "step": 11580 + }, + { + "epoch": 0.02462448457756208, + "grad_norm": 0.37135398387908936, + "learning_rate": 0.0001999890175961838, + "loss": 1.267, + "step": 11590 + }, + { + "epoch": 0.024645730897301134, + "grad_norm": 0.3295104503631592, + "learning_rate": 0.00019998891642489914, + "loss": 1.3023, + "step": 11600 + }, + { + "epoch": 0.024666977217040184, + "grad_norm": 0.3873163163661957, + "learning_rate": 0.00019998881478977315, + "loss": 1.2667, + "step": 11610 + }, + { + "epoch": 0.024688223536779238, + "grad_norm": 0.38426122069358826, + "learning_rate": 0.00019998871269080626, + "loss": 1.2853, + "step": 11620 + }, + { + "epoch": 0.02470946985651829, + "grad_norm": 0.35662633180618286, + "learning_rate": 0.00019998861012799902, + "loss": 1.2814, + "step": 11630 + }, + { + "epoch": 0.024730716176257345, + "grad_norm": 0.4079803228378296, + "learning_rate": 0.00019998850710135191, + "loss": 1.2981, + "step": 11640 + }, + { + "epoch": 0.024751962495996396, + "grad_norm": 0.454907089471817, + "learning_rate": 0.00019998840361086533, + "loss": 1.2858, + "step": 11650 + }, + { + "epoch": 0.02477320881573545, + "grad_norm": 0.5685102939605713, + "learning_rate": 0.00019998829965653985, + "loss": 1.2773, + "step": 11660 + }, + { + "epoch": 0.024794455135474503, + "grad_norm": 0.37614378333091736, + "learning_rate": 0.00019998819523837588, + "loss": 1.2734, + "step": 11670 + }, + { + "epoch": 0.024815701455213553, + "grad_norm": 0.36724740266799927, + "learning_rate": 0.00019998809035637395, + "loss": 1.3192, + "step": 11680 + }, + { + "epoch": 0.024836947774952607, + "grad_norm": 0.41889646649360657, + "learning_rate": 0.00019998798501053459, + "loss": 1.3065, + "step": 11690 + }, + { + "epoch": 0.02485819409469166, + "grad_norm": 0.3524831235408783, + "learning_rate": 0.00019998787920085816, + "loss": 1.279, + "step": 11700 + }, + { + "epoch": 0.02487944041443071, + "grad_norm": 0.3758469820022583, + "learning_rate": 0.00019998777292734522, + "loss": 1.3359, + "step": 11710 + }, + { + "epoch": 0.024900686734169765, + "grad_norm": 0.3436757028102875, + "learning_rate": 0.00019998766618999628, + "loss": 1.268, + "step": 11720 + }, + { + "epoch": 0.02492193305390882, + "grad_norm": 0.373537540435791, + "learning_rate": 0.00019998755898881183, + "loss": 1.3045, + "step": 11730 + }, + { + "epoch": 0.024943179373647872, + "grad_norm": 0.3681647777557373, + "learning_rate": 0.00019998745132379235, + "loss": 1.2845, + "step": 11740 + }, + { + "epoch": 0.024964425693386923, + "grad_norm": 0.4312489926815033, + "learning_rate": 0.00019998734319493836, + "loss": 1.2824, + "step": 11750 + }, + { + "epoch": 0.024985672013125976, + "grad_norm": 0.5208578705787659, + "learning_rate": 0.00019998723460225035, + "loss": 1.2985, + "step": 11760 + }, + { + "epoch": 0.02500691833286503, + "grad_norm": 0.4109800457954407, + "learning_rate": 0.0001999871255457288, + "loss": 1.287, + "step": 11770 + }, + { + "epoch": 0.02502816465260408, + "grad_norm": 0.36693671345710754, + "learning_rate": 0.00019998701602537427, + "loss": 1.3007, + "step": 11780 + }, + { + "epoch": 0.025049410972343134, + "grad_norm": 0.3513535261154175, + "learning_rate": 0.0001999869060411872, + "loss": 1.3088, + "step": 11790 + }, + { + "epoch": 0.025070657292082188, + "grad_norm": 0.47722822427749634, + "learning_rate": 0.00019998679559316815, + "loss": 1.3052, + "step": 11800 + }, + { + "epoch": 0.025091903611821238, + "grad_norm": 0.3883603513240814, + "learning_rate": 0.00019998668468131764, + "loss": 1.3117, + "step": 11810 + }, + { + "epoch": 0.025113149931560292, + "grad_norm": 0.5103073716163635, + "learning_rate": 0.00019998657330563612, + "loss": 1.2549, + "step": 11820 + }, + { + "epoch": 0.025134396251299346, + "grad_norm": 0.36414334177970886, + "learning_rate": 0.00019998646146612422, + "loss": 1.2816, + "step": 11830 + }, + { + "epoch": 0.0251556425710384, + "grad_norm": 0.3414251506328583, + "learning_rate": 0.00019998634916278232, + "loss": 1.3102, + "step": 11840 + }, + { + "epoch": 0.02517688889077745, + "grad_norm": 0.4086225926876068, + "learning_rate": 0.00019998623639561105, + "loss": 1.2774, + "step": 11850 + }, + { + "epoch": 0.025198135210516504, + "grad_norm": 0.33740782737731934, + "learning_rate": 0.00019998612316461087, + "loss": 1.3167, + "step": 11860 + }, + { + "epoch": 0.025219381530255557, + "grad_norm": 0.36907947063446045, + "learning_rate": 0.00019998600946978233, + "loss": 1.291, + "step": 11870 + }, + { + "epoch": 0.025240627849994608, + "grad_norm": 0.3383497893810272, + "learning_rate": 0.00019998589531112598, + "loss": 1.2496, + "step": 11880 + }, + { + "epoch": 0.02526187416973366, + "grad_norm": 0.43260660767555237, + "learning_rate": 0.0001999857806886423, + "loss": 1.2927, + "step": 11890 + }, + { + "epoch": 0.025283120489472715, + "grad_norm": 0.38249698281288147, + "learning_rate": 0.00019998566560233184, + "loss": 1.2562, + "step": 11900 + }, + { + "epoch": 0.025304366809211765, + "grad_norm": 0.36611905694007874, + "learning_rate": 0.00019998555005219516, + "loss": 1.3071, + "step": 11910 + }, + { + "epoch": 0.02532561312895082, + "grad_norm": 0.38970670104026794, + "learning_rate": 0.00019998543403823278, + "loss": 1.3462, + "step": 11920 + }, + { + "epoch": 0.025346859448689873, + "grad_norm": 0.35336777567863464, + "learning_rate": 0.00019998531756044524, + "loss": 1.3072, + "step": 11930 + }, + { + "epoch": 0.025368105768428927, + "grad_norm": 0.34059253334999084, + "learning_rate": 0.00019998520061883306, + "loss": 1.3029, + "step": 11940 + }, + { + "epoch": 0.025389352088167977, + "grad_norm": 0.37389084696769714, + "learning_rate": 0.0001999850832133968, + "loss": 1.2517, + "step": 11950 + }, + { + "epoch": 0.02541059840790703, + "grad_norm": 0.35599616169929504, + "learning_rate": 0.00019998496534413699, + "loss": 1.35, + "step": 11960 + }, + { + "epoch": 0.025431844727646084, + "grad_norm": 0.36525243520736694, + "learning_rate": 0.00019998484701105423, + "loss": 1.3108, + "step": 11970 + }, + { + "epoch": 0.025453091047385135, + "grad_norm": 0.39142942428588867, + "learning_rate": 0.00019998472821414898, + "loss": 1.3278, + "step": 11980 + }, + { + "epoch": 0.02547433736712419, + "grad_norm": 0.3346650302410126, + "learning_rate": 0.00019998460895342187, + "loss": 1.2802, + "step": 11990 + }, + { + "epoch": 0.025495583686863242, + "grad_norm": 0.3319089114665985, + "learning_rate": 0.0001999844892288734, + "loss": 1.2706, + "step": 12000 + }, + { + "epoch": 0.025516830006602292, + "grad_norm": 0.3720344603061676, + "learning_rate": 0.00019998436904050418, + "loss": 1.2909, + "step": 12010 + }, + { + "epoch": 0.025538076326341346, + "grad_norm": 0.4889415502548218, + "learning_rate": 0.00019998424838831472, + "loss": 1.2917, + "step": 12020 + }, + { + "epoch": 0.0255593226460804, + "grad_norm": 0.4059412479400635, + "learning_rate": 0.00019998412727230557, + "loss": 1.286, + "step": 12030 + }, + { + "epoch": 0.025580568965819454, + "grad_norm": 0.3755929172039032, + "learning_rate": 0.00019998400569247736, + "loss": 1.2716, + "step": 12040 + }, + { + "epoch": 0.025601815285558504, + "grad_norm": 0.3800663948059082, + "learning_rate": 0.00019998388364883058, + "loss": 1.2884, + "step": 12050 + }, + { + "epoch": 0.025623061605297558, + "grad_norm": 0.44676050543785095, + "learning_rate": 0.00019998376114136585, + "loss": 1.295, + "step": 12060 + }, + { + "epoch": 0.02564430792503661, + "grad_norm": 0.37821489572525024, + "learning_rate": 0.00019998363817008369, + "loss": 1.2973, + "step": 12070 + }, + { + "epoch": 0.02566555424477566, + "grad_norm": 0.32973310351371765, + "learning_rate": 0.00019998351473498474, + "loss": 1.2771, + "step": 12080 + }, + { + "epoch": 0.025686800564514715, + "grad_norm": 0.4038003981113434, + "learning_rate": 0.0001999833908360695, + "loss": 1.2885, + "step": 12090 + }, + { + "epoch": 0.02570804688425377, + "grad_norm": 0.420273095369339, + "learning_rate": 0.00019998326647333855, + "loss": 1.3098, + "step": 12100 + }, + { + "epoch": 0.02572929320399282, + "grad_norm": 0.3691064417362213, + "learning_rate": 0.00019998314164679253, + "loss": 1.27, + "step": 12110 + }, + { + "epoch": 0.025750539523731873, + "grad_norm": 0.5628901124000549, + "learning_rate": 0.00019998301635643198, + "loss": 1.2624, + "step": 12120 + }, + { + "epoch": 0.025771785843470927, + "grad_norm": 0.33787471055984497, + "learning_rate": 0.00019998289060225744, + "loss": 1.2857, + "step": 12130 + }, + { + "epoch": 0.025793032163209977, + "grad_norm": 0.6804336905479431, + "learning_rate": 0.00019998276438426955, + "loss": 1.2888, + "step": 12140 + }, + { + "epoch": 0.02581427848294903, + "grad_norm": 0.3540322482585907, + "learning_rate": 0.0001999826377024689, + "loss": 1.2542, + "step": 12150 + }, + { + "epoch": 0.025835524802688085, + "grad_norm": 0.6135880351066589, + "learning_rate": 0.00019998251055685607, + "loss": 1.3051, + "step": 12160 + }, + { + "epoch": 0.02585677112242714, + "grad_norm": 0.3655414283275604, + "learning_rate": 0.0001999823829474316, + "loss": 1.2914, + "step": 12170 + }, + { + "epoch": 0.02587801744216619, + "grad_norm": 0.32828661799430847, + "learning_rate": 0.00019998225487419613, + "loss": 1.256, + "step": 12180 + }, + { + "epoch": 0.025899263761905243, + "grad_norm": 0.35472214221954346, + "learning_rate": 0.00019998212633715024, + "loss": 1.3039, + "step": 12190 + }, + { + "epoch": 0.025920510081644296, + "grad_norm": 0.3833518624305725, + "learning_rate": 0.00019998199733629453, + "loss": 1.2971, + "step": 12200 + }, + { + "epoch": 0.025941756401383347, + "grad_norm": 0.38734865188598633, + "learning_rate": 0.0001999818678716296, + "loss": 1.3058, + "step": 12210 + }, + { + "epoch": 0.0259630027211224, + "grad_norm": 0.4097994863986969, + "learning_rate": 0.00019998173794315605, + "loss": 1.29, + "step": 12220 + }, + { + "epoch": 0.025984249040861454, + "grad_norm": 0.34088006615638733, + "learning_rate": 0.00019998160755087447, + "loss": 1.2586, + "step": 12230 + }, + { + "epoch": 0.026005495360600504, + "grad_norm": 0.36685889959335327, + "learning_rate": 0.0001999814766947855, + "loss": 1.2928, + "step": 12240 + }, + { + "epoch": 0.026026741680339558, + "grad_norm": 0.36060255765914917, + "learning_rate": 0.0001999813453748897, + "loss": 1.3042, + "step": 12250 + }, + { + "epoch": 0.026047988000078612, + "grad_norm": 0.357498437166214, + "learning_rate": 0.00019998121359118771, + "loss": 1.2528, + "step": 12260 + }, + { + "epoch": 0.026069234319817666, + "grad_norm": 0.4269144833087921, + "learning_rate": 0.00019998108134368015, + "loss": 1.2685, + "step": 12270 + }, + { + "epoch": 0.026090480639556716, + "grad_norm": 0.47336313128471375, + "learning_rate": 0.0001999809486323676, + "loss": 1.2789, + "step": 12280 + }, + { + "epoch": 0.02611172695929577, + "grad_norm": 0.3780252933502197, + "learning_rate": 0.00019998081545725068, + "loss": 1.2818, + "step": 12290 + }, + { + "epoch": 0.026132973279034823, + "grad_norm": 0.37964537739753723, + "learning_rate": 0.00019998068181833003, + "loss": 1.2776, + "step": 12300 + }, + { + "epoch": 0.026154219598773874, + "grad_norm": 0.3593664765357971, + "learning_rate": 0.0001999805477156063, + "loss": 1.2889, + "step": 12310 + }, + { + "epoch": 0.026175465918512927, + "grad_norm": 0.5626049637794495, + "learning_rate": 0.00019998041314908002, + "loss": 1.2772, + "step": 12320 + }, + { + "epoch": 0.02619671223825198, + "grad_norm": 0.5620774626731873, + "learning_rate": 0.0001999802781187519, + "loss": 1.2307, + "step": 12330 + }, + { + "epoch": 0.02621795855799103, + "grad_norm": 0.3585665822029114, + "learning_rate": 0.0001999801426246225, + "loss": 1.3413, + "step": 12340 + }, + { + "epoch": 0.026239204877730085, + "grad_norm": 0.3317555785179138, + "learning_rate": 0.0001999800066666925, + "loss": 1.2515, + "step": 12350 + }, + { + "epoch": 0.02626045119746914, + "grad_norm": 0.392757773399353, + "learning_rate": 0.0001999798702449625, + "loss": 1.2982, + "step": 12360 + }, + { + "epoch": 0.026281697517208193, + "grad_norm": 0.3561962842941284, + "learning_rate": 0.00019997973335943316, + "loss": 1.294, + "step": 12370 + }, + { + "epoch": 0.026302943836947243, + "grad_norm": 0.37247124314308167, + "learning_rate": 0.0001999795960101051, + "loss": 1.3088, + "step": 12380 + }, + { + "epoch": 0.026324190156686297, + "grad_norm": 0.38120484352111816, + "learning_rate": 0.00019997945819697895, + "loss": 1.2934, + "step": 12390 + }, + { + "epoch": 0.02634543647642535, + "grad_norm": 0.3369123637676239, + "learning_rate": 0.00019997931992005538, + "loss": 1.2723, + "step": 12400 + }, + { + "epoch": 0.0263666827961644, + "grad_norm": 0.447797030210495, + "learning_rate": 0.000199979181179335, + "loss": 1.2711, + "step": 12410 + }, + { + "epoch": 0.026387929115903454, + "grad_norm": 0.3649319112300873, + "learning_rate": 0.00019997904197481843, + "loss": 1.2752, + "step": 12420 + }, + { + "epoch": 0.026409175435642508, + "grad_norm": 0.3879178762435913, + "learning_rate": 0.00019997890230650638, + "loss": 1.2833, + "step": 12430 + }, + { + "epoch": 0.02643042175538156, + "grad_norm": 0.3658178746700287, + "learning_rate": 0.00019997876217439946, + "loss": 1.2916, + "step": 12440 + }, + { + "epoch": 0.026451668075120612, + "grad_norm": 0.3879554271697998, + "learning_rate": 0.00019997862157849832, + "loss": 1.3013, + "step": 12450 + }, + { + "epoch": 0.026472914394859666, + "grad_norm": 0.3557450473308563, + "learning_rate": 0.00019997848051880358, + "loss": 1.2697, + "step": 12460 + }, + { + "epoch": 0.02649416071459872, + "grad_norm": 0.35950708389282227, + "learning_rate": 0.000199978338995316, + "loss": 1.2781, + "step": 12470 + }, + { + "epoch": 0.02651540703433777, + "grad_norm": 0.40616676211357117, + "learning_rate": 0.00019997819700803615, + "loss": 1.2617, + "step": 12480 + }, + { + "epoch": 0.026536653354076824, + "grad_norm": 0.42574700713157654, + "learning_rate": 0.00019997805455696467, + "loss": 1.27, + "step": 12490 + }, + { + "epoch": 0.026557899673815877, + "grad_norm": 0.4259454905986786, + "learning_rate": 0.00019997791164210228, + "loss": 1.292, + "step": 12500 + }, + { + "epoch": 0.026579145993554928, + "grad_norm": 0.3844059109687805, + "learning_rate": 0.00019997776826344966, + "loss": 1.2744, + "step": 12510 + }, + { + "epoch": 0.02660039231329398, + "grad_norm": 0.5331470370292664, + "learning_rate": 0.00019997762442100739, + "loss": 1.2818, + "step": 12520 + }, + { + "epoch": 0.026621638633033035, + "grad_norm": 0.43398451805114746, + "learning_rate": 0.0001999774801147762, + "loss": 1.2836, + "step": 12530 + }, + { + "epoch": 0.026642884952772086, + "grad_norm": 0.34918269515037537, + "learning_rate": 0.0001999773353447567, + "loss": 1.252, + "step": 12540 + }, + { + "epoch": 0.02666413127251114, + "grad_norm": 0.46049466729164124, + "learning_rate": 0.00019997719011094966, + "loss": 1.246, + "step": 12550 + }, + { + "epoch": 0.026685377592250193, + "grad_norm": 0.4092930853366852, + "learning_rate": 0.0001999770444133557, + "loss": 1.2927, + "step": 12560 + }, + { + "epoch": 0.026706623911989247, + "grad_norm": 0.3706120252609253, + "learning_rate": 0.00019997689825197544, + "loss": 1.2877, + "step": 12570 + }, + { + "epoch": 0.026727870231728297, + "grad_norm": 0.36785629391670227, + "learning_rate": 0.00019997675162680966, + "loss": 1.2548, + "step": 12580 + }, + { + "epoch": 0.02674911655146735, + "grad_norm": 0.4972873330116272, + "learning_rate": 0.000199976604537859, + "loss": 1.2965, + "step": 12590 + }, + { + "epoch": 0.026770362871206405, + "grad_norm": 0.5300833582878113, + "learning_rate": 0.00019997645698512407, + "loss": 1.276, + "step": 12600 + }, + { + "epoch": 0.026791609190945455, + "grad_norm": 0.35581403970718384, + "learning_rate": 0.00019997630896860568, + "loss": 1.2771, + "step": 12610 + }, + { + "epoch": 0.02681285551068451, + "grad_norm": 0.3980450928211212, + "learning_rate": 0.00019997616048830446, + "loss": 1.3112, + "step": 12620 + }, + { + "epoch": 0.026834101830423562, + "grad_norm": 0.3557485044002533, + "learning_rate": 0.00019997601154422105, + "loss": 1.2401, + "step": 12630 + }, + { + "epoch": 0.026855348150162613, + "grad_norm": 0.33328577876091003, + "learning_rate": 0.0001999758621363562, + "loss": 1.2621, + "step": 12640 + }, + { + "epoch": 0.026876594469901666, + "grad_norm": 0.4051186740398407, + "learning_rate": 0.0001999757122647106, + "loss": 1.3181, + "step": 12650 + }, + { + "epoch": 0.02689784078964072, + "grad_norm": 0.49233904480934143, + "learning_rate": 0.00019997556192928495, + "loss": 1.2647, + "step": 12660 + }, + { + "epoch": 0.02691908710937977, + "grad_norm": 0.4481161832809448, + "learning_rate": 0.00019997541113007993, + "loss": 1.2789, + "step": 12670 + }, + { + "epoch": 0.026940333429118824, + "grad_norm": 0.3275681734085083, + "learning_rate": 0.00019997525986709618, + "loss": 1.2945, + "step": 12680 + }, + { + "epoch": 0.026961579748857878, + "grad_norm": 0.6532595157623291, + "learning_rate": 0.00019997510814033452, + "loss": 1.2678, + "step": 12690 + }, + { + "epoch": 0.02698282606859693, + "grad_norm": 0.4009140431880951, + "learning_rate": 0.00019997495594979557, + "loss": 1.2657, + "step": 12700 + }, + { + "epoch": 0.027004072388335982, + "grad_norm": 0.4098702669143677, + "learning_rate": 0.00019997480329548008, + "loss": 1.2846, + "step": 12710 + }, + { + "epoch": 0.027025318708075036, + "grad_norm": 0.35350340604782104, + "learning_rate": 0.00019997465017738874, + "loss": 1.2946, + "step": 12720 + }, + { + "epoch": 0.02704656502781409, + "grad_norm": 0.404502809047699, + "learning_rate": 0.00019997449659552222, + "loss": 1.3016, + "step": 12730 + }, + { + "epoch": 0.02706781134755314, + "grad_norm": 0.3676071763038635, + "learning_rate": 0.00019997434254988132, + "loss": 1.3016, + "step": 12740 + }, + { + "epoch": 0.027089057667292193, + "grad_norm": 0.31724128127098083, + "learning_rate": 0.00019997418804046672, + "loss": 1.3005, + "step": 12750 + }, + { + "epoch": 0.027110303987031247, + "grad_norm": 0.33016830682754517, + "learning_rate": 0.0001999740330672791, + "loss": 1.3262, + "step": 12760 + }, + { + "epoch": 0.027131550306770297, + "grad_norm": 0.3513452708721161, + "learning_rate": 0.00019997387763031919, + "loss": 1.293, + "step": 12770 + }, + { + "epoch": 0.02715279662650935, + "grad_norm": 0.41702812910079956, + "learning_rate": 0.00019997372172958776, + "loss": 1.3061, + "step": 12780 + }, + { + "epoch": 0.027174042946248405, + "grad_norm": 0.3278518319129944, + "learning_rate": 0.00019997356536508547, + "loss": 1.3033, + "step": 12790 + }, + { + "epoch": 0.02719528926598746, + "grad_norm": 0.3970349133014679, + "learning_rate": 0.00019997340853681307, + "loss": 1.2859, + "step": 12800 + }, + { + "epoch": 0.02721653558572651, + "grad_norm": 0.3539119064807892, + "learning_rate": 0.0001999732512447713, + "loss": 1.3088, + "step": 12810 + }, + { + "epoch": 0.027237781905465563, + "grad_norm": 0.34745603799819946, + "learning_rate": 0.00019997309348896087, + "loss": 1.2594, + "step": 12820 + }, + { + "epoch": 0.027259028225204616, + "grad_norm": 0.34619054198265076, + "learning_rate": 0.00019997293526938253, + "loss": 1.3004, + "step": 12830 + }, + { + "epoch": 0.027280274544943667, + "grad_norm": 0.40082764625549316, + "learning_rate": 0.00019997277658603702, + "loss": 1.2799, + "step": 12840 + }, + { + "epoch": 0.02730152086468272, + "grad_norm": 0.39590975642204285, + "learning_rate": 0.00019997261743892502, + "loss": 1.2875, + "step": 12850 + }, + { + "epoch": 0.027322767184421774, + "grad_norm": 0.34417515993118286, + "learning_rate": 0.00019997245782804737, + "loss": 1.2919, + "step": 12860 + }, + { + "epoch": 0.027344013504160825, + "grad_norm": 0.3346881568431854, + "learning_rate": 0.00019997229775340473, + "loss": 1.2568, + "step": 12870 + }, + { + "epoch": 0.02736525982389988, + "grad_norm": 0.3924204111099243, + "learning_rate": 0.00019997213721499783, + "loss": 1.289, + "step": 12880 + }, + { + "epoch": 0.027386506143638932, + "grad_norm": 0.429219514131546, + "learning_rate": 0.0001999719762128275, + "loss": 1.2728, + "step": 12890 + }, + { + "epoch": 0.027407752463377986, + "grad_norm": 0.32337313890457153, + "learning_rate": 0.0001999718147468944, + "loss": 1.2699, + "step": 12900 + }, + { + "epoch": 0.027428998783117036, + "grad_norm": 0.35570967197418213, + "learning_rate": 0.00019997165281719932, + "loss": 1.3086, + "step": 12910 + }, + { + "epoch": 0.02745024510285609, + "grad_norm": 0.3769350051879883, + "learning_rate": 0.000199971490423743, + "loss": 1.2941, + "step": 12920 + }, + { + "epoch": 0.027471491422595144, + "grad_norm": 0.4251541793346405, + "learning_rate": 0.00019997132756652622, + "loss": 1.2896, + "step": 12930 + }, + { + "epoch": 0.027492737742334194, + "grad_norm": 0.3966098725795746, + "learning_rate": 0.00019997116424554968, + "loss": 1.274, + "step": 12940 + }, + { + "epoch": 0.027513984062073248, + "grad_norm": 0.3386051654815674, + "learning_rate": 0.00019997100046081418, + "loss": 1.2967, + "step": 12950 + }, + { + "epoch": 0.0275352303818123, + "grad_norm": 0.3239937424659729, + "learning_rate": 0.0001999708362123205, + "loss": 1.2971, + "step": 12960 + }, + { + "epoch": 0.02755647670155135, + "grad_norm": 0.5803064703941345, + "learning_rate": 0.00019997067150006936, + "loss": 1.2804, + "step": 12970 + }, + { + "epoch": 0.027577723021290405, + "grad_norm": 0.4080747961997986, + "learning_rate": 0.0001999705063240615, + "loss": 1.3112, + "step": 12980 + }, + { + "epoch": 0.02759896934102946, + "grad_norm": 0.4086076617240906, + "learning_rate": 0.00019997034068429777, + "loss": 1.3044, + "step": 12990 + }, + { + "epoch": 0.027620215660768513, + "grad_norm": 0.35568761825561523, + "learning_rate": 0.0001999701745807789, + "loss": 1.2891, + "step": 13000 + }, + { + "epoch": 0.027641461980507563, + "grad_norm": 0.33371490240097046, + "learning_rate": 0.00019997000801350563, + "loss": 1.2882, + "step": 13010 + }, + { + "epoch": 0.027662708300246617, + "grad_norm": 0.31974488496780396, + "learning_rate": 0.00019996984098247876, + "loss": 1.2863, + "step": 13020 + }, + { + "epoch": 0.02768395461998567, + "grad_norm": 0.3738625943660736, + "learning_rate": 0.00019996967348769905, + "loss": 1.2538, + "step": 13030 + }, + { + "epoch": 0.02770520093972472, + "grad_norm": 0.36826571822166443, + "learning_rate": 0.0001999695055291673, + "loss": 1.2815, + "step": 13040 + }, + { + "epoch": 0.027726447259463775, + "grad_norm": 0.4074922800064087, + "learning_rate": 0.00019996933710688425, + "loss": 1.3297, + "step": 13050 + }, + { + "epoch": 0.02774769357920283, + "grad_norm": 0.3399704396724701, + "learning_rate": 0.00019996916822085075, + "loss": 1.2805, + "step": 13060 + }, + { + "epoch": 0.02776893989894188, + "grad_norm": 0.5272880792617798, + "learning_rate": 0.00019996899887106753, + "loss": 1.2975, + "step": 13070 + }, + { + "epoch": 0.027790186218680932, + "grad_norm": 0.5374834537506104, + "learning_rate": 0.00019996882905753534, + "loss": 1.248, + "step": 13080 + }, + { + "epoch": 0.027811432538419986, + "grad_norm": 0.42231687903404236, + "learning_rate": 0.00019996865878025509, + "loss": 1.2865, + "step": 13090 + }, + { + "epoch": 0.02783267885815904, + "grad_norm": 0.38991424441337585, + "learning_rate": 0.00019996848803922746, + "loss": 1.3288, + "step": 13100 + }, + { + "epoch": 0.02785392517789809, + "grad_norm": 0.3948550522327423, + "learning_rate": 0.00019996831683445328, + "loss": 1.2605, + "step": 13110 + }, + { + "epoch": 0.027875171497637144, + "grad_norm": 0.51406329870224, + "learning_rate": 0.00019996814516593333, + "loss": 1.2884, + "step": 13120 + }, + { + "epoch": 0.027896417817376198, + "grad_norm": 0.4475144147872925, + "learning_rate": 0.0001999679730336684, + "loss": 1.2708, + "step": 13130 + }, + { + "epoch": 0.027917664137115248, + "grad_norm": 0.42705655097961426, + "learning_rate": 0.00019996780043765933, + "loss": 1.2774, + "step": 13140 + }, + { + "epoch": 0.027938910456854302, + "grad_norm": 0.3492595851421356, + "learning_rate": 0.00019996762737790687, + "loss": 1.313, + "step": 13150 + }, + { + "epoch": 0.027960156776593355, + "grad_norm": 0.4762828052043915, + "learning_rate": 0.0001999674538544119, + "loss": 1.3125, + "step": 13160 + }, + { + "epoch": 0.027981403096332406, + "grad_norm": 0.3712509572505951, + "learning_rate": 0.00019996727986717515, + "loss": 1.2774, + "step": 13170 + }, + { + "epoch": 0.02800264941607146, + "grad_norm": 0.34350696206092834, + "learning_rate": 0.00019996710541619742, + "loss": 1.2772, + "step": 13180 + }, + { + "epoch": 0.028023895735810513, + "grad_norm": 0.37947219610214233, + "learning_rate": 0.00019996693050147962, + "loss": 1.2812, + "step": 13190 + }, + { + "epoch": 0.028045142055549564, + "grad_norm": 0.33421334624290466, + "learning_rate": 0.00019996675512302242, + "loss": 1.2857, + "step": 13200 + }, + { + "epoch": 0.028066388375288617, + "grad_norm": 0.3302818536758423, + "learning_rate": 0.00019996657928082675, + "loss": 1.267, + "step": 13210 + }, + { + "epoch": 0.02808763469502767, + "grad_norm": 0.3992621898651123, + "learning_rate": 0.00019996640297489335, + "loss": 1.3015, + "step": 13220 + }, + { + "epoch": 0.028108881014766725, + "grad_norm": 0.33518970012664795, + "learning_rate": 0.0001999662262052231, + "loss": 1.2763, + "step": 13230 + }, + { + "epoch": 0.028130127334505775, + "grad_norm": 0.33514687418937683, + "learning_rate": 0.00019996604897181674, + "loss": 1.2735, + "step": 13240 + }, + { + "epoch": 0.02815137365424483, + "grad_norm": 0.43553170561790466, + "learning_rate": 0.00019996587127467518, + "loss": 1.2775, + "step": 13250 + }, + { + "epoch": 0.028172619973983883, + "grad_norm": 0.3574298322200775, + "learning_rate": 0.0001999656931137992, + "loss": 1.2751, + "step": 13260 + }, + { + "epoch": 0.028193866293722933, + "grad_norm": 0.345941960811615, + "learning_rate": 0.00019996551448918964, + "loss": 1.2765, + "step": 13270 + }, + { + "epoch": 0.028215112613461987, + "grad_norm": 0.37367862462997437, + "learning_rate": 0.00019996533540084731, + "loss": 1.2719, + "step": 13280 + }, + { + "epoch": 0.02823635893320104, + "grad_norm": 0.5121581554412842, + "learning_rate": 0.00019996515584877304, + "loss": 1.261, + "step": 13290 + }, + { + "epoch": 0.02825760525294009, + "grad_norm": 0.4437972605228424, + "learning_rate": 0.00019996497583296772, + "loss": 1.3103, + "step": 13300 + }, + { + "epoch": 0.028278851572679144, + "grad_norm": 0.3266242742538452, + "learning_rate": 0.00019996479535343208, + "loss": 1.2806, + "step": 13310 + }, + { + "epoch": 0.028300097892418198, + "grad_norm": 0.3355073034763336, + "learning_rate": 0.00019996461441016704, + "loss": 1.2458, + "step": 13320 + }, + { + "epoch": 0.028321344212157252, + "grad_norm": 0.42394161224365234, + "learning_rate": 0.00019996443300317345, + "loss": 1.2973, + "step": 13330 + }, + { + "epoch": 0.028342590531896302, + "grad_norm": 0.3751457929611206, + "learning_rate": 0.00019996425113245209, + "loss": 1.2782, + "step": 13340 + }, + { + "epoch": 0.028363836851635356, + "grad_norm": 0.3850574791431427, + "learning_rate": 0.00019996406879800382, + "loss": 1.2787, + "step": 13350 + }, + { + "epoch": 0.02838508317137441, + "grad_norm": 0.38333263993263245, + "learning_rate": 0.0001999638859998295, + "loss": 1.2757, + "step": 13360 + }, + { + "epoch": 0.02840632949111346, + "grad_norm": 0.3686596155166626, + "learning_rate": 0.00019996370273792996, + "loss": 1.2549, + "step": 13370 + }, + { + "epoch": 0.028427575810852514, + "grad_norm": 0.36714428663253784, + "learning_rate": 0.0001999635190123061, + "loss": 1.2579, + "step": 13380 + }, + { + "epoch": 0.028448822130591567, + "grad_norm": 0.3491072654724121, + "learning_rate": 0.00019996333482295872, + "loss": 1.299, + "step": 13390 + }, + { + "epoch": 0.028470068450330618, + "grad_norm": 0.3488186299800873, + "learning_rate": 0.0001999631501698887, + "loss": 1.2682, + "step": 13400 + }, + { + "epoch": 0.02849131477006967, + "grad_norm": 0.3316390812397003, + "learning_rate": 0.00019996296505309688, + "loss": 1.2991, + "step": 13410 + }, + { + "epoch": 0.028512561089808725, + "grad_norm": 0.354986310005188, + "learning_rate": 0.00019996277947258411, + "loss": 1.265, + "step": 13420 + }, + { + "epoch": 0.02853380740954778, + "grad_norm": 0.3361266851425171, + "learning_rate": 0.00019996259342835128, + "loss": 1.249, + "step": 13430 + }, + { + "epoch": 0.02855505372928683, + "grad_norm": 0.3540953993797302, + "learning_rate": 0.00019996240692039927, + "loss": 1.2715, + "step": 13440 + }, + { + "epoch": 0.028576300049025883, + "grad_norm": 0.42566192150115967, + "learning_rate": 0.0001999622199487289, + "loss": 1.247, + "step": 13450 + }, + { + "epoch": 0.028597546368764937, + "grad_norm": 0.34134209156036377, + "learning_rate": 0.00019996203251334104, + "loss": 1.3023, + "step": 13460 + }, + { + "epoch": 0.028618792688503987, + "grad_norm": 0.3461844325065613, + "learning_rate": 0.00019996184461423656, + "loss": 1.2706, + "step": 13470 + }, + { + "epoch": 0.02864003900824304, + "grad_norm": 0.4711892008781433, + "learning_rate": 0.00019996165625141637, + "loss": 1.2587, + "step": 13480 + }, + { + "epoch": 0.028661285327982094, + "grad_norm": 0.35004571080207825, + "learning_rate": 0.00019996146742488134, + "loss": 1.2621, + "step": 13490 + }, + { + "epoch": 0.028682531647721145, + "grad_norm": 0.3495234549045563, + "learning_rate": 0.00019996127813463231, + "loss": 1.288, + "step": 13500 + }, + { + "epoch": 0.0287037779674602, + "grad_norm": 0.39064472913742065, + "learning_rate": 0.00019996108838067012, + "loss": 1.2782, + "step": 13510 + }, + { + "epoch": 0.028725024287199252, + "grad_norm": 0.3414998948574066, + "learning_rate": 0.00019996089816299578, + "loss": 1.2845, + "step": 13520 + }, + { + "epoch": 0.028746270606938306, + "grad_norm": 0.518709123134613, + "learning_rate": 0.00019996070748161006, + "loss": 1.3246, + "step": 13530 + }, + { + "epoch": 0.028767516926677356, + "grad_norm": 0.42235279083251953, + "learning_rate": 0.00019996051633651386, + "loss": 1.2527, + "step": 13540 + }, + { + "epoch": 0.02878876324641641, + "grad_norm": 0.3822650909423828, + "learning_rate": 0.0001999603247277081, + "loss": 1.2718, + "step": 13550 + }, + { + "epoch": 0.028810009566155464, + "grad_norm": 0.38291308283805847, + "learning_rate": 0.0001999601326551937, + "loss": 1.2646, + "step": 13560 + }, + { + "epoch": 0.028831255885894514, + "grad_norm": 0.3220558166503906, + "learning_rate": 0.00019995994011897145, + "loss": 1.275, + "step": 13570 + }, + { + "epoch": 0.028852502205633568, + "grad_norm": 0.34875422716140747, + "learning_rate": 0.00019995974711904232, + "loss": 1.28, + "step": 13580 + }, + { + "epoch": 0.02887374852537262, + "grad_norm": 0.36955636739730835, + "learning_rate": 0.0001999595536554072, + "loss": 1.295, + "step": 13590 + }, + { + "epoch": 0.028894994845111672, + "grad_norm": 0.45685267448425293, + "learning_rate": 0.00019995935972806693, + "loss": 1.2336, + "step": 13600 + }, + { + "epoch": 0.028916241164850726, + "grad_norm": 0.325433611869812, + "learning_rate": 0.0001999591653370225, + "loss": 1.2848, + "step": 13610 + }, + { + "epoch": 0.02893748748458978, + "grad_norm": 0.33891165256500244, + "learning_rate": 0.00019995897048227473, + "loss": 1.2878, + "step": 13620 + }, + { + "epoch": 0.028958733804328833, + "grad_norm": 0.3262781500816345, + "learning_rate": 0.00019995877516382457, + "loss": 1.2712, + "step": 13630 + }, + { + "epoch": 0.028979980124067883, + "grad_norm": 0.4527239203453064, + "learning_rate": 0.0001999585793816729, + "loss": 1.2749, + "step": 13640 + }, + { + "epoch": 0.029001226443806937, + "grad_norm": 0.32147401571273804, + "learning_rate": 0.00019995838313582068, + "loss": 1.289, + "step": 13650 + }, + { + "epoch": 0.02902247276354599, + "grad_norm": 0.3406784236431122, + "learning_rate": 0.00019995818642626876, + "loss": 1.2979, + "step": 13660 + }, + { + "epoch": 0.02904371908328504, + "grad_norm": 0.3593210279941559, + "learning_rate": 0.00019995798925301808, + "loss": 1.2894, + "step": 13670 + }, + { + "epoch": 0.029064965403024095, + "grad_norm": 0.31777748465538025, + "learning_rate": 0.00019995779161606953, + "loss": 1.2519, + "step": 13680 + }, + { + "epoch": 0.02908621172276315, + "grad_norm": 0.5061537027359009, + "learning_rate": 0.00019995759351542405, + "loss": 1.2577, + "step": 13690 + }, + { + "epoch": 0.0291074580425022, + "grad_norm": 0.3594975769519806, + "learning_rate": 0.0001999573949510826, + "loss": 1.313, + "step": 13700 + }, + { + "epoch": 0.029128704362241253, + "grad_norm": 0.40915602445602417, + "learning_rate": 0.000199957195923046, + "loss": 1.2752, + "step": 13710 + }, + { + "epoch": 0.029149950681980306, + "grad_norm": 0.5553947687149048, + "learning_rate": 0.00019995699643131525, + "loss": 1.2516, + "step": 13720 + }, + { + "epoch": 0.029171197001719357, + "grad_norm": 0.35843950510025024, + "learning_rate": 0.00019995679647589126, + "loss": 1.2779, + "step": 13730 + }, + { + "epoch": 0.02919244332145841, + "grad_norm": 0.37457653880119324, + "learning_rate": 0.00019995659605677495, + "loss": 1.2705, + "step": 13740 + }, + { + "epoch": 0.029213689641197464, + "grad_norm": 0.42225325107574463, + "learning_rate": 0.00019995639517396722, + "loss": 1.2515, + "step": 13750 + }, + { + "epoch": 0.029234935960936518, + "grad_norm": 0.37594228982925415, + "learning_rate": 0.0001999561938274691, + "loss": 1.2423, + "step": 13760 + }, + { + "epoch": 0.029256182280675568, + "grad_norm": 0.34361690282821655, + "learning_rate": 0.00019995599201728142, + "loss": 1.296, + "step": 13770 + }, + { + "epoch": 0.029277428600414622, + "grad_norm": 0.38517430424690247, + "learning_rate": 0.00019995578974340514, + "loss": 1.2815, + "step": 13780 + }, + { + "epoch": 0.029298674920153676, + "grad_norm": 0.4176165461540222, + "learning_rate": 0.00019995558700584124, + "loss": 1.3108, + "step": 13790 + }, + { + "epoch": 0.029319921239892726, + "grad_norm": 0.4373164772987366, + "learning_rate": 0.00019995538380459063, + "loss": 1.3076, + "step": 13800 + }, + { + "epoch": 0.02934116755963178, + "grad_norm": 0.34318822622299194, + "learning_rate": 0.00019995518013965423, + "loss": 1.2887, + "step": 13810 + }, + { + "epoch": 0.029362413879370833, + "grad_norm": 0.46214982867240906, + "learning_rate": 0.00019995497601103302, + "loss": 1.2362, + "step": 13820 + }, + { + "epoch": 0.029383660199109884, + "grad_norm": 0.5334571599960327, + "learning_rate": 0.00019995477141872799, + "loss": 1.3118, + "step": 13830 + }, + { + "epoch": 0.029404906518848938, + "grad_norm": 0.3634757101535797, + "learning_rate": 0.00019995456636273998, + "loss": 1.2225, + "step": 13840 + }, + { + "epoch": 0.02942615283858799, + "grad_norm": 0.46214285492897034, + "learning_rate": 0.00019995436084307001, + "loss": 1.2758, + "step": 13850 + }, + { + "epoch": 0.029447399158327045, + "grad_norm": 0.36621013283729553, + "learning_rate": 0.00019995415485971903, + "loss": 1.254, + "step": 13860 + }, + { + "epoch": 0.029468645478066095, + "grad_norm": 0.43865469098091125, + "learning_rate": 0.00019995394841268799, + "loss": 1.3141, + "step": 13870 + }, + { + "epoch": 0.02948989179780515, + "grad_norm": 0.334249347448349, + "learning_rate": 0.00019995374150197782, + "loss": 1.2997, + "step": 13880 + }, + { + "epoch": 0.029511138117544203, + "grad_norm": 0.33159032464027405, + "learning_rate": 0.00019995353412758952, + "loss": 1.2736, + "step": 13890 + }, + { + "epoch": 0.029532384437283253, + "grad_norm": 0.41078272461891174, + "learning_rate": 0.00019995332628952405, + "loss": 1.2481, + "step": 13900 + }, + { + "epoch": 0.029553630757022307, + "grad_norm": 0.3635225296020508, + "learning_rate": 0.00019995311798778236, + "loss": 1.2901, + "step": 13910 + }, + { + "epoch": 0.02957487707676136, + "grad_norm": 0.3608478009700775, + "learning_rate": 0.0001999529092223654, + "loss": 1.2911, + "step": 13920 + }, + { + "epoch": 0.02959612339650041, + "grad_norm": 0.33249956369400024, + "learning_rate": 0.00019995269999327416, + "loss": 1.2803, + "step": 13930 + }, + { + "epoch": 0.029617369716239465, + "grad_norm": 0.3411157727241516, + "learning_rate": 0.0001999524903005096, + "loss": 1.2638, + "step": 13940 + }, + { + "epoch": 0.02963861603597852, + "grad_norm": 0.3666328489780426, + "learning_rate": 0.0001999522801440727, + "loss": 1.246, + "step": 13950 + }, + { + "epoch": 0.029659862355717572, + "grad_norm": 0.3775026202201843, + "learning_rate": 0.00019995206952396445, + "loss": 1.3184, + "step": 13960 + }, + { + "epoch": 0.029681108675456622, + "grad_norm": 0.37314504384994507, + "learning_rate": 0.0001999518584401858, + "loss": 1.2815, + "step": 13970 + }, + { + "epoch": 0.029702354995195676, + "grad_norm": 0.44641393423080444, + "learning_rate": 0.00019995164689273772, + "loss": 1.2928, + "step": 13980 + }, + { + "epoch": 0.02972360131493473, + "grad_norm": 0.42537611722946167, + "learning_rate": 0.00019995143488162124, + "loss": 1.2964, + "step": 13990 + }, + { + "epoch": 0.02974484763467378, + "grad_norm": 0.32955029606819153, + "learning_rate": 0.00019995122240683733, + "loss": 1.2477, + "step": 14000 + }, + { + "epoch": 0.029766093954412834, + "grad_norm": 0.38156238198280334, + "learning_rate": 0.00019995100946838693, + "loss": 1.2823, + "step": 14010 + }, + { + "epoch": 0.029787340274151888, + "grad_norm": 0.3442648649215698, + "learning_rate": 0.00019995079606627107, + "loss": 1.2568, + "step": 14020 + }, + { + "epoch": 0.029808586593890938, + "grad_norm": 0.4730078876018524, + "learning_rate": 0.00019995058220049073, + "loss": 1.3081, + "step": 14030 + }, + { + "epoch": 0.02982983291362999, + "grad_norm": 0.32417139410972595, + "learning_rate": 0.0001999503678710469, + "loss": 1.2702, + "step": 14040 + }, + { + "epoch": 0.029851079233369045, + "grad_norm": 0.43689021468162537, + "learning_rate": 0.00019995015307794055, + "loss": 1.237, + "step": 14050 + }, + { + "epoch": 0.0298723255531081, + "grad_norm": 0.5414863228797913, + "learning_rate": 0.0001999499378211727, + "loss": 1.2654, + "step": 14060 + }, + { + "epoch": 0.02989357187284715, + "grad_norm": 0.34272947907447815, + "learning_rate": 0.0001999497221007444, + "loss": 1.3074, + "step": 14070 + }, + { + "epoch": 0.029914818192586203, + "grad_norm": 0.45459893345832825, + "learning_rate": 0.00019994950591665654, + "loss": 1.2758, + "step": 14080 + }, + { + "epoch": 0.029936064512325257, + "grad_norm": 0.4766577482223511, + "learning_rate": 0.00019994928926891022, + "loss": 1.2628, + "step": 14090 + }, + { + "epoch": 0.029957310832064307, + "grad_norm": 0.43908265233039856, + "learning_rate": 0.0001999490721575064, + "loss": 1.245, + "step": 14100 + }, + { + "epoch": 0.02997855715180336, + "grad_norm": 0.5082383155822754, + "learning_rate": 0.00019994885458244613, + "loss": 1.287, + "step": 14110 + }, + { + "epoch": 0.029999803471542415, + "grad_norm": 0.5202736854553223, + "learning_rate": 0.00019994863654373033, + "loss": 1.2755, + "step": 14120 + }, + { + "epoch": 0.030021049791281465, + "grad_norm": 0.35523271560668945, + "learning_rate": 0.00019994841804136008, + "loss": 1.2857, + "step": 14130 + }, + { + "epoch": 0.03004229611102052, + "grad_norm": 0.41309764981269836, + "learning_rate": 0.0001999481990753364, + "loss": 1.2942, + "step": 14140 + }, + { + "epoch": 0.030063542430759572, + "grad_norm": 0.3433438241481781, + "learning_rate": 0.00019994797964566026, + "loss": 1.3071, + "step": 14150 + }, + { + "epoch": 0.030084788750498626, + "grad_norm": 0.3363725244998932, + "learning_rate": 0.0001999477597523327, + "loss": 1.2961, + "step": 14160 + }, + { + "epoch": 0.030106035070237677, + "grad_norm": 0.3387785851955414, + "learning_rate": 0.00019994753939535479, + "loss": 1.3019, + "step": 14170 + }, + { + "epoch": 0.03012728138997673, + "grad_norm": 0.36963409185409546, + "learning_rate": 0.00019994731857472746, + "loss": 1.2891, + "step": 14180 + }, + { + "epoch": 0.030148527709715784, + "grad_norm": 0.3349338173866272, + "learning_rate": 0.0001999470972904518, + "loss": 1.3276, + "step": 14190 + }, + { + "epoch": 0.030169774029454834, + "grad_norm": 0.3104879856109619, + "learning_rate": 0.0001999468755425288, + "loss": 1.2644, + "step": 14200 + }, + { + "epoch": 0.030191020349193888, + "grad_norm": 0.55224609375, + "learning_rate": 0.0001999466533309595, + "loss": 1.2698, + "step": 14210 + }, + { + "epoch": 0.030212266668932942, + "grad_norm": 0.36400970816612244, + "learning_rate": 0.00019994643065574498, + "loss": 1.2698, + "step": 14220 + }, + { + "epoch": 0.030233512988671992, + "grad_norm": 0.36447665095329285, + "learning_rate": 0.00019994620751688622, + "loss": 1.2516, + "step": 14230 + }, + { + "epoch": 0.030254759308411046, + "grad_norm": 0.42988303303718567, + "learning_rate": 0.00019994598391438425, + "loss": 1.2732, + "step": 14240 + }, + { + "epoch": 0.0302760056281501, + "grad_norm": 0.33076268434524536, + "learning_rate": 0.0001999457598482401, + "loss": 1.2564, + "step": 14250 + }, + { + "epoch": 0.03029725194788915, + "grad_norm": 0.356808602809906, + "learning_rate": 0.00019994553531845486, + "loss": 1.2612, + "step": 14260 + }, + { + "epoch": 0.030318498267628204, + "grad_norm": 0.33413854241371155, + "learning_rate": 0.00019994531032502952, + "loss": 1.2569, + "step": 14270 + }, + { + "epoch": 0.030339744587367257, + "grad_norm": 0.32851123809814453, + "learning_rate": 0.00019994508486796515, + "loss": 1.2836, + "step": 14280 + }, + { + "epoch": 0.03036099090710631, + "grad_norm": 0.3438243269920349, + "learning_rate": 0.0001999448589472628, + "loss": 1.2831, + "step": 14290 + }, + { + "epoch": 0.03038223722684536, + "grad_norm": 0.3600449562072754, + "learning_rate": 0.0001999446325629235, + "loss": 1.282, + "step": 14300 + }, + { + "epoch": 0.030403483546584415, + "grad_norm": 0.4052808880805969, + "learning_rate": 0.00019994440571494834, + "loss": 1.2543, + "step": 14310 + }, + { + "epoch": 0.03042472986632347, + "grad_norm": 0.40478575229644775, + "learning_rate": 0.00019994417840333832, + "loss": 1.295, + "step": 14320 + }, + { + "epoch": 0.03044597618606252, + "grad_norm": 0.357112854719162, + "learning_rate": 0.0001999439506280945, + "loss": 1.29, + "step": 14330 + }, + { + "epoch": 0.030467222505801573, + "grad_norm": 0.46449750661849976, + "learning_rate": 0.000199943722389218, + "loss": 1.2607, + "step": 14340 + }, + { + "epoch": 0.030488468825540627, + "grad_norm": 0.4142216444015503, + "learning_rate": 0.0001999434936867098, + "loss": 1.2988, + "step": 14350 + }, + { + "epoch": 0.030509715145279677, + "grad_norm": 0.3795907497406006, + "learning_rate": 0.00019994326452057098, + "loss": 1.2899, + "step": 14360 + }, + { + "epoch": 0.03053096146501873, + "grad_norm": 0.38817375898361206, + "learning_rate": 0.00019994303489080263, + "loss": 1.2698, + "step": 14370 + }, + { + "epoch": 0.030552207784757784, + "grad_norm": 0.3691042363643646, + "learning_rate": 0.00019994280479740582, + "loss": 1.2724, + "step": 14380 + }, + { + "epoch": 0.030573454104496838, + "grad_norm": 0.4073532521724701, + "learning_rate": 0.00019994257424038158, + "loss": 1.2692, + "step": 14390 + }, + { + "epoch": 0.03059470042423589, + "grad_norm": 0.3791826069355011, + "learning_rate": 0.00019994234321973103, + "loss": 1.261, + "step": 14400 + }, + { + "epoch": 0.030615946743974942, + "grad_norm": 0.4001646637916565, + "learning_rate": 0.0001999421117354552, + "loss": 1.2772, + "step": 14410 + }, + { + "epoch": 0.030637193063713996, + "grad_norm": 0.3179972767829895, + "learning_rate": 0.00019994187978755515, + "loss": 1.2516, + "step": 14420 + }, + { + "epoch": 0.030658439383453046, + "grad_norm": 0.33542612195014954, + "learning_rate": 0.00019994164737603196, + "loss": 1.2633, + "step": 14430 + }, + { + "epoch": 0.0306796857031921, + "grad_norm": 0.37289318442344666, + "learning_rate": 0.00019994141450088677, + "loss": 1.3009, + "step": 14440 + }, + { + "epoch": 0.030700932022931154, + "grad_norm": 0.4535268545150757, + "learning_rate": 0.00019994118116212063, + "loss": 1.3039, + "step": 14450 + }, + { + "epoch": 0.030722178342670204, + "grad_norm": 0.36888983845710754, + "learning_rate": 0.00019994094735973456, + "loss": 1.3015, + "step": 14460 + }, + { + "epoch": 0.030743424662409258, + "grad_norm": 0.34519606828689575, + "learning_rate": 0.00019994071309372972, + "loss": 1.2723, + "step": 14470 + }, + { + "epoch": 0.03076467098214831, + "grad_norm": 0.3603862524032593, + "learning_rate": 0.00019994047836410718, + "loss": 1.2943, + "step": 14480 + }, + { + "epoch": 0.030785917301887365, + "grad_norm": 0.3322259187698364, + "learning_rate": 0.000199940243170868, + "loss": 1.2868, + "step": 14490 + }, + { + "epoch": 0.030807163621626416, + "grad_norm": 0.3318069279193878, + "learning_rate": 0.00019994000751401335, + "loss": 1.2859, + "step": 14500 + }, + { + "epoch": 0.03082840994136547, + "grad_norm": 0.34036770462989807, + "learning_rate": 0.0001999397713935442, + "loss": 1.2516, + "step": 14510 + }, + { + "epoch": 0.030849656261104523, + "grad_norm": 0.3331727981567383, + "learning_rate": 0.00019993953480946174, + "loss": 1.2663, + "step": 14520 + }, + { + "epoch": 0.030870902580843573, + "grad_norm": 0.49089735746383667, + "learning_rate": 0.00019993929776176704, + "loss": 1.2656, + "step": 14530 + }, + { + "epoch": 0.030892148900582627, + "grad_norm": 0.4451941251754761, + "learning_rate": 0.0001999390602504612, + "loss": 1.2722, + "step": 14540 + }, + { + "epoch": 0.03091339522032168, + "grad_norm": 0.3955433666706085, + "learning_rate": 0.0001999388222755453, + "loss": 1.2452, + "step": 14550 + }, + { + "epoch": 0.03093464154006073, + "grad_norm": 0.4301290214061737, + "learning_rate": 0.00019993858383702046, + "loss": 1.2797, + "step": 14560 + }, + { + "epoch": 0.030955887859799785, + "grad_norm": 0.4052306115627289, + "learning_rate": 0.00019993834493488782, + "loss": 1.3053, + "step": 14570 + }, + { + "epoch": 0.03097713417953884, + "grad_norm": 0.5228415131568909, + "learning_rate": 0.00019993810556914846, + "loss": 1.2841, + "step": 14580 + }, + { + "epoch": 0.030998380499277892, + "grad_norm": 0.37277454137802124, + "learning_rate": 0.00019993786573980343, + "loss": 1.2899, + "step": 14590 + }, + { + "epoch": 0.031019626819016943, + "grad_norm": 0.499331533908844, + "learning_rate": 0.00019993762544685397, + "loss": 1.2667, + "step": 14600 + }, + { + "epoch": 0.031040873138755996, + "grad_norm": 0.36541712284088135, + "learning_rate": 0.0001999373846903011, + "loss": 1.2446, + "step": 14610 + }, + { + "epoch": 0.03106211945849505, + "grad_norm": 0.34404948353767395, + "learning_rate": 0.00019993714347014597, + "loss": 1.2858, + "step": 14620 + }, + { + "epoch": 0.0310833657782341, + "grad_norm": 0.3525635898113251, + "learning_rate": 0.00019993690178638968, + "loss": 1.3088, + "step": 14630 + }, + { + "epoch": 0.031104612097973154, + "grad_norm": 0.4035264849662781, + "learning_rate": 0.00019993665963903334, + "loss": 1.2501, + "step": 14640 + }, + { + "epoch": 0.031125858417712208, + "grad_norm": 0.3281551003456116, + "learning_rate": 0.00019993641702807814, + "loss": 1.2686, + "step": 14650 + }, + { + "epoch": 0.031147104737451258, + "grad_norm": 0.5075016021728516, + "learning_rate": 0.00019993617395352512, + "loss": 1.325, + "step": 14660 + }, + { + "epoch": 0.031168351057190312, + "grad_norm": 0.40143775939941406, + "learning_rate": 0.00019993593041537548, + "loss": 1.2489, + "step": 14670 + }, + { + "epoch": 0.031189597376929366, + "grad_norm": 0.3652079701423645, + "learning_rate": 0.00019993568641363032, + "loss": 1.2889, + "step": 14680 + }, + { + "epoch": 0.03121084369666842, + "grad_norm": 0.4716062843799591, + "learning_rate": 0.00019993544194829078, + "loss": 1.2868, + "step": 14690 + }, + { + "epoch": 0.03123209001640747, + "grad_norm": 0.36881768703460693, + "learning_rate": 0.00019993519701935796, + "loss": 1.2933, + "step": 14700 + }, + { + "epoch": 0.03125333633614652, + "grad_norm": 0.32998985052108765, + "learning_rate": 0.00019993495162683303, + "loss": 1.246, + "step": 14710 + }, + { + "epoch": 0.03127458265588558, + "grad_norm": 0.4571867883205414, + "learning_rate": 0.00019993470577071712, + "loss": 1.2942, + "step": 14720 + }, + { + "epoch": 0.03129582897562463, + "grad_norm": 0.35258087515830994, + "learning_rate": 0.00019993445945101134, + "loss": 1.3129, + "step": 14730 + }, + { + "epoch": 0.031317075295363685, + "grad_norm": 0.35532641410827637, + "learning_rate": 0.00019993421266771693, + "loss": 1.325, + "step": 14740 + }, + { + "epoch": 0.031338321615102735, + "grad_norm": 0.33986592292785645, + "learning_rate": 0.00019993396542083492, + "loss": 1.3013, + "step": 14750 + }, + { + "epoch": 0.031359567934841785, + "grad_norm": 0.3425050675868988, + "learning_rate": 0.00019993371771036652, + "loss": 1.2291, + "step": 14760 + }, + { + "epoch": 0.03138081425458084, + "grad_norm": 0.3808043599128723, + "learning_rate": 0.00019993346953631284, + "loss": 1.2833, + "step": 14770 + }, + { + "epoch": 0.03140206057431989, + "grad_norm": 0.4066392183303833, + "learning_rate": 0.00019993322089867507, + "loss": 1.2665, + "step": 14780 + }, + { + "epoch": 0.03142330689405894, + "grad_norm": 0.38047507405281067, + "learning_rate": 0.00019993297179745433, + "loss": 1.2771, + "step": 14790 + }, + { + "epoch": 0.031444553213798, + "grad_norm": 0.406227707862854, + "learning_rate": 0.00019993272223265183, + "loss": 1.2865, + "step": 14800 + }, + { + "epoch": 0.03146579953353705, + "grad_norm": 0.5128522515296936, + "learning_rate": 0.00019993247220426868, + "loss": 1.2621, + "step": 14810 + }, + { + "epoch": 0.0314870458532761, + "grad_norm": 0.3749784529209137, + "learning_rate": 0.00019993222171230604, + "loss": 1.2666, + "step": 14820 + }, + { + "epoch": 0.03150829217301516, + "grad_norm": 0.3650704622268677, + "learning_rate": 0.00019993197075676507, + "loss": 1.2726, + "step": 14830 + }, + { + "epoch": 0.03152953849275421, + "grad_norm": 0.35300856828689575, + "learning_rate": 0.000199931719337647, + "loss": 1.2785, + "step": 14840 + }, + { + "epoch": 0.03155078481249326, + "grad_norm": 0.3928791284561157, + "learning_rate": 0.00019993146745495287, + "loss": 1.3147, + "step": 14850 + }, + { + "epoch": 0.031572031132232316, + "grad_norm": 0.4494268596172333, + "learning_rate": 0.00019993121510868397, + "loss": 1.2335, + "step": 14860 + }, + { + "epoch": 0.031593277451971366, + "grad_norm": 0.4268462061882019, + "learning_rate": 0.00019993096229884143, + "loss": 1.2606, + "step": 14870 + }, + { + "epoch": 0.031614523771710416, + "grad_norm": 0.43217942118644714, + "learning_rate": 0.00019993070902542642, + "loss": 1.2833, + "step": 14880 + }, + { + "epoch": 0.031635770091449474, + "grad_norm": 0.3238803446292877, + "learning_rate": 0.00019993045528844008, + "loss": 1.2542, + "step": 14890 + }, + { + "epoch": 0.031657016411188524, + "grad_norm": 0.43694010376930237, + "learning_rate": 0.00019993020108788364, + "loss": 1.2402, + "step": 14900 + }, + { + "epoch": 0.031678262730927574, + "grad_norm": 0.33208972215652466, + "learning_rate": 0.00019992994642375824, + "loss": 1.2836, + "step": 14910 + }, + { + "epoch": 0.03169950905066663, + "grad_norm": 0.3904381990432739, + "learning_rate": 0.00019992969129606507, + "loss": 1.2776, + "step": 14920 + }, + { + "epoch": 0.03172075537040568, + "grad_norm": 0.34953391551971436, + "learning_rate": 0.00019992943570480533, + "loss": 1.2987, + "step": 14930 + }, + { + "epoch": 0.03174200169014473, + "grad_norm": 0.3406221568584442, + "learning_rate": 0.00019992917964998022, + "loss": 1.2742, + "step": 14940 + }, + { + "epoch": 0.03176324800988379, + "grad_norm": 0.38346627354621887, + "learning_rate": 0.00019992892313159088, + "loss": 1.2635, + "step": 14950 + }, + { + "epoch": 0.03178449432962284, + "grad_norm": 0.3405255675315857, + "learning_rate": 0.00019992866614963853, + "loss": 1.272, + "step": 14960 + }, + { + "epoch": 0.0318057406493619, + "grad_norm": 0.4129686951637268, + "learning_rate": 0.00019992840870412435, + "loss": 1.2669, + "step": 14970 + }, + { + "epoch": 0.03182698696910095, + "grad_norm": 0.3885240852832794, + "learning_rate": 0.00019992815079504956, + "loss": 1.2344, + "step": 14980 + }, + { + "epoch": 0.03184823328884, + "grad_norm": 0.36981818079948425, + "learning_rate": 0.00019992789242241532, + "loss": 1.299, + "step": 14990 + }, + { + "epoch": 0.031869479608579054, + "grad_norm": 0.35504084825515747, + "learning_rate": 0.00019992763358622287, + "loss": 1.2537, + "step": 15000 + }, + { + "epoch": 0.031890725928318105, + "grad_norm": 0.39428818225860596, + "learning_rate": 0.0001999273742864734, + "loss": 1.2843, + "step": 15010 + }, + { + "epoch": 0.031911972248057155, + "grad_norm": 0.3261406421661377, + "learning_rate": 0.00019992711452316805, + "loss": 1.2294, + "step": 15020 + }, + { + "epoch": 0.03193321856779621, + "grad_norm": 0.468903124332428, + "learning_rate": 0.0001999268542963081, + "loss": 1.2991, + "step": 15030 + }, + { + "epoch": 0.03195446488753526, + "grad_norm": 0.34278786182403564, + "learning_rate": 0.00019992659360589473, + "loss": 1.3048, + "step": 15040 + }, + { + "epoch": 0.03197571120727431, + "grad_norm": 0.5257402062416077, + "learning_rate": 0.00019992633245192914, + "loss": 1.2736, + "step": 15050 + }, + { + "epoch": 0.03199695752701337, + "grad_norm": 0.5999605655670166, + "learning_rate": 0.00019992607083441256, + "loss": 1.2306, + "step": 15060 + }, + { + "epoch": 0.03201820384675242, + "grad_norm": 0.3719445765018463, + "learning_rate": 0.00019992580875334622, + "loss": 1.2246, + "step": 15070 + }, + { + "epoch": 0.03203945016649147, + "grad_norm": 0.39283740520477295, + "learning_rate": 0.0001999255462087313, + "loss": 1.2734, + "step": 15080 + }, + { + "epoch": 0.03206069648623053, + "grad_norm": 0.344653457403183, + "learning_rate": 0.00019992528320056903, + "loss": 1.273, + "step": 15090 + }, + { + "epoch": 0.03208194280596958, + "grad_norm": 0.3921027183532715, + "learning_rate": 0.00019992501972886063, + "loss": 1.2744, + "step": 15100 + }, + { + "epoch": 0.03210318912570863, + "grad_norm": 0.3433869779109955, + "learning_rate": 0.0001999247557936073, + "loss": 1.2829, + "step": 15110 + }, + { + "epoch": 0.032124435445447685, + "grad_norm": 0.370757132768631, + "learning_rate": 0.00019992449139481032, + "loss": 1.2753, + "step": 15120 + }, + { + "epoch": 0.032145681765186736, + "grad_norm": 0.3839212954044342, + "learning_rate": 0.00019992422653247087, + "loss": 1.2894, + "step": 15130 + }, + { + "epoch": 0.032166928084925786, + "grad_norm": 0.39270198345184326, + "learning_rate": 0.00019992396120659017, + "loss": 1.2278, + "step": 15140 + }, + { + "epoch": 0.03218817440466484, + "grad_norm": 0.36751046776771545, + "learning_rate": 0.00019992369541716953, + "loss": 1.2486, + "step": 15150 + }, + { + "epoch": 0.032209420724403894, + "grad_norm": 0.35467642545700073, + "learning_rate": 0.00019992342916421007, + "loss": 1.2722, + "step": 15160 + }, + { + "epoch": 0.03223066704414295, + "grad_norm": 0.3968997299671173, + "learning_rate": 0.00019992316244771312, + "loss": 1.2997, + "step": 15170 + }, + { + "epoch": 0.032251913363882, + "grad_norm": 0.42520871758461, + "learning_rate": 0.00019992289526767984, + "loss": 1.2686, + "step": 15180 + }, + { + "epoch": 0.03227315968362105, + "grad_norm": 0.3342803716659546, + "learning_rate": 0.00019992262762411156, + "loss": 1.3029, + "step": 15190 + }, + { + "epoch": 0.03229440600336011, + "grad_norm": 0.4276069700717926, + "learning_rate": 0.00019992235951700943, + "loss": 1.2231, + "step": 15200 + }, + { + "epoch": 0.03231565232309916, + "grad_norm": 0.42727893590927124, + "learning_rate": 0.00019992209094637473, + "loss": 1.2775, + "step": 15210 + }, + { + "epoch": 0.03233689864283821, + "grad_norm": 0.3702860474586487, + "learning_rate": 0.00019992182191220872, + "loss": 1.2836, + "step": 15220 + }, + { + "epoch": 0.032358144962577266, + "grad_norm": 0.36330029368400574, + "learning_rate": 0.00019992155241451264, + "loss": 1.2457, + "step": 15230 + }, + { + "epoch": 0.03237939128231632, + "grad_norm": 0.5436329245567322, + "learning_rate": 0.00019992128245328777, + "loss": 1.24, + "step": 15240 + }, + { + "epoch": 0.03240063760205537, + "grad_norm": 0.5102641582489014, + "learning_rate": 0.00019992101202853527, + "loss": 1.2776, + "step": 15250 + }, + { + "epoch": 0.032421883921794424, + "grad_norm": 0.46824702620506287, + "learning_rate": 0.00019992074114025648, + "loss": 1.266, + "step": 15260 + }, + { + "epoch": 0.032443130241533474, + "grad_norm": 0.4613623321056366, + "learning_rate": 0.00019992046978845266, + "loss": 1.2413, + "step": 15270 + }, + { + "epoch": 0.032464376561272525, + "grad_norm": 0.8026008009910583, + "learning_rate": 0.00019992019797312498, + "loss": 1.2609, + "step": 15280 + }, + { + "epoch": 0.03248562288101158, + "grad_norm": 0.37224283814430237, + "learning_rate": 0.0001999199256942748, + "loss": 1.2871, + "step": 15290 + }, + { + "epoch": 0.03250686920075063, + "grad_norm": 0.4019428491592407, + "learning_rate": 0.00019991965295190333, + "loss": 1.2893, + "step": 15300 + }, + { + "epoch": 0.03252811552048968, + "grad_norm": 0.5020738840103149, + "learning_rate": 0.00019991937974601185, + "loss": 1.2852, + "step": 15310 + }, + { + "epoch": 0.03254936184022874, + "grad_norm": 0.4568564295768738, + "learning_rate": 0.00019991910607660164, + "loss": 1.2705, + "step": 15320 + }, + { + "epoch": 0.03257060815996779, + "grad_norm": 0.42278704047203064, + "learning_rate": 0.00019991883194367396, + "loss": 1.2397, + "step": 15330 + }, + { + "epoch": 0.03259185447970684, + "grad_norm": 0.3175494968891144, + "learning_rate": 0.00019991855734723007, + "loss": 1.2942, + "step": 15340 + }, + { + "epoch": 0.0326131007994459, + "grad_norm": 0.37271857261657715, + "learning_rate": 0.00019991828228727124, + "loss": 1.2453, + "step": 15350 + }, + { + "epoch": 0.03263434711918495, + "grad_norm": 0.3860233724117279, + "learning_rate": 0.00019991800676379874, + "loss": 1.2762, + "step": 15360 + }, + { + "epoch": 0.032655593438924, + "grad_norm": 0.3879421353340149, + "learning_rate": 0.0001999177307768139, + "loss": 1.2793, + "step": 15370 + }, + { + "epoch": 0.032676839758663055, + "grad_norm": 0.5703461766242981, + "learning_rate": 0.00019991745432631794, + "loss": 1.2759, + "step": 15380 + }, + { + "epoch": 0.032698086078402105, + "grad_norm": 0.5119121074676514, + "learning_rate": 0.0001999171774123122, + "loss": 1.2979, + "step": 15390 + }, + { + "epoch": 0.03271933239814116, + "grad_norm": 0.446224570274353, + "learning_rate": 0.0001999169000347979, + "loss": 1.2802, + "step": 15400 + }, + { + "epoch": 0.03274057871788021, + "grad_norm": 0.3514077961444855, + "learning_rate": 0.0001999166221937764, + "loss": 1.2669, + "step": 15410 + }, + { + "epoch": 0.03276182503761926, + "grad_norm": 0.42321348190307617, + "learning_rate": 0.00019991634388924888, + "loss": 1.3001, + "step": 15420 + }, + { + "epoch": 0.03278307135735832, + "grad_norm": 0.37001708149909973, + "learning_rate": 0.00019991606512121676, + "loss": 1.2856, + "step": 15430 + }, + { + "epoch": 0.03280431767709737, + "grad_norm": 0.4480491280555725, + "learning_rate": 0.00019991578588968127, + "loss": 1.253, + "step": 15440 + }, + { + "epoch": 0.03282556399683642, + "grad_norm": 0.36783939599990845, + "learning_rate": 0.0001999155061946437, + "loss": 1.2656, + "step": 15450 + }, + { + "epoch": 0.03284681031657548, + "grad_norm": 0.48061245679855347, + "learning_rate": 0.00019991522603610533, + "loss": 1.2857, + "step": 15460 + }, + { + "epoch": 0.03286805663631453, + "grad_norm": 0.3011791706085205, + "learning_rate": 0.0001999149454140675, + "loss": 1.2691, + "step": 15470 + }, + { + "epoch": 0.03288930295605358, + "grad_norm": 0.3174755871295929, + "learning_rate": 0.0001999146643285315, + "loss": 1.2589, + "step": 15480 + }, + { + "epoch": 0.032910549275792636, + "grad_norm": 0.346529096364975, + "learning_rate": 0.00019991438277949862, + "loss": 1.25, + "step": 15490 + }, + { + "epoch": 0.032931795595531686, + "grad_norm": 0.4361819326877594, + "learning_rate": 0.00019991410076697022, + "loss": 1.2603, + "step": 15500 + }, + { + "epoch": 0.03295304191527074, + "grad_norm": 0.35652315616607666, + "learning_rate": 0.0001999138182909475, + "loss": 1.2994, + "step": 15510 + }, + { + "epoch": 0.032974288235009794, + "grad_norm": 0.3608313500881195, + "learning_rate": 0.00019991353535143188, + "loss": 1.2757, + "step": 15520 + }, + { + "epoch": 0.032995534554748844, + "grad_norm": 0.4360053241252899, + "learning_rate": 0.00019991325194842463, + "loss": 1.2868, + "step": 15530 + }, + { + "epoch": 0.033016780874487894, + "grad_norm": 0.34269797801971436, + "learning_rate": 0.00019991296808192702, + "loss": 1.2868, + "step": 15540 + }, + { + "epoch": 0.03303802719422695, + "grad_norm": 0.3669768273830414, + "learning_rate": 0.00019991268375194046, + "loss": 1.2458, + "step": 15550 + }, + { + "epoch": 0.033059273513966, + "grad_norm": 0.4512222409248352, + "learning_rate": 0.00019991239895846618, + "loss": 1.3103, + "step": 15560 + }, + { + "epoch": 0.03308051983370505, + "grad_norm": 0.3498994708061218, + "learning_rate": 0.00019991211370150555, + "loss": 1.2541, + "step": 15570 + }, + { + "epoch": 0.03310176615344411, + "grad_norm": 0.4457986056804657, + "learning_rate": 0.0001999118279810599, + "loss": 1.3283, + "step": 15580 + }, + { + "epoch": 0.03312301247318316, + "grad_norm": 0.36448872089385986, + "learning_rate": 0.00019991154179713053, + "loss": 1.2515, + "step": 15590 + }, + { + "epoch": 0.03314425879292222, + "grad_norm": 0.3735416531562805, + "learning_rate": 0.00019991125514971878, + "loss": 1.2622, + "step": 15600 + }, + { + "epoch": 0.03316550511266127, + "grad_norm": 0.34608855843544006, + "learning_rate": 0.00019991096803882597, + "loss": 1.2887, + "step": 15610 + }, + { + "epoch": 0.03318675143240032, + "grad_norm": 0.3681546449661255, + "learning_rate": 0.00019991068046445342, + "loss": 1.2686, + "step": 15620 + }, + { + "epoch": 0.033207997752139375, + "grad_norm": 0.4820307493209839, + "learning_rate": 0.0001999103924266025, + "loss": 1.3111, + "step": 15630 + }, + { + "epoch": 0.033229244071878425, + "grad_norm": 0.3969350755214691, + "learning_rate": 0.00019991010392527453, + "loss": 1.2613, + "step": 15640 + }, + { + "epoch": 0.033250490391617475, + "grad_norm": 0.3330456614494324, + "learning_rate": 0.00019990981496047084, + "loss": 1.2477, + "step": 15650 + }, + { + "epoch": 0.03327173671135653, + "grad_norm": 0.36321520805358887, + "learning_rate": 0.00019990952553219278, + "loss": 1.2924, + "step": 15660 + }, + { + "epoch": 0.03329298303109558, + "grad_norm": 0.6040831208229065, + "learning_rate": 0.0001999092356404417, + "loss": 1.2293, + "step": 15670 + }, + { + "epoch": 0.03331422935083463, + "grad_norm": 0.3242480158805847, + "learning_rate": 0.0001999089452852189, + "loss": 1.2776, + "step": 15680 + }, + { + "epoch": 0.03333547567057369, + "grad_norm": 0.5588496923446655, + "learning_rate": 0.0001999086544665258, + "loss": 1.2763, + "step": 15690 + }, + { + "epoch": 0.03335672199031274, + "grad_norm": 0.5367519855499268, + "learning_rate": 0.00019990836318436368, + "loss": 1.2499, + "step": 15700 + }, + { + "epoch": 0.03337796831005179, + "grad_norm": 0.31062445044517517, + "learning_rate": 0.00019990807143873394, + "loss": 1.2762, + "step": 15710 + }, + { + "epoch": 0.03339921462979085, + "grad_norm": 0.3614952862262726, + "learning_rate": 0.0001999077792296379, + "loss": 1.3106, + "step": 15720 + }, + { + "epoch": 0.0334204609495299, + "grad_norm": 0.3289617896080017, + "learning_rate": 0.00019990748655707695, + "loss": 1.2794, + "step": 15730 + }, + { + "epoch": 0.03344170726926895, + "grad_norm": 0.3650701642036438, + "learning_rate": 0.0001999071934210524, + "loss": 1.2665, + "step": 15740 + }, + { + "epoch": 0.033462953589008006, + "grad_norm": 0.37392622232437134, + "learning_rate": 0.00019990689982156567, + "loss": 1.2554, + "step": 15750 + }, + { + "epoch": 0.033484199908747056, + "grad_norm": 0.5381394624710083, + "learning_rate": 0.0001999066057586181, + "loss": 1.2711, + "step": 15760 + }, + { + "epoch": 0.033505446228486106, + "grad_norm": 0.41247138381004333, + "learning_rate": 0.000199906311232211, + "loss": 1.3005, + "step": 15770 + }, + { + "epoch": 0.033526692548225163, + "grad_norm": 0.41210880875587463, + "learning_rate": 0.00019990601624234579, + "loss": 1.2989, + "step": 15780 + }, + { + "epoch": 0.033547938867964214, + "grad_norm": 0.32545506954193115, + "learning_rate": 0.00019990572078902382, + "loss": 1.2237, + "step": 15790 + }, + { + "epoch": 0.03356918518770327, + "grad_norm": 0.3608953356742859, + "learning_rate": 0.0001999054248722465, + "loss": 1.2753, + "step": 15800 + }, + { + "epoch": 0.03359043150744232, + "grad_norm": 0.36564913392066956, + "learning_rate": 0.00019990512849201516, + "loss": 1.2568, + "step": 15810 + }, + { + "epoch": 0.03361167782718137, + "grad_norm": 0.34669533371925354, + "learning_rate": 0.0001999048316483312, + "loss": 1.2471, + "step": 15820 + }, + { + "epoch": 0.03363292414692043, + "grad_norm": 0.4360162019729614, + "learning_rate": 0.00019990453434119595, + "loss": 1.2894, + "step": 15830 + }, + { + "epoch": 0.03365417046665948, + "grad_norm": 0.5404664874076843, + "learning_rate": 0.00019990423657061083, + "loss": 1.2853, + "step": 15840 + }, + { + "epoch": 0.03367541678639853, + "grad_norm": 0.4193219244480133, + "learning_rate": 0.00019990393833657722, + "loss": 1.2776, + "step": 15850 + }, + { + "epoch": 0.033696663106137587, + "grad_norm": 0.44616463780403137, + "learning_rate": 0.00019990363963909652, + "loss": 1.2691, + "step": 15860 + }, + { + "epoch": 0.03371790942587664, + "grad_norm": 0.33071884512901306, + "learning_rate": 0.00019990334047817004, + "loss": 1.2894, + "step": 15870 + }, + { + "epoch": 0.03373915574561569, + "grad_norm": 0.3236343264579773, + "learning_rate": 0.00019990304085379927, + "loss": 1.2555, + "step": 15880 + }, + { + "epoch": 0.033760402065354744, + "grad_norm": 0.37276691198349, + "learning_rate": 0.00019990274076598556, + "loss": 1.2719, + "step": 15890 + }, + { + "epoch": 0.033781648385093795, + "grad_norm": 0.33365851640701294, + "learning_rate": 0.00019990244021473027, + "loss": 1.2685, + "step": 15900 + }, + { + "epoch": 0.033802894704832845, + "grad_norm": 0.5004066228866577, + "learning_rate": 0.0001999021392000348, + "loss": 1.2469, + "step": 15910 + }, + { + "epoch": 0.0338241410245719, + "grad_norm": 0.36595678329467773, + "learning_rate": 0.0001999018377219006, + "loss": 1.2931, + "step": 15920 + }, + { + "epoch": 0.03384538734431095, + "grad_norm": 0.5464569926261902, + "learning_rate": 0.000199901535780329, + "loss": 1.2737, + "step": 15930 + }, + { + "epoch": 0.03386663366405, + "grad_norm": 0.40404245257377625, + "learning_rate": 0.00019990123337532148, + "loss": 1.2639, + "step": 15940 + }, + { + "epoch": 0.03388787998378906, + "grad_norm": 0.44490644335746765, + "learning_rate": 0.00019990093050687935, + "loss": 1.268, + "step": 15950 + }, + { + "epoch": 0.03390912630352811, + "grad_norm": 0.3516036570072174, + "learning_rate": 0.00019990062717500408, + "loss": 1.2672, + "step": 15960 + }, + { + "epoch": 0.03393037262326716, + "grad_norm": 0.4503597617149353, + "learning_rate": 0.00019990032337969705, + "loss": 1.2744, + "step": 15970 + }, + { + "epoch": 0.03395161894300622, + "grad_norm": 0.46181735396385193, + "learning_rate": 0.00019990001912095968, + "loss": 1.2611, + "step": 15980 + }, + { + "epoch": 0.03397286526274527, + "grad_norm": 0.36851966381073, + "learning_rate": 0.00019989971439879337, + "loss": 1.2684, + "step": 15990 + }, + { + "epoch": 0.03399411158248432, + "grad_norm": 0.5014030337333679, + "learning_rate": 0.00019989940921319956, + "loss": 1.2391, + "step": 16000 + }, + { + "epoch": 0.034015357902223375, + "grad_norm": 0.4903841018676758, + "learning_rate": 0.00019989910356417966, + "loss": 1.2597, + "step": 16010 + }, + { + "epoch": 0.034036604221962426, + "grad_norm": 0.3653618097305298, + "learning_rate": 0.00019989879745173505, + "loss": 1.285, + "step": 16020 + }, + { + "epoch": 0.03405785054170148, + "grad_norm": 0.639984667301178, + "learning_rate": 0.0001998984908758672, + "loss": 1.2641, + "step": 16030 + }, + { + "epoch": 0.03407909686144053, + "grad_norm": 0.3656117916107178, + "learning_rate": 0.00019989818383657747, + "loss": 1.2321, + "step": 16040 + }, + { + "epoch": 0.03410034318117958, + "grad_norm": 0.355206161737442, + "learning_rate": 0.00019989787633386733, + "loss": 1.239, + "step": 16050 + }, + { + "epoch": 0.03412158950091864, + "grad_norm": 0.34968301653862, + "learning_rate": 0.00019989756836773822, + "loss": 1.2447, + "step": 16060 + }, + { + "epoch": 0.03414283582065769, + "grad_norm": 0.44445428252220154, + "learning_rate": 0.00019989725993819154, + "loss": 1.2608, + "step": 16070 + }, + { + "epoch": 0.03416408214039674, + "grad_norm": 0.47003307938575745, + "learning_rate": 0.00019989695104522873, + "loss": 1.2858, + "step": 16080 + }, + { + "epoch": 0.0341853284601358, + "grad_norm": 0.3413441777229309, + "learning_rate": 0.00019989664168885122, + "loss": 1.2464, + "step": 16090 + }, + { + "epoch": 0.03420657477987485, + "grad_norm": 0.3884921967983246, + "learning_rate": 0.00019989633186906044, + "loss": 1.2497, + "step": 16100 + }, + { + "epoch": 0.0342278210996139, + "grad_norm": 0.36734122037887573, + "learning_rate": 0.00019989602158585786, + "loss": 1.3099, + "step": 16110 + }, + { + "epoch": 0.034249067419352956, + "grad_norm": 0.3316885530948639, + "learning_rate": 0.00019989571083924487, + "loss": 1.2458, + "step": 16120 + }, + { + "epoch": 0.034270313739092007, + "grad_norm": 0.31272411346435547, + "learning_rate": 0.00019989539962922295, + "loss": 1.2591, + "step": 16130 + }, + { + "epoch": 0.03429156005883106, + "grad_norm": 0.3143038749694824, + "learning_rate": 0.0001998950879557935, + "loss": 1.2373, + "step": 16140 + }, + { + "epoch": 0.034312806378570114, + "grad_norm": 0.34355428814888, + "learning_rate": 0.000199894775818958, + "loss": 1.2696, + "step": 16150 + }, + { + "epoch": 0.034334052698309164, + "grad_norm": 0.39916226267814636, + "learning_rate": 0.0001998944632187179, + "loss": 1.2788, + "step": 16160 + }, + { + "epoch": 0.034355299018048215, + "grad_norm": 0.3444354236125946, + "learning_rate": 0.00019989415015507467, + "loss": 1.2721, + "step": 16170 + }, + { + "epoch": 0.03437654533778727, + "grad_norm": 0.45080286264419556, + "learning_rate": 0.00019989383662802972, + "loss": 1.2505, + "step": 16180 + }, + { + "epoch": 0.03439779165752632, + "grad_norm": 0.4975142180919647, + "learning_rate": 0.0001998935226375845, + "loss": 1.2875, + "step": 16190 + }, + { + "epoch": 0.03441903797726537, + "grad_norm": 0.601543664932251, + "learning_rate": 0.0001998932081837405, + "loss": 1.2507, + "step": 16200 + }, + { + "epoch": 0.03444028429700443, + "grad_norm": 0.39170190691947937, + "learning_rate": 0.00019989289326649914, + "loss": 1.2518, + "step": 16210 + }, + { + "epoch": 0.03446153061674348, + "grad_norm": 0.7249555587768555, + "learning_rate": 0.00019989257788586193, + "loss": 1.2449, + "step": 16220 + }, + { + "epoch": 0.03448277693648254, + "grad_norm": 0.3402309715747833, + "learning_rate": 0.00019989226204183028, + "loss": 1.2793, + "step": 16230 + }, + { + "epoch": 0.03450402325622159, + "grad_norm": 0.4412461221218109, + "learning_rate": 0.00019989194573440572, + "loss": 1.2873, + "step": 16240 + }, + { + "epoch": 0.03452526957596064, + "grad_norm": 0.46606695652008057, + "learning_rate": 0.00019989162896358965, + "loss": 1.2897, + "step": 16250 + }, + { + "epoch": 0.034546515895699695, + "grad_norm": 0.38720032572746277, + "learning_rate": 0.0001998913117293836, + "loss": 1.2666, + "step": 16260 + }, + { + "epoch": 0.034567762215438745, + "grad_norm": 0.31600186228752136, + "learning_rate": 0.00019989099403178902, + "loss": 1.2383, + "step": 16270 + }, + { + "epoch": 0.034589008535177795, + "grad_norm": 0.639407217502594, + "learning_rate": 0.00019989067587080733, + "loss": 1.236, + "step": 16280 + }, + { + "epoch": 0.03461025485491685, + "grad_norm": 0.6029006838798523, + "learning_rate": 0.00019989035724644007, + "loss": 1.2242, + "step": 16290 + }, + { + "epoch": 0.0346315011746559, + "grad_norm": 0.36438873410224915, + "learning_rate": 0.0001998900381586887, + "loss": 1.2637, + "step": 16300 + }, + { + "epoch": 0.03465274749439495, + "grad_norm": 0.576244592666626, + "learning_rate": 0.0001998897186075547, + "loss": 1.3119, + "step": 16310 + }, + { + "epoch": 0.03467399381413401, + "grad_norm": 0.45938974618911743, + "learning_rate": 0.00019988939859303957, + "loss": 1.2629, + "step": 16320 + }, + { + "epoch": 0.03469524013387306, + "grad_norm": 0.4667823910713196, + "learning_rate": 0.00019988907811514475, + "loss": 1.2572, + "step": 16330 + }, + { + "epoch": 0.03471648645361211, + "grad_norm": 0.42444729804992676, + "learning_rate": 0.00019988875717387176, + "loss": 1.252, + "step": 16340 + }, + { + "epoch": 0.03473773277335117, + "grad_norm": 0.3257114887237549, + "learning_rate": 0.00019988843576922212, + "loss": 1.2469, + "step": 16350 + }, + { + "epoch": 0.03475897909309022, + "grad_norm": 0.3547564148902893, + "learning_rate": 0.00019988811390119724, + "loss": 1.2358, + "step": 16360 + }, + { + "epoch": 0.03478022541282927, + "grad_norm": 0.48575201630592346, + "learning_rate": 0.00019988779156979866, + "loss": 1.2938, + "step": 16370 + }, + { + "epoch": 0.034801471732568326, + "grad_norm": 0.43686482310295105, + "learning_rate": 0.00019988746877502787, + "loss": 1.262, + "step": 16380 + }, + { + "epoch": 0.034822718052307376, + "grad_norm": 0.3619142174720764, + "learning_rate": 0.00019988714551688637, + "loss": 1.2615, + "step": 16390 + }, + { + "epoch": 0.034843964372046426, + "grad_norm": 0.42583733797073364, + "learning_rate": 0.00019988682179537568, + "loss": 1.2461, + "step": 16400 + }, + { + "epoch": 0.034865210691785484, + "grad_norm": 0.4063641428947449, + "learning_rate": 0.00019988649761049727, + "loss": 1.2803, + "step": 16410 + }, + { + "epoch": 0.034886457011524534, + "grad_norm": 0.3280755877494812, + "learning_rate": 0.00019988617296225265, + "loss": 1.2472, + "step": 16420 + }, + { + "epoch": 0.034907703331263584, + "grad_norm": 0.3474607765674591, + "learning_rate": 0.00019988584785064333, + "loss": 1.2655, + "step": 16430 + }, + { + "epoch": 0.03492894965100264, + "grad_norm": 0.41693395376205444, + "learning_rate": 0.0001998855222756708, + "loss": 1.2581, + "step": 16440 + }, + { + "epoch": 0.03495019597074169, + "grad_norm": 0.364944189786911, + "learning_rate": 0.00019988519623733662, + "loss": 1.2778, + "step": 16450 + }, + { + "epoch": 0.03497144229048075, + "grad_norm": 0.37515226006507874, + "learning_rate": 0.00019988486973564226, + "loss": 1.2777, + "step": 16460 + }, + { + "epoch": 0.0349926886102198, + "grad_norm": 0.3630068302154541, + "learning_rate": 0.00019988454277058924, + "loss": 1.2687, + "step": 16470 + }, + { + "epoch": 0.03501393492995885, + "grad_norm": 0.40400242805480957, + "learning_rate": 0.0001998842153421791, + "loss": 1.2531, + "step": 16480 + }, + { + "epoch": 0.03503518124969791, + "grad_norm": 0.3455105125904083, + "learning_rate": 0.00019988388745041332, + "loss": 1.2956, + "step": 16490 + }, + { + "epoch": 0.03505642756943696, + "grad_norm": 0.35955166816711426, + "learning_rate": 0.00019988355909529343, + "loss": 1.2607, + "step": 16500 + }, + { + "epoch": 0.03507767388917601, + "grad_norm": 0.40339526534080505, + "learning_rate": 0.000199883230276821, + "loss": 1.2688, + "step": 16510 + }, + { + "epoch": 0.035098920208915065, + "grad_norm": 0.39338403940200806, + "learning_rate": 0.00019988290099499752, + "loss": 1.2635, + "step": 16520 + }, + { + "epoch": 0.035120166528654115, + "grad_norm": 0.42484381794929504, + "learning_rate": 0.0001998825712498245, + "loss": 1.2586, + "step": 16530 + }, + { + "epoch": 0.035141412848393165, + "grad_norm": 0.3628781735897064, + "learning_rate": 0.00019988224104130348, + "loss": 1.2831, + "step": 16540 + }, + { + "epoch": 0.03516265916813222, + "grad_norm": 0.34263336658477783, + "learning_rate": 0.000199881910369436, + "loss": 1.2731, + "step": 16550 + }, + { + "epoch": 0.03518390548787127, + "grad_norm": 0.3525870144367218, + "learning_rate": 0.00019988157923422363, + "loss": 1.2427, + "step": 16560 + }, + { + "epoch": 0.03520515180761032, + "grad_norm": 0.359487384557724, + "learning_rate": 0.00019988124763566784, + "loss": 1.2441, + "step": 16570 + }, + { + "epoch": 0.03522639812734938, + "grad_norm": 0.3799266517162323, + "learning_rate": 0.0001998809155737702, + "loss": 1.273, + "step": 16580 + }, + { + "epoch": 0.03524764444708843, + "grad_norm": 0.3734087646007538, + "learning_rate": 0.00019988058304853227, + "loss": 1.2829, + "step": 16590 + }, + { + "epoch": 0.03526889076682748, + "grad_norm": 0.6886231303215027, + "learning_rate": 0.00019988025005995556, + "loss": 1.2368, + "step": 16600 + }, + { + "epoch": 0.03529013708656654, + "grad_norm": 0.40943530201911926, + "learning_rate": 0.0001998799166080416, + "loss": 1.2749, + "step": 16610 + }, + { + "epoch": 0.03531138340630559, + "grad_norm": 0.3531206548213959, + "learning_rate": 0.000199879582692792, + "loss": 1.2786, + "step": 16620 + }, + { + "epoch": 0.03533262972604464, + "grad_norm": 0.3462654948234558, + "learning_rate": 0.00019987924831420825, + "loss": 1.27, + "step": 16630 + }, + { + "epoch": 0.035353876045783696, + "grad_norm": 0.3319712281227112, + "learning_rate": 0.00019987891347229196, + "loss": 1.3013, + "step": 16640 + }, + { + "epoch": 0.035375122365522746, + "grad_norm": 0.3567074239253998, + "learning_rate": 0.0001998785781670446, + "loss": 1.2456, + "step": 16650 + }, + { + "epoch": 0.0353963686852618, + "grad_norm": 0.35557320713996887, + "learning_rate": 0.0001998782423984678, + "loss": 1.2515, + "step": 16660 + }, + { + "epoch": 0.03541761500500085, + "grad_norm": 0.4116458594799042, + "learning_rate": 0.00019987790616656306, + "loss": 1.2621, + "step": 16670 + }, + { + "epoch": 0.035438861324739904, + "grad_norm": 0.519535481929779, + "learning_rate": 0.00019987756947133199, + "loss": 1.2603, + "step": 16680 + }, + { + "epoch": 0.03546010764447896, + "grad_norm": 0.6644023060798645, + "learning_rate": 0.00019987723231277614, + "loss": 1.2749, + "step": 16690 + }, + { + "epoch": 0.03548135396421801, + "grad_norm": 0.4177393317222595, + "learning_rate": 0.00019987689469089706, + "loss": 1.2716, + "step": 16700 + }, + { + "epoch": 0.03550260028395706, + "grad_norm": 0.5882167220115662, + "learning_rate": 0.00019987655660569628, + "loss": 1.2486, + "step": 16710 + }, + { + "epoch": 0.03552384660369612, + "grad_norm": 0.3155616819858551, + "learning_rate": 0.00019987621805717546, + "loss": 1.2426, + "step": 16720 + }, + { + "epoch": 0.03554509292343517, + "grad_norm": 0.3898048996925354, + "learning_rate": 0.0001998758790453361, + "loss": 1.2839, + "step": 16730 + }, + { + "epoch": 0.03556633924317422, + "grad_norm": 0.4172990620136261, + "learning_rate": 0.0001998755395701798, + "loss": 1.2696, + "step": 16740 + }, + { + "epoch": 0.035587585562913276, + "grad_norm": 0.3939424753189087, + "learning_rate": 0.00019987519963170813, + "loss": 1.2836, + "step": 16750 + }, + { + "epoch": 0.03560883188265233, + "grad_norm": 0.38518226146698, + "learning_rate": 0.00019987485922992265, + "loss": 1.2698, + "step": 16760 + }, + { + "epoch": 0.03563007820239138, + "grad_norm": 0.3283689022064209, + "learning_rate": 0.00019987451836482495, + "loss": 1.2496, + "step": 16770 + }, + { + "epoch": 0.035651324522130434, + "grad_norm": 0.3631674349308014, + "learning_rate": 0.00019987417703641665, + "loss": 1.2531, + "step": 16780 + }, + { + "epoch": 0.035672570841869485, + "grad_norm": 0.34036996960639954, + "learning_rate": 0.00019987383524469927, + "loss": 1.2798, + "step": 16790 + }, + { + "epoch": 0.035693817161608535, + "grad_norm": 0.36152225732803345, + "learning_rate": 0.00019987349298967444, + "loss": 1.2216, + "step": 16800 + }, + { + "epoch": 0.03571506348134759, + "grad_norm": 0.4543357193470001, + "learning_rate": 0.0001998731502713437, + "loss": 1.2942, + "step": 16810 + }, + { + "epoch": 0.03573630980108664, + "grad_norm": 0.4078833758831024, + "learning_rate": 0.0001998728070897087, + "loss": 1.248, + "step": 16820 + }, + { + "epoch": 0.03575755612082569, + "grad_norm": 0.5505214333534241, + "learning_rate": 0.000199872463444771, + "loss": 1.2312, + "step": 16830 + }, + { + "epoch": 0.03577880244056475, + "grad_norm": 0.331667959690094, + "learning_rate": 0.0001998721193365322, + "loss": 1.2334, + "step": 16840 + }, + { + "epoch": 0.0358000487603038, + "grad_norm": 0.3727641999721527, + "learning_rate": 0.00019987177476499387, + "loss": 1.2925, + "step": 16850 + }, + { + "epoch": 0.03582129508004286, + "grad_norm": 0.42444875836372375, + "learning_rate": 0.00019987142973015767, + "loss": 1.2882, + "step": 16860 + }, + { + "epoch": 0.03584254139978191, + "grad_norm": 0.3772609531879425, + "learning_rate": 0.00019987108423202515, + "loss": 1.2242, + "step": 16870 + }, + { + "epoch": 0.03586378771952096, + "grad_norm": 0.35507452487945557, + "learning_rate": 0.0001998707382705979, + "loss": 1.2726, + "step": 16880 + }, + { + "epoch": 0.035885034039260015, + "grad_norm": 0.349435955286026, + "learning_rate": 0.00019987039184587757, + "loss": 1.2525, + "step": 16890 + }, + { + "epoch": 0.035906280358999065, + "grad_norm": 0.39889252185821533, + "learning_rate": 0.00019987004495786574, + "loss": 1.3029, + "step": 16900 + }, + { + "epoch": 0.035927526678738116, + "grad_norm": 0.3438369333744049, + "learning_rate": 0.00019986969760656405, + "loss": 1.2796, + "step": 16910 + }, + { + "epoch": 0.03594877299847717, + "grad_norm": 0.36799871921539307, + "learning_rate": 0.00019986934979197407, + "loss": 1.2404, + "step": 16920 + }, + { + "epoch": 0.03597001931821622, + "grad_norm": 0.5220965147018433, + "learning_rate": 0.00019986900151409742, + "loss": 1.3139, + "step": 16930 + }, + { + "epoch": 0.03599126563795527, + "grad_norm": 0.37973538041114807, + "learning_rate": 0.00019986865277293574, + "loss": 1.2372, + "step": 16940 + }, + { + "epoch": 0.03601251195769433, + "grad_norm": 0.42901262640953064, + "learning_rate": 0.00019986830356849065, + "loss": 1.2467, + "step": 16950 + }, + { + "epoch": 0.03603375827743338, + "grad_norm": 0.3212549388408661, + "learning_rate": 0.00019986795390076375, + "loss": 1.2588, + "step": 16960 + }, + { + "epoch": 0.03605500459717243, + "grad_norm": 0.42074304819107056, + "learning_rate": 0.00019986760376975663, + "loss": 1.2684, + "step": 16970 + }, + { + "epoch": 0.03607625091691149, + "grad_norm": 0.34623008966445923, + "learning_rate": 0.000199867253175471, + "loss": 1.2553, + "step": 16980 + }, + { + "epoch": 0.03609749723665054, + "grad_norm": 0.3473183512687683, + "learning_rate": 0.00019986690211790838, + "loss": 1.286, + "step": 16990 + }, + { + "epoch": 0.03611874355638959, + "grad_norm": 0.3570563495159149, + "learning_rate": 0.00019986655059707053, + "loss": 1.2699, + "step": 17000 + }, + { + "epoch": 0.036139989876128646, + "grad_norm": 0.3828585743904114, + "learning_rate": 0.00019986619861295894, + "loss": 1.2729, + "step": 17010 + }, + { + "epoch": 0.036161236195867696, + "grad_norm": 0.35039350390434265, + "learning_rate": 0.00019986584616557535, + "loss": 1.2088, + "step": 17020 + }, + { + "epoch": 0.03618248251560675, + "grad_norm": 0.4737040102481842, + "learning_rate": 0.00019986549325492136, + "loss": 1.2251, + "step": 17030 + }, + { + "epoch": 0.036203728835345804, + "grad_norm": 0.4077366292476654, + "learning_rate": 0.00019986513988099857, + "loss": 1.258, + "step": 17040 + }, + { + "epoch": 0.036224975155084854, + "grad_norm": 0.3453763723373413, + "learning_rate": 0.00019986478604380867, + "loss": 1.2421, + "step": 17050 + }, + { + "epoch": 0.036246221474823904, + "grad_norm": 0.4686037600040436, + "learning_rate": 0.0001998644317433533, + "loss": 1.2911, + "step": 17060 + }, + { + "epoch": 0.03626746779456296, + "grad_norm": 0.6384305357933044, + "learning_rate": 0.00019986407697963405, + "loss": 1.2585, + "step": 17070 + }, + { + "epoch": 0.03628871411430201, + "grad_norm": 0.5802843570709229, + "learning_rate": 0.00019986372175265262, + "loss": 1.2756, + "step": 17080 + }, + { + "epoch": 0.03630996043404107, + "grad_norm": 0.39798417687416077, + "learning_rate": 0.00019986336606241063, + "loss": 1.2486, + "step": 17090 + }, + { + "epoch": 0.03633120675378012, + "grad_norm": 0.41914308071136475, + "learning_rate": 0.00019986300990890973, + "loss": 1.2435, + "step": 17100 + }, + { + "epoch": 0.03635245307351917, + "grad_norm": 0.41906553506851196, + "learning_rate": 0.0001998626532921516, + "loss": 1.2599, + "step": 17110 + }, + { + "epoch": 0.03637369939325823, + "grad_norm": 0.3758905529975891, + "learning_rate": 0.00019986229621213789, + "loss": 1.265, + "step": 17120 + }, + { + "epoch": 0.03639494571299728, + "grad_norm": 0.34300652146339417, + "learning_rate": 0.00019986193866887021, + "loss": 1.2536, + "step": 17130 + }, + { + "epoch": 0.03641619203273633, + "grad_norm": 0.4092984199523926, + "learning_rate": 0.00019986158066235028, + "loss": 1.2544, + "step": 17140 + }, + { + "epoch": 0.036437438352475385, + "grad_norm": 0.3271426260471344, + "learning_rate": 0.00019986122219257973, + "loss": 1.2397, + "step": 17150 + }, + { + "epoch": 0.036458684672214435, + "grad_norm": 0.43366408348083496, + "learning_rate": 0.00019986086325956022, + "loss": 1.3019, + "step": 17160 + }, + { + "epoch": 0.036479930991953485, + "grad_norm": 0.33925899863243103, + "learning_rate": 0.0001998605038632934, + "loss": 1.2511, + "step": 17170 + }, + { + "epoch": 0.03650117731169254, + "grad_norm": 0.40767717361450195, + "learning_rate": 0.00019986014400378096, + "loss": 1.2612, + "step": 17180 + }, + { + "epoch": 0.03652242363143159, + "grad_norm": 0.4874541759490967, + "learning_rate": 0.00019985978368102457, + "loss": 1.2606, + "step": 17190 + }, + { + "epoch": 0.03654366995117064, + "grad_norm": 0.3557478189468384, + "learning_rate": 0.00019985942289502592, + "loss": 1.2614, + "step": 17200 + }, + { + "epoch": 0.0365649162709097, + "grad_norm": 0.5957018136978149, + "learning_rate": 0.00019985906164578666, + "loss": 1.2962, + "step": 17210 + }, + { + "epoch": 0.03658616259064875, + "grad_norm": 0.3868601322174072, + "learning_rate": 0.00019985869993330847, + "loss": 1.2468, + "step": 17220 + }, + { + "epoch": 0.0366074089103878, + "grad_norm": 0.32585498690605164, + "learning_rate": 0.000199858337757593, + "loss": 1.262, + "step": 17230 + }, + { + "epoch": 0.03662865523012686, + "grad_norm": 0.3208174407482147, + "learning_rate": 0.00019985797511864198, + "loss": 1.2532, + "step": 17240 + }, + { + "epoch": 0.03664990154986591, + "grad_norm": 0.3902543783187866, + "learning_rate": 0.00019985761201645703, + "loss": 1.2558, + "step": 17250 + }, + { + "epoch": 0.03667114786960496, + "grad_norm": 0.3947533965110779, + "learning_rate": 0.00019985724845103991, + "loss": 1.2763, + "step": 17260 + }, + { + "epoch": 0.036692394189344016, + "grad_norm": 0.47187817096710205, + "learning_rate": 0.00019985688442239225, + "loss": 1.2618, + "step": 17270 + }, + { + "epoch": 0.036713640509083066, + "grad_norm": 0.39405810832977295, + "learning_rate": 0.00019985651993051576, + "loss": 1.27, + "step": 17280 + }, + { + "epoch": 0.03673488682882212, + "grad_norm": 0.3721141517162323, + "learning_rate": 0.00019985615497541213, + "loss": 1.1915, + "step": 17290 + }, + { + "epoch": 0.036756133148561174, + "grad_norm": 0.45418912172317505, + "learning_rate": 0.00019985578955708307, + "loss": 1.2347, + "step": 17300 + }, + { + "epoch": 0.036777379468300224, + "grad_norm": 0.4118460416793823, + "learning_rate": 0.00019985542367553024, + "loss": 1.2604, + "step": 17310 + }, + { + "epoch": 0.03679862578803928, + "grad_norm": 0.43347465991973877, + "learning_rate": 0.00019985505733075533, + "loss": 1.2671, + "step": 17320 + }, + { + "epoch": 0.03681987210777833, + "grad_norm": 0.3458508849143982, + "learning_rate": 0.0001998546905227601, + "loss": 1.2759, + "step": 17330 + }, + { + "epoch": 0.03684111842751738, + "grad_norm": 0.32756415009498596, + "learning_rate": 0.0001998543232515462, + "loss": 1.2889, + "step": 17340 + }, + { + "epoch": 0.03686236474725644, + "grad_norm": 0.3567040264606476, + "learning_rate": 0.00019985395551711535, + "loss": 1.2636, + "step": 17350 + }, + { + "epoch": 0.03688361106699549, + "grad_norm": 0.44192618131637573, + "learning_rate": 0.00019985358731946926, + "loss": 1.2322, + "step": 17360 + }, + { + "epoch": 0.03690485738673454, + "grad_norm": 0.3638072907924652, + "learning_rate": 0.0001998532186586096, + "loss": 1.274, + "step": 17370 + }, + { + "epoch": 0.0369261037064736, + "grad_norm": 0.36857324838638306, + "learning_rate": 0.00019985284953453816, + "loss": 1.2748, + "step": 17380 + }, + { + "epoch": 0.03694735002621265, + "grad_norm": 0.32370829582214355, + "learning_rate": 0.0001998524799472566, + "loss": 1.2628, + "step": 17390 + }, + { + "epoch": 0.0369685963459517, + "grad_norm": 0.4592704772949219, + "learning_rate": 0.0001998521098967666, + "loss": 1.2478, + "step": 17400 + }, + { + "epoch": 0.036989842665690754, + "grad_norm": 0.3741736114025116, + "learning_rate": 0.00019985173938306996, + "loss": 1.2846, + "step": 17410 + }, + { + "epoch": 0.037011088985429805, + "grad_norm": 0.3615925908088684, + "learning_rate": 0.00019985136840616833, + "loss": 1.2489, + "step": 17420 + }, + { + "epoch": 0.037032335305168855, + "grad_norm": 0.3745890259742737, + "learning_rate": 0.00019985099696606348, + "loss": 1.224, + "step": 17430 + }, + { + "epoch": 0.03705358162490791, + "grad_norm": 0.48353955149650574, + "learning_rate": 0.00019985062506275707, + "loss": 1.2677, + "step": 17440 + }, + { + "epoch": 0.03707482794464696, + "grad_norm": 0.38267067074775696, + "learning_rate": 0.0001998502526962509, + "loss": 1.2997, + "step": 17450 + }, + { + "epoch": 0.03709607426438601, + "grad_norm": 0.44715026021003723, + "learning_rate": 0.00019984987986654664, + "loss": 1.2242, + "step": 17460 + }, + { + "epoch": 0.03711732058412507, + "grad_norm": 0.48657330870628357, + "learning_rate": 0.00019984950657364606, + "loss": 1.2572, + "step": 17470 + }, + { + "epoch": 0.03713856690386412, + "grad_norm": 0.3906572759151459, + "learning_rate": 0.00019984913281755084, + "loss": 1.2927, + "step": 17480 + }, + { + "epoch": 0.03715981322360317, + "grad_norm": 0.4829677641391754, + "learning_rate": 0.00019984875859826277, + "loss": 1.2547, + "step": 17490 + }, + { + "epoch": 0.03718105954334223, + "grad_norm": 0.49427488446235657, + "learning_rate": 0.00019984838391578355, + "loss": 1.2491, + "step": 17500 + }, + { + "epoch": 0.03720230586308128, + "grad_norm": 0.3726120889186859, + "learning_rate": 0.00019984800877011493, + "loss": 1.2563, + "step": 17510 + }, + { + "epoch": 0.037223552182820335, + "grad_norm": 0.36482423543930054, + "learning_rate": 0.0001998476331612587, + "loss": 1.2685, + "step": 17520 + }, + { + "epoch": 0.037244798502559386, + "grad_norm": 0.3592258095741272, + "learning_rate": 0.0001998472570892165, + "loss": 1.2594, + "step": 17530 + }, + { + "epoch": 0.037266044822298436, + "grad_norm": 0.33852657675743103, + "learning_rate": 0.00019984688055399015, + "loss": 1.2565, + "step": 17540 + }, + { + "epoch": 0.03728729114203749, + "grad_norm": 0.35450538992881775, + "learning_rate": 0.00019984650355558134, + "loss": 1.3028, + "step": 17550 + }, + { + "epoch": 0.03730853746177654, + "grad_norm": 0.6002469658851624, + "learning_rate": 0.00019984612609399188, + "loss": 1.2693, + "step": 17560 + }, + { + "epoch": 0.037329783781515594, + "grad_norm": 0.49934157729148865, + "learning_rate": 0.00019984574816922352, + "loss": 1.2685, + "step": 17570 + }, + { + "epoch": 0.03735103010125465, + "grad_norm": 0.3286626935005188, + "learning_rate": 0.00019984536978127798, + "loss": 1.2406, + "step": 17580 + }, + { + "epoch": 0.0373722764209937, + "grad_norm": 0.320042222738266, + "learning_rate": 0.000199844990930157, + "loss": 1.2544, + "step": 17590 + }, + { + "epoch": 0.03739352274073275, + "grad_norm": 0.5694490075111389, + "learning_rate": 0.00019984461161586236, + "loss": 1.2771, + "step": 17600 + }, + { + "epoch": 0.03741476906047181, + "grad_norm": 0.5279802083969116, + "learning_rate": 0.00019984423183839583, + "loss": 1.2442, + "step": 17610 + }, + { + "epoch": 0.03743601538021086, + "grad_norm": 0.41382038593292236, + "learning_rate": 0.00019984385159775917, + "loss": 1.2533, + "step": 17620 + }, + { + "epoch": 0.03745726169994991, + "grad_norm": 0.3582109212875366, + "learning_rate": 0.0001998434708939541, + "loss": 1.2336, + "step": 17630 + }, + { + "epoch": 0.037478508019688966, + "grad_norm": 0.3203234076499939, + "learning_rate": 0.00019984308972698247, + "loss": 1.2788, + "step": 17640 + }, + { + "epoch": 0.03749975433942802, + "grad_norm": 0.32123446464538574, + "learning_rate": 0.00019984270809684596, + "loss": 1.3061, + "step": 17650 + }, + { + "epoch": 0.03752100065916707, + "grad_norm": 0.43206924200057983, + "learning_rate": 0.00019984232600354639, + "loss": 1.2486, + "step": 17660 + }, + { + "epoch": 0.037542246978906124, + "grad_norm": 0.3686884641647339, + "learning_rate": 0.00019984194344708556, + "loss": 1.2795, + "step": 17670 + }, + { + "epoch": 0.037563493298645174, + "grad_norm": 0.3482140898704529, + "learning_rate": 0.00019984156042746515, + "loss": 1.328, + "step": 17680 + }, + { + "epoch": 0.037584739618384225, + "grad_norm": 0.4247514605522156, + "learning_rate": 0.00019984117694468704, + "loss": 1.2503, + "step": 17690 + }, + { + "epoch": 0.03760598593812328, + "grad_norm": 0.35217055678367615, + "learning_rate": 0.00019984079299875296, + "loss": 1.204, + "step": 17700 + }, + { + "epoch": 0.03762723225786233, + "grad_norm": 0.3781435191631317, + "learning_rate": 0.00019984040858966466, + "loss": 1.2751, + "step": 17710 + }, + { + "epoch": 0.03764847857760139, + "grad_norm": 0.3831736743450165, + "learning_rate": 0.00019984002371742398, + "loss": 1.2545, + "step": 17720 + }, + { + "epoch": 0.03766972489734044, + "grad_norm": 0.49480563402175903, + "learning_rate": 0.00019983963838203267, + "loss": 1.2537, + "step": 17730 + }, + { + "epoch": 0.03769097121707949, + "grad_norm": 0.505519688129425, + "learning_rate": 0.00019983925258349256, + "loss": 1.2652, + "step": 17740 + }, + { + "epoch": 0.03771221753681855, + "grad_norm": 0.5878559350967407, + "learning_rate": 0.00019983886632180537, + "loss": 1.2728, + "step": 17750 + }, + { + "epoch": 0.0377334638565576, + "grad_norm": 0.36544808745384216, + "learning_rate": 0.00019983847959697298, + "loss": 1.264, + "step": 17760 + }, + { + "epoch": 0.03775471017629665, + "grad_norm": 0.33982008695602417, + "learning_rate": 0.0001998380924089971, + "loss": 1.2198, + "step": 17770 + }, + { + "epoch": 0.037775956496035705, + "grad_norm": 0.3936421871185303, + "learning_rate": 0.00019983770475787957, + "loss": 1.2781, + "step": 17780 + }, + { + "epoch": 0.037797202815774755, + "grad_norm": 0.4574744999408722, + "learning_rate": 0.00019983731664362217, + "loss": 1.2744, + "step": 17790 + }, + { + "epoch": 0.037818449135513806, + "grad_norm": 0.5884156823158264, + "learning_rate": 0.00019983692806622676, + "loss": 1.2759, + "step": 17800 + }, + { + "epoch": 0.03783969545525286, + "grad_norm": 0.5564342737197876, + "learning_rate": 0.00019983653902569507, + "loss": 1.2421, + "step": 17810 + }, + { + "epoch": 0.03786094177499191, + "grad_norm": 0.6871036291122437, + "learning_rate": 0.0001998361495220289, + "loss": 1.2781, + "step": 17820 + }, + { + "epoch": 0.03788218809473096, + "grad_norm": 0.5166600346565247, + "learning_rate": 0.0001998357595552301, + "loss": 1.2504, + "step": 17830 + }, + { + "epoch": 0.03790343441447002, + "grad_norm": 0.3812960088253021, + "learning_rate": 0.00019983536912530046, + "loss": 1.2628, + "step": 17840 + }, + { + "epoch": 0.03792468073420907, + "grad_norm": 0.47909292578697205, + "learning_rate": 0.0001998349782322418, + "loss": 1.2915, + "step": 17850 + }, + { + "epoch": 0.03794592705394812, + "grad_norm": 0.45110252499580383, + "learning_rate": 0.00019983458687605592, + "loss": 1.3024, + "step": 17860 + }, + { + "epoch": 0.03796717337368718, + "grad_norm": 0.36254701018333435, + "learning_rate": 0.00019983419505674465, + "loss": 1.2797, + "step": 17870 + }, + { + "epoch": 0.03798841969342623, + "grad_norm": 0.35890212655067444, + "learning_rate": 0.00019983380277430982, + "loss": 1.2309, + "step": 17880 + }, + { + "epoch": 0.03800966601316528, + "grad_norm": 0.6050844788551331, + "learning_rate": 0.00019983341002875323, + "loss": 1.272, + "step": 17890 + }, + { + "epoch": 0.038030912332904336, + "grad_norm": 0.35284146666526794, + "learning_rate": 0.0001998330168200767, + "loss": 1.2538, + "step": 17900 + }, + { + "epoch": 0.038052158652643386, + "grad_norm": 0.3716890811920166, + "learning_rate": 0.00019983262314828205, + "loss": 1.2309, + "step": 17910 + }, + { + "epoch": 0.038073404972382444, + "grad_norm": 0.5296202301979065, + "learning_rate": 0.00019983222901337109, + "loss": 1.2622, + "step": 17920 + }, + { + "epoch": 0.038094651292121494, + "grad_norm": 0.3469552993774414, + "learning_rate": 0.0001998318344153457, + "loss": 1.2511, + "step": 17930 + }, + { + "epoch": 0.038115897611860544, + "grad_norm": 0.3768733739852905, + "learning_rate": 0.0001998314393542077, + "loss": 1.289, + "step": 17940 + }, + { + "epoch": 0.0381371439315996, + "grad_norm": 0.3658464848995209, + "learning_rate": 0.00019983104382995887, + "loss": 1.2313, + "step": 17950 + }, + { + "epoch": 0.03815839025133865, + "grad_norm": 0.40740591287612915, + "learning_rate": 0.0001998306478426011, + "loss": 1.2479, + "step": 17960 + }, + { + "epoch": 0.0381796365710777, + "grad_norm": 0.5274581909179688, + "learning_rate": 0.00019983025139213618, + "loss": 1.231, + "step": 17970 + }, + { + "epoch": 0.03820088289081676, + "grad_norm": 0.5525963306427002, + "learning_rate": 0.000199829854478566, + "loss": 1.2792, + "step": 17980 + }, + { + "epoch": 0.03822212921055581, + "grad_norm": 0.42543691396713257, + "learning_rate": 0.0001998294571018924, + "loss": 1.2148, + "step": 17990 + }, + { + "epoch": 0.03824337553029486, + "grad_norm": 0.37067171931266785, + "learning_rate": 0.00019982905926211716, + "loss": 1.216, + "step": 18000 + }, + { + "epoch": 0.03826462185003392, + "grad_norm": 0.3799036145210266, + "learning_rate": 0.00019982866095924218, + "loss": 1.2442, + "step": 18010 + }, + { + "epoch": 0.03828586816977297, + "grad_norm": 0.45350706577301025, + "learning_rate": 0.00019982826219326928, + "loss": 1.2744, + "step": 18020 + }, + { + "epoch": 0.03830711448951202, + "grad_norm": 0.40344464778900146, + "learning_rate": 0.00019982786296420034, + "loss": 1.2738, + "step": 18030 + }, + { + "epoch": 0.038328360809251075, + "grad_norm": 0.4797520041465759, + "learning_rate": 0.0001998274632720372, + "loss": 1.2566, + "step": 18040 + }, + { + "epoch": 0.038349607128990125, + "grad_norm": 0.525907039642334, + "learning_rate": 0.0001998270631167817, + "loss": 1.264, + "step": 18050 + }, + { + "epoch": 0.038370853448729175, + "grad_norm": 0.3417554795742035, + "learning_rate": 0.00019982666249843573, + "loss": 1.2403, + "step": 18060 + }, + { + "epoch": 0.03839209976846823, + "grad_norm": 0.36542925238609314, + "learning_rate": 0.0001998262614170011, + "loss": 1.2699, + "step": 18070 + }, + { + "epoch": 0.03841334608820728, + "grad_norm": 0.5048639178276062, + "learning_rate": 0.00019982585987247972, + "loss": 1.2678, + "step": 18080 + }, + { + "epoch": 0.03843459240794633, + "grad_norm": 0.4295097887516022, + "learning_rate": 0.00019982545786487342, + "loss": 1.2442, + "step": 18090 + }, + { + "epoch": 0.03845583872768539, + "grad_norm": 0.4658900499343872, + "learning_rate": 0.00019982505539418402, + "loss": 1.272, + "step": 18100 + }, + { + "epoch": 0.03847708504742444, + "grad_norm": 0.5487869381904602, + "learning_rate": 0.00019982465246041347, + "loss": 1.2274, + "step": 18110 + }, + { + "epoch": 0.03849833136716349, + "grad_norm": 0.32749143242836, + "learning_rate": 0.00019982424906356364, + "loss": 1.2581, + "step": 18120 + }, + { + "epoch": 0.03851957768690255, + "grad_norm": 0.5066694617271423, + "learning_rate": 0.00019982384520363637, + "loss": 1.3055, + "step": 18130 + }, + { + "epoch": 0.0385408240066416, + "grad_norm": 0.3283524513244629, + "learning_rate": 0.00019982344088063353, + "loss": 1.2752, + "step": 18140 + }, + { + "epoch": 0.038562070326380656, + "grad_norm": 0.4335974454879761, + "learning_rate": 0.00019982303609455696, + "loss": 1.2194, + "step": 18150 + }, + { + "epoch": 0.038583316646119706, + "grad_norm": 0.3253665864467621, + "learning_rate": 0.00019982263084540862, + "loss": 1.2795, + "step": 18160 + }, + { + "epoch": 0.038604562965858756, + "grad_norm": 0.32959678769111633, + "learning_rate": 0.00019982222513319033, + "loss": 1.2298, + "step": 18170 + }, + { + "epoch": 0.03862580928559781, + "grad_norm": 0.5444913506507874, + "learning_rate": 0.000199821818957904, + "loss": 1.2375, + "step": 18180 + }, + { + "epoch": 0.038647055605336864, + "grad_norm": 0.3862419128417969, + "learning_rate": 0.00019982141231955148, + "loss": 1.2442, + "step": 18190 + }, + { + "epoch": 0.038668301925075914, + "grad_norm": 0.3974175453186035, + "learning_rate": 0.00019982100521813472, + "loss": 1.2608, + "step": 18200 + }, + { + "epoch": 0.03868954824481497, + "grad_norm": 0.4338286817073822, + "learning_rate": 0.00019982059765365554, + "loss": 1.2782, + "step": 18210 + }, + { + "epoch": 0.03871079456455402, + "grad_norm": 0.3649689555168152, + "learning_rate": 0.00019982018962611586, + "loss": 1.2441, + "step": 18220 + }, + { + "epoch": 0.03873204088429307, + "grad_norm": 0.36239153146743774, + "learning_rate": 0.00019981978113551758, + "loss": 1.2664, + "step": 18230 + }, + { + "epoch": 0.03875328720403213, + "grad_norm": 0.390056312084198, + "learning_rate": 0.00019981937218186256, + "loss": 1.2467, + "step": 18240 + }, + { + "epoch": 0.03877453352377118, + "grad_norm": 0.41403377056121826, + "learning_rate": 0.00019981896276515275, + "loss": 1.261, + "step": 18250 + }, + { + "epoch": 0.03879577984351023, + "grad_norm": 0.4122839570045471, + "learning_rate": 0.00019981855288539002, + "loss": 1.2709, + "step": 18260 + }, + { + "epoch": 0.03881702616324929, + "grad_norm": 0.39846697449684143, + "learning_rate": 0.00019981814254257628, + "loss": 1.2896, + "step": 18270 + }, + { + "epoch": 0.03883827248298834, + "grad_norm": 0.3365967869758606, + "learning_rate": 0.0001998177317367134, + "loss": 1.2411, + "step": 18280 + }, + { + "epoch": 0.03885951880272739, + "grad_norm": 0.33333519101142883, + "learning_rate": 0.00019981732046780336, + "loss": 1.2665, + "step": 18290 + }, + { + "epoch": 0.038880765122466444, + "grad_norm": 0.3608172833919525, + "learning_rate": 0.000199816908735848, + "loss": 1.2799, + "step": 18300 + }, + { + "epoch": 0.038902011442205495, + "grad_norm": 0.33687838912010193, + "learning_rate": 0.00019981649654084925, + "loss": 1.2765, + "step": 18310 + }, + { + "epoch": 0.038923257761944545, + "grad_norm": 0.34966176748275757, + "learning_rate": 0.00019981608388280902, + "loss": 1.2573, + "step": 18320 + }, + { + "epoch": 0.0389445040816836, + "grad_norm": 0.4170011281967163, + "learning_rate": 0.00019981567076172922, + "loss": 1.2663, + "step": 18330 + }, + { + "epoch": 0.03896575040142265, + "grad_norm": 0.39517009258270264, + "learning_rate": 0.00019981525717761178, + "loss": 1.2744, + "step": 18340 + }, + { + "epoch": 0.03898699672116171, + "grad_norm": 0.500720202922821, + "learning_rate": 0.00019981484313045863, + "loss": 1.2106, + "step": 18350 + }, + { + "epoch": 0.03900824304090076, + "grad_norm": 0.3839428722858429, + "learning_rate": 0.00019981442862027166, + "loss": 1.2368, + "step": 18360 + }, + { + "epoch": 0.03902948936063981, + "grad_norm": 0.41456007957458496, + "learning_rate": 0.00019981401364705282, + "loss": 1.2331, + "step": 18370 + }, + { + "epoch": 0.03905073568037887, + "grad_norm": 0.383634477853775, + "learning_rate": 0.00019981359821080402, + "loss": 1.2623, + "step": 18380 + }, + { + "epoch": 0.03907198200011792, + "grad_norm": 0.4930288791656494, + "learning_rate": 0.0001998131823115272, + "loss": 1.2267, + "step": 18390 + }, + { + "epoch": 0.03909322831985697, + "grad_norm": 0.3596273958683014, + "learning_rate": 0.00019981276594922424, + "loss": 1.2497, + "step": 18400 + }, + { + "epoch": 0.039114474639596025, + "grad_norm": 0.3543323278427124, + "learning_rate": 0.00019981234912389711, + "loss": 1.2832, + "step": 18410 + }, + { + "epoch": 0.039135720959335075, + "grad_norm": 0.45005714893341064, + "learning_rate": 0.00019981193183554779, + "loss": 1.2428, + "step": 18420 + }, + { + "epoch": 0.039156967279074126, + "grad_norm": 0.5096533894538879, + "learning_rate": 0.00019981151408417813, + "loss": 1.2455, + "step": 18430 + }, + { + "epoch": 0.03917821359881318, + "grad_norm": 0.37116387486457825, + "learning_rate": 0.00019981109586979012, + "loss": 1.244, + "step": 18440 + }, + { + "epoch": 0.03919945991855223, + "grad_norm": 0.5169209241867065, + "learning_rate": 0.0001998106771923857, + "loss": 1.2492, + "step": 18450 + }, + { + "epoch": 0.039220706238291284, + "grad_norm": 0.3543490171432495, + "learning_rate": 0.00019981025805196678, + "loss": 1.2531, + "step": 18460 + }, + { + "epoch": 0.03924195255803034, + "grad_norm": 0.3944735527038574, + "learning_rate": 0.00019980983844853533, + "loss": 1.2748, + "step": 18470 + }, + { + "epoch": 0.03926319887776939, + "grad_norm": 0.406376451253891, + "learning_rate": 0.00019980941838209328, + "loss": 1.2622, + "step": 18480 + }, + { + "epoch": 0.03928444519750844, + "grad_norm": 0.4949665367603302, + "learning_rate": 0.00019980899785264263, + "loss": 1.2183, + "step": 18490 + }, + { + "epoch": 0.0393056915172475, + "grad_norm": 0.47493976354599, + "learning_rate": 0.00019980857686018527, + "loss": 1.2663, + "step": 18500 + }, + { + "epoch": 0.03932693783698655, + "grad_norm": 0.5668587684631348, + "learning_rate": 0.00019980815540472313, + "loss": 1.2231, + "step": 18510 + }, + { + "epoch": 0.0393481841567256, + "grad_norm": 0.5727503895759583, + "learning_rate": 0.00019980773348625826, + "loss": 1.2539, + "step": 18520 + }, + { + "epoch": 0.039369430476464656, + "grad_norm": 0.46369224786758423, + "learning_rate": 0.00019980731110479254, + "loss": 1.2607, + "step": 18530 + }, + { + "epoch": 0.03939067679620371, + "grad_norm": 0.34395089745521545, + "learning_rate": 0.00019980688826032797, + "loss": 1.2464, + "step": 18540 + }, + { + "epoch": 0.03941192311594276, + "grad_norm": 0.35747385025024414, + "learning_rate": 0.00019980646495286648, + "loss": 1.2674, + "step": 18550 + }, + { + "epoch": 0.039433169435681814, + "grad_norm": 0.579807460308075, + "learning_rate": 0.00019980604118241004, + "loss": 1.2473, + "step": 18560 + }, + { + "epoch": 0.039454415755420864, + "grad_norm": 0.4327913224697113, + "learning_rate": 0.00019980561694896064, + "loss": 1.2304, + "step": 18570 + }, + { + "epoch": 0.03947566207515992, + "grad_norm": 0.42074599862098694, + "learning_rate": 0.00019980519225252025, + "loss": 1.3045, + "step": 18580 + }, + { + "epoch": 0.03949690839489897, + "grad_norm": 0.5109519362449646, + "learning_rate": 0.00019980476709309083, + "loss": 1.291, + "step": 18590 + }, + { + "epoch": 0.03951815471463802, + "grad_norm": 0.5456305742263794, + "learning_rate": 0.00019980434147067431, + "loss": 1.2612, + "step": 18600 + }, + { + "epoch": 0.03953940103437708, + "grad_norm": 0.3790073096752167, + "learning_rate": 0.0001998039153852727, + "loss": 1.2587, + "step": 18610 + }, + { + "epoch": 0.03956064735411613, + "grad_norm": 0.41773897409439087, + "learning_rate": 0.00019980348883688799, + "loss": 1.2434, + "step": 18620 + }, + { + "epoch": 0.03958189367385518, + "grad_norm": 0.6627351641654968, + "learning_rate": 0.00019980306182552218, + "loss": 1.2689, + "step": 18630 + }, + { + "epoch": 0.03960313999359424, + "grad_norm": 0.6540055871009827, + "learning_rate": 0.0001998026343511772, + "loss": 1.2798, + "step": 18640 + }, + { + "epoch": 0.03962438631333329, + "grad_norm": 0.4004923403263092, + "learning_rate": 0.000199802206413855, + "loss": 1.2489, + "step": 18650 + }, + { + "epoch": 0.03964563263307234, + "grad_norm": 0.5833346843719482, + "learning_rate": 0.00019980177801355765, + "loss": 1.2693, + "step": 18660 + }, + { + "epoch": 0.039666878952811395, + "grad_norm": 0.433246910572052, + "learning_rate": 0.00019980134915028713, + "loss": 1.2552, + "step": 18670 + }, + { + "epoch": 0.039688125272550445, + "grad_norm": 0.3655533790588379, + "learning_rate": 0.00019980091982404536, + "loss": 1.2777, + "step": 18680 + }, + { + "epoch": 0.039709371592289495, + "grad_norm": 0.36537498235702515, + "learning_rate": 0.00019980049003483438, + "loss": 1.2987, + "step": 18690 + }, + { + "epoch": 0.03973061791202855, + "grad_norm": 0.3877891004085541, + "learning_rate": 0.00019980005978265618, + "loss": 1.2485, + "step": 18700 + }, + { + "epoch": 0.0397518642317676, + "grad_norm": 0.3748418092727661, + "learning_rate": 0.00019979962906751278, + "loss": 1.2766, + "step": 18710 + }, + { + "epoch": 0.03977311055150665, + "grad_norm": 0.4261835217475891, + "learning_rate": 0.0001997991978894061, + "loss": 1.2627, + "step": 18720 + }, + { + "epoch": 0.03979435687124571, + "grad_norm": 0.3545245826244354, + "learning_rate": 0.00019979876624833822, + "loss": 1.3107, + "step": 18730 + }, + { + "epoch": 0.03981560319098476, + "grad_norm": 0.35869982838630676, + "learning_rate": 0.00019979833414431112, + "loss": 1.2627, + "step": 18740 + }, + { + "epoch": 0.03983684951072381, + "grad_norm": 0.5158436298370361, + "learning_rate": 0.0001997979015773268, + "loss": 1.2251, + "step": 18750 + }, + { + "epoch": 0.03985809583046287, + "grad_norm": 0.46637555956840515, + "learning_rate": 0.00019979746854738726, + "loss": 1.2542, + "step": 18760 + }, + { + "epoch": 0.03987934215020192, + "grad_norm": 0.3285894989967346, + "learning_rate": 0.0001997970350544945, + "loss": 1.2866, + "step": 18770 + }, + { + "epoch": 0.039900588469940976, + "grad_norm": 0.4215281903743744, + "learning_rate": 0.00019979660109865056, + "loss": 1.2478, + "step": 18780 + }, + { + "epoch": 0.039921834789680026, + "grad_norm": 0.4163514971733093, + "learning_rate": 0.00019979616667985744, + "loss": 1.2748, + "step": 18790 + }, + { + "epoch": 0.039943081109419076, + "grad_norm": 0.3388252258300781, + "learning_rate": 0.00019979573179811712, + "loss": 1.2376, + "step": 18800 + }, + { + "epoch": 0.039964327429158134, + "grad_norm": 0.4350028336048126, + "learning_rate": 0.00019979529645343166, + "loss": 1.2941, + "step": 18810 + }, + { + "epoch": 0.039985573748897184, + "grad_norm": 0.47426924109458923, + "learning_rate": 0.0001997948606458031, + "loss": 1.2472, + "step": 18820 + }, + { + "epoch": 0.040006820068636234, + "grad_norm": 0.37416937947273254, + "learning_rate": 0.0001997944243752334, + "loss": 1.2769, + "step": 18830 + }, + { + "epoch": 0.04002806638837529, + "grad_norm": 0.4113101661205292, + "learning_rate": 0.00019979398764172465, + "loss": 1.2345, + "step": 18840 + }, + { + "epoch": 0.04004931270811434, + "grad_norm": 0.4277748763561249, + "learning_rate": 0.0001997935504452788, + "loss": 1.24, + "step": 18850 + }, + { + "epoch": 0.04007055902785339, + "grad_norm": 0.39529848098754883, + "learning_rate": 0.00019979311278589795, + "loss": 1.2642, + "step": 18860 + }, + { + "epoch": 0.04009180534759245, + "grad_norm": 0.39919254183769226, + "learning_rate": 0.0001997926746635841, + "loss": 1.2964, + "step": 18870 + }, + { + "epoch": 0.0401130516673315, + "grad_norm": 0.43216413259506226, + "learning_rate": 0.00019979223607833926, + "loss": 1.2617, + "step": 18880 + }, + { + "epoch": 0.04013429798707055, + "grad_norm": 0.503809928894043, + "learning_rate": 0.0001997917970301655, + "loss": 1.2864, + "step": 18890 + }, + { + "epoch": 0.04015554430680961, + "grad_norm": 0.5105933547019958, + "learning_rate": 0.00019979135751906486, + "loss": 1.1942, + "step": 18900 + }, + { + "epoch": 0.04017679062654866, + "grad_norm": 0.3941263258457184, + "learning_rate": 0.00019979091754503933, + "loss": 1.279, + "step": 18910 + }, + { + "epoch": 0.04019803694628771, + "grad_norm": 0.41056233644485474, + "learning_rate": 0.000199790477108091, + "loss": 1.2995, + "step": 18920 + }, + { + "epoch": 0.040219283266026765, + "grad_norm": 0.3785092532634735, + "learning_rate": 0.0001997900362082219, + "loss": 1.276, + "step": 18930 + }, + { + "epoch": 0.040240529585765815, + "grad_norm": 0.3529833257198334, + "learning_rate": 0.00019978959484543405, + "loss": 1.2655, + "step": 18940 + }, + { + "epoch": 0.040261775905504865, + "grad_norm": 0.3888013958930969, + "learning_rate": 0.00019978915301972953, + "loss": 1.2548, + "step": 18950 + }, + { + "epoch": 0.04028302222524392, + "grad_norm": 0.3609119653701782, + "learning_rate": 0.0001997887107311104, + "loss": 1.261, + "step": 18960 + }, + { + "epoch": 0.04030426854498297, + "grad_norm": 0.36513713002204895, + "learning_rate": 0.00019978826797957866, + "loss": 1.2571, + "step": 18970 + }, + { + "epoch": 0.04032551486472203, + "grad_norm": 0.41936829686164856, + "learning_rate": 0.0001997878247651364, + "loss": 1.2327, + "step": 18980 + }, + { + "epoch": 0.04034676118446108, + "grad_norm": 0.3601458668708801, + "learning_rate": 0.00019978738108778567, + "loss": 1.247, + "step": 18990 + }, + { + "epoch": 0.04036800750420013, + "grad_norm": 0.38460519909858704, + "learning_rate": 0.00019978693694752854, + "loss": 1.2482, + "step": 19000 + }, + { + "epoch": 0.04038925382393919, + "grad_norm": 0.3415684103965759, + "learning_rate": 0.00019978649234436707, + "loss": 1.2312, + "step": 19010 + }, + { + "epoch": 0.04041050014367824, + "grad_norm": 0.5605719089508057, + "learning_rate": 0.0001997860472783033, + "loss": 1.2864, + "step": 19020 + }, + { + "epoch": 0.04043174646341729, + "grad_norm": 0.45204320549964905, + "learning_rate": 0.00019978560174933928, + "loss": 1.2725, + "step": 19030 + }, + { + "epoch": 0.040452992783156345, + "grad_norm": 0.6007397770881653, + "learning_rate": 0.0001997851557574771, + "loss": 1.2376, + "step": 19040 + }, + { + "epoch": 0.040474239102895396, + "grad_norm": 0.34583643078804016, + "learning_rate": 0.00019978470930271888, + "loss": 1.266, + "step": 19050 + }, + { + "epoch": 0.040495485422634446, + "grad_norm": 0.4802301228046417, + "learning_rate": 0.00019978426238506663, + "loss": 1.2564, + "step": 19060 + }, + { + "epoch": 0.0405167317423735, + "grad_norm": 0.33800771832466125, + "learning_rate": 0.0001997838150045224, + "loss": 1.2605, + "step": 19070 + }, + { + "epoch": 0.040537978062112554, + "grad_norm": 0.39989233016967773, + "learning_rate": 0.00019978336716108835, + "loss": 1.2618, + "step": 19080 + }, + { + "epoch": 0.040559224381851604, + "grad_norm": 0.46265867352485657, + "learning_rate": 0.00019978291885476646, + "loss": 1.281, + "step": 19090 + }, + { + "epoch": 0.04058047070159066, + "grad_norm": 0.426350861787796, + "learning_rate": 0.00019978247008555888, + "loss": 1.2462, + "step": 19100 + }, + { + "epoch": 0.04060171702132971, + "grad_norm": 0.35468360781669617, + "learning_rate": 0.00019978202085346768, + "loss": 1.2543, + "step": 19110 + }, + { + "epoch": 0.04062296334106876, + "grad_norm": 0.44483742117881775, + "learning_rate": 0.00019978157115849493, + "loss": 1.2303, + "step": 19120 + }, + { + "epoch": 0.04064420966080782, + "grad_norm": 0.34414708614349365, + "learning_rate": 0.00019978112100064272, + "loss": 1.252, + "step": 19130 + }, + { + "epoch": 0.04066545598054687, + "grad_norm": 0.3451782763004303, + "learning_rate": 0.00019978067037991314, + "loss": 1.2641, + "step": 19140 + }, + { + "epoch": 0.04068670230028592, + "grad_norm": 0.6282997727394104, + "learning_rate": 0.00019978021929630826, + "loss": 1.263, + "step": 19150 + }, + { + "epoch": 0.04070794862002498, + "grad_norm": 0.3250134587287903, + "learning_rate": 0.00019977976774983024, + "loss": 1.2649, + "step": 19160 + }, + { + "epoch": 0.04072919493976403, + "grad_norm": 0.5353121161460876, + "learning_rate": 0.00019977931574048107, + "loss": 1.2387, + "step": 19170 + }, + { + "epoch": 0.04075044125950308, + "grad_norm": 0.3798757493495941, + "learning_rate": 0.00019977886326826293, + "loss": 1.249, + "step": 19180 + }, + { + "epoch": 0.040771687579242134, + "grad_norm": 0.38595980405807495, + "learning_rate": 0.00019977841033317787, + "loss": 1.2557, + "step": 19190 + }, + { + "epoch": 0.040792933898981185, + "grad_norm": 0.47010451555252075, + "learning_rate": 0.00019977795693522805, + "loss": 1.2886, + "step": 19200 + }, + { + "epoch": 0.04081418021872024, + "grad_norm": 0.4642966389656067, + "learning_rate": 0.0001997775030744155, + "loss": 1.2636, + "step": 19210 + }, + { + "epoch": 0.04083542653845929, + "grad_norm": 0.4332388639450073, + "learning_rate": 0.0001997770487507424, + "loss": 1.2528, + "step": 19220 + }, + { + "epoch": 0.04085667285819834, + "grad_norm": 0.45240887999534607, + "learning_rate": 0.00019977659396421078, + "loss": 1.2705, + "step": 19230 + }, + { + "epoch": 0.0408779191779374, + "grad_norm": 0.4439499080181122, + "learning_rate": 0.00019977613871482278, + "loss": 1.2524, + "step": 19240 + }, + { + "epoch": 0.04089916549767645, + "grad_norm": 0.45197373628616333, + "learning_rate": 0.0001997756830025806, + "loss": 1.2488, + "step": 19250 + }, + { + "epoch": 0.0409204118174155, + "grad_norm": 0.3380126655101776, + "learning_rate": 0.0001997752268274862, + "loss": 1.2583, + "step": 19260 + }, + { + "epoch": 0.04094165813715456, + "grad_norm": 0.3919467031955719, + "learning_rate": 0.00019977477018954178, + "loss": 1.2487, + "step": 19270 + }, + { + "epoch": 0.04096290445689361, + "grad_norm": 0.38753366470336914, + "learning_rate": 0.00019977431308874948, + "loss": 1.2669, + "step": 19280 + }, + { + "epoch": 0.04098415077663266, + "grad_norm": 0.48211562633514404, + "learning_rate": 0.00019977385552511135, + "loss": 1.2284, + "step": 19290 + }, + { + "epoch": 0.041005397096371715, + "grad_norm": 0.4603411555290222, + "learning_rate": 0.0001997733974986296, + "loss": 1.214, + "step": 19300 + }, + { + "epoch": 0.041026643416110765, + "grad_norm": 0.457356333732605, + "learning_rate": 0.00019977293900930626, + "loss": 1.2512, + "step": 19310 + }, + { + "epoch": 0.041047889735849816, + "grad_norm": 0.34482812881469727, + "learning_rate": 0.00019977248005714353, + "loss": 1.2593, + "step": 19320 + }, + { + "epoch": 0.04106913605558887, + "grad_norm": 0.35531023144721985, + "learning_rate": 0.00019977202064214354, + "loss": 1.2638, + "step": 19330 + }, + { + "epoch": 0.04109038237532792, + "grad_norm": 0.3409656584262848, + "learning_rate": 0.00019977156076430834, + "loss": 1.2492, + "step": 19340 + }, + { + "epoch": 0.041111628695066973, + "grad_norm": 0.3498866856098175, + "learning_rate": 0.00019977110042364014, + "loss": 1.2482, + "step": 19350 + }, + { + "epoch": 0.04113287501480603, + "grad_norm": 0.3245124816894531, + "learning_rate": 0.00019977063962014105, + "loss": 1.2528, + "step": 19360 + }, + { + "epoch": 0.04115412133454508, + "grad_norm": 0.3241535425186157, + "learning_rate": 0.00019977017835381324, + "loss": 1.22, + "step": 19370 + }, + { + "epoch": 0.04117536765428413, + "grad_norm": 0.35628506541252136, + "learning_rate": 0.0001997697166246588, + "loss": 1.228, + "step": 19380 + }, + { + "epoch": 0.04119661397402319, + "grad_norm": 0.41015660762786865, + "learning_rate": 0.00019976925443267987, + "loss": 1.2481, + "step": 19390 + }, + { + "epoch": 0.04121786029376224, + "grad_norm": 0.3475078046321869, + "learning_rate": 0.00019976879177787863, + "loss": 1.2557, + "step": 19400 + }, + { + "epoch": 0.041239106613501296, + "grad_norm": 0.37282076478004456, + "learning_rate": 0.00019976832866025723, + "loss": 1.2656, + "step": 19410 + }, + { + "epoch": 0.041260352933240346, + "grad_norm": 0.39893996715545654, + "learning_rate": 0.00019976786507981778, + "loss": 1.2595, + "step": 19420 + }, + { + "epoch": 0.041281599252979397, + "grad_norm": 0.40755516290664673, + "learning_rate": 0.0001997674010365625, + "loss": 1.2793, + "step": 19430 + }, + { + "epoch": 0.041302845572718454, + "grad_norm": 0.533052921295166, + "learning_rate": 0.00019976693653049347, + "loss": 1.2381, + "step": 19440 + }, + { + "epoch": 0.041324091892457504, + "grad_norm": 0.5006403923034668, + "learning_rate": 0.00019976647156161282, + "loss": 1.2592, + "step": 19450 + }, + { + "epoch": 0.041345338212196554, + "grad_norm": 0.36725273728370667, + "learning_rate": 0.0001997660061299228, + "loss": 1.2888, + "step": 19460 + }, + { + "epoch": 0.04136658453193561, + "grad_norm": 0.39514192938804626, + "learning_rate": 0.00019976554023542554, + "loss": 1.2368, + "step": 19470 + }, + { + "epoch": 0.04138783085167466, + "grad_norm": 0.4625631868839264, + "learning_rate": 0.00019976507387812315, + "loss": 1.2544, + "step": 19480 + }, + { + "epoch": 0.04140907717141371, + "grad_norm": 0.3955468535423279, + "learning_rate": 0.00019976460705801782, + "loss": 1.2156, + "step": 19490 + }, + { + "epoch": 0.04143032349115277, + "grad_norm": 0.3748619854450226, + "learning_rate": 0.00019976413977511177, + "loss": 1.27, + "step": 19500 + }, + { + "epoch": 0.04145156981089182, + "grad_norm": 0.3611891567707062, + "learning_rate": 0.0001997636720294071, + "loss": 1.2445, + "step": 19510 + }, + { + "epoch": 0.04147281613063087, + "grad_norm": 0.35576343536376953, + "learning_rate": 0.000199763203820906, + "loss": 1.2227, + "step": 19520 + }, + { + "epoch": 0.04149406245036993, + "grad_norm": 0.3580278754234314, + "learning_rate": 0.00019976273514961065, + "loss": 1.2708, + "step": 19530 + }, + { + "epoch": 0.04151530877010898, + "grad_norm": 0.5173043608665466, + "learning_rate": 0.00019976226601552322, + "loss": 1.2334, + "step": 19540 + }, + { + "epoch": 0.04153655508984803, + "grad_norm": 0.36131617426872253, + "learning_rate": 0.00019976179641864586, + "loss": 1.2224, + "step": 19550 + }, + { + "epoch": 0.041557801409587085, + "grad_norm": 0.4009658694267273, + "learning_rate": 0.00019976132635898082, + "loss": 1.2216, + "step": 19560 + }, + { + "epoch": 0.041579047729326135, + "grad_norm": 0.3941552937030792, + "learning_rate": 0.00019976085583653022, + "loss": 1.2794, + "step": 19570 + }, + { + "epoch": 0.041600294049065185, + "grad_norm": 0.3589954972267151, + "learning_rate": 0.00019976038485129626, + "loss": 1.2449, + "step": 19580 + }, + { + "epoch": 0.04162154036880424, + "grad_norm": 0.39787864685058594, + "learning_rate": 0.0001997599134032811, + "loss": 1.2383, + "step": 19590 + }, + { + "epoch": 0.04164278668854329, + "grad_norm": 0.3316423296928406, + "learning_rate": 0.00019975944149248697, + "loss": 1.2274, + "step": 19600 + }, + { + "epoch": 0.04166403300828234, + "grad_norm": 0.36119651794433594, + "learning_rate": 0.00019975896911891602, + "loss": 1.2515, + "step": 19610 + }, + { + "epoch": 0.0416852793280214, + "grad_norm": 0.4237316846847534, + "learning_rate": 0.00019975849628257048, + "loss": 1.271, + "step": 19620 + }, + { + "epoch": 0.04170652564776045, + "grad_norm": 0.34705832600593567, + "learning_rate": 0.0001997580229834525, + "loss": 1.2545, + "step": 19630 + }, + { + "epoch": 0.04172777196749951, + "grad_norm": 0.447437584400177, + "learning_rate": 0.00019975754922156434, + "loss": 1.2607, + "step": 19640 + }, + { + "epoch": 0.04174901828723856, + "grad_norm": 0.3502061069011688, + "learning_rate": 0.00019975707499690813, + "loss": 1.2831, + "step": 19650 + }, + { + "epoch": 0.04177026460697761, + "grad_norm": 0.4281626343727112, + "learning_rate": 0.00019975660030948608, + "loss": 1.2593, + "step": 19660 + }, + { + "epoch": 0.041791510926716666, + "grad_norm": 0.37369391322135925, + "learning_rate": 0.00019975612515930042, + "loss": 1.2292, + "step": 19670 + }, + { + "epoch": 0.041812757246455716, + "grad_norm": 0.37262019515037537, + "learning_rate": 0.00019975564954635336, + "loss": 1.2795, + "step": 19680 + }, + { + "epoch": 0.041834003566194766, + "grad_norm": 0.3942998945713043, + "learning_rate": 0.00019975517347064705, + "loss": 1.2329, + "step": 19690 + }, + { + "epoch": 0.04185524988593382, + "grad_norm": 0.3391101360321045, + "learning_rate": 0.00019975469693218377, + "loss": 1.2304, + "step": 19700 + }, + { + "epoch": 0.041876496205672874, + "grad_norm": 0.5822133421897888, + "learning_rate": 0.0001997542199309657, + "loss": 1.2674, + "step": 19710 + }, + { + "epoch": 0.041897742525411924, + "grad_norm": 0.3701478838920593, + "learning_rate": 0.00019975374246699503, + "loss": 1.2261, + "step": 19720 + }, + { + "epoch": 0.04191898884515098, + "grad_norm": 0.4077053368091583, + "learning_rate": 0.000199753264540274, + "loss": 1.2451, + "step": 19730 + }, + { + "epoch": 0.04194023516489003, + "grad_norm": 0.32284772396087646, + "learning_rate": 0.00019975278615080484, + "loss": 1.218, + "step": 19740 + }, + { + "epoch": 0.04196148148462908, + "grad_norm": 0.3342863917350769, + "learning_rate": 0.0001997523072985897, + "loss": 1.2461, + "step": 19750 + }, + { + "epoch": 0.04198272780436814, + "grad_norm": 0.3301607072353363, + "learning_rate": 0.00019975182798363092, + "loss": 1.2113, + "step": 19760 + }, + { + "epoch": 0.04200397412410719, + "grad_norm": 0.5210681557655334, + "learning_rate": 0.00019975134820593063, + "loss": 1.2247, + "step": 19770 + }, + { + "epoch": 0.04202522044384624, + "grad_norm": 0.41106703877449036, + "learning_rate": 0.0001997508679654911, + "loss": 1.2397, + "step": 19780 + }, + { + "epoch": 0.0420464667635853, + "grad_norm": 0.36347559094429016, + "learning_rate": 0.0001997503872623145, + "loss": 1.2592, + "step": 19790 + }, + { + "epoch": 0.04206771308332435, + "grad_norm": 0.3546050190925598, + "learning_rate": 0.0001997499060964031, + "loss": 1.2331, + "step": 19800 + }, + { + "epoch": 0.0420889594030634, + "grad_norm": 0.539020836353302, + "learning_rate": 0.00019974942446775916, + "loss": 1.2402, + "step": 19810 + }, + { + "epoch": 0.042110205722802455, + "grad_norm": 0.31960099935531616, + "learning_rate": 0.0001997489423763849, + "loss": 1.2883, + "step": 19820 + }, + { + "epoch": 0.042131452042541505, + "grad_norm": 0.9570057988166809, + "learning_rate": 0.00019974845982228252, + "loss": 1.2471, + "step": 19830 + }, + { + "epoch": 0.04215269836228056, + "grad_norm": 0.5397257804870605, + "learning_rate": 0.0001997479768054543, + "loss": 1.2426, + "step": 19840 + }, + { + "epoch": 0.04217394468201961, + "grad_norm": 0.5398075580596924, + "learning_rate": 0.00019974749332590244, + "loss": 1.261, + "step": 19850 + }, + { + "epoch": 0.04219519100175866, + "grad_norm": 0.3943006694316864, + "learning_rate": 0.0001997470093836292, + "loss": 1.2736, + "step": 19860 + }, + { + "epoch": 0.04221643732149772, + "grad_norm": 0.3651895523071289, + "learning_rate": 0.00019974652497863686, + "loss": 1.2432, + "step": 19870 + }, + { + "epoch": 0.04223768364123677, + "grad_norm": 0.3636420667171478, + "learning_rate": 0.00019974604011092762, + "loss": 1.2373, + "step": 19880 + }, + { + "epoch": 0.04225892996097582, + "grad_norm": 0.3919502794742584, + "learning_rate": 0.00019974555478050374, + "loss": 1.2349, + "step": 19890 + }, + { + "epoch": 0.04228017628071488, + "grad_norm": 0.5459203720092773, + "learning_rate": 0.0001997450689873675, + "loss": 1.2159, + "step": 19900 + }, + { + "epoch": 0.04230142260045393, + "grad_norm": 0.34529903531074524, + "learning_rate": 0.0001997445827315211, + "loss": 1.2365, + "step": 19910 + }, + { + "epoch": 0.04232266892019298, + "grad_norm": 0.41355350613594055, + "learning_rate": 0.00019974409601296685, + "loss": 1.249, + "step": 19920 + }, + { + "epoch": 0.042343915239932035, + "grad_norm": 0.3999328911304474, + "learning_rate": 0.00019974360883170695, + "loss": 1.2298, + "step": 19930 + }, + { + "epoch": 0.042365161559671086, + "grad_norm": 0.34703341126441956, + "learning_rate": 0.0001997431211877437, + "loss": 1.2356, + "step": 19940 + }, + { + "epoch": 0.042386407879410136, + "grad_norm": 0.4116883873939514, + "learning_rate": 0.00019974263308107938, + "loss": 1.2646, + "step": 19950 + }, + { + "epoch": 0.04240765419914919, + "grad_norm": 0.4342650771141052, + "learning_rate": 0.00019974214451171625, + "loss": 1.2508, + "step": 19960 + }, + { + "epoch": 0.04242890051888824, + "grad_norm": 0.5011958479881287, + "learning_rate": 0.00019974165547965653, + "loss": 1.2338, + "step": 19970 + }, + { + "epoch": 0.042450146838627294, + "grad_norm": 0.36851075291633606, + "learning_rate": 0.00019974116598490252, + "loss": 1.2454, + "step": 19980 + }, + { + "epoch": 0.04247139315836635, + "grad_norm": 0.3627263307571411, + "learning_rate": 0.0001997406760274565, + "loss": 1.2618, + "step": 19990 + }, + { + "epoch": 0.0424926394781054, + "grad_norm": 0.3667580187320709, + "learning_rate": 0.0001997401856073207, + "loss": 1.2613, + "step": 20000 + }, + { + "epoch": 0.04251388579784445, + "grad_norm": 0.3793926239013672, + "learning_rate": 0.00019973969472449746, + "loss": 1.2843, + "step": 20010 + }, + { + "epoch": 0.04253513211758351, + "grad_norm": 0.36715003848075867, + "learning_rate": 0.00019973920337898902, + "loss": 1.2258, + "step": 20020 + }, + { + "epoch": 0.04255637843732256, + "grad_norm": 0.48493099212646484, + "learning_rate": 0.00019973871157079765, + "loss": 1.2417, + "step": 20030 + }, + { + "epoch": 0.042577624757061616, + "grad_norm": 0.36171090602874756, + "learning_rate": 0.00019973821929992564, + "loss": 1.2346, + "step": 20040 + }, + { + "epoch": 0.042598871076800666, + "grad_norm": 0.3419795334339142, + "learning_rate": 0.00019973772656637525, + "loss": 1.2647, + "step": 20050 + }, + { + "epoch": 0.04262011739653972, + "grad_norm": 1.0554473400115967, + "learning_rate": 0.00019973723337014882, + "loss": 1.2403, + "step": 20060 + }, + { + "epoch": 0.042641363716278774, + "grad_norm": 0.35295605659484863, + "learning_rate": 0.00019973673971124863, + "loss": 1.2523, + "step": 20070 + }, + { + "epoch": 0.042662610036017824, + "grad_norm": 0.508945643901825, + "learning_rate": 0.00019973624558967693, + "loss": 1.2575, + "step": 20080 + }, + { + "epoch": 0.042683856355756875, + "grad_norm": 0.526848554611206, + "learning_rate": 0.00019973575100543602, + "loss": 1.2845, + "step": 20090 + }, + { + "epoch": 0.04270510267549593, + "grad_norm": 0.5998830199241638, + "learning_rate": 0.0001997352559585282, + "loss": 1.2595, + "step": 20100 + }, + { + "epoch": 0.04272634899523498, + "grad_norm": 0.41524556279182434, + "learning_rate": 0.0001997347604489558, + "loss": 1.2412, + "step": 20110 + }, + { + "epoch": 0.04274759531497403, + "grad_norm": 0.43970417976379395, + "learning_rate": 0.0001997342644767211, + "loss": 1.2426, + "step": 20120 + }, + { + "epoch": 0.04276884163471309, + "grad_norm": 0.3353283107280731, + "learning_rate": 0.00019973376804182635, + "loss": 1.2247, + "step": 20130 + }, + { + "epoch": 0.04279008795445214, + "grad_norm": 0.40513527393341064, + "learning_rate": 0.00019973327114427392, + "loss": 1.2536, + "step": 20140 + }, + { + "epoch": 0.04281133427419119, + "grad_norm": 0.4950891137123108, + "learning_rate": 0.0001997327737840661, + "loss": 1.2634, + "step": 20150 + }, + { + "epoch": 0.04283258059393025, + "grad_norm": 0.33954402804374695, + "learning_rate": 0.00019973227596120516, + "loss": 1.2133, + "step": 20160 + }, + { + "epoch": 0.0428538269136693, + "grad_norm": 0.4814887046813965, + "learning_rate": 0.00019973177767569346, + "loss": 1.2539, + "step": 20170 + }, + { + "epoch": 0.04287507323340835, + "grad_norm": 0.7054257988929749, + "learning_rate": 0.00019973127892753328, + "loss": 1.2661, + "step": 20180 + }, + { + "epoch": 0.042896319553147405, + "grad_norm": 0.469925194978714, + "learning_rate": 0.00019973077971672696, + "loss": 1.2457, + "step": 20190 + }, + { + "epoch": 0.042917565872886455, + "grad_norm": 0.49018537998199463, + "learning_rate": 0.00019973028004327678, + "loss": 1.2736, + "step": 20200 + }, + { + "epoch": 0.042938812192625506, + "grad_norm": 0.3516696095466614, + "learning_rate": 0.00019972977990718508, + "loss": 1.2547, + "step": 20210 + }, + { + "epoch": 0.04296005851236456, + "grad_norm": 0.47473958134651184, + "learning_rate": 0.00019972927930845418, + "loss": 1.2744, + "step": 20220 + }, + { + "epoch": 0.04298130483210361, + "grad_norm": 0.7716000080108643, + "learning_rate": 0.0001997287782470864, + "loss": 1.2694, + "step": 20230 + }, + { + "epoch": 0.04300255115184266, + "grad_norm": 0.4805481433868408, + "learning_rate": 0.00019972827672308405, + "loss": 1.2474, + "step": 20240 + }, + { + "epoch": 0.04302379747158172, + "grad_norm": 0.34756746888160706, + "learning_rate": 0.00019972777473644947, + "loss": 1.2603, + "step": 20250 + }, + { + "epoch": 0.04304504379132077, + "grad_norm": 0.38594821095466614, + "learning_rate": 0.000199727272287185, + "loss": 1.2744, + "step": 20260 + }, + { + "epoch": 0.04306629011105983, + "grad_norm": 0.3395930826663971, + "learning_rate": 0.00019972676937529297, + "loss": 1.2473, + "step": 20270 + }, + { + "epoch": 0.04308753643079888, + "grad_norm": 0.3726336658000946, + "learning_rate": 0.00019972626600077566, + "loss": 1.2043, + "step": 20280 + }, + { + "epoch": 0.04310878275053793, + "grad_norm": 0.3728717267513275, + "learning_rate": 0.00019972576216363547, + "loss": 1.2279, + "step": 20290 + }, + { + "epoch": 0.043130029070276986, + "grad_norm": 0.3484615385532379, + "learning_rate": 0.00019972525786387472, + "loss": 1.297, + "step": 20300 + }, + { + "epoch": 0.043151275390016036, + "grad_norm": 0.4114760756492615, + "learning_rate": 0.00019972475310149577, + "loss": 1.2284, + "step": 20310 + }, + { + "epoch": 0.043172521709755086, + "grad_norm": 0.365800142288208, + "learning_rate": 0.0001997242478765009, + "loss": 1.2712, + "step": 20320 + }, + { + "epoch": 0.043193768029494144, + "grad_norm": 0.39772140979766846, + "learning_rate": 0.0001997237421888925, + "loss": 1.2571, + "step": 20330 + }, + { + "epoch": 0.043215014349233194, + "grad_norm": 0.40073004364967346, + "learning_rate": 0.00019972323603867289, + "loss": 1.2518, + "step": 20340 + }, + { + "epoch": 0.043236260668972244, + "grad_norm": 0.36717137694358826, + "learning_rate": 0.00019972272942584446, + "loss": 1.2712, + "step": 20350 + }, + { + "epoch": 0.0432575069887113, + "grad_norm": 0.3430746793746948, + "learning_rate": 0.00019972222235040948, + "loss": 1.2481, + "step": 20360 + }, + { + "epoch": 0.04327875330845035, + "grad_norm": 0.36272844672203064, + "learning_rate": 0.0001997217148123704, + "loss": 1.222, + "step": 20370 + }, + { + "epoch": 0.0432999996281894, + "grad_norm": 0.5322484374046326, + "learning_rate": 0.00019972120681172953, + "loss": 1.2744, + "step": 20380 + }, + { + "epoch": 0.04332124594792846, + "grad_norm": 0.33286187052726746, + "learning_rate": 0.00019972069834848922, + "loss": 1.2437, + "step": 20390 + }, + { + "epoch": 0.04334249226766751, + "grad_norm": 0.46551135182380676, + "learning_rate": 0.00019972018942265182, + "loss": 1.2883, + "step": 20400 + }, + { + "epoch": 0.04336373858740656, + "grad_norm": 0.5816189050674438, + "learning_rate": 0.00019971968003421972, + "loss": 1.2424, + "step": 20410 + }, + { + "epoch": 0.04338498490714562, + "grad_norm": 0.3562418222427368, + "learning_rate": 0.00019971917018319526, + "loss": 1.2351, + "step": 20420 + }, + { + "epoch": 0.04340623122688467, + "grad_norm": 0.37234944105148315, + "learning_rate": 0.00019971865986958084, + "loss": 1.2685, + "step": 20430 + }, + { + "epoch": 0.04342747754662372, + "grad_norm": 0.33272480964660645, + "learning_rate": 0.00019971814909337877, + "loss": 1.2425, + "step": 20440 + }, + { + "epoch": 0.043448723866362775, + "grad_norm": 0.3552252948284149, + "learning_rate": 0.00019971763785459144, + "loss": 1.2536, + "step": 20450 + }, + { + "epoch": 0.043469970186101825, + "grad_norm": 0.5041614174842834, + "learning_rate": 0.00019971712615322128, + "loss": 1.2639, + "step": 20460 + }, + { + "epoch": 0.04349121650584088, + "grad_norm": 0.4701482355594635, + "learning_rate": 0.00019971661398927054, + "loss": 1.2163, + "step": 20470 + }, + { + "epoch": 0.04351246282557993, + "grad_norm": 0.4148831367492676, + "learning_rate": 0.00019971610136274175, + "loss": 1.2511, + "step": 20480 + }, + { + "epoch": 0.04353370914531898, + "grad_norm": 0.3213077187538147, + "learning_rate": 0.00019971558827363718, + "loss": 1.2605, + "step": 20490 + }, + { + "epoch": 0.04355495546505804, + "grad_norm": 0.35143527388572693, + "learning_rate": 0.00019971507472195925, + "loss": 1.2793, + "step": 20500 + }, + { + "epoch": 0.04357620178479709, + "grad_norm": 0.3906659483909607, + "learning_rate": 0.0001997145607077103, + "loss": 1.272, + "step": 20510 + }, + { + "epoch": 0.04359744810453614, + "grad_norm": 0.3607204556465149, + "learning_rate": 0.0001997140462308928, + "loss": 1.2593, + "step": 20520 + }, + { + "epoch": 0.0436186944242752, + "grad_norm": 0.43919679522514343, + "learning_rate": 0.00019971353129150901, + "loss": 1.2359, + "step": 20530 + }, + { + "epoch": 0.04363994074401425, + "grad_norm": 0.3741935193538666, + "learning_rate": 0.00019971301588956147, + "loss": 1.2478, + "step": 20540 + }, + { + "epoch": 0.0436611870637533, + "grad_norm": 0.633824348449707, + "learning_rate": 0.00019971250002505245, + "loss": 1.2177, + "step": 20550 + }, + { + "epoch": 0.043682433383492356, + "grad_norm": 0.36630311608314514, + "learning_rate": 0.00019971198369798438, + "loss": 1.2137, + "step": 20560 + }, + { + "epoch": 0.043703679703231406, + "grad_norm": 0.34067079424858093, + "learning_rate": 0.0001997114669083597, + "loss": 1.2479, + "step": 20570 + }, + { + "epoch": 0.043724926022970456, + "grad_norm": 0.3511509895324707, + "learning_rate": 0.00019971094965618073, + "loss": 1.2622, + "step": 20580 + }, + { + "epoch": 0.04374617234270951, + "grad_norm": 0.40651780366897583, + "learning_rate": 0.00019971043194144992, + "loss": 1.2881, + "step": 20590 + }, + { + "epoch": 0.043767418662448564, + "grad_norm": 0.3826054036617279, + "learning_rate": 0.00019970991376416967, + "loss": 1.2204, + "step": 20600 + }, + { + "epoch": 0.043788664982187614, + "grad_norm": 0.36403322219848633, + "learning_rate": 0.00019970939512434238, + "loss": 1.2842, + "step": 20610 + }, + { + "epoch": 0.04380991130192667, + "grad_norm": 0.7283288836479187, + "learning_rate": 0.00019970887602197042, + "loss": 1.2188, + "step": 20620 + }, + { + "epoch": 0.04383115762166572, + "grad_norm": 0.39398860931396484, + "learning_rate": 0.00019970835645705622, + "loss": 1.2168, + "step": 20630 + }, + { + "epoch": 0.04385240394140477, + "grad_norm": 0.3662661015987396, + "learning_rate": 0.00019970783642960223, + "loss": 1.2312, + "step": 20640 + }, + { + "epoch": 0.04387365026114383, + "grad_norm": 0.5527542233467102, + "learning_rate": 0.00019970731593961082, + "loss": 1.2244, + "step": 20650 + }, + { + "epoch": 0.04389489658088288, + "grad_norm": 0.5406763553619385, + "learning_rate": 0.00019970679498708442, + "loss": 1.2559, + "step": 20660 + }, + { + "epoch": 0.04391614290062193, + "grad_norm": 0.3392661213874817, + "learning_rate": 0.00019970627357202548, + "loss": 1.2249, + "step": 20670 + }, + { + "epoch": 0.04393738922036099, + "grad_norm": 0.36564958095550537, + "learning_rate": 0.00019970575169443632, + "loss": 1.2559, + "step": 20680 + }, + { + "epoch": 0.04395863554010004, + "grad_norm": 0.3377515971660614, + "learning_rate": 0.00019970522935431944, + "loss": 1.2529, + "step": 20690 + }, + { + "epoch": 0.043979881859839094, + "grad_norm": 0.3355962336063385, + "learning_rate": 0.00019970470655167723, + "loss": 1.246, + "step": 20700 + }, + { + "epoch": 0.044001128179578144, + "grad_norm": 0.363228440284729, + "learning_rate": 0.00019970418328651217, + "loss": 1.226, + "step": 20710 + }, + { + "epoch": 0.044022374499317195, + "grad_norm": 0.3094048798084259, + "learning_rate": 0.0001997036595588266, + "loss": 1.2377, + "step": 20720 + }, + { + "epoch": 0.04404362081905625, + "grad_norm": 0.39640477299690247, + "learning_rate": 0.00019970313536862304, + "loss": 1.2658, + "step": 20730 + }, + { + "epoch": 0.0440648671387953, + "grad_norm": 0.4778551161289215, + "learning_rate": 0.00019970261071590383, + "loss": 1.2124, + "step": 20740 + }, + { + "epoch": 0.04408611345853435, + "grad_norm": 0.34747108817100525, + "learning_rate": 0.0001997020856006715, + "loss": 1.2072, + "step": 20750 + }, + { + "epoch": 0.04410735977827341, + "grad_norm": 0.40410926938056946, + "learning_rate": 0.00019970156002292839, + "loss": 1.2647, + "step": 20760 + }, + { + "epoch": 0.04412860609801246, + "grad_norm": 0.37126144766807556, + "learning_rate": 0.00019970103398267704, + "loss": 1.2281, + "step": 20770 + }, + { + "epoch": 0.04414985241775151, + "grad_norm": 0.37580567598342896, + "learning_rate": 0.0001997005074799198, + "loss": 1.2628, + "step": 20780 + }, + { + "epoch": 0.04417109873749057, + "grad_norm": 0.537624716758728, + "learning_rate": 0.00019969998051465914, + "loss": 1.2697, + "step": 20790 + }, + { + "epoch": 0.04419234505722962, + "grad_norm": 0.42162322998046875, + "learning_rate": 0.00019969945308689755, + "loss": 1.2462, + "step": 20800 + }, + { + "epoch": 0.04421359137696867, + "grad_norm": 0.3497770130634308, + "learning_rate": 0.00019969892519663743, + "loss": 1.2649, + "step": 20810 + }, + { + "epoch": 0.044234837696707725, + "grad_norm": 0.36129438877105713, + "learning_rate": 0.0001996983968438812, + "loss": 1.2583, + "step": 20820 + }, + { + "epoch": 0.044256084016446776, + "grad_norm": 0.42565423250198364, + "learning_rate": 0.0001996978680286314, + "loss": 1.2725, + "step": 20830 + }, + { + "epoch": 0.044277330336185826, + "grad_norm": 0.44324302673339844, + "learning_rate": 0.00019969733875089044, + "loss": 1.2173, + "step": 20840 + }, + { + "epoch": 0.04429857665592488, + "grad_norm": 0.41872259974479675, + "learning_rate": 0.0001996968090106607, + "loss": 1.2552, + "step": 20850 + }, + { + "epoch": 0.04431982297566393, + "grad_norm": 0.4350530207157135, + "learning_rate": 0.00019969627880794474, + "loss": 1.2413, + "step": 20860 + }, + { + "epoch": 0.044341069295402984, + "grad_norm": 0.41359853744506836, + "learning_rate": 0.000199695748142745, + "loss": 1.2667, + "step": 20870 + }, + { + "epoch": 0.04436231561514204, + "grad_norm": 0.3656265437602997, + "learning_rate": 0.00019969521701506394, + "loss": 1.2346, + "step": 20880 + }, + { + "epoch": 0.04438356193488109, + "grad_norm": 0.3730177879333496, + "learning_rate": 0.000199694685424904, + "loss": 1.255, + "step": 20890 + }, + { + "epoch": 0.04440480825462015, + "grad_norm": 0.39516451954841614, + "learning_rate": 0.00019969415337226765, + "loss": 1.2338, + "step": 20900 + }, + { + "epoch": 0.0444260545743592, + "grad_norm": 0.46141064167022705, + "learning_rate": 0.00019969362085715733, + "loss": 1.2683, + "step": 20910 + }, + { + "epoch": 0.04444730089409825, + "grad_norm": 0.36800655722618103, + "learning_rate": 0.00019969308787957558, + "loss": 1.228, + "step": 20920 + }, + { + "epoch": 0.044468547213837306, + "grad_norm": 0.3500145375728607, + "learning_rate": 0.00019969255443952485, + "loss": 1.2352, + "step": 20930 + }, + { + "epoch": 0.044489793533576356, + "grad_norm": 0.3867168128490448, + "learning_rate": 0.0001996920205370076, + "loss": 1.2207, + "step": 20940 + }, + { + "epoch": 0.04451103985331541, + "grad_norm": 0.4619404375553131, + "learning_rate": 0.00019969148617202627, + "loss": 1.28, + "step": 20950 + }, + { + "epoch": 0.044532286173054464, + "grad_norm": 0.4276508092880249, + "learning_rate": 0.00019969095134458339, + "loss": 1.2531, + "step": 20960 + }, + { + "epoch": 0.044553532492793514, + "grad_norm": 0.333510160446167, + "learning_rate": 0.00019969041605468146, + "loss": 1.2122, + "step": 20970 + }, + { + "epoch": 0.044574778812532564, + "grad_norm": 0.36606365442276, + "learning_rate": 0.00019968988030232292, + "loss": 1.2246, + "step": 20980 + }, + { + "epoch": 0.04459602513227162, + "grad_norm": 0.6218535900115967, + "learning_rate": 0.00019968934408751027, + "loss": 1.2558, + "step": 20990 + }, + { + "epoch": 0.04461727145201067, + "grad_norm": 0.33888179063796997, + "learning_rate": 0.00019968880741024598, + "loss": 1.2366, + "step": 21000 + }, + { + "epoch": 0.04463851777174972, + "grad_norm": 0.423692911863327, + "learning_rate": 0.00019968827027053257, + "loss": 1.2705, + "step": 21010 + }, + { + "epoch": 0.04465976409148878, + "grad_norm": 0.35213562846183777, + "learning_rate": 0.00019968773266837252, + "loss": 1.2686, + "step": 21020 + }, + { + "epoch": 0.04468101041122783, + "grad_norm": 0.362846314907074, + "learning_rate": 0.0001996871946037683, + "loss": 1.2689, + "step": 21030 + }, + { + "epoch": 0.04470225673096688, + "grad_norm": 0.7589181065559387, + "learning_rate": 0.00019968665607672243, + "loss": 1.2532, + "step": 21040 + }, + { + "epoch": 0.04472350305070594, + "grad_norm": 0.5982083082199097, + "learning_rate": 0.00019968611708723742, + "loss": 1.2726, + "step": 21050 + }, + { + "epoch": 0.04474474937044499, + "grad_norm": 0.7214556932449341, + "learning_rate": 0.00019968557763531576, + "loss": 1.2449, + "step": 21060 + }, + { + "epoch": 0.04476599569018404, + "grad_norm": 0.4335973858833313, + "learning_rate": 0.0001996850377209599, + "loss": 1.282, + "step": 21070 + }, + { + "epoch": 0.044787242009923095, + "grad_norm": 0.45219162106513977, + "learning_rate": 0.00019968449734417243, + "loss": 1.2252, + "step": 21080 + }, + { + "epoch": 0.044808488329662145, + "grad_norm": 0.37671273946762085, + "learning_rate": 0.0001996839565049558, + "loss": 1.2314, + "step": 21090 + }, + { + "epoch": 0.044829734649401196, + "grad_norm": 0.4314756691455841, + "learning_rate": 0.00019968341520331255, + "loss": 1.2488, + "step": 21100 + }, + { + "epoch": 0.04485098096914025, + "grad_norm": 0.4837862253189087, + "learning_rate": 0.00019968287343924518, + "loss": 1.2505, + "step": 21110 + }, + { + "epoch": 0.0448722272888793, + "grad_norm": 0.3696022629737854, + "learning_rate": 0.0001996823312127562, + "loss": 1.2105, + "step": 21120 + }, + { + "epoch": 0.04489347360861836, + "grad_norm": 0.48314133286476135, + "learning_rate": 0.00019968178852384813, + "loss": 1.2523, + "step": 21130 + }, + { + "epoch": 0.04491471992835741, + "grad_norm": 0.36237186193466187, + "learning_rate": 0.00019968124537252352, + "loss": 1.3102, + "step": 21140 + }, + { + "epoch": 0.04493596624809646, + "grad_norm": 0.3966769278049469, + "learning_rate": 0.0001996807017587848, + "loss": 1.2588, + "step": 21150 + }, + { + "epoch": 0.04495721256783552, + "grad_norm": 0.42840591073036194, + "learning_rate": 0.00019968015768263454, + "loss": 1.2674, + "step": 21160 + }, + { + "epoch": 0.04497845888757457, + "grad_norm": 0.3857710361480713, + "learning_rate": 0.0001996796131440753, + "loss": 1.2574, + "step": 21170 + }, + { + "epoch": 0.04499970520731362, + "grad_norm": 0.5238664150238037, + "learning_rate": 0.0001996790681431096, + "loss": 1.264, + "step": 21180 + }, + { + "epoch": 0.045020951527052676, + "grad_norm": 0.35399654507637024, + "learning_rate": 0.00019967852267973988, + "loss": 1.2519, + "step": 21190 + }, + { + "epoch": 0.045042197846791726, + "grad_norm": 0.6864836812019348, + "learning_rate": 0.00019967797675396877, + "loss": 1.2557, + "step": 21200 + }, + { + "epoch": 0.045063444166530776, + "grad_norm": 0.35755762457847595, + "learning_rate": 0.0001996774303657988, + "loss": 1.2651, + "step": 21210 + }, + { + "epoch": 0.045084690486269834, + "grad_norm": 0.33784976601600647, + "learning_rate": 0.00019967688351523241, + "loss": 1.2723, + "step": 21220 + }, + { + "epoch": 0.045105936806008884, + "grad_norm": 0.3424018323421478, + "learning_rate": 0.00019967633620227222, + "loss": 1.2986, + "step": 21230 + }, + { + "epoch": 0.045127183125747934, + "grad_norm": 0.45706167817115784, + "learning_rate": 0.00019967578842692077, + "loss": 1.2544, + "step": 21240 + }, + { + "epoch": 0.04514842944548699, + "grad_norm": 0.46060630679130554, + "learning_rate": 0.00019967524018918054, + "loss": 1.2508, + "step": 21250 + }, + { + "epoch": 0.04516967576522604, + "grad_norm": 0.3756052553653717, + "learning_rate": 0.00019967469148905417, + "loss": 1.2476, + "step": 21260 + }, + { + "epoch": 0.04519092208496509, + "grad_norm": 0.3305533230304718, + "learning_rate": 0.00019967414232654408, + "loss": 1.2412, + "step": 21270 + }, + { + "epoch": 0.04521216840470415, + "grad_norm": 0.3357706069946289, + "learning_rate": 0.0001996735927016529, + "loss": 1.2458, + "step": 21280 + }, + { + "epoch": 0.0452334147244432, + "grad_norm": 0.5197626352310181, + "learning_rate": 0.00019967304261438322, + "loss": 1.2564, + "step": 21290 + }, + { + "epoch": 0.04525466104418225, + "grad_norm": 0.37495559453964233, + "learning_rate": 0.00019967249206473748, + "loss": 1.2364, + "step": 21300 + }, + { + "epoch": 0.04527590736392131, + "grad_norm": 0.35274094343185425, + "learning_rate": 0.00019967194105271831, + "loss": 1.2633, + "step": 21310 + }, + { + "epoch": 0.04529715368366036, + "grad_norm": 0.386566698551178, + "learning_rate": 0.00019967138957832826, + "loss": 1.2246, + "step": 21320 + }, + { + "epoch": 0.045318400003399414, + "grad_norm": 0.33018985390663147, + "learning_rate": 0.00019967083764156984, + "loss": 1.2279, + "step": 21330 + }, + { + "epoch": 0.045339646323138465, + "grad_norm": 0.4064560532569885, + "learning_rate": 0.00019967028524244568, + "loss": 1.2437, + "step": 21340 + }, + { + "epoch": 0.045360892642877515, + "grad_norm": 0.425586998462677, + "learning_rate": 0.00019966973238095826, + "loss": 1.2344, + "step": 21350 + }, + { + "epoch": 0.04538213896261657, + "grad_norm": 0.38618147373199463, + "learning_rate": 0.00019966917905711023, + "loss": 1.2601, + "step": 21360 + }, + { + "epoch": 0.04540338528235562, + "grad_norm": 0.42882558703422546, + "learning_rate": 0.00019966862527090413, + "loss": 1.2214, + "step": 21370 + }, + { + "epoch": 0.04542463160209467, + "grad_norm": 0.48444730043411255, + "learning_rate": 0.0001996680710223425, + "loss": 1.2469, + "step": 21380 + }, + { + "epoch": 0.04544587792183373, + "grad_norm": 0.47585585713386536, + "learning_rate": 0.00019966751631142794, + "loss": 1.2878, + "step": 21390 + }, + { + "epoch": 0.04546712424157278, + "grad_norm": 0.4424819350242615, + "learning_rate": 0.00019966696113816296, + "loss": 1.2556, + "step": 21400 + }, + { + "epoch": 0.04548837056131183, + "grad_norm": 0.44315749406814575, + "learning_rate": 0.00019966640550255025, + "loss": 1.2017, + "step": 21410 + }, + { + "epoch": 0.04550961688105089, + "grad_norm": 0.3474249541759491, + "learning_rate": 0.0001996658494045923, + "loss": 1.2697, + "step": 21420 + }, + { + "epoch": 0.04553086320078994, + "grad_norm": 0.8479357957839966, + "learning_rate": 0.00019966529284429175, + "loss": 1.251, + "step": 21430 + }, + { + "epoch": 0.04555210952052899, + "grad_norm": 0.34707722067832947, + "learning_rate": 0.00019966473582165112, + "loss": 1.2426, + "step": 21440 + }, + { + "epoch": 0.045573355840268046, + "grad_norm": 0.5356694459915161, + "learning_rate": 0.000199664178336673, + "loss": 1.2456, + "step": 21450 + }, + { + "epoch": 0.045594602160007096, + "grad_norm": 0.5023552775382996, + "learning_rate": 0.00019966362038936, + "loss": 1.2953, + "step": 21460 + }, + { + "epoch": 0.045615848479746146, + "grad_norm": 0.5707691311836243, + "learning_rate": 0.00019966306197971473, + "loss": 1.2583, + "step": 21470 + }, + { + "epoch": 0.0456370947994852, + "grad_norm": 0.35721105337142944, + "learning_rate": 0.00019966250310773974, + "loss": 1.2585, + "step": 21480 + }, + { + "epoch": 0.045658341119224254, + "grad_norm": 0.45341411232948303, + "learning_rate": 0.00019966194377343767, + "loss": 1.2656, + "step": 21490 + }, + { + "epoch": 0.045679587438963304, + "grad_norm": 0.42969051003456116, + "learning_rate": 0.00019966138397681102, + "loss": 1.2619, + "step": 21500 + }, + { + "epoch": 0.04570083375870236, + "grad_norm": 0.419837087392807, + "learning_rate": 0.0001996608237178625, + "loss": 1.2422, + "step": 21510 + }, + { + "epoch": 0.04572208007844141, + "grad_norm": 0.5065694451332092, + "learning_rate": 0.00019966026299659463, + "loss": 1.2528, + "step": 21520 + }, + { + "epoch": 0.04574332639818047, + "grad_norm": 0.47699832916259766, + "learning_rate": 0.00019965970181301008, + "loss": 1.2398, + "step": 21530 + }, + { + "epoch": 0.04576457271791952, + "grad_norm": 0.35722246766090393, + "learning_rate": 0.00019965914016711138, + "loss": 1.2768, + "step": 21540 + }, + { + "epoch": 0.04578581903765857, + "grad_norm": 0.3649885952472687, + "learning_rate": 0.0001996585780589012, + "loss": 1.2433, + "step": 21550 + }, + { + "epoch": 0.045807065357397626, + "grad_norm": 0.5551848411560059, + "learning_rate": 0.00019965801548838208, + "loss": 1.2562, + "step": 21560 + }, + { + "epoch": 0.04582831167713668, + "grad_norm": 0.613018810749054, + "learning_rate": 0.0001996574524555567, + "loss": 1.2602, + "step": 21570 + }, + { + "epoch": 0.04584955799687573, + "grad_norm": 0.5566929578781128, + "learning_rate": 0.0001996568889604276, + "loss": 1.2564, + "step": 21580 + }, + { + "epoch": 0.045870804316614784, + "grad_norm": 1.1840497255325317, + "learning_rate": 0.00019965632500299746, + "loss": 1.2346, + "step": 21590 + }, + { + "epoch": 0.045892050636353834, + "grad_norm": 0.42590731382369995, + "learning_rate": 0.00019965576058326884, + "loss": 1.2716, + "step": 21600 + }, + { + "epoch": 0.045913296956092885, + "grad_norm": 0.4117070138454437, + "learning_rate": 0.00019965519570124443, + "loss": 1.2647, + "step": 21610 + }, + { + "epoch": 0.04593454327583194, + "grad_norm": 0.32849839329719543, + "learning_rate": 0.00019965463035692674, + "loss": 1.2094, + "step": 21620 + }, + { + "epoch": 0.04595578959557099, + "grad_norm": 0.35415932536125183, + "learning_rate": 0.00019965406455031852, + "loss": 1.2397, + "step": 21630 + }, + { + "epoch": 0.04597703591531004, + "grad_norm": 0.4431362748146057, + "learning_rate": 0.00019965349828142233, + "loss": 1.2404, + "step": 21640 + }, + { + "epoch": 0.0459982822350491, + "grad_norm": 0.4458058476448059, + "learning_rate": 0.0001996529315502408, + "loss": 1.2689, + "step": 21650 + }, + { + "epoch": 0.04601952855478815, + "grad_norm": 0.37106889486312866, + "learning_rate": 0.00019965236435677653, + "loss": 1.2172, + "step": 21660 + }, + { + "epoch": 0.0460407748745272, + "grad_norm": 0.41047996282577515, + "learning_rate": 0.0001996517967010322, + "loss": 1.2103, + "step": 21670 + }, + { + "epoch": 0.04606202119426626, + "grad_norm": 0.33031928539276123, + "learning_rate": 0.00019965122858301042, + "loss": 1.2726, + "step": 21680 + }, + { + "epoch": 0.04608326751400531, + "grad_norm": 0.3951018750667572, + "learning_rate": 0.00019965066000271383, + "loss": 1.2449, + "step": 21690 + }, + { + "epoch": 0.04610451383374436, + "grad_norm": 0.3562219440937042, + "learning_rate": 0.0001996500909601451, + "loss": 1.2191, + "step": 21700 + }, + { + "epoch": 0.046125760153483415, + "grad_norm": 0.3855057656764984, + "learning_rate": 0.00019964952145530676, + "loss": 1.2354, + "step": 21710 + }, + { + "epoch": 0.046147006473222466, + "grad_norm": 0.36943408846855164, + "learning_rate": 0.0001996489514882016, + "loss": 1.2269, + "step": 21720 + }, + { + "epoch": 0.046168252792961516, + "grad_norm": 0.35172414779663086, + "learning_rate": 0.00019964838105883216, + "loss": 1.2278, + "step": 21730 + }, + { + "epoch": 0.04618949911270057, + "grad_norm": 0.4846532344818115, + "learning_rate": 0.00019964781016720113, + "loss": 1.2456, + "step": 21740 + }, + { + "epoch": 0.04621074543243962, + "grad_norm": 0.3912853002548218, + "learning_rate": 0.00019964723881331115, + "loss": 1.2694, + "step": 21750 + }, + { + "epoch": 0.04623199175217868, + "grad_norm": 0.33491864800453186, + "learning_rate": 0.00019964666699716488, + "loss": 1.2366, + "step": 21760 + }, + { + "epoch": 0.04625323807191773, + "grad_norm": 0.4273863732814789, + "learning_rate": 0.00019964609471876491, + "loss": 1.2728, + "step": 21770 + }, + { + "epoch": 0.04627448439165678, + "grad_norm": 0.4521116316318512, + "learning_rate": 0.000199645521978114, + "loss": 1.2784, + "step": 21780 + }, + { + "epoch": 0.04629573071139584, + "grad_norm": 0.6267074942588806, + "learning_rate": 0.00019964494877521474, + "loss": 1.2858, + "step": 21790 + }, + { + "epoch": 0.04631697703113489, + "grad_norm": 0.41620320081710815, + "learning_rate": 0.00019964437511006978, + "loss": 1.2738, + "step": 21800 + }, + { + "epoch": 0.04633822335087394, + "grad_norm": 0.3783860206604004, + "learning_rate": 0.00019964380098268185, + "loss": 1.2322, + "step": 21810 + }, + { + "epoch": 0.046359469670612996, + "grad_norm": 0.37712833285331726, + "learning_rate": 0.00019964322639305353, + "loss": 1.2653, + "step": 21820 + }, + { + "epoch": 0.046380715990352046, + "grad_norm": 0.33098626136779785, + "learning_rate": 0.0001996426513411875, + "loss": 1.2388, + "step": 21830 + }, + { + "epoch": 0.0464019623100911, + "grad_norm": 0.612244188785553, + "learning_rate": 0.00019964207582708648, + "loss": 1.2735, + "step": 21840 + }, + { + "epoch": 0.046423208629830154, + "grad_norm": 0.34554314613342285, + "learning_rate": 0.00019964149985075312, + "loss": 1.2197, + "step": 21850 + }, + { + "epoch": 0.046444454949569204, + "grad_norm": 0.3702610433101654, + "learning_rate": 0.00019964092341219006, + "loss": 1.223, + "step": 21860 + }, + { + "epoch": 0.046465701269308254, + "grad_norm": 0.5400881767272949, + "learning_rate": 0.0001996403465114, + "loss": 1.2903, + "step": 21870 + }, + { + "epoch": 0.04648694758904731, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0001996397691483856, + "loss": 1.2657, + "step": 21880 + }, + { + "epoch": 0.04650819390878636, + "grad_norm": 0.43281131982803345, + "learning_rate": 0.0001996391913231496, + "loss": 1.2467, + "step": 21890 + }, + { + "epoch": 0.04652944022852541, + "grad_norm": 0.38454434275627136, + "learning_rate": 0.00019963861303569458, + "loss": 1.2731, + "step": 21900 + }, + { + "epoch": 0.04655068654826447, + "grad_norm": 0.7761757373809814, + "learning_rate": 0.00019963803428602327, + "loss": 1.2351, + "step": 21910 + }, + { + "epoch": 0.04657193286800352, + "grad_norm": 0.38878676295280457, + "learning_rate": 0.00019963745507413838, + "loss": 1.2707, + "step": 21920 + }, + { + "epoch": 0.04659317918774257, + "grad_norm": 0.35926806926727295, + "learning_rate": 0.00019963687540004257, + "loss": 1.2432, + "step": 21930 + }, + { + "epoch": 0.04661442550748163, + "grad_norm": 0.41324061155319214, + "learning_rate": 0.00019963629526373853, + "loss": 1.2374, + "step": 21940 + }, + { + "epoch": 0.04663567182722068, + "grad_norm": 0.3966316878795624, + "learning_rate": 0.00019963571466522894, + "loss": 1.2436, + "step": 21950 + }, + { + "epoch": 0.046656918146959735, + "grad_norm": 0.33440858125686646, + "learning_rate": 0.00019963513360451654, + "loss": 1.2642, + "step": 21960 + }, + { + "epoch": 0.046678164466698785, + "grad_norm": 0.4248179495334625, + "learning_rate": 0.00019963455208160395, + "loss": 1.224, + "step": 21970 + }, + { + "epoch": 0.046699410786437835, + "grad_norm": 0.3868182599544525, + "learning_rate": 0.00019963397009649394, + "loss": 1.2711, + "step": 21980 + }, + { + "epoch": 0.04672065710617689, + "grad_norm": 0.4034278690814972, + "learning_rate": 0.0001996333876491892, + "loss": 1.288, + "step": 21990 + }, + { + "epoch": 0.04674190342591594, + "grad_norm": 0.5117072463035583, + "learning_rate": 0.00019963280473969238, + "loss": 1.2344, + "step": 22000 + }, + { + "epoch": 0.04676314974565499, + "grad_norm": 0.43639034032821655, + "learning_rate": 0.00019963222136800624, + "loss": 1.2476, + "step": 22010 + }, + { + "epoch": 0.04678439606539405, + "grad_norm": 0.5207198858261108, + "learning_rate": 0.00019963163753413342, + "loss": 1.2636, + "step": 22020 + }, + { + "epoch": 0.0468056423851331, + "grad_norm": 0.3389742970466614, + "learning_rate": 0.0001996310532380767, + "loss": 1.3054, + "step": 22030 + }, + { + "epoch": 0.04682688870487215, + "grad_norm": 0.3654020130634308, + "learning_rate": 0.00019963046847983878, + "loss": 1.2291, + "step": 22040 + }, + { + "epoch": 0.04684813502461121, + "grad_norm": 0.3415839374065399, + "learning_rate": 0.00019962988325942233, + "loss": 1.2267, + "step": 22050 + }, + { + "epoch": 0.04686938134435026, + "grad_norm": 0.5576010942459106, + "learning_rate": 0.00019962929757683012, + "loss": 1.2361, + "step": 22060 + }, + { + "epoch": 0.04689062766408931, + "grad_norm": 0.3978949189186096, + "learning_rate": 0.0001996287114320648, + "loss": 1.2648, + "step": 22070 + }, + { + "epoch": 0.046911873983828366, + "grad_norm": 0.3488685190677643, + "learning_rate": 0.00019962812482512914, + "loss": 1.222, + "step": 22080 + }, + { + "epoch": 0.046933120303567416, + "grad_norm": 0.35941410064697266, + "learning_rate": 0.00019962753775602584, + "loss": 1.2711, + "step": 22090 + }, + { + "epoch": 0.046954366623306466, + "grad_norm": 0.3859129250049591, + "learning_rate": 0.00019962695022475763, + "loss": 1.2539, + "step": 22100 + }, + { + "epoch": 0.046975612943045524, + "grad_norm": 0.7472792267799377, + "learning_rate": 0.00019962636223132724, + "loss": 1.24, + "step": 22110 + }, + { + "epoch": 0.046996859262784574, + "grad_norm": 0.43949612975120544, + "learning_rate": 0.0001996257737757374, + "loss": 1.2059, + "step": 22120 + }, + { + "epoch": 0.047018105582523624, + "grad_norm": 0.37462249398231506, + "learning_rate": 0.00019962518485799085, + "loss": 1.2411, + "step": 22130 + }, + { + "epoch": 0.04703935190226268, + "grad_norm": 0.35916757583618164, + "learning_rate": 0.00019962459547809028, + "loss": 1.262, + "step": 22140 + }, + { + "epoch": 0.04706059822200173, + "grad_norm": 0.6368008255958557, + "learning_rate": 0.00019962400563603845, + "loss": 1.2089, + "step": 22150 + }, + { + "epoch": 0.04708184454174078, + "grad_norm": 0.34676557779312134, + "learning_rate": 0.0001996234153318381, + "loss": 1.2387, + "step": 22160 + }, + { + "epoch": 0.04710309086147984, + "grad_norm": 0.43077003955841064, + "learning_rate": 0.00019962282456549194, + "loss": 1.2625, + "step": 22170 + }, + { + "epoch": 0.04712433718121889, + "grad_norm": 0.46706098318099976, + "learning_rate": 0.00019962223333700276, + "loss": 1.2308, + "step": 22180 + }, + { + "epoch": 0.04714558350095795, + "grad_norm": 0.5284795165061951, + "learning_rate": 0.0001996216416463733, + "loss": 1.2467, + "step": 22190 + }, + { + "epoch": 0.047166829820697, + "grad_norm": 0.378838449716568, + "learning_rate": 0.00019962104949360624, + "loss": 1.2729, + "step": 22200 + }, + { + "epoch": 0.04718807614043605, + "grad_norm": 0.35978713631629944, + "learning_rate": 0.0001996204568787044, + "loss": 1.2403, + "step": 22210 + }, + { + "epoch": 0.047209322460175104, + "grad_norm": 0.3891187608242035, + "learning_rate": 0.00019961986380167047, + "loss": 1.2662, + "step": 22220 + }, + { + "epoch": 0.047230568779914155, + "grad_norm": 0.35716861486434937, + "learning_rate": 0.00019961927026250723, + "loss": 1.2198, + "step": 22230 + }, + { + "epoch": 0.047251815099653205, + "grad_norm": 0.4227816164493561, + "learning_rate": 0.00019961867626121745, + "loss": 1.2241, + "step": 22240 + }, + { + "epoch": 0.04727306141939226, + "grad_norm": 0.35429054498672485, + "learning_rate": 0.00019961808179780387, + "loss": 1.2109, + "step": 22250 + }, + { + "epoch": 0.04729430773913131, + "grad_norm": 0.4444109797477722, + "learning_rate": 0.0001996174868722692, + "loss": 1.2356, + "step": 22260 + }, + { + "epoch": 0.04731555405887036, + "grad_norm": 0.6328089237213135, + "learning_rate": 0.00019961689148461628, + "loss": 1.2224, + "step": 22270 + }, + { + "epoch": 0.04733680037860942, + "grad_norm": 0.42832785844802856, + "learning_rate": 0.00019961629563484785, + "loss": 1.2734, + "step": 22280 + }, + { + "epoch": 0.04735804669834847, + "grad_norm": 0.43480825424194336, + "learning_rate": 0.00019961569932296663, + "loss": 1.2265, + "step": 22290 + }, + { + "epoch": 0.04737929301808752, + "grad_norm": 0.3964134454727173, + "learning_rate": 0.00019961510254897546, + "loss": 1.2375, + "step": 22300 + }, + { + "epoch": 0.04740053933782658, + "grad_norm": 0.37150365114212036, + "learning_rate": 0.00019961450531287702, + "loss": 1.2841, + "step": 22310 + }, + { + "epoch": 0.04742178565756563, + "grad_norm": 0.4246037006378174, + "learning_rate": 0.0001996139076146742, + "loss": 1.2629, + "step": 22320 + }, + { + "epoch": 0.04744303197730468, + "grad_norm": 0.40119388699531555, + "learning_rate": 0.00019961330945436961, + "loss": 1.2437, + "step": 22330 + }, + { + "epoch": 0.047464278297043735, + "grad_norm": 0.7269831299781799, + "learning_rate": 0.00019961271083196615, + "loss": 1.2491, + "step": 22340 + }, + { + "epoch": 0.047485524616782786, + "grad_norm": 0.34306231141090393, + "learning_rate": 0.00019961211174746656, + "loss": 1.2532, + "step": 22350 + }, + { + "epoch": 0.047506770936521836, + "grad_norm": 0.478779137134552, + "learning_rate": 0.00019961151220087363, + "loss": 1.2396, + "step": 22360 + }, + { + "epoch": 0.04752801725626089, + "grad_norm": 0.6340217590332031, + "learning_rate": 0.00019961091219219013, + "loss": 1.2307, + "step": 22370 + }, + { + "epoch": 0.047549263575999944, + "grad_norm": 0.35187169909477234, + "learning_rate": 0.00019961031172141885, + "loss": 1.2547, + "step": 22380 + }, + { + "epoch": 0.047570509895739, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.00019960971078856256, + "loss": 1.2377, + "step": 22390 + }, + { + "epoch": 0.04759175621547805, + "grad_norm": 0.39059868454933167, + "learning_rate": 0.00019960910939362405, + "loss": 1.2522, + "step": 22400 + }, + { + "epoch": 0.0476130025352171, + "grad_norm": 0.3590787947177887, + "learning_rate": 0.00019960850753660613, + "loss": 1.2882, + "step": 22410 + }, + { + "epoch": 0.04763424885495616, + "grad_norm": 0.37173113226890564, + "learning_rate": 0.00019960790521751157, + "loss": 1.2631, + "step": 22420 + }, + { + "epoch": 0.04765549517469521, + "grad_norm": 0.3569691479206085, + "learning_rate": 0.00019960730243634317, + "loss": 1.2365, + "step": 22430 + }, + { + "epoch": 0.04767674149443426, + "grad_norm": 0.5750043392181396, + "learning_rate": 0.00019960669919310376, + "loss": 1.2403, + "step": 22440 + }, + { + "epoch": 0.047697987814173316, + "grad_norm": 0.5588710308074951, + "learning_rate": 0.00019960609548779607, + "loss": 1.2603, + "step": 22450 + }, + { + "epoch": 0.04771923413391237, + "grad_norm": 0.43985527753829956, + "learning_rate": 0.00019960549132042296, + "loss": 1.2328, + "step": 22460 + }, + { + "epoch": 0.04774048045365142, + "grad_norm": 0.38184890151023865, + "learning_rate": 0.00019960488669098722, + "loss": 1.2677, + "step": 22470 + }, + { + "epoch": 0.047761726773390474, + "grad_norm": 0.42545878887176514, + "learning_rate": 0.00019960428159949162, + "loss": 1.2432, + "step": 22480 + }, + { + "epoch": 0.047782973093129524, + "grad_norm": 0.4139634370803833, + "learning_rate": 0.00019960367604593903, + "loss": 1.2047, + "step": 22490 + }, + { + "epoch": 0.047804219412868575, + "grad_norm": 0.3641108274459839, + "learning_rate": 0.0001996030700303322, + "loss": 1.2587, + "step": 22500 + }, + { + "epoch": 0.04782546573260763, + "grad_norm": 0.3417103588581085, + "learning_rate": 0.00019960246355267395, + "loss": 1.236, + "step": 22510 + }, + { + "epoch": 0.04784671205234668, + "grad_norm": 0.3616732656955719, + "learning_rate": 0.00019960185661296715, + "loss": 1.2551, + "step": 22520 + }, + { + "epoch": 0.04786795837208573, + "grad_norm": 0.39364394545555115, + "learning_rate": 0.00019960124921121453, + "loss": 1.2177, + "step": 22530 + }, + { + "epoch": 0.04788920469182479, + "grad_norm": 0.4260375201702118, + "learning_rate": 0.000199600641347419, + "loss": 1.2674, + "step": 22540 + }, + { + "epoch": 0.04791045101156384, + "grad_norm": 0.35247477889060974, + "learning_rate": 0.00019960003302158327, + "loss": 1.2422, + "step": 22550 + }, + { + "epoch": 0.04793169733130289, + "grad_norm": 0.4322347939014435, + "learning_rate": 0.00019959942423371027, + "loss": 1.2372, + "step": 22560 + }, + { + "epoch": 0.04795294365104195, + "grad_norm": 0.43763044476509094, + "learning_rate": 0.00019959881498380274, + "loss": 1.2776, + "step": 22570 + }, + { + "epoch": 0.047974189970781, + "grad_norm": 0.431805819272995, + "learning_rate": 0.00019959820527186358, + "loss": 1.2318, + "step": 22580 + }, + { + "epoch": 0.047995436290520055, + "grad_norm": 0.41809841990470886, + "learning_rate": 0.00019959759509789556, + "loss": 1.2377, + "step": 22590 + }, + { + "epoch": 0.048016682610259105, + "grad_norm": 0.38181373476982117, + "learning_rate": 0.00019959698446190153, + "loss": 1.2773, + "step": 22600 + }, + { + "epoch": 0.048037928929998155, + "grad_norm": 0.32855144143104553, + "learning_rate": 0.00019959637336388432, + "loss": 1.2047, + "step": 22610 + }, + { + "epoch": 0.04805917524973721, + "grad_norm": 0.4607836902141571, + "learning_rate": 0.0001995957618038468, + "loss": 1.2047, + "step": 22620 + }, + { + "epoch": 0.04808042156947626, + "grad_norm": 0.39545589685440063, + "learning_rate": 0.00019959514978179174, + "loss": 1.2275, + "step": 22630 + }, + { + "epoch": 0.04810166788921531, + "grad_norm": 0.3439519703388214, + "learning_rate": 0.000199594537297722, + "loss": 1.2568, + "step": 22640 + }, + { + "epoch": 0.04812291420895437, + "grad_norm": 0.5318855047225952, + "learning_rate": 0.00019959392435164048, + "loss": 1.2381, + "step": 22650 + }, + { + "epoch": 0.04814416052869342, + "grad_norm": 0.421092689037323, + "learning_rate": 0.00019959331094354999, + "loss": 1.2204, + "step": 22660 + }, + { + "epoch": 0.04816540684843247, + "grad_norm": 0.48316046595573425, + "learning_rate": 0.00019959269707345333, + "loss": 1.2709, + "step": 22670 + }, + { + "epoch": 0.04818665316817153, + "grad_norm": 0.3452741205692291, + "learning_rate": 0.0001995920827413534, + "loss": 1.2008, + "step": 22680 + }, + { + "epoch": 0.04820789948791058, + "grad_norm": 0.5222562551498413, + "learning_rate": 0.00019959146794725304, + "loss": 1.2103, + "step": 22690 + }, + { + "epoch": 0.04822914580764963, + "grad_norm": 0.5397101044654846, + "learning_rate": 0.00019959085269115508, + "loss": 1.2219, + "step": 22700 + }, + { + "epoch": 0.048250392127388686, + "grad_norm": 0.3587370812892914, + "learning_rate": 0.0001995902369730624, + "loss": 1.2131, + "step": 22710 + }, + { + "epoch": 0.048271638447127736, + "grad_norm": 0.5987287163734436, + "learning_rate": 0.00019958962079297783, + "loss": 1.2435, + "step": 22720 + }, + { + "epoch": 0.04829288476686679, + "grad_norm": 0.39692726731300354, + "learning_rate": 0.00019958900415090427, + "loss": 1.2898, + "step": 22730 + }, + { + "epoch": 0.048314131086605844, + "grad_norm": 0.372063547372818, + "learning_rate": 0.00019958838704684453, + "loss": 1.2577, + "step": 22740 + }, + { + "epoch": 0.048335377406344894, + "grad_norm": 0.3504478931427002, + "learning_rate": 0.0001995877694808015, + "loss": 1.2571, + "step": 22750 + }, + { + "epoch": 0.048356623726083944, + "grad_norm": 0.38808774948120117, + "learning_rate": 0.00019958715145277804, + "loss": 1.2117, + "step": 22760 + }, + { + "epoch": 0.048377870045823, + "grad_norm": 0.4064924120903015, + "learning_rate": 0.00019958653296277702, + "loss": 1.2646, + "step": 22770 + }, + { + "epoch": 0.04839911636556205, + "grad_norm": 0.34220582246780396, + "learning_rate": 0.00019958591401080132, + "loss": 1.2479, + "step": 22780 + }, + { + "epoch": 0.0484203626853011, + "grad_norm": 0.39219987392425537, + "learning_rate": 0.0001995852945968538, + "loss": 1.2093, + "step": 22790 + }, + { + "epoch": 0.04844160900504016, + "grad_norm": 0.44233933091163635, + "learning_rate": 0.00019958467472093734, + "loss": 1.2298, + "step": 22800 + }, + { + "epoch": 0.04846285532477921, + "grad_norm": 0.5458942651748657, + "learning_rate": 0.00019958405438305477, + "loss": 1.24, + "step": 22810 + }, + { + "epoch": 0.04848410164451827, + "grad_norm": 0.34549105167388916, + "learning_rate": 0.00019958343358320905, + "loss": 1.2657, + "step": 22820 + }, + { + "epoch": 0.04850534796425732, + "grad_norm": 0.4754931926727295, + "learning_rate": 0.000199582812321403, + "loss": 1.1941, + "step": 22830 + }, + { + "epoch": 0.04852659428399637, + "grad_norm": 0.3770964443683624, + "learning_rate": 0.00019958219059763952, + "loss": 1.2548, + "step": 22840 + }, + { + "epoch": 0.048547840603735425, + "grad_norm": 0.8188684582710266, + "learning_rate": 0.0001995815684119215, + "loss": 1.2438, + "step": 22850 + }, + { + "epoch": 0.048569086923474475, + "grad_norm": 0.34790995717048645, + "learning_rate": 0.00019958094576425178, + "loss": 1.2445, + "step": 22860 + }, + { + "epoch": 0.048590333243213525, + "grad_norm": 0.565233051776886, + "learning_rate": 0.00019958032265463333, + "loss": 1.2752, + "step": 22870 + }, + { + "epoch": 0.04861157956295258, + "grad_norm": 0.45770639181137085, + "learning_rate": 0.000199579699083069, + "loss": 1.2926, + "step": 22880 + }, + { + "epoch": 0.04863282588269163, + "grad_norm": 0.45462334156036377, + "learning_rate": 0.00019957907504956164, + "loss": 1.2465, + "step": 22890 + }, + { + "epoch": 0.04865407220243068, + "grad_norm": 0.4814175069332123, + "learning_rate": 0.0001995784505541142, + "loss": 1.2495, + "step": 22900 + }, + { + "epoch": 0.04867531852216974, + "grad_norm": 0.4117863178253174, + "learning_rate": 0.0001995778255967296, + "loss": 1.2646, + "step": 22910 + }, + { + "epoch": 0.04869656484190879, + "grad_norm": 0.6138426065444946, + "learning_rate": 0.00019957720017741063, + "loss": 1.2542, + "step": 22920 + }, + { + "epoch": 0.04871781116164784, + "grad_norm": 0.525894820690155, + "learning_rate": 0.00019957657429616032, + "loss": 1.2241, + "step": 22930 + }, + { + "epoch": 0.0487390574813869, + "grad_norm": 0.3925345540046692, + "learning_rate": 0.0001995759479529815, + "loss": 1.244, + "step": 22940 + }, + { + "epoch": 0.04876030380112595, + "grad_norm": 0.4517866373062134, + "learning_rate": 0.0001995753211478771, + "loss": 1.2495, + "step": 22950 + }, + { + "epoch": 0.048781550120865, + "grad_norm": 0.37660545110702515, + "learning_rate": 0.00019957469388085, + "loss": 1.2447, + "step": 22960 + }, + { + "epoch": 0.048802796440604056, + "grad_norm": 0.4087595045566559, + "learning_rate": 0.00019957406615190313, + "loss": 1.2541, + "step": 22970 + }, + { + "epoch": 0.048824042760343106, + "grad_norm": 0.4619152545928955, + "learning_rate": 0.00019957343796103943, + "loss": 1.2623, + "step": 22980 + }, + { + "epoch": 0.048845289080082156, + "grad_norm": 0.42591172456741333, + "learning_rate": 0.00019957280930826176, + "loss": 1.2631, + "step": 22990 + }, + { + "epoch": 0.048866535399821213, + "grad_norm": 0.4318392276763916, + "learning_rate": 0.00019957218019357305, + "loss": 1.2275, + "step": 23000 + }, + { + "epoch": 0.048887781719560264, + "grad_norm": 0.4693271517753601, + "learning_rate": 0.00019957155061697625, + "loss": 1.2729, + "step": 23010 + }, + { + "epoch": 0.04890902803929932, + "grad_norm": 0.4164368808269501, + "learning_rate": 0.00019957092057847425, + "loss": 1.2567, + "step": 23020 + }, + { + "epoch": 0.04893027435903837, + "grad_norm": 0.39400720596313477, + "learning_rate": 0.00019957029007807, + "loss": 1.2662, + "step": 23030 + }, + { + "epoch": 0.04895152067877742, + "grad_norm": 0.4605836272239685, + "learning_rate": 0.0001995696591157664, + "loss": 1.2716, + "step": 23040 + }, + { + "epoch": 0.04897276699851648, + "grad_norm": 0.33529627323150635, + "learning_rate": 0.00019956902769156638, + "loss": 1.2292, + "step": 23050 + }, + { + "epoch": 0.04899401331825553, + "grad_norm": 0.38943567872047424, + "learning_rate": 0.00019956839580547285, + "loss": 1.238, + "step": 23060 + }, + { + "epoch": 0.04901525963799458, + "grad_norm": 0.3839457035064697, + "learning_rate": 0.00019956776345748882, + "loss": 1.2815, + "step": 23070 + }, + { + "epoch": 0.049036505957733637, + "grad_norm": 0.615885317325592, + "learning_rate": 0.00019956713064761715, + "loss": 1.2691, + "step": 23080 + }, + { + "epoch": 0.04905775227747269, + "grad_norm": 0.36337748169898987, + "learning_rate": 0.00019956649737586077, + "loss": 1.2431, + "step": 23090 + }, + { + "epoch": 0.04907899859721174, + "grad_norm": 0.32411807775497437, + "learning_rate": 0.00019956586364222264, + "loss": 1.224, + "step": 23100 + }, + { + "epoch": 0.049100244916950794, + "grad_norm": 0.3737120032310486, + "learning_rate": 0.00019956522944670574, + "loss": 1.2487, + "step": 23110 + }, + { + "epoch": 0.049121491236689845, + "grad_norm": 0.343973845243454, + "learning_rate": 0.00019956459478931296, + "loss": 1.2272, + "step": 23120 + }, + { + "epoch": 0.049142737556428895, + "grad_norm": 0.3533420264720917, + "learning_rate": 0.00019956395967004725, + "loss": 1.276, + "step": 23130 + }, + { + "epoch": 0.04916398387616795, + "grad_norm": 0.3732348680496216, + "learning_rate": 0.00019956332408891154, + "loss": 1.2414, + "step": 23140 + }, + { + "epoch": 0.049185230195907, + "grad_norm": 0.43949219584465027, + "learning_rate": 0.00019956268804590882, + "loss": 1.2517, + "step": 23150 + }, + { + "epoch": 0.04920647651564605, + "grad_norm": 0.3743427097797394, + "learning_rate": 0.00019956205154104205, + "loss": 1.2291, + "step": 23160 + }, + { + "epoch": 0.04922772283538511, + "grad_norm": 0.3212090730667114, + "learning_rate": 0.00019956141457431412, + "loss": 1.2458, + "step": 23170 + }, + { + "epoch": 0.04924896915512416, + "grad_norm": 0.3471202254295349, + "learning_rate": 0.00019956077714572805, + "loss": 1.2361, + "step": 23180 + }, + { + "epoch": 0.04927021547486321, + "grad_norm": 0.32695192098617554, + "learning_rate": 0.00019956013925528674, + "loss": 1.2142, + "step": 23190 + }, + { + "epoch": 0.04929146179460227, + "grad_norm": 0.36456286907196045, + "learning_rate": 0.0001995595009029932, + "loss": 1.2359, + "step": 23200 + }, + { + "epoch": 0.04931270811434132, + "grad_norm": 0.3719993233680725, + "learning_rate": 0.00019955886208885036, + "loss": 1.2332, + "step": 23210 + }, + { + "epoch": 0.04933395443408037, + "grad_norm": 0.3735872507095337, + "learning_rate": 0.0001995582228128612, + "loss": 1.2529, + "step": 23220 + }, + { + "epoch": 0.049355200753819425, + "grad_norm": 0.37146735191345215, + "learning_rate": 0.00019955758307502866, + "loss": 1.2005, + "step": 23230 + }, + { + "epoch": 0.049376447073558476, + "grad_norm": 0.35627806186676025, + "learning_rate": 0.0001995569428753557, + "loss": 1.2173, + "step": 23240 + }, + { + "epoch": 0.04939769339329753, + "grad_norm": 0.3564694821834564, + "learning_rate": 0.00019955630221384535, + "loss": 1.2439, + "step": 23250 + }, + { + "epoch": 0.04941893971303658, + "grad_norm": 0.40923747420310974, + "learning_rate": 0.00019955566109050053, + "loss": 1.2521, + "step": 23260 + }, + { + "epoch": 0.04944018603277563, + "grad_norm": 0.4777584969997406, + "learning_rate": 0.00019955501950532427, + "loss": 1.2597, + "step": 23270 + }, + { + "epoch": 0.04946143235251469, + "grad_norm": 0.44751885533332825, + "learning_rate": 0.00019955437745831946, + "loss": 1.2593, + "step": 23280 + }, + { + "epoch": 0.04948267867225374, + "grad_norm": 0.42884573340415955, + "learning_rate": 0.00019955373494948917, + "loss": 1.2437, + "step": 23290 + }, + { + "epoch": 0.04950392499199279, + "grad_norm": 0.4545450210571289, + "learning_rate": 0.0001995530919788363, + "loss": 1.2641, + "step": 23300 + }, + { + "epoch": 0.04952517131173185, + "grad_norm": 0.5170145034790039, + "learning_rate": 0.00019955244854636387, + "loss": 1.2424, + "step": 23310 + }, + { + "epoch": 0.0495464176314709, + "grad_norm": 0.4341415464878082, + "learning_rate": 0.00019955180465207488, + "loss": 1.1952, + "step": 23320 + }, + { + "epoch": 0.04956766395120995, + "grad_norm": 0.5153058171272278, + "learning_rate": 0.0001995511602959723, + "loss": 1.2438, + "step": 23330 + }, + { + "epoch": 0.049588910270949006, + "grad_norm": 0.5197606086730957, + "learning_rate": 0.0001995505154780591, + "loss": 1.2082, + "step": 23340 + }, + { + "epoch": 0.049610156590688056, + "grad_norm": 0.5756879448890686, + "learning_rate": 0.0001995498701983383, + "loss": 1.256, + "step": 23350 + }, + { + "epoch": 0.04963140291042711, + "grad_norm": 0.45925500988960266, + "learning_rate": 0.0001995492244568129, + "loss": 1.2406, + "step": 23360 + }, + { + "epoch": 0.049652649230166164, + "grad_norm": 0.39613059163093567, + "learning_rate": 0.0001995485782534859, + "loss": 1.2022, + "step": 23370 + }, + { + "epoch": 0.049673895549905214, + "grad_norm": 0.43784797191619873, + "learning_rate": 0.0001995479315883602, + "loss": 1.2733, + "step": 23380 + }, + { + "epoch": 0.049695141869644265, + "grad_norm": 0.518907904624939, + "learning_rate": 0.00019954728446143895, + "loss": 1.2434, + "step": 23390 + }, + { + "epoch": 0.04971638818938332, + "grad_norm": 0.34388744831085205, + "learning_rate": 0.00019954663687272507, + "loss": 1.2464, + "step": 23400 + }, + { + "epoch": 0.04973763450912237, + "grad_norm": 0.502674400806427, + "learning_rate": 0.00019954598882222153, + "loss": 1.2248, + "step": 23410 + }, + { + "epoch": 0.04975888082886142, + "grad_norm": 0.36926329135894775, + "learning_rate": 0.00019954534030993139, + "loss": 1.2653, + "step": 23420 + }, + { + "epoch": 0.04978012714860048, + "grad_norm": 0.4252592921257019, + "learning_rate": 0.00019954469133585766, + "loss": 1.258, + "step": 23430 + }, + { + "epoch": 0.04980137346833953, + "grad_norm": 0.6033987998962402, + "learning_rate": 0.00019954404190000336, + "loss": 1.2329, + "step": 23440 + }, + { + "epoch": 0.04982261978807859, + "grad_norm": 0.38948917388916016, + "learning_rate": 0.00019954339200237148, + "loss": 1.222, + "step": 23450 + }, + { + "epoch": 0.04984386610781764, + "grad_norm": 0.35567569732666016, + "learning_rate": 0.000199542741642965, + "loss": 1.2177, + "step": 23460 + }, + { + "epoch": 0.04986511242755669, + "grad_norm": 0.4365026354789734, + "learning_rate": 0.00019954209082178698, + "loss": 1.2596, + "step": 23470 + }, + { + "epoch": 0.049886358747295745, + "grad_norm": 0.34543296694755554, + "learning_rate": 0.00019954143953884046, + "loss": 1.2325, + "step": 23480 + }, + { + "epoch": 0.049907605067034795, + "grad_norm": 0.3953426778316498, + "learning_rate": 0.0001995407877941284, + "loss": 1.2385, + "step": 23490 + }, + { + "epoch": 0.049928851386773845, + "grad_norm": 0.3432445824146271, + "learning_rate": 0.00019954013558765387, + "loss": 1.2526, + "step": 23500 + }, + { + "epoch": 0.0499500977065129, + "grad_norm": 0.37339526414871216, + "learning_rate": 0.0001995394829194199, + "loss": 1.2621, + "step": 23510 + }, + { + "epoch": 0.04997134402625195, + "grad_norm": 0.41778138279914856, + "learning_rate": 0.00019953882978942947, + "loss": 1.262, + "step": 23520 + }, + { + "epoch": 0.049992590345991, + "grad_norm": 0.6797776222229004, + "learning_rate": 0.0001995381761976857, + "loss": 1.2588, + "step": 23530 + }, + { + "epoch": 0.05001383666573006, + "grad_norm": 0.3378103971481323, + "learning_rate": 0.00019953752214419147, + "loss": 1.2787, + "step": 23540 + }, + { + "epoch": 0.05003508298546911, + "grad_norm": 0.48561251163482666, + "learning_rate": 0.00019953686762894996, + "loss": 1.219, + "step": 23550 + }, + { + "epoch": 0.05005632930520816, + "grad_norm": 0.349027544260025, + "learning_rate": 0.00019953621265196415, + "loss": 1.2078, + "step": 23560 + }, + { + "epoch": 0.05007757562494722, + "grad_norm": 0.6405184864997864, + "learning_rate": 0.0001995355572132371, + "loss": 1.2749, + "step": 23570 + }, + { + "epoch": 0.05009882194468627, + "grad_norm": 0.3436192274093628, + "learning_rate": 0.0001995349013127718, + "loss": 1.2532, + "step": 23580 + }, + { + "epoch": 0.05012006826442532, + "grad_norm": 0.3657911419868469, + "learning_rate": 0.00019953424495057135, + "loss": 1.235, + "step": 23590 + }, + { + "epoch": 0.050141314584164376, + "grad_norm": 0.6082769632339478, + "learning_rate": 0.00019953358812663873, + "loss": 1.2602, + "step": 23600 + }, + { + "epoch": 0.050162560903903426, + "grad_norm": 0.34138938784599304, + "learning_rate": 0.00019953293084097707, + "loss": 1.233, + "step": 23610 + }, + { + "epoch": 0.050183807223642476, + "grad_norm": 0.35538896918296814, + "learning_rate": 0.00019953227309358934, + "loss": 1.2527, + "step": 23620 + }, + { + "epoch": 0.050205053543381534, + "grad_norm": 0.3808159828186035, + "learning_rate": 0.00019953161488447862, + "loss": 1.2609, + "step": 23630 + }, + { + "epoch": 0.050226299863120584, + "grad_norm": 0.655822217464447, + "learning_rate": 0.00019953095621364798, + "loss": 1.2485, + "step": 23640 + }, + { + "epoch": 0.05024754618285964, + "grad_norm": 0.6608279347419739, + "learning_rate": 0.00019953029708110048, + "loss": 1.2545, + "step": 23650 + }, + { + "epoch": 0.05026879250259869, + "grad_norm": 0.5296467542648315, + "learning_rate": 0.00019952963748683916, + "loss": 1.2599, + "step": 23660 + }, + { + "epoch": 0.05029003882233774, + "grad_norm": 0.41308411955833435, + "learning_rate": 0.00019952897743086707, + "loss": 1.2313, + "step": 23670 + }, + { + "epoch": 0.0503112851420768, + "grad_norm": 0.40916216373443604, + "learning_rate": 0.00019952831691318727, + "loss": 1.2253, + "step": 23680 + }, + { + "epoch": 0.05033253146181585, + "grad_norm": 0.3482476472854614, + "learning_rate": 0.0001995276559338029, + "loss": 1.2267, + "step": 23690 + }, + { + "epoch": 0.0503537777815549, + "grad_norm": 0.5009663701057434, + "learning_rate": 0.0001995269944927169, + "loss": 1.2137, + "step": 23700 + }, + { + "epoch": 0.05037502410129396, + "grad_norm": 0.47336849570274353, + "learning_rate": 0.00019952633258993243, + "loss": 1.2692, + "step": 23710 + }, + { + "epoch": 0.05039627042103301, + "grad_norm": 0.5112811923027039, + "learning_rate": 0.00019952567022545251, + "loss": 1.2084, + "step": 23720 + }, + { + "epoch": 0.05041751674077206, + "grad_norm": 1.5697712898254395, + "learning_rate": 0.00019952500739928026, + "loss": 1.2541, + "step": 23730 + }, + { + "epoch": 0.050438763060511115, + "grad_norm": 0.4964537024497986, + "learning_rate": 0.00019952434411141874, + "loss": 1.222, + "step": 23740 + }, + { + "epoch": 0.050460009380250165, + "grad_norm": 0.4273381531238556, + "learning_rate": 0.00019952368036187102, + "loss": 1.2642, + "step": 23750 + }, + { + "epoch": 0.050481255699989215, + "grad_norm": 0.3600373864173889, + "learning_rate": 0.00019952301615064016, + "loss": 1.2539, + "step": 23760 + }, + { + "epoch": 0.05050250201972827, + "grad_norm": 0.41758570075035095, + "learning_rate": 0.00019952235147772924, + "loss": 1.2784, + "step": 23770 + }, + { + "epoch": 0.05052374833946732, + "grad_norm": 0.45422691106796265, + "learning_rate": 0.00019952168634314137, + "loss": 1.2679, + "step": 23780 + }, + { + "epoch": 0.05054499465920637, + "grad_norm": 0.40059638023376465, + "learning_rate": 0.00019952102074687967, + "loss": 1.2664, + "step": 23790 + }, + { + "epoch": 0.05056624097894543, + "grad_norm": 0.47060784697532654, + "learning_rate": 0.00019952035468894712, + "loss": 1.2339, + "step": 23800 + }, + { + "epoch": 0.05058748729868448, + "grad_norm": 0.7560190558433533, + "learning_rate": 0.00019951968816934692, + "loss": 1.2009, + "step": 23810 + }, + { + "epoch": 0.05060873361842353, + "grad_norm": 0.6718135476112366, + "learning_rate": 0.0001995190211880821, + "loss": 1.228, + "step": 23820 + }, + { + "epoch": 0.05062997993816259, + "grad_norm": 0.8088198900222778, + "learning_rate": 0.00019951835374515576, + "loss": 1.2575, + "step": 23830 + }, + { + "epoch": 0.05065122625790164, + "grad_norm": 0.45555171370506287, + "learning_rate": 0.00019951768584057104, + "loss": 1.237, + "step": 23840 + }, + { + "epoch": 0.05067247257764069, + "grad_norm": 0.43191784620285034, + "learning_rate": 0.00019951701747433098, + "loss": 1.259, + "step": 23850 + }, + { + "epoch": 0.050693718897379746, + "grad_norm": 0.3493640422821045, + "learning_rate": 0.0001995163486464387, + "loss": 1.2417, + "step": 23860 + }, + { + "epoch": 0.050714965217118796, + "grad_norm": 0.5660349130630493, + "learning_rate": 0.00019951567935689733, + "loss": 1.2459, + "step": 23870 + }, + { + "epoch": 0.05073621153685785, + "grad_norm": 0.356318861246109, + "learning_rate": 0.00019951500960570993, + "loss": 1.2128, + "step": 23880 + }, + { + "epoch": 0.0507574578565969, + "grad_norm": 0.5575708746910095, + "learning_rate": 0.00019951433939287963, + "loss": 1.2411, + "step": 23890 + }, + { + "epoch": 0.050778704176335954, + "grad_norm": 0.34287527203559875, + "learning_rate": 0.00019951366871840957, + "loss": 1.2549, + "step": 23900 + }, + { + "epoch": 0.05079995049607501, + "grad_norm": 0.48988667130470276, + "learning_rate": 0.0001995129975823028, + "loss": 1.2602, + "step": 23910 + }, + { + "epoch": 0.05082119681581406, + "grad_norm": 0.30180642008781433, + "learning_rate": 0.0001995123259845625, + "loss": 1.2418, + "step": 23920 + }, + { + "epoch": 0.05084244313555311, + "grad_norm": 0.43536457419395447, + "learning_rate": 0.00019951165392519173, + "loss": 1.2569, + "step": 23930 + }, + { + "epoch": 0.05086368945529217, + "grad_norm": 0.4114212989807129, + "learning_rate": 0.0001995109814041936, + "loss": 1.2808, + "step": 23940 + }, + { + "epoch": 0.05088493577503122, + "grad_norm": 0.4229431450366974, + "learning_rate": 0.00019951030842157131, + "loss": 1.2421, + "step": 23950 + }, + { + "epoch": 0.05090618209477027, + "grad_norm": 0.5891639590263367, + "learning_rate": 0.00019950963497732786, + "loss": 1.276, + "step": 23960 + }, + { + "epoch": 0.050927428414509326, + "grad_norm": 0.37005406618118286, + "learning_rate": 0.00019950896107146648, + "loss": 1.2685, + "step": 23970 + }, + { + "epoch": 0.05094867473424838, + "grad_norm": 0.39510178565979004, + "learning_rate": 0.00019950828670399026, + "loss": 1.2312, + "step": 23980 + }, + { + "epoch": 0.05096992105398743, + "grad_norm": 0.34351933002471924, + "learning_rate": 0.00019950761187490233, + "loss": 1.283, + "step": 23990 + }, + { + "epoch": 0.050991167373726484, + "grad_norm": 0.5687845349311829, + "learning_rate": 0.00019950693658420582, + "loss": 1.2084, + "step": 24000 + }, + { + "epoch": 0.051012413693465535, + "grad_norm": 0.5265789031982422, + "learning_rate": 0.00019950626083190386, + "loss": 1.2419, + "step": 24010 + }, + { + "epoch": 0.051033660013204585, + "grad_norm": 0.41976234316825867, + "learning_rate": 0.00019950558461799957, + "loss": 1.2213, + "step": 24020 + }, + { + "epoch": 0.05105490633294364, + "grad_norm": 0.5750281810760498, + "learning_rate": 0.0001995049079424961, + "loss": 1.2297, + "step": 24030 + }, + { + "epoch": 0.05107615265268269, + "grad_norm": 0.6691675186157227, + "learning_rate": 0.00019950423080539658, + "loss": 1.2547, + "step": 24040 + }, + { + "epoch": 0.05109739897242174, + "grad_norm": 0.6747950315475464, + "learning_rate": 0.0001995035532067042, + "loss": 1.2506, + "step": 24050 + }, + { + "epoch": 0.0511186452921608, + "grad_norm": 0.34874096512794495, + "learning_rate": 0.00019950287514642205, + "loss": 1.2512, + "step": 24060 + }, + { + "epoch": 0.05113989161189985, + "grad_norm": 0.35173389315605164, + "learning_rate": 0.00019950219662455328, + "loss": 1.2698, + "step": 24070 + }, + { + "epoch": 0.05116113793163891, + "grad_norm": 0.390973836183548, + "learning_rate": 0.00019950151764110108, + "loss": 1.2547, + "step": 24080 + }, + { + "epoch": 0.05118238425137796, + "grad_norm": 0.33371180295944214, + "learning_rate": 0.00019950083819606854, + "loss": 1.2522, + "step": 24090 + }, + { + "epoch": 0.05120363057111701, + "grad_norm": 0.5283323526382446, + "learning_rate": 0.00019950015828945882, + "loss": 1.231, + "step": 24100 + }, + { + "epoch": 0.051224876890856065, + "grad_norm": 0.5673304200172424, + "learning_rate": 0.00019949947792127513, + "loss": 1.2418, + "step": 24110 + }, + { + "epoch": 0.051246123210595115, + "grad_norm": 0.3781672418117523, + "learning_rate": 0.00019949879709152058, + "loss": 1.2992, + "step": 24120 + }, + { + "epoch": 0.051267369530334166, + "grad_norm": 0.4733152389526367, + "learning_rate": 0.00019949811580019834, + "loss": 1.2375, + "step": 24130 + }, + { + "epoch": 0.05128861585007322, + "grad_norm": 0.4107164144515991, + "learning_rate": 0.00019949743404731155, + "loss": 1.2161, + "step": 24140 + }, + { + "epoch": 0.05130986216981227, + "grad_norm": 0.4129883944988251, + "learning_rate": 0.00019949675183286342, + "loss": 1.2595, + "step": 24150 + }, + { + "epoch": 0.05133110848955132, + "grad_norm": 0.40190017223358154, + "learning_rate": 0.00019949606915685707, + "loss": 1.2595, + "step": 24160 + }, + { + "epoch": 0.05135235480929038, + "grad_norm": 0.37459054589271545, + "learning_rate": 0.00019949538601929568, + "loss": 1.2413, + "step": 24170 + }, + { + "epoch": 0.05137360112902943, + "grad_norm": 0.42100706696510315, + "learning_rate": 0.00019949470242018244, + "loss": 1.2409, + "step": 24180 + }, + { + "epoch": 0.05139484744876848, + "grad_norm": 0.43615520000457764, + "learning_rate": 0.0001994940183595205, + "loss": 1.2597, + "step": 24190 + }, + { + "epoch": 0.05141609376850754, + "grad_norm": 0.42148593068122864, + "learning_rate": 0.00019949333383731301, + "loss": 1.2549, + "step": 24200 + }, + { + "epoch": 0.05143734008824659, + "grad_norm": 0.3592427372932434, + "learning_rate": 0.0001994926488535632, + "loss": 1.2405, + "step": 24210 + }, + { + "epoch": 0.05145858640798564, + "grad_norm": 0.450035959482193, + "learning_rate": 0.00019949196340827424, + "loss": 1.251, + "step": 24220 + }, + { + "epoch": 0.051479832727724696, + "grad_norm": 0.42885392904281616, + "learning_rate": 0.00019949127750144925, + "loss": 1.2631, + "step": 24230 + }, + { + "epoch": 0.051501079047463746, + "grad_norm": 0.33538520336151123, + "learning_rate": 0.00019949059113309145, + "loss": 1.2923, + "step": 24240 + }, + { + "epoch": 0.0515223253672028, + "grad_norm": 0.3758486211299896, + "learning_rate": 0.00019948990430320406, + "loss": 1.245, + "step": 24250 + }, + { + "epoch": 0.051543571686941854, + "grad_norm": 0.7985268235206604, + "learning_rate": 0.0001994892170117902, + "loss": 1.2335, + "step": 24260 + }, + { + "epoch": 0.051564818006680904, + "grad_norm": 0.6339213848114014, + "learning_rate": 0.00019948852925885308, + "loss": 1.244, + "step": 24270 + }, + { + "epoch": 0.051586064326419954, + "grad_norm": 0.44708412885665894, + "learning_rate": 0.00019948784104439592, + "loss": 1.2511, + "step": 24280 + }, + { + "epoch": 0.05160731064615901, + "grad_norm": 0.3272295892238617, + "learning_rate": 0.0001994871523684219, + "loss": 1.2292, + "step": 24290 + }, + { + "epoch": 0.05162855696589806, + "grad_norm": 0.4046657085418701, + "learning_rate": 0.0001994864632309342, + "loss": 1.2376, + "step": 24300 + }, + { + "epoch": 0.05164980328563712, + "grad_norm": 0.3363887071609497, + "learning_rate": 0.00019948577363193602, + "loss": 1.2208, + "step": 24310 + }, + { + "epoch": 0.05167104960537617, + "grad_norm": 0.5632559657096863, + "learning_rate": 0.0001994850835714306, + "loss": 1.208, + "step": 24320 + }, + { + "epoch": 0.05169229592511522, + "grad_norm": 0.4900578260421753, + "learning_rate": 0.00019948439304942107, + "loss": 1.2195, + "step": 24330 + }, + { + "epoch": 0.05171354224485428, + "grad_norm": 0.36952975392341614, + "learning_rate": 0.00019948370206591067, + "loss": 1.1993, + "step": 24340 + }, + { + "epoch": 0.05173478856459333, + "grad_norm": 0.43545815348625183, + "learning_rate": 0.0001994830106209026, + "loss": 1.2576, + "step": 24350 + }, + { + "epoch": 0.05175603488433238, + "grad_norm": 0.4602333903312683, + "learning_rate": 0.00019948231871440007, + "loss": 1.2803, + "step": 24360 + }, + { + "epoch": 0.051777281204071435, + "grad_norm": 0.3908073902130127, + "learning_rate": 0.0001994816263464063, + "loss": 1.2695, + "step": 24370 + }, + { + "epoch": 0.051798527523810485, + "grad_norm": 0.4804028868675232, + "learning_rate": 0.0001994809335169245, + "loss": 1.2479, + "step": 24380 + }, + { + "epoch": 0.051819773843549535, + "grad_norm": 0.35147422552108765, + "learning_rate": 0.00019948024022595787, + "loss": 1.2742, + "step": 24390 + }, + { + "epoch": 0.05184102016328859, + "grad_norm": 0.3588075637817383, + "learning_rate": 0.0001994795464735096, + "loss": 1.214, + "step": 24400 + }, + { + "epoch": 0.05186226648302764, + "grad_norm": 0.3579232096672058, + "learning_rate": 0.000199478852259583, + "loss": 1.2657, + "step": 24410 + }, + { + "epoch": 0.05188351280276669, + "grad_norm": 0.3389606475830078, + "learning_rate": 0.00019947815758418117, + "loss": 1.2661, + "step": 24420 + }, + { + "epoch": 0.05190475912250575, + "grad_norm": 0.44211772084236145, + "learning_rate": 0.00019947746244730746, + "loss": 1.2282, + "step": 24430 + }, + { + "epoch": 0.0519260054422448, + "grad_norm": 0.38755613565444946, + "learning_rate": 0.000199476766848965, + "loss": 1.2327, + "step": 24440 + }, + { + "epoch": 0.05194725176198385, + "grad_norm": 0.42927277088165283, + "learning_rate": 0.00019947607078915703, + "loss": 1.2028, + "step": 24450 + }, + { + "epoch": 0.05196849808172291, + "grad_norm": 0.383444607257843, + "learning_rate": 0.0001994753742678868, + "loss": 1.2593, + "step": 24460 + }, + { + "epoch": 0.05198974440146196, + "grad_norm": 0.3619687259197235, + "learning_rate": 0.00019947467728515753, + "loss": 1.238, + "step": 24470 + }, + { + "epoch": 0.05201099072120101, + "grad_norm": 0.48640868067741394, + "learning_rate": 0.00019947397984097247, + "loss": 1.2154, + "step": 24480 + }, + { + "epoch": 0.052032237040940066, + "grad_norm": 0.36859363317489624, + "learning_rate": 0.00019947328193533484, + "loss": 1.2745, + "step": 24490 + }, + { + "epoch": 0.052053483360679116, + "grad_norm": 0.39926332235336304, + "learning_rate": 0.00019947258356824786, + "loss": 1.2027, + "step": 24500 + }, + { + "epoch": 0.05207472968041817, + "grad_norm": 0.35746440291404724, + "learning_rate": 0.00019947188473971484, + "loss": 1.2373, + "step": 24510 + }, + { + "epoch": 0.052095976000157224, + "grad_norm": 0.35348862409591675, + "learning_rate": 0.00019947118544973895, + "loss": 1.2597, + "step": 24520 + }, + { + "epoch": 0.052117222319896274, + "grad_norm": 0.40164175629615784, + "learning_rate": 0.00019947048569832345, + "loss": 1.2584, + "step": 24530 + }, + { + "epoch": 0.05213846863963533, + "grad_norm": 0.6513522267341614, + "learning_rate": 0.00019946978548547159, + "loss": 1.1972, + "step": 24540 + }, + { + "epoch": 0.05215971495937438, + "grad_norm": 0.44849008321762085, + "learning_rate": 0.00019946908481118664, + "loss": 1.2576, + "step": 24550 + }, + { + "epoch": 0.05218096127911343, + "grad_norm": 0.3502296507358551, + "learning_rate": 0.00019946838367547183, + "loss": 1.2373, + "step": 24560 + }, + { + "epoch": 0.05220220759885249, + "grad_norm": 0.4929656386375427, + "learning_rate": 0.0001994676820783304, + "loss": 1.2426, + "step": 24570 + }, + { + "epoch": 0.05222345391859154, + "grad_norm": 0.4058248996734619, + "learning_rate": 0.00019946698001976562, + "loss": 1.2483, + "step": 24580 + }, + { + "epoch": 0.05224470023833059, + "grad_norm": 0.8685728907585144, + "learning_rate": 0.00019946627749978073, + "loss": 1.2684, + "step": 24590 + }, + { + "epoch": 0.05226594655806965, + "grad_norm": 0.3827713131904602, + "learning_rate": 0.00019946557451837904, + "loss": 1.2867, + "step": 24600 + }, + { + "epoch": 0.0522871928778087, + "grad_norm": 0.36818304657936096, + "learning_rate": 0.00019946487107556376, + "loss": 1.2097, + "step": 24610 + }, + { + "epoch": 0.05230843919754775, + "grad_norm": 0.578409731388092, + "learning_rate": 0.00019946416717133816, + "loss": 1.2089, + "step": 24620 + }, + { + "epoch": 0.052329685517286804, + "grad_norm": 0.3995554745197296, + "learning_rate": 0.00019946346280570553, + "loss": 1.228, + "step": 24630 + }, + { + "epoch": 0.052350931837025855, + "grad_norm": 0.32515570521354675, + "learning_rate": 0.0001994627579786691, + "loss": 1.2398, + "step": 24640 + }, + { + "epoch": 0.052372178156764905, + "grad_norm": 0.33230626583099365, + "learning_rate": 0.00019946205269023216, + "loss": 1.2336, + "step": 24650 + }, + { + "epoch": 0.05239342447650396, + "grad_norm": 0.40767285227775574, + "learning_rate": 0.00019946134694039805, + "loss": 1.2518, + "step": 24660 + }, + { + "epoch": 0.05241467079624301, + "grad_norm": 0.6395394206047058, + "learning_rate": 0.0001994606407291699, + "loss": 1.2378, + "step": 24670 + }, + { + "epoch": 0.05243591711598206, + "grad_norm": 0.4731801450252533, + "learning_rate": 0.0001994599340565511, + "loss": 1.2484, + "step": 24680 + }, + { + "epoch": 0.05245716343572112, + "grad_norm": 0.4583863914012909, + "learning_rate": 0.00019945922692254486, + "loss": 1.2356, + "step": 24690 + }, + { + "epoch": 0.05247840975546017, + "grad_norm": 0.36211904883384705, + "learning_rate": 0.00019945851932715454, + "loss": 1.244, + "step": 24700 + }, + { + "epoch": 0.05249965607519923, + "grad_norm": 0.37798115611076355, + "learning_rate": 0.0001994578112703833, + "loss": 1.2479, + "step": 24710 + }, + { + "epoch": 0.05252090239493828, + "grad_norm": 0.3780800700187683, + "learning_rate": 0.0001994571027522346, + "loss": 1.2624, + "step": 24720 + }, + { + "epoch": 0.05254214871467733, + "grad_norm": 0.35005202889442444, + "learning_rate": 0.00019945639377271155, + "loss": 1.2252, + "step": 24730 + }, + { + "epoch": 0.052563395034416385, + "grad_norm": 0.41263630986213684, + "learning_rate": 0.00019945568433181753, + "loss": 1.2517, + "step": 24740 + }, + { + "epoch": 0.052584641354155436, + "grad_norm": 0.3690487742424011, + "learning_rate": 0.00019945497442955582, + "loss": 1.2791, + "step": 24750 + }, + { + "epoch": 0.052605887673894486, + "grad_norm": 0.3430214822292328, + "learning_rate": 0.00019945426406592972, + "loss": 1.2645, + "step": 24760 + }, + { + "epoch": 0.05262713399363354, + "grad_norm": 0.46599850058555603, + "learning_rate": 0.0001994535532409425, + "loss": 1.2085, + "step": 24770 + }, + { + "epoch": 0.05264838031337259, + "grad_norm": 0.4680480360984802, + "learning_rate": 0.00019945284195459749, + "loss": 1.2291, + "step": 24780 + }, + { + "epoch": 0.052669626633111644, + "grad_norm": 0.4398690462112427, + "learning_rate": 0.00019945213020689794, + "loss": 1.2228, + "step": 24790 + }, + { + "epoch": 0.0526908729528507, + "grad_norm": 0.32145974040031433, + "learning_rate": 0.0001994514179978472, + "loss": 1.1992, + "step": 24800 + }, + { + "epoch": 0.05271211927258975, + "grad_norm": 0.4274260997772217, + "learning_rate": 0.0001994507053274486, + "loss": 1.2512, + "step": 24810 + }, + { + "epoch": 0.0527333655923288, + "grad_norm": 0.5601751208305359, + "learning_rate": 0.00019944999219570532, + "loss": 1.2357, + "step": 24820 + }, + { + "epoch": 0.05275461191206786, + "grad_norm": 0.8193306922912598, + "learning_rate": 0.0001994492786026208, + "loss": 1.2711, + "step": 24830 + }, + { + "epoch": 0.05277585823180691, + "grad_norm": 0.6381454467773438, + "learning_rate": 0.0001994485645481983, + "loss": 1.2101, + "step": 24840 + }, + { + "epoch": 0.05279710455154596, + "grad_norm": 0.39562591910362244, + "learning_rate": 0.00019944785003244114, + "loss": 1.2399, + "step": 24850 + }, + { + "epoch": 0.052818350871285016, + "grad_norm": 0.3677305579185486, + "learning_rate": 0.0001994471350553526, + "loss": 1.2296, + "step": 24860 + }, + { + "epoch": 0.05283959719102407, + "grad_norm": 0.39186495542526245, + "learning_rate": 0.00019944641961693602, + "loss": 1.2411, + "step": 24870 + }, + { + "epoch": 0.05286084351076312, + "grad_norm": 0.4261663854122162, + "learning_rate": 0.00019944570371719476, + "loss": 1.2992, + "step": 24880 + }, + { + "epoch": 0.052882089830502174, + "grad_norm": 0.4135468006134033, + "learning_rate": 0.00019944498735613209, + "loss": 1.2437, + "step": 24890 + }, + { + "epoch": 0.052903336150241224, + "grad_norm": 0.6751583218574524, + "learning_rate": 0.00019944427053375134, + "loss": 1.1736, + "step": 24900 + }, + { + "epoch": 0.052924582469980275, + "grad_norm": 0.4964717626571655, + "learning_rate": 0.00019944355325005587, + "loss": 1.2147, + "step": 24910 + }, + { + "epoch": 0.05294582878971933, + "grad_norm": 0.4732727110385895, + "learning_rate": 0.00019944283550504894, + "loss": 1.2252, + "step": 24920 + }, + { + "epoch": 0.05296707510945838, + "grad_norm": 0.5381280183792114, + "learning_rate": 0.00019944211729873394, + "loss": 1.2085, + "step": 24930 + }, + { + "epoch": 0.05298832142919744, + "grad_norm": 0.36530324816703796, + "learning_rate": 0.0001994413986311142, + "loss": 1.2328, + "step": 24940 + }, + { + "epoch": 0.05300956774893649, + "grad_norm": 0.3990500271320343, + "learning_rate": 0.000199440679502193, + "loss": 1.2373, + "step": 24950 + }, + { + "epoch": 0.05303081406867554, + "grad_norm": 0.3115149140357971, + "learning_rate": 0.00019943995991197376, + "loss": 1.2446, + "step": 24960 + }, + { + "epoch": 0.0530520603884146, + "grad_norm": 0.4526672065258026, + "learning_rate": 0.0001994392398604597, + "loss": 1.2792, + "step": 24970 + }, + { + "epoch": 0.05307330670815365, + "grad_norm": 0.39705801010131836, + "learning_rate": 0.00019943851934765426, + "loss": 1.2349, + "step": 24980 + }, + { + "epoch": 0.0530945530278927, + "grad_norm": 0.733925998210907, + "learning_rate": 0.0001994377983735608, + "loss": 1.219, + "step": 24990 + }, + { + "epoch": 0.053115799347631755, + "grad_norm": 0.4086494743824005, + "learning_rate": 0.00019943707693818259, + "loss": 1.2302, + "step": 25000 + }, + { + "epoch": 0.053137045667370805, + "grad_norm": 0.43468424677848816, + "learning_rate": 0.00019943635504152298, + "loss": 1.2261, + "step": 25010 + }, + { + "epoch": 0.053158291987109856, + "grad_norm": 0.587986171245575, + "learning_rate": 0.00019943563268358535, + "loss": 1.2534, + "step": 25020 + }, + { + "epoch": 0.05317953830684891, + "grad_norm": 0.45164671540260315, + "learning_rate": 0.00019943490986437304, + "loss": 1.2394, + "step": 25030 + }, + { + "epoch": 0.05320078462658796, + "grad_norm": 0.5662877559661865, + "learning_rate": 0.00019943418658388944, + "loss": 1.2464, + "step": 25040 + }, + { + "epoch": 0.05322203094632701, + "grad_norm": 0.36353152990341187, + "learning_rate": 0.00019943346284213785, + "loss": 1.2496, + "step": 25050 + }, + { + "epoch": 0.05324327726606607, + "grad_norm": 0.3101523816585541, + "learning_rate": 0.00019943273863912167, + "loss": 1.2485, + "step": 25060 + }, + { + "epoch": 0.05326452358580512, + "grad_norm": 0.572357714176178, + "learning_rate": 0.0001994320139748442, + "loss": 1.1943, + "step": 25070 + }, + { + "epoch": 0.05328576990554417, + "grad_norm": 0.4562666714191437, + "learning_rate": 0.00019943128884930886, + "loss": 1.2181, + "step": 25080 + }, + { + "epoch": 0.05330701622528323, + "grad_norm": 0.5051101446151733, + "learning_rate": 0.000199430563262519, + "loss": 1.2472, + "step": 25090 + }, + { + "epoch": 0.05332826254502228, + "grad_norm": 0.42604491114616394, + "learning_rate": 0.00019942983721447797, + "loss": 1.2532, + "step": 25100 + }, + { + "epoch": 0.05334950886476133, + "grad_norm": 0.36714568734169006, + "learning_rate": 0.00019942911070518915, + "loss": 1.2405, + "step": 25110 + }, + { + "epoch": 0.053370755184500386, + "grad_norm": 0.3645499050617218, + "learning_rate": 0.0001994283837346559, + "loss": 1.2482, + "step": 25120 + }, + { + "epoch": 0.053392001504239436, + "grad_norm": 0.42420437932014465, + "learning_rate": 0.00019942765630288163, + "loss": 1.2297, + "step": 25130 + }, + { + "epoch": 0.053413247823978494, + "grad_norm": 0.9014157652854919, + "learning_rate": 0.00019942692840986968, + "loss": 1.2443, + "step": 25140 + }, + { + "epoch": 0.053434494143717544, + "grad_norm": 0.4668499827384949, + "learning_rate": 0.0001994262000556234, + "loss": 1.2485, + "step": 25150 + }, + { + "epoch": 0.053455740463456594, + "grad_norm": 0.40873366594314575, + "learning_rate": 0.0001994254712401462, + "loss": 1.2539, + "step": 25160 + }, + { + "epoch": 0.05347698678319565, + "grad_norm": 0.4049037992954254, + "learning_rate": 0.0001994247419634415, + "loss": 1.262, + "step": 25170 + }, + { + "epoch": 0.0534982331029347, + "grad_norm": 0.5616825819015503, + "learning_rate": 0.00019942401222551264, + "loss": 1.2446, + "step": 25180 + }, + { + "epoch": 0.05351947942267375, + "grad_norm": 0.5956825613975525, + "learning_rate": 0.000199423282026363, + "loss": 1.2426, + "step": 25190 + }, + { + "epoch": 0.05354072574241281, + "grad_norm": 0.4859330952167511, + "learning_rate": 0.00019942255136599598, + "loss": 1.2341, + "step": 25200 + }, + { + "epoch": 0.05356197206215186, + "grad_norm": 0.3441159725189209, + "learning_rate": 0.00019942182024441495, + "loss": 1.2443, + "step": 25210 + }, + { + "epoch": 0.05358321838189091, + "grad_norm": 0.39771249890327454, + "learning_rate": 0.00019942108866162335, + "loss": 1.216, + "step": 25220 + }, + { + "epoch": 0.05360446470162997, + "grad_norm": 0.4450543522834778, + "learning_rate": 0.00019942035661762452, + "loss": 1.2368, + "step": 25230 + }, + { + "epoch": 0.05362571102136902, + "grad_norm": 0.34398719668388367, + "learning_rate": 0.00019941962411242187, + "loss": 1.2318, + "step": 25240 + }, + { + "epoch": 0.05364695734110807, + "grad_norm": 0.3629857897758484, + "learning_rate": 0.00019941889114601883, + "loss": 1.2333, + "step": 25250 + }, + { + "epoch": 0.053668203660847125, + "grad_norm": 0.3807019293308258, + "learning_rate": 0.00019941815771841876, + "loss": 1.2519, + "step": 25260 + }, + { + "epoch": 0.053689449980586175, + "grad_norm": 0.560609757900238, + "learning_rate": 0.0001994174238296251, + "loss": 1.2505, + "step": 25270 + }, + { + "epoch": 0.053710696300325225, + "grad_norm": 0.5652046799659729, + "learning_rate": 0.00019941668947964122, + "loss": 1.2314, + "step": 25280 + }, + { + "epoch": 0.05373194262006428, + "grad_norm": 0.3234410285949707, + "learning_rate": 0.00019941595466847054, + "loss": 1.2765, + "step": 25290 + }, + { + "epoch": 0.05375318893980333, + "grad_norm": 0.7732385993003845, + "learning_rate": 0.00019941521939611644, + "loss": 1.2244, + "step": 25300 + }, + { + "epoch": 0.05377443525954238, + "grad_norm": 0.3523139953613281, + "learning_rate": 0.0001994144836625824, + "loss": 1.2378, + "step": 25310 + }, + { + "epoch": 0.05379568157928144, + "grad_norm": 0.6291234493255615, + "learning_rate": 0.00019941374746787176, + "loss": 1.2348, + "step": 25320 + }, + { + "epoch": 0.05381692789902049, + "grad_norm": 0.31891390681266785, + "learning_rate": 0.000199413010811988, + "loss": 1.2341, + "step": 25330 + }, + { + "epoch": 0.05383817421875954, + "grad_norm": 0.46104714274406433, + "learning_rate": 0.0001994122736949345, + "loss": 1.239, + "step": 25340 + }, + { + "epoch": 0.0538594205384986, + "grad_norm": 0.49261489510536194, + "learning_rate": 0.0001994115361167147, + "loss": 1.2558, + "step": 25350 + }, + { + "epoch": 0.05388066685823765, + "grad_norm": 0.444923996925354, + "learning_rate": 0.00019941079807733197, + "loss": 1.2477, + "step": 25360 + }, + { + "epoch": 0.053901913177976706, + "grad_norm": 0.4448537230491638, + "learning_rate": 0.00019941005957678975, + "loss": 1.2178, + "step": 25370 + }, + { + "epoch": 0.053923159497715756, + "grad_norm": 0.36429470777511597, + "learning_rate": 0.00019940932061509152, + "loss": 1.2737, + "step": 25380 + }, + { + "epoch": 0.053944405817454806, + "grad_norm": 0.4367513358592987, + "learning_rate": 0.0001994085811922407, + "loss": 1.2446, + "step": 25390 + }, + { + "epoch": 0.05396565213719386, + "grad_norm": 0.4454650282859802, + "learning_rate": 0.00019940784130824063, + "loss": 1.2574, + "step": 25400 + }, + { + "epoch": 0.053986898456932914, + "grad_norm": 0.8069959878921509, + "learning_rate": 0.00019940710096309484, + "loss": 1.2487, + "step": 25410 + }, + { + "epoch": 0.054008144776671964, + "grad_norm": 0.6194384098052979, + "learning_rate": 0.00019940636015680673, + "loss": 1.2387, + "step": 25420 + }, + { + "epoch": 0.05402939109641102, + "grad_norm": 0.337843656539917, + "learning_rate": 0.00019940561888937973, + "loss": 1.246, + "step": 25430 + }, + { + "epoch": 0.05405063741615007, + "grad_norm": 0.393014132976532, + "learning_rate": 0.0001994048771608173, + "loss": 1.221, + "step": 25440 + }, + { + "epoch": 0.05407188373588912, + "grad_norm": 0.309234619140625, + "learning_rate": 0.00019940413497112288, + "loss": 1.2408, + "step": 25450 + }, + { + "epoch": 0.05409313005562818, + "grad_norm": 0.3503212630748749, + "learning_rate": 0.00019940339232029985, + "loss": 1.2908, + "step": 25460 + }, + { + "epoch": 0.05411437637536723, + "grad_norm": 0.3836843967437744, + "learning_rate": 0.00019940264920835174, + "loss": 1.223, + "step": 25470 + }, + { + "epoch": 0.05413562269510628, + "grad_norm": 0.6607913374900818, + "learning_rate": 0.00019940190563528195, + "loss": 1.2381, + "step": 25480 + }, + { + "epoch": 0.05415686901484534, + "grad_norm": 0.37120872735977173, + "learning_rate": 0.00019940116160109395, + "loss": 1.2612, + "step": 25490 + }, + { + "epoch": 0.05417811533458439, + "grad_norm": 0.3711504638195038, + "learning_rate": 0.00019940041710579117, + "loss": 1.2584, + "step": 25500 + }, + { + "epoch": 0.05419936165432344, + "grad_norm": 0.39607641100883484, + "learning_rate": 0.0001993996721493771, + "loss": 1.2339, + "step": 25510 + }, + { + "epoch": 0.054220607974062494, + "grad_norm": 0.3260195851325989, + "learning_rate": 0.00019939892673185515, + "loss": 1.2731, + "step": 25520 + }, + { + "epoch": 0.054241854293801545, + "grad_norm": 0.451011061668396, + "learning_rate": 0.0001993981808532288, + "loss": 1.2042, + "step": 25530 + }, + { + "epoch": 0.054263100613540595, + "grad_norm": 0.37300431728363037, + "learning_rate": 0.00019939743451350152, + "loss": 1.2343, + "step": 25540 + }, + { + "epoch": 0.05428434693327965, + "grad_norm": 0.4058992862701416, + "learning_rate": 0.00019939668771267678, + "loss": 1.2509, + "step": 25550 + }, + { + "epoch": 0.0543055932530187, + "grad_norm": 0.3985253870487213, + "learning_rate": 0.00019939594045075798, + "loss": 1.218, + "step": 25560 + }, + { + "epoch": 0.05432683957275776, + "grad_norm": 0.37032273411750793, + "learning_rate": 0.00019939519272774866, + "loss": 1.2487, + "step": 25570 + }, + { + "epoch": 0.05434808589249681, + "grad_norm": 0.38702675700187683, + "learning_rate": 0.00019939444454365228, + "loss": 1.2638, + "step": 25580 + }, + { + "epoch": 0.05436933221223586, + "grad_norm": 0.7721424102783203, + "learning_rate": 0.00019939369589847225, + "loss": 1.2597, + "step": 25590 + }, + { + "epoch": 0.05439057853197492, + "grad_norm": 0.5324329137802124, + "learning_rate": 0.00019939294679221213, + "loss": 1.272, + "step": 25600 + }, + { + "epoch": 0.05441182485171397, + "grad_norm": 0.36440524458885193, + "learning_rate": 0.0001993921972248753, + "loss": 1.217, + "step": 25610 + }, + { + "epoch": 0.05443307117145302, + "grad_norm": 0.46913841366767883, + "learning_rate": 0.00019939144719646534, + "loss": 1.2527, + "step": 25620 + }, + { + "epoch": 0.054454317491192075, + "grad_norm": 0.36507079005241394, + "learning_rate": 0.00019939069670698564, + "loss": 1.2505, + "step": 25630 + }, + { + "epoch": 0.054475563810931125, + "grad_norm": 0.5354877710342407, + "learning_rate": 0.00019938994575643975, + "loss": 1.2277, + "step": 25640 + }, + { + "epoch": 0.054496810130670176, + "grad_norm": 0.32057130336761475, + "learning_rate": 0.00019938919434483107, + "loss": 1.2383, + "step": 25650 + }, + { + "epoch": 0.05451805645040923, + "grad_norm": 0.36410966515541077, + "learning_rate": 0.0001993884424721632, + "loss": 1.2124, + "step": 25660 + }, + { + "epoch": 0.05453930277014828, + "grad_norm": 0.3912813365459442, + "learning_rate": 0.00019938769013843954, + "loss": 1.278, + "step": 25670 + }, + { + "epoch": 0.054560549089887334, + "grad_norm": 0.3488604426383972, + "learning_rate": 0.00019938693734366362, + "loss": 1.2468, + "step": 25680 + }, + { + "epoch": 0.05458179540962639, + "grad_norm": 0.5557785630226135, + "learning_rate": 0.00019938618408783888, + "loss": 1.232, + "step": 25690 + }, + { + "epoch": 0.05460304172936544, + "grad_norm": 0.48879295587539673, + "learning_rate": 0.00019938543037096887, + "loss": 1.2608, + "step": 25700 + }, + { + "epoch": 0.05462428804910449, + "grad_norm": 0.6104822754859924, + "learning_rate": 0.00019938467619305707, + "loss": 1.2567, + "step": 25710 + }, + { + "epoch": 0.05464553436884355, + "grad_norm": 0.4417276084423065, + "learning_rate": 0.00019938392155410698, + "loss": 1.2389, + "step": 25720 + }, + { + "epoch": 0.0546667806885826, + "grad_norm": 0.43413951992988586, + "learning_rate": 0.0001993831664541221, + "loss": 1.2383, + "step": 25730 + }, + { + "epoch": 0.05468802700832165, + "grad_norm": 0.3718404471874237, + "learning_rate": 0.00019938241089310593, + "loss": 1.2384, + "step": 25740 + }, + { + "epoch": 0.054709273328060706, + "grad_norm": 0.3517175614833832, + "learning_rate": 0.00019938165487106194, + "loss": 1.2375, + "step": 25750 + }, + { + "epoch": 0.05473051964779976, + "grad_norm": 0.4247318208217621, + "learning_rate": 0.00019938089838799372, + "loss": 1.2059, + "step": 25760 + }, + { + "epoch": 0.054751765967538814, + "grad_norm": 0.32622456550598145, + "learning_rate": 0.00019938014144390472, + "loss": 1.2141, + "step": 25770 + }, + { + "epoch": 0.054773012287277864, + "grad_norm": 0.3204079866409302, + "learning_rate": 0.00019937938403879847, + "loss": 1.1894, + "step": 25780 + }, + { + "epoch": 0.054794258607016914, + "grad_norm": 0.465189665555954, + "learning_rate": 0.00019937862617267846, + "loss": 1.2515, + "step": 25790 + }, + { + "epoch": 0.05481550492675597, + "grad_norm": 0.3863280117511749, + "learning_rate": 0.00019937786784554823, + "loss": 1.2303, + "step": 25800 + }, + { + "epoch": 0.05483675124649502, + "grad_norm": 0.4558713138103485, + "learning_rate": 0.00019937710905741128, + "loss": 1.2845, + "step": 25810 + }, + { + "epoch": 0.05485799756623407, + "grad_norm": 0.429305762052536, + "learning_rate": 0.00019937634980827116, + "loss": 1.2349, + "step": 25820 + }, + { + "epoch": 0.05487924388597313, + "grad_norm": 0.37545138597488403, + "learning_rate": 0.0001993755900981314, + "loss": 1.2056, + "step": 25830 + }, + { + "epoch": 0.05490049020571218, + "grad_norm": 0.49369242787361145, + "learning_rate": 0.00019937482992699542, + "loss": 1.2415, + "step": 25840 + }, + { + "epoch": 0.05492173652545123, + "grad_norm": 0.5260729789733887, + "learning_rate": 0.00019937406929486687, + "loss": 1.2436, + "step": 25850 + }, + { + "epoch": 0.05494298284519029, + "grad_norm": 0.42930302023887634, + "learning_rate": 0.00019937330820174925, + "loss": 1.2608, + "step": 25860 + }, + { + "epoch": 0.05496422916492934, + "grad_norm": 0.35683152079582214, + "learning_rate": 0.00019937254664764603, + "loss": 1.2298, + "step": 25870 + }, + { + "epoch": 0.05498547548466839, + "grad_norm": 0.49753445386886597, + "learning_rate": 0.0001993717846325608, + "loss": 1.2194, + "step": 25880 + }, + { + "epoch": 0.055006721804407445, + "grad_norm": 0.352874755859375, + "learning_rate": 0.0001993710221564971, + "loss": 1.25, + "step": 25890 + }, + { + "epoch": 0.055027968124146495, + "grad_norm": 0.3514905273914337, + "learning_rate": 0.00019937025921945844, + "loss": 1.2681, + "step": 25900 + }, + { + "epoch": 0.055049214443885545, + "grad_norm": 0.35821977257728577, + "learning_rate": 0.00019936949582144835, + "loss": 1.2151, + "step": 25910 + }, + { + "epoch": 0.0550704607636246, + "grad_norm": 0.34171468019485474, + "learning_rate": 0.0001993687319624704, + "loss": 1.2324, + "step": 25920 + }, + { + "epoch": 0.05509170708336365, + "grad_norm": 0.3887169659137726, + "learning_rate": 0.0001993679676425281, + "loss": 1.2233, + "step": 25930 + }, + { + "epoch": 0.0551129534031027, + "grad_norm": 0.3397194743156433, + "learning_rate": 0.00019936720286162506, + "loss": 1.2449, + "step": 25940 + }, + { + "epoch": 0.05513419972284176, + "grad_norm": 0.4003836214542389, + "learning_rate": 0.00019936643761976476, + "loss": 1.217, + "step": 25950 + }, + { + "epoch": 0.05515544604258081, + "grad_norm": 0.4990089237689972, + "learning_rate": 0.00019936567191695078, + "loss": 1.2546, + "step": 25960 + }, + { + "epoch": 0.05517669236231986, + "grad_norm": 0.39445289969444275, + "learning_rate": 0.00019936490575318664, + "loss": 1.2342, + "step": 25970 + }, + { + "epoch": 0.05519793868205892, + "grad_norm": 0.3518037497997284, + "learning_rate": 0.00019936413912847596, + "loss": 1.26, + "step": 25980 + }, + { + "epoch": 0.05521918500179797, + "grad_norm": 0.3954916000366211, + "learning_rate": 0.00019936337204282223, + "loss": 1.2577, + "step": 25990 + }, + { + "epoch": 0.055240431321537026, + "grad_norm": 0.40680912137031555, + "learning_rate": 0.00019936260449622903, + "loss": 1.249, + "step": 26000 + }, + { + "epoch": 0.055261677641276076, + "grad_norm": 0.3801758885383606, + "learning_rate": 0.00019936183648869995, + "loss": 1.2243, + "step": 26010 + }, + { + "epoch": 0.055282923961015126, + "grad_norm": 0.47989609837532043, + "learning_rate": 0.00019936106802023853, + "loss": 1.2222, + "step": 26020 + }, + { + "epoch": 0.055304170280754184, + "grad_norm": 0.3852805495262146, + "learning_rate": 0.00019936029909084832, + "loss": 1.2401, + "step": 26030 + }, + { + "epoch": 0.055325416600493234, + "grad_norm": 0.3150014281272888, + "learning_rate": 0.00019935952970053292, + "loss": 1.2695, + "step": 26040 + }, + { + "epoch": 0.055346662920232284, + "grad_norm": 0.3902243673801422, + "learning_rate": 0.00019935875984929585, + "loss": 1.2307, + "step": 26050 + }, + { + "epoch": 0.05536790923997134, + "grad_norm": 0.5049930214881897, + "learning_rate": 0.00019935798953714073, + "loss": 1.2591, + "step": 26060 + }, + { + "epoch": 0.05538915555971039, + "grad_norm": 0.4054262042045593, + "learning_rate": 0.0001993572187640711, + "loss": 1.2676, + "step": 26070 + }, + { + "epoch": 0.05541040187944944, + "grad_norm": 0.4913196265697479, + "learning_rate": 0.00019935644753009055, + "loss": 1.2193, + "step": 26080 + }, + { + "epoch": 0.0554316481991885, + "grad_norm": 0.3709360957145691, + "learning_rate": 0.00019935567583520268, + "loss": 1.2215, + "step": 26090 + }, + { + "epoch": 0.05545289451892755, + "grad_norm": 0.39726200699806213, + "learning_rate": 0.00019935490367941104, + "loss": 1.2311, + "step": 26100 + }, + { + "epoch": 0.0554741408386666, + "grad_norm": 0.475394070148468, + "learning_rate": 0.00019935413106271922, + "loss": 1.2334, + "step": 26110 + }, + { + "epoch": 0.05549538715840566, + "grad_norm": 0.3374480605125427, + "learning_rate": 0.00019935335798513078, + "loss": 1.2603, + "step": 26120 + }, + { + "epoch": 0.05551663347814471, + "grad_norm": 0.39625757932662964, + "learning_rate": 0.00019935258444664936, + "loss": 1.2425, + "step": 26130 + }, + { + "epoch": 0.05553787979788376, + "grad_norm": 0.7715783715248108, + "learning_rate": 0.0001993518104472785, + "loss": 1.273, + "step": 26140 + }, + { + "epoch": 0.055559126117622815, + "grad_norm": 0.4149562120437622, + "learning_rate": 0.00019935103598702183, + "loss": 1.1884, + "step": 26150 + }, + { + "epoch": 0.055580372437361865, + "grad_norm": 0.32342690229415894, + "learning_rate": 0.00019935026106588294, + "loss": 1.2626, + "step": 26160 + }, + { + "epoch": 0.055601618757100915, + "grad_norm": 0.5075507760047913, + "learning_rate": 0.00019934948568386536, + "loss": 1.2131, + "step": 26170 + }, + { + "epoch": 0.05562286507683997, + "grad_norm": 0.48247575759887695, + "learning_rate": 0.00019934870984097275, + "loss": 1.224, + "step": 26180 + }, + { + "epoch": 0.05564411139657902, + "grad_norm": 0.3581051230430603, + "learning_rate": 0.0001993479335372087, + "loss": 1.2588, + "step": 26190 + }, + { + "epoch": 0.05566535771631808, + "grad_norm": 0.42837172746658325, + "learning_rate": 0.00019934715677257677, + "loss": 1.1661, + "step": 26200 + }, + { + "epoch": 0.05568660403605713, + "grad_norm": 0.37455832958221436, + "learning_rate": 0.00019934637954708065, + "loss": 1.2292, + "step": 26210 + }, + { + "epoch": 0.05570785035579618, + "grad_norm": 0.35224902629852295, + "learning_rate": 0.00019934560186072385, + "loss": 1.2277, + "step": 26220 + }, + { + "epoch": 0.05572909667553524, + "grad_norm": 0.39035552740097046, + "learning_rate": 0.00019934482371351003, + "loss": 1.2527, + "step": 26230 + }, + { + "epoch": 0.05575034299527429, + "grad_norm": 0.6370126605033875, + "learning_rate": 0.00019934404510544277, + "loss": 1.2535, + "step": 26240 + }, + { + "epoch": 0.05577158931501334, + "grad_norm": 0.6871103048324585, + "learning_rate": 0.00019934326603652575, + "loss": 1.2709, + "step": 26250 + }, + { + "epoch": 0.055792835634752395, + "grad_norm": 0.4162043035030365, + "learning_rate": 0.0001993424865067625, + "loss": 1.221, + "step": 26260 + }, + { + "epoch": 0.055814081954491446, + "grad_norm": 0.3619583547115326, + "learning_rate": 0.00019934170651615668, + "loss": 1.2673, + "step": 26270 + }, + { + "epoch": 0.055835328274230496, + "grad_norm": 0.43884843587875366, + "learning_rate": 0.00019934092606471192, + "loss": 1.2189, + "step": 26280 + }, + { + "epoch": 0.05585657459396955, + "grad_norm": 0.48190683126449585, + "learning_rate": 0.0001993401451524318, + "loss": 1.2529, + "step": 26290 + }, + { + "epoch": 0.055877820913708603, + "grad_norm": 0.37671732902526855, + "learning_rate": 0.00019933936377932, + "loss": 1.2643, + "step": 26300 + }, + { + "epoch": 0.055899067233447654, + "grad_norm": 0.5260568857192993, + "learning_rate": 0.00019933858194538004, + "loss": 1.2561, + "step": 26310 + }, + { + "epoch": 0.05592031355318671, + "grad_norm": 0.42680272459983826, + "learning_rate": 0.00019933779965061564, + "loss": 1.2123, + "step": 26320 + }, + { + "epoch": 0.05594155987292576, + "grad_norm": 0.7915481328964233, + "learning_rate": 0.00019933701689503044, + "loss": 1.2251, + "step": 26330 + }, + { + "epoch": 0.05596280619266481, + "grad_norm": 0.3681657910346985, + "learning_rate": 0.000199336233678628, + "loss": 1.1978, + "step": 26340 + }, + { + "epoch": 0.05598405251240387, + "grad_norm": 0.4174920916557312, + "learning_rate": 0.000199335450001412, + "loss": 1.2513, + "step": 26350 + }, + { + "epoch": 0.05600529883214292, + "grad_norm": 0.4508395791053772, + "learning_rate": 0.00019933466586338606, + "loss": 1.2501, + "step": 26360 + }, + { + "epoch": 0.05602654515188197, + "grad_norm": 0.5104049444198608, + "learning_rate": 0.00019933388126455387, + "loss": 1.2583, + "step": 26370 + }, + { + "epoch": 0.05604779147162103, + "grad_norm": 0.437319815158844, + "learning_rate": 0.00019933309620491895, + "loss": 1.2011, + "step": 26380 + }, + { + "epoch": 0.05606903779136008, + "grad_norm": 0.37454915046691895, + "learning_rate": 0.00019933231068448504, + "loss": 1.2664, + "step": 26390 + }, + { + "epoch": 0.05609028411109913, + "grad_norm": 0.38629868626594543, + "learning_rate": 0.00019933152470325574, + "loss": 1.2331, + "step": 26400 + }, + { + "epoch": 0.056111530430838184, + "grad_norm": 0.4679853916168213, + "learning_rate": 0.00019933073826123473, + "loss": 1.2502, + "step": 26410 + }, + { + "epoch": 0.056132776750577235, + "grad_norm": 0.5815054178237915, + "learning_rate": 0.00019932995135842565, + "loss": 1.2251, + "step": 26420 + }, + { + "epoch": 0.05615402307031629, + "grad_norm": 0.4611288607120514, + "learning_rate": 0.00019932916399483213, + "loss": 1.2466, + "step": 26430 + }, + { + "epoch": 0.05617526939005534, + "grad_norm": 0.33474311232566833, + "learning_rate": 0.00019932837617045785, + "loss": 1.2134, + "step": 26440 + }, + { + "epoch": 0.05619651570979439, + "grad_norm": 0.43156948685646057, + "learning_rate": 0.00019932758788530645, + "loss": 1.255, + "step": 26450 + }, + { + "epoch": 0.05621776202953345, + "grad_norm": 0.7407731413841248, + "learning_rate": 0.00019932679913938154, + "loss": 1.2177, + "step": 26460 + }, + { + "epoch": 0.0562390083492725, + "grad_norm": 0.7561128735542297, + "learning_rate": 0.00019932600993268688, + "loss": 1.2154, + "step": 26470 + }, + { + "epoch": 0.05626025466901155, + "grad_norm": 0.6056798696517944, + "learning_rate": 0.00019932522026522604, + "loss": 1.2485, + "step": 26480 + }, + { + "epoch": 0.05628150098875061, + "grad_norm": 0.41564950346946716, + "learning_rate": 0.00019932443013700275, + "loss": 1.2663, + "step": 26490 + }, + { + "epoch": 0.05630274730848966, + "grad_norm": 0.34920766949653625, + "learning_rate": 0.0001993236395480206, + "loss": 1.2406, + "step": 26500 + }, + { + "epoch": 0.05632399362822871, + "grad_norm": 0.34464290738105774, + "learning_rate": 0.00019932284849828334, + "loss": 1.2369, + "step": 26510 + }, + { + "epoch": 0.056345239947967765, + "grad_norm": 0.3870026171207428, + "learning_rate": 0.00019932205698779457, + "loss": 1.2507, + "step": 26520 + }, + { + "epoch": 0.056366486267706815, + "grad_norm": 0.43350833654403687, + "learning_rate": 0.00019932126501655804, + "loss": 1.232, + "step": 26530 + }, + { + "epoch": 0.056387732587445866, + "grad_norm": 0.366701602935791, + "learning_rate": 0.00019932047258457733, + "loss": 1.233, + "step": 26540 + }, + { + "epoch": 0.05640897890718492, + "grad_norm": 0.4627804458141327, + "learning_rate": 0.00019931967969185618, + "loss": 1.2525, + "step": 26550 + }, + { + "epoch": 0.05643022522692397, + "grad_norm": 0.3454352915287018, + "learning_rate": 0.00019931888633839827, + "loss": 1.2594, + "step": 26560 + }, + { + "epoch": 0.056451471546663023, + "grad_norm": 0.42362430691719055, + "learning_rate": 0.00019931809252420727, + "loss": 1.2277, + "step": 26570 + }, + { + "epoch": 0.05647271786640208, + "grad_norm": 0.6166868805885315, + "learning_rate": 0.0001993172982492868, + "loss": 1.2534, + "step": 26580 + }, + { + "epoch": 0.05649396418614113, + "grad_norm": 0.5298596620559692, + "learning_rate": 0.0001993165035136406, + "loss": 1.2499, + "step": 26590 + }, + { + "epoch": 0.05651521050588018, + "grad_norm": 0.36582833528518677, + "learning_rate": 0.0001993157083172724, + "loss": 1.2499, + "step": 26600 + }, + { + "epoch": 0.05653645682561924, + "grad_norm": 0.37887901067733765, + "learning_rate": 0.00019931491266018584, + "loss": 1.2597, + "step": 26610 + }, + { + "epoch": 0.05655770314535829, + "grad_norm": 0.34627610445022583, + "learning_rate": 0.0001993141165423846, + "loss": 1.2512, + "step": 26620 + }, + { + "epoch": 0.056578949465097346, + "grad_norm": 0.37451913952827454, + "learning_rate": 0.00019931331996387238, + "loss": 1.231, + "step": 26630 + }, + { + "epoch": 0.056600195784836396, + "grad_norm": 0.3893952965736389, + "learning_rate": 0.0001993125229246529, + "loss": 1.2302, + "step": 26640 + }, + { + "epoch": 0.056621442104575447, + "grad_norm": 0.6097297668457031, + "learning_rate": 0.0001993117254247298, + "loss": 1.2323, + "step": 26650 + }, + { + "epoch": 0.056642688424314504, + "grad_norm": 0.387380450963974, + "learning_rate": 0.00019931092746410689, + "loss": 1.2244, + "step": 26660 + }, + { + "epoch": 0.056663934744053554, + "grad_norm": 0.4020083546638489, + "learning_rate": 0.00019931012904278775, + "loss": 1.2327, + "step": 26670 + }, + { + "epoch": 0.056685181063792604, + "grad_norm": 0.4695853590965271, + "learning_rate": 0.00019930933016077619, + "loss": 1.2612, + "step": 26680 + }, + { + "epoch": 0.05670642738353166, + "grad_norm": 0.3443182408809662, + "learning_rate": 0.0001993085308180758, + "loss": 1.245, + "step": 26690 + }, + { + "epoch": 0.05672767370327071, + "grad_norm": 0.38462138175964355, + "learning_rate": 0.00019930773101469037, + "loss": 1.2527, + "step": 26700 + }, + { + "epoch": 0.05674892002300976, + "grad_norm": 0.9288682341575623, + "learning_rate": 0.0001993069307506236, + "loss": 1.2134, + "step": 26710 + }, + { + "epoch": 0.05677016634274882, + "grad_norm": 0.5551433563232422, + "learning_rate": 0.0001993061300258792, + "loss": 1.2144, + "step": 26720 + }, + { + "epoch": 0.05679141266248787, + "grad_norm": 0.3511385917663574, + "learning_rate": 0.00019930532884046085, + "loss": 1.2481, + "step": 26730 + }, + { + "epoch": 0.05681265898222692, + "grad_norm": 0.4086706340312958, + "learning_rate": 0.0001993045271943723, + "loss": 1.2134, + "step": 26740 + }, + { + "epoch": 0.05683390530196598, + "grad_norm": 0.36554211378097534, + "learning_rate": 0.0001993037250876173, + "loss": 1.2857, + "step": 26750 + }, + { + "epoch": 0.05685515162170503, + "grad_norm": 0.3903580605983734, + "learning_rate": 0.0001993029225201995, + "loss": 1.2363, + "step": 26760 + }, + { + "epoch": 0.05687639794144408, + "grad_norm": 0.35991621017456055, + "learning_rate": 0.00019930211949212266, + "loss": 1.1997, + "step": 26770 + }, + { + "epoch": 0.056897644261183135, + "grad_norm": 0.4790947139263153, + "learning_rate": 0.0001993013160033905, + "loss": 1.2471, + "step": 26780 + }, + { + "epoch": 0.056918890580922185, + "grad_norm": 0.5423392057418823, + "learning_rate": 0.00019930051205400675, + "loss": 1.2572, + "step": 26790 + }, + { + "epoch": 0.056940136900661235, + "grad_norm": 0.8387675881385803, + "learning_rate": 0.00019929970764397513, + "loss": 1.2575, + "step": 26800 + }, + { + "epoch": 0.05696138322040029, + "grad_norm": 0.3756154775619507, + "learning_rate": 0.0001992989027732994, + "loss": 1.1874, + "step": 26810 + }, + { + "epoch": 0.05698262954013934, + "grad_norm": 0.46651408076286316, + "learning_rate": 0.00019929809744198326, + "loss": 1.2303, + "step": 26820 + }, + { + "epoch": 0.0570038758598784, + "grad_norm": 0.35634106397628784, + "learning_rate": 0.0001992972916500305, + "loss": 1.2265, + "step": 26830 + }, + { + "epoch": 0.05702512217961745, + "grad_norm": 0.33929142355918884, + "learning_rate": 0.00019929648539744477, + "loss": 1.2245, + "step": 26840 + }, + { + "epoch": 0.0570463684993565, + "grad_norm": 0.49925971031188965, + "learning_rate": 0.00019929567868422985, + "loss": 1.2248, + "step": 26850 + }, + { + "epoch": 0.05706761481909556, + "grad_norm": 0.4338636100292206, + "learning_rate": 0.00019929487151038952, + "loss": 1.2489, + "step": 26860 + }, + { + "epoch": 0.05708886113883461, + "grad_norm": 0.45062291622161865, + "learning_rate": 0.00019929406387592748, + "loss": 1.2959, + "step": 26870 + }, + { + "epoch": 0.05711010745857366, + "grad_norm": 0.37378257513046265, + "learning_rate": 0.00019929325578084753, + "loss": 1.2606, + "step": 26880 + }, + { + "epoch": 0.057131353778312716, + "grad_norm": 0.34317049384117126, + "learning_rate": 0.00019929244722515332, + "loss": 1.2472, + "step": 26890 + }, + { + "epoch": 0.057152600098051766, + "grad_norm": 0.7158188819885254, + "learning_rate": 0.00019929163820884868, + "loss": 1.1815, + "step": 26900 + }, + { + "epoch": 0.057173846417790816, + "grad_norm": 0.5053520798683167, + "learning_rate": 0.00019929082873193737, + "loss": 1.2316, + "step": 26910 + }, + { + "epoch": 0.05719509273752987, + "grad_norm": 0.36840149760246277, + "learning_rate": 0.00019929001879442307, + "loss": 1.2234, + "step": 26920 + }, + { + "epoch": 0.057216339057268924, + "grad_norm": 0.36567768454551697, + "learning_rate": 0.00019928920839630966, + "loss": 1.2591, + "step": 26930 + }, + { + "epoch": 0.057237585377007974, + "grad_norm": 0.35374715924263, + "learning_rate": 0.00019928839753760076, + "loss": 1.2515, + "step": 26940 + }, + { + "epoch": 0.05725883169674703, + "grad_norm": 0.41577181220054626, + "learning_rate": 0.00019928758621830023, + "loss": 1.2593, + "step": 26950 + }, + { + "epoch": 0.05728007801648608, + "grad_norm": 0.4029598832130432, + "learning_rate": 0.00019928677443841177, + "loss": 1.2712, + "step": 26960 + }, + { + "epoch": 0.05730132433622513, + "grad_norm": 0.3599306046962738, + "learning_rate": 0.0001992859621979392, + "loss": 1.2177, + "step": 26970 + }, + { + "epoch": 0.05732257065596419, + "grad_norm": 0.3542388677597046, + "learning_rate": 0.00019928514949688625, + "loss": 1.1907, + "step": 26980 + }, + { + "epoch": 0.05734381697570324, + "grad_norm": 0.335592120885849, + "learning_rate": 0.00019928433633525672, + "loss": 1.1989, + "step": 26990 + }, + { + "epoch": 0.05736506329544229, + "grad_norm": 0.4299246370792389, + "learning_rate": 0.00019928352271305437, + "loss": 1.2146, + "step": 27000 + }, + { + "epoch": 0.05738630961518135, + "grad_norm": 0.40783339738845825, + "learning_rate": 0.00019928270863028296, + "loss": 1.1869, + "step": 27010 + }, + { + "epoch": 0.0574075559349204, + "grad_norm": 0.42735210061073303, + "learning_rate": 0.00019928189408694626, + "loss": 1.2193, + "step": 27020 + }, + { + "epoch": 0.05742880225465945, + "grad_norm": 0.39111292362213135, + "learning_rate": 0.00019928107908304807, + "loss": 1.2232, + "step": 27030 + }, + { + "epoch": 0.057450048574398505, + "grad_norm": 0.46430832147598267, + "learning_rate": 0.0001992802636185922, + "loss": 1.2089, + "step": 27040 + }, + { + "epoch": 0.057471294894137555, + "grad_norm": 0.33045604825019836, + "learning_rate": 0.0001992794476935824, + "loss": 1.2393, + "step": 27050 + }, + { + "epoch": 0.05749254121387661, + "grad_norm": 0.6917517185211182, + "learning_rate": 0.00019927863130802243, + "loss": 1.2439, + "step": 27060 + }, + { + "epoch": 0.05751378753361566, + "grad_norm": 0.46373847126960754, + "learning_rate": 0.00019927781446191608, + "loss": 1.2578, + "step": 27070 + }, + { + "epoch": 0.05753503385335471, + "grad_norm": 0.35305726528167725, + "learning_rate": 0.0001992769971552672, + "loss": 1.2371, + "step": 27080 + }, + { + "epoch": 0.05755628017309377, + "grad_norm": 0.3670641779899597, + "learning_rate": 0.00019927617938807953, + "loss": 1.2335, + "step": 27090 + }, + { + "epoch": 0.05757752649283282, + "grad_norm": 0.38989850878715515, + "learning_rate": 0.0001992753611603569, + "loss": 1.2302, + "step": 27100 + }, + { + "epoch": 0.05759877281257187, + "grad_norm": 0.40657100081443787, + "learning_rate": 0.000199274542472103, + "loss": 1.2638, + "step": 27110 + }, + { + "epoch": 0.05762001913231093, + "grad_norm": 0.39339008927345276, + "learning_rate": 0.00019927372332332179, + "loss": 1.2154, + "step": 27120 + }, + { + "epoch": 0.05764126545204998, + "grad_norm": 0.4430629312992096, + "learning_rate": 0.00019927290371401696, + "loss": 1.2153, + "step": 27130 + }, + { + "epoch": 0.05766251177178903, + "grad_norm": 0.3563041090965271, + "learning_rate": 0.00019927208364419236, + "loss": 1.2275, + "step": 27140 + }, + { + "epoch": 0.057683758091528085, + "grad_norm": 0.3823634386062622, + "learning_rate": 0.00019927126311385175, + "loss": 1.2274, + "step": 27150 + }, + { + "epoch": 0.057705004411267136, + "grad_norm": 0.4245425760746002, + "learning_rate": 0.000199270442122999, + "loss": 1.23, + "step": 27160 + }, + { + "epoch": 0.057726250731006186, + "grad_norm": 0.393598347902298, + "learning_rate": 0.00019926962067163783, + "loss": 1.2542, + "step": 27170 + }, + { + "epoch": 0.05774749705074524, + "grad_norm": 0.3846806585788727, + "learning_rate": 0.00019926879875977217, + "loss": 1.2894, + "step": 27180 + }, + { + "epoch": 0.05776874337048429, + "grad_norm": 0.36283355951309204, + "learning_rate": 0.00019926797638740567, + "loss": 1.2484, + "step": 27190 + }, + { + "epoch": 0.057789989690223344, + "grad_norm": 0.4263078570365906, + "learning_rate": 0.00019926715355454233, + "loss": 1.1838, + "step": 27200 + }, + { + "epoch": 0.0578112360099624, + "grad_norm": 0.49770215153694153, + "learning_rate": 0.00019926633026118583, + "loss": 1.1939, + "step": 27210 + }, + { + "epoch": 0.05783248232970145, + "grad_norm": 0.45647650957107544, + "learning_rate": 0.00019926550650734004, + "loss": 1.1933, + "step": 27220 + }, + { + "epoch": 0.0578537286494405, + "grad_norm": 0.4968070685863495, + "learning_rate": 0.0001992646822930088, + "loss": 1.2497, + "step": 27230 + }, + { + "epoch": 0.05787497496917956, + "grad_norm": 0.38042309880256653, + "learning_rate": 0.00019926385761819587, + "loss": 1.2501, + "step": 27240 + }, + { + "epoch": 0.05789622128891861, + "grad_norm": 0.37272346019744873, + "learning_rate": 0.00019926303248290515, + "loss": 1.2655, + "step": 27250 + }, + { + "epoch": 0.057917467608657666, + "grad_norm": 0.343142569065094, + "learning_rate": 0.0001992622068871404, + "loss": 1.2015, + "step": 27260 + }, + { + "epoch": 0.057938713928396716, + "grad_norm": 0.47782501578330994, + "learning_rate": 0.0001992613808309055, + "loss": 1.2206, + "step": 27270 + }, + { + "epoch": 0.05795996024813577, + "grad_norm": 0.46555984020233154, + "learning_rate": 0.00019926055431420428, + "loss": 1.205, + "step": 27280 + }, + { + "epoch": 0.057981206567874824, + "grad_norm": 0.573769211769104, + "learning_rate": 0.00019925972733704056, + "loss": 1.2191, + "step": 27290 + }, + { + "epoch": 0.058002452887613874, + "grad_norm": 0.37154537439346313, + "learning_rate": 0.00019925889989941812, + "loss": 1.2207, + "step": 27300 + }, + { + "epoch": 0.058023699207352925, + "grad_norm": 0.40567246079444885, + "learning_rate": 0.00019925807200134093, + "loss": 1.2562, + "step": 27310 + }, + { + "epoch": 0.05804494552709198, + "grad_norm": 0.638820469379425, + "learning_rate": 0.0001992572436428127, + "loss": 1.2365, + "step": 27320 + }, + { + "epoch": 0.05806619184683103, + "grad_norm": 0.6243806481361389, + "learning_rate": 0.00019925641482383734, + "loss": 1.2154, + "step": 27330 + }, + { + "epoch": 0.05808743816657008, + "grad_norm": 0.49091771245002747, + "learning_rate": 0.00019925558554441869, + "loss": 1.2477, + "step": 27340 + }, + { + "epoch": 0.05810868448630914, + "grad_norm": 0.38903501629829407, + "learning_rate": 0.00019925475580456062, + "loss": 1.2543, + "step": 27350 + }, + { + "epoch": 0.05812993080604819, + "grad_norm": 0.4994620978832245, + "learning_rate": 0.0001992539256042669, + "loss": 1.2285, + "step": 27360 + }, + { + "epoch": 0.05815117712578724, + "grad_norm": 0.6112956404685974, + "learning_rate": 0.00019925309494354148, + "loss": 1.2314, + "step": 27370 + }, + { + "epoch": 0.0581724234455263, + "grad_norm": 0.3942379951477051, + "learning_rate": 0.0001992522638223881, + "loss": 1.2436, + "step": 27380 + }, + { + "epoch": 0.05819366976526535, + "grad_norm": 0.33499571681022644, + "learning_rate": 0.00019925143224081072, + "loss": 1.2407, + "step": 27390 + }, + { + "epoch": 0.0582149160850044, + "grad_norm": 0.6233252882957458, + "learning_rate": 0.00019925060019881317, + "loss": 1.2374, + "step": 27400 + }, + { + "epoch": 0.058236162404743455, + "grad_norm": 0.4064808189868927, + "learning_rate": 0.00019924976769639927, + "loss": 1.2559, + "step": 27410 + }, + { + "epoch": 0.058257408724482505, + "grad_norm": 0.3272790312767029, + "learning_rate": 0.00019924893473357292, + "loss": 1.2375, + "step": 27420 + }, + { + "epoch": 0.058278655044221556, + "grad_norm": 0.40281054377555847, + "learning_rate": 0.00019924810131033795, + "loss": 1.2332, + "step": 27430 + }, + { + "epoch": 0.05829990136396061, + "grad_norm": 0.3923066258430481, + "learning_rate": 0.00019924726742669827, + "loss": 1.2344, + "step": 27440 + }, + { + "epoch": 0.05832114768369966, + "grad_norm": 0.400854229927063, + "learning_rate": 0.0001992464330826577, + "loss": 1.2332, + "step": 27450 + }, + { + "epoch": 0.05834239400343871, + "grad_norm": 0.3833383619785309, + "learning_rate": 0.00019924559827822014, + "loss": 1.2065, + "step": 27460 + }, + { + "epoch": 0.05836364032317777, + "grad_norm": 0.5839700102806091, + "learning_rate": 0.00019924476301338948, + "loss": 1.2519, + "step": 27470 + }, + { + "epoch": 0.05838488664291682, + "grad_norm": 0.3553745150566101, + "learning_rate": 0.00019924392728816958, + "loss": 1.2648, + "step": 27480 + }, + { + "epoch": 0.05840613296265588, + "grad_norm": 0.3288906514644623, + "learning_rate": 0.0001992430911025643, + "loss": 1.2752, + "step": 27490 + }, + { + "epoch": 0.05842737928239493, + "grad_norm": 0.32888591289520264, + "learning_rate": 0.00019924225445657753, + "loss": 1.2754, + "step": 27500 + }, + { + "epoch": 0.05844862560213398, + "grad_norm": 0.49854809045791626, + "learning_rate": 0.00019924141735021316, + "loss": 1.2108, + "step": 27510 + }, + { + "epoch": 0.058469871921873036, + "grad_norm": 0.583575963973999, + "learning_rate": 0.00019924057978347505, + "loss": 1.2426, + "step": 27520 + }, + { + "epoch": 0.058491118241612086, + "grad_norm": 0.3764345645904541, + "learning_rate": 0.0001992397417563671, + "loss": 1.2546, + "step": 27530 + }, + { + "epoch": 0.058512364561351136, + "grad_norm": 0.3767249584197998, + "learning_rate": 0.00019923890326889321, + "loss": 1.234, + "step": 27540 + }, + { + "epoch": 0.058533610881090194, + "grad_norm": 0.38229361176490784, + "learning_rate": 0.00019923806432105726, + "loss": 1.2424, + "step": 27550 + }, + { + "epoch": 0.058554857200829244, + "grad_norm": 0.425687700510025, + "learning_rate": 0.00019923722491286313, + "loss": 1.2301, + "step": 27560 + }, + { + "epoch": 0.058576103520568294, + "grad_norm": 0.5638928413391113, + "learning_rate": 0.00019923638504431473, + "loss": 1.2314, + "step": 27570 + }, + { + "epoch": 0.05859734984030735, + "grad_norm": 0.37766826152801514, + "learning_rate": 0.00019923554471541592, + "loss": 1.2614, + "step": 27580 + }, + { + "epoch": 0.0586185961600464, + "grad_norm": 0.40337204933166504, + "learning_rate": 0.00019923470392617065, + "loss": 1.263, + "step": 27590 + }, + { + "epoch": 0.05863984247978545, + "grad_norm": 0.36984485387802124, + "learning_rate": 0.0001992338626765828, + "loss": 1.2436, + "step": 27600 + }, + { + "epoch": 0.05866108879952451, + "grad_norm": 0.3385940194129944, + "learning_rate": 0.00019923302096665628, + "loss": 1.2472, + "step": 27610 + }, + { + "epoch": 0.05868233511926356, + "grad_norm": 0.4343907833099365, + "learning_rate": 0.00019923217879639499, + "loss": 1.2597, + "step": 27620 + }, + { + "epoch": 0.05870358143900261, + "grad_norm": 0.42626217007637024, + "learning_rate": 0.00019923133616580279, + "loss": 1.2324, + "step": 27630 + }, + { + "epoch": 0.05872482775874167, + "grad_norm": 0.3357534408569336, + "learning_rate": 0.00019923049307488366, + "loss": 1.2208, + "step": 27640 + }, + { + "epoch": 0.05874607407848072, + "grad_norm": 0.47251835465431213, + "learning_rate": 0.00019922964952364148, + "loss": 1.2124, + "step": 27650 + }, + { + "epoch": 0.05876732039821977, + "grad_norm": 0.4115696847438812, + "learning_rate": 0.00019922880551208013, + "loss": 1.2222, + "step": 27660 + }, + { + "epoch": 0.058788566717958825, + "grad_norm": 0.3703017830848694, + "learning_rate": 0.00019922796104020362, + "loss": 1.2244, + "step": 27670 + }, + { + "epoch": 0.058809813037697875, + "grad_norm": 0.4239181578159332, + "learning_rate": 0.00019922711610801573, + "loss": 1.2433, + "step": 27680 + }, + { + "epoch": 0.05883105935743693, + "grad_norm": 0.5730394721031189, + "learning_rate": 0.00019922627071552052, + "loss": 1.2297, + "step": 27690 + }, + { + "epoch": 0.05885230567717598, + "grad_norm": 0.33616921305656433, + "learning_rate": 0.00019922542486272183, + "loss": 1.2328, + "step": 27700 + }, + { + "epoch": 0.05887355199691503, + "grad_norm": 0.39118120074272156, + "learning_rate": 0.00019922457854962362, + "loss": 1.2191, + "step": 27710 + }, + { + "epoch": 0.05889479831665409, + "grad_norm": 0.3232974410057068, + "learning_rate": 0.00019922373177622978, + "loss": 1.2859, + "step": 27720 + }, + { + "epoch": 0.05891604463639314, + "grad_norm": 0.4404892027378082, + "learning_rate": 0.00019922288454254426, + "loss": 1.2052, + "step": 27730 + }, + { + "epoch": 0.05893729095613219, + "grad_norm": 0.6084858775138855, + "learning_rate": 0.000199222036848571, + "loss": 1.2244, + "step": 27740 + }, + { + "epoch": 0.05895853727587125, + "grad_norm": 0.5767728686332703, + "learning_rate": 0.00019922118869431389, + "loss": 1.2268, + "step": 27750 + }, + { + "epoch": 0.0589797835956103, + "grad_norm": 0.45109763741493225, + "learning_rate": 0.00019922034007977693, + "loss": 1.2548, + "step": 27760 + }, + { + "epoch": 0.05900102991534935, + "grad_norm": 0.3936637341976166, + "learning_rate": 0.00019921949100496397, + "loss": 1.2592, + "step": 27770 + }, + { + "epoch": 0.059022276235088406, + "grad_norm": 0.4737092852592468, + "learning_rate": 0.00019921864146987906, + "loss": 1.2599, + "step": 27780 + }, + { + "epoch": 0.059043522554827456, + "grad_norm": 0.4265700876712799, + "learning_rate": 0.00019921779147452604, + "loss": 1.2638, + "step": 27790 + }, + { + "epoch": 0.059064768874566506, + "grad_norm": 0.4902001619338989, + "learning_rate": 0.0001992169410189089, + "loss": 1.211, + "step": 27800 + }, + { + "epoch": 0.05908601519430556, + "grad_norm": 0.34535956382751465, + "learning_rate": 0.0001992160901030316, + "loss": 1.185, + "step": 27810 + }, + { + "epoch": 0.059107261514044614, + "grad_norm": 0.7175641059875488, + "learning_rate": 0.00019921523872689804, + "loss": 1.2405, + "step": 27820 + }, + { + "epoch": 0.059128507833783664, + "grad_norm": 0.44840019941329956, + "learning_rate": 0.0001992143868905122, + "loss": 1.2205, + "step": 27830 + }, + { + "epoch": 0.05914975415352272, + "grad_norm": 0.38904067873954773, + "learning_rate": 0.00019921353459387801, + "loss": 1.2342, + "step": 27840 + }, + { + "epoch": 0.05917100047326177, + "grad_norm": 0.5322742462158203, + "learning_rate": 0.00019921268183699948, + "loss": 1.2328, + "step": 27850 + }, + { + "epoch": 0.05919224679300082, + "grad_norm": 0.384998083114624, + "learning_rate": 0.0001992118286198805, + "loss": 1.2047, + "step": 27860 + }, + { + "epoch": 0.05921349311273988, + "grad_norm": 0.7101864814758301, + "learning_rate": 0.00019921097494252505, + "loss": 1.2375, + "step": 27870 + }, + { + "epoch": 0.05923473943247893, + "grad_norm": 0.3524293303489685, + "learning_rate": 0.0001992101208049371, + "loss": 1.25, + "step": 27880 + }, + { + "epoch": 0.059255985752217986, + "grad_norm": 0.3777482509613037, + "learning_rate": 0.00019920926620712063, + "loss": 1.1946, + "step": 27890 + }, + { + "epoch": 0.05927723207195704, + "grad_norm": 0.589065432548523, + "learning_rate": 0.00019920841114907955, + "loss": 1.2392, + "step": 27900 + }, + { + "epoch": 0.05929847839169609, + "grad_norm": 0.5028794407844543, + "learning_rate": 0.0001992075556308179, + "loss": 1.2417, + "step": 27910 + }, + { + "epoch": 0.059319724711435144, + "grad_norm": 0.3744438588619232, + "learning_rate": 0.00019920669965233957, + "loss": 1.22, + "step": 27920 + }, + { + "epoch": 0.059340971031174194, + "grad_norm": 0.41032877564430237, + "learning_rate": 0.00019920584321364858, + "loss": 1.1897, + "step": 27930 + }, + { + "epoch": 0.059362217350913245, + "grad_norm": 0.38143011927604675, + "learning_rate": 0.0001992049863147489, + "loss": 1.2573, + "step": 27940 + }, + { + "epoch": 0.0593834636706523, + "grad_norm": 0.41036319732666016, + "learning_rate": 0.00019920412895564447, + "loss": 1.2219, + "step": 27950 + }, + { + "epoch": 0.05940470999039135, + "grad_norm": 0.3770989179611206, + "learning_rate": 0.00019920327113633933, + "loss": 1.2496, + "step": 27960 + }, + { + "epoch": 0.0594259563101304, + "grad_norm": 0.4297034442424774, + "learning_rate": 0.00019920241285683736, + "loss": 1.2353, + "step": 27970 + }, + { + "epoch": 0.05944720262986946, + "grad_norm": 0.3659209609031677, + "learning_rate": 0.00019920155411714267, + "loss": 1.2374, + "step": 27980 + }, + { + "epoch": 0.05946844894960851, + "grad_norm": 0.435093492269516, + "learning_rate": 0.00019920069491725912, + "loss": 1.2613, + "step": 27990 + }, + { + "epoch": 0.05948969526934756, + "grad_norm": 0.5230461955070496, + "learning_rate": 0.00019919983525719078, + "loss": 1.2655, + "step": 28000 + }, + { + "epoch": 0.05951094158908662, + "grad_norm": 0.35952746868133545, + "learning_rate": 0.00019919897513694164, + "loss": 1.2184, + "step": 28010 + }, + { + "epoch": 0.05953218790882567, + "grad_norm": 0.3723292648792267, + "learning_rate": 0.0001991981145565156, + "loss": 1.2279, + "step": 28020 + }, + { + "epoch": 0.05955343422856472, + "grad_norm": 0.34009993076324463, + "learning_rate": 0.00019919725351591675, + "loss": 1.2402, + "step": 28030 + }, + { + "epoch": 0.059574680548303775, + "grad_norm": 0.44344282150268555, + "learning_rate": 0.00019919639201514906, + "loss": 1.2444, + "step": 28040 + }, + { + "epoch": 0.059595926868042826, + "grad_norm": 0.6931393146514893, + "learning_rate": 0.0001991955300542165, + "loss": 1.2312, + "step": 28050 + }, + { + "epoch": 0.059617173187781876, + "grad_norm": 0.8838579058647156, + "learning_rate": 0.00019919466763312308, + "loss": 1.2156, + "step": 28060 + }, + { + "epoch": 0.05963841950752093, + "grad_norm": 0.3899818956851959, + "learning_rate": 0.0001991938047518728, + "loss": 1.2104, + "step": 28070 + }, + { + "epoch": 0.05965966582725998, + "grad_norm": 0.6086620688438416, + "learning_rate": 0.00019919294141046968, + "loss": 1.2276, + "step": 28080 + }, + { + "epoch": 0.059680912146999034, + "grad_norm": 0.36951541900634766, + "learning_rate": 0.0001991920776089177, + "loss": 1.2504, + "step": 28090 + }, + { + "epoch": 0.05970215846673809, + "grad_norm": 0.3597952723503113, + "learning_rate": 0.00019919121334722087, + "loss": 1.2318, + "step": 28100 + }, + { + "epoch": 0.05972340478647714, + "grad_norm": 0.8102067708969116, + "learning_rate": 0.00019919034862538322, + "loss": 1.26, + "step": 28110 + }, + { + "epoch": 0.0597446511062162, + "grad_norm": 0.4715833365917206, + "learning_rate": 0.00019918948344340874, + "loss": 1.2403, + "step": 28120 + }, + { + "epoch": 0.05976589742595525, + "grad_norm": 0.36132654547691345, + "learning_rate": 0.00019918861780130145, + "loss": 1.2699, + "step": 28130 + }, + { + "epoch": 0.0597871437456943, + "grad_norm": 0.36443325877189636, + "learning_rate": 0.00019918775169906535, + "loss": 1.2427, + "step": 28140 + }, + { + "epoch": 0.059808390065433356, + "grad_norm": 0.5197269320487976, + "learning_rate": 0.0001991868851367045, + "loss": 1.2254, + "step": 28150 + }, + { + "epoch": 0.059829636385172406, + "grad_norm": 0.38131266832351685, + "learning_rate": 0.00019918601811422292, + "loss": 1.2628, + "step": 28160 + }, + { + "epoch": 0.05985088270491146, + "grad_norm": 0.35624027252197266, + "learning_rate": 0.00019918515063162458, + "loss": 1.2373, + "step": 28170 + }, + { + "epoch": 0.059872129024650514, + "grad_norm": 0.404313325881958, + "learning_rate": 0.00019918428268891352, + "loss": 1.2602, + "step": 28180 + }, + { + "epoch": 0.059893375344389564, + "grad_norm": 0.5410498380661011, + "learning_rate": 0.00019918341428609381, + "loss": 1.2462, + "step": 28190 + }, + { + "epoch": 0.059914621664128614, + "grad_norm": 0.3632294535636902, + "learning_rate": 0.00019918254542316942, + "loss": 1.2283, + "step": 28200 + }, + { + "epoch": 0.05993586798386767, + "grad_norm": 0.4485985040664673, + "learning_rate": 0.00019918167610014442, + "loss": 1.2457, + "step": 28210 + }, + { + "epoch": 0.05995711430360672, + "grad_norm": 0.3656270205974579, + "learning_rate": 0.00019918080631702283, + "loss": 1.2061, + "step": 28220 + }, + { + "epoch": 0.05997836062334577, + "grad_norm": 0.39489516615867615, + "learning_rate": 0.00019917993607380867, + "loss": 1.2637, + "step": 28230 + }, + { + "epoch": 0.05999960694308483, + "grad_norm": 0.4411980211734772, + "learning_rate": 0.000199179065370506, + "loss": 1.2308, + "step": 28240 + }, + { + "epoch": 0.06002085326282388, + "grad_norm": 0.44476738572120667, + "learning_rate": 0.00019917819420711886, + "loss": 1.292, + "step": 28250 + }, + { + "epoch": 0.06004209958256293, + "grad_norm": 0.4763082265853882, + "learning_rate": 0.00019917732258365126, + "loss": 1.2459, + "step": 28260 + }, + { + "epoch": 0.06006334590230199, + "grad_norm": 0.38731512427330017, + "learning_rate": 0.0001991764505001073, + "loss": 1.2636, + "step": 28270 + }, + { + "epoch": 0.06008459222204104, + "grad_norm": 0.42874327301979065, + "learning_rate": 0.00019917557795649096, + "loss": 1.2228, + "step": 28280 + }, + { + "epoch": 0.06010583854178009, + "grad_norm": 0.4948899447917938, + "learning_rate": 0.00019917470495280632, + "loss": 1.2944, + "step": 28290 + }, + { + "epoch": 0.060127084861519145, + "grad_norm": 0.34330594539642334, + "learning_rate": 0.00019917383148905746, + "loss": 1.2566, + "step": 28300 + }, + { + "epoch": 0.060148331181258195, + "grad_norm": 0.32125070691108704, + "learning_rate": 0.00019917295756524838, + "loss": 1.2301, + "step": 28310 + }, + { + "epoch": 0.06016957750099725, + "grad_norm": 0.6187619566917419, + "learning_rate": 0.00019917208318138315, + "loss": 1.223, + "step": 28320 + }, + { + "epoch": 0.0601908238207363, + "grad_norm": 0.36798009276390076, + "learning_rate": 0.00019917120833746584, + "loss": 1.2168, + "step": 28330 + }, + { + "epoch": 0.06021207014047535, + "grad_norm": 0.37591874599456787, + "learning_rate": 0.0001991703330335005, + "loss": 1.2298, + "step": 28340 + }, + { + "epoch": 0.06023331646021441, + "grad_norm": 0.39807242155075073, + "learning_rate": 0.0001991694572694912, + "loss": 1.2275, + "step": 28350 + }, + { + "epoch": 0.06025456277995346, + "grad_norm": 0.4441433548927307, + "learning_rate": 0.00019916858104544196, + "loss": 1.2359, + "step": 28360 + }, + { + "epoch": 0.06027580909969251, + "grad_norm": 0.38866713643074036, + "learning_rate": 0.00019916770436135688, + "loss": 1.2206, + "step": 28370 + }, + { + "epoch": 0.06029705541943157, + "grad_norm": 0.3693794906139374, + "learning_rate": 0.00019916682721724004, + "loss": 1.2347, + "step": 28380 + }, + { + "epoch": 0.06031830173917062, + "grad_norm": 0.41298213601112366, + "learning_rate": 0.0001991659496130955, + "loss": 1.2202, + "step": 28390 + }, + { + "epoch": 0.06033954805890967, + "grad_norm": 0.518657386302948, + "learning_rate": 0.0001991650715489273, + "loss": 1.2474, + "step": 28400 + }, + { + "epoch": 0.060360794378648726, + "grad_norm": 0.48045384883880615, + "learning_rate": 0.00019916419302473958, + "loss": 1.2387, + "step": 28410 + }, + { + "epoch": 0.060382040698387776, + "grad_norm": 0.36533820629119873, + "learning_rate": 0.0001991633140405363, + "loss": 1.2308, + "step": 28420 + }, + { + "epoch": 0.060403287018126826, + "grad_norm": 0.4256630539894104, + "learning_rate": 0.00019916243459632167, + "loss": 1.2145, + "step": 28430 + }, + { + "epoch": 0.060424533337865884, + "grad_norm": 0.36594003438949585, + "learning_rate": 0.0001991615546920997, + "loss": 1.251, + "step": 28440 + }, + { + "epoch": 0.060445779657604934, + "grad_norm": 0.5306403636932373, + "learning_rate": 0.00019916067432787447, + "loss": 1.2299, + "step": 28450 + }, + { + "epoch": 0.060467025977343984, + "grad_norm": 0.3646804690361023, + "learning_rate": 0.00019915979350365007, + "loss": 1.2461, + "step": 28460 + }, + { + "epoch": 0.06048827229708304, + "grad_norm": 0.7087408304214478, + "learning_rate": 0.00019915891221943061, + "loss": 1.2154, + "step": 28470 + }, + { + "epoch": 0.06050951861682209, + "grad_norm": 0.794670581817627, + "learning_rate": 0.00019915803047522015, + "loss": 1.2163, + "step": 28480 + }, + { + "epoch": 0.06053076493656114, + "grad_norm": 0.37990474700927734, + "learning_rate": 0.00019915714827102278, + "loss": 1.2903, + "step": 28490 + }, + { + "epoch": 0.0605520112563002, + "grad_norm": 0.5011847615242004, + "learning_rate": 0.00019915626560684262, + "loss": 1.2415, + "step": 28500 + }, + { + "epoch": 0.06057325757603925, + "grad_norm": 0.7135734558105469, + "learning_rate": 0.00019915538248268372, + "loss": 1.2032, + "step": 28510 + }, + { + "epoch": 0.0605945038957783, + "grad_norm": 0.3476932644844055, + "learning_rate": 0.00019915449889855024, + "loss": 1.2198, + "step": 28520 + }, + { + "epoch": 0.06061575021551736, + "grad_norm": 0.7865474224090576, + "learning_rate": 0.00019915361485444623, + "loss": 1.2473, + "step": 28530 + }, + { + "epoch": 0.06063699653525641, + "grad_norm": 0.5856924653053284, + "learning_rate": 0.0001991527303503758, + "loss": 1.2271, + "step": 28540 + }, + { + "epoch": 0.060658242854995464, + "grad_norm": 0.49810701608657837, + "learning_rate": 0.00019915184538634307, + "loss": 1.2173, + "step": 28550 + }, + { + "epoch": 0.060679489174734515, + "grad_norm": 0.478458970785141, + "learning_rate": 0.00019915095996235214, + "loss": 1.2379, + "step": 28560 + }, + { + "epoch": 0.060700735494473565, + "grad_norm": 0.43895673751831055, + "learning_rate": 0.00019915007407840708, + "loss": 1.1973, + "step": 28570 + }, + { + "epoch": 0.06072198181421262, + "grad_norm": 0.3352224826812744, + "learning_rate": 0.00019914918773451205, + "loss": 1.247, + "step": 28580 + }, + { + "epoch": 0.06074322813395167, + "grad_norm": 0.42605504393577576, + "learning_rate": 0.00019914830093067113, + "loss": 1.198, + "step": 28590 + }, + { + "epoch": 0.06076447445369072, + "grad_norm": 0.45111602544784546, + "learning_rate": 0.00019914741366688842, + "loss": 1.2397, + "step": 28600 + }, + { + "epoch": 0.06078572077342978, + "grad_norm": 0.3324566185474396, + "learning_rate": 0.0001991465259431681, + "loss": 1.2223, + "step": 28610 + }, + { + "epoch": 0.06080696709316883, + "grad_norm": 0.5034562945365906, + "learning_rate": 0.00019914563775951422, + "loss": 1.2516, + "step": 28620 + }, + { + "epoch": 0.06082821341290788, + "grad_norm": 0.3780565857887268, + "learning_rate": 0.00019914474911593097, + "loss": 1.2395, + "step": 28630 + }, + { + "epoch": 0.06084945973264694, + "grad_norm": 0.40680772066116333, + "learning_rate": 0.0001991438600124224, + "loss": 1.259, + "step": 28640 + }, + { + "epoch": 0.06087070605238599, + "grad_norm": 0.3649682402610779, + "learning_rate": 0.00019914297044899265, + "loss": 1.1952, + "step": 28650 + }, + { + "epoch": 0.06089195237212504, + "grad_norm": 0.37562158703804016, + "learning_rate": 0.0001991420804256459, + "loss": 1.2552, + "step": 28660 + }, + { + "epoch": 0.060913198691864096, + "grad_norm": 0.3538934290409088, + "learning_rate": 0.00019914118994238624, + "loss": 1.2396, + "step": 28670 + }, + { + "epoch": 0.060934445011603146, + "grad_norm": 0.48165953159332275, + "learning_rate": 0.0001991402989992178, + "loss": 1.223, + "step": 28680 + }, + { + "epoch": 0.060955691331342196, + "grad_norm": 0.4946977198123932, + "learning_rate": 0.0001991394075961447, + "loss": 1.2073, + "step": 28690 + }, + { + "epoch": 0.06097693765108125, + "grad_norm": 0.4353432059288025, + "learning_rate": 0.00019913851573317108, + "loss": 1.257, + "step": 28700 + }, + { + "epoch": 0.060998183970820304, + "grad_norm": 0.44350647926330566, + "learning_rate": 0.0001991376234103011, + "loss": 1.2116, + "step": 28710 + }, + { + "epoch": 0.061019430290559354, + "grad_norm": 0.44633948802948, + "learning_rate": 0.0001991367306275389, + "loss": 1.2207, + "step": 28720 + }, + { + "epoch": 0.06104067661029841, + "grad_norm": 0.3651543855667114, + "learning_rate": 0.00019913583738488859, + "loss": 1.2642, + "step": 28730 + }, + { + "epoch": 0.06106192293003746, + "grad_norm": 0.6192396283149719, + "learning_rate": 0.00019913494368235435, + "loss": 1.2228, + "step": 28740 + }, + { + "epoch": 0.06108316924977652, + "grad_norm": 0.35376375913619995, + "learning_rate": 0.0001991340495199403, + "loss": 1.2865, + "step": 28750 + }, + { + "epoch": 0.06110441556951557, + "grad_norm": 0.3535098433494568, + "learning_rate": 0.00019913315489765058, + "loss": 1.2445, + "step": 28760 + }, + { + "epoch": 0.06112566188925462, + "grad_norm": 0.529783308506012, + "learning_rate": 0.00019913225981548938, + "loss": 1.229, + "step": 28770 + }, + { + "epoch": 0.061146908208993676, + "grad_norm": 0.8373968005180359, + "learning_rate": 0.00019913136427346083, + "loss": 1.2502, + "step": 28780 + }, + { + "epoch": 0.06116815452873273, + "grad_norm": 0.3582019805908203, + "learning_rate": 0.00019913046827156906, + "loss": 1.2425, + "step": 28790 + }, + { + "epoch": 0.06118940084847178, + "grad_norm": 0.3961372673511505, + "learning_rate": 0.00019912957180981826, + "loss": 1.2347, + "step": 28800 + }, + { + "epoch": 0.061210647168210834, + "grad_norm": 0.43683376908302307, + "learning_rate": 0.00019912867488821258, + "loss": 1.2144, + "step": 28810 + }, + { + "epoch": 0.061231893487949884, + "grad_norm": 0.43141937255859375, + "learning_rate": 0.0001991277775067562, + "loss": 1.2234, + "step": 28820 + }, + { + "epoch": 0.061253139807688935, + "grad_norm": 0.3630026578903198, + "learning_rate": 0.00019912687966545324, + "loss": 1.2435, + "step": 28830 + }, + { + "epoch": 0.06127438612742799, + "grad_norm": 0.3980156481266022, + "learning_rate": 0.00019912598136430788, + "loss": 1.2599, + "step": 28840 + }, + { + "epoch": 0.06129563244716704, + "grad_norm": 0.5157904624938965, + "learning_rate": 0.00019912508260332427, + "loss": 1.2604, + "step": 28850 + }, + { + "epoch": 0.06131687876690609, + "grad_norm": 0.35919737815856934, + "learning_rate": 0.00019912418338250665, + "loss": 1.219, + "step": 28860 + }, + { + "epoch": 0.06133812508664515, + "grad_norm": 0.48779329657554626, + "learning_rate": 0.00019912328370185912, + "loss": 1.1962, + "step": 28870 + }, + { + "epoch": 0.0613593714063842, + "grad_norm": 0.5046443939208984, + "learning_rate": 0.0001991223835613859, + "loss": 1.2218, + "step": 28880 + }, + { + "epoch": 0.06138061772612325, + "grad_norm": 0.4945901930332184, + "learning_rate": 0.00019912148296109112, + "loss": 1.2302, + "step": 28890 + }, + { + "epoch": 0.06140186404586231, + "grad_norm": 0.42312827706336975, + "learning_rate": 0.00019912058190097899, + "loss": 1.2205, + "step": 28900 + }, + { + "epoch": 0.06142311036560136, + "grad_norm": 0.7230531573295593, + "learning_rate": 0.00019911968038105368, + "loss": 1.2127, + "step": 28910 + }, + { + "epoch": 0.06144435668534041, + "grad_norm": 0.4515621066093445, + "learning_rate": 0.0001991187784013194, + "loss": 1.2176, + "step": 28920 + }, + { + "epoch": 0.061465603005079465, + "grad_norm": 0.40693527460098267, + "learning_rate": 0.00019911787596178025, + "loss": 1.2641, + "step": 28930 + }, + { + "epoch": 0.061486849324818516, + "grad_norm": 0.4853534698486328, + "learning_rate": 0.00019911697306244052, + "loss": 1.251, + "step": 28940 + }, + { + "epoch": 0.06150809564455757, + "grad_norm": 0.4605921804904938, + "learning_rate": 0.00019911606970330435, + "loss": 1.1976, + "step": 28950 + }, + { + "epoch": 0.06152934196429662, + "grad_norm": 0.4009764790534973, + "learning_rate": 0.0001991151658843759, + "loss": 1.2624, + "step": 28960 + }, + { + "epoch": 0.06155058828403567, + "grad_norm": 0.8111704587936401, + "learning_rate": 0.00019911426160565944, + "loss": 1.2307, + "step": 28970 + }, + { + "epoch": 0.06157183460377473, + "grad_norm": 0.6662255525588989, + "learning_rate": 0.0001991133568671591, + "loss": 1.1974, + "step": 28980 + }, + { + "epoch": 0.06159308092351378, + "grad_norm": 0.6756460070610046, + "learning_rate": 0.0001991124516688791, + "loss": 1.2075, + "step": 28990 + }, + { + "epoch": 0.06161432724325283, + "grad_norm": 0.40754443407058716, + "learning_rate": 0.00019911154601082363, + "loss": 1.2666, + "step": 29000 + }, + { + "epoch": 0.06163557356299189, + "grad_norm": 0.36002078652381897, + "learning_rate": 0.0001991106398929969, + "loss": 1.2371, + "step": 29010 + }, + { + "epoch": 0.06165681988273094, + "grad_norm": 0.35456594824790955, + "learning_rate": 0.00019910973331540313, + "loss": 1.2041, + "step": 29020 + }, + { + "epoch": 0.06167806620246999, + "grad_norm": 0.4403707981109619, + "learning_rate": 0.00019910882627804648, + "loss": 1.2366, + "step": 29030 + }, + { + "epoch": 0.061699312522209046, + "grad_norm": 0.35756805539131165, + "learning_rate": 0.0001991079187809312, + "loss": 1.2044, + "step": 29040 + }, + { + "epoch": 0.061720558841948096, + "grad_norm": 0.4302145838737488, + "learning_rate": 0.0001991070108240615, + "loss": 1.2461, + "step": 29050 + }, + { + "epoch": 0.06174180516168715, + "grad_norm": 0.3556727170944214, + "learning_rate": 0.00019910610240744155, + "loss": 1.2308, + "step": 29060 + }, + { + "epoch": 0.061763051481426204, + "grad_norm": 0.32650026679039, + "learning_rate": 0.00019910519353107556, + "loss": 1.2223, + "step": 29070 + }, + { + "epoch": 0.061784297801165254, + "grad_norm": 0.5726726055145264, + "learning_rate": 0.00019910428419496782, + "loss": 1.2594, + "step": 29080 + }, + { + "epoch": 0.061805544120904304, + "grad_norm": 0.35341891646385193, + "learning_rate": 0.00019910337439912253, + "loss": 1.2086, + "step": 29090 + }, + { + "epoch": 0.06182679044064336, + "grad_norm": 0.36428284645080566, + "learning_rate": 0.00019910246414354385, + "loss": 1.221, + "step": 29100 + }, + { + "epoch": 0.06184803676038241, + "grad_norm": 0.3663581311702728, + "learning_rate": 0.00019910155342823605, + "loss": 1.235, + "step": 29110 + }, + { + "epoch": 0.06186928308012146, + "grad_norm": 0.40644770860671997, + "learning_rate": 0.00019910064225320335, + "loss": 1.2694, + "step": 29120 + }, + { + "epoch": 0.06189052939986052, + "grad_norm": 0.3507104814052582, + "learning_rate": 0.00019909973061844994, + "loss": 1.213, + "step": 29130 + }, + { + "epoch": 0.06191177571959957, + "grad_norm": 0.3768456280231476, + "learning_rate": 0.0001990988185239801, + "loss": 1.2324, + "step": 29140 + }, + { + "epoch": 0.06193302203933862, + "grad_norm": 0.3639238774776459, + "learning_rate": 0.00019909790596979804, + "loss": 1.2529, + "step": 29150 + }, + { + "epoch": 0.06195426835907768, + "grad_norm": 0.3789615333080292, + "learning_rate": 0.000199096992955908, + "loss": 1.2184, + "step": 29160 + }, + { + "epoch": 0.06197551467881673, + "grad_norm": 0.5187004804611206, + "learning_rate": 0.00019909607948231418, + "loss": 1.2001, + "step": 29170 + }, + { + "epoch": 0.061996760998555785, + "grad_norm": 0.41158878803253174, + "learning_rate": 0.00019909516554902088, + "loss": 1.241, + "step": 29180 + }, + { + "epoch": 0.062018007318294835, + "grad_norm": 0.4459899961948395, + "learning_rate": 0.00019909425115603228, + "loss": 1.2654, + "step": 29190 + }, + { + "epoch": 0.062039253638033885, + "grad_norm": 0.36880195140838623, + "learning_rate": 0.00019909333630335266, + "loss": 1.2131, + "step": 29200 + }, + { + "epoch": 0.06206049995777294, + "grad_norm": 0.3947776257991791, + "learning_rate": 0.00019909242099098626, + "loss": 1.2542, + "step": 29210 + }, + { + "epoch": 0.06208174627751199, + "grad_norm": 0.3829830586910248, + "learning_rate": 0.0001990915052189373, + "loss": 1.2562, + "step": 29220 + }, + { + "epoch": 0.06210299259725104, + "grad_norm": 0.3597177267074585, + "learning_rate": 0.00019909058898721007, + "loss": 1.2113, + "step": 29230 + }, + { + "epoch": 0.0621242389169901, + "grad_norm": 0.4107336401939392, + "learning_rate": 0.0001990896722958088, + "loss": 1.2787, + "step": 29240 + }, + { + "epoch": 0.06214548523672915, + "grad_norm": 0.5767790079116821, + "learning_rate": 0.0001990887551447377, + "loss": 1.2164, + "step": 29250 + }, + { + "epoch": 0.0621667315564682, + "grad_norm": 0.3875485062599182, + "learning_rate": 0.00019908783753400108, + "loss": 1.2372, + "step": 29260 + }, + { + "epoch": 0.06218797787620726, + "grad_norm": 0.35851192474365234, + "learning_rate": 0.0001990869194636032, + "loss": 1.2238, + "step": 29270 + }, + { + "epoch": 0.06220922419594631, + "grad_norm": 0.4721623957157135, + "learning_rate": 0.00019908600093354828, + "loss": 1.1997, + "step": 29280 + }, + { + "epoch": 0.06223047051568536, + "grad_norm": 0.3317793905735016, + "learning_rate": 0.00019908508194384062, + "loss": 1.1924, + "step": 29290 + }, + { + "epoch": 0.062251716835424416, + "grad_norm": 0.3453124761581421, + "learning_rate": 0.00019908416249448443, + "loss": 1.2583, + "step": 29300 + }, + { + "epoch": 0.062272963155163466, + "grad_norm": 0.3505896031856537, + "learning_rate": 0.000199083242585484, + "loss": 1.2142, + "step": 29310 + }, + { + "epoch": 0.062294209474902516, + "grad_norm": 0.43518179655075073, + "learning_rate": 0.00019908232221684365, + "loss": 1.214, + "step": 29320 + }, + { + "epoch": 0.062315455794641574, + "grad_norm": 0.37342700362205505, + "learning_rate": 0.00019908140138856758, + "loss": 1.2567, + "step": 29330 + }, + { + "epoch": 0.062336702114380624, + "grad_norm": 0.3351958096027374, + "learning_rate": 0.0001990804801006601, + "loss": 1.2477, + "step": 29340 + }, + { + "epoch": 0.062357948434119674, + "grad_norm": 0.5791038274765015, + "learning_rate": 0.00019907955835312546, + "loss": 1.1916, + "step": 29350 + }, + { + "epoch": 0.06237919475385873, + "grad_norm": 0.5344705581665039, + "learning_rate": 0.00019907863614596795, + "loss": 1.265, + "step": 29360 + }, + { + "epoch": 0.06240044107359778, + "grad_norm": 0.3568737506866455, + "learning_rate": 0.00019907771347919186, + "loss": 1.2509, + "step": 29370 + }, + { + "epoch": 0.06242168739333684, + "grad_norm": 0.34032338857650757, + "learning_rate": 0.00019907679035280142, + "loss": 1.2272, + "step": 29380 + }, + { + "epoch": 0.06244293371307589, + "grad_norm": 0.33558735251426697, + "learning_rate": 0.000199075866766801, + "loss": 1.1762, + "step": 29390 + }, + { + "epoch": 0.06246418003281494, + "grad_norm": 0.3649517297744751, + "learning_rate": 0.00019907494272119477, + "loss": 1.2289, + "step": 29400 + }, + { + "epoch": 0.062485426352554, + "grad_norm": 0.4944959878921509, + "learning_rate": 0.0001990740182159871, + "loss": 1.2378, + "step": 29410 + }, + { + "epoch": 0.06250667267229304, + "grad_norm": 0.44243282079696655, + "learning_rate": 0.0001990730932511823, + "loss": 1.1944, + "step": 29420 + }, + { + "epoch": 0.0625279189920321, + "grad_norm": 0.4309147298336029, + "learning_rate": 0.00019907216782678457, + "loss": 1.2645, + "step": 29430 + }, + { + "epoch": 0.06254916531177115, + "grad_norm": 0.5344216823577881, + "learning_rate": 0.00019907124194279825, + "loss": 1.2378, + "step": 29440 + }, + { + "epoch": 0.0625704116315102, + "grad_norm": 0.38408541679382324, + "learning_rate": 0.00019907031559922764, + "loss": 1.2482, + "step": 29450 + }, + { + "epoch": 0.06259165795124925, + "grad_norm": 0.4233662486076355, + "learning_rate": 0.00019906938879607705, + "loss": 1.2014, + "step": 29460 + }, + { + "epoch": 0.06261290427098831, + "grad_norm": 0.3945563733577728, + "learning_rate": 0.00019906846153335076, + "loss": 1.2351, + "step": 29470 + }, + { + "epoch": 0.06263415059072737, + "grad_norm": 0.3835134506225586, + "learning_rate": 0.00019906753381105307, + "loss": 1.2038, + "step": 29480 + }, + { + "epoch": 0.06265539691046641, + "grad_norm": 0.393056184053421, + "learning_rate": 0.0001990666056291883, + "loss": 1.2592, + "step": 29490 + }, + { + "epoch": 0.06267664323020547, + "grad_norm": 0.37175697088241577, + "learning_rate": 0.0001990656769877607, + "loss": 1.2332, + "step": 29500 + }, + { + "epoch": 0.06269788954994453, + "grad_norm": 0.3572457432746887, + "learning_rate": 0.00019906474788677466, + "loss": 1.2482, + "step": 29510 + }, + { + "epoch": 0.06271913586968357, + "grad_norm": 0.8025804162025452, + "learning_rate": 0.00019906381832623447, + "loss": 1.2418, + "step": 29520 + }, + { + "epoch": 0.06274038218942263, + "grad_norm": 0.3234923183917999, + "learning_rate": 0.00019906288830614437, + "loss": 1.2078, + "step": 29530 + }, + { + "epoch": 0.06276162850916168, + "grad_norm": 0.34242022037506104, + "learning_rate": 0.00019906195782650875, + "loss": 1.2743, + "step": 29540 + }, + { + "epoch": 0.06278287482890073, + "grad_norm": 0.5085380673408508, + "learning_rate": 0.00019906102688733193, + "loss": 1.2532, + "step": 29550 + }, + { + "epoch": 0.06280412114863979, + "grad_norm": 0.39186015725135803, + "learning_rate": 0.00019906009548861816, + "loss": 1.2023, + "step": 29560 + }, + { + "epoch": 0.06282536746837884, + "grad_norm": 0.3985419273376465, + "learning_rate": 0.00019905916363037186, + "loss": 1.2574, + "step": 29570 + }, + { + "epoch": 0.06284661378811789, + "grad_norm": 0.420650839805603, + "learning_rate": 0.00019905823131259725, + "loss": 1.2373, + "step": 29580 + }, + { + "epoch": 0.06286786010785694, + "grad_norm": 0.3915897309780121, + "learning_rate": 0.0001990572985352987, + "loss": 1.251, + "step": 29590 + }, + { + "epoch": 0.062889106427596, + "grad_norm": 0.3306560814380646, + "learning_rate": 0.00019905636529848056, + "loss": 1.2441, + "step": 29600 + }, + { + "epoch": 0.06291035274733504, + "grad_norm": 0.3403854966163635, + "learning_rate": 0.00019905543160214713, + "loss": 1.2293, + "step": 29610 + }, + { + "epoch": 0.0629315990670741, + "grad_norm": 0.43944674730300903, + "learning_rate": 0.00019905449744630273, + "loss": 1.2653, + "step": 29620 + }, + { + "epoch": 0.06295284538681316, + "grad_norm": 0.3552972078323364, + "learning_rate": 0.00019905356283095175, + "loss": 1.2593, + "step": 29630 + }, + { + "epoch": 0.0629740917065522, + "grad_norm": 0.3495821952819824, + "learning_rate": 0.00019905262775609845, + "loss": 1.2791, + "step": 29640 + }, + { + "epoch": 0.06299533802629126, + "grad_norm": 0.3066365718841553, + "learning_rate": 0.0001990516922217472, + "loss": 1.2631, + "step": 29650 + }, + { + "epoch": 0.06301658434603032, + "grad_norm": 0.5323132872581482, + "learning_rate": 0.0001990507562279024, + "loss": 1.2445, + "step": 29660 + }, + { + "epoch": 0.06303783066576936, + "grad_norm": 0.3323189616203308, + "learning_rate": 0.00019904981977456832, + "loss": 1.2461, + "step": 29670 + }, + { + "epoch": 0.06305907698550842, + "grad_norm": 0.6118967533111572, + "learning_rate": 0.0001990488828617493, + "loss": 1.2125, + "step": 29680 + }, + { + "epoch": 0.06308032330524747, + "grad_norm": 0.48820677399635315, + "learning_rate": 0.0001990479454894497, + "loss": 1.2117, + "step": 29690 + }, + { + "epoch": 0.06310156962498652, + "grad_norm": 0.4620404839515686, + "learning_rate": 0.0001990470076576739, + "loss": 1.2587, + "step": 29700 + }, + { + "epoch": 0.06312281594472557, + "grad_norm": 0.4676188826560974, + "learning_rate": 0.00019904606936642624, + "loss": 1.2244, + "step": 29710 + }, + { + "epoch": 0.06314406226446463, + "grad_norm": 0.3357860743999481, + "learning_rate": 0.00019904513061571103, + "loss": 1.2447, + "step": 29720 + }, + { + "epoch": 0.06316530858420367, + "grad_norm": 0.4029901921749115, + "learning_rate": 0.00019904419140553265, + "loss": 1.2246, + "step": 29730 + }, + { + "epoch": 0.06318655490394273, + "grad_norm": 0.4796127378940582, + "learning_rate": 0.0001990432517358955, + "loss": 1.245, + "step": 29740 + }, + { + "epoch": 0.06320780122368179, + "grad_norm": 0.3479779362678528, + "learning_rate": 0.00019904231160680386, + "loss": 1.2388, + "step": 29750 + }, + { + "epoch": 0.06322904754342083, + "grad_norm": 0.38554835319519043, + "learning_rate": 0.00019904137101826215, + "loss": 1.2006, + "step": 29760 + }, + { + "epoch": 0.06325029386315989, + "grad_norm": 0.3531063497066498, + "learning_rate": 0.0001990404299702747, + "loss": 1.2075, + "step": 29770 + }, + { + "epoch": 0.06327154018289895, + "grad_norm": 0.37978729605674744, + "learning_rate": 0.0001990394884628459, + "loss": 1.2466, + "step": 29780 + }, + { + "epoch": 0.06329278650263799, + "grad_norm": 0.394927978515625, + "learning_rate": 0.00019903854649598013, + "loss": 1.2314, + "step": 29790 + }, + { + "epoch": 0.06331403282237705, + "grad_norm": 0.39310339093208313, + "learning_rate": 0.0001990376040696817, + "loss": 1.1826, + "step": 29800 + }, + { + "epoch": 0.0633352791421161, + "grad_norm": 0.3848629295825958, + "learning_rate": 0.00019903666118395506, + "loss": 1.2313, + "step": 29810 + }, + { + "epoch": 0.06335652546185515, + "grad_norm": 0.37525421380996704, + "learning_rate": 0.00019903571783880453, + "loss": 1.224, + "step": 29820 + }, + { + "epoch": 0.0633777717815942, + "grad_norm": 0.4012526869773865, + "learning_rate": 0.00019903477403423447, + "loss": 1.2142, + "step": 29830 + }, + { + "epoch": 0.06339901810133326, + "grad_norm": 0.41094619035720825, + "learning_rate": 0.00019903382977024933, + "loss": 1.235, + "step": 29840 + }, + { + "epoch": 0.0634202644210723, + "grad_norm": 0.4134773910045624, + "learning_rate": 0.00019903288504685343, + "loss": 1.2513, + "step": 29850 + }, + { + "epoch": 0.06344151074081136, + "grad_norm": 0.38551703095436096, + "learning_rate": 0.00019903193986405113, + "loss": 1.2399, + "step": 29860 + }, + { + "epoch": 0.06346275706055042, + "grad_norm": 0.5809423923492432, + "learning_rate": 0.0001990309942218469, + "loss": 1.2367, + "step": 29870 + }, + { + "epoch": 0.06348400338028946, + "grad_norm": 0.44442039728164673, + "learning_rate": 0.0001990300481202451, + "loss": 1.2168, + "step": 29880 + }, + { + "epoch": 0.06350524970002852, + "grad_norm": 0.32936370372772217, + "learning_rate": 0.00019902910155925009, + "loss": 1.2076, + "step": 29890 + }, + { + "epoch": 0.06352649601976758, + "grad_norm": 0.3613773584365845, + "learning_rate": 0.00019902815453886623, + "loss": 1.2195, + "step": 29900 + }, + { + "epoch": 0.06354774233950664, + "grad_norm": 0.548390805721283, + "learning_rate": 0.00019902720705909798, + "loss": 1.2645, + "step": 29910 + }, + { + "epoch": 0.06356898865924568, + "grad_norm": 0.5536164045333862, + "learning_rate": 0.00019902625911994971, + "loss": 1.2449, + "step": 29920 + }, + { + "epoch": 0.06359023497898474, + "grad_norm": 0.38533738255500793, + "learning_rate": 0.00019902531072142586, + "loss": 1.2555, + "step": 29930 + }, + { + "epoch": 0.0636114812987238, + "grad_norm": 0.33847105503082275, + "learning_rate": 0.00019902436186353074, + "loss": 1.2815, + "step": 29940 + }, + { + "epoch": 0.06363272761846284, + "grad_norm": 0.34318807721138, + "learning_rate": 0.0001990234125462688, + "loss": 1.2429, + "step": 29950 + }, + { + "epoch": 0.0636539739382019, + "grad_norm": 0.41533762216567993, + "learning_rate": 0.00019902246276964444, + "loss": 1.2094, + "step": 29960 + }, + { + "epoch": 0.06367522025794095, + "grad_norm": 0.3128291666507721, + "learning_rate": 0.00019902151253366207, + "loss": 1.217, + "step": 29970 + }, + { + "epoch": 0.06369646657768, + "grad_norm": 0.58467698097229, + "learning_rate": 0.0001990205618383261, + "loss": 1.216, + "step": 29980 + }, + { + "epoch": 0.06371771289741905, + "grad_norm": 0.45315656065940857, + "learning_rate": 0.00019901961068364096, + "loss": 1.2189, + "step": 29990 + }, + { + "epoch": 0.06373895921715811, + "grad_norm": 0.3776419758796692, + "learning_rate": 0.00019901865906961103, + "loss": 1.2922, + "step": 30000 + }, + { + "epoch": 0.06376020553689715, + "grad_norm": 0.44151535630226135, + "learning_rate": 0.00019901770699624075, + "loss": 1.2189, + "step": 30010 + }, + { + "epoch": 0.06378145185663621, + "grad_norm": 0.42613402009010315, + "learning_rate": 0.0001990167544635345, + "loss": 1.2593, + "step": 30020 + }, + { + "epoch": 0.06380269817637527, + "grad_norm": 0.6562868356704712, + "learning_rate": 0.00019901580147149673, + "loss": 1.2224, + "step": 30030 + }, + { + "epoch": 0.06382394449611431, + "grad_norm": 0.41735661029815674, + "learning_rate": 0.0001990148480201318, + "loss": 1.2386, + "step": 30040 + }, + { + "epoch": 0.06384519081585337, + "grad_norm": 0.5015903115272522, + "learning_rate": 0.00019901389410944425, + "loss": 1.2102, + "step": 30050 + }, + { + "epoch": 0.06386643713559242, + "grad_norm": 0.45683756470680237, + "learning_rate": 0.0001990129397394384, + "loss": 1.2268, + "step": 30060 + }, + { + "epoch": 0.06388768345533147, + "grad_norm": 0.43723827600479126, + "learning_rate": 0.0001990119849101187, + "loss": 1.25, + "step": 30070 + }, + { + "epoch": 0.06390892977507052, + "grad_norm": 0.4670069217681885, + "learning_rate": 0.00019901102962148966, + "loss": 1.2459, + "step": 30080 + }, + { + "epoch": 0.06393017609480958, + "grad_norm": 0.4643060564994812, + "learning_rate": 0.00019901007387355558, + "loss": 1.227, + "step": 30090 + }, + { + "epoch": 0.06395142241454863, + "grad_norm": 0.362641304731369, + "learning_rate": 0.000199009117666321, + "loss": 1.212, + "step": 30100 + }, + { + "epoch": 0.06397266873428768, + "grad_norm": 0.4925757050514221, + "learning_rate": 0.0001990081609997903, + "loss": 1.2158, + "step": 30110 + }, + { + "epoch": 0.06399391505402674, + "grad_norm": 0.39037248492240906, + "learning_rate": 0.00019900720387396796, + "loss": 1.2157, + "step": 30120 + }, + { + "epoch": 0.06401516137376578, + "grad_norm": 0.39417070150375366, + "learning_rate": 0.00019900624628885837, + "loss": 1.2328, + "step": 30130 + }, + { + "epoch": 0.06403640769350484, + "grad_norm": 0.4455832540988922, + "learning_rate": 0.000199005288244466, + "loss": 1.2237, + "step": 30140 + }, + { + "epoch": 0.0640576540132439, + "grad_norm": 0.47525542974472046, + "learning_rate": 0.00019900432974079526, + "loss": 1.2315, + "step": 30150 + }, + { + "epoch": 0.06407890033298294, + "grad_norm": 0.421257346868515, + "learning_rate": 0.00019900337077785067, + "loss": 1.214, + "step": 30160 + }, + { + "epoch": 0.064100146652722, + "grad_norm": 0.45019274950027466, + "learning_rate": 0.0001990024113556366, + "loss": 1.2462, + "step": 30170 + }, + { + "epoch": 0.06412139297246106, + "grad_norm": 0.45863065123558044, + "learning_rate": 0.00019900145147415757, + "loss": 1.2097, + "step": 30180 + }, + { + "epoch": 0.0641426392922001, + "grad_norm": 0.3762584924697876, + "learning_rate": 0.000199000491133418, + "loss": 1.2124, + "step": 30190 + }, + { + "epoch": 0.06416388561193916, + "grad_norm": 0.3515234887599945, + "learning_rate": 0.0001989995303334223, + "loss": 1.2572, + "step": 30200 + }, + { + "epoch": 0.06418513193167821, + "grad_norm": 0.543845534324646, + "learning_rate": 0.00019899856907417503, + "loss": 1.2254, + "step": 30210 + }, + { + "epoch": 0.06420637825141726, + "grad_norm": 0.3800329566001892, + "learning_rate": 0.00019899760735568055, + "loss": 1.2445, + "step": 30220 + }, + { + "epoch": 0.06422762457115631, + "grad_norm": 0.4870492219924927, + "learning_rate": 0.00019899664517794338, + "loss": 1.2327, + "step": 30230 + }, + { + "epoch": 0.06424887089089537, + "grad_norm": 0.3552006781101227, + "learning_rate": 0.00019899568254096793, + "loss": 1.2255, + "step": 30240 + }, + { + "epoch": 0.06427011721063441, + "grad_norm": 0.35814782977104187, + "learning_rate": 0.00019899471944475873, + "loss": 1.2267, + "step": 30250 + }, + { + "epoch": 0.06429136353037347, + "grad_norm": 0.34044328331947327, + "learning_rate": 0.00019899375588932022, + "loss": 1.2127, + "step": 30260 + }, + { + "epoch": 0.06431260985011253, + "grad_norm": 0.364179402589798, + "learning_rate": 0.00019899279187465688, + "loss": 1.2231, + "step": 30270 + }, + { + "epoch": 0.06433385616985157, + "grad_norm": 0.5496293306350708, + "learning_rate": 0.00019899182740077315, + "loss": 1.2366, + "step": 30280 + }, + { + "epoch": 0.06435510248959063, + "grad_norm": 0.5130460262298584, + "learning_rate": 0.00019899086246767352, + "loss": 1.2427, + "step": 30290 + }, + { + "epoch": 0.06437634880932969, + "grad_norm": 0.35121116042137146, + "learning_rate": 0.00019898989707536248, + "loss": 1.2439, + "step": 30300 + }, + { + "epoch": 0.06439759512906873, + "grad_norm": 0.3742784857749939, + "learning_rate": 0.0001989889312238445, + "loss": 1.2439, + "step": 30310 + }, + { + "epoch": 0.06441884144880779, + "grad_norm": 0.4015083611011505, + "learning_rate": 0.00019898796491312406, + "loss": 1.231, + "step": 30320 + }, + { + "epoch": 0.06444008776854684, + "grad_norm": 0.3554510474205017, + "learning_rate": 0.00019898699814320563, + "loss": 1.2785, + "step": 30330 + }, + { + "epoch": 0.0644613340882859, + "grad_norm": 0.4295300543308258, + "learning_rate": 0.00019898603091409372, + "loss": 1.2291, + "step": 30340 + }, + { + "epoch": 0.06448258040802494, + "grad_norm": 0.3663000166416168, + "learning_rate": 0.0001989850632257928, + "loss": 1.2498, + "step": 30350 + }, + { + "epoch": 0.064503826727764, + "grad_norm": 0.5406771302223206, + "learning_rate": 0.00019898409507830735, + "loss": 1.2384, + "step": 30360 + }, + { + "epoch": 0.06452507304750306, + "grad_norm": 0.35733863711357117, + "learning_rate": 0.00019898312647164188, + "loss": 1.2694, + "step": 30370 + }, + { + "epoch": 0.0645463193672421, + "grad_norm": 0.7113136649131775, + "learning_rate": 0.00019898215740580088, + "loss": 1.2442, + "step": 30380 + }, + { + "epoch": 0.06456756568698116, + "grad_norm": 0.4860198199748993, + "learning_rate": 0.00019898118788078885, + "loss": 1.254, + "step": 30390 + }, + { + "epoch": 0.06458881200672022, + "grad_norm": 0.36725112795829773, + "learning_rate": 0.00019898021789661028, + "loss": 1.2622, + "step": 30400 + }, + { + "epoch": 0.06461005832645926, + "grad_norm": 0.3521979749202728, + "learning_rate": 0.0001989792474532697, + "loss": 1.2568, + "step": 30410 + }, + { + "epoch": 0.06463130464619832, + "grad_norm": 0.359325647354126, + "learning_rate": 0.0001989782765507715, + "loss": 1.2326, + "step": 30420 + }, + { + "epoch": 0.06465255096593737, + "grad_norm": 0.4218209683895111, + "learning_rate": 0.00019897730518912032, + "loss": 1.2282, + "step": 30430 + }, + { + "epoch": 0.06467379728567642, + "grad_norm": 0.5821555852890015, + "learning_rate": 0.0001989763333683206, + "loss": 1.2274, + "step": 30440 + }, + { + "epoch": 0.06469504360541548, + "grad_norm": 0.3796718120574951, + "learning_rate": 0.00019897536108837686, + "loss": 1.2304, + "step": 30450 + }, + { + "epoch": 0.06471628992515453, + "grad_norm": 0.38607925176620483, + "learning_rate": 0.00019897438834929362, + "loss": 1.2334, + "step": 30460 + }, + { + "epoch": 0.06473753624489358, + "grad_norm": 0.44034886360168457, + "learning_rate": 0.00019897341515107538, + "loss": 1.2422, + "step": 30470 + }, + { + "epoch": 0.06475878256463263, + "grad_norm": 0.47159022092819214, + "learning_rate": 0.00019897244149372667, + "loss": 1.2313, + "step": 30480 + }, + { + "epoch": 0.06478002888437169, + "grad_norm": 0.36147594451904297, + "learning_rate": 0.00019897146737725196, + "loss": 1.2027, + "step": 30490 + }, + { + "epoch": 0.06480127520411073, + "grad_norm": 0.4141593873500824, + "learning_rate": 0.00019897049280165582, + "loss": 1.2439, + "step": 30500 + }, + { + "epoch": 0.06482252152384979, + "grad_norm": 0.5638779997825623, + "learning_rate": 0.00019896951776694275, + "loss": 1.1987, + "step": 30510 + }, + { + "epoch": 0.06484376784358885, + "grad_norm": 0.38317540287971497, + "learning_rate": 0.0001989685422731173, + "loss": 1.2224, + "step": 30520 + }, + { + "epoch": 0.06486501416332789, + "grad_norm": 0.38855472207069397, + "learning_rate": 0.00019896756632018395, + "loss": 1.2646, + "step": 30530 + }, + { + "epoch": 0.06488626048306695, + "grad_norm": 0.34870055317878723, + "learning_rate": 0.00019896658990814725, + "loss": 1.239, + "step": 30540 + }, + { + "epoch": 0.064907506802806, + "grad_norm": 0.34741777181625366, + "learning_rate": 0.00019896561303701173, + "loss": 1.256, + "step": 30550 + }, + { + "epoch": 0.06492875312254505, + "grad_norm": 0.3398537039756775, + "learning_rate": 0.0001989646357067819, + "loss": 1.216, + "step": 30560 + }, + { + "epoch": 0.0649499994422841, + "grad_norm": 0.38704437017440796, + "learning_rate": 0.00019896365791746237, + "loss": 1.225, + "step": 30570 + }, + { + "epoch": 0.06497124576202316, + "grad_norm": 0.36274218559265137, + "learning_rate": 0.00019896267966905756, + "loss": 1.2149, + "step": 30580 + }, + { + "epoch": 0.06499249208176221, + "grad_norm": 0.5274336934089661, + "learning_rate": 0.00019896170096157212, + "loss": 1.2176, + "step": 30590 + }, + { + "epoch": 0.06501373840150126, + "grad_norm": 0.41334080696105957, + "learning_rate": 0.0001989607217950105, + "loss": 1.2285, + "step": 30600 + }, + { + "epoch": 0.06503498472124032, + "grad_norm": 0.6860292553901672, + "learning_rate": 0.00019895974216937732, + "loss": 1.2176, + "step": 30610 + }, + { + "epoch": 0.06505623104097936, + "grad_norm": 0.6611437201499939, + "learning_rate": 0.00019895876208467702, + "loss": 1.2195, + "step": 30620 + }, + { + "epoch": 0.06507747736071842, + "grad_norm": 0.6958082318305969, + "learning_rate": 0.00019895778154091427, + "loss": 1.2312, + "step": 30630 + }, + { + "epoch": 0.06509872368045748, + "grad_norm": 0.4771542549133301, + "learning_rate": 0.00019895680053809355, + "loss": 1.1941, + "step": 30640 + }, + { + "epoch": 0.06511997000019652, + "grad_norm": 0.5006735324859619, + "learning_rate": 0.00019895581907621943, + "loss": 1.2653, + "step": 30650 + }, + { + "epoch": 0.06514121631993558, + "grad_norm": 0.36316052079200745, + "learning_rate": 0.0001989548371552964, + "loss": 1.2602, + "step": 30660 + }, + { + "epoch": 0.06516246263967464, + "grad_norm": 0.38289138674736023, + "learning_rate": 0.00019895385477532914, + "loss": 1.2284, + "step": 30670 + }, + { + "epoch": 0.06518370895941368, + "grad_norm": 0.3339924216270447, + "learning_rate": 0.0001989528719363221, + "loss": 1.2634, + "step": 30680 + }, + { + "epoch": 0.06520495527915274, + "grad_norm": 0.4017369747161865, + "learning_rate": 0.00019895188863827988, + "loss": 1.2446, + "step": 30690 + }, + { + "epoch": 0.0652262015988918, + "grad_norm": 0.37105661630630493, + "learning_rate": 0.00019895090488120706, + "loss": 1.2121, + "step": 30700 + }, + { + "epoch": 0.06524744791863084, + "grad_norm": 0.40797436237335205, + "learning_rate": 0.00019894992066510814, + "loss": 1.249, + "step": 30710 + }, + { + "epoch": 0.0652686942383699, + "grad_norm": 0.6861441135406494, + "learning_rate": 0.00019894893598998775, + "loss": 1.2348, + "step": 30720 + }, + { + "epoch": 0.06528994055810895, + "grad_norm": 0.7134884595870972, + "learning_rate": 0.0001989479508558504, + "loss": 1.2225, + "step": 30730 + }, + { + "epoch": 0.065311186877848, + "grad_norm": 0.3854389190673828, + "learning_rate": 0.00019894696526270075, + "loss": 1.2186, + "step": 30740 + }, + { + "epoch": 0.06533243319758705, + "grad_norm": 0.3767174482345581, + "learning_rate": 0.0001989459792105433, + "loss": 1.2481, + "step": 30750 + }, + { + "epoch": 0.06535367951732611, + "grad_norm": 0.37090086936950684, + "learning_rate": 0.00019894499269938263, + "loss": 1.2439, + "step": 30760 + }, + { + "epoch": 0.06537492583706517, + "grad_norm": 0.4028465747833252, + "learning_rate": 0.00019894400572922333, + "loss": 1.2364, + "step": 30770 + }, + { + "epoch": 0.06539617215680421, + "grad_norm": 0.41661161184310913, + "learning_rate": 0.00019894301830006996, + "loss": 1.2567, + "step": 30780 + }, + { + "epoch": 0.06541741847654327, + "grad_norm": 0.5168125629425049, + "learning_rate": 0.00019894203041192712, + "loss": 1.2181, + "step": 30790 + }, + { + "epoch": 0.06543866479628233, + "grad_norm": 0.3537229597568512, + "learning_rate": 0.0001989410420647994, + "loss": 1.2603, + "step": 30800 + }, + { + "epoch": 0.06545991111602137, + "grad_norm": 0.3833150565624237, + "learning_rate": 0.00019894005325869136, + "loss": 1.2241, + "step": 30810 + }, + { + "epoch": 0.06548115743576043, + "grad_norm": 0.3548775017261505, + "learning_rate": 0.00019893906399360761, + "loss": 1.2693, + "step": 30820 + }, + { + "epoch": 0.06550240375549948, + "grad_norm": 0.4025346338748932, + "learning_rate": 0.00019893807426955276, + "loss": 1.2025, + "step": 30830 + }, + { + "epoch": 0.06552365007523853, + "grad_norm": 0.3636939525604248, + "learning_rate": 0.00019893708408653133, + "loss": 1.2447, + "step": 30840 + }, + { + "epoch": 0.06554489639497758, + "grad_norm": 0.3487277626991272, + "learning_rate": 0.00019893609344454798, + "loss": 1.2336, + "step": 30850 + }, + { + "epoch": 0.06556614271471664, + "grad_norm": 0.4638534486293793, + "learning_rate": 0.00019893510234360724, + "loss": 1.2161, + "step": 30860 + }, + { + "epoch": 0.06558738903445568, + "grad_norm": 0.38617780804634094, + "learning_rate": 0.0001989341107837138, + "loss": 1.2471, + "step": 30870 + }, + { + "epoch": 0.06560863535419474, + "grad_norm": 0.36381813883781433, + "learning_rate": 0.00019893311876487216, + "loss": 1.2249, + "step": 30880 + }, + { + "epoch": 0.0656298816739338, + "grad_norm": 0.6111080050468445, + "learning_rate": 0.000198932126287087, + "loss": 1.2461, + "step": 30890 + }, + { + "epoch": 0.06565112799367284, + "grad_norm": 0.4193234443664551, + "learning_rate": 0.00019893113335036288, + "loss": 1.2581, + "step": 30900 + }, + { + "epoch": 0.0656723743134119, + "grad_norm": 0.8547705411911011, + "learning_rate": 0.00019893013995470442, + "loss": 1.2162, + "step": 30910 + }, + { + "epoch": 0.06569362063315096, + "grad_norm": 0.6097555756568909, + "learning_rate": 0.00019892914610011625, + "loss": 1.1983, + "step": 30920 + }, + { + "epoch": 0.06571486695289, + "grad_norm": 0.33483073115348816, + "learning_rate": 0.00019892815178660294, + "loss": 1.2519, + "step": 30930 + }, + { + "epoch": 0.06573611327262906, + "grad_norm": 0.47308874130249023, + "learning_rate": 0.00019892715701416914, + "loss": 1.2349, + "step": 30940 + }, + { + "epoch": 0.06575735959236811, + "grad_norm": 0.37587666511535645, + "learning_rate": 0.00019892616178281939, + "loss": 1.235, + "step": 30950 + }, + { + "epoch": 0.06577860591210716, + "grad_norm": 0.41789478063583374, + "learning_rate": 0.0001989251660925584, + "loss": 1.263, + "step": 30960 + }, + { + "epoch": 0.06579985223184621, + "grad_norm": 0.539869487285614, + "learning_rate": 0.00019892416994339075, + "loss": 1.2208, + "step": 30970 + }, + { + "epoch": 0.06582109855158527, + "grad_norm": 0.3827168047428131, + "learning_rate": 0.00019892317333532108, + "loss": 1.2263, + "step": 30980 + }, + { + "epoch": 0.06584234487132432, + "grad_norm": 0.34409284591674805, + "learning_rate": 0.00019892217626835398, + "loss": 1.2392, + "step": 30990 + }, + { + "epoch": 0.06586359119106337, + "grad_norm": 0.3473566770553589, + "learning_rate": 0.0001989211787424941, + "loss": 1.2592, + "step": 31000 + }, + { + "epoch": 0.06588483751080243, + "grad_norm": 0.4621666967868805, + "learning_rate": 0.00019892018075774604, + "loss": 1.2594, + "step": 31010 + }, + { + "epoch": 0.06590608383054147, + "grad_norm": 0.5102055072784424, + "learning_rate": 0.00019891918231411445, + "loss": 1.1965, + "step": 31020 + }, + { + "epoch": 0.06592733015028053, + "grad_norm": 0.43397805094718933, + "learning_rate": 0.00019891818341160396, + "loss": 1.2426, + "step": 31030 + }, + { + "epoch": 0.06594857647001959, + "grad_norm": 0.5864611864089966, + "learning_rate": 0.00019891718405021922, + "loss": 1.2216, + "step": 31040 + }, + { + "epoch": 0.06596982278975863, + "grad_norm": 0.4098244607448578, + "learning_rate": 0.0001989161842299648, + "loss": 1.221, + "step": 31050 + }, + { + "epoch": 0.06599106910949769, + "grad_norm": 0.5347037315368652, + "learning_rate": 0.00019891518395084544, + "loss": 1.2101, + "step": 31060 + }, + { + "epoch": 0.06601231542923675, + "grad_norm": 0.46733102202415466, + "learning_rate": 0.00019891418321286574, + "loss": 1.206, + "step": 31070 + }, + { + "epoch": 0.06603356174897579, + "grad_norm": 0.5276093482971191, + "learning_rate": 0.00019891318201603028, + "loss": 1.2697, + "step": 31080 + }, + { + "epoch": 0.06605480806871485, + "grad_norm": 0.4786786437034607, + "learning_rate": 0.00019891218036034375, + "loss": 1.2512, + "step": 31090 + }, + { + "epoch": 0.0660760543884539, + "grad_norm": 0.4163045585155487, + "learning_rate": 0.00019891117824581085, + "loss": 1.2314, + "step": 31100 + }, + { + "epoch": 0.06609730070819295, + "grad_norm": 0.3524056375026703, + "learning_rate": 0.00019891017567243618, + "loss": 1.251, + "step": 31110 + }, + { + "epoch": 0.066118547027932, + "grad_norm": 0.3647618889808655, + "learning_rate": 0.00019890917264022434, + "loss": 1.2382, + "step": 31120 + }, + { + "epoch": 0.06613979334767106, + "grad_norm": 0.445516437292099, + "learning_rate": 0.00019890816914918007, + "loss": 1.2619, + "step": 31130 + }, + { + "epoch": 0.0661610396674101, + "grad_norm": 0.3425311744213104, + "learning_rate": 0.00019890716519930797, + "loss": 1.2553, + "step": 31140 + }, + { + "epoch": 0.06618228598714916, + "grad_norm": 0.6327058672904968, + "learning_rate": 0.00019890616079061273, + "loss": 1.2257, + "step": 31150 + }, + { + "epoch": 0.06620353230688822, + "grad_norm": 0.42780932784080505, + "learning_rate": 0.00019890515592309898, + "loss": 1.2469, + "step": 31160 + }, + { + "epoch": 0.06622477862662726, + "grad_norm": 0.5598697662353516, + "learning_rate": 0.00019890415059677143, + "loss": 1.2178, + "step": 31170 + }, + { + "epoch": 0.06624602494636632, + "grad_norm": 0.32864418625831604, + "learning_rate": 0.0001989031448116347, + "loss": 1.1841, + "step": 31180 + }, + { + "epoch": 0.06626727126610538, + "grad_norm": 0.3663240075111389, + "learning_rate": 0.00019890213856769344, + "loss": 1.2687, + "step": 31190 + }, + { + "epoch": 0.06628851758584443, + "grad_norm": 0.39785757660865784, + "learning_rate": 0.00019890113186495233, + "loss": 1.2182, + "step": 31200 + }, + { + "epoch": 0.06630976390558348, + "grad_norm": 0.5208150744438171, + "learning_rate": 0.00019890012470341611, + "loss": 1.2598, + "step": 31210 + }, + { + "epoch": 0.06633101022532253, + "grad_norm": 0.5780936479568481, + "learning_rate": 0.00019889911708308937, + "loss": 1.2279, + "step": 31220 + }, + { + "epoch": 0.06635225654506159, + "grad_norm": 0.3887481093406677, + "learning_rate": 0.0001988981090039768, + "loss": 1.2202, + "step": 31230 + }, + { + "epoch": 0.06637350286480063, + "grad_norm": 0.40115875005722046, + "learning_rate": 0.00019889710046608312, + "loss": 1.1977, + "step": 31240 + }, + { + "epoch": 0.06639474918453969, + "grad_norm": 0.4690003991127014, + "learning_rate": 0.00019889609146941294, + "loss": 1.2369, + "step": 31250 + }, + { + "epoch": 0.06641599550427875, + "grad_norm": 0.33435580134391785, + "learning_rate": 0.000198895082013971, + "loss": 1.2012, + "step": 31260 + }, + { + "epoch": 0.06643724182401779, + "grad_norm": 0.5502902865409851, + "learning_rate": 0.00019889407209976195, + "loss": 1.2192, + "step": 31270 + }, + { + "epoch": 0.06645848814375685, + "grad_norm": 0.6326519846916199, + "learning_rate": 0.00019889306172679046, + "loss": 1.2309, + "step": 31280 + }, + { + "epoch": 0.06647973446349591, + "grad_norm": 0.3496846854686737, + "learning_rate": 0.00019889205089506127, + "loss": 1.2118, + "step": 31290 + }, + { + "epoch": 0.06650098078323495, + "grad_norm": 0.35816630721092224, + "learning_rate": 0.00019889103960457903, + "loss": 1.2204, + "step": 31300 + }, + { + "epoch": 0.06652222710297401, + "grad_norm": 0.48774266242980957, + "learning_rate": 0.00019889002785534844, + "loss": 1.2243, + "step": 31310 + }, + { + "epoch": 0.06654347342271306, + "grad_norm": 0.37607958912849426, + "learning_rate": 0.0001988890156473742, + "loss": 1.2082, + "step": 31320 + }, + { + "epoch": 0.06656471974245211, + "grad_norm": 0.346468061208725, + "learning_rate": 0.000198888002980661, + "loss": 1.2434, + "step": 31330 + }, + { + "epoch": 0.06658596606219117, + "grad_norm": 0.3425970673561096, + "learning_rate": 0.00019888698985521354, + "loss": 1.2371, + "step": 31340 + }, + { + "epoch": 0.06660721238193022, + "grad_norm": 0.5573973655700684, + "learning_rate": 0.00019888597627103652, + "loss": 1.2273, + "step": 31350 + }, + { + "epoch": 0.06662845870166927, + "grad_norm": 0.42071884870529175, + "learning_rate": 0.00019888496222813463, + "loss": 1.2165, + "step": 31360 + }, + { + "epoch": 0.06664970502140832, + "grad_norm": 0.4182693362236023, + "learning_rate": 0.0001988839477265126, + "loss": 1.2444, + "step": 31370 + }, + { + "epoch": 0.06667095134114738, + "grad_norm": 0.44570988416671753, + "learning_rate": 0.00019888293276617511, + "loss": 1.2198, + "step": 31380 + }, + { + "epoch": 0.06669219766088642, + "grad_norm": 0.45607560873031616, + "learning_rate": 0.0001988819173471269, + "loss": 1.2392, + "step": 31390 + }, + { + "epoch": 0.06671344398062548, + "grad_norm": 0.3917636275291443, + "learning_rate": 0.0001988809014693726, + "loss": 1.2352, + "step": 31400 + }, + { + "epoch": 0.06673469030036454, + "grad_norm": 0.3977293372154236, + "learning_rate": 0.00019887988513291705, + "loss": 1.224, + "step": 31410 + }, + { + "epoch": 0.06675593662010358, + "grad_norm": 0.4262135624885559, + "learning_rate": 0.00019887886833776483, + "loss": 1.225, + "step": 31420 + }, + { + "epoch": 0.06677718293984264, + "grad_norm": 0.4003868103027344, + "learning_rate": 0.0001988778510839208, + "loss": 1.24, + "step": 31430 + }, + { + "epoch": 0.0667984292595817, + "grad_norm": 0.4134019613265991, + "learning_rate": 0.00019887683337138954, + "loss": 1.2908, + "step": 31440 + }, + { + "epoch": 0.06681967557932074, + "grad_norm": 0.3882085382938385, + "learning_rate": 0.00019887581520017586, + "loss": 1.196, + "step": 31450 + }, + { + "epoch": 0.0668409218990598, + "grad_norm": 0.383987694978714, + "learning_rate": 0.00019887479657028446, + "loss": 1.2639, + "step": 31460 + }, + { + "epoch": 0.06686216821879885, + "grad_norm": 0.3412494957447052, + "learning_rate": 0.00019887377748172003, + "loss": 1.221, + "step": 31470 + }, + { + "epoch": 0.0668834145385379, + "grad_norm": 0.42115893959999084, + "learning_rate": 0.00019887275793448734, + "loss": 1.2311, + "step": 31480 + }, + { + "epoch": 0.06690466085827695, + "grad_norm": 0.5004622936248779, + "learning_rate": 0.00019887173792859116, + "loss": 1.2417, + "step": 31490 + }, + { + "epoch": 0.06692590717801601, + "grad_norm": 0.32241249084472656, + "learning_rate": 0.00019887071746403612, + "loss": 1.2535, + "step": 31500 + }, + { + "epoch": 0.06694715349775505, + "grad_norm": 0.753565788269043, + "learning_rate": 0.00019886969654082698, + "loss": 1.1803, + "step": 31510 + }, + { + "epoch": 0.06696839981749411, + "grad_norm": 0.3610093593597412, + "learning_rate": 0.00019886867515896858, + "loss": 1.2604, + "step": 31520 + }, + { + "epoch": 0.06698964613723317, + "grad_norm": 0.7348594069480896, + "learning_rate": 0.00019886765331846552, + "loss": 1.2327, + "step": 31530 + }, + { + "epoch": 0.06701089245697221, + "grad_norm": 0.310596764087677, + "learning_rate": 0.0001988666310193226, + "loss": 1.2143, + "step": 31540 + }, + { + "epoch": 0.06703213877671127, + "grad_norm": 0.517618715763092, + "learning_rate": 0.0001988656082615446, + "loss": 1.2054, + "step": 31550 + }, + { + "epoch": 0.06705338509645033, + "grad_norm": 0.3510233461856842, + "learning_rate": 0.00019886458504513618, + "loss": 1.2345, + "step": 31560 + }, + { + "epoch": 0.06707463141618937, + "grad_norm": 0.45605459809303284, + "learning_rate": 0.00019886356137010215, + "loss": 1.215, + "step": 31570 + }, + { + "epoch": 0.06709587773592843, + "grad_norm": 0.6123008131980896, + "learning_rate": 0.00019886253723644723, + "loss": 1.2168, + "step": 31580 + }, + { + "epoch": 0.06711712405566748, + "grad_norm": 0.3698100745677948, + "learning_rate": 0.0001988615126441762, + "loss": 1.2336, + "step": 31590 + }, + { + "epoch": 0.06713837037540654, + "grad_norm": 0.3493160903453827, + "learning_rate": 0.0001988604875932938, + "loss": 1.2395, + "step": 31600 + }, + { + "epoch": 0.06715961669514559, + "grad_norm": 0.3533746302127838, + "learning_rate": 0.00019885946208380477, + "loss": 1.2247, + "step": 31610 + }, + { + "epoch": 0.06718086301488464, + "grad_norm": 0.4061134159564972, + "learning_rate": 0.00019885843611571385, + "loss": 1.2369, + "step": 31620 + }, + { + "epoch": 0.0672021093346237, + "grad_norm": 0.6134882569313049, + "learning_rate": 0.00019885740968902588, + "loss": 1.2075, + "step": 31630 + }, + { + "epoch": 0.06722335565436274, + "grad_norm": 0.4557090103626251, + "learning_rate": 0.0001988563828037455, + "loss": 1.215, + "step": 31640 + }, + { + "epoch": 0.0672446019741018, + "grad_norm": 0.33001580834388733, + "learning_rate": 0.0001988553554598776, + "loss": 1.2618, + "step": 31650 + }, + { + "epoch": 0.06726584829384086, + "grad_norm": 0.4871387481689453, + "learning_rate": 0.0001988543276574269, + "loss": 1.2145, + "step": 31660 + }, + { + "epoch": 0.0672870946135799, + "grad_norm": 0.3512348532676697, + "learning_rate": 0.0001988532993963981, + "loss": 1.2508, + "step": 31670 + }, + { + "epoch": 0.06730834093331896, + "grad_norm": 0.4594852924346924, + "learning_rate": 0.00019885227067679607, + "loss": 1.1915, + "step": 31680 + }, + { + "epoch": 0.06732958725305802, + "grad_norm": 0.6493983268737793, + "learning_rate": 0.00019885124149862554, + "loss": 1.262, + "step": 31690 + }, + { + "epoch": 0.06735083357279706, + "grad_norm": 0.33797767758369446, + "learning_rate": 0.00019885021186189124, + "loss": 1.208, + "step": 31700 + }, + { + "epoch": 0.06737207989253612, + "grad_norm": 0.4876266419887543, + "learning_rate": 0.00019884918176659802, + "loss": 1.2117, + "step": 31710 + }, + { + "epoch": 0.06739332621227517, + "grad_norm": 0.44597774744033813, + "learning_rate": 0.0001988481512127506, + "loss": 1.2193, + "step": 31720 + }, + { + "epoch": 0.06741457253201422, + "grad_norm": 0.42841649055480957, + "learning_rate": 0.0001988471202003538, + "loss": 1.2139, + "step": 31730 + }, + { + "epoch": 0.06743581885175327, + "grad_norm": 0.40743184089660645, + "learning_rate": 0.00019884608872941238, + "loss": 1.2421, + "step": 31740 + }, + { + "epoch": 0.06745706517149233, + "grad_norm": 0.3498770296573639, + "learning_rate": 0.00019884505679993117, + "loss": 1.249, + "step": 31750 + }, + { + "epoch": 0.06747831149123137, + "grad_norm": 0.4149243235588074, + "learning_rate": 0.00019884402441191492, + "loss": 1.219, + "step": 31760 + }, + { + "epoch": 0.06749955781097043, + "grad_norm": 0.3464469313621521, + "learning_rate": 0.00019884299156536838, + "loss": 1.2273, + "step": 31770 + }, + { + "epoch": 0.06752080413070949, + "grad_norm": 0.40578094124794006, + "learning_rate": 0.00019884195826029642, + "loss": 1.2152, + "step": 31780 + }, + { + "epoch": 0.06754205045044853, + "grad_norm": 0.4280834197998047, + "learning_rate": 0.00019884092449670376, + "loss": 1.2234, + "step": 31790 + }, + { + "epoch": 0.06756329677018759, + "grad_norm": 0.35745012760162354, + "learning_rate": 0.00019883989027459528, + "loss": 1.2245, + "step": 31800 + }, + { + "epoch": 0.06758454308992665, + "grad_norm": 0.531588613986969, + "learning_rate": 0.0001988388555939757, + "loss": 1.1901, + "step": 31810 + }, + { + "epoch": 0.06760578940966569, + "grad_norm": 0.414762943983078, + "learning_rate": 0.00019883782045484986, + "loss": 1.2391, + "step": 31820 + }, + { + "epoch": 0.06762703572940475, + "grad_norm": 0.5490201115608215, + "learning_rate": 0.0001988367848572225, + "loss": 1.2442, + "step": 31830 + }, + { + "epoch": 0.0676482820491438, + "grad_norm": 0.3497278690338135, + "learning_rate": 0.00019883574880109856, + "loss": 1.2396, + "step": 31840 + }, + { + "epoch": 0.06766952836888285, + "grad_norm": 0.451359361410141, + "learning_rate": 0.00019883471228648273, + "loss": 1.2111, + "step": 31850 + }, + { + "epoch": 0.0676907746886219, + "grad_norm": 0.4001086950302124, + "learning_rate": 0.00019883367531337985, + "loss": 1.2458, + "step": 31860 + }, + { + "epoch": 0.06771202100836096, + "grad_norm": 0.3824087381362915, + "learning_rate": 0.00019883263788179472, + "loss": 1.2442, + "step": 31870 + }, + { + "epoch": 0.0677332673281, + "grad_norm": 0.4000837802886963, + "learning_rate": 0.00019883159999173214, + "loss": 1.2227, + "step": 31880 + }, + { + "epoch": 0.06775451364783906, + "grad_norm": 0.491509348154068, + "learning_rate": 0.00019883056164319698, + "loss": 1.2215, + "step": 31890 + }, + { + "epoch": 0.06777575996757812, + "grad_norm": 0.44863617420196533, + "learning_rate": 0.000198829522836194, + "loss": 1.22, + "step": 31900 + }, + { + "epoch": 0.06779700628731716, + "grad_norm": 0.70345538854599, + "learning_rate": 0.00019882848357072807, + "loss": 1.2331, + "step": 31910 + }, + { + "epoch": 0.06781825260705622, + "grad_norm": 0.3499147593975067, + "learning_rate": 0.00019882744384680398, + "loss": 1.2415, + "step": 31920 + }, + { + "epoch": 0.06783949892679528, + "grad_norm": 0.3565594553947449, + "learning_rate": 0.00019882640366442655, + "loss": 1.2207, + "step": 31930 + }, + { + "epoch": 0.06786074524653432, + "grad_norm": 0.41690248250961304, + "learning_rate": 0.00019882536302360062, + "loss": 1.2262, + "step": 31940 + }, + { + "epoch": 0.06788199156627338, + "grad_norm": 0.41252392530441284, + "learning_rate": 0.000198824321924331, + "loss": 1.2725, + "step": 31950 + }, + { + "epoch": 0.06790323788601244, + "grad_norm": 0.4145454466342926, + "learning_rate": 0.00019882328036662251, + "loss": 1.2425, + "step": 31960 + }, + { + "epoch": 0.06792448420575148, + "grad_norm": 0.42716309428215027, + "learning_rate": 0.00019882223835048003, + "loss": 1.2156, + "step": 31970 + }, + { + "epoch": 0.06794573052549054, + "grad_norm": 0.45620402693748474, + "learning_rate": 0.0001988211958759084, + "loss": 1.2502, + "step": 31980 + }, + { + "epoch": 0.06796697684522959, + "grad_norm": 0.3770490884780884, + "learning_rate": 0.00019882015294291237, + "loss": 1.2465, + "step": 31990 + }, + { + "epoch": 0.06798822316496864, + "grad_norm": 0.3587047755718231, + "learning_rate": 0.00019881910955149684, + "loss": 1.2307, + "step": 32000 + }, + { + "epoch": 0.0680094694847077, + "grad_norm": 0.40729087591171265, + "learning_rate": 0.00019881806570166662, + "loss": 1.2104, + "step": 32010 + }, + { + "epoch": 0.06803071580444675, + "grad_norm": 0.4056267738342285, + "learning_rate": 0.0001988170213934266, + "loss": 1.2235, + "step": 32020 + }, + { + "epoch": 0.06805196212418581, + "grad_norm": 0.3959851861000061, + "learning_rate": 0.0001988159766267816, + "loss": 1.2355, + "step": 32030 + }, + { + "epoch": 0.06807320844392485, + "grad_norm": 0.3385111689567566, + "learning_rate": 0.00019881493140173646, + "loss": 1.226, + "step": 32040 + }, + { + "epoch": 0.06809445476366391, + "grad_norm": 0.32445284724235535, + "learning_rate": 0.00019881388571829604, + "loss": 1.2275, + "step": 32050 + }, + { + "epoch": 0.06811570108340297, + "grad_norm": 0.3893258571624756, + "learning_rate": 0.00019881283957646516, + "loss": 1.2116, + "step": 32060 + }, + { + "epoch": 0.06813694740314201, + "grad_norm": 0.38978323340415955, + "learning_rate": 0.0001988117929762487, + "loss": 1.1981, + "step": 32070 + }, + { + "epoch": 0.06815819372288107, + "grad_norm": 0.5156148672103882, + "learning_rate": 0.00019881074591765151, + "loss": 1.2441, + "step": 32080 + }, + { + "epoch": 0.06817944004262012, + "grad_norm": 0.3672584891319275, + "learning_rate": 0.00019880969840067848, + "loss": 1.2745, + "step": 32090 + }, + { + "epoch": 0.06820068636235917, + "grad_norm": 0.3356822729110718, + "learning_rate": 0.0001988086504253344, + "loss": 1.2288, + "step": 32100 + }, + { + "epoch": 0.06822193268209822, + "grad_norm": 0.38310447335243225, + "learning_rate": 0.00019880760199162417, + "loss": 1.2061, + "step": 32110 + }, + { + "epoch": 0.06824317900183728, + "grad_norm": 0.34113767743110657, + "learning_rate": 0.00019880655309955267, + "loss": 1.2332, + "step": 32120 + }, + { + "epoch": 0.06826442532157632, + "grad_norm": 0.35405150055885315, + "learning_rate": 0.00019880550374912475, + "loss": 1.2322, + "step": 32130 + }, + { + "epoch": 0.06828567164131538, + "grad_norm": 0.3411597013473511, + "learning_rate": 0.00019880445394034526, + "loss": 1.2678, + "step": 32140 + }, + { + "epoch": 0.06830691796105444, + "grad_norm": 0.3721053898334503, + "learning_rate": 0.0001988034036732191, + "loss": 1.2174, + "step": 32150 + }, + { + "epoch": 0.06832816428079348, + "grad_norm": 0.35618358850479126, + "learning_rate": 0.00019880235294775112, + "loss": 1.2454, + "step": 32160 + }, + { + "epoch": 0.06834941060053254, + "grad_norm": 0.604436457157135, + "learning_rate": 0.0001988013017639462, + "loss": 1.2319, + "step": 32170 + }, + { + "epoch": 0.0683706569202716, + "grad_norm": 0.6755198836326599, + "learning_rate": 0.00019880025012180923, + "loss": 1.2336, + "step": 32180 + }, + { + "epoch": 0.06839190324001064, + "grad_norm": 0.39165207743644714, + "learning_rate": 0.00019879919802134505, + "loss": 1.216, + "step": 32190 + }, + { + "epoch": 0.0684131495597497, + "grad_norm": 0.4251810908317566, + "learning_rate": 0.0001987981454625586, + "loss": 1.2098, + "step": 32200 + }, + { + "epoch": 0.06843439587948875, + "grad_norm": 0.4273650050163269, + "learning_rate": 0.0001987970924454547, + "loss": 1.2705, + "step": 32210 + }, + { + "epoch": 0.0684556421992278, + "grad_norm": 0.3594929575920105, + "learning_rate": 0.0001987960389700383, + "loss": 1.2441, + "step": 32220 + }, + { + "epoch": 0.06847688851896686, + "grad_norm": 0.4586970806121826, + "learning_rate": 0.00019879498503631423, + "loss": 1.199, + "step": 32230 + }, + { + "epoch": 0.06849813483870591, + "grad_norm": 0.4137786030769348, + "learning_rate": 0.00019879393064428742, + "loss": 1.2158, + "step": 32240 + }, + { + "epoch": 0.06851938115844496, + "grad_norm": 0.3191899359226227, + "learning_rate": 0.00019879287579396273, + "loss": 1.2255, + "step": 32250 + }, + { + "epoch": 0.06854062747818401, + "grad_norm": 0.5826418995857239, + "learning_rate": 0.00019879182048534506, + "loss": 1.205, + "step": 32260 + }, + { + "epoch": 0.06856187379792307, + "grad_norm": 0.456476092338562, + "learning_rate": 0.00019879076471843933, + "loss": 1.223, + "step": 32270 + }, + { + "epoch": 0.06858312011766211, + "grad_norm": 0.32271862030029297, + "learning_rate": 0.0001987897084932504, + "loss": 1.2097, + "step": 32280 + }, + { + "epoch": 0.06860436643740117, + "grad_norm": 0.9855813980102539, + "learning_rate": 0.00019878865180978318, + "loss": 1.2104, + "step": 32290 + }, + { + "epoch": 0.06862561275714023, + "grad_norm": 0.3422226905822754, + "learning_rate": 0.0001987875946680426, + "loss": 1.1876, + "step": 32300 + }, + { + "epoch": 0.06864685907687927, + "grad_norm": 0.3616737723350525, + "learning_rate": 0.00019878653706803355, + "loss": 1.2244, + "step": 32310 + }, + { + "epoch": 0.06866810539661833, + "grad_norm": 0.36594223976135254, + "learning_rate": 0.00019878547900976094, + "loss": 1.2151, + "step": 32320 + }, + { + "epoch": 0.06868935171635739, + "grad_norm": 0.41161757707595825, + "learning_rate": 0.00019878442049322965, + "loss": 1.1925, + "step": 32330 + }, + { + "epoch": 0.06871059803609643, + "grad_norm": 0.5138816237449646, + "learning_rate": 0.0001987833615184446, + "loss": 1.2285, + "step": 32340 + }, + { + "epoch": 0.06873184435583549, + "grad_norm": 0.40018230676651, + "learning_rate": 0.00019878230208541074, + "loss": 1.233, + "step": 32350 + }, + { + "epoch": 0.06875309067557454, + "grad_norm": 0.4734302759170532, + "learning_rate": 0.00019878124219413295, + "loss": 1.2516, + "step": 32360 + }, + { + "epoch": 0.06877433699531359, + "grad_norm": 0.5271779894828796, + "learning_rate": 0.00019878018184461612, + "loss": 1.2374, + "step": 32370 + }, + { + "epoch": 0.06879558331505264, + "grad_norm": 0.5301005244255066, + "learning_rate": 0.00019877912103686524, + "loss": 1.2319, + "step": 32380 + }, + { + "epoch": 0.0688168296347917, + "grad_norm": 0.3345481753349304, + "learning_rate": 0.00019877805977088517, + "loss": 1.2376, + "step": 32390 + }, + { + "epoch": 0.06883807595453074, + "grad_norm": 0.3511449098587036, + "learning_rate": 0.00019877699804668087, + "loss": 1.2641, + "step": 32400 + }, + { + "epoch": 0.0688593222742698, + "grad_norm": 0.47739583253860474, + "learning_rate": 0.00019877593586425726, + "loss": 1.2283, + "step": 32410 + }, + { + "epoch": 0.06888056859400886, + "grad_norm": 0.3650372326374054, + "learning_rate": 0.00019877487322361926, + "loss": 1.2321, + "step": 32420 + }, + { + "epoch": 0.0689018149137479, + "grad_norm": 0.3381108343601227, + "learning_rate": 0.00019877381012477178, + "loss": 1.2303, + "step": 32430 + }, + { + "epoch": 0.06892306123348696, + "grad_norm": 0.42160239815711975, + "learning_rate": 0.00019877274656771978, + "loss": 1.2085, + "step": 32440 + }, + { + "epoch": 0.06894430755322602, + "grad_norm": 0.4235151410102844, + "learning_rate": 0.0001987716825524682, + "loss": 1.2166, + "step": 32450 + }, + { + "epoch": 0.06896555387296507, + "grad_norm": 0.33615973591804504, + "learning_rate": 0.00019877061807902193, + "loss": 1.2255, + "step": 32460 + }, + { + "epoch": 0.06898680019270412, + "grad_norm": 0.344024658203125, + "learning_rate": 0.00019876955314738598, + "loss": 1.214, + "step": 32470 + }, + { + "epoch": 0.06900804651244317, + "grad_norm": 0.518794596195221, + "learning_rate": 0.0001987684877575652, + "loss": 1.2248, + "step": 32480 + }, + { + "epoch": 0.06902929283218223, + "grad_norm": 0.34122511744499207, + "learning_rate": 0.00019876742190956458, + "loss": 1.1912, + "step": 32490 + }, + { + "epoch": 0.06905053915192128, + "grad_norm": 0.5984306931495667, + "learning_rate": 0.0001987663556033891, + "loss": 1.2252, + "step": 32500 + }, + { + "epoch": 0.06907178547166033, + "grad_norm": 0.4476170241832733, + "learning_rate": 0.00019876528883904366, + "loss": 1.2235, + "step": 32510 + }, + { + "epoch": 0.06909303179139939, + "grad_norm": 0.5254939198493958, + "learning_rate": 0.0001987642216165332, + "loss": 1.2155, + "step": 32520 + }, + { + "epoch": 0.06911427811113843, + "grad_norm": 0.5182522535324097, + "learning_rate": 0.00019876315393586272, + "loss": 1.24, + "step": 32530 + }, + { + "epoch": 0.06913552443087749, + "grad_norm": 0.3628413677215576, + "learning_rate": 0.00019876208579703715, + "loss": 1.2134, + "step": 32540 + }, + { + "epoch": 0.06915677075061655, + "grad_norm": 0.34988585114479065, + "learning_rate": 0.0001987610172000614, + "loss": 1.2608, + "step": 32550 + }, + { + "epoch": 0.06917801707035559, + "grad_norm": 0.4641767144203186, + "learning_rate": 0.0001987599481449405, + "loss": 1.2426, + "step": 32560 + }, + { + "epoch": 0.06919926339009465, + "grad_norm": 0.3473912477493286, + "learning_rate": 0.00019875887863167937, + "loss": 1.2082, + "step": 32570 + }, + { + "epoch": 0.0692205097098337, + "grad_norm": 0.40970173478126526, + "learning_rate": 0.00019875780866028295, + "loss": 1.2103, + "step": 32580 + }, + { + "epoch": 0.06924175602957275, + "grad_norm": 0.35436293482780457, + "learning_rate": 0.00019875673823075625, + "loss": 1.1977, + "step": 32590 + }, + { + "epoch": 0.0692630023493118, + "grad_norm": 0.4913097620010376, + "learning_rate": 0.00019875566734310423, + "loss": 1.2344, + "step": 32600 + }, + { + "epoch": 0.06928424866905086, + "grad_norm": 0.3913101553916931, + "learning_rate": 0.00019875459599733182, + "loss": 1.2448, + "step": 32610 + }, + { + "epoch": 0.0693054949887899, + "grad_norm": 0.5692382454872131, + "learning_rate": 0.00019875352419344405, + "loss": 1.2309, + "step": 32620 + }, + { + "epoch": 0.06932674130852896, + "grad_norm": 0.4893902540206909, + "learning_rate": 0.0001987524519314458, + "loss": 1.2854, + "step": 32630 + }, + { + "epoch": 0.06934798762826802, + "grad_norm": 0.4349972903728485, + "learning_rate": 0.00019875137921134215, + "loss": 1.2229, + "step": 32640 + }, + { + "epoch": 0.06936923394800706, + "grad_norm": 0.40219631791114807, + "learning_rate": 0.00019875030603313797, + "loss": 1.27, + "step": 32650 + }, + { + "epoch": 0.06939048026774612, + "grad_norm": 0.3238067626953125, + "learning_rate": 0.00019874923239683834, + "loss": 1.2676, + "step": 32660 + }, + { + "epoch": 0.06941172658748518, + "grad_norm": 0.34776532649993896, + "learning_rate": 0.0001987481583024482, + "loss": 1.2473, + "step": 32670 + }, + { + "epoch": 0.06943297290722422, + "grad_norm": 0.38621100783348083, + "learning_rate": 0.0001987470837499725, + "loss": 1.2487, + "step": 32680 + }, + { + "epoch": 0.06945421922696328, + "grad_norm": 0.6418294906616211, + "learning_rate": 0.00019874600873941628, + "loss": 1.2828, + "step": 32690 + }, + { + "epoch": 0.06947546554670234, + "grad_norm": 0.36192604899406433, + "learning_rate": 0.00019874493327078449, + "loss": 1.2176, + "step": 32700 + }, + { + "epoch": 0.06949671186644138, + "grad_norm": 0.3541179299354553, + "learning_rate": 0.00019874385734408207, + "loss": 1.259, + "step": 32710 + }, + { + "epoch": 0.06951795818618044, + "grad_norm": 0.6039848327636719, + "learning_rate": 0.00019874278095931413, + "loss": 1.2464, + "step": 32720 + }, + { + "epoch": 0.0695392045059195, + "grad_norm": 0.33190181851387024, + "learning_rate": 0.00019874170411648558, + "loss": 1.243, + "step": 32730 + }, + { + "epoch": 0.06956045082565854, + "grad_norm": 0.40935924649238586, + "learning_rate": 0.00019874062681560147, + "loss": 1.2187, + "step": 32740 + }, + { + "epoch": 0.0695816971453976, + "grad_norm": 0.4332648813724518, + "learning_rate": 0.00019873954905666675, + "loss": 1.2261, + "step": 32750 + }, + { + "epoch": 0.06960294346513665, + "grad_norm": 0.35937052965164185, + "learning_rate": 0.00019873847083968642, + "loss": 1.2429, + "step": 32760 + }, + { + "epoch": 0.0696241897848757, + "grad_norm": 0.35580235719680786, + "learning_rate": 0.00019873739216466548, + "loss": 1.2333, + "step": 32770 + }, + { + "epoch": 0.06964543610461475, + "grad_norm": 0.3913087248802185, + "learning_rate": 0.000198736313031609, + "loss": 1.216, + "step": 32780 + }, + { + "epoch": 0.06966668242435381, + "grad_norm": 0.3291832506656647, + "learning_rate": 0.00019873523344052192, + "loss": 1.2483, + "step": 32790 + }, + { + "epoch": 0.06968792874409285, + "grad_norm": 0.3064376711845398, + "learning_rate": 0.00019873415339140927, + "loss": 1.2656, + "step": 32800 + }, + { + "epoch": 0.06970917506383191, + "grad_norm": 0.359287291765213, + "learning_rate": 0.00019873307288427602, + "loss": 1.2734, + "step": 32810 + }, + { + "epoch": 0.06973042138357097, + "grad_norm": 0.5374619364738464, + "learning_rate": 0.00019873199191912727, + "loss": 1.2412, + "step": 32820 + }, + { + "epoch": 0.06975166770331001, + "grad_norm": 0.4031255841255188, + "learning_rate": 0.00019873091049596794, + "loss": 1.2363, + "step": 32830 + }, + { + "epoch": 0.06977291402304907, + "grad_norm": 0.3195471465587616, + "learning_rate": 0.0001987298286148031, + "loss": 1.2013, + "step": 32840 + }, + { + "epoch": 0.06979416034278813, + "grad_norm": 0.43756672739982605, + "learning_rate": 0.00019872874627563776, + "loss": 1.2178, + "step": 32850 + }, + { + "epoch": 0.06981540666252717, + "grad_norm": 0.3530104160308838, + "learning_rate": 0.00019872766347847694, + "loss": 1.2009, + "step": 32860 + }, + { + "epoch": 0.06983665298226623, + "grad_norm": 0.33876338601112366, + "learning_rate": 0.00019872658022332567, + "loss": 1.2148, + "step": 32870 + }, + { + "epoch": 0.06985789930200528, + "grad_norm": 0.34256261587142944, + "learning_rate": 0.00019872549651018894, + "loss": 1.2281, + "step": 32880 + }, + { + "epoch": 0.06987914562174434, + "grad_norm": 0.6657720804214478, + "learning_rate": 0.00019872441233907183, + "loss": 1.2422, + "step": 32890 + }, + { + "epoch": 0.06990039194148338, + "grad_norm": 0.35189753770828247, + "learning_rate": 0.0001987233277099793, + "loss": 1.2464, + "step": 32900 + }, + { + "epoch": 0.06992163826122244, + "grad_norm": 0.33395475149154663, + "learning_rate": 0.0001987222426229165, + "loss": 1.2071, + "step": 32910 + }, + { + "epoch": 0.0699428845809615, + "grad_norm": 0.5220555067062378, + "learning_rate": 0.00019872115707788833, + "loss": 1.2305, + "step": 32920 + }, + { + "epoch": 0.06996413090070054, + "grad_norm": 0.38290277123451233, + "learning_rate": 0.0001987200710748999, + "loss": 1.1888, + "step": 32930 + }, + { + "epoch": 0.0699853772204396, + "grad_norm": 0.47275856137275696, + "learning_rate": 0.00019871898461395625, + "loss": 1.2196, + "step": 32940 + }, + { + "epoch": 0.07000662354017866, + "grad_norm": 0.5110464096069336, + "learning_rate": 0.00019871789769506238, + "loss": 1.2261, + "step": 32950 + }, + { + "epoch": 0.0700278698599177, + "grad_norm": 0.44990453124046326, + "learning_rate": 0.00019871681031822337, + "loss": 1.2307, + "step": 32960 + }, + { + "epoch": 0.07004911617965676, + "grad_norm": 0.3313467502593994, + "learning_rate": 0.00019871572248344422, + "loss": 1.2207, + "step": 32970 + }, + { + "epoch": 0.07007036249939581, + "grad_norm": 0.4326070249080658, + "learning_rate": 0.00019871463419073005, + "loss": 1.2064, + "step": 32980 + }, + { + "epoch": 0.07009160881913486, + "grad_norm": 0.6456534266471863, + "learning_rate": 0.00019871354544008583, + "loss": 1.2847, + "step": 32990 + }, + { + "epoch": 0.07011285513887391, + "grad_norm": 0.3791135251522064, + "learning_rate": 0.00019871245623151663, + "loss": 1.2389, + "step": 33000 + }, + { + "epoch": 0.07013410145861297, + "grad_norm": 0.370647132396698, + "learning_rate": 0.00019871136656502753, + "loss": 1.2829, + "step": 33010 + }, + { + "epoch": 0.07015534777835201, + "grad_norm": 0.4415460228919983, + "learning_rate": 0.0001987102764406236, + "loss": 1.2353, + "step": 33020 + }, + { + "epoch": 0.07017659409809107, + "grad_norm": 0.5725070834159851, + "learning_rate": 0.00019870918585830985, + "loss": 1.2419, + "step": 33030 + }, + { + "epoch": 0.07019784041783013, + "grad_norm": 0.41225147247314453, + "learning_rate": 0.00019870809481809134, + "loss": 1.265, + "step": 33040 + }, + { + "epoch": 0.07021908673756917, + "grad_norm": 0.48030129075050354, + "learning_rate": 0.00019870700331997316, + "loss": 1.21, + "step": 33050 + }, + { + "epoch": 0.07024033305730823, + "grad_norm": 0.37782368063926697, + "learning_rate": 0.00019870591136396038, + "loss": 1.2331, + "step": 33060 + }, + { + "epoch": 0.07026157937704729, + "grad_norm": 0.38193291425704956, + "learning_rate": 0.00019870481895005804, + "loss": 1.2534, + "step": 33070 + }, + { + "epoch": 0.07028282569678633, + "grad_norm": 0.3599178194999695, + "learning_rate": 0.00019870372607827118, + "loss": 1.196, + "step": 33080 + }, + { + "epoch": 0.07030407201652539, + "grad_norm": 0.3789765238761902, + "learning_rate": 0.00019870263274860493, + "loss": 1.1955, + "step": 33090 + }, + { + "epoch": 0.07032531833626444, + "grad_norm": 0.4714445173740387, + "learning_rate": 0.00019870153896106436, + "loss": 1.1895, + "step": 33100 + }, + { + "epoch": 0.07034656465600349, + "grad_norm": 0.41988682746887207, + "learning_rate": 0.0001987004447156545, + "loss": 1.2365, + "step": 33110 + }, + { + "epoch": 0.07036781097574255, + "grad_norm": 0.45189395546913147, + "learning_rate": 0.00019869935001238048, + "loss": 1.1997, + "step": 33120 + }, + { + "epoch": 0.0703890572954816, + "grad_norm": 0.7963657379150391, + "learning_rate": 0.00019869825485124728, + "loss": 1.2521, + "step": 33130 + }, + { + "epoch": 0.07041030361522065, + "grad_norm": 0.458196222782135, + "learning_rate": 0.0001986971592322601, + "loss": 1.2539, + "step": 33140 + }, + { + "epoch": 0.0704315499349597, + "grad_norm": 0.4677490293979645, + "learning_rate": 0.00019869606315542395, + "loss": 1.231, + "step": 33150 + }, + { + "epoch": 0.07045279625469876, + "grad_norm": 0.3845463991165161, + "learning_rate": 0.00019869496662074392, + "loss": 1.2369, + "step": 33160 + }, + { + "epoch": 0.0704740425744378, + "grad_norm": 0.3592968285083771, + "learning_rate": 0.00019869386962822515, + "loss": 1.2252, + "step": 33170 + }, + { + "epoch": 0.07049528889417686, + "grad_norm": 0.3051091134548187, + "learning_rate": 0.00019869277217787263, + "loss": 1.2638, + "step": 33180 + }, + { + "epoch": 0.07051653521391592, + "grad_norm": 0.3708834946155548, + "learning_rate": 0.00019869167426969154, + "loss": 1.2465, + "step": 33190 + }, + { + "epoch": 0.07053778153365496, + "grad_norm": 0.39254045486450195, + "learning_rate": 0.00019869057590368697, + "loss": 1.23, + "step": 33200 + }, + { + "epoch": 0.07055902785339402, + "grad_norm": 0.47122544050216675, + "learning_rate": 0.00019868947707986397, + "loss": 1.2549, + "step": 33210 + }, + { + "epoch": 0.07058027417313308, + "grad_norm": 0.375845342874527, + "learning_rate": 0.00019868837779822763, + "loss": 1.2396, + "step": 33220 + }, + { + "epoch": 0.07060152049287212, + "grad_norm": 0.3566604554653168, + "learning_rate": 0.0001986872780587831, + "loss": 1.2357, + "step": 33230 + }, + { + "epoch": 0.07062276681261118, + "grad_norm": 0.4753573536872864, + "learning_rate": 0.00019868617786153545, + "loss": 1.2202, + "step": 33240 + }, + { + "epoch": 0.07064401313235023, + "grad_norm": 0.38075047731399536, + "learning_rate": 0.00019868507720648978, + "loss": 1.2179, + "step": 33250 + }, + { + "epoch": 0.07066525945208928, + "grad_norm": 0.4525986909866333, + "learning_rate": 0.00019868397609365123, + "loss": 1.2675, + "step": 33260 + }, + { + "epoch": 0.07068650577182833, + "grad_norm": 0.457822322845459, + "learning_rate": 0.00019868287452302487, + "loss": 1.2101, + "step": 33270 + }, + { + "epoch": 0.07070775209156739, + "grad_norm": 0.35906845331192017, + "learning_rate": 0.0001986817724946158, + "loss": 1.2349, + "step": 33280 + }, + { + "epoch": 0.07072899841130643, + "grad_norm": 0.47065022587776184, + "learning_rate": 0.0001986806700084292, + "loss": 1.2604, + "step": 33290 + }, + { + "epoch": 0.07075024473104549, + "grad_norm": 0.4813682436943054, + "learning_rate": 0.00019867956706447011, + "loss": 1.2283, + "step": 33300 + }, + { + "epoch": 0.07077149105078455, + "grad_norm": 0.4903818368911743, + "learning_rate": 0.0001986784636627437, + "loss": 1.2313, + "step": 33310 + }, + { + "epoch": 0.0707927373705236, + "grad_norm": 0.42865413427352905, + "learning_rate": 0.00019867735980325506, + "loss": 1.2424, + "step": 33320 + }, + { + "epoch": 0.07081398369026265, + "grad_norm": 0.3813343048095703, + "learning_rate": 0.0001986762554860093, + "loss": 1.2623, + "step": 33330 + }, + { + "epoch": 0.0708352300100017, + "grad_norm": 0.3413366973400116, + "learning_rate": 0.00019867515071101156, + "loss": 1.2557, + "step": 33340 + }, + { + "epoch": 0.07085647632974076, + "grad_norm": 0.5748891830444336, + "learning_rate": 0.000198674045478267, + "loss": 1.2134, + "step": 33350 + }, + { + "epoch": 0.07087772264947981, + "grad_norm": 0.7228661775588989, + "learning_rate": 0.0001986729397877807, + "loss": 1.3058, + "step": 33360 + }, + { + "epoch": 0.07089896896921886, + "grad_norm": 0.38054513931274414, + "learning_rate": 0.00019867183363955776, + "loss": 1.2059, + "step": 33370 + }, + { + "epoch": 0.07092021528895792, + "grad_norm": 0.5895411968231201, + "learning_rate": 0.00019867072703360337, + "loss": 1.2248, + "step": 33380 + }, + { + "epoch": 0.07094146160869697, + "grad_norm": 0.6173971891403198, + "learning_rate": 0.00019866961996992265, + "loss": 1.2176, + "step": 33390 + }, + { + "epoch": 0.07096270792843602, + "grad_norm": 0.4044257402420044, + "learning_rate": 0.00019866851244852075, + "loss": 1.2227, + "step": 33400 + }, + { + "epoch": 0.07098395424817508, + "grad_norm": 0.3779345154762268, + "learning_rate": 0.00019866740446940274, + "loss": 1.2445, + "step": 33410 + }, + { + "epoch": 0.07100520056791412, + "grad_norm": 0.3636798560619354, + "learning_rate": 0.00019866629603257387, + "loss": 1.2555, + "step": 33420 + }, + { + "epoch": 0.07102644688765318, + "grad_norm": 0.5228453278541565, + "learning_rate": 0.0001986651871380392, + "loss": 1.2147, + "step": 33430 + }, + { + "epoch": 0.07104769320739224, + "grad_norm": 0.45078161358833313, + "learning_rate": 0.0001986640777858039, + "loss": 1.2198, + "step": 33440 + }, + { + "epoch": 0.07106893952713128, + "grad_norm": 0.5667153000831604, + "learning_rate": 0.0001986629679758731, + "loss": 1.2257, + "step": 33450 + }, + { + "epoch": 0.07109018584687034, + "grad_norm": 0.3710997700691223, + "learning_rate": 0.00019866185770825193, + "loss": 1.2395, + "step": 33460 + }, + { + "epoch": 0.0711114321666094, + "grad_norm": 0.3778136670589447, + "learning_rate": 0.0001986607469829456, + "loss": 1.2313, + "step": 33470 + }, + { + "epoch": 0.07113267848634844, + "grad_norm": 0.4489549398422241, + "learning_rate": 0.00019865963579995924, + "loss": 1.2447, + "step": 33480 + }, + { + "epoch": 0.0711539248060875, + "grad_norm": 0.42470452189445496, + "learning_rate": 0.00019865852415929796, + "loss": 1.2203, + "step": 33490 + }, + { + "epoch": 0.07117517112582655, + "grad_norm": 0.33403870463371277, + "learning_rate": 0.000198657412060967, + "loss": 1.253, + "step": 33500 + }, + { + "epoch": 0.0711964174455656, + "grad_norm": 0.6452353596687317, + "learning_rate": 0.00019865629950497143, + "loss": 1.2564, + "step": 33510 + }, + { + "epoch": 0.07121766376530465, + "grad_norm": 0.5584948658943176, + "learning_rate": 0.0001986551864913165, + "loss": 1.1994, + "step": 33520 + }, + { + "epoch": 0.07123891008504371, + "grad_norm": 0.42109060287475586, + "learning_rate": 0.00019865407302000732, + "loss": 1.2441, + "step": 33530 + }, + { + "epoch": 0.07126015640478275, + "grad_norm": 0.431111603975296, + "learning_rate": 0.00019865295909104905, + "loss": 1.248, + "step": 33540 + }, + { + "epoch": 0.07128140272452181, + "grad_norm": 0.3837434649467468, + "learning_rate": 0.00019865184470444684, + "loss": 1.201, + "step": 33550 + }, + { + "epoch": 0.07130264904426087, + "grad_norm": 0.35547739267349243, + "learning_rate": 0.0001986507298602059, + "loss": 1.2466, + "step": 33560 + }, + { + "epoch": 0.07132389536399991, + "grad_norm": 0.36903882026672363, + "learning_rate": 0.00019864961455833143, + "loss": 1.22, + "step": 33570 + }, + { + "epoch": 0.07134514168373897, + "grad_norm": 0.4302821755409241, + "learning_rate": 0.00019864849879882855, + "loss": 1.1965, + "step": 33580 + }, + { + "epoch": 0.07136638800347803, + "grad_norm": 0.601155698299408, + "learning_rate": 0.00019864738258170244, + "loss": 1.25, + "step": 33590 + }, + { + "epoch": 0.07138763432321707, + "grad_norm": 0.38654640316963196, + "learning_rate": 0.00019864626590695829, + "loss": 1.217, + "step": 33600 + }, + { + "epoch": 0.07140888064295613, + "grad_norm": 0.36698129773139954, + "learning_rate": 0.0001986451487746013, + "loss": 1.2138, + "step": 33610 + }, + { + "epoch": 0.07143012696269518, + "grad_norm": 0.4310432970523834, + "learning_rate": 0.00019864403118463657, + "loss": 1.2282, + "step": 33620 + }, + { + "epoch": 0.07145137328243423, + "grad_norm": 0.3667338490486145, + "learning_rate": 0.00019864291313706936, + "loss": 1.1755, + "step": 33630 + }, + { + "epoch": 0.07147261960217328, + "grad_norm": 0.3826691806316376, + "learning_rate": 0.0001986417946319049, + "loss": 1.2239, + "step": 33640 + }, + { + "epoch": 0.07149386592191234, + "grad_norm": 0.414155513048172, + "learning_rate": 0.00019864067566914827, + "loss": 1.2287, + "step": 33650 + }, + { + "epoch": 0.07151511224165139, + "grad_norm": 0.45362719893455505, + "learning_rate": 0.00019863955624880474, + "loss": 1.2241, + "step": 33660 + }, + { + "epoch": 0.07153635856139044, + "grad_norm": 0.461302787065506, + "learning_rate": 0.00019863843637087946, + "loss": 1.2057, + "step": 33670 + }, + { + "epoch": 0.0715576048811295, + "grad_norm": 0.41498106718063354, + "learning_rate": 0.00019863731603537765, + "loss": 1.2582, + "step": 33680 + }, + { + "epoch": 0.07157885120086854, + "grad_norm": 0.5573539733886719, + "learning_rate": 0.0001986361952423045, + "loss": 1.2134, + "step": 33690 + }, + { + "epoch": 0.0716000975206076, + "grad_norm": 0.3583163022994995, + "learning_rate": 0.00019863507399166518, + "loss": 1.219, + "step": 33700 + }, + { + "epoch": 0.07162134384034666, + "grad_norm": 0.3304307758808136, + "learning_rate": 0.00019863395228346495, + "loss": 1.2013, + "step": 33710 + }, + { + "epoch": 0.07164259016008571, + "grad_norm": 0.3515929877758026, + "learning_rate": 0.00019863283011770894, + "loss": 1.2533, + "step": 33720 + }, + { + "epoch": 0.07166383647982476, + "grad_norm": 0.36305758357048035, + "learning_rate": 0.00019863170749440245, + "loss": 1.2382, + "step": 33730 + }, + { + "epoch": 0.07168508279956382, + "grad_norm": 0.434404581785202, + "learning_rate": 0.00019863058441355058, + "loss": 1.2259, + "step": 33740 + }, + { + "epoch": 0.07170632911930287, + "grad_norm": 0.43094637989997864, + "learning_rate": 0.00019862946087515865, + "loss": 1.2403, + "step": 33750 + }, + { + "epoch": 0.07172757543904192, + "grad_norm": 0.41779521107673645, + "learning_rate": 0.00019862833687923177, + "loss": 1.1936, + "step": 33760 + }, + { + "epoch": 0.07174882175878097, + "grad_norm": 0.3688085675239563, + "learning_rate": 0.00019862721242577523, + "loss": 1.2789, + "step": 33770 + }, + { + "epoch": 0.07177006807852003, + "grad_norm": 0.3848753273487091, + "learning_rate": 0.0001986260875147942, + "loss": 1.2377, + "step": 33780 + }, + { + "epoch": 0.07179131439825907, + "grad_norm": 0.39804282784461975, + "learning_rate": 0.0001986249621462939, + "loss": 1.2075, + "step": 33790 + }, + { + "epoch": 0.07181256071799813, + "grad_norm": 0.35793712735176086, + "learning_rate": 0.0001986238363202796, + "loss": 1.2534, + "step": 33800 + }, + { + "epoch": 0.07183380703773719, + "grad_norm": 0.4061836004257202, + "learning_rate": 0.00019862271003675647, + "loss": 1.2642, + "step": 33810 + }, + { + "epoch": 0.07185505335747623, + "grad_norm": 0.44560176134109497, + "learning_rate": 0.00019862158329572976, + "loss": 1.2299, + "step": 33820 + }, + { + "epoch": 0.07187629967721529, + "grad_norm": 0.4921417832374573, + "learning_rate": 0.00019862045609720465, + "loss": 1.2452, + "step": 33830 + }, + { + "epoch": 0.07189754599695435, + "grad_norm": 0.33832865953445435, + "learning_rate": 0.00019861932844118644, + "loss": 1.1924, + "step": 33840 + }, + { + "epoch": 0.07191879231669339, + "grad_norm": 0.35457542538642883, + "learning_rate": 0.00019861820032768035, + "loss": 1.2122, + "step": 33850 + }, + { + "epoch": 0.07194003863643245, + "grad_norm": 0.561398983001709, + "learning_rate": 0.00019861707175669157, + "loss": 1.2263, + "step": 33860 + }, + { + "epoch": 0.0719612849561715, + "grad_norm": 0.4238891899585724, + "learning_rate": 0.00019861594272822537, + "loss": 1.2172, + "step": 33870 + }, + { + "epoch": 0.07198253127591055, + "grad_norm": 0.33776575326919556, + "learning_rate": 0.00019861481324228697, + "loss": 1.2039, + "step": 33880 + }, + { + "epoch": 0.0720037775956496, + "grad_norm": 0.5449756979942322, + "learning_rate": 0.0001986136832988816, + "loss": 1.2196, + "step": 33890 + }, + { + "epoch": 0.07202502391538866, + "grad_norm": 0.41941067576408386, + "learning_rate": 0.00019861255289801454, + "loss": 1.2297, + "step": 33900 + }, + { + "epoch": 0.0720462702351277, + "grad_norm": 0.39166373014450073, + "learning_rate": 0.000198611422039691, + "loss": 1.2531, + "step": 33910 + }, + { + "epoch": 0.07206751655486676, + "grad_norm": 0.47778570652008057, + "learning_rate": 0.00019861029072391624, + "loss": 1.2212, + "step": 33920 + }, + { + "epoch": 0.07208876287460582, + "grad_norm": 0.46040454506874084, + "learning_rate": 0.00019860915895069552, + "loss": 1.2318, + "step": 33930 + }, + { + "epoch": 0.07211000919434486, + "grad_norm": 0.42116597294807434, + "learning_rate": 0.00019860802672003406, + "loss": 1.2336, + "step": 33940 + }, + { + "epoch": 0.07213125551408392, + "grad_norm": 0.36397162079811096, + "learning_rate": 0.00019860689403193712, + "loss": 1.2683, + "step": 33950 + }, + { + "epoch": 0.07215250183382298, + "grad_norm": 0.33128687739372253, + "learning_rate": 0.00019860576088640998, + "loss": 1.2532, + "step": 33960 + }, + { + "epoch": 0.07217374815356202, + "grad_norm": 0.36512064933776855, + "learning_rate": 0.00019860462728345785, + "loss": 1.2079, + "step": 33970 + }, + { + "epoch": 0.07219499447330108, + "grad_norm": 0.4543378949165344, + "learning_rate": 0.00019860349322308607, + "loss": 1.2271, + "step": 33980 + }, + { + "epoch": 0.07221624079304013, + "grad_norm": 0.3809322714805603, + "learning_rate": 0.0001986023587052998, + "loss": 1.2466, + "step": 33990 + }, + { + "epoch": 0.07223748711277918, + "grad_norm": 0.43434441089630127, + "learning_rate": 0.00019860122373010437, + "loss": 1.2259, + "step": 34000 + }, + { + "epoch": 0.07225873343251824, + "grad_norm": 0.43646785616874695, + "learning_rate": 0.00019860008829750503, + "loss": 1.2646, + "step": 34010 + }, + { + "epoch": 0.07227997975225729, + "grad_norm": 0.5105354189872742, + "learning_rate": 0.000198598952407507, + "loss": 1.2399, + "step": 34020 + }, + { + "epoch": 0.07230122607199634, + "grad_norm": 0.5997035503387451, + "learning_rate": 0.00019859781606011565, + "loss": 1.2359, + "step": 34030 + }, + { + "epoch": 0.07232247239173539, + "grad_norm": 0.35758188366889954, + "learning_rate": 0.00019859667925533616, + "loss": 1.2199, + "step": 34040 + }, + { + "epoch": 0.07234371871147445, + "grad_norm": 0.3694457709789276, + "learning_rate": 0.00019859554199317384, + "loss": 1.1971, + "step": 34050 + }, + { + "epoch": 0.0723649650312135, + "grad_norm": 0.35212692618370056, + "learning_rate": 0.00019859440427363398, + "loss": 1.2179, + "step": 34060 + }, + { + "epoch": 0.07238621135095255, + "grad_norm": 0.4131060242652893, + "learning_rate": 0.00019859326609672183, + "loss": 1.24, + "step": 34070 + }, + { + "epoch": 0.07240745767069161, + "grad_norm": 0.43652620911598206, + "learning_rate": 0.0001985921274624427, + "loss": 1.2376, + "step": 34080 + }, + { + "epoch": 0.07242870399043065, + "grad_norm": 0.3836889863014221, + "learning_rate": 0.00019859098837080182, + "loss": 1.2294, + "step": 34090 + }, + { + "epoch": 0.07244995031016971, + "grad_norm": 0.4708402752876282, + "learning_rate": 0.00019858984882180456, + "loss": 1.2356, + "step": 34100 + }, + { + "epoch": 0.07247119662990877, + "grad_norm": 0.8221877217292786, + "learning_rate": 0.0001985887088154561, + "loss": 1.2372, + "step": 34110 + }, + { + "epoch": 0.07249244294964781, + "grad_norm": 0.4660656154155731, + "learning_rate": 0.00019858756835176183, + "loss": 1.2419, + "step": 34120 + }, + { + "epoch": 0.07251368926938687, + "grad_norm": 0.36453962326049805, + "learning_rate": 0.00019858642743072696, + "loss": 1.2413, + "step": 34130 + }, + { + "epoch": 0.07253493558912592, + "grad_norm": 0.3455718755722046, + "learning_rate": 0.0001985852860523568, + "loss": 1.2633, + "step": 34140 + }, + { + "epoch": 0.07255618190886498, + "grad_norm": 0.36416712403297424, + "learning_rate": 0.0001985841442166567, + "loss": 1.1796, + "step": 34150 + }, + { + "epoch": 0.07257742822860402, + "grad_norm": 0.4139789640903473, + "learning_rate": 0.0001985830019236319, + "loss": 1.2365, + "step": 34160 + }, + { + "epoch": 0.07259867454834308, + "grad_norm": 0.3392743468284607, + "learning_rate": 0.00019858185917328772, + "loss": 1.2336, + "step": 34170 + }, + { + "epoch": 0.07261992086808214, + "grad_norm": 0.6351392865180969, + "learning_rate": 0.00019858071596562943, + "loss": 1.2119, + "step": 34180 + }, + { + "epoch": 0.07264116718782118, + "grad_norm": 0.4857178330421448, + "learning_rate": 0.0001985795723006624, + "loss": 1.2491, + "step": 34190 + }, + { + "epoch": 0.07266241350756024, + "grad_norm": 0.3396761417388916, + "learning_rate": 0.00019857842817839187, + "loss": 1.2266, + "step": 34200 + }, + { + "epoch": 0.0726836598272993, + "grad_norm": 0.3927582800388336, + "learning_rate": 0.00019857728359882318, + "loss": 1.205, + "step": 34210 + }, + { + "epoch": 0.07270490614703834, + "grad_norm": 0.6568551659584045, + "learning_rate": 0.00019857613856196163, + "loss": 1.2614, + "step": 34220 + }, + { + "epoch": 0.0727261524667774, + "grad_norm": 0.4739164412021637, + "learning_rate": 0.00019857499306781253, + "loss": 1.235, + "step": 34230 + }, + { + "epoch": 0.07274739878651645, + "grad_norm": 0.33490005135536194, + "learning_rate": 0.0001985738471163812, + "loss": 1.2019, + "step": 34240 + }, + { + "epoch": 0.0727686451062555, + "grad_norm": 0.39640456438064575, + "learning_rate": 0.00019857270070767298, + "loss": 1.185, + "step": 34250 + }, + { + "epoch": 0.07278989142599455, + "grad_norm": 0.7934435606002808, + "learning_rate": 0.00019857155384169314, + "loss": 1.1973, + "step": 34260 + }, + { + "epoch": 0.07281113774573361, + "grad_norm": 0.5891510248184204, + "learning_rate": 0.00019857040651844702, + "loss": 1.234, + "step": 34270 + }, + { + "epoch": 0.07283238406547266, + "grad_norm": 0.4184127449989319, + "learning_rate": 0.00019856925873793993, + "loss": 1.2193, + "step": 34280 + }, + { + "epoch": 0.07285363038521171, + "grad_norm": 0.3584209382534027, + "learning_rate": 0.00019856811050017722, + "loss": 1.2206, + "step": 34290 + }, + { + "epoch": 0.07287487670495077, + "grad_norm": 0.36996302008628845, + "learning_rate": 0.00019856696180516422, + "loss": 1.2329, + "step": 34300 + }, + { + "epoch": 0.07289612302468981, + "grad_norm": 0.3539191484451294, + "learning_rate": 0.00019856581265290625, + "loss": 1.2528, + "step": 34310 + }, + { + "epoch": 0.07291736934442887, + "grad_norm": 0.3500705659389496, + "learning_rate": 0.00019856466304340865, + "loss": 1.2284, + "step": 34320 + }, + { + "epoch": 0.07293861566416793, + "grad_norm": 0.4572902321815491, + "learning_rate": 0.0001985635129766767, + "loss": 1.1681, + "step": 34330 + }, + { + "epoch": 0.07295986198390697, + "grad_norm": 0.42781734466552734, + "learning_rate": 0.0001985623624527158, + "loss": 1.2385, + "step": 34340 + }, + { + "epoch": 0.07298110830364603, + "grad_norm": 0.3904256522655487, + "learning_rate": 0.00019856121147153122, + "loss": 1.2329, + "step": 34350 + }, + { + "epoch": 0.07300235462338509, + "grad_norm": 0.39115744829177856, + "learning_rate": 0.00019856006003312837, + "loss": 1.2008, + "step": 34360 + }, + { + "epoch": 0.07302360094312413, + "grad_norm": 0.532244086265564, + "learning_rate": 0.00019855890813751256, + "loss": 1.2013, + "step": 34370 + }, + { + "epoch": 0.07304484726286319, + "grad_norm": 0.37438759207725525, + "learning_rate": 0.00019855775578468915, + "loss": 1.2038, + "step": 34380 + }, + { + "epoch": 0.07306609358260224, + "grad_norm": 0.4257456958293915, + "learning_rate": 0.00019855660297466346, + "loss": 1.1897, + "step": 34390 + }, + { + "epoch": 0.07308733990234129, + "grad_norm": 0.4385432004928589, + "learning_rate": 0.00019855544970744085, + "loss": 1.2038, + "step": 34400 + }, + { + "epoch": 0.07310858622208034, + "grad_norm": 0.4031370282173157, + "learning_rate": 0.00019855429598302667, + "loss": 1.2418, + "step": 34410 + }, + { + "epoch": 0.0731298325418194, + "grad_norm": 0.34894484281539917, + "learning_rate": 0.00019855314180142623, + "loss": 1.2592, + "step": 34420 + }, + { + "epoch": 0.07315107886155844, + "grad_norm": 1.0257186889648438, + "learning_rate": 0.000198551987162645, + "loss": 1.185, + "step": 34430 + }, + { + "epoch": 0.0731723251812975, + "grad_norm": 0.3693373203277588, + "learning_rate": 0.00019855083206668818, + "loss": 1.2305, + "step": 34440 + }, + { + "epoch": 0.07319357150103656, + "grad_norm": 0.4217144250869751, + "learning_rate": 0.00019854967651356128, + "loss": 1.2459, + "step": 34450 + }, + { + "epoch": 0.0732148178207756, + "grad_norm": 0.5070161819458008, + "learning_rate": 0.00019854852050326955, + "loss": 1.249, + "step": 34460 + }, + { + "epoch": 0.07323606414051466, + "grad_norm": 0.5093840956687927, + "learning_rate": 0.0001985473640358184, + "loss": 1.1861, + "step": 34470 + }, + { + "epoch": 0.07325731046025372, + "grad_norm": 0.621452808380127, + "learning_rate": 0.00019854620711121318, + "loss": 1.2486, + "step": 34480 + }, + { + "epoch": 0.07327855677999276, + "grad_norm": 0.7561589479446411, + "learning_rate": 0.0001985450497294593, + "loss": 1.2266, + "step": 34490 + }, + { + "epoch": 0.07329980309973182, + "grad_norm": 0.5254666209220886, + "learning_rate": 0.00019854389189056203, + "loss": 1.2069, + "step": 34500 + }, + { + "epoch": 0.07332104941947087, + "grad_norm": 0.3738752603530884, + "learning_rate": 0.00019854273359452685, + "loss": 1.1996, + "step": 34510 + }, + { + "epoch": 0.07334229573920992, + "grad_norm": 0.6407920718193054, + "learning_rate": 0.00019854157484135908, + "loss": 1.2155, + "step": 34520 + }, + { + "epoch": 0.07336354205894897, + "grad_norm": 0.38387444615364075, + "learning_rate": 0.00019854041563106413, + "loss": 1.2233, + "step": 34530 + }, + { + "epoch": 0.07338478837868803, + "grad_norm": 0.42037031054496765, + "learning_rate": 0.00019853925596364732, + "loss": 1.2164, + "step": 34540 + }, + { + "epoch": 0.07340603469842708, + "grad_norm": 0.45129871368408203, + "learning_rate": 0.00019853809583911405, + "loss": 1.2403, + "step": 34550 + }, + { + "epoch": 0.07342728101816613, + "grad_norm": 0.4478568732738495, + "learning_rate": 0.00019853693525746975, + "loss": 1.2358, + "step": 34560 + }, + { + "epoch": 0.07344852733790519, + "grad_norm": 0.3485090434551239, + "learning_rate": 0.00019853577421871972, + "loss": 1.2411, + "step": 34570 + }, + { + "epoch": 0.07346977365764425, + "grad_norm": 0.39591386914253235, + "learning_rate": 0.00019853461272286942, + "loss": 1.2445, + "step": 34580 + }, + { + "epoch": 0.07349101997738329, + "grad_norm": 0.5585940480232239, + "learning_rate": 0.0001985334507699242, + "loss": 1.2341, + "step": 34590 + }, + { + "epoch": 0.07351226629712235, + "grad_norm": 0.4788243770599365, + "learning_rate": 0.00019853228835988945, + "loss": 1.2147, + "step": 34600 + }, + { + "epoch": 0.0735335126168614, + "grad_norm": 0.3713412880897522, + "learning_rate": 0.0001985311254927706, + "loss": 1.2364, + "step": 34610 + }, + { + "epoch": 0.07355475893660045, + "grad_norm": 0.4888056516647339, + "learning_rate": 0.000198529962168573, + "loss": 1.2071, + "step": 34620 + }, + { + "epoch": 0.0735760052563395, + "grad_norm": 0.35647740960121155, + "learning_rate": 0.00019852879838730208, + "loss": 1.2081, + "step": 34630 + }, + { + "epoch": 0.07359725157607856, + "grad_norm": 0.44121676683425903, + "learning_rate": 0.0001985276341489632, + "loss": 1.2147, + "step": 34640 + }, + { + "epoch": 0.0736184978958176, + "grad_norm": 0.37208232283592224, + "learning_rate": 0.0001985264694535618, + "loss": 1.2539, + "step": 34650 + }, + { + "epoch": 0.07363974421555666, + "grad_norm": 0.37532633543014526, + "learning_rate": 0.00019852530430110326, + "loss": 1.2429, + "step": 34660 + }, + { + "epoch": 0.07366099053529572, + "grad_norm": 0.41144585609436035, + "learning_rate": 0.00019852413869159299, + "loss": 1.2137, + "step": 34670 + }, + { + "epoch": 0.07368223685503476, + "grad_norm": 0.3584136962890625, + "learning_rate": 0.00019852297262503639, + "loss": 1.2222, + "step": 34680 + }, + { + "epoch": 0.07370348317477382, + "grad_norm": 0.39143243432044983, + "learning_rate": 0.0001985218061014389, + "loss": 1.1977, + "step": 34690 + }, + { + "epoch": 0.07372472949451288, + "grad_norm": 0.49071449041366577, + "learning_rate": 0.00019852063912080588, + "loss": 1.2551, + "step": 34700 + }, + { + "epoch": 0.07374597581425192, + "grad_norm": 0.99661785364151, + "learning_rate": 0.00019851947168314277, + "loss": 1.2134, + "step": 34710 + }, + { + "epoch": 0.07376722213399098, + "grad_norm": 0.8642682433128357, + "learning_rate": 0.00019851830378845502, + "loss": 1.2568, + "step": 34720 + }, + { + "epoch": 0.07378846845373004, + "grad_norm": 0.36855047941207886, + "learning_rate": 0.000198517135436748, + "loss": 1.1904, + "step": 34730 + }, + { + "epoch": 0.07380971477346908, + "grad_norm": 0.8432472348213196, + "learning_rate": 0.00019851596662802713, + "loss": 1.2288, + "step": 34740 + }, + { + "epoch": 0.07383096109320814, + "grad_norm": 0.5926526784896851, + "learning_rate": 0.00019851479736229788, + "loss": 1.2401, + "step": 34750 + }, + { + "epoch": 0.0738522074129472, + "grad_norm": 0.5711331963539124, + "learning_rate": 0.0001985136276395656, + "loss": 1.2218, + "step": 34760 + }, + { + "epoch": 0.07387345373268624, + "grad_norm": 0.4847463071346283, + "learning_rate": 0.0001985124574598358, + "loss": 1.1938, + "step": 34770 + }, + { + "epoch": 0.0738947000524253, + "grad_norm": 0.587609052658081, + "learning_rate": 0.00019851128682311385, + "loss": 1.1978, + "step": 34780 + }, + { + "epoch": 0.07391594637216435, + "grad_norm": 0.37043872475624084, + "learning_rate": 0.0001985101157294052, + "loss": 1.2118, + "step": 34790 + }, + { + "epoch": 0.0739371926919034, + "grad_norm": 0.3953397572040558, + "learning_rate": 0.00019850894417871526, + "loss": 1.1978, + "step": 34800 + }, + { + "epoch": 0.07395843901164245, + "grad_norm": 0.47948190569877625, + "learning_rate": 0.0001985077721710495, + "loss": 1.1647, + "step": 34810 + }, + { + "epoch": 0.07397968533138151, + "grad_norm": 0.349624902009964, + "learning_rate": 0.00019850659970641337, + "loss": 1.2444, + "step": 34820 + }, + { + "epoch": 0.07400093165112055, + "grad_norm": 0.3725999593734741, + "learning_rate": 0.00019850542678481224, + "loss": 1.2482, + "step": 34830 + }, + { + "epoch": 0.07402217797085961, + "grad_norm": 0.361629456281662, + "learning_rate": 0.0001985042534062516, + "loss": 1.2099, + "step": 34840 + }, + { + "epoch": 0.07404342429059867, + "grad_norm": 0.6133326888084412, + "learning_rate": 0.00019850307957073687, + "loss": 1.2225, + "step": 34850 + }, + { + "epoch": 0.07406467061033771, + "grad_norm": 0.3637382984161377, + "learning_rate": 0.00019850190527827356, + "loss": 1.2341, + "step": 34860 + }, + { + "epoch": 0.07408591693007677, + "grad_norm": 0.3447924554347992, + "learning_rate": 0.00019850073052886703, + "loss": 1.2085, + "step": 34870 + }, + { + "epoch": 0.07410716324981582, + "grad_norm": 0.4026646316051483, + "learning_rate": 0.00019849955532252276, + "loss": 1.1986, + "step": 34880 + }, + { + "epoch": 0.07412840956955487, + "grad_norm": 0.36775410175323486, + "learning_rate": 0.00019849837965924623, + "loss": 1.2313, + "step": 34890 + }, + { + "epoch": 0.07414965588929393, + "grad_norm": 0.44317445158958435, + "learning_rate": 0.00019849720353904283, + "loss": 1.2229, + "step": 34900 + }, + { + "epoch": 0.07417090220903298, + "grad_norm": 0.5473032593727112, + "learning_rate": 0.00019849602696191812, + "loss": 1.2429, + "step": 34910 + }, + { + "epoch": 0.07419214852877203, + "grad_norm": 0.4279581606388092, + "learning_rate": 0.00019849484992787747, + "loss": 1.2539, + "step": 34920 + }, + { + "epoch": 0.07421339484851108, + "grad_norm": 0.4655141830444336, + "learning_rate": 0.00019849367243692637, + "loss": 1.219, + "step": 34930 + }, + { + "epoch": 0.07423464116825014, + "grad_norm": 0.43005242943763733, + "learning_rate": 0.00019849249448907025, + "loss": 1.2191, + "step": 34940 + }, + { + "epoch": 0.07425588748798918, + "grad_norm": 0.3956560790538788, + "learning_rate": 0.00019849131608431464, + "loss": 1.1875, + "step": 34950 + }, + { + "epoch": 0.07427713380772824, + "grad_norm": 0.4152277112007141, + "learning_rate": 0.00019849013722266493, + "loss": 1.2757, + "step": 34960 + }, + { + "epoch": 0.0742983801274673, + "grad_norm": 0.4765663743019104, + "learning_rate": 0.00019848895790412668, + "loss": 1.2193, + "step": 34970 + }, + { + "epoch": 0.07431962644720634, + "grad_norm": 0.6138102412223816, + "learning_rate": 0.00019848777812870528, + "loss": 1.2205, + "step": 34980 + }, + { + "epoch": 0.0743408727669454, + "grad_norm": 0.41547051072120667, + "learning_rate": 0.00019848659789640624, + "loss": 1.2173, + "step": 34990 + }, + { + "epoch": 0.07436211908668446, + "grad_norm": 0.3545878827571869, + "learning_rate": 0.00019848541720723502, + "loss": 1.2522, + "step": 35000 + }, + { + "epoch": 0.07438336540642351, + "grad_norm": 0.4678790867328644, + "learning_rate": 0.0001984842360611971, + "loss": 1.2274, + "step": 35010 + }, + { + "epoch": 0.07440461172616256, + "grad_norm": 0.3465358018875122, + "learning_rate": 0.00019848305445829798, + "loss": 1.2437, + "step": 35020 + }, + { + "epoch": 0.07442585804590161, + "grad_norm": 0.44389936327934265, + "learning_rate": 0.00019848187239854313, + "loss": 1.228, + "step": 35030 + }, + { + "epoch": 0.07444710436564067, + "grad_norm": 0.48208799958229065, + "learning_rate": 0.00019848068988193802, + "loss": 1.2295, + "step": 35040 + }, + { + "epoch": 0.07446835068537971, + "grad_norm": 0.3321342468261719, + "learning_rate": 0.0001984795069084881, + "loss": 1.2179, + "step": 35050 + }, + { + "epoch": 0.07448959700511877, + "grad_norm": 0.41119030117988586, + "learning_rate": 0.00019847832347819896, + "loss": 1.2104, + "step": 35060 + }, + { + "epoch": 0.07451084332485783, + "grad_norm": 0.45763492584228516, + "learning_rate": 0.00019847713959107602, + "loss": 1.1935, + "step": 35070 + }, + { + "epoch": 0.07453208964459687, + "grad_norm": 0.42382699251174927, + "learning_rate": 0.00019847595524712477, + "loss": 1.2287, + "step": 35080 + }, + { + "epoch": 0.07455333596433593, + "grad_norm": 0.3134878873825073, + "learning_rate": 0.00019847477044635073, + "loss": 1.202, + "step": 35090 + }, + { + "epoch": 0.07457458228407499, + "grad_norm": 0.38800060749053955, + "learning_rate": 0.00019847358518875936, + "loss": 1.1908, + "step": 35100 + }, + { + "epoch": 0.07459582860381403, + "grad_norm": 0.3778310716152191, + "learning_rate": 0.0001984723994743562, + "loss": 1.2217, + "step": 35110 + }, + { + "epoch": 0.07461707492355309, + "grad_norm": 0.41539710760116577, + "learning_rate": 0.00019847121330314674, + "loss": 1.2596, + "step": 35120 + }, + { + "epoch": 0.07463832124329214, + "grad_norm": 0.6416877508163452, + "learning_rate": 0.00019847002667513648, + "loss": 1.198, + "step": 35130 + }, + { + "epoch": 0.07465956756303119, + "grad_norm": 0.3875325918197632, + "learning_rate": 0.0001984688395903309, + "loss": 1.215, + "step": 35140 + }, + { + "epoch": 0.07468081388277024, + "grad_norm": 0.4331740438938141, + "learning_rate": 0.00019846765204873553, + "loss": 1.2158, + "step": 35150 + }, + { + "epoch": 0.0747020602025093, + "grad_norm": 0.55470871925354, + "learning_rate": 0.00019846646405035587, + "loss": 1.2077, + "step": 35160 + }, + { + "epoch": 0.07472330652224835, + "grad_norm": 0.35830894112586975, + "learning_rate": 0.00019846527559519745, + "loss": 1.1992, + "step": 35170 + }, + { + "epoch": 0.0747445528419874, + "grad_norm": 0.34596124291419983, + "learning_rate": 0.00019846408668326577, + "loss": 1.2404, + "step": 35180 + }, + { + "epoch": 0.07476579916172646, + "grad_norm": 0.4002028703689575, + "learning_rate": 0.00019846289731456634, + "loss": 1.2702, + "step": 35190 + }, + { + "epoch": 0.0747870454814655, + "grad_norm": 0.3777618706226349, + "learning_rate": 0.0001984617074891047, + "loss": 1.1967, + "step": 35200 + }, + { + "epoch": 0.07480829180120456, + "grad_norm": 0.4430149793624878, + "learning_rate": 0.0001984605172068863, + "loss": 1.1849, + "step": 35210 + }, + { + "epoch": 0.07482953812094362, + "grad_norm": 0.3742290735244751, + "learning_rate": 0.00019845932646791675, + "loss": 1.2568, + "step": 35220 + }, + { + "epoch": 0.07485078444068266, + "grad_norm": 0.46117907762527466, + "learning_rate": 0.00019845813527220155, + "loss": 1.2406, + "step": 35230 + }, + { + "epoch": 0.07487203076042172, + "grad_norm": 0.4088512063026428, + "learning_rate": 0.0001984569436197462, + "loss": 1.2129, + "step": 35240 + }, + { + "epoch": 0.07489327708016078, + "grad_norm": 0.327280730009079, + "learning_rate": 0.00019845575151055626, + "loss": 1.2539, + "step": 35250 + }, + { + "epoch": 0.07491452339989982, + "grad_norm": 0.33188021183013916, + "learning_rate": 0.0001984545589446372, + "loss": 1.2627, + "step": 35260 + }, + { + "epoch": 0.07493576971963888, + "grad_norm": 0.40714895725250244, + "learning_rate": 0.00019845336592199463, + "loss": 1.229, + "step": 35270 + }, + { + "epoch": 0.07495701603937793, + "grad_norm": 0.48948347568511963, + "learning_rate": 0.00019845217244263404, + "loss": 1.2207, + "step": 35280 + }, + { + "epoch": 0.07497826235911698, + "grad_norm": 0.46311548352241516, + "learning_rate": 0.00019845097850656096, + "loss": 1.196, + "step": 35290 + }, + { + "epoch": 0.07499950867885603, + "grad_norm": 0.4145439565181732, + "learning_rate": 0.00019844978411378094, + "loss": 1.2018, + "step": 35300 + }, + { + "epoch": 0.07502075499859509, + "grad_norm": 0.7811167240142822, + "learning_rate": 0.00019844858926429954, + "loss": 1.2359, + "step": 35310 + }, + { + "epoch": 0.07504200131833413, + "grad_norm": 0.35557371377944946, + "learning_rate": 0.00019844739395812226, + "loss": 1.24, + "step": 35320 + }, + { + "epoch": 0.07506324763807319, + "grad_norm": 0.3937785029411316, + "learning_rate": 0.00019844619819525469, + "loss": 1.2418, + "step": 35330 + }, + { + "epoch": 0.07508449395781225, + "grad_norm": 0.35788872838020325, + "learning_rate": 0.0001984450019757024, + "loss": 1.2458, + "step": 35340 + }, + { + "epoch": 0.07510574027755129, + "grad_norm": 0.4960259199142456, + "learning_rate": 0.00019844380529947082, + "loss": 1.26, + "step": 35350 + }, + { + "epoch": 0.07512698659729035, + "grad_norm": 0.49614304304122925, + "learning_rate": 0.00019844260816656565, + "loss": 1.2388, + "step": 35360 + }, + { + "epoch": 0.0751482329170294, + "grad_norm": 0.38918647170066833, + "learning_rate": 0.00019844141057699232, + "loss": 1.2412, + "step": 35370 + }, + { + "epoch": 0.07516947923676845, + "grad_norm": 0.37913888692855835, + "learning_rate": 0.00019844021253075645, + "loss": 1.2275, + "step": 35380 + }, + { + "epoch": 0.0751907255565075, + "grad_norm": 0.3547292947769165, + "learning_rate": 0.0001984390140278636, + "loss": 1.2493, + "step": 35390 + }, + { + "epoch": 0.07521197187624656, + "grad_norm": 0.5798970460891724, + "learning_rate": 0.0001984378150683193, + "loss": 1.2369, + "step": 35400 + }, + { + "epoch": 0.07523321819598561, + "grad_norm": 0.40055057406425476, + "learning_rate": 0.00019843661565212915, + "loss": 1.2405, + "step": 35410 + }, + { + "epoch": 0.07525446451572466, + "grad_norm": 0.3708507716655731, + "learning_rate": 0.00019843541577929867, + "loss": 1.2578, + "step": 35420 + }, + { + "epoch": 0.07527571083546372, + "grad_norm": 0.465350478887558, + "learning_rate": 0.00019843421544983345, + "loss": 1.2208, + "step": 35430 + }, + { + "epoch": 0.07529695715520278, + "grad_norm": 0.3834581673145294, + "learning_rate": 0.00019843301466373906, + "loss": 1.2282, + "step": 35440 + }, + { + "epoch": 0.07531820347494182, + "grad_norm": 0.43676143884658813, + "learning_rate": 0.00019843181342102105, + "loss": 1.2115, + "step": 35450 + }, + { + "epoch": 0.07533944979468088, + "grad_norm": 0.34760773181915283, + "learning_rate": 0.00019843061172168503, + "loss": 1.2518, + "step": 35460 + }, + { + "epoch": 0.07536069611441994, + "grad_norm": 0.8944988250732422, + "learning_rate": 0.00019842940956573654, + "loss": 1.2204, + "step": 35470 + }, + { + "epoch": 0.07538194243415898, + "grad_norm": 0.3780127763748169, + "learning_rate": 0.0001984282069531812, + "loss": 1.2115, + "step": 35480 + }, + { + "epoch": 0.07540318875389804, + "grad_norm": 0.5202549695968628, + "learning_rate": 0.00019842700388402452, + "loss": 1.2306, + "step": 35490 + }, + { + "epoch": 0.0754244350736371, + "grad_norm": 0.3385724723339081, + "learning_rate": 0.00019842580035827213, + "loss": 1.2628, + "step": 35500 + }, + { + "epoch": 0.07544568139337614, + "grad_norm": 0.3559023439884186, + "learning_rate": 0.0001984245963759296, + "loss": 1.2135, + "step": 35510 + }, + { + "epoch": 0.0754669277131152, + "grad_norm": 0.3593302071094513, + "learning_rate": 0.00019842339193700255, + "loss": 1.2353, + "step": 35520 + }, + { + "epoch": 0.07548817403285425, + "grad_norm": 0.502986490726471, + "learning_rate": 0.0001984221870414965, + "loss": 1.2344, + "step": 35530 + }, + { + "epoch": 0.0755094203525933, + "grad_norm": 0.32208144664764404, + "learning_rate": 0.0001984209816894171, + "loss": 1.2392, + "step": 35540 + }, + { + "epoch": 0.07553066667233235, + "grad_norm": 0.39164015650749207, + "learning_rate": 0.0001984197758807699, + "loss": 1.2277, + "step": 35550 + }, + { + "epoch": 0.07555191299207141, + "grad_norm": 0.32437780499458313, + "learning_rate": 0.00019841856961556049, + "loss": 1.2232, + "step": 35560 + }, + { + "epoch": 0.07557315931181045, + "grad_norm": 0.38401713967323303, + "learning_rate": 0.00019841736289379452, + "loss": 1.1875, + "step": 35570 + }, + { + "epoch": 0.07559440563154951, + "grad_norm": 0.367587149143219, + "learning_rate": 0.00019841615571547752, + "loss": 1.1909, + "step": 35580 + }, + { + "epoch": 0.07561565195128857, + "grad_norm": 0.41973111033439636, + "learning_rate": 0.00019841494808061513, + "loss": 1.2105, + "step": 35590 + }, + { + "epoch": 0.07563689827102761, + "grad_norm": 0.35201627016067505, + "learning_rate": 0.00019841373998921298, + "loss": 1.2049, + "step": 35600 + }, + { + "epoch": 0.07565814459076667, + "grad_norm": 0.40715619921684265, + "learning_rate": 0.00019841253144127662, + "loss": 1.2666, + "step": 35610 + }, + { + "epoch": 0.07567939091050573, + "grad_norm": 0.45809170603752136, + "learning_rate": 0.00019841132243681166, + "loss": 1.2105, + "step": 35620 + }, + { + "epoch": 0.07570063723024477, + "grad_norm": 0.33874788880348206, + "learning_rate": 0.00019841011297582376, + "loss": 1.2344, + "step": 35630 + }, + { + "epoch": 0.07572188354998383, + "grad_norm": 0.42499852180480957, + "learning_rate": 0.00019840890305831846, + "loss": 1.227, + "step": 35640 + }, + { + "epoch": 0.07574312986972288, + "grad_norm": 0.47863686084747314, + "learning_rate": 0.00019840769268430138, + "loss": 1.2144, + "step": 35650 + }, + { + "epoch": 0.07576437618946193, + "grad_norm": 0.3394961655139923, + "learning_rate": 0.00019840648185377824, + "loss": 1.2466, + "step": 35660 + }, + { + "epoch": 0.07578562250920098, + "grad_norm": 0.3912738859653473, + "learning_rate": 0.00019840527056675454, + "loss": 1.2254, + "step": 35670 + }, + { + "epoch": 0.07580686882894004, + "grad_norm": 0.33438488841056824, + "learning_rate": 0.00019840405882323595, + "loss": 1.2244, + "step": 35680 + }, + { + "epoch": 0.07582811514867908, + "grad_norm": 0.5473013520240784, + "learning_rate": 0.00019840284662322807, + "loss": 1.2122, + "step": 35690 + }, + { + "epoch": 0.07584936146841814, + "grad_norm": 0.3274756669998169, + "learning_rate": 0.00019840163396673652, + "loss": 1.2283, + "step": 35700 + }, + { + "epoch": 0.0758706077881572, + "grad_norm": 0.7213840484619141, + "learning_rate": 0.00019840042085376695, + "loss": 1.2228, + "step": 35710 + }, + { + "epoch": 0.07589185410789624, + "grad_norm": 0.4600060284137726, + "learning_rate": 0.000198399207284325, + "loss": 1.2082, + "step": 35720 + }, + { + "epoch": 0.0759131004276353, + "grad_norm": 0.3549066185951233, + "learning_rate": 0.00019839799325841625, + "loss": 1.2214, + "step": 35730 + }, + { + "epoch": 0.07593434674737436, + "grad_norm": 0.6735023260116577, + "learning_rate": 0.00019839677877604635, + "loss": 1.23, + "step": 35740 + }, + { + "epoch": 0.0759555930671134, + "grad_norm": 0.41151607036590576, + "learning_rate": 0.00019839556383722095, + "loss": 1.2191, + "step": 35750 + }, + { + "epoch": 0.07597683938685246, + "grad_norm": 0.36429259181022644, + "learning_rate": 0.0001983943484419457, + "loss": 1.2565, + "step": 35760 + }, + { + "epoch": 0.07599808570659151, + "grad_norm": 0.4168282747268677, + "learning_rate": 0.00019839313259022618, + "loss": 1.2002, + "step": 35770 + }, + { + "epoch": 0.07601933202633056, + "grad_norm": 0.6414825916290283, + "learning_rate": 0.0001983919162820681, + "loss": 1.2373, + "step": 35780 + }, + { + "epoch": 0.07604057834606961, + "grad_norm": 0.5958241820335388, + "learning_rate": 0.00019839069951747704, + "loss": 1.2305, + "step": 35790 + }, + { + "epoch": 0.07606182466580867, + "grad_norm": 0.4111982583999634, + "learning_rate": 0.00019838948229645867, + "loss": 1.2256, + "step": 35800 + }, + { + "epoch": 0.07608307098554772, + "grad_norm": 0.3946593701839447, + "learning_rate": 0.00019838826461901867, + "loss": 1.1925, + "step": 35810 + }, + { + "epoch": 0.07610431730528677, + "grad_norm": 0.5341569185256958, + "learning_rate": 0.00019838704648516265, + "loss": 1.2044, + "step": 35820 + }, + { + "epoch": 0.07612556362502583, + "grad_norm": 0.5811922550201416, + "learning_rate": 0.00019838582789489623, + "loss": 1.2729, + "step": 35830 + }, + { + "epoch": 0.07614680994476489, + "grad_norm": 0.36234402656555176, + "learning_rate": 0.00019838460884822516, + "loss": 1.2607, + "step": 35840 + }, + { + "epoch": 0.07616805626450393, + "grad_norm": 0.3774336278438568, + "learning_rate": 0.000198383389345155, + "loss": 1.2241, + "step": 35850 + }, + { + "epoch": 0.07618930258424299, + "grad_norm": 0.3682750165462494, + "learning_rate": 0.00019838216938569144, + "loss": 1.2228, + "step": 35860 + }, + { + "epoch": 0.07621054890398204, + "grad_norm": 0.37127944827079773, + "learning_rate": 0.0001983809489698402, + "loss": 1.2554, + "step": 35870 + }, + { + "epoch": 0.07623179522372109, + "grad_norm": 0.3308808207511902, + "learning_rate": 0.00019837972809760683, + "loss": 1.1998, + "step": 35880 + }, + { + "epoch": 0.07625304154346015, + "grad_norm": 0.40692347288131714, + "learning_rate": 0.00019837850676899705, + "loss": 1.2195, + "step": 35890 + }, + { + "epoch": 0.0762742878631992, + "grad_norm": 0.3544767498970032, + "learning_rate": 0.00019837728498401654, + "loss": 1.228, + "step": 35900 + }, + { + "epoch": 0.07629553418293825, + "grad_norm": 0.44920870661735535, + "learning_rate": 0.00019837606274267095, + "loss": 1.2543, + "step": 35910 + }, + { + "epoch": 0.0763167805026773, + "grad_norm": 0.42683419585227966, + "learning_rate": 0.00019837484004496593, + "loss": 1.2546, + "step": 35920 + }, + { + "epoch": 0.07633802682241636, + "grad_norm": 0.32902976870536804, + "learning_rate": 0.0001983736168909072, + "loss": 1.2196, + "step": 35930 + }, + { + "epoch": 0.0763592731421554, + "grad_norm": 0.4049833118915558, + "learning_rate": 0.0001983723932805004, + "loss": 1.218, + "step": 35940 + }, + { + "epoch": 0.07638051946189446, + "grad_norm": 0.7248024940490723, + "learning_rate": 0.00019837116921375122, + "loss": 1.223, + "step": 35950 + }, + { + "epoch": 0.07640176578163352, + "grad_norm": 0.34465813636779785, + "learning_rate": 0.00019836994469066532, + "loss": 1.2228, + "step": 35960 + }, + { + "epoch": 0.07642301210137256, + "grad_norm": 0.33531805872917175, + "learning_rate": 0.0001983687197112484, + "loss": 1.2218, + "step": 35970 + }, + { + "epoch": 0.07644425842111162, + "grad_norm": 0.44989219307899475, + "learning_rate": 0.00019836749427550615, + "loss": 1.1968, + "step": 35980 + }, + { + "epoch": 0.07646550474085068, + "grad_norm": 0.35069775581359863, + "learning_rate": 0.00019836626838344422, + "loss": 1.2128, + "step": 35990 + }, + { + "epoch": 0.07648675106058972, + "grad_norm": 0.4167632758617401, + "learning_rate": 0.0001983650420350683, + "loss": 1.1994, + "step": 36000 + }, + { + "epoch": 0.07650799738032878, + "grad_norm": 0.36838242411613464, + "learning_rate": 0.00019836381523038413, + "loss": 1.2169, + "step": 36010 + }, + { + "epoch": 0.07652924370006783, + "grad_norm": 0.5576869249343872, + "learning_rate": 0.00019836258796939734, + "loss": 1.252, + "step": 36020 + }, + { + "epoch": 0.07655049001980688, + "grad_norm": 0.5751132369041443, + "learning_rate": 0.00019836136025211368, + "loss": 1.1972, + "step": 36030 + }, + { + "epoch": 0.07657173633954593, + "grad_norm": 0.339394748210907, + "learning_rate": 0.0001983601320785388, + "loss": 1.2144, + "step": 36040 + }, + { + "epoch": 0.07659298265928499, + "grad_norm": 0.4095757007598877, + "learning_rate": 0.0001983589034486784, + "loss": 1.2279, + "step": 36050 + }, + { + "epoch": 0.07661422897902403, + "grad_norm": 0.504069447517395, + "learning_rate": 0.00019835767436253822, + "loss": 1.1678, + "step": 36060 + }, + { + "epoch": 0.07663547529876309, + "grad_norm": 0.5719659924507141, + "learning_rate": 0.00019835644482012391, + "loss": 1.2067, + "step": 36070 + }, + { + "epoch": 0.07665672161850215, + "grad_norm": 0.46937206387519836, + "learning_rate": 0.00019835521482144123, + "loss": 1.2519, + "step": 36080 + }, + { + "epoch": 0.07667796793824119, + "grad_norm": 0.4632151126861572, + "learning_rate": 0.0001983539843664958, + "loss": 1.1984, + "step": 36090 + }, + { + "epoch": 0.07669921425798025, + "grad_norm": 0.41972023248672485, + "learning_rate": 0.00019835275345529344, + "loss": 1.2415, + "step": 36100 + }, + { + "epoch": 0.07672046057771931, + "grad_norm": 0.5267425775527954, + "learning_rate": 0.00019835152208783977, + "loss": 1.2057, + "step": 36110 + }, + { + "epoch": 0.07674170689745835, + "grad_norm": 0.5850309133529663, + "learning_rate": 0.00019835029026414055, + "loss": 1.2029, + "step": 36120 + }, + { + "epoch": 0.07676295321719741, + "grad_norm": 0.8053157329559326, + "learning_rate": 0.00019834905798420146, + "loss": 1.2394, + "step": 36130 + }, + { + "epoch": 0.07678419953693646, + "grad_norm": 0.4132903814315796, + "learning_rate": 0.00019834782524802823, + "loss": 1.2131, + "step": 36140 + }, + { + "epoch": 0.07680544585667551, + "grad_norm": 0.4775589406490326, + "learning_rate": 0.0001983465920556266, + "loss": 1.2621, + "step": 36150 + }, + { + "epoch": 0.07682669217641457, + "grad_norm": 0.7321507334709167, + "learning_rate": 0.00019834535840700226, + "loss": 1.2223, + "step": 36160 + }, + { + "epoch": 0.07684793849615362, + "grad_norm": 0.37331169843673706, + "learning_rate": 0.00019834412430216097, + "loss": 1.2317, + "step": 36170 + }, + { + "epoch": 0.07686918481589267, + "grad_norm": 0.35601064562797546, + "learning_rate": 0.0001983428897411084, + "loss": 1.231, + "step": 36180 + }, + { + "epoch": 0.07689043113563172, + "grad_norm": 0.3802315592765808, + "learning_rate": 0.00019834165472385035, + "loss": 1.1821, + "step": 36190 + }, + { + "epoch": 0.07691167745537078, + "grad_norm": 0.3650524616241455, + "learning_rate": 0.0001983404192503925, + "loss": 1.2422, + "step": 36200 + }, + { + "epoch": 0.07693292377510982, + "grad_norm": 0.33710992336273193, + "learning_rate": 0.00019833918332074055, + "loss": 1.2774, + "step": 36210 + }, + { + "epoch": 0.07695417009484888, + "grad_norm": 0.34918028116226196, + "learning_rate": 0.00019833794693490032, + "loss": 1.2239, + "step": 36220 + }, + { + "epoch": 0.07697541641458794, + "grad_norm": 0.349139004945755, + "learning_rate": 0.00019833671009287748, + "loss": 1.2314, + "step": 36230 + }, + { + "epoch": 0.07699666273432698, + "grad_norm": 0.33036723732948303, + "learning_rate": 0.00019833547279467773, + "loss": 1.1807, + "step": 36240 + }, + { + "epoch": 0.07701790905406604, + "grad_norm": 0.3479318618774414, + "learning_rate": 0.00019833423504030695, + "loss": 1.2267, + "step": 36250 + }, + { + "epoch": 0.0770391553738051, + "grad_norm": 0.40339943766593933, + "learning_rate": 0.00019833299682977078, + "loss": 1.2375, + "step": 36260 + }, + { + "epoch": 0.07706040169354415, + "grad_norm": 0.45388081669807434, + "learning_rate": 0.00019833175816307493, + "loss": 1.2203, + "step": 36270 + }, + { + "epoch": 0.0770816480132832, + "grad_norm": 0.3999592363834381, + "learning_rate": 0.00019833051904022525, + "loss": 1.2624, + "step": 36280 + }, + { + "epoch": 0.07710289433302225, + "grad_norm": 0.3458629846572876, + "learning_rate": 0.00019832927946122743, + "loss": 1.2385, + "step": 36290 + }, + { + "epoch": 0.07712414065276131, + "grad_norm": 0.39873701333999634, + "learning_rate": 0.0001983280394260872, + "loss": 1.1888, + "step": 36300 + }, + { + "epoch": 0.07714538697250035, + "grad_norm": 0.37993210554122925, + "learning_rate": 0.00019832679893481037, + "loss": 1.2697, + "step": 36310 + }, + { + "epoch": 0.07716663329223941, + "grad_norm": 0.54108065366745, + "learning_rate": 0.00019832555798740263, + "loss": 1.2231, + "step": 36320 + }, + { + "epoch": 0.07718787961197847, + "grad_norm": 0.39334744215011597, + "learning_rate": 0.0001983243165838698, + "loss": 1.2239, + "step": 36330 + }, + { + "epoch": 0.07720912593171751, + "grad_norm": 0.38679391145706177, + "learning_rate": 0.0001983230747242176, + "loss": 1.2417, + "step": 36340 + }, + { + "epoch": 0.07723037225145657, + "grad_norm": 0.3564576804637909, + "learning_rate": 0.0001983218324084518, + "loss": 1.2424, + "step": 36350 + }, + { + "epoch": 0.07725161857119563, + "grad_norm": 0.4435421824455261, + "learning_rate": 0.00019832058963657813, + "loss": 1.2536, + "step": 36360 + }, + { + "epoch": 0.07727286489093467, + "grad_norm": 0.36624929308891296, + "learning_rate": 0.0001983193464086024, + "loss": 1.2226, + "step": 36370 + }, + { + "epoch": 0.07729411121067373, + "grad_norm": 0.361558198928833, + "learning_rate": 0.0001983181027245304, + "loss": 1.2558, + "step": 36380 + }, + { + "epoch": 0.07731535753041278, + "grad_norm": 0.6983163952827454, + "learning_rate": 0.00019831685858436783, + "loss": 1.2163, + "step": 36390 + }, + { + "epoch": 0.07733660385015183, + "grad_norm": 0.7092081904411316, + "learning_rate": 0.0001983156139881205, + "loss": 1.2121, + "step": 36400 + }, + { + "epoch": 0.07735785016989088, + "grad_norm": 0.5952584743499756, + "learning_rate": 0.0001983143689357942, + "loss": 1.2272, + "step": 36410 + }, + { + "epoch": 0.07737909648962994, + "grad_norm": 0.4586656093597412, + "learning_rate": 0.00019831312342739467, + "loss": 1.21, + "step": 36420 + }, + { + "epoch": 0.07740034280936899, + "grad_norm": 0.4197600185871124, + "learning_rate": 0.0001983118774629277, + "loss": 1.2586, + "step": 36430 + }, + { + "epoch": 0.07742158912910804, + "grad_norm": 0.3639428913593292, + "learning_rate": 0.0001983106310423991, + "loss": 1.2479, + "step": 36440 + }, + { + "epoch": 0.0774428354488471, + "grad_norm": 0.544296145439148, + "learning_rate": 0.00019830938416581456, + "loss": 1.2191, + "step": 36450 + }, + { + "epoch": 0.07746408176858614, + "grad_norm": 0.3850380480289459, + "learning_rate": 0.00019830813683318, + "loss": 1.2755, + "step": 36460 + }, + { + "epoch": 0.0774853280883252, + "grad_norm": 0.5069561004638672, + "learning_rate": 0.00019830688904450109, + "loss": 1.2556, + "step": 36470 + }, + { + "epoch": 0.07750657440806426, + "grad_norm": 0.36430594325065613, + "learning_rate": 0.0001983056407997837, + "loss": 1.2209, + "step": 36480 + }, + { + "epoch": 0.0775278207278033, + "grad_norm": 0.3302017152309418, + "learning_rate": 0.00019830439209903356, + "loss": 1.1807, + "step": 36490 + }, + { + "epoch": 0.07754906704754236, + "grad_norm": 0.5654780268669128, + "learning_rate": 0.00019830314294225646, + "loss": 1.2367, + "step": 36500 + }, + { + "epoch": 0.07757031336728142, + "grad_norm": 0.42280659079551697, + "learning_rate": 0.00019830189332945824, + "loss": 1.2609, + "step": 36510 + }, + { + "epoch": 0.07759155968702046, + "grad_norm": 0.34969648718833923, + "learning_rate": 0.00019830064326064472, + "loss": 1.2358, + "step": 36520 + }, + { + "epoch": 0.07761280600675952, + "grad_norm": 0.5751362442970276, + "learning_rate": 0.0001982993927358216, + "loss": 1.2202, + "step": 36530 + }, + { + "epoch": 0.07763405232649857, + "grad_norm": 0.40850841999053955, + "learning_rate": 0.00019829814175499478, + "loss": 1.2226, + "step": 36540 + }, + { + "epoch": 0.07765529864623762, + "grad_norm": 0.48470810055732727, + "learning_rate": 0.00019829689031817003, + "loss": 1.2432, + "step": 36550 + }, + { + "epoch": 0.07767654496597667, + "grad_norm": 0.45591238141059875, + "learning_rate": 0.0001982956384253531, + "loss": 1.2362, + "step": 36560 + }, + { + "epoch": 0.07769779128571573, + "grad_norm": 0.4948340654373169, + "learning_rate": 0.00019829438607654986, + "loss": 1.2355, + "step": 36570 + }, + { + "epoch": 0.07771903760545477, + "grad_norm": 0.38966819643974304, + "learning_rate": 0.00019829313327176612, + "loss": 1.2448, + "step": 36580 + }, + { + "epoch": 0.07774028392519383, + "grad_norm": 0.6061038374900818, + "learning_rate": 0.00019829188001100766, + "loss": 1.2168, + "step": 36590 + }, + { + "epoch": 0.07776153024493289, + "grad_norm": 0.5149372220039368, + "learning_rate": 0.00019829062629428033, + "loss": 1.1863, + "step": 36600 + }, + { + "epoch": 0.07778277656467193, + "grad_norm": 0.6908892393112183, + "learning_rate": 0.0001982893721215899, + "loss": 1.2071, + "step": 36610 + }, + { + "epoch": 0.07780402288441099, + "grad_norm": 0.7318817377090454, + "learning_rate": 0.00019828811749294224, + "loss": 1.2328, + "step": 36620 + }, + { + "epoch": 0.07782526920415005, + "grad_norm": 0.5369572639465332, + "learning_rate": 0.0001982868624083431, + "loss": 1.2493, + "step": 36630 + }, + { + "epoch": 0.07784651552388909, + "grad_norm": 0.3756740689277649, + "learning_rate": 0.00019828560686779837, + "loss": 1.2386, + "step": 36640 + }, + { + "epoch": 0.07786776184362815, + "grad_norm": 0.40333425998687744, + "learning_rate": 0.00019828435087131387, + "loss": 1.2084, + "step": 36650 + }, + { + "epoch": 0.0778890081633672, + "grad_norm": 0.35142192244529724, + "learning_rate": 0.0001982830944188954, + "loss": 1.2162, + "step": 36660 + }, + { + "epoch": 0.07791025448310625, + "grad_norm": 0.39704495668411255, + "learning_rate": 0.00019828183751054875, + "loss": 1.1969, + "step": 36670 + }, + { + "epoch": 0.0779315008028453, + "grad_norm": 0.5350301861763, + "learning_rate": 0.00019828058014627987, + "loss": 1.2452, + "step": 36680 + }, + { + "epoch": 0.07795274712258436, + "grad_norm": 0.4279558062553406, + "learning_rate": 0.00019827932232609446, + "loss": 1.2146, + "step": 36690 + }, + { + "epoch": 0.07797399344232342, + "grad_norm": 0.39042043685913086, + "learning_rate": 0.00019827806404999843, + "loss": 1.2179, + "step": 36700 + }, + { + "epoch": 0.07799523976206246, + "grad_norm": 0.3469587564468384, + "learning_rate": 0.00019827680531799761, + "loss": 1.1876, + "step": 36710 + }, + { + "epoch": 0.07801648608180152, + "grad_norm": 0.3517674207687378, + "learning_rate": 0.00019827554613009783, + "loss": 1.2675, + "step": 36720 + }, + { + "epoch": 0.07803773240154058, + "grad_norm": 0.4278165400028229, + "learning_rate": 0.00019827428648630494, + "loss": 1.2294, + "step": 36730 + }, + { + "epoch": 0.07805897872127962, + "grad_norm": 0.37171679735183716, + "learning_rate": 0.00019827302638662471, + "loss": 1.2368, + "step": 36740 + }, + { + "epoch": 0.07808022504101868, + "grad_norm": 0.3529261648654938, + "learning_rate": 0.00019827176583106312, + "loss": 1.2309, + "step": 36750 + }, + { + "epoch": 0.07810147136075773, + "grad_norm": 0.35570815205574036, + "learning_rate": 0.00019827050481962592, + "loss": 1.2016, + "step": 36760 + }, + { + "epoch": 0.07812271768049678, + "grad_norm": 0.5896065831184387, + "learning_rate": 0.000198269243352319, + "loss": 1.1924, + "step": 36770 + }, + { + "epoch": 0.07814396400023584, + "grad_norm": 0.3683023452758789, + "learning_rate": 0.0001982679814291482, + "loss": 1.2312, + "step": 36780 + }, + { + "epoch": 0.07816521031997489, + "grad_norm": 0.34589821100234985, + "learning_rate": 0.00019826671905011936, + "loss": 1.2313, + "step": 36790 + }, + { + "epoch": 0.07818645663971394, + "grad_norm": 0.32662132382392883, + "learning_rate": 0.00019826545621523834, + "loss": 1.229, + "step": 36800 + }, + { + "epoch": 0.078207702959453, + "grad_norm": 0.4126325845718384, + "learning_rate": 0.00019826419292451103, + "loss": 1.2317, + "step": 36810 + }, + { + "epoch": 0.07822894927919205, + "grad_norm": 0.4080583453178406, + "learning_rate": 0.00019826292917794324, + "loss": 1.2187, + "step": 36820 + }, + { + "epoch": 0.0782501955989311, + "grad_norm": 0.4156731367111206, + "learning_rate": 0.0001982616649755409, + "loss": 1.2408, + "step": 36830 + }, + { + "epoch": 0.07827144191867015, + "grad_norm": 0.37365126609802246, + "learning_rate": 0.0001982604003173098, + "loss": 1.2077, + "step": 36840 + }, + { + "epoch": 0.07829268823840921, + "grad_norm": 0.5604127645492554, + "learning_rate": 0.00019825913520325585, + "loss": 1.2, + "step": 36850 + }, + { + "epoch": 0.07831393455814825, + "grad_norm": 0.4325930178165436, + "learning_rate": 0.0001982578696333849, + "loss": 1.2245, + "step": 36860 + }, + { + "epoch": 0.07833518087788731, + "grad_norm": 0.4597231149673462, + "learning_rate": 0.0001982566036077029, + "loss": 1.2448, + "step": 36870 + }, + { + "epoch": 0.07835642719762637, + "grad_norm": 0.42742666602134705, + "learning_rate": 0.00019825533712621555, + "loss": 1.2283, + "step": 36880 + }, + { + "epoch": 0.07837767351736541, + "grad_norm": 0.3438252806663513, + "learning_rate": 0.0001982540701889289, + "loss": 1.225, + "step": 36890 + }, + { + "epoch": 0.07839891983710447, + "grad_norm": 0.3143714666366577, + "learning_rate": 0.00019825280279584876, + "loss": 1.2505, + "step": 36900 + }, + { + "epoch": 0.07842016615684352, + "grad_norm": 0.406141996383667, + "learning_rate": 0.00019825153494698096, + "loss": 1.1835, + "step": 36910 + }, + { + "epoch": 0.07844141247658257, + "grad_norm": 0.6471395492553711, + "learning_rate": 0.00019825026664233145, + "loss": 1.2378, + "step": 36920 + }, + { + "epoch": 0.07846265879632162, + "grad_norm": 0.34664106369018555, + "learning_rate": 0.00019824899788190608, + "loss": 1.2297, + "step": 36930 + }, + { + "epoch": 0.07848390511606068, + "grad_norm": 0.3610133230686188, + "learning_rate": 0.00019824772866571075, + "loss": 1.2447, + "step": 36940 + }, + { + "epoch": 0.07850515143579972, + "grad_norm": 0.36884284019470215, + "learning_rate": 0.00019824645899375133, + "loss": 1.2506, + "step": 36950 + }, + { + "epoch": 0.07852639775553878, + "grad_norm": 0.417453408241272, + "learning_rate": 0.00019824518886603376, + "loss": 1.2446, + "step": 36960 + }, + { + "epoch": 0.07854764407527784, + "grad_norm": 0.38861364126205444, + "learning_rate": 0.0001982439182825639, + "loss": 1.2087, + "step": 36970 + }, + { + "epoch": 0.07856889039501688, + "grad_norm": 0.6156056523323059, + "learning_rate": 0.0001982426472433476, + "loss": 1.2274, + "step": 36980 + }, + { + "epoch": 0.07859013671475594, + "grad_norm": 0.3497461974620819, + "learning_rate": 0.00019824137574839083, + "loss": 1.2223, + "step": 36990 + }, + { + "epoch": 0.078611383034495, + "grad_norm": 0.6171861290931702, + "learning_rate": 0.00019824010379769944, + "loss": 1.1996, + "step": 37000 + }, + { + "epoch": 0.07863262935423404, + "grad_norm": 0.3925982415676117, + "learning_rate": 0.00019823883139127936, + "loss": 1.2433, + "step": 37010 + }, + { + "epoch": 0.0786538756739731, + "grad_norm": 0.3387570083141327, + "learning_rate": 0.00019823755852913647, + "loss": 1.243, + "step": 37020 + }, + { + "epoch": 0.07867512199371215, + "grad_norm": 0.3404376208782196, + "learning_rate": 0.00019823628521127665, + "loss": 1.2475, + "step": 37030 + }, + { + "epoch": 0.0786963683134512, + "grad_norm": 0.3652242124080658, + "learning_rate": 0.00019823501143770587, + "loss": 1.2237, + "step": 37040 + }, + { + "epoch": 0.07871761463319026, + "grad_norm": 0.5084404349327087, + "learning_rate": 0.00019823373720843001, + "loss": 1.2003, + "step": 37050 + }, + { + "epoch": 0.07873886095292931, + "grad_norm": 0.4010491669178009, + "learning_rate": 0.00019823246252345498, + "loss": 1.2413, + "step": 37060 + }, + { + "epoch": 0.07876010727266836, + "grad_norm": 0.3693988621234894, + "learning_rate": 0.0001982311873827867, + "loss": 1.2369, + "step": 37070 + }, + { + "epoch": 0.07878135359240741, + "grad_norm": 0.5608027577400208, + "learning_rate": 0.00019822991178643101, + "loss": 1.2074, + "step": 37080 + }, + { + "epoch": 0.07880259991214647, + "grad_norm": 0.42668724060058594, + "learning_rate": 0.00019822863573439397, + "loss": 1.221, + "step": 37090 + }, + { + "epoch": 0.07882384623188551, + "grad_norm": 0.39590105414390564, + "learning_rate": 0.0001982273592266814, + "loss": 1.2124, + "step": 37100 + }, + { + "epoch": 0.07884509255162457, + "grad_norm": 0.3547859489917755, + "learning_rate": 0.00019822608226329925, + "loss": 1.1989, + "step": 37110 + }, + { + "epoch": 0.07886633887136363, + "grad_norm": 0.47662293910980225, + "learning_rate": 0.00019822480484425346, + "loss": 1.228, + "step": 37120 + }, + { + "epoch": 0.07888758519110269, + "grad_norm": 0.3421047031879425, + "learning_rate": 0.00019822352696954992, + "loss": 1.2709, + "step": 37130 + }, + { + "epoch": 0.07890883151084173, + "grad_norm": 0.4620232582092285, + "learning_rate": 0.00019822224863919456, + "loss": 1.1714, + "step": 37140 + }, + { + "epoch": 0.07893007783058079, + "grad_norm": 0.630141019821167, + "learning_rate": 0.00019822096985319334, + "loss": 1.2252, + "step": 37150 + }, + { + "epoch": 0.07895132415031984, + "grad_norm": 0.39296838641166687, + "learning_rate": 0.00019821969061155218, + "loss": 1.228, + "step": 37160 + }, + { + "epoch": 0.07897257047005889, + "grad_norm": 0.36748382449150085, + "learning_rate": 0.000198218410914277, + "loss": 1.2319, + "step": 37170 + }, + { + "epoch": 0.07899381678979794, + "grad_norm": 0.3666574954986572, + "learning_rate": 0.00019821713076137372, + "loss": 1.1856, + "step": 37180 + }, + { + "epoch": 0.079015063109537, + "grad_norm": 0.4698690176010132, + "learning_rate": 0.0001982158501528484, + "loss": 1.2353, + "step": 37190 + }, + { + "epoch": 0.07903630942927604, + "grad_norm": 0.3830912709236145, + "learning_rate": 0.0001982145690887068, + "loss": 1.2468, + "step": 37200 + }, + { + "epoch": 0.0790575557490151, + "grad_norm": 0.3349054157733917, + "learning_rate": 0.00019821328756895496, + "loss": 1.2193, + "step": 37210 + }, + { + "epoch": 0.07907880206875416, + "grad_norm": 0.5198194980621338, + "learning_rate": 0.00019821200559359882, + "loss": 1.2247, + "step": 37220 + }, + { + "epoch": 0.0791000483884932, + "grad_norm": 0.6133796572685242, + "learning_rate": 0.00019821072316264434, + "loss": 1.2614, + "step": 37230 + }, + { + "epoch": 0.07912129470823226, + "grad_norm": 0.5381436944007874, + "learning_rate": 0.00019820944027609745, + "loss": 1.2063, + "step": 37240 + }, + { + "epoch": 0.07914254102797132, + "grad_norm": 0.37669745087623596, + "learning_rate": 0.00019820815693396407, + "loss": 1.2059, + "step": 37250 + }, + { + "epoch": 0.07916378734771036, + "grad_norm": 0.4105495512485504, + "learning_rate": 0.0001982068731362502, + "loss": 1.2425, + "step": 37260 + }, + { + "epoch": 0.07918503366744942, + "grad_norm": 0.3661375343799591, + "learning_rate": 0.00019820558888296178, + "loss": 1.2355, + "step": 37270 + }, + { + "epoch": 0.07920627998718847, + "grad_norm": 0.5295947194099426, + "learning_rate": 0.00019820430417410477, + "loss": 1.2025, + "step": 37280 + }, + { + "epoch": 0.07922752630692752, + "grad_norm": 0.33336615562438965, + "learning_rate": 0.00019820301900968512, + "loss": 1.222, + "step": 37290 + }, + { + "epoch": 0.07924877262666657, + "grad_norm": 0.544114351272583, + "learning_rate": 0.00019820173338970883, + "loss": 1.2157, + "step": 37300 + }, + { + "epoch": 0.07927001894640563, + "grad_norm": 0.625573456287384, + "learning_rate": 0.00019820044731418178, + "loss": 1.2392, + "step": 37310 + }, + { + "epoch": 0.07929126526614468, + "grad_norm": 0.3977108597755432, + "learning_rate": 0.00019819916078311006, + "loss": 1.2332, + "step": 37320 + }, + { + "epoch": 0.07931251158588373, + "grad_norm": 0.4214913845062256, + "learning_rate": 0.0001981978737964995, + "loss": 1.2106, + "step": 37330 + }, + { + "epoch": 0.07933375790562279, + "grad_norm": 0.4490053951740265, + "learning_rate": 0.00019819658635435617, + "loss": 1.2394, + "step": 37340 + }, + { + "epoch": 0.07935500422536183, + "grad_norm": 0.35931116342544556, + "learning_rate": 0.00019819529845668598, + "loss": 1.2364, + "step": 37350 + }, + { + "epoch": 0.07937625054510089, + "grad_norm": 0.35217419266700745, + "learning_rate": 0.00019819401010349495, + "loss": 1.218, + "step": 37360 + }, + { + "epoch": 0.07939749686483995, + "grad_norm": 0.38165390491485596, + "learning_rate": 0.00019819272129478906, + "loss": 1.2247, + "step": 37370 + }, + { + "epoch": 0.07941874318457899, + "grad_norm": 0.4087578356266022, + "learning_rate": 0.00019819143203057425, + "loss": 1.2698, + "step": 37380 + }, + { + "epoch": 0.07943998950431805, + "grad_norm": 0.48338863253593445, + "learning_rate": 0.00019819014231085654, + "loss": 1.2384, + "step": 37390 + }, + { + "epoch": 0.0794612358240571, + "grad_norm": 0.5528599619865417, + "learning_rate": 0.00019818885213564185, + "loss": 1.2277, + "step": 37400 + }, + { + "epoch": 0.07948248214379615, + "grad_norm": 0.4882422387599945, + "learning_rate": 0.00019818756150493628, + "loss": 1.2226, + "step": 37410 + }, + { + "epoch": 0.0795037284635352, + "grad_norm": 0.6550886034965515, + "learning_rate": 0.00019818627041874568, + "loss": 1.2198, + "step": 37420 + }, + { + "epoch": 0.07952497478327426, + "grad_norm": 0.552498996257782, + "learning_rate": 0.00019818497887707612, + "loss": 1.2193, + "step": 37430 + }, + { + "epoch": 0.0795462211030133, + "grad_norm": 0.6239751577377319, + "learning_rate": 0.0001981836868799336, + "loss": 1.2336, + "step": 37440 + }, + { + "epoch": 0.07956746742275236, + "grad_norm": 0.4406842291355133, + "learning_rate": 0.00019818239442732406, + "loss": 1.2242, + "step": 37450 + }, + { + "epoch": 0.07958871374249142, + "grad_norm": 0.47757095098495483, + "learning_rate": 0.00019818110151925355, + "loss": 1.2099, + "step": 37460 + }, + { + "epoch": 0.07960996006223046, + "grad_norm": 0.34274446964263916, + "learning_rate": 0.00019817980815572804, + "loss": 1.2092, + "step": 37470 + }, + { + "epoch": 0.07963120638196952, + "grad_norm": 0.4369792640209198, + "learning_rate": 0.00019817851433675353, + "loss": 1.1981, + "step": 37480 + }, + { + "epoch": 0.07965245270170858, + "grad_norm": 0.4204519987106323, + "learning_rate": 0.000198177220062336, + "loss": 1.2405, + "step": 37490 + }, + { + "epoch": 0.07967369902144762, + "grad_norm": 0.3188028931617737, + "learning_rate": 0.0001981759253324815, + "loss": 1.2229, + "step": 37500 + }, + { + "epoch": 0.07969494534118668, + "grad_norm": 0.5393826961517334, + "learning_rate": 0.00019817463014719602, + "loss": 1.2172, + "step": 37510 + }, + { + "epoch": 0.07971619166092574, + "grad_norm": 0.4720030725002289, + "learning_rate": 0.00019817333450648555, + "loss": 1.2177, + "step": 37520 + }, + { + "epoch": 0.07973743798066478, + "grad_norm": 0.3681173622608185, + "learning_rate": 0.00019817203841035613, + "loss": 1.1879, + "step": 37530 + }, + { + "epoch": 0.07975868430040384, + "grad_norm": 0.36581823229789734, + "learning_rate": 0.00019817074185881373, + "loss": 1.1942, + "step": 37540 + }, + { + "epoch": 0.0797799306201429, + "grad_norm": 0.3388773798942566, + "learning_rate": 0.0001981694448518644, + "loss": 1.225, + "step": 37550 + }, + { + "epoch": 0.07980117693988195, + "grad_norm": 0.46594923734664917, + "learning_rate": 0.00019816814738951414, + "loss": 1.2384, + "step": 37560 + }, + { + "epoch": 0.079822423259621, + "grad_norm": 0.35231590270996094, + "learning_rate": 0.000198166849471769, + "loss": 1.2301, + "step": 37570 + }, + { + "epoch": 0.07984366957936005, + "grad_norm": 0.7568521499633789, + "learning_rate": 0.00019816555109863495, + "loss": 1.1954, + "step": 37580 + }, + { + "epoch": 0.07986491589909911, + "grad_norm": 0.6620786786079407, + "learning_rate": 0.00019816425227011807, + "loss": 1.2835, + "step": 37590 + }, + { + "epoch": 0.07988616221883815, + "grad_norm": 0.6245778799057007, + "learning_rate": 0.00019816295298622432, + "loss": 1.2412, + "step": 37600 + }, + { + "epoch": 0.07990740853857721, + "grad_norm": 0.3439975082874298, + "learning_rate": 0.00019816165324695976, + "loss": 1.2214, + "step": 37610 + }, + { + "epoch": 0.07992865485831627, + "grad_norm": 0.4106175899505615, + "learning_rate": 0.00019816035305233047, + "loss": 1.2806, + "step": 37620 + }, + { + "epoch": 0.07994990117805531, + "grad_norm": 0.3753405511379242, + "learning_rate": 0.0001981590524023424, + "loss": 1.2395, + "step": 37630 + }, + { + "epoch": 0.07997114749779437, + "grad_norm": 0.4231901466846466, + "learning_rate": 0.0001981577512970016, + "loss": 1.2345, + "step": 37640 + }, + { + "epoch": 0.07999239381753342, + "grad_norm": 0.3587484657764435, + "learning_rate": 0.00019815644973631417, + "loss": 1.2354, + "step": 37650 + }, + { + "epoch": 0.08001364013727247, + "grad_norm": 0.4084208011627197, + "learning_rate": 0.00019815514772028605, + "loss": 1.2645, + "step": 37660 + }, + { + "epoch": 0.08003488645701153, + "grad_norm": 0.4369297921657562, + "learning_rate": 0.00019815384524892338, + "loss": 1.2528, + "step": 37670 + }, + { + "epoch": 0.08005613277675058, + "grad_norm": 0.6073344349861145, + "learning_rate": 0.00019815254232223212, + "loss": 1.2333, + "step": 37680 + }, + { + "epoch": 0.08007737909648963, + "grad_norm": 0.48480919003486633, + "learning_rate": 0.00019815123894021834, + "loss": 1.2478, + "step": 37690 + }, + { + "epoch": 0.08009862541622868, + "grad_norm": 0.40051671862602234, + "learning_rate": 0.0001981499351028881, + "loss": 1.2115, + "step": 37700 + }, + { + "epoch": 0.08011987173596774, + "grad_norm": 0.40817558765411377, + "learning_rate": 0.00019814863081024745, + "loss": 1.2433, + "step": 37710 + }, + { + "epoch": 0.08014111805570678, + "grad_norm": 0.3289916515350342, + "learning_rate": 0.00019814732606230243, + "loss": 1.2037, + "step": 37720 + }, + { + "epoch": 0.08016236437544584, + "grad_norm": 0.38709166646003723, + "learning_rate": 0.00019814602085905915, + "loss": 1.207, + "step": 37730 + }, + { + "epoch": 0.0801836106951849, + "grad_norm": 0.39223718643188477, + "learning_rate": 0.00019814471520052354, + "loss": 1.2175, + "step": 37740 + }, + { + "epoch": 0.08020485701492394, + "grad_norm": 0.5329062342643738, + "learning_rate": 0.00019814340908670173, + "loss": 1.1881, + "step": 37750 + }, + { + "epoch": 0.080226103334663, + "grad_norm": 0.39649274945259094, + "learning_rate": 0.0001981421025175998, + "loss": 1.2118, + "step": 37760 + }, + { + "epoch": 0.08024734965440206, + "grad_norm": 0.3386557102203369, + "learning_rate": 0.00019814079549322375, + "loss": 1.2535, + "step": 37770 + }, + { + "epoch": 0.0802685959741411, + "grad_norm": 0.3934323787689209, + "learning_rate": 0.00019813948801357972, + "loss": 1.2302, + "step": 37780 + }, + { + "epoch": 0.08028984229388016, + "grad_norm": 0.3739506006240845, + "learning_rate": 0.00019813818007867367, + "loss": 1.2863, + "step": 37790 + }, + { + "epoch": 0.08031108861361921, + "grad_norm": 0.4451095461845398, + "learning_rate": 0.0001981368716885118, + "loss": 1.2123, + "step": 37800 + }, + { + "epoch": 0.08033233493335826, + "grad_norm": 0.3394067883491516, + "learning_rate": 0.0001981355628431001, + "loss": 1.1997, + "step": 37810 + }, + { + "epoch": 0.08035358125309731, + "grad_norm": 0.5683655738830566, + "learning_rate": 0.00019813425354244465, + "loss": 1.2012, + "step": 37820 + }, + { + "epoch": 0.08037482757283637, + "grad_norm": 0.4507599174976349, + "learning_rate": 0.00019813294378655152, + "loss": 1.2348, + "step": 37830 + }, + { + "epoch": 0.08039607389257541, + "grad_norm": 0.34308210015296936, + "learning_rate": 0.0001981316335754268, + "loss": 1.2227, + "step": 37840 + }, + { + "epoch": 0.08041732021231447, + "grad_norm": 0.37177231907844543, + "learning_rate": 0.00019813032290907654, + "loss": 1.2244, + "step": 37850 + }, + { + "epoch": 0.08043856653205353, + "grad_norm": 0.3432164788246155, + "learning_rate": 0.00019812901178750686, + "loss": 1.2804, + "step": 37860 + }, + { + "epoch": 0.08045981285179257, + "grad_norm": 0.48491454124450684, + "learning_rate": 0.0001981277002107238, + "loss": 1.2076, + "step": 37870 + }, + { + "epoch": 0.08048105917153163, + "grad_norm": 0.4117030203342438, + "learning_rate": 0.0001981263881787335, + "loss": 1.2481, + "step": 37880 + }, + { + "epoch": 0.08050230549127069, + "grad_norm": 0.5274141430854797, + "learning_rate": 0.000198125075691542, + "loss": 1.2373, + "step": 37890 + }, + { + "epoch": 0.08052355181100973, + "grad_norm": 0.42359015345573425, + "learning_rate": 0.0001981237627491554, + "loss": 1.2013, + "step": 37900 + }, + { + "epoch": 0.08054479813074879, + "grad_norm": 0.35298478603363037, + "learning_rate": 0.0001981224493515798, + "loss": 1.2404, + "step": 37910 + }, + { + "epoch": 0.08056604445048784, + "grad_norm": 0.5587897896766663, + "learning_rate": 0.00019812113549882128, + "loss": 1.2311, + "step": 37920 + }, + { + "epoch": 0.08058729077022689, + "grad_norm": 0.6678736805915833, + "learning_rate": 0.00019811982119088595, + "loss": 1.2027, + "step": 37930 + }, + { + "epoch": 0.08060853708996595, + "grad_norm": 0.5888508558273315, + "learning_rate": 0.00019811850642777987, + "loss": 1.2145, + "step": 37940 + }, + { + "epoch": 0.080629783409705, + "grad_norm": 0.3657837212085724, + "learning_rate": 0.0001981171912095092, + "loss": 1.2359, + "step": 37950 + }, + { + "epoch": 0.08065102972944406, + "grad_norm": 0.38461557030677795, + "learning_rate": 0.00019811587553608, + "loss": 1.2279, + "step": 37960 + }, + { + "epoch": 0.0806722760491831, + "grad_norm": 0.4021267890930176, + "learning_rate": 0.00019811455940749837, + "loss": 1.2474, + "step": 37970 + }, + { + "epoch": 0.08069352236892216, + "grad_norm": 0.4308260679244995, + "learning_rate": 0.00019811324282377043, + "loss": 1.2106, + "step": 37980 + }, + { + "epoch": 0.08071476868866122, + "grad_norm": 0.36665216088294983, + "learning_rate": 0.00019811192578490227, + "loss": 1.2334, + "step": 37990 + }, + { + "epoch": 0.08073601500840026, + "grad_norm": 0.6650519967079163, + "learning_rate": 0.00019811060829090006, + "loss": 1.2361, + "step": 38000 + }, + { + "epoch": 0.08075726132813932, + "grad_norm": 0.3651743531227112, + "learning_rate": 0.00019810929034176983, + "loss": 1.2451, + "step": 38010 + }, + { + "epoch": 0.08077850764787838, + "grad_norm": 0.4594978094100952, + "learning_rate": 0.0001981079719375177, + "loss": 1.2171, + "step": 38020 + }, + { + "epoch": 0.08079975396761742, + "grad_norm": 0.3327193260192871, + "learning_rate": 0.00019810665307814986, + "loss": 1.2182, + "step": 38030 + }, + { + "epoch": 0.08082100028735648, + "grad_norm": 0.35964784026145935, + "learning_rate": 0.0001981053337636724, + "loss": 1.2613, + "step": 38040 + }, + { + "epoch": 0.08084224660709553, + "grad_norm": 0.36531123518943787, + "learning_rate": 0.00019810401399409137, + "loss": 1.2126, + "step": 38050 + }, + { + "epoch": 0.08086349292683458, + "grad_norm": 0.3893096446990967, + "learning_rate": 0.000198102693769413, + "loss": 1.223, + "step": 38060 + }, + { + "epoch": 0.08088473924657363, + "grad_norm": 0.3786863386631012, + "learning_rate": 0.0001981013730896433, + "loss": 1.2145, + "step": 38070 + }, + { + "epoch": 0.08090598556631269, + "grad_norm": 0.35849225521087646, + "learning_rate": 0.00019810005195478853, + "loss": 1.2156, + "step": 38080 + }, + { + "epoch": 0.08092723188605173, + "grad_norm": 0.4090849459171295, + "learning_rate": 0.0001980987303648547, + "loss": 1.2175, + "step": 38090 + }, + { + "epoch": 0.08094847820579079, + "grad_norm": 0.3379718065261841, + "learning_rate": 0.000198097408319848, + "loss": 1.2519, + "step": 38100 + }, + { + "epoch": 0.08096972452552985, + "grad_norm": 0.4076424539089203, + "learning_rate": 0.00019809608581977455, + "loss": 1.2224, + "step": 38110 + }, + { + "epoch": 0.08099097084526889, + "grad_norm": 0.34384414553642273, + "learning_rate": 0.00019809476286464047, + "loss": 1.2392, + "step": 38120 + }, + { + "epoch": 0.08101221716500795, + "grad_norm": 0.4937514364719391, + "learning_rate": 0.0001980934394544519, + "loss": 1.2426, + "step": 38130 + }, + { + "epoch": 0.081033463484747, + "grad_norm": 0.3998253047466278, + "learning_rate": 0.00019809211558921504, + "loss": 1.2144, + "step": 38140 + }, + { + "epoch": 0.08105470980448605, + "grad_norm": 0.3435596823692322, + "learning_rate": 0.00019809079126893595, + "loss": 1.2334, + "step": 38150 + }, + { + "epoch": 0.08107595612422511, + "grad_norm": 0.36933597922325134, + "learning_rate": 0.00019808946649362078, + "loss": 1.2372, + "step": 38160 + }, + { + "epoch": 0.08109720244396416, + "grad_norm": 0.3907890021800995, + "learning_rate": 0.00019808814126327575, + "loss": 1.2499, + "step": 38170 + }, + { + "epoch": 0.08111844876370321, + "grad_norm": 0.5235149264335632, + "learning_rate": 0.00019808681557790693, + "loss": 1.2368, + "step": 38180 + }, + { + "epoch": 0.08113969508344226, + "grad_norm": 0.3960829973220825, + "learning_rate": 0.0001980854894375205, + "loss": 1.2261, + "step": 38190 + }, + { + "epoch": 0.08116094140318132, + "grad_norm": 2.1066458225250244, + "learning_rate": 0.0001980841628421226, + "loss": 1.2166, + "step": 38200 + }, + { + "epoch": 0.08118218772292037, + "grad_norm": 0.3491906225681305, + "learning_rate": 0.00019808283579171942, + "loss": 1.2013, + "step": 38210 + }, + { + "epoch": 0.08120343404265942, + "grad_norm": 0.3475070893764496, + "learning_rate": 0.00019808150828631708, + "loss": 1.2145, + "step": 38220 + }, + { + "epoch": 0.08122468036239848, + "grad_norm": 0.48691627383232117, + "learning_rate": 0.00019808018032592176, + "loss": 1.2459, + "step": 38230 + }, + { + "epoch": 0.08124592668213752, + "grad_norm": 0.39228010177612305, + "learning_rate": 0.00019807885191053959, + "loss": 1.2133, + "step": 38240 + }, + { + "epoch": 0.08126717300187658, + "grad_norm": 0.3618859648704529, + "learning_rate": 0.00019807752304017674, + "loss": 1.2289, + "step": 38250 + }, + { + "epoch": 0.08128841932161564, + "grad_norm": 0.33788973093032837, + "learning_rate": 0.00019807619371483944, + "loss": 1.1914, + "step": 38260 + }, + { + "epoch": 0.08130966564135468, + "grad_norm": 0.5250856876373291, + "learning_rate": 0.00019807486393453374, + "loss": 1.2164, + "step": 38270 + }, + { + "epoch": 0.08133091196109374, + "grad_norm": 0.35082241892814636, + "learning_rate": 0.00019807353369926588, + "loss": 1.1966, + "step": 38280 + }, + { + "epoch": 0.0813521582808328, + "grad_norm": 0.3682171106338501, + "learning_rate": 0.00019807220300904207, + "loss": 1.2464, + "step": 38290 + }, + { + "epoch": 0.08137340460057184, + "grad_norm": 0.596433699131012, + "learning_rate": 0.00019807087186386838, + "loss": 1.2178, + "step": 38300 + }, + { + "epoch": 0.0813946509203109, + "grad_norm": 0.3470810651779175, + "learning_rate": 0.0001980695402637511, + "loss": 1.2198, + "step": 38310 + }, + { + "epoch": 0.08141589724004995, + "grad_norm": 0.31787198781967163, + "learning_rate": 0.0001980682082086963, + "loss": 1.2658, + "step": 38320 + }, + { + "epoch": 0.081437143559789, + "grad_norm": 0.36374717950820923, + "learning_rate": 0.0001980668756987102, + "loss": 1.2537, + "step": 38330 + }, + { + "epoch": 0.08145838987952805, + "grad_norm": 0.3622872531414032, + "learning_rate": 0.000198065542733799, + "loss": 1.2183, + "step": 38340 + }, + { + "epoch": 0.08147963619926711, + "grad_norm": 0.5596815943717957, + "learning_rate": 0.0001980642093139689, + "loss": 1.2452, + "step": 38350 + }, + { + "epoch": 0.08150088251900615, + "grad_norm": 0.37715715169906616, + "learning_rate": 0.000198062875439226, + "loss": 1.2956, + "step": 38360 + }, + { + "epoch": 0.08152212883874521, + "grad_norm": 0.6385335326194763, + "learning_rate": 0.0001980615411095766, + "loss": 1.2226, + "step": 38370 + }, + { + "epoch": 0.08154337515848427, + "grad_norm": 0.5999801158905029, + "learning_rate": 0.0001980602063250268, + "loss": 1.2324, + "step": 38380 + }, + { + "epoch": 0.08156462147822333, + "grad_norm": 0.410325288772583, + "learning_rate": 0.00019805887108558285, + "loss": 1.1742, + "step": 38390 + }, + { + "epoch": 0.08158586779796237, + "grad_norm": 0.37606143951416016, + "learning_rate": 0.0001980575353912509, + "loss": 1.2201, + "step": 38400 + }, + { + "epoch": 0.08160711411770143, + "grad_norm": 0.4494180679321289, + "learning_rate": 0.0001980561992420372, + "loss": 1.1934, + "step": 38410 + }, + { + "epoch": 0.08162836043744048, + "grad_norm": 0.6171980500221252, + "learning_rate": 0.0001980548626379479, + "loss": 1.2411, + "step": 38420 + }, + { + "epoch": 0.08164960675717953, + "grad_norm": 0.6576895117759705, + "learning_rate": 0.0001980535255789892, + "loss": 1.2104, + "step": 38430 + }, + { + "epoch": 0.08167085307691858, + "grad_norm": 0.5533120036125183, + "learning_rate": 0.00019805218806516733, + "loss": 1.2029, + "step": 38440 + }, + { + "epoch": 0.08169209939665764, + "grad_norm": 0.4047757387161255, + "learning_rate": 0.00019805085009648847, + "loss": 1.2322, + "step": 38450 + }, + { + "epoch": 0.08171334571639668, + "grad_norm": 0.446417897939682, + "learning_rate": 0.00019804951167295886, + "loss": 1.2121, + "step": 38460 + }, + { + "epoch": 0.08173459203613574, + "grad_norm": 0.5081830024719238, + "learning_rate": 0.00019804817279458467, + "loss": 1.2416, + "step": 38470 + }, + { + "epoch": 0.0817558383558748, + "grad_norm": 0.3923141658306122, + "learning_rate": 0.00019804683346137214, + "loss": 1.2067, + "step": 38480 + }, + { + "epoch": 0.08177708467561384, + "grad_norm": 0.7345935702323914, + "learning_rate": 0.00019804549367332744, + "loss": 1.2231, + "step": 38490 + }, + { + "epoch": 0.0817983309953529, + "grad_norm": 0.5067293643951416, + "learning_rate": 0.00019804415343045687, + "loss": 1.2151, + "step": 38500 + }, + { + "epoch": 0.08181957731509196, + "grad_norm": 0.49902570247650146, + "learning_rate": 0.00019804281273276656, + "loss": 1.2189, + "step": 38510 + }, + { + "epoch": 0.081840823634831, + "grad_norm": 0.48649078607559204, + "learning_rate": 0.0001980414715802628, + "loss": 1.1877, + "step": 38520 + }, + { + "epoch": 0.08186206995457006, + "grad_norm": 0.3645254969596863, + "learning_rate": 0.00019804012997295175, + "loss": 1.2488, + "step": 38530 + }, + { + "epoch": 0.08188331627430911, + "grad_norm": 0.3541102111339569, + "learning_rate": 0.00019803878791083967, + "loss": 1.2526, + "step": 38540 + }, + { + "epoch": 0.08190456259404816, + "grad_norm": 0.547900915145874, + "learning_rate": 0.00019803744539393273, + "loss": 1.2685, + "step": 38550 + }, + { + "epoch": 0.08192580891378722, + "grad_norm": 0.3822389245033264, + "learning_rate": 0.00019803610242223726, + "loss": 1.2371, + "step": 38560 + }, + { + "epoch": 0.08194705523352627, + "grad_norm": 0.3365839123725891, + "learning_rate": 0.0001980347589957594, + "loss": 1.2878, + "step": 38570 + }, + { + "epoch": 0.08196830155326532, + "grad_norm": 0.5989617109298706, + "learning_rate": 0.00019803341511450543, + "loss": 1.2544, + "step": 38580 + }, + { + "epoch": 0.08198954787300437, + "grad_norm": 0.4850156009197235, + "learning_rate": 0.00019803207077848158, + "loss": 1.2344, + "step": 38590 + }, + { + "epoch": 0.08201079419274343, + "grad_norm": 0.3678799569606781, + "learning_rate": 0.0001980307259876941, + "loss": 1.2395, + "step": 38600 + }, + { + "epoch": 0.08203204051248247, + "grad_norm": 0.34861359000205994, + "learning_rate": 0.0001980293807421491, + "loss": 1.2225, + "step": 38610 + }, + { + "epoch": 0.08205328683222153, + "grad_norm": 0.4032555818557739, + "learning_rate": 0.00019802803504185304, + "loss": 1.232, + "step": 38620 + }, + { + "epoch": 0.08207453315196059, + "grad_norm": 0.37489432096481323, + "learning_rate": 0.000198026688886812, + "loss": 1.2294, + "step": 38630 + }, + { + "epoch": 0.08209577947169963, + "grad_norm": 0.5375221967697144, + "learning_rate": 0.00019802534227703227, + "loss": 1.1917, + "step": 38640 + }, + { + "epoch": 0.08211702579143869, + "grad_norm": 0.510090708732605, + "learning_rate": 0.00019802399521252006, + "loss": 1.2098, + "step": 38650 + }, + { + "epoch": 0.08213827211117775, + "grad_norm": 0.4005191922187805, + "learning_rate": 0.00019802264769328172, + "loss": 1.2166, + "step": 38660 + }, + { + "epoch": 0.08215951843091679, + "grad_norm": 0.33740657567977905, + "learning_rate": 0.0001980212997193234, + "loss": 1.2179, + "step": 38670 + }, + { + "epoch": 0.08218076475065585, + "grad_norm": 0.3561663329601288, + "learning_rate": 0.0001980199512906514, + "loss": 1.2278, + "step": 38680 + }, + { + "epoch": 0.0822020110703949, + "grad_norm": 0.45254582166671753, + "learning_rate": 0.00019801860240727196, + "loss": 1.2025, + "step": 38690 + }, + { + "epoch": 0.08222325739013395, + "grad_norm": 0.38781309127807617, + "learning_rate": 0.00019801725306919137, + "loss": 1.2107, + "step": 38700 + }, + { + "epoch": 0.082244503709873, + "grad_norm": 0.40939056873321533, + "learning_rate": 0.00019801590327641586, + "loss": 1.1915, + "step": 38710 + }, + { + "epoch": 0.08226575002961206, + "grad_norm": 0.5288417339324951, + "learning_rate": 0.0001980145530289517, + "loss": 1.1959, + "step": 38720 + }, + { + "epoch": 0.0822869963493511, + "grad_norm": 0.9688785076141357, + "learning_rate": 0.00019801320232680513, + "loss": 1.1986, + "step": 38730 + }, + { + "epoch": 0.08230824266909016, + "grad_norm": 0.6007251739501953, + "learning_rate": 0.00019801185116998245, + "loss": 1.2226, + "step": 38740 + }, + { + "epoch": 0.08232948898882922, + "grad_norm": 0.5038107633590698, + "learning_rate": 0.00019801049955848988, + "loss": 1.2487, + "step": 38750 + }, + { + "epoch": 0.08235073530856826, + "grad_norm": 0.44326168298721313, + "learning_rate": 0.00019800914749233374, + "loss": 1.2268, + "step": 38760 + }, + { + "epoch": 0.08237198162830732, + "grad_norm": 0.4046681821346283, + "learning_rate": 0.0001980077949715203, + "loss": 1.2157, + "step": 38770 + }, + { + "epoch": 0.08239322794804638, + "grad_norm": 0.36491990089416504, + "learning_rate": 0.0001980064419960558, + "loss": 1.222, + "step": 38780 + }, + { + "epoch": 0.08241447426778542, + "grad_norm": 0.4972419738769531, + "learning_rate": 0.00019800508856594655, + "loss": 1.2071, + "step": 38790 + }, + { + "epoch": 0.08243572058752448, + "grad_norm": 0.4922252297401428, + "learning_rate": 0.0001980037346811988, + "loss": 1.2041, + "step": 38800 + }, + { + "epoch": 0.08245696690726353, + "grad_norm": 0.35506340861320496, + "learning_rate": 0.00019800238034181885, + "loss": 1.2334, + "step": 38810 + }, + { + "epoch": 0.08247821322700259, + "grad_norm": 0.4840454161167145, + "learning_rate": 0.000198001025547813, + "loss": 1.2603, + "step": 38820 + }, + { + "epoch": 0.08249945954674164, + "grad_norm": 0.4149545133113861, + "learning_rate": 0.0001979996702991875, + "loss": 1.2229, + "step": 38830 + }, + { + "epoch": 0.08252070586648069, + "grad_norm": 0.33762675523757935, + "learning_rate": 0.0001979983145959486, + "loss": 1.1942, + "step": 38840 + }, + { + "epoch": 0.08254195218621975, + "grad_norm": 0.3976956307888031, + "learning_rate": 0.0001979969584381027, + "loss": 1.2262, + "step": 38850 + }, + { + "epoch": 0.08256319850595879, + "grad_norm": 0.39065393805503845, + "learning_rate": 0.000197995601825656, + "loss": 1.2441, + "step": 38860 + }, + { + "epoch": 0.08258444482569785, + "grad_norm": 0.4477425515651703, + "learning_rate": 0.00019799424475861485, + "loss": 1.2052, + "step": 38870 + }, + { + "epoch": 0.08260569114543691, + "grad_norm": 0.34730809926986694, + "learning_rate": 0.00019799288723698548, + "loss": 1.2272, + "step": 38880 + }, + { + "epoch": 0.08262693746517595, + "grad_norm": 0.321196049451828, + "learning_rate": 0.00019799152926077427, + "loss": 1.2498, + "step": 38890 + }, + { + "epoch": 0.08264818378491501, + "grad_norm": 0.37678009271621704, + "learning_rate": 0.00019799017082998744, + "loss": 1.2057, + "step": 38900 + }, + { + "epoch": 0.08266943010465407, + "grad_norm": 0.3636118769645691, + "learning_rate": 0.00019798881194463134, + "loss": 1.2416, + "step": 38910 + }, + { + "epoch": 0.08269067642439311, + "grad_norm": 0.3351154029369354, + "learning_rate": 0.00019798745260471225, + "loss": 1.2394, + "step": 38920 + }, + { + "epoch": 0.08271192274413217, + "grad_norm": 0.4904910922050476, + "learning_rate": 0.00019798609281023648, + "loss": 1.2264, + "step": 38930 + }, + { + "epoch": 0.08273316906387122, + "grad_norm": 0.380941778421402, + "learning_rate": 0.00019798473256121035, + "loss": 1.1578, + "step": 38940 + }, + { + "epoch": 0.08275441538361027, + "grad_norm": 0.4804306924343109, + "learning_rate": 0.00019798337185764018, + "loss": 1.2458, + "step": 38950 + }, + { + "epoch": 0.08277566170334932, + "grad_norm": 0.3546751141548157, + "learning_rate": 0.00019798201069953225, + "loss": 1.1962, + "step": 38960 + }, + { + "epoch": 0.08279690802308838, + "grad_norm": 0.3139996826648712, + "learning_rate": 0.00019798064908689288, + "loss": 1.2469, + "step": 38970 + }, + { + "epoch": 0.08281815434282742, + "grad_norm": 0.42834439873695374, + "learning_rate": 0.00019797928701972844, + "loss": 1.21, + "step": 38980 + }, + { + "epoch": 0.08283940066256648, + "grad_norm": 0.3506881892681122, + "learning_rate": 0.00019797792449804515, + "loss": 1.2474, + "step": 38990 + }, + { + "epoch": 0.08286064698230554, + "grad_norm": 0.5457578897476196, + "learning_rate": 0.0001979765615218494, + "loss": 1.2687, + "step": 39000 + }, + { + "epoch": 0.08288189330204458, + "grad_norm": 0.354844868183136, + "learning_rate": 0.00019797519809114752, + "loss": 1.2007, + "step": 39010 + }, + { + "epoch": 0.08290313962178364, + "grad_norm": 0.4438607692718506, + "learning_rate": 0.00019797383420594583, + "loss": 1.2212, + "step": 39020 + }, + { + "epoch": 0.0829243859415227, + "grad_norm": 0.40097033977508545, + "learning_rate": 0.0001979724698662506, + "loss": 1.1981, + "step": 39030 + }, + { + "epoch": 0.08294563226126174, + "grad_norm": 0.38095030188560486, + "learning_rate": 0.00019797110507206823, + "loss": 1.2507, + "step": 39040 + }, + { + "epoch": 0.0829668785810008, + "grad_norm": 0.5308555960655212, + "learning_rate": 0.000197969739823405, + "loss": 1.2218, + "step": 39050 + }, + { + "epoch": 0.08298812490073985, + "grad_norm": 0.3999587595462799, + "learning_rate": 0.00019796837412026726, + "loss": 1.2223, + "step": 39060 + }, + { + "epoch": 0.0830093712204789, + "grad_norm": 0.3876534104347229, + "learning_rate": 0.00019796700796266136, + "loss": 1.2453, + "step": 39070 + }, + { + "epoch": 0.08303061754021795, + "grad_norm": 0.39944854378700256, + "learning_rate": 0.00019796564135059362, + "loss": 1.2192, + "step": 39080 + }, + { + "epoch": 0.08305186385995701, + "grad_norm": 0.35703563690185547, + "learning_rate": 0.0001979642742840704, + "loss": 1.2349, + "step": 39090 + }, + { + "epoch": 0.08307311017969606, + "grad_norm": 0.43324699997901917, + "learning_rate": 0.00019796290676309802, + "loss": 1.2075, + "step": 39100 + }, + { + "epoch": 0.08309435649943511, + "grad_norm": 0.5110551118850708, + "learning_rate": 0.00019796153878768284, + "loss": 1.2058, + "step": 39110 + }, + { + "epoch": 0.08311560281917417, + "grad_norm": 0.4183647036552429, + "learning_rate": 0.00019796017035783118, + "loss": 1.2764, + "step": 39120 + }, + { + "epoch": 0.08313684913891321, + "grad_norm": 0.6362654566764832, + "learning_rate": 0.00019795880147354942, + "loss": 1.269, + "step": 39130 + }, + { + "epoch": 0.08315809545865227, + "grad_norm": 0.40006023645401, + "learning_rate": 0.0001979574321348439, + "loss": 1.2339, + "step": 39140 + }, + { + "epoch": 0.08317934177839133, + "grad_norm": 0.5927096605300903, + "learning_rate": 0.00019795606234172098, + "loss": 1.1923, + "step": 39150 + }, + { + "epoch": 0.08320058809813037, + "grad_norm": 0.4042358100414276, + "learning_rate": 0.000197954692094187, + "loss": 1.2365, + "step": 39160 + }, + { + "epoch": 0.08322183441786943, + "grad_norm": 0.39290913939476013, + "learning_rate": 0.0001979533213922483, + "loss": 1.2268, + "step": 39170 + }, + { + "epoch": 0.08324308073760849, + "grad_norm": 0.3590940237045288, + "learning_rate": 0.00019795195023591125, + "loss": 1.2253, + "step": 39180 + }, + { + "epoch": 0.08326432705734753, + "grad_norm": 0.5506341457366943, + "learning_rate": 0.00019795057862518226, + "loss": 1.2076, + "step": 39190 + }, + { + "epoch": 0.08328557337708659, + "grad_norm": 0.36980125308036804, + "learning_rate": 0.0001979492065600676, + "loss": 1.2308, + "step": 39200 + }, + { + "epoch": 0.08330681969682564, + "grad_norm": 0.36076459288597107, + "learning_rate": 0.00019794783404057374, + "loss": 1.2192, + "step": 39210 + }, + { + "epoch": 0.08332806601656469, + "grad_norm": 0.39142075181007385, + "learning_rate": 0.00019794646106670695, + "loss": 1.2507, + "step": 39220 + }, + { + "epoch": 0.08334931233630374, + "grad_norm": 0.790238618850708, + "learning_rate": 0.00019794508763847367, + "loss": 1.2198, + "step": 39230 + }, + { + "epoch": 0.0833705586560428, + "grad_norm": 0.3883480727672577, + "learning_rate": 0.0001979437137558802, + "loss": 1.21, + "step": 39240 + }, + { + "epoch": 0.08339180497578186, + "grad_norm": 0.4617142081260681, + "learning_rate": 0.000197942339418933, + "loss": 1.201, + "step": 39250 + }, + { + "epoch": 0.0834130512955209, + "grad_norm": 0.44634759426116943, + "learning_rate": 0.0001979409646276384, + "loss": 1.22, + "step": 39260 + }, + { + "epoch": 0.08343429761525996, + "grad_norm": 0.4990612864494324, + "learning_rate": 0.00019793958938200276, + "loss": 1.2086, + "step": 39270 + }, + { + "epoch": 0.08345554393499902, + "grad_norm": 0.454734206199646, + "learning_rate": 0.0001979382136820325, + "loss": 1.2158, + "step": 39280 + }, + { + "epoch": 0.08347679025473806, + "grad_norm": 0.3296569883823395, + "learning_rate": 0.000197936837527734, + "loss": 1.2154, + "step": 39290 + }, + { + "epoch": 0.08349803657447712, + "grad_norm": 0.3879701793193817, + "learning_rate": 0.00019793546091911358, + "loss": 1.2211, + "step": 39300 + }, + { + "epoch": 0.08351928289421617, + "grad_norm": 0.33428454399108887, + "learning_rate": 0.0001979340838561777, + "loss": 1.2218, + "step": 39310 + }, + { + "epoch": 0.08354052921395522, + "grad_norm": 0.3166261315345764, + "learning_rate": 0.0001979327063389327, + "loss": 1.1968, + "step": 39320 + }, + { + "epoch": 0.08356177553369427, + "grad_norm": 0.37858474254608154, + "learning_rate": 0.00019793132836738503, + "loss": 1.2183, + "step": 39330 + }, + { + "epoch": 0.08358302185343333, + "grad_norm": 0.382676362991333, + "learning_rate": 0.000197929949941541, + "loss": 1.2222, + "step": 39340 + }, + { + "epoch": 0.08360426817317237, + "grad_norm": 0.3336533010005951, + "learning_rate": 0.00019792857106140706, + "loss": 1.2209, + "step": 39350 + }, + { + "epoch": 0.08362551449291143, + "grad_norm": 0.3491182327270508, + "learning_rate": 0.0001979271917269896, + "loss": 1.2278, + "step": 39360 + }, + { + "epoch": 0.08364676081265049, + "grad_norm": 0.3805896043777466, + "learning_rate": 0.000197925811938295, + "loss": 1.2266, + "step": 39370 + }, + { + "epoch": 0.08366800713238953, + "grad_norm": 0.4596502482891083, + "learning_rate": 0.00019792443169532967, + "loss": 1.2205, + "step": 39380 + }, + { + "epoch": 0.08368925345212859, + "grad_norm": 0.4948384165763855, + "learning_rate": 0.00019792305099810003, + "loss": 1.2588, + "step": 39390 + }, + { + "epoch": 0.08371049977186765, + "grad_norm": 0.3328603208065033, + "learning_rate": 0.00019792166984661247, + "loss": 1.2059, + "step": 39400 + }, + { + "epoch": 0.08373174609160669, + "grad_norm": 0.4175824522972107, + "learning_rate": 0.00019792028824087336, + "loss": 1.2662, + "step": 39410 + }, + { + "epoch": 0.08375299241134575, + "grad_norm": 0.3500354588031769, + "learning_rate": 0.0001979189061808892, + "loss": 1.1736, + "step": 39420 + }, + { + "epoch": 0.0837742387310848, + "grad_norm": 0.40433523058891296, + "learning_rate": 0.00019791752366666633, + "loss": 1.2031, + "step": 39430 + }, + { + "epoch": 0.08379548505082385, + "grad_norm": 0.42786213755607605, + "learning_rate": 0.00019791614069821116, + "loss": 1.1883, + "step": 39440 + }, + { + "epoch": 0.0838167313705629, + "grad_norm": 0.37540602684020996, + "learning_rate": 0.00019791475727553014, + "loss": 1.2151, + "step": 39450 + }, + { + "epoch": 0.08383797769030196, + "grad_norm": 0.4469187557697296, + "learning_rate": 0.00019791337339862967, + "loss": 1.1862, + "step": 39460 + }, + { + "epoch": 0.083859224010041, + "grad_norm": 0.4320175051689148, + "learning_rate": 0.0001979119890675162, + "loss": 1.1943, + "step": 39470 + }, + { + "epoch": 0.08388047032978006, + "grad_norm": 0.3159637153148651, + "learning_rate": 0.0001979106042821961, + "loss": 1.2176, + "step": 39480 + }, + { + "epoch": 0.08390171664951912, + "grad_norm": 0.3577759265899658, + "learning_rate": 0.00019790921904267584, + "loss": 1.2728, + "step": 39490 + }, + { + "epoch": 0.08392296296925816, + "grad_norm": 0.40142112970352173, + "learning_rate": 0.00019790783334896182, + "loss": 1.232, + "step": 39500 + }, + { + "epoch": 0.08394420928899722, + "grad_norm": 0.46781036257743835, + "learning_rate": 0.00019790644720106044, + "loss": 1.216, + "step": 39510 + }, + { + "epoch": 0.08396545560873628, + "grad_norm": 0.44737717509269714, + "learning_rate": 0.0001979050605989782, + "loss": 1.2509, + "step": 39520 + }, + { + "epoch": 0.08398670192847532, + "grad_norm": 0.34662920236587524, + "learning_rate": 0.00019790367354272147, + "loss": 1.2317, + "step": 39530 + }, + { + "epoch": 0.08400794824821438, + "grad_norm": 0.38437604904174805, + "learning_rate": 0.00019790228603229671, + "loss": 1.223, + "step": 39540 + }, + { + "epoch": 0.08402919456795344, + "grad_norm": 0.3576034605503082, + "learning_rate": 0.0001979008980677104, + "loss": 1.2488, + "step": 39550 + }, + { + "epoch": 0.08405044088769248, + "grad_norm": 0.36473655700683594, + "learning_rate": 0.0001978995096489689, + "loss": 1.2213, + "step": 39560 + }, + { + "epoch": 0.08407168720743154, + "grad_norm": 0.357953280210495, + "learning_rate": 0.0001978981207760787, + "loss": 1.1975, + "step": 39570 + }, + { + "epoch": 0.0840929335271706, + "grad_norm": 0.5490621328353882, + "learning_rate": 0.00019789673144904625, + "loss": 1.2585, + "step": 39580 + }, + { + "epoch": 0.08411417984690964, + "grad_norm": 0.5028707981109619, + "learning_rate": 0.00019789534166787792, + "loss": 1.2243, + "step": 39590 + }, + { + "epoch": 0.0841354261666487, + "grad_norm": 0.3297739028930664, + "learning_rate": 0.00019789395143258025, + "loss": 1.2012, + "step": 39600 + }, + { + "epoch": 0.08415667248638775, + "grad_norm": 0.4283115267753601, + "learning_rate": 0.00019789256074315964, + "loss": 1.2176, + "step": 39610 + }, + { + "epoch": 0.0841779188061268, + "grad_norm": 0.43724754452705383, + "learning_rate": 0.00019789116959962257, + "loss": 1.2381, + "step": 39620 + }, + { + "epoch": 0.08419916512586585, + "grad_norm": 0.697408139705658, + "learning_rate": 0.00019788977800197545, + "loss": 1.2095, + "step": 39630 + }, + { + "epoch": 0.08422041144560491, + "grad_norm": 0.3308625817298889, + "learning_rate": 0.00019788838595022478, + "loss": 1.201, + "step": 39640 + }, + { + "epoch": 0.08424165776534395, + "grad_norm": 0.5043078660964966, + "learning_rate": 0.000197886993444377, + "loss": 1.2325, + "step": 39650 + }, + { + "epoch": 0.08426290408508301, + "grad_norm": 0.7350984215736389, + "learning_rate": 0.0001978856004844386, + "loss": 1.2281, + "step": 39660 + }, + { + "epoch": 0.08428415040482207, + "grad_norm": 0.40177613496780396, + "learning_rate": 0.00019788420707041594, + "loss": 1.2595, + "step": 39670 + }, + { + "epoch": 0.08430539672456112, + "grad_norm": 0.5654754638671875, + "learning_rate": 0.0001978828132023156, + "loss": 1.216, + "step": 39680 + }, + { + "epoch": 0.08432664304430017, + "grad_norm": 0.3529897928237915, + "learning_rate": 0.000197881418880144, + "loss": 1.2004, + "step": 39690 + }, + { + "epoch": 0.08434788936403922, + "grad_norm": 0.36556291580200195, + "learning_rate": 0.0001978800241039076, + "loss": 1.2214, + "step": 39700 + }, + { + "epoch": 0.08436913568377828, + "grad_norm": 0.521203875541687, + "learning_rate": 0.00019787862887361284, + "loss": 1.2173, + "step": 39710 + }, + { + "epoch": 0.08439038200351733, + "grad_norm": 0.46020835638046265, + "learning_rate": 0.00019787723318926627, + "loss": 1.2376, + "step": 39720 + }, + { + "epoch": 0.08441162832325638, + "grad_norm": 0.6013027429580688, + "learning_rate": 0.00019787583705087433, + "loss": 1.2624, + "step": 39730 + }, + { + "epoch": 0.08443287464299544, + "grad_norm": 0.650457501411438, + "learning_rate": 0.00019787444045844346, + "loss": 1.2314, + "step": 39740 + }, + { + "epoch": 0.08445412096273448, + "grad_norm": 0.714241623878479, + "learning_rate": 0.0001978730434119802, + "loss": 1.2209, + "step": 39750 + }, + { + "epoch": 0.08447536728247354, + "grad_norm": 0.38469016551971436, + "learning_rate": 0.000197871645911491, + "loss": 1.1999, + "step": 39760 + }, + { + "epoch": 0.0844966136022126, + "grad_norm": 0.6477208137512207, + "learning_rate": 0.00019787024795698237, + "loss": 1.2487, + "step": 39770 + }, + { + "epoch": 0.08451785992195164, + "grad_norm": 0.943854570388794, + "learning_rate": 0.0001978688495484607, + "loss": 1.2318, + "step": 39780 + }, + { + "epoch": 0.0845391062416907, + "grad_norm": 0.35811156034469604, + "learning_rate": 0.00019786745068593262, + "loss": 1.2676, + "step": 39790 + }, + { + "epoch": 0.08456035256142976, + "grad_norm": 0.35134410858154297, + "learning_rate": 0.00019786605136940452, + "loss": 1.1904, + "step": 39800 + }, + { + "epoch": 0.0845815988811688, + "grad_norm": 0.45842939615249634, + "learning_rate": 0.0001978646515988829, + "loss": 1.2015, + "step": 39810 + }, + { + "epoch": 0.08460284520090786, + "grad_norm": 0.3729172646999359, + "learning_rate": 0.0001978632513743743, + "loss": 1.2555, + "step": 39820 + }, + { + "epoch": 0.08462409152064691, + "grad_norm": 0.35593220591545105, + "learning_rate": 0.00019786185069588518, + "loss": 1.1949, + "step": 39830 + }, + { + "epoch": 0.08464533784038596, + "grad_norm": 0.3999347984790802, + "learning_rate": 0.00019786044956342203, + "loss": 1.2439, + "step": 39840 + }, + { + "epoch": 0.08466658416012501, + "grad_norm": 0.37279555201530457, + "learning_rate": 0.00019785904797699135, + "loss": 1.2586, + "step": 39850 + }, + { + "epoch": 0.08468783047986407, + "grad_norm": 0.5829684138298035, + "learning_rate": 0.00019785764593659969, + "loss": 1.2408, + "step": 39860 + }, + { + "epoch": 0.08470907679960311, + "grad_norm": 0.40796566009521484, + "learning_rate": 0.0001978562434422535, + "loss": 1.2481, + "step": 39870 + }, + { + "epoch": 0.08473032311934217, + "grad_norm": 0.38745787739753723, + "learning_rate": 0.0001978548404939593, + "loss": 1.2564, + "step": 39880 + }, + { + "epoch": 0.08475156943908123, + "grad_norm": 0.42111214995384216, + "learning_rate": 0.0001978534370917236, + "loss": 1.2318, + "step": 39890 + }, + { + "epoch": 0.08477281575882027, + "grad_norm": 0.5352404713630676, + "learning_rate": 0.00019785203323555294, + "loss": 1.2589, + "step": 39900 + }, + { + "epoch": 0.08479406207855933, + "grad_norm": 0.35934439301490784, + "learning_rate": 0.00019785062892545377, + "loss": 1.2298, + "step": 39910 + }, + { + "epoch": 0.08481530839829839, + "grad_norm": 0.37938109040260315, + "learning_rate": 0.0001978492241614327, + "loss": 1.2125, + "step": 39920 + }, + { + "epoch": 0.08483655471803743, + "grad_norm": 1.3525490760803223, + "learning_rate": 0.00019784781894349612, + "loss": 1.1967, + "step": 39930 + }, + { + "epoch": 0.08485780103777649, + "grad_norm": 0.3359939455986023, + "learning_rate": 0.00019784641327165065, + "loss": 1.1921, + "step": 39940 + }, + { + "epoch": 0.08487904735751554, + "grad_norm": 0.40856149792671204, + "learning_rate": 0.0001978450071459028, + "loss": 1.1903, + "step": 39950 + }, + { + "epoch": 0.08490029367725459, + "grad_norm": 0.4462795555591583, + "learning_rate": 0.00019784360056625903, + "loss": 1.2019, + "step": 39960 + }, + { + "epoch": 0.08492153999699364, + "grad_norm": 0.5108676552772522, + "learning_rate": 0.00019784219353272592, + "loss": 1.1674, + "step": 39970 + }, + { + "epoch": 0.0849427863167327, + "grad_norm": 0.3774508833885193, + "learning_rate": 0.00019784078604530997, + "loss": 1.2075, + "step": 39980 + }, + { + "epoch": 0.08496403263647175, + "grad_norm": 0.37737733125686646, + "learning_rate": 0.00019783937810401777, + "loss": 1.2035, + "step": 39990 + }, + { + "epoch": 0.0849852789562108, + "grad_norm": 0.5021188259124756, + "learning_rate": 0.00019783796970885574, + "loss": 1.2294, + "step": 40000 + }, + { + "epoch": 0.08500652527594986, + "grad_norm": 0.55497807264328, + "learning_rate": 0.0001978365608598305, + "loss": 1.198, + "step": 40010 + }, + { + "epoch": 0.0850277715956889, + "grad_norm": 0.42708298563957214, + "learning_rate": 0.0001978351515569486, + "loss": 1.2381, + "step": 40020 + }, + { + "epoch": 0.08504901791542796, + "grad_norm": 0.41687268018722534, + "learning_rate": 0.00019783374180021648, + "loss": 1.2181, + "step": 40030 + }, + { + "epoch": 0.08507026423516702, + "grad_norm": 0.31952279806137085, + "learning_rate": 0.00019783233158964075, + "loss": 1.2424, + "step": 40040 + }, + { + "epoch": 0.08509151055490606, + "grad_norm": 0.5223589539527893, + "learning_rate": 0.00019783092092522797, + "loss": 1.2095, + "step": 40050 + }, + { + "epoch": 0.08511275687464512, + "grad_norm": 0.35712873935699463, + "learning_rate": 0.00019782950980698464, + "loss": 1.2321, + "step": 40060 + }, + { + "epoch": 0.08513400319438418, + "grad_norm": 0.33003851771354675, + "learning_rate": 0.00019782809823491732, + "loss": 1.2547, + "step": 40070 + }, + { + "epoch": 0.08515524951412323, + "grad_norm": 0.47179731726646423, + "learning_rate": 0.00019782668620903256, + "loss": 1.2226, + "step": 40080 + }, + { + "epoch": 0.08517649583386228, + "grad_norm": 0.3643975555896759, + "learning_rate": 0.0001978252737293369, + "loss": 1.2594, + "step": 40090 + }, + { + "epoch": 0.08519774215360133, + "grad_norm": 0.32705768942832947, + "learning_rate": 0.00019782386079583692, + "loss": 1.2143, + "step": 40100 + }, + { + "epoch": 0.08521898847334039, + "grad_norm": 0.3809202313423157, + "learning_rate": 0.00019782244740853915, + "loss": 1.2163, + "step": 40110 + }, + { + "epoch": 0.08524023479307943, + "grad_norm": 0.3477994501590729, + "learning_rate": 0.00019782103356745013, + "loss": 1.2466, + "step": 40120 + }, + { + "epoch": 0.08526148111281849, + "grad_norm": 0.46762070059776306, + "learning_rate": 0.00019781961927257648, + "loss": 1.1912, + "step": 40130 + }, + { + "epoch": 0.08528272743255755, + "grad_norm": 0.37733179330825806, + "learning_rate": 0.0001978182045239247, + "loss": 1.2001, + "step": 40140 + }, + { + "epoch": 0.08530397375229659, + "grad_norm": 0.3710990846157074, + "learning_rate": 0.00019781678932150138, + "loss": 1.1936, + "step": 40150 + }, + { + "epoch": 0.08532522007203565, + "grad_norm": 0.3427871763706207, + "learning_rate": 0.00019781537366531308, + "loss": 1.249, + "step": 40160 + }, + { + "epoch": 0.0853464663917747, + "grad_norm": 0.44412076473236084, + "learning_rate": 0.00019781395755536635, + "loss": 1.221, + "step": 40170 + }, + { + "epoch": 0.08536771271151375, + "grad_norm": 0.4490886628627777, + "learning_rate": 0.00019781254099166781, + "loss": 1.2241, + "step": 40180 + }, + { + "epoch": 0.0853889590312528, + "grad_norm": 0.34641581773757935, + "learning_rate": 0.000197811123974224, + "loss": 1.2347, + "step": 40190 + }, + { + "epoch": 0.08541020535099186, + "grad_norm": 0.3376452922821045, + "learning_rate": 0.00019780970650304147, + "loss": 1.2476, + "step": 40200 + }, + { + "epoch": 0.0854314516707309, + "grad_norm": 0.38403573632240295, + "learning_rate": 0.0001978082885781268, + "loss": 1.2274, + "step": 40210 + }, + { + "epoch": 0.08545269799046996, + "grad_norm": 0.40283405780792236, + "learning_rate": 0.00019780687019948663, + "loss": 1.2033, + "step": 40220 + }, + { + "epoch": 0.08547394431020902, + "grad_norm": 0.3920096755027771, + "learning_rate": 0.00019780545136712747, + "loss": 1.2028, + "step": 40230 + }, + { + "epoch": 0.08549519062994806, + "grad_norm": 0.39083436131477356, + "learning_rate": 0.0001978040320810559, + "loss": 1.2591, + "step": 40240 + }, + { + "epoch": 0.08551643694968712, + "grad_norm": 0.3937593102455139, + "learning_rate": 0.00019780261234127858, + "loss": 1.2067, + "step": 40250 + }, + { + "epoch": 0.08553768326942618, + "grad_norm": 0.3660230040550232, + "learning_rate": 0.000197801192147802, + "loss": 1.247, + "step": 40260 + }, + { + "epoch": 0.08555892958916522, + "grad_norm": 0.6536799669265747, + "learning_rate": 0.00019779977150063285, + "loss": 1.224, + "step": 40270 + }, + { + "epoch": 0.08558017590890428, + "grad_norm": 0.4202069342136383, + "learning_rate": 0.00019779835039977764, + "loss": 1.2203, + "step": 40280 + }, + { + "epoch": 0.08560142222864334, + "grad_norm": 0.4163689613342285, + "learning_rate": 0.000197796928845243, + "loss": 1.2602, + "step": 40290 + }, + { + "epoch": 0.08562266854838238, + "grad_norm": 0.36221304535865784, + "learning_rate": 0.00019779550683703546, + "loss": 1.1947, + "step": 40300 + }, + { + "epoch": 0.08564391486812144, + "grad_norm": 0.37690794467926025, + "learning_rate": 0.00019779408437516172, + "loss": 1.2618, + "step": 40310 + }, + { + "epoch": 0.0856651611878605, + "grad_norm": 0.3310394585132599, + "learning_rate": 0.0001977926614596283, + "loss": 1.2006, + "step": 40320 + }, + { + "epoch": 0.08568640750759954, + "grad_norm": 0.4129329025745392, + "learning_rate": 0.00019779123809044182, + "loss": 1.2154, + "step": 40330 + }, + { + "epoch": 0.0857076538273386, + "grad_norm": 0.36137285828590393, + "learning_rate": 0.00019778981426760893, + "loss": 1.2508, + "step": 40340 + }, + { + "epoch": 0.08572890014707765, + "grad_norm": 0.3757370710372925, + "learning_rate": 0.00019778838999113618, + "loss": 1.2417, + "step": 40350 + }, + { + "epoch": 0.0857501464668167, + "grad_norm": 0.3746199607849121, + "learning_rate": 0.00019778696526103018, + "loss": 1.1988, + "step": 40360 + }, + { + "epoch": 0.08577139278655575, + "grad_norm": 0.4106649160385132, + "learning_rate": 0.00019778554007729755, + "loss": 1.2146, + "step": 40370 + }, + { + "epoch": 0.08579263910629481, + "grad_norm": 0.4678076207637787, + "learning_rate": 0.0001977841144399449, + "loss": 1.2242, + "step": 40380 + }, + { + "epoch": 0.08581388542603385, + "grad_norm": 0.3588945269584656, + "learning_rate": 0.0001977826883489789, + "loss": 1.2041, + "step": 40390 + }, + { + "epoch": 0.08583513174577291, + "grad_norm": 0.36988377571105957, + "learning_rate": 0.00019778126180440607, + "loss": 1.2577, + "step": 40400 + }, + { + "epoch": 0.08585637806551197, + "grad_norm": 0.38251304626464844, + "learning_rate": 0.00019777983480623304, + "loss": 1.2318, + "step": 40410 + }, + { + "epoch": 0.08587762438525101, + "grad_norm": 0.3705804944038391, + "learning_rate": 0.0001977784073544665, + "loss": 1.2146, + "step": 40420 + }, + { + "epoch": 0.08589887070499007, + "grad_norm": 0.3296132981777191, + "learning_rate": 0.00019777697944911303, + "loss": 1.2361, + "step": 40430 + }, + { + "epoch": 0.08592011702472913, + "grad_norm": 0.3738018870353699, + "learning_rate": 0.00019777555109017924, + "loss": 1.2001, + "step": 40440 + }, + { + "epoch": 0.08594136334446817, + "grad_norm": 0.39102768898010254, + "learning_rate": 0.00019777412227767177, + "loss": 1.1891, + "step": 40450 + }, + { + "epoch": 0.08596260966420723, + "grad_norm": 0.3835037350654602, + "learning_rate": 0.00019777269301159728, + "loss": 1.2293, + "step": 40460 + }, + { + "epoch": 0.08598385598394628, + "grad_norm": 0.4741373658180237, + "learning_rate": 0.00019777126329196233, + "loss": 1.2254, + "step": 40470 + }, + { + "epoch": 0.08600510230368533, + "grad_norm": 0.4276779294013977, + "learning_rate": 0.0001977698331187736, + "loss": 1.2728, + "step": 40480 + }, + { + "epoch": 0.08602634862342438, + "grad_norm": 0.3996749520301819, + "learning_rate": 0.00019776840249203774, + "loss": 1.2512, + "step": 40490 + }, + { + "epoch": 0.08604759494316344, + "grad_norm": 0.5110123753547668, + "learning_rate": 0.00019776697141176138, + "loss": 1.2171, + "step": 40500 + }, + { + "epoch": 0.0860688412629025, + "grad_norm": 0.5188087821006775, + "learning_rate": 0.0001977655398779511, + "loss": 1.2514, + "step": 40510 + }, + { + "epoch": 0.08609008758264154, + "grad_norm": 0.3510338366031647, + "learning_rate": 0.00019776410789061358, + "loss": 1.2158, + "step": 40520 + }, + { + "epoch": 0.0861113339023806, + "grad_norm": 0.3824930489063263, + "learning_rate": 0.00019776267544975552, + "loss": 1.2624, + "step": 40530 + }, + { + "epoch": 0.08613258022211966, + "grad_norm": 0.43981391191482544, + "learning_rate": 0.00019776124255538346, + "loss": 1.244, + "step": 40540 + }, + { + "epoch": 0.0861538265418587, + "grad_norm": 0.41158151626586914, + "learning_rate": 0.0001977598092075041, + "loss": 1.2628, + "step": 40550 + }, + { + "epoch": 0.08617507286159776, + "grad_norm": 0.4199158251285553, + "learning_rate": 0.00019775837540612413, + "loss": 1.2053, + "step": 40560 + }, + { + "epoch": 0.08619631918133681, + "grad_norm": 0.4235318601131439, + "learning_rate": 0.0001977569411512501, + "loss": 1.2486, + "step": 40570 + }, + { + "epoch": 0.08621756550107586, + "grad_norm": 0.3328097462654114, + "learning_rate": 0.00019775550644288877, + "loss": 1.2182, + "step": 40580 + }, + { + "epoch": 0.08623881182081491, + "grad_norm": 0.4862784743309021, + "learning_rate": 0.0001977540712810467, + "loss": 1.2279, + "step": 40590 + }, + { + "epoch": 0.08626005814055397, + "grad_norm": 0.35227081179618835, + "learning_rate": 0.00019775263566573062, + "loss": 1.1891, + "step": 40600 + }, + { + "epoch": 0.08628130446029302, + "grad_norm": 0.5782122611999512, + "learning_rate": 0.00019775119959694718, + "loss": 1.2162, + "step": 40610 + }, + { + "epoch": 0.08630255078003207, + "grad_norm": 0.5558004379272461, + "learning_rate": 0.000197749763074703, + "loss": 1.2379, + "step": 40620 + }, + { + "epoch": 0.08632379709977113, + "grad_norm": 0.3374710977077484, + "learning_rate": 0.0001977483260990048, + "loss": 1.2342, + "step": 40630 + }, + { + "epoch": 0.08634504341951017, + "grad_norm": 0.3587123453617096, + "learning_rate": 0.0001977468886698592, + "loss": 1.2071, + "step": 40640 + }, + { + "epoch": 0.08636628973924923, + "grad_norm": 0.4571719467639923, + "learning_rate": 0.00019774545078727286, + "loss": 1.2149, + "step": 40650 + }, + { + "epoch": 0.08638753605898829, + "grad_norm": 0.441906601190567, + "learning_rate": 0.00019774401245125254, + "loss": 1.2129, + "step": 40660 + }, + { + "epoch": 0.08640878237872733, + "grad_norm": 0.38829314708709717, + "learning_rate": 0.00019774257366180476, + "loss": 1.2674, + "step": 40670 + }, + { + "epoch": 0.08643002869846639, + "grad_norm": 0.37281525135040283, + "learning_rate": 0.00019774113441893635, + "loss": 1.2447, + "step": 40680 + }, + { + "epoch": 0.08645127501820545, + "grad_norm": 0.3578064739704132, + "learning_rate": 0.0001977396947226539, + "loss": 1.1981, + "step": 40690 + }, + { + "epoch": 0.08647252133794449, + "grad_norm": 0.3498438596725464, + "learning_rate": 0.00019773825457296412, + "loss": 1.2528, + "step": 40700 + }, + { + "epoch": 0.08649376765768355, + "grad_norm": 0.38350147008895874, + "learning_rate": 0.00019773681396987366, + "loss": 1.2439, + "step": 40710 + }, + { + "epoch": 0.0865150139774226, + "grad_norm": 0.42178410291671753, + "learning_rate": 0.0001977353729133892, + "loss": 1.2476, + "step": 40720 + }, + { + "epoch": 0.08653626029716165, + "grad_norm": 0.37095415592193604, + "learning_rate": 0.00019773393140351745, + "loss": 1.1911, + "step": 40730 + }, + { + "epoch": 0.0865575066169007, + "grad_norm": 0.38398540019989014, + "learning_rate": 0.00019773248944026511, + "loss": 1.1919, + "step": 40740 + }, + { + "epoch": 0.08657875293663976, + "grad_norm": 0.5355706810951233, + "learning_rate": 0.00019773104702363887, + "loss": 1.1935, + "step": 40750 + }, + { + "epoch": 0.0865999992563788, + "grad_norm": 0.39033985137939453, + "learning_rate": 0.00019772960415364538, + "loss": 1.2105, + "step": 40760 + }, + { + "epoch": 0.08662124557611786, + "grad_norm": 0.5534896850585938, + "learning_rate": 0.00019772816083029135, + "loss": 1.2135, + "step": 40770 + }, + { + "epoch": 0.08664249189585692, + "grad_norm": 0.40446189045906067, + "learning_rate": 0.00019772671705358349, + "loss": 1.2262, + "step": 40780 + }, + { + "epoch": 0.08666373821559596, + "grad_norm": 0.38663557171821594, + "learning_rate": 0.0001977252728235285, + "loss": 1.2248, + "step": 40790 + }, + { + "epoch": 0.08668498453533502, + "grad_norm": 0.6774907112121582, + "learning_rate": 0.00019772382814013307, + "loss": 1.2098, + "step": 40800 + }, + { + "epoch": 0.08670623085507408, + "grad_norm": 0.3846881091594696, + "learning_rate": 0.0001977223830034039, + "loss": 1.227, + "step": 40810 + }, + { + "epoch": 0.08672747717481312, + "grad_norm": 0.40745192766189575, + "learning_rate": 0.00019772093741334765, + "loss": 1.2343, + "step": 40820 + }, + { + "epoch": 0.08674872349455218, + "grad_norm": 0.34758007526397705, + "learning_rate": 0.0001977194913699711, + "loss": 1.2374, + "step": 40830 + }, + { + "epoch": 0.08676996981429123, + "grad_norm": 0.3589681386947632, + "learning_rate": 0.00019771804487328095, + "loss": 1.2537, + "step": 40840 + }, + { + "epoch": 0.08679121613403028, + "grad_norm": 0.3662518560886383, + "learning_rate": 0.00019771659792328387, + "loss": 1.2372, + "step": 40850 + }, + { + "epoch": 0.08681246245376933, + "grad_norm": 0.40140992403030396, + "learning_rate": 0.00019771515051998658, + "loss": 1.2091, + "step": 40860 + }, + { + "epoch": 0.08683370877350839, + "grad_norm": 0.3631368577480316, + "learning_rate": 0.0001977137026633958, + "loss": 1.2076, + "step": 40870 + }, + { + "epoch": 0.08685495509324744, + "grad_norm": 0.5362302660942078, + "learning_rate": 0.0001977122543535183, + "loss": 1.2176, + "step": 40880 + }, + { + "epoch": 0.08687620141298649, + "grad_norm": 0.4634813368320465, + "learning_rate": 0.0001977108055903607, + "loss": 1.2077, + "step": 40890 + }, + { + "epoch": 0.08689744773272555, + "grad_norm": 0.3447198271751404, + "learning_rate": 0.0001977093563739298, + "loss": 1.2444, + "step": 40900 + }, + { + "epoch": 0.08691869405246459, + "grad_norm": 0.43379807472229004, + "learning_rate": 0.0001977079067042323, + "loss": 1.1894, + "step": 40910 + }, + { + "epoch": 0.08693994037220365, + "grad_norm": 0.40565142035484314, + "learning_rate": 0.00019770645658127492, + "loss": 1.216, + "step": 40920 + }, + { + "epoch": 0.08696118669194271, + "grad_norm": 0.4814138114452362, + "learning_rate": 0.00019770500600506434, + "loss": 1.2243, + "step": 40930 + }, + { + "epoch": 0.08698243301168176, + "grad_norm": 0.4121120274066925, + "learning_rate": 0.00019770355497560738, + "loss": 1.2438, + "step": 40940 + }, + { + "epoch": 0.08700367933142081, + "grad_norm": 0.5405805706977844, + "learning_rate": 0.0001977021034929107, + "loss": 1.2582, + "step": 40950 + }, + { + "epoch": 0.08702492565115987, + "grad_norm": 0.4164850115776062, + "learning_rate": 0.00019770065155698107, + "loss": 1.1914, + "step": 40960 + }, + { + "epoch": 0.08704617197089892, + "grad_norm": 0.3784996569156647, + "learning_rate": 0.00019769919916782524, + "loss": 1.2359, + "step": 40970 + }, + { + "epoch": 0.08706741829063797, + "grad_norm": 0.3626635670661926, + "learning_rate": 0.0001976977463254499, + "loss": 1.22, + "step": 40980 + }, + { + "epoch": 0.08708866461037702, + "grad_norm": 0.37798795104026794, + "learning_rate": 0.00019769629302986182, + "loss": 1.2041, + "step": 40990 + }, + { + "epoch": 0.08710991093011608, + "grad_norm": 0.3237488865852356, + "learning_rate": 0.0001976948392810677, + "loss": 1.2521, + "step": 41000 + }, + { + "epoch": 0.08713115724985512, + "grad_norm": 0.4474277198314667, + "learning_rate": 0.00019769338507907435, + "loss": 1.2173, + "step": 41010 + }, + { + "epoch": 0.08715240356959418, + "grad_norm": 0.3519093990325928, + "learning_rate": 0.0001976919304238885, + "loss": 1.2142, + "step": 41020 + }, + { + "epoch": 0.08717364988933324, + "grad_norm": 0.8189487457275391, + "learning_rate": 0.00019769047531551683, + "loss": 1.2231, + "step": 41030 + }, + { + "epoch": 0.08719489620907228, + "grad_norm": 0.5536440014839172, + "learning_rate": 0.00019768901975396617, + "loss": 1.2251, + "step": 41040 + }, + { + "epoch": 0.08721614252881134, + "grad_norm": 0.42609459161758423, + "learning_rate": 0.00019768756373924324, + "loss": 1.18, + "step": 41050 + }, + { + "epoch": 0.0872373888485504, + "grad_norm": 0.411554753780365, + "learning_rate": 0.0001976861072713548, + "loss": 1.2069, + "step": 41060 + }, + { + "epoch": 0.08725863516828944, + "grad_norm": 0.6186410188674927, + "learning_rate": 0.00019768465035030762, + "loss": 1.2036, + "step": 41070 + }, + { + "epoch": 0.0872798814880285, + "grad_norm": 0.3623008131980896, + "learning_rate": 0.00019768319297610844, + "loss": 1.203, + "step": 41080 + }, + { + "epoch": 0.08730112780776755, + "grad_norm": 0.3771592080593109, + "learning_rate": 0.00019768173514876403, + "loss": 1.2067, + "step": 41090 + }, + { + "epoch": 0.0873223741275066, + "grad_norm": 0.5978964567184448, + "learning_rate": 0.00019768027686828111, + "loss": 1.2501, + "step": 41100 + }, + { + "epoch": 0.08734362044724565, + "grad_norm": 0.45026424527168274, + "learning_rate": 0.00019767881813466653, + "loss": 1.2017, + "step": 41110 + }, + { + "epoch": 0.08736486676698471, + "grad_norm": 0.46603691577911377, + "learning_rate": 0.000197677358947927, + "loss": 1.2038, + "step": 41120 + }, + { + "epoch": 0.08738611308672375, + "grad_norm": 0.4732888638973236, + "learning_rate": 0.00019767589930806928, + "loss": 1.2385, + "step": 41130 + }, + { + "epoch": 0.08740735940646281, + "grad_norm": 0.3791617751121521, + "learning_rate": 0.00019767443921510017, + "loss": 1.1948, + "step": 41140 + }, + { + "epoch": 0.08742860572620187, + "grad_norm": 0.333264023065567, + "learning_rate": 0.00019767297866902638, + "loss": 1.2183, + "step": 41150 + }, + { + "epoch": 0.08744985204594091, + "grad_norm": 0.5805333852767944, + "learning_rate": 0.00019767151766985478, + "loss": 1.2391, + "step": 41160 + }, + { + "epoch": 0.08747109836567997, + "grad_norm": 0.3822091519832611, + "learning_rate": 0.0001976700562175921, + "loss": 1.189, + "step": 41170 + }, + { + "epoch": 0.08749234468541903, + "grad_norm": 0.4058079719543457, + "learning_rate": 0.00019766859431224516, + "loss": 1.1625, + "step": 41180 + }, + { + "epoch": 0.08751359100515807, + "grad_norm": 0.5150647759437561, + "learning_rate": 0.00019766713195382068, + "loss": 1.2294, + "step": 41190 + }, + { + "epoch": 0.08753483732489713, + "grad_norm": 0.3540745675563812, + "learning_rate": 0.00019766566914232548, + "loss": 1.2041, + "step": 41200 + }, + { + "epoch": 0.08755608364463618, + "grad_norm": 0.6614488959312439, + "learning_rate": 0.00019766420587776632, + "loss": 1.1885, + "step": 41210 + }, + { + "epoch": 0.08757732996437523, + "grad_norm": 0.39569905400276184, + "learning_rate": 0.00019766274216015002, + "loss": 1.1911, + "step": 41220 + }, + { + "epoch": 0.08759857628411429, + "grad_norm": 0.402605801820755, + "learning_rate": 0.0001976612779894833, + "loss": 1.2191, + "step": 41230 + }, + { + "epoch": 0.08761982260385334, + "grad_norm": 0.3232239782810211, + "learning_rate": 0.00019765981336577308, + "loss": 1.2472, + "step": 41240 + }, + { + "epoch": 0.08764106892359239, + "grad_norm": 0.3394501805305481, + "learning_rate": 0.00019765834828902606, + "loss": 1.2097, + "step": 41250 + }, + { + "epoch": 0.08766231524333144, + "grad_norm": 0.327077716588974, + "learning_rate": 0.00019765688275924902, + "loss": 1.263, + "step": 41260 + }, + { + "epoch": 0.0876835615630705, + "grad_norm": 0.34998828172683716, + "learning_rate": 0.00019765541677644883, + "loss": 1.1859, + "step": 41270 + }, + { + "epoch": 0.08770480788280954, + "grad_norm": 0.34072214365005493, + "learning_rate": 0.00019765395034063222, + "loss": 1.2227, + "step": 41280 + }, + { + "epoch": 0.0877260542025486, + "grad_norm": 0.5104411244392395, + "learning_rate": 0.00019765248345180608, + "loss": 1.2495, + "step": 41290 + }, + { + "epoch": 0.08774730052228766, + "grad_norm": 0.4487139880657196, + "learning_rate": 0.00019765101610997714, + "loss": 1.2142, + "step": 41300 + }, + { + "epoch": 0.0877685468420267, + "grad_norm": 0.33296239376068115, + "learning_rate": 0.00019764954831515223, + "loss": 1.2113, + "step": 41310 + }, + { + "epoch": 0.08778979316176576, + "grad_norm": 0.42561113834381104, + "learning_rate": 0.00019764808006733815, + "loss": 1.2399, + "step": 41320 + }, + { + "epoch": 0.08781103948150482, + "grad_norm": 0.3663341999053955, + "learning_rate": 0.00019764661136654175, + "loss": 1.2103, + "step": 41330 + }, + { + "epoch": 0.08783228580124386, + "grad_norm": 0.3630523681640625, + "learning_rate": 0.00019764514221276978, + "loss": 1.2343, + "step": 41340 + }, + { + "epoch": 0.08785353212098292, + "grad_norm": 0.42155277729034424, + "learning_rate": 0.00019764367260602912, + "loss": 1.228, + "step": 41350 + }, + { + "epoch": 0.08787477844072197, + "grad_norm": 0.4930296838283539, + "learning_rate": 0.00019764220254632652, + "loss": 1.2235, + "step": 41360 + }, + { + "epoch": 0.08789602476046103, + "grad_norm": 0.33149823546409607, + "learning_rate": 0.00019764073203366886, + "loss": 1.2329, + "step": 41370 + }, + { + "epoch": 0.08791727108020007, + "grad_norm": 0.3491116762161255, + "learning_rate": 0.00019763926106806294, + "loss": 1.2501, + "step": 41380 + }, + { + "epoch": 0.08793851739993913, + "grad_norm": 0.4906582236289978, + "learning_rate": 0.00019763778964951558, + "loss": 1.22, + "step": 41390 + }, + { + "epoch": 0.08795976371967819, + "grad_norm": 0.37990885972976685, + "learning_rate": 0.0001976363177780336, + "loss": 1.2443, + "step": 41400 + }, + { + "epoch": 0.08798101003941723, + "grad_norm": 0.3642898201942444, + "learning_rate": 0.00019763484545362383, + "loss": 1.2148, + "step": 41410 + }, + { + "epoch": 0.08800225635915629, + "grad_norm": 0.3417400121688843, + "learning_rate": 0.00019763337267629313, + "loss": 1.2383, + "step": 41420 + }, + { + "epoch": 0.08802350267889535, + "grad_norm": 0.3834196925163269, + "learning_rate": 0.00019763189944604825, + "loss": 1.2081, + "step": 41430 + }, + { + "epoch": 0.08804474899863439, + "grad_norm": 0.46176326274871826, + "learning_rate": 0.00019763042576289612, + "loss": 1.2298, + "step": 41440 + }, + { + "epoch": 0.08806599531837345, + "grad_norm": 0.3938440978527069, + "learning_rate": 0.00019762895162684353, + "loss": 1.1965, + "step": 41450 + }, + { + "epoch": 0.0880872416381125, + "grad_norm": 0.3648845851421356, + "learning_rate": 0.00019762747703789735, + "loss": 1.2287, + "step": 41460 + }, + { + "epoch": 0.08810848795785155, + "grad_norm": 0.3635503053665161, + "learning_rate": 0.00019762600199606437, + "loss": 1.2294, + "step": 41470 + }, + { + "epoch": 0.0881297342775906, + "grad_norm": 0.4050208628177643, + "learning_rate": 0.00019762452650135147, + "loss": 1.2433, + "step": 41480 + }, + { + "epoch": 0.08815098059732966, + "grad_norm": 0.5197353363037109, + "learning_rate": 0.00019762305055376545, + "loss": 1.2067, + "step": 41490 + }, + { + "epoch": 0.0881722269170687, + "grad_norm": 0.3940749168395996, + "learning_rate": 0.00019762157415331326, + "loss": 1.1975, + "step": 41500 + }, + { + "epoch": 0.08819347323680776, + "grad_norm": 0.5160437226295471, + "learning_rate": 0.00019762009730000163, + "loss": 1.2003, + "step": 41510 + }, + { + "epoch": 0.08821471955654682, + "grad_norm": 0.4561008810997009, + "learning_rate": 0.00019761861999383746, + "loss": 1.2052, + "step": 41520 + }, + { + "epoch": 0.08823596587628586, + "grad_norm": 0.3507247865200043, + "learning_rate": 0.00019761714223482762, + "loss": 1.1853, + "step": 41530 + }, + { + "epoch": 0.08825721219602492, + "grad_norm": 0.32299819588661194, + "learning_rate": 0.00019761566402297896, + "loss": 1.2036, + "step": 41540 + }, + { + "epoch": 0.08827845851576398, + "grad_norm": 0.38574665784835815, + "learning_rate": 0.00019761418535829833, + "loss": 1.2507, + "step": 41550 + }, + { + "epoch": 0.08829970483550302, + "grad_norm": 0.5245962142944336, + "learning_rate": 0.00019761270624079255, + "loss": 1.2267, + "step": 41560 + }, + { + "epoch": 0.08832095115524208, + "grad_norm": 0.34279853105545044, + "learning_rate": 0.00019761122667046852, + "loss": 1.2376, + "step": 41570 + }, + { + "epoch": 0.08834219747498114, + "grad_norm": 1.116472840309143, + "learning_rate": 0.0001976097466473331, + "loss": 1.191, + "step": 41580 + }, + { + "epoch": 0.08836344379472018, + "grad_norm": 0.6707748770713806, + "learning_rate": 0.0001976082661713932, + "loss": 1.2162, + "step": 41590 + }, + { + "epoch": 0.08838469011445924, + "grad_norm": 0.6639862656593323, + "learning_rate": 0.0001976067852426556, + "loss": 1.19, + "step": 41600 + }, + { + "epoch": 0.08840593643419829, + "grad_norm": 0.4144224524497986, + "learning_rate": 0.00019760530386112723, + "loss": 1.2259, + "step": 41610 + }, + { + "epoch": 0.08842718275393734, + "grad_norm": 0.3800775706768036, + "learning_rate": 0.00019760382202681494, + "loss": 1.2192, + "step": 41620 + }, + { + "epoch": 0.0884484290736764, + "grad_norm": 0.34822484850883484, + "learning_rate": 0.00019760233973972562, + "loss": 1.1845, + "step": 41630 + }, + { + "epoch": 0.08846967539341545, + "grad_norm": 0.3808070123195648, + "learning_rate": 0.00019760085699986612, + "loss": 1.2193, + "step": 41640 + }, + { + "epoch": 0.0884909217131545, + "grad_norm": 0.3926427662372589, + "learning_rate": 0.00019759937380724334, + "loss": 1.2271, + "step": 41650 + }, + { + "epoch": 0.08851216803289355, + "grad_norm": 0.5403350591659546, + "learning_rate": 0.00019759789016186415, + "loss": 1.2359, + "step": 41660 + }, + { + "epoch": 0.08853341435263261, + "grad_norm": 0.5742577910423279, + "learning_rate": 0.00019759640606373545, + "loss": 1.1993, + "step": 41670 + }, + { + "epoch": 0.08855466067237165, + "grad_norm": 0.4633432924747467, + "learning_rate": 0.0001975949215128641, + "loss": 1.2107, + "step": 41680 + }, + { + "epoch": 0.08857590699211071, + "grad_norm": 0.4826051592826843, + "learning_rate": 0.00019759343650925705, + "loss": 1.1992, + "step": 41690 + }, + { + "epoch": 0.08859715331184977, + "grad_norm": 0.3549315631389618, + "learning_rate": 0.0001975919510529211, + "loss": 1.1968, + "step": 41700 + }, + { + "epoch": 0.08861839963158881, + "grad_norm": 0.38271018862724304, + "learning_rate": 0.00019759046514386316, + "loss": 1.1833, + "step": 41710 + }, + { + "epoch": 0.08863964595132787, + "grad_norm": 0.37859803438186646, + "learning_rate": 0.00019758897878209016, + "loss": 1.2366, + "step": 41720 + }, + { + "epoch": 0.08866089227106692, + "grad_norm": 0.322752982378006, + "learning_rate": 0.00019758749196760898, + "loss": 1.2204, + "step": 41730 + }, + { + "epoch": 0.08868213859080597, + "grad_norm": 0.3986281454563141, + "learning_rate": 0.00019758600470042652, + "loss": 1.2101, + "step": 41740 + }, + { + "epoch": 0.08870338491054502, + "grad_norm": 0.41446444392204285, + "learning_rate": 0.00019758451698054965, + "loss": 1.191, + "step": 41750 + }, + { + "epoch": 0.08872463123028408, + "grad_norm": 0.519321084022522, + "learning_rate": 0.0001975830288079853, + "loss": 1.2192, + "step": 41760 + }, + { + "epoch": 0.08874587755002313, + "grad_norm": 0.427794486284256, + "learning_rate": 0.0001975815401827404, + "loss": 1.2425, + "step": 41770 + }, + { + "epoch": 0.08876712386976218, + "grad_norm": 0.5448949933052063, + "learning_rate": 0.0001975800511048218, + "loss": 1.2315, + "step": 41780 + }, + { + "epoch": 0.08878837018950124, + "grad_norm": 0.7571097016334534, + "learning_rate": 0.00019757856157423645, + "loss": 1.191, + "step": 41790 + }, + { + "epoch": 0.0888096165092403, + "grad_norm": 0.5920254588127136, + "learning_rate": 0.00019757707159099121, + "loss": 1.2521, + "step": 41800 + }, + { + "epoch": 0.08883086282897934, + "grad_norm": 0.5204418897628784, + "learning_rate": 0.00019757558115509303, + "loss": 1.2027, + "step": 41810 + }, + { + "epoch": 0.0888521091487184, + "grad_norm": 0.4085301160812378, + "learning_rate": 0.00019757409026654882, + "loss": 1.2312, + "step": 41820 + }, + { + "epoch": 0.08887335546845745, + "grad_norm": 0.3659689426422119, + "learning_rate": 0.00019757259892536552, + "loss": 1.2351, + "step": 41830 + }, + { + "epoch": 0.0888946017881965, + "grad_norm": 0.32679063081741333, + "learning_rate": 0.00019757110713154998, + "loss": 1.2086, + "step": 41840 + }, + { + "epoch": 0.08891584810793556, + "grad_norm": 0.357492059469223, + "learning_rate": 0.00019756961488510922, + "loss": 1.2244, + "step": 41850 + }, + { + "epoch": 0.08893709442767461, + "grad_norm": 0.5755389332771301, + "learning_rate": 0.00019756812218605006, + "loss": 1.2019, + "step": 41860 + }, + { + "epoch": 0.08895834074741366, + "grad_norm": 0.42476925253868103, + "learning_rate": 0.00019756662903437947, + "loss": 1.2478, + "step": 41870 + }, + { + "epoch": 0.08897958706715271, + "grad_norm": 0.3668525815010071, + "learning_rate": 0.00019756513543010438, + "loss": 1.2356, + "step": 41880 + }, + { + "epoch": 0.08900083338689177, + "grad_norm": 0.3582085371017456, + "learning_rate": 0.00019756364137323172, + "loss": 1.2285, + "step": 41890 + }, + { + "epoch": 0.08902207970663081, + "grad_norm": 0.3825005888938904, + "learning_rate": 0.0001975621468637684, + "loss": 1.2214, + "step": 41900 + }, + { + "epoch": 0.08904332602636987, + "grad_norm": 0.39973393082618713, + "learning_rate": 0.0001975606519017214, + "loss": 1.2464, + "step": 41910 + }, + { + "epoch": 0.08906457234610893, + "grad_norm": 0.3602531850337982, + "learning_rate": 0.0001975591564870976, + "loss": 1.2187, + "step": 41920 + }, + { + "epoch": 0.08908581866584797, + "grad_norm": 0.36061838269233704, + "learning_rate": 0.00019755766061990397, + "loss": 1.2625, + "step": 41930 + }, + { + "epoch": 0.08910706498558703, + "grad_norm": 0.49744856357574463, + "learning_rate": 0.00019755616430014742, + "loss": 1.2175, + "step": 41940 + }, + { + "epoch": 0.08912831130532609, + "grad_norm": 0.44933563470840454, + "learning_rate": 0.00019755466752783494, + "loss": 1.2333, + "step": 41950 + }, + { + "epoch": 0.08914955762506513, + "grad_norm": 0.47695693373680115, + "learning_rate": 0.00019755317030297344, + "loss": 1.2138, + "step": 41960 + }, + { + "epoch": 0.08917080394480419, + "grad_norm": 0.49683964252471924, + "learning_rate": 0.00019755167262556987, + "loss": 1.2245, + "step": 41970 + }, + { + "epoch": 0.08919205026454324, + "grad_norm": 0.559623658657074, + "learning_rate": 0.00019755017449563116, + "loss": 1.2396, + "step": 41980 + }, + { + "epoch": 0.08921329658428229, + "grad_norm": 0.3419676125049591, + "learning_rate": 0.00019754867591316428, + "loss": 1.2237, + "step": 41990 + }, + { + "epoch": 0.08923454290402134, + "grad_norm": 0.40883001685142517, + "learning_rate": 0.0001975471768781762, + "loss": 1.2381, + "step": 42000 + }, + { + "epoch": 0.0892557892237604, + "grad_norm": 0.37024444341659546, + "learning_rate": 0.00019754567739067383, + "loss": 1.2247, + "step": 42010 + }, + { + "epoch": 0.08927703554349944, + "grad_norm": 0.4077759087085724, + "learning_rate": 0.00019754417745066415, + "loss": 1.2158, + "step": 42020 + }, + { + "epoch": 0.0892982818632385, + "grad_norm": 0.3685835897922516, + "learning_rate": 0.00019754267705815416, + "loss": 1.2132, + "step": 42030 + }, + { + "epoch": 0.08931952818297756, + "grad_norm": 0.321776419878006, + "learning_rate": 0.0001975411762131507, + "loss": 1.1927, + "step": 42040 + }, + { + "epoch": 0.0893407745027166, + "grad_norm": 0.39683619141578674, + "learning_rate": 0.00019753967491566087, + "loss": 1.2105, + "step": 42050 + }, + { + "epoch": 0.08936202082245566, + "grad_norm": 0.5144784450531006, + "learning_rate": 0.00019753817316569156, + "loss": 1.2513, + "step": 42060 + }, + { + "epoch": 0.08938326714219472, + "grad_norm": 0.3692624270915985, + "learning_rate": 0.00019753667096324975, + "loss": 1.223, + "step": 42070 + }, + { + "epoch": 0.08940451346193376, + "grad_norm": 0.3333643674850464, + "learning_rate": 0.0001975351683083424, + "loss": 1.2386, + "step": 42080 + }, + { + "epoch": 0.08942575978167282, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0001975336652009765, + "loss": 1.1881, + "step": 42090 + }, + { + "epoch": 0.08944700610141187, + "grad_norm": 0.5093224048614502, + "learning_rate": 0.000197532161641159, + "loss": 1.2501, + "step": 42100 + }, + { + "epoch": 0.08946825242115092, + "grad_norm": 0.5382843613624573, + "learning_rate": 0.00019753065762889688, + "loss": 1.2174, + "step": 42110 + }, + { + "epoch": 0.08948949874088998, + "grad_norm": 0.5226273536682129, + "learning_rate": 0.00019752915316419717, + "loss": 1.2179, + "step": 42120 + }, + { + "epoch": 0.08951074506062903, + "grad_norm": 0.3830009400844574, + "learning_rate": 0.00019752764824706675, + "loss": 1.2122, + "step": 42130 + }, + { + "epoch": 0.08953199138036808, + "grad_norm": 0.32709604501724243, + "learning_rate": 0.0001975261428775127, + "loss": 1.2057, + "step": 42140 + }, + { + "epoch": 0.08955323770010713, + "grad_norm": 0.36734873056411743, + "learning_rate": 0.00019752463705554195, + "loss": 1.2089, + "step": 42150 + }, + { + "epoch": 0.08957448401984619, + "grad_norm": 0.39770859479904175, + "learning_rate": 0.00019752313078116147, + "loss": 1.2144, + "step": 42160 + }, + { + "epoch": 0.08959573033958523, + "grad_norm": 0.41366198658943176, + "learning_rate": 0.00019752162405437828, + "loss": 1.2386, + "step": 42170 + }, + { + "epoch": 0.08961697665932429, + "grad_norm": 0.44767147302627563, + "learning_rate": 0.00019752011687519934, + "loss": 1.2206, + "step": 42180 + }, + { + "epoch": 0.08963822297906335, + "grad_norm": 0.48842188715934753, + "learning_rate": 0.00019751860924363171, + "loss": 1.224, + "step": 42190 + }, + { + "epoch": 0.08965946929880239, + "grad_norm": 0.6034210324287415, + "learning_rate": 0.00019751710115968233, + "loss": 1.2176, + "step": 42200 + }, + { + "epoch": 0.08968071561854145, + "grad_norm": 0.6628895401954651, + "learning_rate": 0.00019751559262335815, + "loss": 1.2026, + "step": 42210 + }, + { + "epoch": 0.0897019619382805, + "grad_norm": 0.42129090428352356, + "learning_rate": 0.00019751408363466628, + "loss": 1.19, + "step": 42220 + }, + { + "epoch": 0.08972320825801956, + "grad_norm": 0.4821723401546478, + "learning_rate": 0.0001975125741936136, + "loss": 1.2, + "step": 42230 + }, + { + "epoch": 0.0897444545777586, + "grad_norm": 0.3998466730117798, + "learning_rate": 0.00019751106430020724, + "loss": 1.244, + "step": 42240 + }, + { + "epoch": 0.08976570089749766, + "grad_norm": 0.6706409454345703, + "learning_rate": 0.0001975095539544541, + "loss": 1.2364, + "step": 42250 + }, + { + "epoch": 0.08978694721723672, + "grad_norm": 0.3874000608921051, + "learning_rate": 0.0001975080431563612, + "loss": 1.2525, + "step": 42260 + }, + { + "epoch": 0.08980819353697576, + "grad_norm": 0.4420970678329468, + "learning_rate": 0.00019750653190593563, + "loss": 1.2283, + "step": 42270 + }, + { + "epoch": 0.08982943985671482, + "grad_norm": 0.3909628093242645, + "learning_rate": 0.0001975050202031843, + "loss": 1.1881, + "step": 42280 + }, + { + "epoch": 0.08985068617645388, + "grad_norm": 0.4854196608066559, + "learning_rate": 0.0001975035080481143, + "loss": 1.205, + "step": 42290 + }, + { + "epoch": 0.08987193249619292, + "grad_norm": 0.34592923521995544, + "learning_rate": 0.00019750199544073255, + "loss": 1.1774, + "step": 42300 + }, + { + "epoch": 0.08989317881593198, + "grad_norm": 0.5923677682876587, + "learning_rate": 0.00019750048238104616, + "loss": 1.2667, + "step": 42310 + }, + { + "epoch": 0.08991442513567104, + "grad_norm": 0.3558439016342163, + "learning_rate": 0.00019749896886906213, + "loss": 1.2562, + "step": 42320 + }, + { + "epoch": 0.08993567145541008, + "grad_norm": 0.37853187322616577, + "learning_rate": 0.00019749745490478743, + "loss": 1.2065, + "step": 42330 + }, + { + "epoch": 0.08995691777514914, + "grad_norm": 0.42073601484298706, + "learning_rate": 0.00019749594048822916, + "loss": 1.2053, + "step": 42340 + }, + { + "epoch": 0.0899781640948882, + "grad_norm": 0.43227019906044006, + "learning_rate": 0.0001974944256193943, + "loss": 1.2268, + "step": 42350 + }, + { + "epoch": 0.08999941041462724, + "grad_norm": 0.36713042855262756, + "learning_rate": 0.00019749291029828986, + "loss": 1.2432, + "step": 42360 + }, + { + "epoch": 0.0900206567343663, + "grad_norm": 0.37690362334251404, + "learning_rate": 0.0001974913945249229, + "loss": 1.2082, + "step": 42370 + }, + { + "epoch": 0.09004190305410535, + "grad_norm": 0.3713800013065338, + "learning_rate": 0.00019748987829930044, + "loss": 1.2432, + "step": 42380 + }, + { + "epoch": 0.0900631493738444, + "grad_norm": 0.39573124051094055, + "learning_rate": 0.00019748836162142953, + "loss": 1.2201, + "step": 42390 + }, + { + "epoch": 0.09008439569358345, + "grad_norm": 0.38496828079223633, + "learning_rate": 0.00019748684449131715, + "loss": 1.2105, + "step": 42400 + }, + { + "epoch": 0.09010564201332251, + "grad_norm": 0.3491532504558563, + "learning_rate": 0.00019748532690897045, + "loss": 1.2346, + "step": 42410 + }, + { + "epoch": 0.09012688833306155, + "grad_norm": 0.34952312707901, + "learning_rate": 0.00019748380887439634, + "loss": 1.2585, + "step": 42420 + }, + { + "epoch": 0.09014813465280061, + "grad_norm": 0.42358261346817017, + "learning_rate": 0.00019748229038760195, + "loss": 1.2065, + "step": 42430 + }, + { + "epoch": 0.09016938097253967, + "grad_norm": 0.4438229501247406, + "learning_rate": 0.0001974807714485943, + "loss": 1.2167, + "step": 42440 + }, + { + "epoch": 0.09019062729227871, + "grad_norm": 0.5288995504379272, + "learning_rate": 0.00019747925205738044, + "loss": 1.1982, + "step": 42450 + }, + { + "epoch": 0.09021187361201777, + "grad_norm": 0.3839528560638428, + "learning_rate": 0.0001974777322139674, + "loss": 1.2269, + "step": 42460 + }, + { + "epoch": 0.09023311993175682, + "grad_norm": 0.48734644055366516, + "learning_rate": 0.00019747621191836223, + "loss": 1.2299, + "step": 42470 + }, + { + "epoch": 0.09025436625149587, + "grad_norm": 0.37145209312438965, + "learning_rate": 0.000197474691170572, + "loss": 1.1902, + "step": 42480 + }, + { + "epoch": 0.09027561257123493, + "grad_norm": 0.7084202170372009, + "learning_rate": 0.0001974731699706038, + "loss": 1.2175, + "step": 42490 + }, + { + "epoch": 0.09029685889097398, + "grad_norm": 0.4003945589065552, + "learning_rate": 0.0001974716483184646, + "loss": 1.2248, + "step": 42500 + }, + { + "epoch": 0.09031810521071303, + "grad_norm": 0.3554442822933197, + "learning_rate": 0.0001974701262141615, + "loss": 1.2533, + "step": 42510 + }, + { + "epoch": 0.09033935153045208, + "grad_norm": 0.4293617606163025, + "learning_rate": 0.00019746860365770158, + "loss": 1.1987, + "step": 42520 + }, + { + "epoch": 0.09036059785019114, + "grad_norm": 0.3944071829319, + "learning_rate": 0.00019746708064909187, + "loss": 1.2015, + "step": 42530 + }, + { + "epoch": 0.09038184416993018, + "grad_norm": 0.462321937084198, + "learning_rate": 0.0001974655571883395, + "loss": 1.1779, + "step": 42540 + }, + { + "epoch": 0.09040309048966924, + "grad_norm": 0.4108859598636627, + "learning_rate": 0.00019746403327545144, + "loss": 1.2188, + "step": 42550 + }, + { + "epoch": 0.0904243368094083, + "grad_norm": 0.3960527181625366, + "learning_rate": 0.00019746250891043484, + "loss": 1.2407, + "step": 42560 + }, + { + "epoch": 0.09044558312914734, + "grad_norm": 0.4843006432056427, + "learning_rate": 0.00019746098409329673, + "loss": 1.2269, + "step": 42570 + }, + { + "epoch": 0.0904668294488864, + "grad_norm": 0.3968120515346527, + "learning_rate": 0.0001974594588240442, + "loss": 1.2066, + "step": 42580 + }, + { + "epoch": 0.09048807576862546, + "grad_norm": 0.43124571442604065, + "learning_rate": 0.00019745793310268432, + "loss": 1.2353, + "step": 42590 + }, + { + "epoch": 0.0905093220883645, + "grad_norm": 0.36307477951049805, + "learning_rate": 0.0001974564069292242, + "loss": 1.2135, + "step": 42600 + }, + { + "epoch": 0.09053056840810356, + "grad_norm": 0.4738486707210541, + "learning_rate": 0.00019745488030367084, + "loss": 1.2553, + "step": 42610 + }, + { + "epoch": 0.09055181472784261, + "grad_norm": 0.3486763536930084, + "learning_rate": 0.00019745335322603137, + "loss": 1.2102, + "step": 42620 + }, + { + "epoch": 0.09057306104758167, + "grad_norm": 0.5071797966957092, + "learning_rate": 0.00019745182569631288, + "loss": 1.241, + "step": 42630 + }, + { + "epoch": 0.09059430736732071, + "grad_norm": 0.355720579624176, + "learning_rate": 0.00019745029771452247, + "loss": 1.2326, + "step": 42640 + }, + { + "epoch": 0.09061555368705977, + "grad_norm": 0.6583822965621948, + "learning_rate": 0.00019744876928066721, + "loss": 1.2235, + "step": 42650 + }, + { + "epoch": 0.09063680000679883, + "grad_norm": 0.44274649024009705, + "learning_rate": 0.00019744724039475416, + "loss": 1.2221, + "step": 42660 + }, + { + "epoch": 0.09065804632653787, + "grad_norm": 0.36151257157325745, + "learning_rate": 0.0001974457110567905, + "loss": 1.2333, + "step": 42670 + }, + { + "epoch": 0.09067929264627693, + "grad_norm": 0.3526574671268463, + "learning_rate": 0.0001974441812667832, + "loss": 1.2128, + "step": 42680 + }, + { + "epoch": 0.09070053896601599, + "grad_norm": 0.3872529864311218, + "learning_rate": 0.00019744265102473945, + "loss": 1.2501, + "step": 42690 + }, + { + "epoch": 0.09072178528575503, + "grad_norm": 0.4366814196109772, + "learning_rate": 0.0001974411203306663, + "loss": 1.2391, + "step": 42700 + }, + { + "epoch": 0.09074303160549409, + "grad_norm": 0.4271455705165863, + "learning_rate": 0.00019743958918457091, + "loss": 1.2417, + "step": 42710 + }, + { + "epoch": 0.09076427792523314, + "grad_norm": 0.44427672028541565, + "learning_rate": 0.00019743805758646032, + "loss": 1.2335, + "step": 42720 + }, + { + "epoch": 0.09078552424497219, + "grad_norm": 0.4800972640514374, + "learning_rate": 0.00019743652553634166, + "loss": 1.174, + "step": 42730 + }, + { + "epoch": 0.09080677056471124, + "grad_norm": 0.4048900604248047, + "learning_rate": 0.00019743499303422204, + "loss": 1.2116, + "step": 42740 + }, + { + "epoch": 0.0908280168844503, + "grad_norm": 0.4167279005050659, + "learning_rate": 0.00019743346008010856, + "loss": 1.2372, + "step": 42750 + }, + { + "epoch": 0.09084926320418935, + "grad_norm": 0.36483412981033325, + "learning_rate": 0.00019743192667400836, + "loss": 1.2261, + "step": 42760 + }, + { + "epoch": 0.0908705095239284, + "grad_norm": 0.3558283746242523, + "learning_rate": 0.0001974303928159285, + "loss": 1.2499, + "step": 42770 + }, + { + "epoch": 0.09089175584366746, + "grad_norm": 0.35260170698165894, + "learning_rate": 0.00019742885850587615, + "loss": 1.2316, + "step": 42780 + }, + { + "epoch": 0.0909130021634065, + "grad_norm": 0.6456207036972046, + "learning_rate": 0.0001974273237438584, + "loss": 1.2105, + "step": 42790 + }, + { + "epoch": 0.09093424848314556, + "grad_norm": 0.40812283754348755, + "learning_rate": 0.00019742578852988235, + "loss": 1.1723, + "step": 42800 + }, + { + "epoch": 0.09095549480288462, + "grad_norm": 0.3363371789455414, + "learning_rate": 0.00019742425286395517, + "loss": 1.2202, + "step": 42810 + }, + { + "epoch": 0.09097674112262366, + "grad_norm": 0.3482915461063385, + "learning_rate": 0.00019742271674608395, + "loss": 1.2026, + "step": 42820 + }, + { + "epoch": 0.09099798744236272, + "grad_norm": 0.5557394027709961, + "learning_rate": 0.00019742118017627581, + "loss": 1.1962, + "step": 42830 + }, + { + "epoch": 0.09101923376210178, + "grad_norm": 0.3251957595348358, + "learning_rate": 0.0001974196431545379, + "loss": 1.2486, + "step": 42840 + }, + { + "epoch": 0.09104048008184082, + "grad_norm": 0.3494524359703064, + "learning_rate": 0.00019741810568087737, + "loss": 1.2311, + "step": 42850 + }, + { + "epoch": 0.09106172640157988, + "grad_norm": 0.4203610122203827, + "learning_rate": 0.0001974165677553013, + "loss": 1.2145, + "step": 42860 + }, + { + "epoch": 0.09108297272131893, + "grad_norm": 0.5606570243835449, + "learning_rate": 0.00019741502937781685, + "loss": 1.2097, + "step": 42870 + }, + { + "epoch": 0.09110421904105798, + "grad_norm": 0.3924444615840912, + "learning_rate": 0.00019741349054843116, + "loss": 1.1948, + "step": 42880 + }, + { + "epoch": 0.09112546536079703, + "grad_norm": 0.3508324921131134, + "learning_rate": 0.00019741195126715138, + "loss": 1.2332, + "step": 42890 + }, + { + "epoch": 0.09114671168053609, + "grad_norm": 0.4044727385044098, + "learning_rate": 0.0001974104115339846, + "loss": 1.2177, + "step": 42900 + }, + { + "epoch": 0.09116795800027513, + "grad_norm": 0.35659265518188477, + "learning_rate": 0.000197408871348938, + "loss": 1.2146, + "step": 42910 + }, + { + "epoch": 0.09118920432001419, + "grad_norm": 0.3790283501148224, + "learning_rate": 0.00019740733071201875, + "loss": 1.246, + "step": 42920 + }, + { + "epoch": 0.09121045063975325, + "grad_norm": 0.4414637088775635, + "learning_rate": 0.00019740578962323398, + "loss": 1.1896, + "step": 42930 + }, + { + "epoch": 0.09123169695949229, + "grad_norm": 0.3788025677204132, + "learning_rate": 0.0001974042480825908, + "loss": 1.2161, + "step": 42940 + }, + { + "epoch": 0.09125294327923135, + "grad_norm": 0.33502018451690674, + "learning_rate": 0.0001974027060900964, + "loss": 1.2274, + "step": 42950 + }, + { + "epoch": 0.0912741895989704, + "grad_norm": 0.389974981546402, + "learning_rate": 0.0001974011636457579, + "loss": 1.212, + "step": 42960 + }, + { + "epoch": 0.09129543591870945, + "grad_norm": 0.3288074731826782, + "learning_rate": 0.0001973996207495825, + "loss": 1.2134, + "step": 42970 + }, + { + "epoch": 0.09131668223844851, + "grad_norm": 0.390631765127182, + "learning_rate": 0.00019739807740157733, + "loss": 1.1778, + "step": 42980 + }, + { + "epoch": 0.09133792855818756, + "grad_norm": 0.4040866196155548, + "learning_rate": 0.00019739653360174956, + "loss": 1.1969, + "step": 42990 + }, + { + "epoch": 0.09135917487792661, + "grad_norm": 0.39439642429351807, + "learning_rate": 0.00019739498935010633, + "loss": 1.2224, + "step": 43000 + }, + { + "epoch": 0.09138042119766566, + "grad_norm": 0.44736355543136597, + "learning_rate": 0.0001973934446466548, + "loss": 1.2419, + "step": 43010 + }, + { + "epoch": 0.09140166751740472, + "grad_norm": 0.5467149019241333, + "learning_rate": 0.00019739189949140223, + "loss": 1.2388, + "step": 43020 + }, + { + "epoch": 0.09142291383714377, + "grad_norm": 0.36071154475212097, + "learning_rate": 0.00019739035388435566, + "loss": 1.2327, + "step": 43030 + }, + { + "epoch": 0.09144416015688282, + "grad_norm": 0.3132125735282898, + "learning_rate": 0.00019738880782552232, + "loss": 1.2144, + "step": 43040 + }, + { + "epoch": 0.09146540647662188, + "grad_norm": 0.3797055780887604, + "learning_rate": 0.0001973872613149094, + "loss": 1.2152, + "step": 43050 + }, + { + "epoch": 0.09148665279636094, + "grad_norm": 0.3564883768558502, + "learning_rate": 0.000197385714352524, + "loss": 1.2152, + "step": 43060 + }, + { + "epoch": 0.09150789911609998, + "grad_norm": 0.797813355922699, + "learning_rate": 0.00019738416693837338, + "loss": 1.1786, + "step": 43070 + }, + { + "epoch": 0.09152914543583904, + "grad_norm": 0.34669962525367737, + "learning_rate": 0.00019738261907246467, + "loss": 1.199, + "step": 43080 + }, + { + "epoch": 0.0915503917555781, + "grad_norm": 0.5977039933204651, + "learning_rate": 0.00019738107075480506, + "loss": 1.2127, + "step": 43090 + }, + { + "epoch": 0.09157163807531714, + "grad_norm": 0.4702274203300476, + "learning_rate": 0.00019737952198540178, + "loss": 1.1923, + "step": 43100 + }, + { + "epoch": 0.0915928843950562, + "grad_norm": 0.413229376077652, + "learning_rate": 0.00019737797276426193, + "loss": 1.2433, + "step": 43110 + }, + { + "epoch": 0.09161413071479525, + "grad_norm": 0.401858925819397, + "learning_rate": 0.00019737642309139276, + "loss": 1.2423, + "step": 43120 + }, + { + "epoch": 0.0916353770345343, + "grad_norm": 0.34050455689430237, + "learning_rate": 0.0001973748729668014, + "loss": 1.2164, + "step": 43130 + }, + { + "epoch": 0.09165662335427335, + "grad_norm": 0.5051305294036865, + "learning_rate": 0.00019737332239049514, + "loss": 1.1944, + "step": 43140 + }, + { + "epoch": 0.09167786967401241, + "grad_norm": 0.3388909697532654, + "learning_rate": 0.00019737177136248106, + "loss": 1.2037, + "step": 43150 + }, + { + "epoch": 0.09169911599375145, + "grad_norm": 0.3433595895767212, + "learning_rate": 0.00019737021988276644, + "loss": 1.2414, + "step": 43160 + }, + { + "epoch": 0.09172036231349051, + "grad_norm": 0.3548092544078827, + "learning_rate": 0.00019736866795135843, + "loss": 1.2283, + "step": 43170 + }, + { + "epoch": 0.09174160863322957, + "grad_norm": 0.3681517243385315, + "learning_rate": 0.00019736711556826424, + "loss": 1.2039, + "step": 43180 + }, + { + "epoch": 0.09176285495296861, + "grad_norm": 0.39808937907218933, + "learning_rate": 0.00019736556273349106, + "loss": 1.2315, + "step": 43190 + }, + { + "epoch": 0.09178410127270767, + "grad_norm": 0.44116681814193726, + "learning_rate": 0.00019736400944704615, + "loss": 1.2094, + "step": 43200 + }, + { + "epoch": 0.09180534759244673, + "grad_norm": 0.4156041741371155, + "learning_rate": 0.00019736245570893662, + "loss": 1.2053, + "step": 43210 + }, + { + "epoch": 0.09182659391218577, + "grad_norm": 0.4706812798976898, + "learning_rate": 0.00019736090151916978, + "loss": 1.206, + "step": 43220 + }, + { + "epoch": 0.09184784023192483, + "grad_norm": 0.4062104821205139, + "learning_rate": 0.00019735934687775276, + "loss": 1.2093, + "step": 43230 + }, + { + "epoch": 0.09186908655166388, + "grad_norm": 0.4790036678314209, + "learning_rate": 0.0001973577917846928, + "loss": 1.2185, + "step": 43240 + }, + { + "epoch": 0.09189033287140293, + "grad_norm": 0.3923487961292267, + "learning_rate": 0.00019735623623999714, + "loss": 1.1868, + "step": 43250 + }, + { + "epoch": 0.09191157919114198, + "grad_norm": 0.46817827224731445, + "learning_rate": 0.00019735468024367295, + "loss": 1.2358, + "step": 43260 + }, + { + "epoch": 0.09193282551088104, + "grad_norm": 0.7690906524658203, + "learning_rate": 0.00019735312379572749, + "loss": 1.2169, + "step": 43270 + }, + { + "epoch": 0.09195407183062008, + "grad_norm": 0.5583779215812683, + "learning_rate": 0.00019735156689616796, + "loss": 1.1947, + "step": 43280 + }, + { + "epoch": 0.09197531815035914, + "grad_norm": 0.4278891980648041, + "learning_rate": 0.0001973500095450016, + "loss": 1.2383, + "step": 43290 + }, + { + "epoch": 0.0919965644700982, + "grad_norm": 0.3529817759990692, + "learning_rate": 0.00019734845174223557, + "loss": 1.2097, + "step": 43300 + }, + { + "epoch": 0.09201781078983724, + "grad_norm": 0.37990981340408325, + "learning_rate": 0.00019734689348787718, + "loss": 1.2113, + "step": 43310 + }, + { + "epoch": 0.0920390571095763, + "grad_norm": 0.3329210877418518, + "learning_rate": 0.0001973453347819336, + "loss": 1.2038, + "step": 43320 + }, + { + "epoch": 0.09206030342931536, + "grad_norm": 0.3737020194530487, + "learning_rate": 0.00019734377562441208, + "loss": 1.2385, + "step": 43330 + }, + { + "epoch": 0.0920815497490544, + "grad_norm": 0.4012746214866638, + "learning_rate": 0.0001973422160153199, + "loss": 1.2281, + "step": 43340 + }, + { + "epoch": 0.09210279606879346, + "grad_norm": 0.374796062707901, + "learning_rate": 0.00019734065595466423, + "loss": 1.2169, + "step": 43350 + }, + { + "epoch": 0.09212404238853251, + "grad_norm": 0.39042964577674866, + "learning_rate": 0.00019733909544245232, + "loss": 1.22, + "step": 43360 + }, + { + "epoch": 0.09214528870827156, + "grad_norm": 0.46960777044296265, + "learning_rate": 0.0001973375344786914, + "loss": 1.2383, + "step": 43370 + }, + { + "epoch": 0.09216653502801062, + "grad_norm": 0.42452380061149597, + "learning_rate": 0.00019733597306338877, + "loss": 1.2261, + "step": 43380 + }, + { + "epoch": 0.09218778134774967, + "grad_norm": 0.5199688076972961, + "learning_rate": 0.0001973344111965516, + "loss": 1.2462, + "step": 43390 + }, + { + "epoch": 0.09220902766748872, + "grad_norm": 0.3638581335544586, + "learning_rate": 0.00019733284887818719, + "loss": 1.2236, + "step": 43400 + }, + { + "epoch": 0.09223027398722777, + "grad_norm": 0.353396475315094, + "learning_rate": 0.00019733128610830277, + "loss": 1.2348, + "step": 43410 + }, + { + "epoch": 0.09225152030696683, + "grad_norm": 0.3385533094406128, + "learning_rate": 0.00019732972288690555, + "loss": 1.2111, + "step": 43420 + }, + { + "epoch": 0.09227276662670587, + "grad_norm": 0.41565486788749695, + "learning_rate": 0.00019732815921400283, + "loss": 1.215, + "step": 43430 + }, + { + "epoch": 0.09229401294644493, + "grad_norm": 0.48272356390953064, + "learning_rate": 0.00019732659508960186, + "loss": 1.2351, + "step": 43440 + }, + { + "epoch": 0.09231525926618399, + "grad_norm": 0.4043862819671631, + "learning_rate": 0.0001973250305137099, + "loss": 1.2184, + "step": 43450 + }, + { + "epoch": 0.09233650558592303, + "grad_norm": 0.7319403886795044, + "learning_rate": 0.00019732346548633416, + "loss": 1.2126, + "step": 43460 + }, + { + "epoch": 0.09235775190566209, + "grad_norm": 0.34265756607055664, + "learning_rate": 0.00019732190000748193, + "loss": 1.21, + "step": 43470 + }, + { + "epoch": 0.09237899822540115, + "grad_norm": 0.418475478887558, + "learning_rate": 0.0001973203340771605, + "loss": 1.2525, + "step": 43480 + }, + { + "epoch": 0.0924002445451402, + "grad_norm": 0.38206198811531067, + "learning_rate": 0.00019731876769537711, + "loss": 1.2266, + "step": 43490 + }, + { + "epoch": 0.09242149086487925, + "grad_norm": 0.3511909544467926, + "learning_rate": 0.00019731720086213904, + "loss": 1.2346, + "step": 43500 + }, + { + "epoch": 0.0924427371846183, + "grad_norm": 0.3191313147544861, + "learning_rate": 0.00019731563357745354, + "loss": 1.2475, + "step": 43510 + }, + { + "epoch": 0.09246398350435736, + "grad_norm": 0.5507602691650391, + "learning_rate": 0.00019731406584132786, + "loss": 1.236, + "step": 43520 + }, + { + "epoch": 0.0924852298240964, + "grad_norm": 0.3709183931350708, + "learning_rate": 0.0001973124976537693, + "loss": 1.2005, + "step": 43530 + }, + { + "epoch": 0.09250647614383546, + "grad_norm": 0.37308433651924133, + "learning_rate": 0.00019731092901478515, + "loss": 1.2097, + "step": 43540 + }, + { + "epoch": 0.09252772246357452, + "grad_norm": 0.3917671740055084, + "learning_rate": 0.00019730935992438267, + "loss": 1.2241, + "step": 43550 + }, + { + "epoch": 0.09254896878331356, + "grad_norm": 0.3697156012058258, + "learning_rate": 0.00019730779038256913, + "loss": 1.2024, + "step": 43560 + }, + { + "epoch": 0.09257021510305262, + "grad_norm": 0.3503657877445221, + "learning_rate": 0.00019730622038935183, + "loss": 1.2356, + "step": 43570 + }, + { + "epoch": 0.09259146142279168, + "grad_norm": 0.30953991413116455, + "learning_rate": 0.000197304649944738, + "loss": 1.2049, + "step": 43580 + }, + { + "epoch": 0.09261270774253072, + "grad_norm": 0.3468119502067566, + "learning_rate": 0.00019730307904873502, + "loss": 1.2433, + "step": 43590 + }, + { + "epoch": 0.09263395406226978, + "grad_norm": 0.5379517078399658, + "learning_rate": 0.00019730150770135013, + "loss": 1.2007, + "step": 43600 + }, + { + "epoch": 0.09265520038200883, + "grad_norm": 0.46676987409591675, + "learning_rate": 0.00019729993590259057, + "loss": 1.2313, + "step": 43610 + }, + { + "epoch": 0.09267644670174788, + "grad_norm": 0.5479178428649902, + "learning_rate": 0.0001972983636524637, + "loss": 1.1872, + "step": 43620 + }, + { + "epoch": 0.09269769302148693, + "grad_norm": 0.3769724667072296, + "learning_rate": 0.00019729679095097677, + "loss": 1.2486, + "step": 43630 + }, + { + "epoch": 0.09271893934122599, + "grad_norm": 0.3680904805660248, + "learning_rate": 0.00019729521779813714, + "loss": 1.2257, + "step": 43640 + }, + { + "epoch": 0.09274018566096504, + "grad_norm": 0.3530229926109314, + "learning_rate": 0.00019729364419395203, + "loss": 1.2471, + "step": 43650 + }, + { + "epoch": 0.09276143198070409, + "grad_norm": 0.3680826723575592, + "learning_rate": 0.0001972920701384288, + "loss": 1.226, + "step": 43660 + }, + { + "epoch": 0.09278267830044315, + "grad_norm": 0.649127721786499, + "learning_rate": 0.00019729049563157468, + "loss": 1.2309, + "step": 43670 + }, + { + "epoch": 0.0928039246201822, + "grad_norm": 0.40519118309020996, + "learning_rate": 0.00019728892067339703, + "loss": 1.208, + "step": 43680 + }, + { + "epoch": 0.09282517093992125, + "grad_norm": 0.34234610199928284, + "learning_rate": 0.00019728734526390314, + "loss": 1.2557, + "step": 43690 + }, + { + "epoch": 0.09284641725966031, + "grad_norm": 0.3538919985294342, + "learning_rate": 0.00019728576940310037, + "loss": 1.2419, + "step": 43700 + }, + { + "epoch": 0.09286766357939935, + "grad_norm": 0.3759753406047821, + "learning_rate": 0.00019728419309099595, + "loss": 1.2041, + "step": 43710 + }, + { + "epoch": 0.09288890989913841, + "grad_norm": 0.44289690256118774, + "learning_rate": 0.00019728261632759723, + "loss": 1.221, + "step": 43720 + }, + { + "epoch": 0.09291015621887747, + "grad_norm": 0.4535600244998932, + "learning_rate": 0.0001972810391129115, + "loss": 1.2074, + "step": 43730 + }, + { + "epoch": 0.09293140253861651, + "grad_norm": 0.3257148861885071, + "learning_rate": 0.00019727946144694613, + "loss": 1.1919, + "step": 43740 + }, + { + "epoch": 0.09295264885835557, + "grad_norm": 0.380019873380661, + "learning_rate": 0.00019727788332970839, + "loss": 1.2261, + "step": 43750 + }, + { + "epoch": 0.09297389517809462, + "grad_norm": 0.5327597856521606, + "learning_rate": 0.0001972763047612056, + "loss": 1.2103, + "step": 43760 + }, + { + "epoch": 0.09299514149783367, + "grad_norm": 0.4187738001346588, + "learning_rate": 0.00019727472574144513, + "loss": 1.1914, + "step": 43770 + }, + { + "epoch": 0.09301638781757272, + "grad_norm": 0.44578418135643005, + "learning_rate": 0.00019727314627043425, + "loss": 1.2275, + "step": 43780 + }, + { + "epoch": 0.09303763413731178, + "grad_norm": 0.5331407785415649, + "learning_rate": 0.00019727156634818032, + "loss": 1.2302, + "step": 43790 + }, + { + "epoch": 0.09305888045705082, + "grad_norm": 0.32796943187713623, + "learning_rate": 0.00019726998597469067, + "loss": 1.228, + "step": 43800 + }, + { + "epoch": 0.09308012677678988, + "grad_norm": 0.570178747177124, + "learning_rate": 0.00019726840514997264, + "loss": 1.2259, + "step": 43810 + }, + { + "epoch": 0.09310137309652894, + "grad_norm": 0.3869315981864929, + "learning_rate": 0.0001972668238740335, + "loss": 1.1815, + "step": 43820 + }, + { + "epoch": 0.09312261941626798, + "grad_norm": 0.39669346809387207, + "learning_rate": 0.00019726524214688063, + "loss": 1.2267, + "step": 43830 + }, + { + "epoch": 0.09314386573600704, + "grad_norm": 0.38262295722961426, + "learning_rate": 0.0001972636599685214, + "loss": 1.2378, + "step": 43840 + }, + { + "epoch": 0.0931651120557461, + "grad_norm": 0.3563249707221985, + "learning_rate": 0.0001972620773389631, + "loss": 1.2332, + "step": 43850 + }, + { + "epoch": 0.09318635837548514, + "grad_norm": 0.37830230593681335, + "learning_rate": 0.00019726049425821313, + "loss": 1.2241, + "step": 43860 + }, + { + "epoch": 0.0932076046952242, + "grad_norm": 0.4157574474811554, + "learning_rate": 0.00019725891072627872, + "loss": 1.2315, + "step": 43870 + }, + { + "epoch": 0.09322885101496325, + "grad_norm": 0.433915913105011, + "learning_rate": 0.00019725732674316732, + "loss": 1.206, + "step": 43880 + }, + { + "epoch": 0.0932500973347023, + "grad_norm": 0.46527186036109924, + "learning_rate": 0.00019725574230888626, + "loss": 1.2024, + "step": 43890 + }, + { + "epoch": 0.09327134365444135, + "grad_norm": 0.4426596462726593, + "learning_rate": 0.00019725415742344286, + "loss": 1.2588, + "step": 43900 + }, + { + "epoch": 0.09329258997418041, + "grad_norm": 0.4750540554523468, + "learning_rate": 0.0001972525720868445, + "loss": 1.2136, + "step": 43910 + }, + { + "epoch": 0.09331383629391947, + "grad_norm": 0.42917126417160034, + "learning_rate": 0.00019725098629909852, + "loss": 1.2524, + "step": 43920 + }, + { + "epoch": 0.09333508261365851, + "grad_norm": 0.38965708017349243, + "learning_rate": 0.0001972494000602123, + "loss": 1.2303, + "step": 43930 + }, + { + "epoch": 0.09335632893339757, + "grad_norm": 0.3675435185432434, + "learning_rate": 0.00019724781337019317, + "loss": 1.2497, + "step": 43940 + }, + { + "epoch": 0.09337757525313663, + "grad_norm": 0.37073540687561035, + "learning_rate": 0.00019724622622904848, + "loss": 1.2385, + "step": 43950 + }, + { + "epoch": 0.09339882157287567, + "grad_norm": 0.5663598775863647, + "learning_rate": 0.00019724463863678558, + "loss": 1.2632, + "step": 43960 + }, + { + "epoch": 0.09342006789261473, + "grad_norm": 0.4073825478553772, + "learning_rate": 0.00019724305059341191, + "loss": 1.1939, + "step": 43970 + }, + { + "epoch": 0.09344131421235378, + "grad_norm": 0.3943777084350586, + "learning_rate": 0.0001972414620989348, + "loss": 1.1994, + "step": 43980 + }, + { + "epoch": 0.09346256053209283, + "grad_norm": 0.3789030909538269, + "learning_rate": 0.0001972398731533616, + "loss": 1.2828, + "step": 43990 + }, + { + "epoch": 0.09348380685183189, + "grad_norm": 0.37180542945861816, + "learning_rate": 0.0001972382837566997, + "loss": 1.2393, + "step": 44000 + }, + { + "epoch": 0.09350505317157094, + "grad_norm": 0.36447563767433167, + "learning_rate": 0.00019723669390895644, + "loss": 1.2341, + "step": 44010 + }, + { + "epoch": 0.09352629949130999, + "grad_norm": 0.37691718339920044, + "learning_rate": 0.00019723510361013924, + "loss": 1.2141, + "step": 44020 + }, + { + "epoch": 0.09354754581104904, + "grad_norm": 0.4440952241420746, + "learning_rate": 0.00019723351286025546, + "loss": 1.2172, + "step": 44030 + }, + { + "epoch": 0.0935687921307881, + "grad_norm": 0.34411823749542236, + "learning_rate": 0.00019723192165931245, + "loss": 1.2285, + "step": 44040 + }, + { + "epoch": 0.09359003845052714, + "grad_norm": 0.39407747983932495, + "learning_rate": 0.00019723033000731765, + "loss": 1.2568, + "step": 44050 + }, + { + "epoch": 0.0936112847702662, + "grad_norm": 0.359382301568985, + "learning_rate": 0.0001972287379042784, + "loss": 1.2012, + "step": 44060 + }, + { + "epoch": 0.09363253109000526, + "grad_norm": 0.3801405727863312, + "learning_rate": 0.0001972271453502021, + "loss": 1.2139, + "step": 44070 + }, + { + "epoch": 0.0936537774097443, + "grad_norm": 0.4206717014312744, + "learning_rate": 0.00019722555234509613, + "loss": 1.256, + "step": 44080 + }, + { + "epoch": 0.09367502372948336, + "grad_norm": 0.46459510922431946, + "learning_rate": 0.0001972239588889679, + "loss": 1.1996, + "step": 44090 + }, + { + "epoch": 0.09369627004922242, + "grad_norm": 0.3383939564228058, + "learning_rate": 0.0001972223649818248, + "loss": 1.2167, + "step": 44100 + }, + { + "epoch": 0.09371751636896146, + "grad_norm": 0.3546484708786011, + "learning_rate": 0.0001972207706236742, + "loss": 1.2008, + "step": 44110 + }, + { + "epoch": 0.09373876268870052, + "grad_norm": 0.4667653441429138, + "learning_rate": 0.00019721917581452353, + "loss": 1.2407, + "step": 44120 + }, + { + "epoch": 0.09376000900843957, + "grad_norm": 0.3738372325897217, + "learning_rate": 0.00019721758055438012, + "loss": 1.1912, + "step": 44130 + }, + { + "epoch": 0.09378125532817862, + "grad_norm": 0.6046764254570007, + "learning_rate": 0.0001972159848432514, + "loss": 1.2264, + "step": 44140 + }, + { + "epoch": 0.09380250164791767, + "grad_norm": 0.5943695306777954, + "learning_rate": 0.00019721438868114483, + "loss": 1.2443, + "step": 44150 + }, + { + "epoch": 0.09382374796765673, + "grad_norm": 0.66765296459198, + "learning_rate": 0.00019721279206806779, + "loss": 1.2402, + "step": 44160 + }, + { + "epoch": 0.09384499428739577, + "grad_norm": 0.4314115643501282, + "learning_rate": 0.00019721119500402763, + "loss": 1.2159, + "step": 44170 + }, + { + "epoch": 0.09386624060713483, + "grad_norm": 0.3652271032333374, + "learning_rate": 0.00019720959748903183, + "loss": 1.1855, + "step": 44180 + }, + { + "epoch": 0.09388748692687389, + "grad_norm": 0.32920703291893005, + "learning_rate": 0.00019720799952308775, + "loss": 1.224, + "step": 44190 + }, + { + "epoch": 0.09390873324661293, + "grad_norm": 0.4076266884803772, + "learning_rate": 0.00019720640110620284, + "loss": 1.2379, + "step": 44200 + }, + { + "epoch": 0.09392997956635199, + "grad_norm": 0.41829439997673035, + "learning_rate": 0.00019720480223838448, + "loss": 1.2305, + "step": 44210 + }, + { + "epoch": 0.09395122588609105, + "grad_norm": 0.6909310221672058, + "learning_rate": 0.0001972032029196401, + "loss": 1.2064, + "step": 44220 + }, + { + "epoch": 0.09397247220583009, + "grad_norm": 0.6194040775299072, + "learning_rate": 0.00019720160314997713, + "loss": 1.26, + "step": 44230 + }, + { + "epoch": 0.09399371852556915, + "grad_norm": 0.4718143939971924, + "learning_rate": 0.000197200002929403, + "loss": 1.2173, + "step": 44240 + }, + { + "epoch": 0.0940149648453082, + "grad_norm": 0.3356419801712036, + "learning_rate": 0.0001971984022579251, + "loss": 1.2303, + "step": 44250 + }, + { + "epoch": 0.09403621116504725, + "grad_norm": 0.3828662633895874, + "learning_rate": 0.00019719680113555087, + "loss": 1.1831, + "step": 44260 + }, + { + "epoch": 0.0940574574847863, + "grad_norm": 0.334332138299942, + "learning_rate": 0.00019719519956228773, + "loss": 1.1728, + "step": 44270 + }, + { + "epoch": 0.09407870380452536, + "grad_norm": 0.45843279361724854, + "learning_rate": 0.00019719359753814315, + "loss": 1.1947, + "step": 44280 + }, + { + "epoch": 0.0940999501242644, + "grad_norm": 0.40239417552948, + "learning_rate": 0.00019719199506312453, + "loss": 1.2456, + "step": 44290 + }, + { + "epoch": 0.09412119644400346, + "grad_norm": 0.517814040184021, + "learning_rate": 0.0001971903921372393, + "loss": 1.21, + "step": 44300 + }, + { + "epoch": 0.09414244276374252, + "grad_norm": 0.3578246235847473, + "learning_rate": 0.0001971887887604949, + "loss": 1.2217, + "step": 44310 + }, + { + "epoch": 0.09416368908348156, + "grad_norm": 0.3660714328289032, + "learning_rate": 0.00019718718493289873, + "loss": 1.2286, + "step": 44320 + }, + { + "epoch": 0.09418493540322062, + "grad_norm": 0.3677418828010559, + "learning_rate": 0.00019718558065445833, + "loss": 1.1762, + "step": 44330 + }, + { + "epoch": 0.09420618172295968, + "grad_norm": 0.3470211923122406, + "learning_rate": 0.00019718397592518105, + "loss": 1.2122, + "step": 44340 + }, + { + "epoch": 0.09422742804269874, + "grad_norm": 0.42020004987716675, + "learning_rate": 0.00019718237074507438, + "loss": 1.2404, + "step": 44350 + }, + { + "epoch": 0.09424867436243778, + "grad_norm": 0.6038693785667419, + "learning_rate": 0.00019718076511414574, + "loss": 1.2549, + "step": 44360 + }, + { + "epoch": 0.09426992068217684, + "grad_norm": 0.5339728593826294, + "learning_rate": 0.0001971791590324026, + "loss": 1.2442, + "step": 44370 + }, + { + "epoch": 0.0942911670019159, + "grad_norm": 0.3668026924133301, + "learning_rate": 0.0001971775524998524, + "loss": 1.2277, + "step": 44380 + }, + { + "epoch": 0.09431241332165494, + "grad_norm": 0.36562812328338623, + "learning_rate": 0.00019717594551650259, + "loss": 1.204, + "step": 44390 + }, + { + "epoch": 0.094333659641394, + "grad_norm": 0.38382449746131897, + "learning_rate": 0.0001971743380823606, + "loss": 1.2206, + "step": 44400 + }, + { + "epoch": 0.09435490596113305, + "grad_norm": 0.37326353788375854, + "learning_rate": 0.00019717273019743397, + "loss": 1.1734, + "step": 44410 + }, + { + "epoch": 0.0943761522808721, + "grad_norm": 0.38263803720474243, + "learning_rate": 0.00019717112186173007, + "loss": 1.2276, + "step": 44420 + }, + { + "epoch": 0.09439739860061115, + "grad_norm": 0.35280245542526245, + "learning_rate": 0.0001971695130752564, + "loss": 1.2593, + "step": 44430 + }, + { + "epoch": 0.09441864492035021, + "grad_norm": 0.3671458065509796, + "learning_rate": 0.00019716790383802045, + "loss": 1.2718, + "step": 44440 + }, + { + "epoch": 0.09443989124008925, + "grad_norm": 0.4000076651573181, + "learning_rate": 0.0001971662941500296, + "loss": 1.2753, + "step": 44450 + }, + { + "epoch": 0.09446113755982831, + "grad_norm": 0.37487754225730896, + "learning_rate": 0.0001971646840112914, + "loss": 1.2302, + "step": 44460 + }, + { + "epoch": 0.09448238387956737, + "grad_norm": 0.509874165058136, + "learning_rate": 0.0001971630734218133, + "loss": 1.1969, + "step": 44470 + }, + { + "epoch": 0.09450363019930641, + "grad_norm": 0.5510044693946838, + "learning_rate": 0.00019716146238160274, + "loss": 1.2257, + "step": 44480 + }, + { + "epoch": 0.09452487651904547, + "grad_norm": 0.3311774432659149, + "learning_rate": 0.00019715985089066723, + "loss": 1.2161, + "step": 44490 + }, + { + "epoch": 0.09454612283878452, + "grad_norm": 0.3922610282897949, + "learning_rate": 0.0001971582389490142, + "loss": 1.2295, + "step": 44500 + }, + { + "epoch": 0.09456736915852357, + "grad_norm": 0.37630876898765564, + "learning_rate": 0.00019715662655665116, + "loss": 1.1955, + "step": 44510 + }, + { + "epoch": 0.09458861547826262, + "grad_norm": 0.3739081621170044, + "learning_rate": 0.00019715501371358562, + "loss": 1.2075, + "step": 44520 + }, + { + "epoch": 0.09460986179800168, + "grad_norm": 0.36919793486595154, + "learning_rate": 0.00019715340041982502, + "loss": 1.1793, + "step": 44530 + }, + { + "epoch": 0.09463110811774073, + "grad_norm": 0.36142876744270325, + "learning_rate": 0.00019715178667537684, + "loss": 1.1973, + "step": 44540 + }, + { + "epoch": 0.09465235443747978, + "grad_norm": 0.3392115533351898, + "learning_rate": 0.0001971501724802486, + "loss": 1.24, + "step": 44550 + }, + { + "epoch": 0.09467360075721884, + "grad_norm": 0.5148680210113525, + "learning_rate": 0.00019714855783444773, + "loss": 1.2193, + "step": 44560 + }, + { + "epoch": 0.09469484707695788, + "grad_norm": 0.36278843879699707, + "learning_rate": 0.0001971469427379818, + "loss": 1.2093, + "step": 44570 + }, + { + "epoch": 0.09471609339669694, + "grad_norm": 0.39956584572792053, + "learning_rate": 0.00019714532719085823, + "loss": 1.2342, + "step": 44580 + }, + { + "epoch": 0.094737339716436, + "grad_norm": 0.3942663371562958, + "learning_rate": 0.00019714371119308456, + "loss": 1.2029, + "step": 44590 + }, + { + "epoch": 0.09475858603617504, + "grad_norm": 0.39643725752830505, + "learning_rate": 0.00019714209474466828, + "loss": 1.2156, + "step": 44600 + }, + { + "epoch": 0.0947798323559141, + "grad_norm": 0.38029906153678894, + "learning_rate": 0.00019714047784561686, + "loss": 1.229, + "step": 44610 + }, + { + "epoch": 0.09480107867565316, + "grad_norm": 0.36429670453071594, + "learning_rate": 0.00019713886049593785, + "loss": 1.2039, + "step": 44620 + }, + { + "epoch": 0.0948223249953922, + "grad_norm": 0.44330906867980957, + "learning_rate": 0.00019713724269563868, + "loss": 1.1772, + "step": 44630 + }, + { + "epoch": 0.09484357131513126, + "grad_norm": 0.4006069302558899, + "learning_rate": 0.00019713562444472694, + "loss": 1.2513, + "step": 44640 + }, + { + "epoch": 0.09486481763487031, + "grad_norm": 0.3550579249858856, + "learning_rate": 0.00019713400574321007, + "loss": 1.208, + "step": 44650 + }, + { + "epoch": 0.09488606395460936, + "grad_norm": 0.33531856536865234, + "learning_rate": 0.0001971323865910956, + "loss": 1.2724, + "step": 44660 + }, + { + "epoch": 0.09490731027434841, + "grad_norm": 0.58162921667099, + "learning_rate": 0.00019713076698839107, + "loss": 1.1719, + "step": 44670 + }, + { + "epoch": 0.09492855659408747, + "grad_norm": 0.36086350679397583, + "learning_rate": 0.00019712914693510394, + "loss": 1.1875, + "step": 44680 + }, + { + "epoch": 0.09494980291382651, + "grad_norm": 0.5169561505317688, + "learning_rate": 0.00019712752643124176, + "loss": 1.2509, + "step": 44690 + }, + { + "epoch": 0.09497104923356557, + "grad_norm": 0.34043067693710327, + "learning_rate": 0.00019712590547681205, + "loss": 1.2069, + "step": 44700 + }, + { + "epoch": 0.09499229555330463, + "grad_norm": 0.7086315751075745, + "learning_rate": 0.00019712428407182232, + "loss": 1.1856, + "step": 44710 + }, + { + "epoch": 0.09501354187304367, + "grad_norm": 0.34389781951904297, + "learning_rate": 0.0001971226622162801, + "loss": 1.2142, + "step": 44720 + }, + { + "epoch": 0.09503478819278273, + "grad_norm": 0.40024811029434204, + "learning_rate": 0.00019712103991019285, + "loss": 1.2164, + "step": 44730 + }, + { + "epoch": 0.09505603451252179, + "grad_norm": 0.35619091987609863, + "learning_rate": 0.00019711941715356822, + "loss": 1.2266, + "step": 44740 + }, + { + "epoch": 0.09507728083226084, + "grad_norm": 0.34055817127227783, + "learning_rate": 0.00019711779394641362, + "loss": 1.2175, + "step": 44750 + }, + { + "epoch": 0.09509852715199989, + "grad_norm": 0.4204173982143402, + "learning_rate": 0.00019711617028873667, + "loss": 1.1892, + "step": 44760 + }, + { + "epoch": 0.09511977347173894, + "grad_norm": 0.3433148264884949, + "learning_rate": 0.00019711454618054485, + "loss": 1.2204, + "step": 44770 + }, + { + "epoch": 0.095141019791478, + "grad_norm": 0.3450428545475006, + "learning_rate": 0.00019711292162184573, + "loss": 1.2484, + "step": 44780 + }, + { + "epoch": 0.09516226611121704, + "grad_norm": 0.3462899327278137, + "learning_rate": 0.00019711129661264675, + "loss": 1.2207, + "step": 44790 + }, + { + "epoch": 0.0951835124309561, + "grad_norm": 0.42198237776756287, + "learning_rate": 0.0001971096711529556, + "loss": 1.2313, + "step": 44800 + }, + { + "epoch": 0.09520475875069516, + "grad_norm": 0.377646803855896, + "learning_rate": 0.0001971080452427797, + "loss": 1.2606, + "step": 44810 + }, + { + "epoch": 0.0952260050704342, + "grad_norm": 0.34751996397972107, + "learning_rate": 0.0001971064188821266, + "loss": 1.2469, + "step": 44820 + }, + { + "epoch": 0.09524725139017326, + "grad_norm": 0.3480342924594879, + "learning_rate": 0.00019710479207100395, + "loss": 1.208, + "step": 44830 + }, + { + "epoch": 0.09526849770991232, + "grad_norm": 0.3208253085613251, + "learning_rate": 0.00019710316480941916, + "loss": 1.211, + "step": 44840 + }, + { + "epoch": 0.09528974402965136, + "grad_norm": 0.35121381282806396, + "learning_rate": 0.00019710153709737987, + "loss": 1.2021, + "step": 44850 + }, + { + "epoch": 0.09531099034939042, + "grad_norm": 0.3557077944278717, + "learning_rate": 0.00019709990893489364, + "loss": 1.1991, + "step": 44860 + }, + { + "epoch": 0.09533223666912947, + "grad_norm": 0.4178940951824188, + "learning_rate": 0.0001970982803219679, + "loss": 1.2815, + "step": 44870 + }, + { + "epoch": 0.09535348298886852, + "grad_norm": 0.40148431062698364, + "learning_rate": 0.00019709665125861037, + "loss": 1.2243, + "step": 44880 + }, + { + "epoch": 0.09537472930860758, + "grad_norm": 0.35663291811943054, + "learning_rate": 0.0001970950217448285, + "loss": 1.2107, + "step": 44890 + }, + { + "epoch": 0.09539597562834663, + "grad_norm": 0.358432799577713, + "learning_rate": 0.00019709339178062988, + "loss": 1.2341, + "step": 44900 + }, + { + "epoch": 0.09541722194808568, + "grad_norm": 0.3827897012233734, + "learning_rate": 0.00019709176136602205, + "loss": 1.1801, + "step": 44910 + }, + { + "epoch": 0.09543846826782473, + "grad_norm": 0.47815167903900146, + "learning_rate": 0.0001970901305010126, + "loss": 1.2171, + "step": 44920 + }, + { + "epoch": 0.09545971458756379, + "grad_norm": 0.695732057094574, + "learning_rate": 0.00019708849918560913, + "loss": 1.2135, + "step": 44930 + }, + { + "epoch": 0.09548096090730283, + "grad_norm": 0.4050646424293518, + "learning_rate": 0.00019708686741981912, + "loss": 1.2177, + "step": 44940 + }, + { + "epoch": 0.09550220722704189, + "grad_norm": 0.3635634779930115, + "learning_rate": 0.0001970852352036502, + "loss": 1.2287, + "step": 44950 + }, + { + "epoch": 0.09552345354678095, + "grad_norm": 0.39859673380851746, + "learning_rate": 0.00019708360253710994, + "loss": 1.2223, + "step": 44960 + }, + { + "epoch": 0.09554469986651999, + "grad_norm": 0.8022134304046631, + "learning_rate": 0.00019708196942020586, + "loss": 1.2414, + "step": 44970 + }, + { + "epoch": 0.09556594618625905, + "grad_norm": 0.45593878626823425, + "learning_rate": 0.0001970803358529456, + "loss": 1.2147, + "step": 44980 + }, + { + "epoch": 0.0955871925059981, + "grad_norm": 0.46908360719680786, + "learning_rate": 0.00019707870183533675, + "loss": 1.2309, + "step": 44990 + }, + { + "epoch": 0.09560843882573715, + "grad_norm": 0.5963352918624878, + "learning_rate": 0.0001970770673673868, + "loss": 1.215, + "step": 45000 + }, + { + "epoch": 0.0956296851454762, + "grad_norm": 0.6420010924339294, + "learning_rate": 0.00019707543244910342, + "loss": 1.2479, + "step": 45010 + }, + { + "epoch": 0.09565093146521526, + "grad_norm": 0.41772639751434326, + "learning_rate": 0.00019707379708049414, + "loss": 1.2048, + "step": 45020 + }, + { + "epoch": 0.09567217778495431, + "grad_norm": 0.3371010422706604, + "learning_rate": 0.00019707216126156659, + "loss": 1.2315, + "step": 45030 + }, + { + "epoch": 0.09569342410469336, + "grad_norm": 0.3603573441505432, + "learning_rate": 0.00019707052499232832, + "loss": 1.1897, + "step": 45040 + }, + { + "epoch": 0.09571467042443242, + "grad_norm": 0.3558235168457031, + "learning_rate": 0.00019706888827278693, + "loss": 1.2108, + "step": 45050 + }, + { + "epoch": 0.09573591674417146, + "grad_norm": 0.350399911403656, + "learning_rate": 0.00019706725110295, + "loss": 1.2452, + "step": 45060 + }, + { + "epoch": 0.09575716306391052, + "grad_norm": 0.34180688858032227, + "learning_rate": 0.0001970656134828252, + "loss": 1.2057, + "step": 45070 + }, + { + "epoch": 0.09577840938364958, + "grad_norm": 0.33904996514320374, + "learning_rate": 0.00019706397541242003, + "loss": 1.1765, + "step": 45080 + }, + { + "epoch": 0.09579965570338862, + "grad_norm": 0.4274469017982483, + "learning_rate": 0.00019706233689174217, + "loss": 1.2235, + "step": 45090 + }, + { + "epoch": 0.09582090202312768, + "grad_norm": 0.367058128118515, + "learning_rate": 0.00019706069792079913, + "loss": 1.2171, + "step": 45100 + }, + { + "epoch": 0.09584214834286674, + "grad_norm": 0.44438666105270386, + "learning_rate": 0.0001970590584995986, + "loss": 1.2051, + "step": 45110 + }, + { + "epoch": 0.09586339466260578, + "grad_norm": 0.7411170601844788, + "learning_rate": 0.00019705741862814815, + "loss": 1.2292, + "step": 45120 + }, + { + "epoch": 0.09588464098234484, + "grad_norm": 0.4110260009765625, + "learning_rate": 0.00019705577830645535, + "loss": 1.2177, + "step": 45130 + }, + { + "epoch": 0.0959058873020839, + "grad_norm": 0.36664775013923645, + "learning_rate": 0.00019705413753452788, + "loss": 1.2293, + "step": 45140 + }, + { + "epoch": 0.09592713362182294, + "grad_norm": 0.3675108253955841, + "learning_rate": 0.0001970524963123733, + "loss": 1.2057, + "step": 45150 + }, + { + "epoch": 0.095948379941562, + "grad_norm": 0.3736995458602905, + "learning_rate": 0.00019705085463999925, + "loss": 1.221, + "step": 45160 + }, + { + "epoch": 0.09596962626130105, + "grad_norm": 0.32279080152511597, + "learning_rate": 0.0001970492125174133, + "loss": 1.2114, + "step": 45170 + }, + { + "epoch": 0.09599087258104011, + "grad_norm": 0.3681323230266571, + "learning_rate": 0.00019704756994462318, + "loss": 1.2248, + "step": 45180 + }, + { + "epoch": 0.09601211890077915, + "grad_norm": 0.3506157696247101, + "learning_rate": 0.00019704592692163634, + "loss": 1.2181, + "step": 45190 + }, + { + "epoch": 0.09603336522051821, + "grad_norm": 0.37361693382263184, + "learning_rate": 0.00019704428344846056, + "loss": 1.1935, + "step": 45200 + }, + { + "epoch": 0.09605461154025727, + "grad_norm": 0.38021835684776306, + "learning_rate": 0.00019704263952510337, + "loss": 1.2352, + "step": 45210 + }, + { + "epoch": 0.09607585785999631, + "grad_norm": 0.37077999114990234, + "learning_rate": 0.00019704099515157247, + "loss": 1.2212, + "step": 45220 + }, + { + "epoch": 0.09609710417973537, + "grad_norm": 0.34588077664375305, + "learning_rate": 0.0001970393503278754, + "loss": 1.205, + "step": 45230 + }, + { + "epoch": 0.09611835049947443, + "grad_norm": 0.4630748927593231, + "learning_rate": 0.00019703770505401984, + "loss": 1.2076, + "step": 45240 + }, + { + "epoch": 0.09613959681921347, + "grad_norm": 0.5589176416397095, + "learning_rate": 0.0001970360593300134, + "loss": 1.1979, + "step": 45250 + }, + { + "epoch": 0.09616084313895253, + "grad_norm": 0.31424590945243835, + "learning_rate": 0.00019703441315586378, + "loss": 1.2064, + "step": 45260 + }, + { + "epoch": 0.09618208945869158, + "grad_norm": 0.44168880581855774, + "learning_rate": 0.0001970327665315785, + "loss": 1.2477, + "step": 45270 + }, + { + "epoch": 0.09620333577843063, + "grad_norm": 0.33632126450538635, + "learning_rate": 0.00019703111945716531, + "loss": 1.2577, + "step": 45280 + }, + { + "epoch": 0.09622458209816968, + "grad_norm": 0.3779919445514679, + "learning_rate": 0.00019702947193263182, + "loss": 1.2759, + "step": 45290 + }, + { + "epoch": 0.09624582841790874, + "grad_norm": 0.3789803981781006, + "learning_rate": 0.0001970278239579856, + "loss": 1.2303, + "step": 45300 + }, + { + "epoch": 0.09626707473764778, + "grad_norm": 0.39569899439811707, + "learning_rate": 0.00019702617553323443, + "loss": 1.2061, + "step": 45310 + }, + { + "epoch": 0.09628832105738684, + "grad_norm": 0.4448627829551697, + "learning_rate": 0.0001970245266583858, + "loss": 1.2237, + "step": 45320 + }, + { + "epoch": 0.0963095673771259, + "grad_norm": 0.5113937854766846, + "learning_rate": 0.0001970228773334475, + "loss": 1.2412, + "step": 45330 + }, + { + "epoch": 0.09633081369686494, + "grad_norm": 0.37453821301460266, + "learning_rate": 0.0001970212275584271, + "loss": 1.2422, + "step": 45340 + }, + { + "epoch": 0.096352060016604, + "grad_norm": 0.4657462239265442, + "learning_rate": 0.00019701957733333228, + "loss": 1.2259, + "step": 45350 + }, + { + "epoch": 0.09637330633634306, + "grad_norm": 0.4385123550891876, + "learning_rate": 0.00019701792665817065, + "loss": 1.2472, + "step": 45360 + }, + { + "epoch": 0.0963945526560821, + "grad_norm": 0.39589017629623413, + "learning_rate": 0.00019701627553294997, + "loss": 1.2254, + "step": 45370 + }, + { + "epoch": 0.09641579897582116, + "grad_norm": 0.5462239980697632, + "learning_rate": 0.00019701462395767778, + "loss": 1.1712, + "step": 45380 + }, + { + "epoch": 0.09643704529556021, + "grad_norm": 0.5919922590255737, + "learning_rate": 0.0001970129719323618, + "loss": 1.2325, + "step": 45390 + }, + { + "epoch": 0.09645829161529926, + "grad_norm": 0.46376651525497437, + "learning_rate": 0.00019701131945700976, + "loss": 1.2369, + "step": 45400 + }, + { + "epoch": 0.09647953793503831, + "grad_norm": 0.5552404522895813, + "learning_rate": 0.00019700966653162918, + "loss": 1.2166, + "step": 45410 + }, + { + "epoch": 0.09650078425477737, + "grad_norm": 0.5362884998321533, + "learning_rate": 0.00019700801315622782, + "loss": 1.2143, + "step": 45420 + }, + { + "epoch": 0.09652203057451642, + "grad_norm": 0.3367921710014343, + "learning_rate": 0.00019700635933081337, + "loss": 1.1936, + "step": 45430 + }, + { + "epoch": 0.09654327689425547, + "grad_norm": 0.3482700288295746, + "learning_rate": 0.00019700470505539346, + "loss": 1.2751, + "step": 45440 + }, + { + "epoch": 0.09656452321399453, + "grad_norm": 0.40157631039619446, + "learning_rate": 0.00019700305032997574, + "loss": 1.2017, + "step": 45450 + }, + { + "epoch": 0.09658576953373357, + "grad_norm": 0.6492900252342224, + "learning_rate": 0.00019700139515456793, + "loss": 1.2309, + "step": 45460 + }, + { + "epoch": 0.09660701585347263, + "grad_norm": 0.6447449326515198, + "learning_rate": 0.0001969997395291777, + "loss": 1.221, + "step": 45470 + }, + { + "epoch": 0.09662826217321169, + "grad_norm": 0.34024855494499207, + "learning_rate": 0.00019699808345381272, + "loss": 1.2261, + "step": 45480 + }, + { + "epoch": 0.09664950849295073, + "grad_norm": 0.3556975722312927, + "learning_rate": 0.00019699642692848067, + "loss": 1.2332, + "step": 45490 + }, + { + "epoch": 0.09667075481268979, + "grad_norm": 0.3860901892185211, + "learning_rate": 0.00019699476995318924, + "loss": 1.1613, + "step": 45500 + }, + { + "epoch": 0.09669200113242885, + "grad_norm": 0.3560326397418976, + "learning_rate": 0.00019699311252794615, + "loss": 1.2137, + "step": 45510 + }, + { + "epoch": 0.09671324745216789, + "grad_norm": 0.3501054048538208, + "learning_rate": 0.00019699145465275903, + "loss": 1.232, + "step": 45520 + }, + { + "epoch": 0.09673449377190695, + "grad_norm": 0.42285779118537903, + "learning_rate": 0.00019698979632763562, + "loss": 1.1934, + "step": 45530 + }, + { + "epoch": 0.096755740091646, + "grad_norm": 0.3848707675933838, + "learning_rate": 0.00019698813755258357, + "loss": 1.215, + "step": 45540 + }, + { + "epoch": 0.09677698641138505, + "grad_norm": 0.34930112957954407, + "learning_rate": 0.00019698647832761058, + "loss": 1.2217, + "step": 45550 + }, + { + "epoch": 0.0967982327311241, + "grad_norm": 0.4152994453907013, + "learning_rate": 0.00019698481865272437, + "loss": 1.179, + "step": 45560 + }, + { + "epoch": 0.09681947905086316, + "grad_norm": 0.3700980246067047, + "learning_rate": 0.00019698315852793266, + "loss": 1.2443, + "step": 45570 + }, + { + "epoch": 0.0968407253706022, + "grad_norm": 0.31345418095588684, + "learning_rate": 0.0001969814979532431, + "loss": 1.1932, + "step": 45580 + }, + { + "epoch": 0.09686197169034126, + "grad_norm": 0.3536425232887268, + "learning_rate": 0.00019697983692866343, + "loss": 1.2181, + "step": 45590 + }, + { + "epoch": 0.09688321801008032, + "grad_norm": 0.4422661364078522, + "learning_rate": 0.00019697817545420138, + "loss": 1.2077, + "step": 45600 + }, + { + "epoch": 0.09690446432981938, + "grad_norm": 0.38526651263237, + "learning_rate": 0.00019697651352986454, + "loss": 1.2126, + "step": 45610 + }, + { + "epoch": 0.09692571064955842, + "grad_norm": 0.34079959988594055, + "learning_rate": 0.00019697485115566075, + "loss": 1.1949, + "step": 45620 + }, + { + "epoch": 0.09694695696929748, + "grad_norm": 0.3893817365169525, + "learning_rate": 0.00019697318833159763, + "loss": 1.2302, + "step": 45630 + }, + { + "epoch": 0.09696820328903653, + "grad_norm": 0.3307812809944153, + "learning_rate": 0.00019697152505768298, + "loss": 1.2184, + "step": 45640 + }, + { + "epoch": 0.09698944960877558, + "grad_norm": 0.36495649814605713, + "learning_rate": 0.00019696986133392445, + "loss": 1.2196, + "step": 45650 + }, + { + "epoch": 0.09701069592851463, + "grad_norm": 0.42683979868888855, + "learning_rate": 0.00019696819716032979, + "loss": 1.2276, + "step": 45660 + }, + { + "epoch": 0.09703194224825369, + "grad_norm": 0.4143052101135254, + "learning_rate": 0.0001969665325369067, + "loss": 1.2269, + "step": 45670 + }, + { + "epoch": 0.09705318856799273, + "grad_norm": 0.4325764775276184, + "learning_rate": 0.0001969648674636629, + "loss": 1.1916, + "step": 45680 + }, + { + "epoch": 0.09707443488773179, + "grad_norm": 0.43471962213516235, + "learning_rate": 0.00019696320194060611, + "loss": 1.1892, + "step": 45690 + }, + { + "epoch": 0.09709568120747085, + "grad_norm": 0.40216952562332153, + "learning_rate": 0.0001969615359677441, + "loss": 1.2654, + "step": 45700 + }, + { + "epoch": 0.09711692752720989, + "grad_norm": 0.33842045068740845, + "learning_rate": 0.00019695986954508456, + "loss": 1.1753, + "step": 45710 + }, + { + "epoch": 0.09713817384694895, + "grad_norm": 0.5128862857818604, + "learning_rate": 0.00019695820267263522, + "loss": 1.1995, + "step": 45720 + }, + { + "epoch": 0.09715942016668801, + "grad_norm": 0.575766384601593, + "learning_rate": 0.00019695653535040385, + "loss": 1.1982, + "step": 45730 + }, + { + "epoch": 0.09718066648642705, + "grad_norm": 0.39824390411376953, + "learning_rate": 0.00019695486757839815, + "loss": 1.2382, + "step": 45740 + }, + { + "epoch": 0.09720191280616611, + "grad_norm": 0.40585556626319885, + "learning_rate": 0.0001969531993566258, + "loss": 1.1647, + "step": 45750 + }, + { + "epoch": 0.09722315912590516, + "grad_norm": 0.4946162700653076, + "learning_rate": 0.0001969515306850947, + "loss": 1.2055, + "step": 45760 + }, + { + "epoch": 0.09724440544564421, + "grad_norm": 0.41478845477104187, + "learning_rate": 0.00019694986156381243, + "loss": 1.2486, + "step": 45770 + }, + { + "epoch": 0.09726565176538327, + "grad_norm": 0.33982595801353455, + "learning_rate": 0.0001969481919927868, + "loss": 1.1842, + "step": 45780 + }, + { + "epoch": 0.09728689808512232, + "grad_norm": 0.37452682852745056, + "learning_rate": 0.00019694652197202553, + "loss": 1.2125, + "step": 45790 + }, + { + "epoch": 0.09730814440486137, + "grad_norm": 0.36223429441452026, + "learning_rate": 0.00019694485150153647, + "loss": 1.227, + "step": 45800 + }, + { + "epoch": 0.09732939072460042, + "grad_norm": 0.35854044556617737, + "learning_rate": 0.0001969431805813272, + "loss": 1.2204, + "step": 45810 + }, + { + "epoch": 0.09735063704433948, + "grad_norm": 0.33143168687820435, + "learning_rate": 0.00019694150921140558, + "loss": 1.1704, + "step": 45820 + }, + { + "epoch": 0.09737188336407852, + "grad_norm": 0.5304598808288574, + "learning_rate": 0.00019693983739177938, + "loss": 1.2556, + "step": 45830 + }, + { + "epoch": 0.09739312968381758, + "grad_norm": 0.9132729172706604, + "learning_rate": 0.00019693816512245625, + "loss": 1.2101, + "step": 45840 + }, + { + "epoch": 0.09741437600355664, + "grad_norm": 0.5368357300758362, + "learning_rate": 0.00019693649240344407, + "loss": 1.2434, + "step": 45850 + }, + { + "epoch": 0.09743562232329568, + "grad_norm": 0.33340123295783997, + "learning_rate": 0.00019693481923475052, + "loss": 1.1976, + "step": 45860 + }, + { + "epoch": 0.09745686864303474, + "grad_norm": 0.36889326572418213, + "learning_rate": 0.00019693314561638342, + "loss": 1.1986, + "step": 45870 + }, + { + "epoch": 0.0974781149627738, + "grad_norm": 0.49238377809524536, + "learning_rate": 0.00019693147154835049, + "loss": 1.2218, + "step": 45880 + }, + { + "epoch": 0.09749936128251284, + "grad_norm": 0.4561399221420288, + "learning_rate": 0.00019692979703065945, + "loss": 1.2212, + "step": 45890 + }, + { + "epoch": 0.0975206076022519, + "grad_norm": 0.3717871904373169, + "learning_rate": 0.00019692812206331816, + "loss": 1.2601, + "step": 45900 + }, + { + "epoch": 0.09754185392199095, + "grad_norm": 0.4288810193538666, + "learning_rate": 0.00019692644664633436, + "loss": 1.1583, + "step": 45910 + }, + { + "epoch": 0.09756310024173, + "grad_norm": 0.35254764556884766, + "learning_rate": 0.0001969247707797158, + "loss": 1.2001, + "step": 45920 + }, + { + "epoch": 0.09758434656146905, + "grad_norm": 0.3548104763031006, + "learning_rate": 0.0001969230944634703, + "loss": 1.2538, + "step": 45930 + }, + { + "epoch": 0.09760559288120811, + "grad_norm": 0.35166028141975403, + "learning_rate": 0.00019692141769760558, + "loss": 1.2461, + "step": 45940 + }, + { + "epoch": 0.09762683920094715, + "grad_norm": 0.39908841252326965, + "learning_rate": 0.00019691974048212943, + "loss": 1.2315, + "step": 45950 + }, + { + "epoch": 0.09764808552068621, + "grad_norm": 0.40316879749298096, + "learning_rate": 0.00019691806281704969, + "loss": 1.1937, + "step": 45960 + }, + { + "epoch": 0.09766933184042527, + "grad_norm": 0.32391616702079773, + "learning_rate": 0.00019691638470237407, + "loss": 1.2203, + "step": 45970 + }, + { + "epoch": 0.09769057816016431, + "grad_norm": 0.3630993366241455, + "learning_rate": 0.00019691470613811038, + "loss": 1.2273, + "step": 45980 + }, + { + "epoch": 0.09771182447990337, + "grad_norm": 0.487839937210083, + "learning_rate": 0.00019691302712426643, + "loss": 1.2192, + "step": 45990 + }, + { + "epoch": 0.09773307079964243, + "grad_norm": 0.6016359925270081, + "learning_rate": 0.00019691134766084995, + "loss": 1.2119, + "step": 46000 + }, + { + "epoch": 0.09775431711938147, + "grad_norm": 0.6005394458770752, + "learning_rate": 0.0001969096677478688, + "loss": 1.2023, + "step": 46010 + }, + { + "epoch": 0.09777556343912053, + "grad_norm": 0.41953200101852417, + "learning_rate": 0.00019690798738533071, + "loss": 1.214, + "step": 46020 + }, + { + "epoch": 0.09779680975885958, + "grad_norm": 0.37671807408332825, + "learning_rate": 0.00019690630657324355, + "loss": 1.1938, + "step": 46030 + }, + { + "epoch": 0.09781805607859864, + "grad_norm": 0.3455255329608917, + "learning_rate": 0.00019690462531161504, + "loss": 1.1811, + "step": 46040 + }, + { + "epoch": 0.09783930239833769, + "grad_norm": 0.3336434066295624, + "learning_rate": 0.00019690294360045303, + "loss": 1.2028, + "step": 46050 + }, + { + "epoch": 0.09786054871807674, + "grad_norm": 0.35751715302467346, + "learning_rate": 0.0001969012614397653, + "loss": 1.2045, + "step": 46060 + }, + { + "epoch": 0.0978817950378158, + "grad_norm": 0.36687174439430237, + "learning_rate": 0.00019689957882955964, + "loss": 1.2059, + "step": 46070 + }, + { + "epoch": 0.09790304135755484, + "grad_norm": 0.36510124802589417, + "learning_rate": 0.0001968978957698439, + "loss": 1.2262, + "step": 46080 + }, + { + "epoch": 0.0979242876772939, + "grad_norm": 0.3668157756328583, + "learning_rate": 0.00019689621226062586, + "loss": 1.2206, + "step": 46090 + }, + { + "epoch": 0.09794553399703296, + "grad_norm": 0.4084283411502838, + "learning_rate": 0.0001968945283019133, + "loss": 1.212, + "step": 46100 + }, + { + "epoch": 0.097966780316772, + "grad_norm": 0.4645160138607025, + "learning_rate": 0.0001968928438937141, + "loss": 1.218, + "step": 46110 + }, + { + "epoch": 0.09798802663651106, + "grad_norm": 0.36319029331207275, + "learning_rate": 0.000196891159036036, + "loss": 1.2554, + "step": 46120 + }, + { + "epoch": 0.09800927295625012, + "grad_norm": 0.6029577255249023, + "learning_rate": 0.00019688947372888688, + "loss": 1.2088, + "step": 46130 + }, + { + "epoch": 0.09803051927598916, + "grad_norm": 0.3816668689250946, + "learning_rate": 0.00019688778797227453, + "loss": 1.2064, + "step": 46140 + }, + { + "epoch": 0.09805176559572822, + "grad_norm": 0.48363006114959717, + "learning_rate": 0.00019688610176620677, + "loss": 1.253, + "step": 46150 + }, + { + "epoch": 0.09807301191546727, + "grad_norm": 0.45435455441474915, + "learning_rate": 0.0001968844151106914, + "loss": 1.1672, + "step": 46160 + }, + { + "epoch": 0.09809425823520632, + "grad_norm": 0.6061800718307495, + "learning_rate": 0.0001968827280057363, + "loss": 1.2101, + "step": 46170 + }, + { + "epoch": 0.09811550455494537, + "grad_norm": 0.3964274227619171, + "learning_rate": 0.00019688104045134923, + "loss": 1.2204, + "step": 46180 + }, + { + "epoch": 0.09813675087468443, + "grad_norm": 0.7910317778587341, + "learning_rate": 0.00019687935244753807, + "loss": 1.1933, + "step": 46190 + }, + { + "epoch": 0.09815799719442347, + "grad_norm": 0.38174572587013245, + "learning_rate": 0.00019687766399431063, + "loss": 1.2317, + "step": 46200 + }, + { + "epoch": 0.09817924351416253, + "grad_norm": 0.3776785135269165, + "learning_rate": 0.00019687597509167475, + "loss": 1.2035, + "step": 46210 + }, + { + "epoch": 0.09820048983390159, + "grad_norm": 0.51534104347229, + "learning_rate": 0.00019687428573963823, + "loss": 1.1652, + "step": 46220 + }, + { + "epoch": 0.09822173615364063, + "grad_norm": 0.4127725660800934, + "learning_rate": 0.00019687259593820897, + "loss": 1.2032, + "step": 46230 + }, + { + "epoch": 0.09824298247337969, + "grad_norm": 0.39042606949806213, + "learning_rate": 0.00019687090568739477, + "loss": 1.2222, + "step": 46240 + }, + { + "epoch": 0.09826422879311875, + "grad_norm": 0.34548136591911316, + "learning_rate": 0.00019686921498720348, + "loss": 1.2338, + "step": 46250 + }, + { + "epoch": 0.09828547511285779, + "grad_norm": 0.38264498114585876, + "learning_rate": 0.0001968675238376429, + "loss": 1.2095, + "step": 46260 + }, + { + "epoch": 0.09830672143259685, + "grad_norm": 0.45565494894981384, + "learning_rate": 0.00019686583223872095, + "loss": 1.2027, + "step": 46270 + }, + { + "epoch": 0.0983279677523359, + "grad_norm": 0.3514977991580963, + "learning_rate": 0.00019686414019044542, + "loss": 1.2395, + "step": 46280 + }, + { + "epoch": 0.09834921407207495, + "grad_norm": 0.34266453981399536, + "learning_rate": 0.0001968624476928242, + "loss": 1.2236, + "step": 46290 + }, + { + "epoch": 0.098370460391814, + "grad_norm": 0.342540442943573, + "learning_rate": 0.00019686075474586513, + "loss": 1.2311, + "step": 46300 + }, + { + "epoch": 0.09839170671155306, + "grad_norm": 0.3713934123516083, + "learning_rate": 0.00019685906134957605, + "loss": 1.2342, + "step": 46310 + }, + { + "epoch": 0.0984129530312921, + "grad_norm": 0.36804285645484924, + "learning_rate": 0.0001968573675039648, + "loss": 1.2044, + "step": 46320 + }, + { + "epoch": 0.09843419935103116, + "grad_norm": 0.37590786814689636, + "learning_rate": 0.00019685567320903928, + "loss": 1.2387, + "step": 46330 + }, + { + "epoch": 0.09845544567077022, + "grad_norm": 0.35511723160743713, + "learning_rate": 0.0001968539784648073, + "loss": 1.2004, + "step": 46340 + }, + { + "epoch": 0.09847669199050926, + "grad_norm": 0.3890310525894165, + "learning_rate": 0.0001968522832712768, + "loss": 1.2383, + "step": 46350 + }, + { + "epoch": 0.09849793831024832, + "grad_norm": 0.39726653695106506, + "learning_rate": 0.00019685058762845557, + "loss": 1.2345, + "step": 46360 + }, + { + "epoch": 0.09851918462998738, + "grad_norm": 0.3315986394882202, + "learning_rate": 0.00019684889153635145, + "loss": 1.2161, + "step": 46370 + }, + { + "epoch": 0.09854043094972642, + "grad_norm": 0.39154085516929626, + "learning_rate": 0.00019684719499497245, + "loss": 1.2261, + "step": 46380 + }, + { + "epoch": 0.09856167726946548, + "grad_norm": 0.3680853545665741, + "learning_rate": 0.00019684549800432627, + "loss": 1.2171, + "step": 46390 + }, + { + "epoch": 0.09858292358920454, + "grad_norm": 0.3897370994091034, + "learning_rate": 0.00019684380056442092, + "loss": 1.1936, + "step": 46400 + }, + { + "epoch": 0.09860416990894358, + "grad_norm": 0.3358486294746399, + "learning_rate": 0.00019684210267526418, + "loss": 1.1915, + "step": 46410 + }, + { + "epoch": 0.09862541622868264, + "grad_norm": 0.33272555470466614, + "learning_rate": 0.00019684040433686395, + "loss": 1.1871, + "step": 46420 + }, + { + "epoch": 0.09864666254842169, + "grad_norm": 0.4823155105113983, + "learning_rate": 0.00019683870554922813, + "loss": 1.2347, + "step": 46430 + }, + { + "epoch": 0.09866790886816074, + "grad_norm": 0.3531831204891205, + "learning_rate": 0.00019683700631236458, + "loss": 1.2224, + "step": 46440 + }, + { + "epoch": 0.0986891551878998, + "grad_norm": 0.45221686363220215, + "learning_rate": 0.00019683530662628122, + "loss": 1.2108, + "step": 46450 + }, + { + "epoch": 0.09871040150763885, + "grad_norm": 0.3910132646560669, + "learning_rate": 0.0001968336064909859, + "loss": 1.2165, + "step": 46460 + }, + { + "epoch": 0.09873164782737791, + "grad_norm": 0.453581303358078, + "learning_rate": 0.00019683190590648653, + "loss": 1.2362, + "step": 46470 + }, + { + "epoch": 0.09875289414711695, + "grad_norm": 0.470120906829834, + "learning_rate": 0.00019683020487279093, + "loss": 1.194, + "step": 46480 + }, + { + "epoch": 0.09877414046685601, + "grad_norm": 0.3783995509147644, + "learning_rate": 0.00019682850338990712, + "loss": 1.2151, + "step": 46490 + }, + { + "epoch": 0.09879538678659507, + "grad_norm": 0.395707368850708, + "learning_rate": 0.00019682680145784284, + "loss": 1.1837, + "step": 46500 + }, + { + "epoch": 0.09881663310633411, + "grad_norm": 0.356702983379364, + "learning_rate": 0.0001968250990766061, + "loss": 1.2025, + "step": 46510 + }, + { + "epoch": 0.09883787942607317, + "grad_norm": 0.37288475036621094, + "learning_rate": 0.00019682339624620476, + "loss": 1.2113, + "step": 46520 + }, + { + "epoch": 0.09885912574581222, + "grad_norm": 0.49767711758613586, + "learning_rate": 0.0001968216929666467, + "loss": 1.2061, + "step": 46530 + }, + { + "epoch": 0.09888037206555127, + "grad_norm": 0.5013999342918396, + "learning_rate": 0.00019681998923793987, + "loss": 1.2207, + "step": 46540 + }, + { + "epoch": 0.09890161838529032, + "grad_norm": 0.41569334268569946, + "learning_rate": 0.00019681828506009212, + "loss": 1.2246, + "step": 46550 + }, + { + "epoch": 0.09892286470502938, + "grad_norm": 0.4085160195827484, + "learning_rate": 0.00019681658043311137, + "loss": 1.1839, + "step": 46560 + }, + { + "epoch": 0.09894411102476842, + "grad_norm": 0.3855364918708801, + "learning_rate": 0.00019681487535700557, + "loss": 1.223, + "step": 46570 + }, + { + "epoch": 0.09896535734450748, + "grad_norm": 0.36500284075737, + "learning_rate": 0.0001968131698317826, + "loss": 1.2053, + "step": 46580 + }, + { + "epoch": 0.09898660366424654, + "grad_norm": 0.39877843856811523, + "learning_rate": 0.00019681146385745033, + "loss": 1.2119, + "step": 46590 + }, + { + "epoch": 0.09900784998398558, + "grad_norm": 0.34991613030433655, + "learning_rate": 0.00019680975743401673, + "loss": 1.2222, + "step": 46600 + }, + { + "epoch": 0.09902909630372464, + "grad_norm": 0.48342373967170715, + "learning_rate": 0.00019680805056148973, + "loss": 1.1942, + "step": 46610 + }, + { + "epoch": 0.0990503426234637, + "grad_norm": 0.6320345997810364, + "learning_rate": 0.00019680634323987717, + "loss": 1.1915, + "step": 46620 + }, + { + "epoch": 0.09907158894320274, + "grad_norm": 0.569328248500824, + "learning_rate": 0.00019680463546918703, + "loss": 1.2251, + "step": 46630 + }, + { + "epoch": 0.0990928352629418, + "grad_norm": 0.3078470230102539, + "learning_rate": 0.00019680292724942721, + "loss": 1.2232, + "step": 46640 + }, + { + "epoch": 0.09911408158268085, + "grad_norm": 0.42546290159225464, + "learning_rate": 0.00019680121858060568, + "loss": 1.1976, + "step": 46650 + }, + { + "epoch": 0.0991353279024199, + "grad_norm": 0.4685881733894348, + "learning_rate": 0.00019679950946273027, + "loss": 1.21, + "step": 46660 + }, + { + "epoch": 0.09915657422215896, + "grad_norm": 0.3247661888599396, + "learning_rate": 0.00019679779989580903, + "loss": 1.2146, + "step": 46670 + }, + { + "epoch": 0.09917782054189801, + "grad_norm": 0.4233071208000183, + "learning_rate": 0.0001967960898798498, + "loss": 1.2025, + "step": 46680 + }, + { + "epoch": 0.09919906686163706, + "grad_norm": 0.35427215695381165, + "learning_rate": 0.00019679437941486053, + "loss": 1.2534, + "step": 46690 + }, + { + "epoch": 0.09922031318137611, + "grad_norm": 0.3314899206161499, + "learning_rate": 0.00019679266850084915, + "loss": 1.2346, + "step": 46700 + }, + { + "epoch": 0.09924155950111517, + "grad_norm": 0.32258909940719604, + "learning_rate": 0.00019679095713782366, + "loss": 1.1907, + "step": 46710 + }, + { + "epoch": 0.09926280582085421, + "grad_norm": 0.3756873309612274, + "learning_rate": 0.00019678924532579194, + "loss": 1.2514, + "step": 46720 + }, + { + "epoch": 0.09928405214059327, + "grad_norm": 0.38322916626930237, + "learning_rate": 0.00019678753306476195, + "loss": 1.1924, + "step": 46730 + }, + { + "epoch": 0.09930529846033233, + "grad_norm": 0.3724236488342285, + "learning_rate": 0.0001967858203547416, + "loss": 1.224, + "step": 46740 + }, + { + "epoch": 0.09932654478007137, + "grad_norm": 0.36415019631385803, + "learning_rate": 0.00019678410719573886, + "loss": 1.226, + "step": 46750 + }, + { + "epoch": 0.09934779109981043, + "grad_norm": 0.3380279839038849, + "learning_rate": 0.0001967823935877617, + "loss": 1.1892, + "step": 46760 + }, + { + "epoch": 0.09936903741954949, + "grad_norm": 0.34010306000709534, + "learning_rate": 0.00019678067953081803, + "loss": 1.1866, + "step": 46770 + }, + { + "epoch": 0.09939028373928853, + "grad_norm": 0.3395647406578064, + "learning_rate": 0.0001967789650249158, + "loss": 1.187, + "step": 46780 + }, + { + "epoch": 0.09941153005902759, + "grad_norm": 0.34944817423820496, + "learning_rate": 0.000196777250070063, + "loss": 1.2757, + "step": 46790 + }, + { + "epoch": 0.09943277637876664, + "grad_norm": 0.37976977229118347, + "learning_rate": 0.00019677553466626758, + "loss": 1.2127, + "step": 46800 + }, + { + "epoch": 0.09945402269850569, + "grad_norm": 0.3648374378681183, + "learning_rate": 0.0001967738188135375, + "loss": 1.2478, + "step": 46810 + }, + { + "epoch": 0.09947526901824474, + "grad_norm": 0.3607464134693146, + "learning_rate": 0.00019677210251188066, + "loss": 1.2049, + "step": 46820 + }, + { + "epoch": 0.0994965153379838, + "grad_norm": 0.3881317973136902, + "learning_rate": 0.00019677038576130512, + "loss": 1.1823, + "step": 46830 + }, + { + "epoch": 0.09951776165772284, + "grad_norm": 0.33413898944854736, + "learning_rate": 0.00019676866856181872, + "loss": 1.1938, + "step": 46840 + }, + { + "epoch": 0.0995390079774619, + "grad_norm": 0.47572988271713257, + "learning_rate": 0.00019676695091342956, + "loss": 1.1999, + "step": 46850 + }, + { + "epoch": 0.09956025429720096, + "grad_norm": 0.4420117437839508, + "learning_rate": 0.00019676523281614552, + "loss": 1.2182, + "step": 46860 + }, + { + "epoch": 0.09958150061694002, + "grad_norm": 0.3951041102409363, + "learning_rate": 0.0001967635142699746, + "loss": 1.1983, + "step": 46870 + }, + { + "epoch": 0.09960274693667906, + "grad_norm": 0.3917999863624573, + "learning_rate": 0.00019676179527492477, + "loss": 1.2152, + "step": 46880 + }, + { + "epoch": 0.09962399325641812, + "grad_norm": 0.35289159417152405, + "learning_rate": 0.00019676007583100397, + "loss": 1.2377, + "step": 46890 + }, + { + "epoch": 0.09964523957615717, + "grad_norm": 0.32401785254478455, + "learning_rate": 0.00019675835593822023, + "loss": 1.1804, + "step": 46900 + }, + { + "epoch": 0.09966648589589622, + "grad_norm": 0.3486576974391937, + "learning_rate": 0.00019675663559658152, + "loss": 1.2251, + "step": 46910 + }, + { + "epoch": 0.09968773221563527, + "grad_norm": 0.36346864700317383, + "learning_rate": 0.00019675491480609578, + "loss": 1.1925, + "step": 46920 + }, + { + "epoch": 0.09970897853537433, + "grad_norm": 0.8260560631752014, + "learning_rate": 0.00019675319356677104, + "loss": 1.2064, + "step": 46930 + }, + { + "epoch": 0.09973022485511338, + "grad_norm": 0.5051314830780029, + "learning_rate": 0.00019675147187861526, + "loss": 1.1989, + "step": 46940 + }, + { + "epoch": 0.09975147117485243, + "grad_norm": 0.3639712929725647, + "learning_rate": 0.00019674974974163642, + "loss": 1.1989, + "step": 46950 + }, + { + "epoch": 0.09977271749459149, + "grad_norm": 0.4508886933326721, + "learning_rate": 0.00019674802715584252, + "loss": 1.204, + "step": 46960 + }, + { + "epoch": 0.09979396381433053, + "grad_norm": 0.570828378200531, + "learning_rate": 0.00019674630412124157, + "loss": 1.2406, + "step": 46970 + }, + { + "epoch": 0.09981521013406959, + "grad_norm": 0.5371390581130981, + "learning_rate": 0.00019674458063784154, + "loss": 1.2209, + "step": 46980 + }, + { + "epoch": 0.09983645645380865, + "grad_norm": 0.3550439476966858, + "learning_rate": 0.0001967428567056504, + "loss": 1.2317, + "step": 46990 + }, + { + "epoch": 0.09985770277354769, + "grad_norm": 0.31161198019981384, + "learning_rate": 0.0001967411323246762, + "loss": 1.2, + "step": 47000 + }, + { + "epoch": 0.09987894909328675, + "grad_norm": 0.41449078917503357, + "learning_rate": 0.0001967394074949269, + "loss": 1.1913, + "step": 47010 + }, + { + "epoch": 0.0999001954130258, + "grad_norm": 0.35602203011512756, + "learning_rate": 0.00019673768221641056, + "loss": 1.1734, + "step": 47020 + }, + { + "epoch": 0.09992144173276485, + "grad_norm": 0.4124370515346527, + "learning_rate": 0.0001967359564891351, + "loss": 1.2119, + "step": 47030 + }, + { + "epoch": 0.0999426880525039, + "grad_norm": 0.5796201825141907, + "learning_rate": 0.00019673423031310857, + "loss": 1.2276, + "step": 47040 + }, + { + "epoch": 0.09996393437224296, + "grad_norm": 0.5499954223632812, + "learning_rate": 0.00019673250368833898, + "loss": 1.2105, + "step": 47050 + }, + { + "epoch": 0.099985180691982, + "grad_norm": 0.5430469512939453, + "learning_rate": 0.00019673077661483434, + "loss": 1.1813, + "step": 47060 + }, + { + "epoch": 0.10000642701172106, + "grad_norm": 0.797410249710083, + "learning_rate": 0.00019672904909260263, + "loss": 1.2454, + "step": 47070 + }, + { + "epoch": 0.10002767333146012, + "grad_norm": 0.3466276526451111, + "learning_rate": 0.0001967273211216519, + "loss": 1.2437, + "step": 47080 + }, + { + "epoch": 0.10004891965119916, + "grad_norm": 0.3715229034423828, + "learning_rate": 0.00019672559270199016, + "loss": 1.1973, + "step": 47090 + }, + { + "epoch": 0.10007016597093822, + "grad_norm": 0.43381983041763306, + "learning_rate": 0.00019672386383362542, + "loss": 1.1855, + "step": 47100 + }, + { + "epoch": 0.10009141229067728, + "grad_norm": 0.4572511315345764, + "learning_rate": 0.0001967221345165657, + "loss": 1.2345, + "step": 47110 + }, + { + "epoch": 0.10011265861041632, + "grad_norm": 0.5357984900474548, + "learning_rate": 0.00019672040475081904, + "loss": 1.2198, + "step": 47120 + }, + { + "epoch": 0.10013390493015538, + "grad_norm": 0.35369300842285156, + "learning_rate": 0.0001967186745363934, + "loss": 1.2516, + "step": 47130 + }, + { + "epoch": 0.10015515124989444, + "grad_norm": 0.387882262468338, + "learning_rate": 0.00019671694387329692, + "loss": 1.2024, + "step": 47140 + }, + { + "epoch": 0.10017639756963348, + "grad_norm": 0.3984658122062683, + "learning_rate": 0.00019671521276153753, + "loss": 1.2147, + "step": 47150 + }, + { + "epoch": 0.10019764388937254, + "grad_norm": 0.3727385103702545, + "learning_rate": 0.0001967134812011233, + "loss": 1.1903, + "step": 47160 + }, + { + "epoch": 0.1002188902091116, + "grad_norm": 0.36478281021118164, + "learning_rate": 0.00019671174919206225, + "loss": 1.2403, + "step": 47170 + }, + { + "epoch": 0.10024013652885064, + "grad_norm": 0.3143441081047058, + "learning_rate": 0.00019671001673436244, + "loss": 1.2367, + "step": 47180 + }, + { + "epoch": 0.1002613828485897, + "grad_norm": 0.4175029993057251, + "learning_rate": 0.00019670828382803185, + "loss": 1.2518, + "step": 47190 + }, + { + "epoch": 0.10028262916832875, + "grad_norm": 0.3450263440608978, + "learning_rate": 0.0001967065504730786, + "loss": 1.2157, + "step": 47200 + }, + { + "epoch": 0.1003038754880678, + "grad_norm": 0.3282938599586487, + "learning_rate": 0.00019670481666951066, + "loss": 1.248, + "step": 47210 + }, + { + "epoch": 0.10032512180780685, + "grad_norm": 0.35797855257987976, + "learning_rate": 0.00019670308241733612, + "loss": 1.1933, + "step": 47220 + }, + { + "epoch": 0.10034636812754591, + "grad_norm": 0.320973664522171, + "learning_rate": 0.00019670134771656298, + "loss": 1.1995, + "step": 47230 + }, + { + "epoch": 0.10036761444728495, + "grad_norm": 0.374017596244812, + "learning_rate": 0.00019669961256719933, + "loss": 1.2301, + "step": 47240 + }, + { + "epoch": 0.10038886076702401, + "grad_norm": 0.32081523537635803, + "learning_rate": 0.0001966978769692532, + "loss": 1.2133, + "step": 47250 + }, + { + "epoch": 0.10041010708676307, + "grad_norm": 0.41415879130363464, + "learning_rate": 0.00019669614092273263, + "loss": 1.2306, + "step": 47260 + }, + { + "epoch": 0.10043135340650211, + "grad_norm": 0.3256511688232422, + "learning_rate": 0.0001966944044276457, + "loss": 1.2173, + "step": 47270 + }, + { + "epoch": 0.10045259972624117, + "grad_norm": 0.3910072445869446, + "learning_rate": 0.0001966926674840005, + "loss": 1.2105, + "step": 47280 + }, + { + "epoch": 0.10047384604598023, + "grad_norm": 0.3781132102012634, + "learning_rate": 0.00019669093009180497, + "loss": 1.2078, + "step": 47290 + }, + { + "epoch": 0.10049509236571928, + "grad_norm": 0.3605137765407562, + "learning_rate": 0.00019668919225106725, + "loss": 1.2144, + "step": 47300 + }, + { + "epoch": 0.10051633868545833, + "grad_norm": 0.4342609941959381, + "learning_rate": 0.00019668745396179542, + "loss": 1.258, + "step": 47310 + }, + { + "epoch": 0.10053758500519738, + "grad_norm": 0.44622305035591125, + "learning_rate": 0.0001966857152239975, + "loss": 1.1957, + "step": 47320 + }, + { + "epoch": 0.10055883132493644, + "grad_norm": 0.44434231519699097, + "learning_rate": 0.00019668397603768157, + "loss": 1.2795, + "step": 47330 + }, + { + "epoch": 0.10058007764467548, + "grad_norm": 0.6017922759056091, + "learning_rate": 0.00019668223640285568, + "loss": 1.23, + "step": 47340 + }, + { + "epoch": 0.10060132396441454, + "grad_norm": 0.41374126076698303, + "learning_rate": 0.00019668049631952795, + "loss": 1.2488, + "step": 47350 + }, + { + "epoch": 0.1006225702841536, + "grad_norm": 0.6315613389015198, + "learning_rate": 0.00019667875578770642, + "loss": 1.2179, + "step": 47360 + }, + { + "epoch": 0.10064381660389264, + "grad_norm": 0.3403237462043762, + "learning_rate": 0.00019667701480739915, + "loss": 1.221, + "step": 47370 + }, + { + "epoch": 0.1006650629236317, + "grad_norm": 0.34554827213287354, + "learning_rate": 0.00019667527337861423, + "loss": 1.2057, + "step": 47380 + }, + { + "epoch": 0.10068630924337076, + "grad_norm": 0.41538840532302856, + "learning_rate": 0.00019667353150135973, + "loss": 1.1591, + "step": 47390 + }, + { + "epoch": 0.1007075555631098, + "grad_norm": 0.36749470233917236, + "learning_rate": 0.00019667178917564377, + "loss": 1.2087, + "step": 47400 + }, + { + "epoch": 0.10072880188284886, + "grad_norm": 0.5106729865074158, + "learning_rate": 0.0001966700464014744, + "loss": 1.1981, + "step": 47410 + }, + { + "epoch": 0.10075004820258791, + "grad_norm": 0.6024112105369568, + "learning_rate": 0.0001966683031788597, + "loss": 1.2114, + "step": 47420 + }, + { + "epoch": 0.10077129452232696, + "grad_norm": 0.4752345085144043, + "learning_rate": 0.00019666655950780778, + "loss": 1.2232, + "step": 47430 + }, + { + "epoch": 0.10079254084206601, + "grad_norm": 0.3534458875656128, + "learning_rate": 0.0001966648153883267, + "loss": 1.2518, + "step": 47440 + }, + { + "epoch": 0.10081378716180507, + "grad_norm": 0.3367778956890106, + "learning_rate": 0.00019666307082042455, + "loss": 1.215, + "step": 47450 + }, + { + "epoch": 0.10083503348154411, + "grad_norm": 0.40491345524787903, + "learning_rate": 0.00019666132580410943, + "loss": 1.2206, + "step": 47460 + }, + { + "epoch": 0.10085627980128317, + "grad_norm": 0.36616525053977966, + "learning_rate": 0.00019665958033938946, + "loss": 1.2228, + "step": 47470 + }, + { + "epoch": 0.10087752612102223, + "grad_norm": 0.3798183500766754, + "learning_rate": 0.00019665783442627272, + "loss": 1.2467, + "step": 47480 + }, + { + "epoch": 0.10089877244076127, + "grad_norm": 0.45912858843803406, + "learning_rate": 0.0001966560880647673, + "loss": 1.2185, + "step": 47490 + }, + { + "epoch": 0.10092001876050033, + "grad_norm": 0.36277660727500916, + "learning_rate": 0.0001966543412548813, + "loss": 1.2118, + "step": 47500 + }, + { + "epoch": 0.10094126508023939, + "grad_norm": 0.36857956647872925, + "learning_rate": 0.00019665259399662287, + "loss": 1.2276, + "step": 47510 + }, + { + "epoch": 0.10096251139997843, + "grad_norm": 0.3474496901035309, + "learning_rate": 0.00019665084629000006, + "loss": 1.2293, + "step": 47520 + }, + { + "epoch": 0.10098375771971749, + "grad_norm": 0.3979780375957489, + "learning_rate": 0.000196649098135021, + "loss": 1.2106, + "step": 47530 + }, + { + "epoch": 0.10100500403945654, + "grad_norm": 0.359373539686203, + "learning_rate": 0.00019664734953169378, + "loss": 1.2173, + "step": 47540 + }, + { + "epoch": 0.10102625035919559, + "grad_norm": 0.4809451699256897, + "learning_rate": 0.00019664560048002653, + "loss": 1.2319, + "step": 47550 + }, + { + "epoch": 0.10104749667893465, + "grad_norm": 0.4961544871330261, + "learning_rate": 0.00019664385098002737, + "loss": 1.2361, + "step": 47560 + }, + { + "epoch": 0.1010687429986737, + "grad_norm": 0.6763209104537964, + "learning_rate": 0.00019664210103170437, + "loss": 1.1795, + "step": 47570 + }, + { + "epoch": 0.10108998931841275, + "grad_norm": 0.4066031277179718, + "learning_rate": 0.00019664035063506573, + "loss": 1.207, + "step": 47580 + }, + { + "epoch": 0.1011112356381518, + "grad_norm": 0.3322834372520447, + "learning_rate": 0.0001966385997901195, + "loss": 1.1942, + "step": 47590 + }, + { + "epoch": 0.10113248195789086, + "grad_norm": 0.3481619954109192, + "learning_rate": 0.00019663684849687382, + "loss": 1.2017, + "step": 47600 + }, + { + "epoch": 0.1011537282776299, + "grad_norm": 0.4125218689441681, + "learning_rate": 0.00019663509675533683, + "loss": 1.2501, + "step": 47610 + }, + { + "epoch": 0.10117497459736896, + "grad_norm": 0.5075086355209351, + "learning_rate": 0.00019663334456551668, + "loss": 1.1664, + "step": 47620 + }, + { + "epoch": 0.10119622091710802, + "grad_norm": 0.6339812278747559, + "learning_rate": 0.00019663159192742142, + "loss": 1.2096, + "step": 47630 + }, + { + "epoch": 0.10121746723684706, + "grad_norm": 0.4942723512649536, + "learning_rate": 0.00019662983884105923, + "loss": 1.1999, + "step": 47640 + }, + { + "epoch": 0.10123871355658612, + "grad_norm": 0.3872245252132416, + "learning_rate": 0.00019662808530643825, + "loss": 1.203, + "step": 47650 + }, + { + "epoch": 0.10125995987632518, + "grad_norm": 0.5187631845474243, + "learning_rate": 0.0001966263313235666, + "loss": 1.2056, + "step": 47660 + }, + { + "epoch": 0.10128120619606422, + "grad_norm": 0.36017441749572754, + "learning_rate": 0.00019662457689245243, + "loss": 1.1984, + "step": 47670 + }, + { + "epoch": 0.10130245251580328, + "grad_norm": 0.4632522463798523, + "learning_rate": 0.00019662282201310385, + "loss": 1.219, + "step": 47680 + }, + { + "epoch": 0.10132369883554233, + "grad_norm": 0.4609951078891754, + "learning_rate": 0.00019662106668552905, + "loss": 1.2215, + "step": 47690 + }, + { + "epoch": 0.10134494515528138, + "grad_norm": 0.3405320942401886, + "learning_rate": 0.00019661931090973612, + "loss": 1.2267, + "step": 47700 + }, + { + "epoch": 0.10136619147502043, + "grad_norm": 0.33285337686538696, + "learning_rate": 0.0001966175546857332, + "loss": 1.2065, + "step": 47710 + }, + { + "epoch": 0.10138743779475949, + "grad_norm": 0.4138963520526886, + "learning_rate": 0.00019661579801352847, + "loss": 1.2209, + "step": 47720 + }, + { + "epoch": 0.10140868411449855, + "grad_norm": 0.3720000684261322, + "learning_rate": 0.0001966140408931301, + "loss": 1.2283, + "step": 47730 + }, + { + "epoch": 0.10142993043423759, + "grad_norm": 0.3893931210041046, + "learning_rate": 0.0001966122833245462, + "loss": 1.1989, + "step": 47740 + }, + { + "epoch": 0.10145117675397665, + "grad_norm": 0.3449976146221161, + "learning_rate": 0.00019661052530778493, + "loss": 1.2599, + "step": 47750 + }, + { + "epoch": 0.1014724230737157, + "grad_norm": 0.35638314485549927, + "learning_rate": 0.00019660876684285446, + "loss": 1.2257, + "step": 47760 + }, + { + "epoch": 0.10149366939345475, + "grad_norm": 0.42815807461738586, + "learning_rate": 0.00019660700792976292, + "loss": 1.1904, + "step": 47770 + }, + { + "epoch": 0.1015149157131938, + "grad_norm": 0.377684623003006, + "learning_rate": 0.0001966052485685185, + "loss": 1.2436, + "step": 47780 + }, + { + "epoch": 0.10153616203293286, + "grad_norm": 0.37657928466796875, + "learning_rate": 0.00019660348875912936, + "loss": 1.237, + "step": 47790 + }, + { + "epoch": 0.10155740835267191, + "grad_norm": 0.39138466119766235, + "learning_rate": 0.00019660172850160366, + "loss": 1.2, + "step": 47800 + }, + { + "epoch": 0.10157865467241096, + "grad_norm": 0.35918205976486206, + "learning_rate": 0.00019659996779594955, + "loss": 1.1951, + "step": 47810 + }, + { + "epoch": 0.10159990099215002, + "grad_norm": 0.3330279290676117, + "learning_rate": 0.0001965982066421752, + "loss": 1.206, + "step": 47820 + }, + { + "epoch": 0.10162114731188907, + "grad_norm": 0.36255890130996704, + "learning_rate": 0.00019659644504028877, + "loss": 1.1975, + "step": 47830 + }, + { + "epoch": 0.10164239363162812, + "grad_norm": 0.5024212002754211, + "learning_rate": 0.00019659468299029848, + "loss": 1.1872, + "step": 47840 + }, + { + "epoch": 0.10166363995136718, + "grad_norm": 0.40898624062538147, + "learning_rate": 0.00019659292049221247, + "loss": 1.2063, + "step": 47850 + }, + { + "epoch": 0.10168488627110622, + "grad_norm": 0.4296853840351105, + "learning_rate": 0.0001965911575460389, + "loss": 1.1742, + "step": 47860 + }, + { + "epoch": 0.10170613259084528, + "grad_norm": 0.3734687864780426, + "learning_rate": 0.000196589394151786, + "loss": 1.2153, + "step": 47870 + }, + { + "epoch": 0.10172737891058434, + "grad_norm": 0.6376393437385559, + "learning_rate": 0.00019658763030946187, + "loss": 1.1949, + "step": 47880 + }, + { + "epoch": 0.10174862523032338, + "grad_norm": 0.5083280801773071, + "learning_rate": 0.00019658586601907477, + "loss": 1.1944, + "step": 47890 + }, + { + "epoch": 0.10176987155006244, + "grad_norm": 0.34257951378822327, + "learning_rate": 0.00019658410128063286, + "loss": 1.2008, + "step": 47900 + }, + { + "epoch": 0.1017911178698015, + "grad_norm": 0.3397018015384674, + "learning_rate": 0.0001965823360941443, + "loss": 1.2107, + "step": 47910 + }, + { + "epoch": 0.10181236418954054, + "grad_norm": 0.35894349217414856, + "learning_rate": 0.0001965805704596173, + "loss": 1.2366, + "step": 47920 + }, + { + "epoch": 0.1018336105092796, + "grad_norm": 0.3974301517009735, + "learning_rate": 0.0001965788043770601, + "loss": 1.1906, + "step": 47930 + }, + { + "epoch": 0.10185485682901865, + "grad_norm": 0.36691299080848694, + "learning_rate": 0.00019657703784648078, + "loss": 1.2391, + "step": 47940 + }, + { + "epoch": 0.1018761031487577, + "grad_norm": 0.3473052680492401, + "learning_rate": 0.00019657527086788763, + "loss": 1.1616, + "step": 47950 + }, + { + "epoch": 0.10189734946849675, + "grad_norm": 0.4181726574897766, + "learning_rate": 0.0001965735034412888, + "loss": 1.2331, + "step": 47960 + }, + { + "epoch": 0.10191859578823581, + "grad_norm": 0.3242053985595703, + "learning_rate": 0.0001965717355666925, + "loss": 1.2361, + "step": 47970 + }, + { + "epoch": 0.10193984210797485, + "grad_norm": 0.4321350157260895, + "learning_rate": 0.00019656996724410695, + "loss": 1.2211, + "step": 47980 + }, + { + "epoch": 0.10196108842771391, + "grad_norm": 0.4322989583015442, + "learning_rate": 0.00019656819847354033, + "loss": 1.2322, + "step": 47990 + }, + { + "epoch": 0.10198233474745297, + "grad_norm": 0.3331896960735321, + "learning_rate": 0.00019656642925500086, + "loss": 1.2089, + "step": 48000 + }, + { + "epoch": 0.10200358106719201, + "grad_norm": 0.3420170843601227, + "learning_rate": 0.00019656465958849673, + "loss": 1.1995, + "step": 48010 + }, + { + "epoch": 0.10202482738693107, + "grad_norm": 0.3301888406276703, + "learning_rate": 0.00019656288947403615, + "loss": 1.1811, + "step": 48020 + }, + { + "epoch": 0.10204607370667013, + "grad_norm": 0.3862003684043884, + "learning_rate": 0.00019656111891162736, + "loss": 1.2177, + "step": 48030 + }, + { + "epoch": 0.10206732002640917, + "grad_norm": 0.445232629776001, + "learning_rate": 0.00019655934790127857, + "loss": 1.2106, + "step": 48040 + }, + { + "epoch": 0.10208856634614823, + "grad_norm": 0.3201339840888977, + "learning_rate": 0.00019655757644299792, + "loss": 1.249, + "step": 48050 + }, + { + "epoch": 0.10210981266588728, + "grad_norm": 0.7513635754585266, + "learning_rate": 0.00019655580453679374, + "loss": 1.2523, + "step": 48060 + }, + { + "epoch": 0.10213105898562633, + "grad_norm": 0.615537703037262, + "learning_rate": 0.00019655403218267418, + "loss": 1.2062, + "step": 48070 + }, + { + "epoch": 0.10215230530536538, + "grad_norm": 0.33056312799453735, + "learning_rate": 0.00019655225938064748, + "loss": 1.2065, + "step": 48080 + }, + { + "epoch": 0.10217355162510444, + "grad_norm": 0.32831212878227234, + "learning_rate": 0.00019655048613072187, + "loss": 1.2262, + "step": 48090 + }, + { + "epoch": 0.10219479794484349, + "grad_norm": 0.7065915465354919, + "learning_rate": 0.00019654871243290556, + "loss": 1.2246, + "step": 48100 + }, + { + "epoch": 0.10221604426458254, + "grad_norm": 0.5033416152000427, + "learning_rate": 0.0001965469382872068, + "loss": 1.2184, + "step": 48110 + }, + { + "epoch": 0.1022372905843216, + "grad_norm": 0.3772578239440918, + "learning_rate": 0.00019654516369363377, + "loss": 1.2546, + "step": 48120 + }, + { + "epoch": 0.10225853690406064, + "grad_norm": 0.3230730891227722, + "learning_rate": 0.0001965433886521948, + "loss": 1.1945, + "step": 48130 + }, + { + "epoch": 0.1022797832237997, + "grad_norm": 0.35091185569763184, + "learning_rate": 0.000196541613162898, + "loss": 1.2217, + "step": 48140 + }, + { + "epoch": 0.10230102954353876, + "grad_norm": 0.35029226541519165, + "learning_rate": 0.00019653983722575172, + "loss": 1.2092, + "step": 48150 + }, + { + "epoch": 0.10232227586327781, + "grad_norm": 0.334978848695755, + "learning_rate": 0.00019653806084076413, + "loss": 1.2197, + "step": 48160 + }, + { + "epoch": 0.10234352218301686, + "grad_norm": 0.40076759457588196, + "learning_rate": 0.0001965362840079435, + "loss": 1.2119, + "step": 48170 + }, + { + "epoch": 0.10236476850275592, + "grad_norm": 0.34387528896331787, + "learning_rate": 0.00019653450672729803, + "loss": 1.2119, + "step": 48180 + }, + { + "epoch": 0.10238601482249497, + "grad_norm": 0.4111328721046448, + "learning_rate": 0.00019653272899883604, + "loss": 1.2409, + "step": 48190 + }, + { + "epoch": 0.10240726114223402, + "grad_norm": 0.4903002977371216, + "learning_rate": 0.0001965309508225657, + "loss": 1.2485, + "step": 48200 + }, + { + "epoch": 0.10242850746197307, + "grad_norm": 0.314145028591156, + "learning_rate": 0.0001965291721984953, + "loss": 1.2149, + "step": 48210 + }, + { + "epoch": 0.10244975378171213, + "grad_norm": 0.3595901131629944, + "learning_rate": 0.00019652739312663308, + "loss": 1.21, + "step": 48220 + }, + { + "epoch": 0.10247100010145117, + "grad_norm": 0.3321797847747803, + "learning_rate": 0.0001965256136069873, + "loss": 1.2196, + "step": 48230 + }, + { + "epoch": 0.10249224642119023, + "grad_norm": 0.33693525195121765, + "learning_rate": 0.00019652383363956622, + "loss": 1.2253, + "step": 48240 + }, + { + "epoch": 0.10251349274092929, + "grad_norm": 0.3096349239349365, + "learning_rate": 0.00019652205322437806, + "loss": 1.1967, + "step": 48250 + }, + { + "epoch": 0.10253473906066833, + "grad_norm": 0.3946145176887512, + "learning_rate": 0.00019652027236143115, + "loss": 1.2272, + "step": 48260 + }, + { + "epoch": 0.10255598538040739, + "grad_norm": 0.38023748993873596, + "learning_rate": 0.00019651849105073368, + "loss": 1.2165, + "step": 48270 + }, + { + "epoch": 0.10257723170014645, + "grad_norm": 0.373943030834198, + "learning_rate": 0.00019651670929229394, + "loss": 1.2349, + "step": 48280 + }, + { + "epoch": 0.10259847801988549, + "grad_norm": 0.3774549663066864, + "learning_rate": 0.0001965149270861202, + "loss": 1.1963, + "step": 48290 + }, + { + "epoch": 0.10261972433962455, + "grad_norm": 0.5696398019790649, + "learning_rate": 0.00019651314443222073, + "loss": 1.1868, + "step": 48300 + }, + { + "epoch": 0.1026409706593636, + "grad_norm": 0.34662413597106934, + "learning_rate": 0.0001965113613306038, + "loss": 1.2129, + "step": 48310 + }, + { + "epoch": 0.10266221697910265, + "grad_norm": 0.5489829182624817, + "learning_rate": 0.00019650957778127767, + "loss": 1.2043, + "step": 48320 + }, + { + "epoch": 0.1026834632988417, + "grad_norm": 0.38388291001319885, + "learning_rate": 0.0001965077937842506, + "loss": 1.1903, + "step": 48330 + }, + { + "epoch": 0.10270470961858076, + "grad_norm": 0.37260231375694275, + "learning_rate": 0.0001965060093395309, + "loss": 1.2201, + "step": 48340 + }, + { + "epoch": 0.1027259559383198, + "grad_norm": 0.5008670687675476, + "learning_rate": 0.00019650422444712686, + "loss": 1.1992, + "step": 48350 + }, + { + "epoch": 0.10274720225805886, + "grad_norm": 0.3467923104763031, + "learning_rate": 0.00019650243910704671, + "loss": 1.1998, + "step": 48360 + }, + { + "epoch": 0.10276844857779792, + "grad_norm": 0.4176049530506134, + "learning_rate": 0.00019650065331929873, + "loss": 1.2194, + "step": 48370 + }, + { + "epoch": 0.10278969489753696, + "grad_norm": 0.5909037590026855, + "learning_rate": 0.0001964988670838913, + "loss": 1.2097, + "step": 48380 + }, + { + "epoch": 0.10281094121727602, + "grad_norm": 0.43972349166870117, + "learning_rate": 0.00019649708040083257, + "loss": 1.2011, + "step": 48390 + }, + { + "epoch": 0.10283218753701508, + "grad_norm": 0.4029296040534973, + "learning_rate": 0.00019649529327013093, + "loss": 1.21, + "step": 48400 + }, + { + "epoch": 0.10285343385675412, + "grad_norm": 0.35158953070640564, + "learning_rate": 0.00019649350569179462, + "loss": 1.2135, + "step": 48410 + }, + { + "epoch": 0.10287468017649318, + "grad_norm": 0.3694283664226532, + "learning_rate": 0.00019649171766583195, + "loss": 1.1985, + "step": 48420 + }, + { + "epoch": 0.10289592649623223, + "grad_norm": 0.35094472765922546, + "learning_rate": 0.0001964899291922512, + "loss": 1.1711, + "step": 48430 + }, + { + "epoch": 0.10291717281597128, + "grad_norm": 0.3441838026046753, + "learning_rate": 0.0001964881402710607, + "loss": 1.2305, + "step": 48440 + }, + { + "epoch": 0.10293841913571034, + "grad_norm": 0.5619609355926514, + "learning_rate": 0.0001964863509022687, + "loss": 1.1784, + "step": 48450 + }, + { + "epoch": 0.10295966545544939, + "grad_norm": 0.5770743489265442, + "learning_rate": 0.00019648456108588356, + "loss": 1.1893, + "step": 48460 + }, + { + "epoch": 0.10298091177518844, + "grad_norm": 0.48250699043273926, + "learning_rate": 0.0001964827708219135, + "loss": 1.2358, + "step": 48470 + }, + { + "epoch": 0.10300215809492749, + "grad_norm": 0.34154200553894043, + "learning_rate": 0.00019648098011036693, + "loss": 1.2342, + "step": 48480 + }, + { + "epoch": 0.10302340441466655, + "grad_norm": 0.3810344338417053, + "learning_rate": 0.0001964791889512521, + "loss": 1.2617, + "step": 48490 + }, + { + "epoch": 0.1030446507344056, + "grad_norm": 0.34740081429481506, + "learning_rate": 0.0001964773973445773, + "loss": 1.2818, + "step": 48500 + }, + { + "epoch": 0.10306589705414465, + "grad_norm": 0.32147085666656494, + "learning_rate": 0.00019647560529035085, + "loss": 1.2257, + "step": 48510 + }, + { + "epoch": 0.10308714337388371, + "grad_norm": 0.35816630721092224, + "learning_rate": 0.0001964738127885811, + "loss": 1.2471, + "step": 48520 + }, + { + "epoch": 0.10310838969362275, + "grad_norm": 0.3613907992839813, + "learning_rate": 0.00019647201983927633, + "loss": 1.2518, + "step": 48530 + }, + { + "epoch": 0.10312963601336181, + "grad_norm": 0.3333801031112671, + "learning_rate": 0.00019647022644244487, + "loss": 1.2161, + "step": 48540 + }, + { + "epoch": 0.10315088233310087, + "grad_norm": 0.44253009557724, + "learning_rate": 0.00019646843259809503, + "loss": 1.2038, + "step": 48550 + }, + { + "epoch": 0.10317212865283991, + "grad_norm": 0.6138597726821899, + "learning_rate": 0.00019646663830623516, + "loss": 1.2156, + "step": 48560 + }, + { + "epoch": 0.10319337497257897, + "grad_norm": 0.339587926864624, + "learning_rate": 0.00019646484356687353, + "loss": 1.1913, + "step": 48570 + }, + { + "epoch": 0.10321462129231802, + "grad_norm": 0.3689868450164795, + "learning_rate": 0.00019646304838001854, + "loss": 1.2121, + "step": 48580 + }, + { + "epoch": 0.10323586761205708, + "grad_norm": 0.3965214192867279, + "learning_rate": 0.00019646125274567846, + "loss": 1.2182, + "step": 48590 + }, + { + "epoch": 0.10325711393179612, + "grad_norm": 0.38398048281669617, + "learning_rate": 0.00019645945666386163, + "loss": 1.2111, + "step": 48600 + }, + { + "epoch": 0.10327836025153518, + "grad_norm": 0.3417658507823944, + "learning_rate": 0.00019645766013457638, + "loss": 1.2273, + "step": 48610 + }, + { + "epoch": 0.10329960657127424, + "grad_norm": 0.36285531520843506, + "learning_rate": 0.00019645586315783107, + "loss": 1.2402, + "step": 48620 + }, + { + "epoch": 0.10332085289101328, + "grad_norm": 0.38046619296073914, + "learning_rate": 0.00019645406573363402, + "loss": 1.2093, + "step": 48630 + }, + { + "epoch": 0.10334209921075234, + "grad_norm": 0.4290890097618103, + "learning_rate": 0.00019645226786199355, + "loss": 1.2493, + "step": 48640 + }, + { + "epoch": 0.1033633455304914, + "grad_norm": 0.36582043766975403, + "learning_rate": 0.00019645046954291803, + "loss": 1.1966, + "step": 48650 + }, + { + "epoch": 0.10338459185023044, + "grad_norm": 0.45139458775520325, + "learning_rate": 0.00019644867077641575, + "loss": 1.1808, + "step": 48660 + }, + { + "epoch": 0.1034058381699695, + "grad_norm": 0.38276681303977966, + "learning_rate": 0.00019644687156249516, + "loss": 1.2334, + "step": 48670 + }, + { + "epoch": 0.10342708448970855, + "grad_norm": 0.3232603371143341, + "learning_rate": 0.00019644507190116447, + "loss": 1.2037, + "step": 48680 + }, + { + "epoch": 0.1034483308094476, + "grad_norm": 0.3846132457256317, + "learning_rate": 0.00019644327179243215, + "loss": 1.2762, + "step": 48690 + }, + { + "epoch": 0.10346957712918665, + "grad_norm": 0.5490344166755676, + "learning_rate": 0.00019644147123630649, + "loss": 1.1967, + "step": 48700 + }, + { + "epoch": 0.10349082344892571, + "grad_norm": 0.3544943034648895, + "learning_rate": 0.00019643967023279582, + "loss": 1.2548, + "step": 48710 + }, + { + "epoch": 0.10351206976866476, + "grad_norm": 0.3507348895072937, + "learning_rate": 0.00019643786878190857, + "loss": 1.2261, + "step": 48720 + }, + { + "epoch": 0.10353331608840381, + "grad_norm": 0.3677404224872589, + "learning_rate": 0.00019643606688365305, + "loss": 1.1976, + "step": 48730 + }, + { + "epoch": 0.10355456240814287, + "grad_norm": 0.3524252772331238, + "learning_rate": 0.00019643426453803758, + "loss": 1.2113, + "step": 48740 + }, + { + "epoch": 0.10357580872788191, + "grad_norm": 0.5251856446266174, + "learning_rate": 0.00019643246174507058, + "loss": 1.2248, + "step": 48750 + }, + { + "epoch": 0.10359705504762097, + "grad_norm": 0.3574824631214142, + "learning_rate": 0.0001964306585047604, + "loss": 1.217, + "step": 48760 + }, + { + "epoch": 0.10361830136736003, + "grad_norm": 0.6974388360977173, + "learning_rate": 0.0001964288548171154, + "loss": 1.2263, + "step": 48770 + }, + { + "epoch": 0.10363954768709907, + "grad_norm": 0.7099800109863281, + "learning_rate": 0.00019642705068214394, + "loss": 1.2102, + "step": 48780 + }, + { + "epoch": 0.10366079400683813, + "grad_norm": 0.4835274815559387, + "learning_rate": 0.00019642524609985443, + "loss": 1.2445, + "step": 48790 + }, + { + "epoch": 0.10368204032657719, + "grad_norm": 0.39699265360832214, + "learning_rate": 0.00019642344107025518, + "loss": 1.2536, + "step": 48800 + }, + { + "epoch": 0.10370328664631623, + "grad_norm": 0.4531601071357727, + "learning_rate": 0.00019642163559335457, + "loss": 1.2411, + "step": 48810 + }, + { + "epoch": 0.10372453296605529, + "grad_norm": 0.37252965569496155, + "learning_rate": 0.00019641982966916104, + "loss": 1.2056, + "step": 48820 + }, + { + "epoch": 0.10374577928579434, + "grad_norm": 0.33982911705970764, + "learning_rate": 0.00019641802329768292, + "loss": 1.2405, + "step": 48830 + }, + { + "epoch": 0.10376702560553339, + "grad_norm": 0.4012894630432129, + "learning_rate": 0.00019641621647892857, + "loss": 1.2437, + "step": 48840 + }, + { + "epoch": 0.10378827192527244, + "grad_norm": 0.5908187627792358, + "learning_rate": 0.0001964144092129064, + "loss": 1.2049, + "step": 48850 + }, + { + "epoch": 0.1038095182450115, + "grad_norm": 0.33937618136405945, + "learning_rate": 0.00019641260149962482, + "loss": 1.208, + "step": 48860 + }, + { + "epoch": 0.10383076456475054, + "grad_norm": 0.37721726298332214, + "learning_rate": 0.00019641079333909214, + "loss": 1.214, + "step": 48870 + }, + { + "epoch": 0.1038520108844896, + "grad_norm": 0.3510363698005676, + "learning_rate": 0.00019640898473131683, + "loss": 1.2133, + "step": 48880 + }, + { + "epoch": 0.10387325720422866, + "grad_norm": 0.49990975856781006, + "learning_rate": 0.00019640717567630725, + "loss": 1.2027, + "step": 48890 + }, + { + "epoch": 0.1038945035239677, + "grad_norm": 0.34709614515304565, + "learning_rate": 0.00019640536617407175, + "loss": 1.2299, + "step": 48900 + }, + { + "epoch": 0.10391574984370676, + "grad_norm": 0.3705205023288727, + "learning_rate": 0.0001964035562246188, + "loss": 1.2422, + "step": 48910 + }, + { + "epoch": 0.10393699616344582, + "grad_norm": 0.3920150101184845, + "learning_rate": 0.0001964017458279567, + "loss": 1.1692, + "step": 48920 + }, + { + "epoch": 0.10395824248318486, + "grad_norm": 0.6835892796516418, + "learning_rate": 0.00019639993498409396, + "loss": 1.1966, + "step": 48930 + }, + { + "epoch": 0.10397948880292392, + "grad_norm": 0.44015640020370483, + "learning_rate": 0.00019639812369303888, + "loss": 1.2021, + "step": 48940 + }, + { + "epoch": 0.10400073512266297, + "grad_norm": 0.42829760909080505, + "learning_rate": 0.00019639631195479993, + "loss": 1.1993, + "step": 48950 + }, + { + "epoch": 0.10402198144240202, + "grad_norm": 0.34014350175857544, + "learning_rate": 0.00019639449976938548, + "loss": 1.2294, + "step": 48960 + }, + { + "epoch": 0.10404322776214107, + "grad_norm": 0.3611302971839905, + "learning_rate": 0.00019639268713680399, + "loss": 1.2404, + "step": 48970 + }, + { + "epoch": 0.10406447408188013, + "grad_norm": 0.38533270359039307, + "learning_rate": 0.00019639087405706377, + "loss": 1.2062, + "step": 48980 + }, + { + "epoch": 0.10408572040161919, + "grad_norm": 0.4386252462863922, + "learning_rate": 0.00019638906053017334, + "loss": 1.1923, + "step": 48990 + }, + { + "epoch": 0.10410696672135823, + "grad_norm": 0.4640715420246124, + "learning_rate": 0.00019638724655614102, + "loss": 1.2493, + "step": 49000 + }, + { + "epoch": 0.10412821304109729, + "grad_norm": 0.33997857570648193, + "learning_rate": 0.0001963854321349753, + "loss": 1.2074, + "step": 49010 + }, + { + "epoch": 0.10414945936083635, + "grad_norm": 0.3555375635623932, + "learning_rate": 0.00019638361726668455, + "loss": 1.2205, + "step": 49020 + }, + { + "epoch": 0.10417070568057539, + "grad_norm": 0.3658335506916046, + "learning_rate": 0.00019638180195127717, + "loss": 1.2501, + "step": 49030 + }, + { + "epoch": 0.10419195200031445, + "grad_norm": 0.37624308466911316, + "learning_rate": 0.00019637998618876165, + "loss": 1.2329, + "step": 49040 + }, + { + "epoch": 0.1042131983200535, + "grad_norm": 0.48098015785217285, + "learning_rate": 0.00019637816997914635, + "loss": 1.2278, + "step": 49050 + }, + { + "epoch": 0.10423444463979255, + "grad_norm": 0.3825477063655853, + "learning_rate": 0.00019637635332243975, + "loss": 1.2202, + "step": 49060 + }, + { + "epoch": 0.1042556909595316, + "grad_norm": 0.48989060521125793, + "learning_rate": 0.00019637453621865024, + "loss": 1.1961, + "step": 49070 + }, + { + "epoch": 0.10427693727927066, + "grad_norm": 0.33028289675712585, + "learning_rate": 0.00019637271866778625, + "loss": 1.2479, + "step": 49080 + }, + { + "epoch": 0.1042981835990097, + "grad_norm": 0.4240339398384094, + "learning_rate": 0.0001963709006698562, + "loss": 1.208, + "step": 49090 + }, + { + "epoch": 0.10431942991874876, + "grad_norm": 0.36732909083366394, + "learning_rate": 0.00019636908222486857, + "loss": 1.2224, + "step": 49100 + }, + { + "epoch": 0.10434067623848782, + "grad_norm": 0.41057220101356506, + "learning_rate": 0.00019636726333283178, + "loss": 1.226, + "step": 49110 + }, + { + "epoch": 0.10436192255822686, + "grad_norm": 0.34632813930511475, + "learning_rate": 0.00019636544399375422, + "loss": 1.2359, + "step": 49120 + }, + { + "epoch": 0.10438316887796592, + "grad_norm": 0.37212538719177246, + "learning_rate": 0.0001963636242076444, + "loss": 1.2001, + "step": 49130 + }, + { + "epoch": 0.10440441519770498, + "grad_norm": 0.42839178442955017, + "learning_rate": 0.0001963618039745107, + "loss": 1.2024, + "step": 49140 + }, + { + "epoch": 0.10442566151744402, + "grad_norm": 0.4491060972213745, + "learning_rate": 0.00019635998329436163, + "loss": 1.2006, + "step": 49150 + }, + { + "epoch": 0.10444690783718308, + "grad_norm": 0.598915696144104, + "learning_rate": 0.00019635816216720555, + "loss": 1.1892, + "step": 49160 + }, + { + "epoch": 0.10446815415692214, + "grad_norm": 0.6885921955108643, + "learning_rate": 0.00019635634059305098, + "loss": 1.1806, + "step": 49170 + }, + { + "epoch": 0.10448940047666118, + "grad_norm": 0.5685209631919861, + "learning_rate": 0.00019635451857190635, + "loss": 1.2102, + "step": 49180 + }, + { + "epoch": 0.10451064679640024, + "grad_norm": 0.3335922658443451, + "learning_rate": 0.0001963526961037801, + "loss": 1.2336, + "step": 49190 + }, + { + "epoch": 0.1045318931161393, + "grad_norm": 0.358547180891037, + "learning_rate": 0.00019635087318868068, + "loss": 1.1898, + "step": 49200 + }, + { + "epoch": 0.10455313943587834, + "grad_norm": 0.3676273226737976, + "learning_rate": 0.0001963490498266166, + "loss": 1.2194, + "step": 49210 + }, + { + "epoch": 0.1045743857556174, + "grad_norm": 0.3550269603729248, + "learning_rate": 0.00019634722601759622, + "loss": 1.1803, + "step": 49220 + }, + { + "epoch": 0.10459563207535645, + "grad_norm": 0.43851345777511597, + "learning_rate": 0.0001963454017616281, + "loss": 1.207, + "step": 49230 + }, + { + "epoch": 0.1046168783950955, + "grad_norm": 0.35272935032844543, + "learning_rate": 0.00019634357705872066, + "loss": 1.168, + "step": 49240 + }, + { + "epoch": 0.10463812471483455, + "grad_norm": 0.3629777729511261, + "learning_rate": 0.00019634175190888235, + "loss": 1.214, + "step": 49250 + }, + { + "epoch": 0.10465937103457361, + "grad_norm": 0.38238728046417236, + "learning_rate": 0.00019633992631212167, + "loss": 1.2072, + "step": 49260 + }, + { + "epoch": 0.10468061735431265, + "grad_norm": 0.37899723649024963, + "learning_rate": 0.00019633810026844706, + "loss": 1.185, + "step": 49270 + }, + { + "epoch": 0.10470186367405171, + "grad_norm": 0.3751921057701111, + "learning_rate": 0.000196336273777867, + "loss": 1.1966, + "step": 49280 + }, + { + "epoch": 0.10472310999379077, + "grad_norm": 0.4161316156387329, + "learning_rate": 0.00019633444684038996, + "loss": 1.211, + "step": 49290 + }, + { + "epoch": 0.10474435631352981, + "grad_norm": 0.43478575348854065, + "learning_rate": 0.0001963326194560244, + "loss": 1.2518, + "step": 49300 + }, + { + "epoch": 0.10476560263326887, + "grad_norm": 0.5150311589241028, + "learning_rate": 0.00019633079162477884, + "loss": 1.2016, + "step": 49310 + }, + { + "epoch": 0.10478684895300792, + "grad_norm": 0.7001569867134094, + "learning_rate": 0.00019632896334666175, + "loss": 1.2557, + "step": 49320 + }, + { + "epoch": 0.10480809527274697, + "grad_norm": 0.7741338610649109, + "learning_rate": 0.00019632713462168158, + "loss": 1.232, + "step": 49330 + }, + { + "epoch": 0.10482934159248603, + "grad_norm": 0.3747975528240204, + "learning_rate": 0.00019632530544984685, + "loss": 1.254, + "step": 49340 + }, + { + "epoch": 0.10485058791222508, + "grad_norm": 0.3803033232688904, + "learning_rate": 0.00019632347583116598, + "loss": 1.2477, + "step": 49350 + }, + { + "epoch": 0.10487183423196413, + "grad_norm": 0.5018254518508911, + "learning_rate": 0.00019632164576564753, + "loss": 1.1836, + "step": 49360 + }, + { + "epoch": 0.10489308055170318, + "grad_norm": 0.4457484483718872, + "learning_rate": 0.00019631981525329998, + "loss": 1.2535, + "step": 49370 + }, + { + "epoch": 0.10491432687144224, + "grad_norm": 0.37294939160346985, + "learning_rate": 0.0001963179842941318, + "loss": 1.2122, + "step": 49380 + }, + { + "epoch": 0.10493557319118128, + "grad_norm": 0.3454376757144928, + "learning_rate": 0.00019631615288815145, + "loss": 1.2201, + "step": 49390 + }, + { + "epoch": 0.10495681951092034, + "grad_norm": 0.37626415491104126, + "learning_rate": 0.0001963143210353675, + "loss": 1.2239, + "step": 49400 + }, + { + "epoch": 0.1049780658306594, + "grad_norm": 0.4095008373260498, + "learning_rate": 0.0001963124887357884, + "loss": 1.1892, + "step": 49410 + }, + { + "epoch": 0.10499931215039846, + "grad_norm": 0.7789289951324463, + "learning_rate": 0.00019631065598942266, + "loss": 1.2403, + "step": 49420 + }, + { + "epoch": 0.1050205584701375, + "grad_norm": 0.6072856187820435, + "learning_rate": 0.00019630882279627877, + "loss": 1.1742, + "step": 49430 + }, + { + "epoch": 0.10504180478987656, + "grad_norm": 0.5576834082603455, + "learning_rate": 0.00019630698915636528, + "loss": 1.2259, + "step": 49440 + }, + { + "epoch": 0.10506305110961561, + "grad_norm": 0.40368035435676575, + "learning_rate": 0.00019630515506969065, + "loss": 1.2076, + "step": 49450 + }, + { + "epoch": 0.10508429742935466, + "grad_norm": 0.4028623104095459, + "learning_rate": 0.0001963033205362634, + "loss": 1.2093, + "step": 49460 + }, + { + "epoch": 0.10510554374909371, + "grad_norm": 0.33380210399627686, + "learning_rate": 0.00019630148555609207, + "loss": 1.208, + "step": 49470 + }, + { + "epoch": 0.10512679006883277, + "grad_norm": 0.32898321747779846, + "learning_rate": 0.00019629965012918512, + "loss": 1.1565, + "step": 49480 + }, + { + "epoch": 0.10514803638857181, + "grad_norm": 0.35038453340530396, + "learning_rate": 0.00019629781425555107, + "loss": 1.1927, + "step": 49490 + }, + { + "epoch": 0.10516928270831087, + "grad_norm": 0.3244130611419678, + "learning_rate": 0.00019629597793519848, + "loss": 1.2234, + "step": 49500 + }, + { + "epoch": 0.10519052902804993, + "grad_norm": 0.40882760286331177, + "learning_rate": 0.00019629414116813582, + "loss": 1.1996, + "step": 49510 + }, + { + "epoch": 0.10521177534778897, + "grad_norm": 0.5675496459007263, + "learning_rate": 0.00019629230395437167, + "loss": 1.1915, + "step": 49520 + }, + { + "epoch": 0.10523302166752803, + "grad_norm": 0.4142972528934479, + "learning_rate": 0.0001962904662939145, + "loss": 1.2018, + "step": 49530 + }, + { + "epoch": 0.10525426798726709, + "grad_norm": 0.33974066376686096, + "learning_rate": 0.00019628862818677284, + "loss": 1.2202, + "step": 49540 + }, + { + "epoch": 0.10527551430700613, + "grad_norm": 0.36482489109039307, + "learning_rate": 0.00019628678963295527, + "loss": 1.1772, + "step": 49550 + }, + { + "epoch": 0.10529676062674519, + "grad_norm": 0.42000535130500793, + "learning_rate": 0.00019628495063247022, + "loss": 1.1894, + "step": 49560 + }, + { + "epoch": 0.10531800694648424, + "grad_norm": 0.36430925130844116, + "learning_rate": 0.00019628311118532633, + "loss": 1.1618, + "step": 49570 + }, + { + "epoch": 0.10533925326622329, + "grad_norm": 0.35896432399749756, + "learning_rate": 0.00019628127129153202, + "loss": 1.2477, + "step": 49580 + }, + { + "epoch": 0.10536049958596234, + "grad_norm": 0.3546282649040222, + "learning_rate": 0.00019627943095109593, + "loss": 1.2412, + "step": 49590 + }, + { + "epoch": 0.1053817459057014, + "grad_norm": 0.32601800560951233, + "learning_rate": 0.00019627759016402655, + "loss": 1.2051, + "step": 49600 + }, + { + "epoch": 0.10540299222544044, + "grad_norm": 0.3465915024280548, + "learning_rate": 0.00019627574893033245, + "loss": 1.2003, + "step": 49610 + }, + { + "epoch": 0.1054242385451795, + "grad_norm": 0.5833266973495483, + "learning_rate": 0.0001962739072500221, + "loss": 1.2219, + "step": 49620 + }, + { + "epoch": 0.10544548486491856, + "grad_norm": 0.36263468861579895, + "learning_rate": 0.00019627206512310411, + "loss": 1.2224, + "step": 49630 + }, + { + "epoch": 0.1054667311846576, + "grad_norm": 0.3585442006587982, + "learning_rate": 0.000196270222549587, + "loss": 1.2122, + "step": 49640 + }, + { + "epoch": 0.10548797750439666, + "grad_norm": 0.37385112047195435, + "learning_rate": 0.0001962683795294793, + "loss": 1.2063, + "step": 49650 + }, + { + "epoch": 0.10550922382413572, + "grad_norm": 0.40724948048591614, + "learning_rate": 0.00019626653606278965, + "loss": 1.2523, + "step": 49660 + }, + { + "epoch": 0.10553047014387476, + "grad_norm": 0.38892704248428345, + "learning_rate": 0.00019626469214952644, + "loss": 1.2031, + "step": 49670 + }, + { + "epoch": 0.10555171646361382, + "grad_norm": 0.3643975555896759, + "learning_rate": 0.0001962628477896984, + "loss": 1.1918, + "step": 49680 + }, + { + "epoch": 0.10557296278335287, + "grad_norm": 0.46298348903656006, + "learning_rate": 0.00019626100298331394, + "loss": 1.1989, + "step": 49690 + }, + { + "epoch": 0.10559420910309192, + "grad_norm": 0.391830176115036, + "learning_rate": 0.0001962591577303817, + "loss": 1.2071, + "step": 49700 + }, + { + "epoch": 0.10561545542283098, + "grad_norm": 0.3155232071876526, + "learning_rate": 0.00019625731203091023, + "loss": 1.2034, + "step": 49710 + }, + { + "epoch": 0.10563670174257003, + "grad_norm": 0.38742274045944214, + "learning_rate": 0.0001962554658849081, + "loss": 1.2148, + "step": 49720 + }, + { + "epoch": 0.10565794806230908, + "grad_norm": 0.3867049813270569, + "learning_rate": 0.0001962536192923838, + "loss": 1.2004, + "step": 49730 + }, + { + "epoch": 0.10567919438204813, + "grad_norm": 0.43980875611305237, + "learning_rate": 0.000196251772253346, + "loss": 1.2143, + "step": 49740 + }, + { + "epoch": 0.10570044070178719, + "grad_norm": 0.4893444776535034, + "learning_rate": 0.00019624992476780323, + "loss": 1.2148, + "step": 49750 + }, + { + "epoch": 0.10572168702152623, + "grad_norm": 0.3646671772003174, + "learning_rate": 0.000196248076835764, + "loss": 1.2388, + "step": 49760 + }, + { + "epoch": 0.10574293334126529, + "grad_norm": 0.514509379863739, + "learning_rate": 0.00019624622845723698, + "loss": 1.2054, + "step": 49770 + }, + { + "epoch": 0.10576417966100435, + "grad_norm": 0.3261631429195404, + "learning_rate": 0.00019624437963223067, + "loss": 1.1982, + "step": 49780 + }, + { + "epoch": 0.10578542598074339, + "grad_norm": 0.32661008834838867, + "learning_rate": 0.0001962425303607537, + "loss": 1.2096, + "step": 49790 + }, + { + "epoch": 0.10580667230048245, + "grad_norm": 0.32326897978782654, + "learning_rate": 0.0001962406806428146, + "loss": 1.2343, + "step": 49800 + }, + { + "epoch": 0.1058279186202215, + "grad_norm": 0.34138399362564087, + "learning_rate": 0.000196238830478422, + "loss": 1.1959, + "step": 49810 + }, + { + "epoch": 0.10584916493996055, + "grad_norm": 0.44784674048423767, + "learning_rate": 0.00019623697986758442, + "loss": 1.2279, + "step": 49820 + }, + { + "epoch": 0.1058704112596996, + "grad_norm": 0.34585702419281006, + "learning_rate": 0.0001962351288103105, + "loss": 1.2228, + "step": 49830 + }, + { + "epoch": 0.10589165757943866, + "grad_norm": 0.4148368835449219, + "learning_rate": 0.0001962332773066088, + "loss": 1.2001, + "step": 49840 + }, + { + "epoch": 0.10591290389917772, + "grad_norm": 0.4006091356277466, + "learning_rate": 0.00019623142535648796, + "loss": 1.2617, + "step": 49850 + }, + { + "epoch": 0.10593415021891676, + "grad_norm": 0.36360520124435425, + "learning_rate": 0.0001962295729599565, + "loss": 1.2593, + "step": 49860 + }, + { + "epoch": 0.10595539653865582, + "grad_norm": 0.3777841329574585, + "learning_rate": 0.00019622772011702302, + "loss": 1.1962, + "step": 49870 + }, + { + "epoch": 0.10597664285839488, + "grad_norm": 0.3743411600589752, + "learning_rate": 0.00019622586682769617, + "loss": 1.1932, + "step": 49880 + }, + { + "epoch": 0.10599788917813392, + "grad_norm": 0.33333516120910645, + "learning_rate": 0.00019622401309198447, + "loss": 1.2184, + "step": 49890 + }, + { + "epoch": 0.10601913549787298, + "grad_norm": 0.374613493680954, + "learning_rate": 0.00019622215890989663, + "loss": 1.2116, + "step": 49900 + }, + { + "epoch": 0.10604038181761204, + "grad_norm": 0.5046667456626892, + "learning_rate": 0.00019622030428144112, + "loss": 1.2178, + "step": 49910 + }, + { + "epoch": 0.10606162813735108, + "grad_norm": 0.38648146390914917, + "learning_rate": 0.00019621844920662666, + "loss": 1.1827, + "step": 49920 + }, + { + "epoch": 0.10608287445709014, + "grad_norm": 0.35170280933380127, + "learning_rate": 0.00019621659368546177, + "loss": 1.2263, + "step": 49930 + }, + { + "epoch": 0.1061041207768292, + "grad_norm": 0.33473846316337585, + "learning_rate": 0.0001962147377179551, + "loss": 1.1854, + "step": 49940 + }, + { + "epoch": 0.10612536709656824, + "grad_norm": 0.33587589859962463, + "learning_rate": 0.00019621288130411527, + "loss": 1.2083, + "step": 49950 + }, + { + "epoch": 0.1061466134163073, + "grad_norm": 0.3333272933959961, + "learning_rate": 0.00019621102444395086, + "loss": 1.2034, + "step": 49960 + }, + { + "epoch": 0.10616785973604635, + "grad_norm": 0.33129608631134033, + "learning_rate": 0.00019620916713747048, + "loss": 1.209, + "step": 49970 + }, + { + "epoch": 0.1061891060557854, + "grad_norm": 0.3174738585948944, + "learning_rate": 0.0001962073093846828, + "loss": 1.2102, + "step": 49980 + }, + { + "epoch": 0.10621035237552445, + "grad_norm": 0.37797287106513977, + "learning_rate": 0.00019620545118559637, + "loss": 1.1995, + "step": 49990 + }, + { + "epoch": 0.10623159869526351, + "grad_norm": 0.3738493323326111, + "learning_rate": 0.00019620359254021984, + "loss": 1.1893, + "step": 50000 + }, + { + "epoch": 0.10625284501500255, + "grad_norm": 0.46521997451782227, + "learning_rate": 0.00019620173344856185, + "loss": 1.1984, + "step": 50010 + }, + { + "epoch": 0.10627409133474161, + "grad_norm": 0.3344343900680542, + "learning_rate": 0.000196199873910631, + "loss": 1.2008, + "step": 50020 + }, + { + "epoch": 0.10629533765448067, + "grad_norm": 0.4464644491672516, + "learning_rate": 0.00019619801392643593, + "loss": 1.2134, + "step": 50030 + }, + { + "epoch": 0.10631658397421971, + "grad_norm": 0.3631044328212738, + "learning_rate": 0.00019619615349598525, + "loss": 1.1994, + "step": 50040 + }, + { + "epoch": 0.10633783029395877, + "grad_norm": 0.3962871730327606, + "learning_rate": 0.00019619429261928762, + "loss": 1.2298, + "step": 50050 + }, + { + "epoch": 0.10635907661369783, + "grad_norm": 0.3924501836299896, + "learning_rate": 0.00019619243129635164, + "loss": 1.2414, + "step": 50060 + }, + { + "epoch": 0.10638032293343687, + "grad_norm": 0.34624797105789185, + "learning_rate": 0.00019619056952718594, + "loss": 1.2204, + "step": 50070 + }, + { + "epoch": 0.10640156925317593, + "grad_norm": 0.3488965928554535, + "learning_rate": 0.00019618870731179922, + "loss": 1.2029, + "step": 50080 + }, + { + "epoch": 0.10642281557291498, + "grad_norm": 0.34652870893478394, + "learning_rate": 0.00019618684465020007, + "loss": 1.213, + "step": 50090 + }, + { + "epoch": 0.10644406189265403, + "grad_norm": 0.36388781666755676, + "learning_rate": 0.0001961849815423971, + "loss": 1.2108, + "step": 50100 + }, + { + "epoch": 0.10646530821239308, + "grad_norm": 0.3671700358390808, + "learning_rate": 0.000196183117988399, + "loss": 1.1681, + "step": 50110 + }, + { + "epoch": 0.10648655453213214, + "grad_norm": 0.4113169312477112, + "learning_rate": 0.0001961812539882144, + "loss": 1.2403, + "step": 50120 + }, + { + "epoch": 0.10650780085187118, + "grad_norm": 0.4019358456134796, + "learning_rate": 0.00019617938954185197, + "loss": 1.2438, + "step": 50130 + }, + { + "epoch": 0.10652904717161024, + "grad_norm": 0.48123687505722046, + "learning_rate": 0.00019617752464932034, + "loss": 1.2128, + "step": 50140 + }, + { + "epoch": 0.1065502934913493, + "grad_norm": 0.352992445230484, + "learning_rate": 0.00019617565931062816, + "loss": 1.2466, + "step": 50150 + }, + { + "epoch": 0.10657153981108834, + "grad_norm": 0.3390234410762787, + "learning_rate": 0.00019617379352578407, + "loss": 1.2817, + "step": 50160 + }, + { + "epoch": 0.1065927861308274, + "grad_norm": 0.3941490948200226, + "learning_rate": 0.00019617192729479675, + "loss": 1.2366, + "step": 50170 + }, + { + "epoch": 0.10661403245056646, + "grad_norm": 0.44893255829811096, + "learning_rate": 0.00019617006061767482, + "loss": 1.1996, + "step": 50180 + }, + { + "epoch": 0.1066352787703055, + "grad_norm": 0.37746328115463257, + "learning_rate": 0.00019616819349442698, + "loss": 1.2064, + "step": 50190 + }, + { + "epoch": 0.10665652509004456, + "grad_norm": 0.3920738697052002, + "learning_rate": 0.0001961663259250619, + "loss": 1.2192, + "step": 50200 + }, + { + "epoch": 0.10667777140978361, + "grad_norm": 0.3833751976490021, + "learning_rate": 0.00019616445790958818, + "loss": 1.2054, + "step": 50210 + }, + { + "epoch": 0.10669901772952266, + "grad_norm": 0.35698622465133667, + "learning_rate": 0.00019616258944801456, + "loss": 1.2148, + "step": 50220 + }, + { + "epoch": 0.10672026404926171, + "grad_norm": 0.4328767955303192, + "learning_rate": 0.00019616072054034963, + "loss": 1.1972, + "step": 50230 + }, + { + "epoch": 0.10674151036900077, + "grad_norm": 0.3074212372303009, + "learning_rate": 0.00019615885118660215, + "loss": 1.2118, + "step": 50240 + }, + { + "epoch": 0.10676275668873982, + "grad_norm": 0.33468806743621826, + "learning_rate": 0.00019615698138678074, + "loss": 1.2246, + "step": 50250 + }, + { + "epoch": 0.10678400300847887, + "grad_norm": 0.3324488401412964, + "learning_rate": 0.00019615511114089406, + "loss": 1.2279, + "step": 50260 + }, + { + "epoch": 0.10680524932821793, + "grad_norm": 0.7383138537406921, + "learning_rate": 0.00019615324044895083, + "loss": 1.187, + "step": 50270 + }, + { + "epoch": 0.10682649564795699, + "grad_norm": 0.5349938869476318, + "learning_rate": 0.00019615136931095968, + "loss": 1.2034, + "step": 50280 + }, + { + "epoch": 0.10684774196769603, + "grad_norm": 0.5203927755355835, + "learning_rate": 0.00019614949772692931, + "loss": 1.2004, + "step": 50290 + }, + { + "epoch": 0.10686898828743509, + "grad_norm": 0.36578771471977234, + "learning_rate": 0.0001961476256968684, + "loss": 1.2419, + "step": 50300 + }, + { + "epoch": 0.10689023460717414, + "grad_norm": 0.3781841993331909, + "learning_rate": 0.00019614575322078568, + "loss": 1.1798, + "step": 50310 + }, + { + "epoch": 0.10691148092691319, + "grad_norm": 0.5641831755638123, + "learning_rate": 0.00019614388029868976, + "loss": 1.2089, + "step": 50320 + }, + { + "epoch": 0.10693272724665225, + "grad_norm": 0.6216785311698914, + "learning_rate": 0.00019614200693058936, + "loss": 1.2014, + "step": 50330 + }, + { + "epoch": 0.1069539735663913, + "grad_norm": 0.3764824867248535, + "learning_rate": 0.0001961401331164932, + "loss": 1.2457, + "step": 50340 + }, + { + "epoch": 0.10697521988613035, + "grad_norm": 0.4204555153846741, + "learning_rate": 0.00019613825885640994, + "loss": 1.2009, + "step": 50350 + }, + { + "epoch": 0.1069964662058694, + "grad_norm": 0.43941569328308105, + "learning_rate": 0.00019613638415034824, + "loss": 1.2343, + "step": 50360 + }, + { + "epoch": 0.10701771252560846, + "grad_norm": 0.36985599994659424, + "learning_rate": 0.0001961345089983169, + "loss": 1.2317, + "step": 50370 + }, + { + "epoch": 0.1070389588453475, + "grad_norm": 0.3577015995979309, + "learning_rate": 0.00019613263340032453, + "loss": 1.2675, + "step": 50380 + }, + { + "epoch": 0.10706020516508656, + "grad_norm": 0.7026738524436951, + "learning_rate": 0.00019613075735637985, + "loss": 1.2005, + "step": 50390 + }, + { + "epoch": 0.10708145148482562, + "grad_norm": 0.8190441727638245, + "learning_rate": 0.00019612888086649157, + "loss": 1.2166, + "step": 50400 + }, + { + "epoch": 0.10710269780456466, + "grad_norm": 0.49397093057632446, + "learning_rate": 0.00019612700393066841, + "loss": 1.22, + "step": 50410 + }, + { + "epoch": 0.10712394412430372, + "grad_norm": 0.4081210494041443, + "learning_rate": 0.00019612512654891906, + "loss": 1.2057, + "step": 50420 + }, + { + "epoch": 0.10714519044404278, + "grad_norm": 0.4151727557182312, + "learning_rate": 0.0001961232487212522, + "loss": 1.209, + "step": 50430 + }, + { + "epoch": 0.10716643676378182, + "grad_norm": 0.32963481545448303, + "learning_rate": 0.00019612137044767663, + "loss": 1.2287, + "step": 50440 + }, + { + "epoch": 0.10718768308352088, + "grad_norm": 0.3494763970375061, + "learning_rate": 0.00019611949172820093, + "loss": 1.2075, + "step": 50450 + }, + { + "epoch": 0.10720892940325993, + "grad_norm": 0.4050593972206116, + "learning_rate": 0.00019611761256283395, + "loss": 1.1851, + "step": 50460 + }, + { + "epoch": 0.10723017572299898, + "grad_norm": 0.45884987711906433, + "learning_rate": 0.00019611573295158434, + "loss": 1.1883, + "step": 50470 + }, + { + "epoch": 0.10725142204273803, + "grad_norm": 0.38387030363082886, + "learning_rate": 0.00019611385289446082, + "loss": 1.22, + "step": 50480 + }, + { + "epoch": 0.10727266836247709, + "grad_norm": 0.3190310001373291, + "learning_rate": 0.00019611197239147212, + "loss": 1.2377, + "step": 50490 + }, + { + "epoch": 0.10729391468221613, + "grad_norm": 0.38595637679100037, + "learning_rate": 0.00019611009144262695, + "loss": 1.2139, + "step": 50500 + }, + { + "epoch": 0.10731516100195519, + "grad_norm": 0.3448941111564636, + "learning_rate": 0.00019610821004793408, + "loss": 1.2408, + "step": 50510 + }, + { + "epoch": 0.10733640732169425, + "grad_norm": 0.6985213756561279, + "learning_rate": 0.00019610632820740218, + "loss": 1.2046, + "step": 50520 + }, + { + "epoch": 0.10735765364143329, + "grad_norm": 0.38549795746803284, + "learning_rate": 0.00019610444592103998, + "loss": 1.2222, + "step": 50530 + }, + { + "epoch": 0.10737889996117235, + "grad_norm": 0.334097683429718, + "learning_rate": 0.00019610256318885628, + "loss": 1.1774, + "step": 50540 + }, + { + "epoch": 0.10740014628091141, + "grad_norm": 0.3150816261768341, + "learning_rate": 0.00019610068001085974, + "loss": 1.1833, + "step": 50550 + }, + { + "epoch": 0.10742139260065045, + "grad_norm": 0.39696192741394043, + "learning_rate": 0.00019609879638705918, + "loss": 1.1847, + "step": 50560 + }, + { + "epoch": 0.10744263892038951, + "grad_norm": 0.42566928267478943, + "learning_rate": 0.00019609691231746323, + "loss": 1.2183, + "step": 50570 + }, + { + "epoch": 0.10746388524012856, + "grad_norm": 0.3531680405139923, + "learning_rate": 0.00019609502780208068, + "loss": 1.2323, + "step": 50580 + }, + { + "epoch": 0.10748513155986761, + "grad_norm": 0.34203049540519714, + "learning_rate": 0.00019609314284092032, + "loss": 1.2424, + "step": 50590 + }, + { + "epoch": 0.10750637787960667, + "grad_norm": 0.42011064291000366, + "learning_rate": 0.00019609125743399084, + "loss": 1.1976, + "step": 50600 + }, + { + "epoch": 0.10752762419934572, + "grad_norm": 0.42311957478523254, + "learning_rate": 0.00019608937158130098, + "loss": 1.2169, + "step": 50610 + }, + { + "epoch": 0.10754887051908477, + "grad_norm": 0.34975528717041016, + "learning_rate": 0.0001960874852828595, + "loss": 1.2627, + "step": 50620 + }, + { + "epoch": 0.10757011683882382, + "grad_norm": 0.44451460242271423, + "learning_rate": 0.0001960855985386752, + "loss": 1.2271, + "step": 50630 + }, + { + "epoch": 0.10759136315856288, + "grad_norm": 0.41902652382850647, + "learning_rate": 0.00019608371134875675, + "loss": 1.2349, + "step": 50640 + }, + { + "epoch": 0.10761260947830192, + "grad_norm": 0.4228070080280304, + "learning_rate": 0.00019608182371311296, + "loss": 1.2098, + "step": 50650 + }, + { + "epoch": 0.10763385579804098, + "grad_norm": 0.5382101535797119, + "learning_rate": 0.00019607993563175257, + "loss": 1.1952, + "step": 50660 + }, + { + "epoch": 0.10765510211778004, + "grad_norm": 0.47682827711105347, + "learning_rate": 0.0001960780471046843, + "loss": 1.233, + "step": 50670 + }, + { + "epoch": 0.10767634843751908, + "grad_norm": 0.4019811153411865, + "learning_rate": 0.000196076158131917, + "loss": 1.2082, + "step": 50680 + }, + { + "epoch": 0.10769759475725814, + "grad_norm": 0.3256792724132538, + "learning_rate": 0.00019607426871345938, + "loss": 1.1838, + "step": 50690 + }, + { + "epoch": 0.1077188410769972, + "grad_norm": 0.42936086654663086, + "learning_rate": 0.00019607237884932018, + "loss": 1.2196, + "step": 50700 + }, + { + "epoch": 0.10774008739673625, + "grad_norm": 0.49819982051849365, + "learning_rate": 0.0001960704885395082, + "loss": 1.2256, + "step": 50710 + }, + { + "epoch": 0.1077613337164753, + "grad_norm": 0.38142961263656616, + "learning_rate": 0.00019606859778403222, + "loss": 1.1975, + "step": 50720 + }, + { + "epoch": 0.10778258003621435, + "grad_norm": 0.32696038484573364, + "learning_rate": 0.00019606670658290097, + "loss": 1.2352, + "step": 50730 + }, + { + "epoch": 0.10780382635595341, + "grad_norm": 0.34081175923347473, + "learning_rate": 0.00019606481493612327, + "loss": 1.2027, + "step": 50740 + }, + { + "epoch": 0.10782507267569245, + "grad_norm": 0.4451906979084015, + "learning_rate": 0.00019606292284370786, + "loss": 1.2359, + "step": 50750 + }, + { + "epoch": 0.10784631899543151, + "grad_norm": 0.5839590430259705, + "learning_rate": 0.00019606103030566353, + "loss": 1.226, + "step": 50760 + }, + { + "epoch": 0.10786756531517057, + "grad_norm": 0.40945836901664734, + "learning_rate": 0.00019605913732199904, + "loss": 1.158, + "step": 50770 + }, + { + "epoch": 0.10788881163490961, + "grad_norm": 0.3753330409526825, + "learning_rate": 0.00019605724389272323, + "loss": 1.1989, + "step": 50780 + }, + { + "epoch": 0.10791005795464867, + "grad_norm": 0.4097663164138794, + "learning_rate": 0.00019605535001784483, + "loss": 1.2093, + "step": 50790 + }, + { + "epoch": 0.10793130427438773, + "grad_norm": 0.33036237955093384, + "learning_rate": 0.00019605345569737265, + "loss": 1.2304, + "step": 50800 + }, + { + "epoch": 0.10795255059412677, + "grad_norm": 0.36196771264076233, + "learning_rate": 0.00019605156093131543, + "loss": 1.1946, + "step": 50810 + }, + { + "epoch": 0.10797379691386583, + "grad_norm": 0.34128570556640625, + "learning_rate": 0.00019604966571968202, + "loss": 1.2373, + "step": 50820 + }, + { + "epoch": 0.10799504323360488, + "grad_norm": 0.40553852915763855, + "learning_rate": 0.00019604777006248118, + "loss": 1.1893, + "step": 50830 + }, + { + "epoch": 0.10801628955334393, + "grad_norm": 0.3581165075302124, + "learning_rate": 0.00019604587395972173, + "loss": 1.1961, + "step": 50840 + }, + { + "epoch": 0.10803753587308298, + "grad_norm": 0.6441290974617004, + "learning_rate": 0.00019604397741141243, + "loss": 1.1987, + "step": 50850 + }, + { + "epoch": 0.10805878219282204, + "grad_norm": 0.7414237856864929, + "learning_rate": 0.0001960420804175621, + "loss": 1.213, + "step": 50860 + }, + { + "epoch": 0.10808002851256109, + "grad_norm": 0.4997185170650482, + "learning_rate": 0.00019604018297817954, + "loss": 1.2382, + "step": 50870 + }, + { + "epoch": 0.10810127483230014, + "grad_norm": 0.42330318689346313, + "learning_rate": 0.00019603828509327356, + "loss": 1.1879, + "step": 50880 + }, + { + "epoch": 0.1081225211520392, + "grad_norm": 0.37018173933029175, + "learning_rate": 0.0001960363867628529, + "loss": 1.2258, + "step": 50890 + }, + { + "epoch": 0.10814376747177824, + "grad_norm": 0.5562825202941895, + "learning_rate": 0.00019603448798692646, + "loss": 1.1822, + "step": 50900 + }, + { + "epoch": 0.1081650137915173, + "grad_norm": 0.6111330389976501, + "learning_rate": 0.00019603258876550302, + "loss": 1.2375, + "step": 50910 + }, + { + "epoch": 0.10818626011125636, + "grad_norm": 0.4944499135017395, + "learning_rate": 0.00019603068909859137, + "loss": 1.1767, + "step": 50920 + }, + { + "epoch": 0.1082075064309954, + "grad_norm": 0.6143206357955933, + "learning_rate": 0.00019602878898620035, + "loss": 1.2021, + "step": 50930 + }, + { + "epoch": 0.10822875275073446, + "grad_norm": 0.48115676641464233, + "learning_rate": 0.0001960268884283387, + "loss": 1.2631, + "step": 50940 + }, + { + "epoch": 0.10824999907047352, + "grad_norm": 0.4120844006538391, + "learning_rate": 0.00019602498742501533, + "loss": 1.2124, + "step": 50950 + }, + { + "epoch": 0.10827124539021256, + "grad_norm": 0.530128002166748, + "learning_rate": 0.000196023085976239, + "loss": 1.2521, + "step": 50960 + }, + { + "epoch": 0.10829249170995162, + "grad_norm": 0.6289206147193909, + "learning_rate": 0.00019602118408201856, + "loss": 1.187, + "step": 50970 + }, + { + "epoch": 0.10831373802969067, + "grad_norm": 0.5654434561729431, + "learning_rate": 0.0001960192817423628, + "loss": 1.2012, + "step": 50980 + }, + { + "epoch": 0.10833498434942972, + "grad_norm": 0.7543283104896545, + "learning_rate": 0.0001960173789572806, + "loss": 1.2582, + "step": 50990 + }, + { + "epoch": 0.10835623066916877, + "grad_norm": 1.0447289943695068, + "learning_rate": 0.00019601547572678074, + "loss": 1.1932, + "step": 51000 + }, + { + "epoch": 0.10837747698890783, + "grad_norm": 0.5987885594367981, + "learning_rate": 0.00019601357205087204, + "loss": 1.2309, + "step": 51010 + }, + { + "epoch": 0.10839872330864687, + "grad_norm": 0.4107261300086975, + "learning_rate": 0.00019601166792956341, + "loss": 1.1978, + "step": 51020 + }, + { + "epoch": 0.10841996962838593, + "grad_norm": 0.3307528495788574, + "learning_rate": 0.00019600976336286362, + "loss": 1.2128, + "step": 51030 + }, + { + "epoch": 0.10844121594812499, + "grad_norm": 0.4125402867794037, + "learning_rate": 0.00019600785835078145, + "loss": 1.2626, + "step": 51040 + }, + { + "epoch": 0.10846246226786403, + "grad_norm": 0.32683536410331726, + "learning_rate": 0.00019600595289332584, + "loss": 1.2286, + "step": 51050 + }, + { + "epoch": 0.10848370858760309, + "grad_norm": 0.3140173852443695, + "learning_rate": 0.00019600404699050562, + "loss": 1.2034, + "step": 51060 + }, + { + "epoch": 0.10850495490734215, + "grad_norm": 0.3321906328201294, + "learning_rate": 0.00019600214064232954, + "loss": 1.2111, + "step": 51070 + }, + { + "epoch": 0.10852620122708119, + "grad_norm": 0.398461252450943, + "learning_rate": 0.00019600023384880655, + "loss": 1.2083, + "step": 51080 + }, + { + "epoch": 0.10854744754682025, + "grad_norm": 0.7113984227180481, + "learning_rate": 0.0001959983266099454, + "loss": 1.1874, + "step": 51090 + }, + { + "epoch": 0.1085686938665593, + "grad_norm": 0.45537638664245605, + "learning_rate": 0.00019599641892575505, + "loss": 1.2382, + "step": 51100 + }, + { + "epoch": 0.10858994018629836, + "grad_norm": 0.6668384671211243, + "learning_rate": 0.00019599451079624426, + "loss": 1.2634, + "step": 51110 + }, + { + "epoch": 0.1086111865060374, + "grad_norm": 0.3673287630081177, + "learning_rate": 0.00019599260222142188, + "loss": 1.2285, + "step": 51120 + }, + { + "epoch": 0.10863243282577646, + "grad_norm": 0.4191957712173462, + "learning_rate": 0.00019599069320129682, + "loss": 1.2091, + "step": 51130 + }, + { + "epoch": 0.10865367914551552, + "grad_norm": 0.38166993856430054, + "learning_rate": 0.0001959887837358779, + "loss": 1.2033, + "step": 51140 + }, + { + "epoch": 0.10867492546525456, + "grad_norm": 0.36085405945777893, + "learning_rate": 0.000195986873825174, + "loss": 1.2047, + "step": 51150 + }, + { + "epoch": 0.10869617178499362, + "grad_norm": 0.3206142783164978, + "learning_rate": 0.00019598496346919395, + "loss": 1.2052, + "step": 51160 + }, + { + "epoch": 0.10871741810473268, + "grad_norm": 0.3294389545917511, + "learning_rate": 0.00019598305266794663, + "loss": 1.2172, + "step": 51170 + }, + { + "epoch": 0.10873866442447172, + "grad_norm": 0.42147400975227356, + "learning_rate": 0.0001959811414214409, + "loss": 1.1978, + "step": 51180 + }, + { + "epoch": 0.10875991074421078, + "grad_norm": 0.3856017291545868, + "learning_rate": 0.00019597922972968562, + "loss": 1.2313, + "step": 51190 + }, + { + "epoch": 0.10878115706394983, + "grad_norm": 0.34847238659858704, + "learning_rate": 0.00019597731759268968, + "loss": 1.2035, + "step": 51200 + }, + { + "epoch": 0.10880240338368888, + "grad_norm": 0.3299292027950287, + "learning_rate": 0.00019597540501046194, + "loss": 1.2238, + "step": 51210 + }, + { + "epoch": 0.10882364970342794, + "grad_norm": 0.4917009174823761, + "learning_rate": 0.00019597349198301127, + "loss": 1.2123, + "step": 51220 + }, + { + "epoch": 0.10884489602316699, + "grad_norm": 0.5081589818000793, + "learning_rate": 0.00019597157851034653, + "loss": 1.2094, + "step": 51230 + }, + { + "epoch": 0.10886614234290604, + "grad_norm": 0.4646768867969513, + "learning_rate": 0.0001959696645924766, + "loss": 1.2255, + "step": 51240 + }, + { + "epoch": 0.1088873886626451, + "grad_norm": 0.336809366941452, + "learning_rate": 0.00019596775022941038, + "loss": 1.2452, + "step": 51250 + }, + { + "epoch": 0.10890863498238415, + "grad_norm": 0.350281685590744, + "learning_rate": 0.00019596583542115674, + "loss": 1.2096, + "step": 51260 + }, + { + "epoch": 0.1089298813021232, + "grad_norm": 0.34933218359947205, + "learning_rate": 0.00019596392016772458, + "loss": 1.1937, + "step": 51270 + }, + { + "epoch": 0.10895112762186225, + "grad_norm": 0.36417433619499207, + "learning_rate": 0.00019596200446912274, + "loss": 1.2373, + "step": 51280 + }, + { + "epoch": 0.10897237394160131, + "grad_norm": 0.34791404008865356, + "learning_rate": 0.00019596008832536014, + "loss": 1.1894, + "step": 51290 + }, + { + "epoch": 0.10899362026134035, + "grad_norm": 0.3425121009349823, + "learning_rate": 0.00019595817173644569, + "loss": 1.2093, + "step": 51300 + }, + { + "epoch": 0.10901486658107941, + "grad_norm": 0.4320519268512726, + "learning_rate": 0.00019595625470238817, + "loss": 1.2253, + "step": 51310 + }, + { + "epoch": 0.10903611290081847, + "grad_norm": 0.36219120025634766, + "learning_rate": 0.00019595433722319663, + "loss": 1.2135, + "step": 51320 + }, + { + "epoch": 0.10905735922055751, + "grad_norm": 0.4063878059387207, + "learning_rate": 0.00019595241929887987, + "loss": 1.2496, + "step": 51330 + }, + { + "epoch": 0.10907860554029657, + "grad_norm": 0.3672649562358856, + "learning_rate": 0.00019595050092944681, + "loss": 1.2191, + "step": 51340 + }, + { + "epoch": 0.10909985186003562, + "grad_norm": 0.3543028235435486, + "learning_rate": 0.00019594858211490634, + "loss": 1.2092, + "step": 51350 + }, + { + "epoch": 0.10912109817977467, + "grad_norm": 0.5267260074615479, + "learning_rate": 0.0001959466628552674, + "loss": 1.1913, + "step": 51360 + }, + { + "epoch": 0.10914234449951372, + "grad_norm": 0.5017749071121216, + "learning_rate": 0.00019594474315053883, + "loss": 1.2261, + "step": 51370 + }, + { + "epoch": 0.10916359081925278, + "grad_norm": 0.3843027353286743, + "learning_rate": 0.00019594282300072955, + "loss": 1.2476, + "step": 51380 + }, + { + "epoch": 0.10918483713899182, + "grad_norm": 0.3528479039669037, + "learning_rate": 0.0001959409024058485, + "loss": 1.2353, + "step": 51390 + }, + { + "epoch": 0.10920608345873088, + "grad_norm": 0.36898866295814514, + "learning_rate": 0.00019593898136590458, + "loss": 1.2199, + "step": 51400 + }, + { + "epoch": 0.10922732977846994, + "grad_norm": 0.41626375913619995, + "learning_rate": 0.00019593705988090672, + "loss": 1.2533, + "step": 51410 + }, + { + "epoch": 0.10924857609820898, + "grad_norm": 0.3915005326271057, + "learning_rate": 0.00019593513795086379, + "loss": 1.1614, + "step": 51420 + }, + { + "epoch": 0.10926982241794804, + "grad_norm": 0.4545738101005554, + "learning_rate": 0.0001959332155757847, + "loss": 1.1915, + "step": 51430 + }, + { + "epoch": 0.1092910687376871, + "grad_norm": 0.47634291648864746, + "learning_rate": 0.00019593129275567841, + "loss": 1.2139, + "step": 51440 + }, + { + "epoch": 0.10931231505742614, + "grad_norm": 0.36584874987602234, + "learning_rate": 0.00019592936949055384, + "loss": 1.1897, + "step": 51450 + }, + { + "epoch": 0.1093335613771652, + "grad_norm": 0.3272492289543152, + "learning_rate": 0.00019592744578041983, + "loss": 1.2233, + "step": 51460 + }, + { + "epoch": 0.10935480769690425, + "grad_norm": 0.3443218767642975, + "learning_rate": 0.00019592552162528542, + "loss": 1.2026, + "step": 51470 + }, + { + "epoch": 0.1093760540166433, + "grad_norm": 0.3847114145755768, + "learning_rate": 0.00019592359702515948, + "loss": 1.2215, + "step": 51480 + }, + { + "epoch": 0.10939730033638236, + "grad_norm": 0.36244866251945496, + "learning_rate": 0.00019592167198005095, + "loss": 1.1881, + "step": 51490 + }, + { + "epoch": 0.10941854665612141, + "grad_norm": 0.42229869961738586, + "learning_rate": 0.00019591974648996874, + "loss": 1.2167, + "step": 51500 + }, + { + "epoch": 0.10943979297586046, + "grad_norm": 0.3344082832336426, + "learning_rate": 0.0001959178205549218, + "loss": 1.2217, + "step": 51510 + }, + { + "epoch": 0.10946103929559951, + "grad_norm": 0.5264338850975037, + "learning_rate": 0.00019591589417491904, + "loss": 1.1848, + "step": 51520 + }, + { + "epoch": 0.10948228561533857, + "grad_norm": 0.321068674325943, + "learning_rate": 0.00019591396734996942, + "loss": 1.2139, + "step": 51530 + }, + { + "epoch": 0.10950353193507763, + "grad_norm": 0.33629879355430603, + "learning_rate": 0.00019591204008008188, + "loss": 1.2096, + "step": 51540 + }, + { + "epoch": 0.10952477825481667, + "grad_norm": 0.417092889547348, + "learning_rate": 0.00019591011236526538, + "loss": 1.2086, + "step": 51550 + }, + { + "epoch": 0.10954602457455573, + "grad_norm": 0.4153492748737335, + "learning_rate": 0.00019590818420552882, + "loss": 1.2072, + "step": 51560 + }, + { + "epoch": 0.10956727089429479, + "grad_norm": 0.3510877192020416, + "learning_rate": 0.00019590625560088115, + "loss": 1.2529, + "step": 51570 + }, + { + "epoch": 0.10958851721403383, + "grad_norm": 0.45564183592796326, + "learning_rate": 0.0001959043265513313, + "loss": 1.2258, + "step": 51580 + }, + { + "epoch": 0.10960976353377289, + "grad_norm": 0.3263612985610962, + "learning_rate": 0.00019590239705688829, + "loss": 1.2267, + "step": 51590 + }, + { + "epoch": 0.10963100985351194, + "grad_norm": 0.35532626509666443, + "learning_rate": 0.000195900467117561, + "loss": 1.2079, + "step": 51600 + }, + { + "epoch": 0.10965225617325099, + "grad_norm": 0.3927209973335266, + "learning_rate": 0.00019589853673335843, + "loss": 1.1809, + "step": 51610 + }, + { + "epoch": 0.10967350249299004, + "grad_norm": 0.37716659903526306, + "learning_rate": 0.00019589660590428953, + "loss": 1.2485, + "step": 51620 + }, + { + "epoch": 0.1096947488127291, + "grad_norm": 0.32286331057548523, + "learning_rate": 0.00019589467463036322, + "loss": 1.2506, + "step": 51630 + }, + { + "epoch": 0.10971599513246814, + "grad_norm": 0.38215237855911255, + "learning_rate": 0.00019589274291158848, + "loss": 1.238, + "step": 51640 + }, + { + "epoch": 0.1097372414522072, + "grad_norm": 0.3412897288799286, + "learning_rate": 0.00019589081074797426, + "loss": 1.2385, + "step": 51650 + }, + { + "epoch": 0.10975848777194626, + "grad_norm": 0.4886055588722229, + "learning_rate": 0.00019588887813952956, + "loss": 1.21, + "step": 51660 + }, + { + "epoch": 0.1097797340916853, + "grad_norm": 0.3966464102268219, + "learning_rate": 0.0001958869450862633, + "loss": 1.2181, + "step": 51670 + }, + { + "epoch": 0.10980098041142436, + "grad_norm": 0.44327089190483093, + "learning_rate": 0.00019588501158818449, + "loss": 1.1976, + "step": 51680 + }, + { + "epoch": 0.10982222673116342, + "grad_norm": 0.3564201593399048, + "learning_rate": 0.00019588307764530205, + "loss": 1.219, + "step": 51690 + }, + { + "epoch": 0.10984347305090246, + "grad_norm": 0.7022504806518555, + "learning_rate": 0.000195881143257625, + "loss": 1.2017, + "step": 51700 + }, + { + "epoch": 0.10986471937064152, + "grad_norm": 0.6079725623130798, + "learning_rate": 0.0001958792084251623, + "loss": 1.2206, + "step": 51710 + }, + { + "epoch": 0.10988596569038057, + "grad_norm": 0.6272546648979187, + "learning_rate": 0.0001958772731479229, + "loss": 1.1999, + "step": 51720 + }, + { + "epoch": 0.10990721201011962, + "grad_norm": 0.5422349572181702, + "learning_rate": 0.0001958753374259158, + "loss": 1.2201, + "step": 51730 + }, + { + "epoch": 0.10992845832985867, + "grad_norm": 0.5582864284515381, + "learning_rate": 0.00019587340125914996, + "loss": 1.2274, + "step": 51740 + }, + { + "epoch": 0.10994970464959773, + "grad_norm": 0.44766750931739807, + "learning_rate": 0.00019587146464763438, + "loss": 1.2183, + "step": 51750 + }, + { + "epoch": 0.10997095096933678, + "grad_norm": 0.38094672560691833, + "learning_rate": 0.00019586952759137807, + "loss": 1.1815, + "step": 51760 + }, + { + "epoch": 0.10999219728907583, + "grad_norm": 0.36862778663635254, + "learning_rate": 0.00019586759009038995, + "loss": 1.2101, + "step": 51770 + }, + { + "epoch": 0.11001344360881489, + "grad_norm": 0.32907813787460327, + "learning_rate": 0.00019586565214467906, + "loss": 1.1892, + "step": 51780 + }, + { + "epoch": 0.11003468992855393, + "grad_norm": 0.40186038613319397, + "learning_rate": 0.00019586371375425435, + "loss": 1.2069, + "step": 51790 + }, + { + "epoch": 0.11005593624829299, + "grad_norm": 0.40324366092681885, + "learning_rate": 0.0001958617749191249, + "loss": 1.2297, + "step": 51800 + }, + { + "epoch": 0.11007718256803205, + "grad_norm": 0.37186068296432495, + "learning_rate": 0.0001958598356392996, + "loss": 1.2438, + "step": 51810 + }, + { + "epoch": 0.11009842888777109, + "grad_norm": 0.4044865369796753, + "learning_rate": 0.00019585789591478747, + "loss": 1.2158, + "step": 51820 + }, + { + "epoch": 0.11011967520751015, + "grad_norm": 0.41356194019317627, + "learning_rate": 0.00019585595574559753, + "loss": 1.2327, + "step": 51830 + }, + { + "epoch": 0.1101409215272492, + "grad_norm": 0.3947400748729706, + "learning_rate": 0.0001958540151317388, + "loss": 1.2075, + "step": 51840 + }, + { + "epoch": 0.11016216784698825, + "grad_norm": 0.4243188500404358, + "learning_rate": 0.00019585207407322024, + "loss": 1.2253, + "step": 51850 + }, + { + "epoch": 0.1101834141667273, + "grad_norm": 0.7369034290313721, + "learning_rate": 0.0001958501325700509, + "loss": 1.2119, + "step": 51860 + }, + { + "epoch": 0.11020466048646636, + "grad_norm": 0.5714353322982788, + "learning_rate": 0.0001958481906222397, + "loss": 1.2382, + "step": 51870 + }, + { + "epoch": 0.1102259068062054, + "grad_norm": 0.6242411136627197, + "learning_rate": 0.00019584624822979575, + "loss": 1.2018, + "step": 51880 + }, + { + "epoch": 0.11024715312594446, + "grad_norm": 0.34625476598739624, + "learning_rate": 0.000195844305392728, + "loss": 1.278, + "step": 51890 + }, + { + "epoch": 0.11026839944568352, + "grad_norm": 0.5154086351394653, + "learning_rate": 0.0001958423621110455, + "loss": 1.2179, + "step": 51900 + }, + { + "epoch": 0.11028964576542256, + "grad_norm": 0.3810734450817108, + "learning_rate": 0.00019584041838475723, + "loss": 1.2467, + "step": 51910 + }, + { + "epoch": 0.11031089208516162, + "grad_norm": 0.5520268082618713, + "learning_rate": 0.00019583847421387224, + "loss": 1.1844, + "step": 51920 + }, + { + "epoch": 0.11033213840490068, + "grad_norm": 0.40836578607559204, + "learning_rate": 0.0001958365295983995, + "loss": 1.1901, + "step": 51930 + }, + { + "epoch": 0.11035338472463972, + "grad_norm": 0.37987738847732544, + "learning_rate": 0.00019583458453834807, + "loss": 1.1921, + "step": 51940 + }, + { + "epoch": 0.11037463104437878, + "grad_norm": 0.34577977657318115, + "learning_rate": 0.00019583263903372698, + "loss": 1.1886, + "step": 51950 + }, + { + "epoch": 0.11039587736411784, + "grad_norm": 0.38684630393981934, + "learning_rate": 0.00019583069308454521, + "loss": 1.2284, + "step": 51960 + }, + { + "epoch": 0.1104171236838569, + "grad_norm": 0.3894471228122711, + "learning_rate": 0.00019582874669081185, + "loss": 1.2004, + "step": 51970 + }, + { + "epoch": 0.11043837000359594, + "grad_norm": 0.3389537036418915, + "learning_rate": 0.0001958267998525359, + "loss": 1.2505, + "step": 51980 + }, + { + "epoch": 0.110459616323335, + "grad_norm": 0.3416109085083008, + "learning_rate": 0.00019582485256972637, + "loss": 1.2093, + "step": 51990 + }, + { + "epoch": 0.11048086264307405, + "grad_norm": 0.42122843861579895, + "learning_rate": 0.0001958229048423923, + "loss": 1.1983, + "step": 52000 + }, + { + "epoch": 0.1105021089628131, + "grad_norm": 0.5437819361686707, + "learning_rate": 0.00019582095667054273, + "loss": 1.2564, + "step": 52010 + }, + { + "epoch": 0.11052335528255215, + "grad_norm": 0.6805195212364197, + "learning_rate": 0.00019581900805418673, + "loss": 1.2197, + "step": 52020 + }, + { + "epoch": 0.11054460160229121, + "grad_norm": 0.40045925974845886, + "learning_rate": 0.00019581705899333332, + "loss": 1.2341, + "step": 52030 + }, + { + "epoch": 0.11056584792203025, + "grad_norm": 0.3485940396785736, + "learning_rate": 0.00019581510948799152, + "loss": 1.2639, + "step": 52040 + }, + { + "epoch": 0.11058709424176931, + "grad_norm": 0.40437692403793335, + "learning_rate": 0.00019581315953817036, + "loss": 1.216, + "step": 52050 + }, + { + "epoch": 0.11060834056150837, + "grad_norm": 0.3685152232646942, + "learning_rate": 0.00019581120914387895, + "loss": 1.2147, + "step": 52060 + }, + { + "epoch": 0.11062958688124741, + "grad_norm": 0.3368823528289795, + "learning_rate": 0.00019580925830512628, + "loss": 1.2374, + "step": 52070 + }, + { + "epoch": 0.11065083320098647, + "grad_norm": 0.34671562910079956, + "learning_rate": 0.00019580730702192144, + "loss": 1.211, + "step": 52080 + }, + { + "epoch": 0.11067207952072552, + "grad_norm": 0.40622565150260925, + "learning_rate": 0.00019580535529427345, + "loss": 1.191, + "step": 52090 + }, + { + "epoch": 0.11069332584046457, + "grad_norm": 0.3835681676864624, + "learning_rate": 0.00019580340312219137, + "loss": 1.1289, + "step": 52100 + }, + { + "epoch": 0.11071457216020363, + "grad_norm": 0.7901905179023743, + "learning_rate": 0.00019580145050568428, + "loss": 1.1942, + "step": 52110 + }, + { + "epoch": 0.11073581847994268, + "grad_norm": 0.8067057132720947, + "learning_rate": 0.00019579949744476122, + "loss": 1.2369, + "step": 52120 + }, + { + "epoch": 0.11075706479968173, + "grad_norm": 0.5262416005134583, + "learning_rate": 0.00019579754393943123, + "loss": 1.2066, + "step": 52130 + }, + { + "epoch": 0.11077831111942078, + "grad_norm": 0.3313099145889282, + "learning_rate": 0.0001957955899897034, + "loss": 1.236, + "step": 52140 + }, + { + "epoch": 0.11079955743915984, + "grad_norm": 0.48324406147003174, + "learning_rate": 0.0001957936355955868, + "loss": 1.224, + "step": 52150 + }, + { + "epoch": 0.11082080375889888, + "grad_norm": 0.34914684295654297, + "learning_rate": 0.0001957916807570905, + "loss": 1.2161, + "step": 52160 + }, + { + "epoch": 0.11084205007863794, + "grad_norm": 0.3507426381111145, + "learning_rate": 0.0001957897254742235, + "loss": 1.2034, + "step": 52170 + }, + { + "epoch": 0.110863296398377, + "grad_norm": 0.32537251710891724, + "learning_rate": 0.00019578776974699499, + "loss": 1.2127, + "step": 52180 + }, + { + "epoch": 0.11088454271811604, + "grad_norm": 0.3657948970794678, + "learning_rate": 0.0001957858135754139, + "loss": 1.2021, + "step": 52190 + }, + { + "epoch": 0.1109057890378551, + "grad_norm": 0.38416361808776855, + "learning_rate": 0.00019578385695948942, + "loss": 1.1999, + "step": 52200 + }, + { + "epoch": 0.11092703535759416, + "grad_norm": 0.34410884976387024, + "learning_rate": 0.0001957818998992306, + "loss": 1.176, + "step": 52210 + }, + { + "epoch": 0.1109482816773332, + "grad_norm": 0.33231595158576965, + "learning_rate": 0.0001957799423946465, + "loss": 1.2169, + "step": 52220 + }, + { + "epoch": 0.11096952799707226, + "grad_norm": 0.34217920899391174, + "learning_rate": 0.00019577798444574617, + "loss": 1.1815, + "step": 52230 + }, + { + "epoch": 0.11099077431681131, + "grad_norm": 0.3667428493499756, + "learning_rate": 0.00019577602605253875, + "loss": 1.145, + "step": 52240 + }, + { + "epoch": 0.11101202063655036, + "grad_norm": 0.5128313899040222, + "learning_rate": 0.0001957740672150333, + "loss": 1.2198, + "step": 52250 + }, + { + "epoch": 0.11103326695628941, + "grad_norm": 0.35316410660743713, + "learning_rate": 0.00019577210793323893, + "loss": 1.2077, + "step": 52260 + }, + { + "epoch": 0.11105451327602847, + "grad_norm": 0.5086024403572083, + "learning_rate": 0.00019577014820716467, + "loss": 1.2038, + "step": 52270 + }, + { + "epoch": 0.11107575959576751, + "grad_norm": 0.38805946707725525, + "learning_rate": 0.00019576818803681968, + "loss": 1.1947, + "step": 52280 + }, + { + "epoch": 0.11109700591550657, + "grad_norm": 0.318893700838089, + "learning_rate": 0.00019576622742221306, + "loss": 1.2101, + "step": 52290 + }, + { + "epoch": 0.11111825223524563, + "grad_norm": 0.328117311000824, + "learning_rate": 0.00019576426636335378, + "loss": 1.1983, + "step": 52300 + }, + { + "epoch": 0.11113949855498467, + "grad_norm": 0.34611815214157104, + "learning_rate": 0.0001957623048602511, + "loss": 1.2245, + "step": 52310 + }, + { + "epoch": 0.11116074487472373, + "grad_norm": 0.4388067424297333, + "learning_rate": 0.000195760342912914, + "loss": 1.2048, + "step": 52320 + }, + { + "epoch": 0.11118199119446279, + "grad_norm": 0.5000554919242859, + "learning_rate": 0.00019575838052135166, + "loss": 1.256, + "step": 52330 + }, + { + "epoch": 0.11120323751420183, + "grad_norm": 0.5203457474708557, + "learning_rate": 0.00019575641768557311, + "loss": 1.2212, + "step": 52340 + }, + { + "epoch": 0.11122448383394089, + "grad_norm": 0.3587513864040375, + "learning_rate": 0.0001957544544055875, + "loss": 1.2281, + "step": 52350 + }, + { + "epoch": 0.11124573015367994, + "grad_norm": 0.4514562785625458, + "learning_rate": 0.00019575249068140395, + "loss": 1.1919, + "step": 52360 + }, + { + "epoch": 0.11126697647341899, + "grad_norm": 0.33056262135505676, + "learning_rate": 0.00019575052651303154, + "loss": 1.2222, + "step": 52370 + }, + { + "epoch": 0.11128822279315805, + "grad_norm": 0.43800121545791626, + "learning_rate": 0.00019574856190047944, + "loss": 1.2102, + "step": 52380 + }, + { + "epoch": 0.1113094691128971, + "grad_norm": 0.5127748847007751, + "learning_rate": 0.00019574659684375666, + "loss": 1.2134, + "step": 52390 + }, + { + "epoch": 0.11133071543263616, + "grad_norm": 0.32045289874076843, + "learning_rate": 0.0001957446313428724, + "loss": 1.1967, + "step": 52400 + }, + { + "epoch": 0.1113519617523752, + "grad_norm": 0.35628825426101685, + "learning_rate": 0.00019574266539783573, + "loss": 1.2423, + "step": 52410 + }, + { + "epoch": 0.11137320807211426, + "grad_norm": 0.36610907316207886, + "learning_rate": 0.0001957406990086558, + "loss": 1.2005, + "step": 52420 + }, + { + "epoch": 0.11139445439185332, + "grad_norm": 0.3427351117134094, + "learning_rate": 0.00019573873217534172, + "loss": 1.2276, + "step": 52430 + }, + { + "epoch": 0.11141570071159236, + "grad_norm": 0.3240310847759247, + "learning_rate": 0.00019573676489790262, + "loss": 1.2408, + "step": 52440 + }, + { + "epoch": 0.11143694703133142, + "grad_norm": 0.34408098459243774, + "learning_rate": 0.00019573479717634765, + "loss": 1.2117, + "step": 52450 + }, + { + "epoch": 0.11145819335107048, + "grad_norm": 0.39125892519950867, + "learning_rate": 0.0001957328290106859, + "loss": 1.2292, + "step": 52460 + }, + { + "epoch": 0.11147943967080952, + "grad_norm": 0.3773176074028015, + "learning_rate": 0.00019573086040092647, + "loss": 1.2327, + "step": 52470 + }, + { + "epoch": 0.11150068599054858, + "grad_norm": 0.32470792531967163, + "learning_rate": 0.00019572889134707858, + "loss": 1.2368, + "step": 52480 + }, + { + "epoch": 0.11152193231028763, + "grad_norm": 0.3657768666744232, + "learning_rate": 0.00019572692184915128, + "loss": 1.194, + "step": 52490 + }, + { + "epoch": 0.11154317863002668, + "grad_norm": 0.43900230526924133, + "learning_rate": 0.00019572495190715378, + "loss": 1.2363, + "step": 52500 + }, + { + "epoch": 0.11156442494976573, + "grad_norm": 0.36286428570747375, + "learning_rate": 0.00019572298152109515, + "loss": 1.2242, + "step": 52510 + }, + { + "epoch": 0.11158567126950479, + "grad_norm": 0.37991905212402344, + "learning_rate": 0.00019572101069098457, + "loss": 1.2292, + "step": 52520 + }, + { + "epoch": 0.11160691758924383, + "grad_norm": 0.3932630121707916, + "learning_rate": 0.0001957190394168312, + "loss": 1.2359, + "step": 52530 + }, + { + "epoch": 0.11162816390898289, + "grad_norm": 0.4978724420070648, + "learning_rate": 0.00019571706769864412, + "loss": 1.221, + "step": 52540 + }, + { + "epoch": 0.11164941022872195, + "grad_norm": 0.35446539521217346, + "learning_rate": 0.00019571509553643256, + "loss": 1.2176, + "step": 52550 + }, + { + "epoch": 0.11167065654846099, + "grad_norm": 0.35231655836105347, + "learning_rate": 0.0001957131229302056, + "loss": 1.2083, + "step": 52560 + }, + { + "epoch": 0.11169190286820005, + "grad_norm": 0.5438043475151062, + "learning_rate": 0.00019571114987997242, + "loss": 1.1894, + "step": 52570 + }, + { + "epoch": 0.1117131491879391, + "grad_norm": 0.32679253816604614, + "learning_rate": 0.00019570917638574219, + "loss": 1.2446, + "step": 52580 + }, + { + "epoch": 0.11173439550767815, + "grad_norm": 0.4205717444419861, + "learning_rate": 0.000195707202447524, + "loss": 1.2055, + "step": 52590 + }, + { + "epoch": 0.11175564182741721, + "grad_norm": 0.5532844662666321, + "learning_rate": 0.00019570522806532708, + "loss": 1.2226, + "step": 52600 + }, + { + "epoch": 0.11177688814715626, + "grad_norm": 0.4942512810230255, + "learning_rate": 0.00019570325323916055, + "loss": 1.23, + "step": 52610 + }, + { + "epoch": 0.11179813446689531, + "grad_norm": 0.590740978717804, + "learning_rate": 0.00019570127796903356, + "loss": 1.2157, + "step": 52620 + }, + { + "epoch": 0.11181938078663436, + "grad_norm": 0.33124881982803345, + "learning_rate": 0.00019569930225495533, + "loss": 1.2086, + "step": 52630 + }, + { + "epoch": 0.11184062710637342, + "grad_norm": 0.3493780493736267, + "learning_rate": 0.00019569732609693498, + "loss": 1.2505, + "step": 52640 + }, + { + "epoch": 0.11186187342611247, + "grad_norm": 0.36133503913879395, + "learning_rate": 0.0001956953494949817, + "loss": 1.2223, + "step": 52650 + }, + { + "epoch": 0.11188311974585152, + "grad_norm": 0.4075638949871063, + "learning_rate": 0.0001956933724491046, + "loss": 1.1742, + "step": 52660 + }, + { + "epoch": 0.11190436606559058, + "grad_norm": 0.5407796502113342, + "learning_rate": 0.00019569139495931296, + "loss": 1.202, + "step": 52670 + }, + { + "epoch": 0.11192561238532962, + "grad_norm": 0.5128801465034485, + "learning_rate": 0.00019568941702561584, + "loss": 1.1776, + "step": 52680 + }, + { + "epoch": 0.11194685870506868, + "grad_norm": 0.3493131399154663, + "learning_rate": 0.00019568743864802247, + "loss": 1.1871, + "step": 52690 + }, + { + "epoch": 0.11196810502480774, + "grad_norm": 0.376094251871109, + "learning_rate": 0.00019568545982654207, + "loss": 1.2034, + "step": 52700 + }, + { + "epoch": 0.11198935134454678, + "grad_norm": 0.34851986169815063, + "learning_rate": 0.00019568348056118372, + "loss": 1.2174, + "step": 52710 + }, + { + "epoch": 0.11201059766428584, + "grad_norm": 0.3825744688510895, + "learning_rate": 0.00019568150085195665, + "loss": 1.2336, + "step": 52720 + }, + { + "epoch": 0.1120318439840249, + "grad_norm": 0.4603023827075958, + "learning_rate": 0.0001956795206988701, + "loss": 1.2162, + "step": 52730 + }, + { + "epoch": 0.11205309030376394, + "grad_norm": 0.4006264805793762, + "learning_rate": 0.0001956775401019332, + "loss": 1.1868, + "step": 52740 + }, + { + "epoch": 0.112074336623503, + "grad_norm": 0.5743856430053711, + "learning_rate": 0.00019567555906115512, + "loss": 1.1975, + "step": 52750 + }, + { + "epoch": 0.11209558294324205, + "grad_norm": 0.3344166874885559, + "learning_rate": 0.00019567357757654504, + "loss": 1.2034, + "step": 52760 + }, + { + "epoch": 0.1121168292629811, + "grad_norm": 0.45286834239959717, + "learning_rate": 0.00019567159564811223, + "loss": 1.1985, + "step": 52770 + }, + { + "epoch": 0.11213807558272015, + "grad_norm": 0.3426220417022705, + "learning_rate": 0.00019566961327586584, + "loss": 1.2203, + "step": 52780 + }, + { + "epoch": 0.11215932190245921, + "grad_norm": 0.3757534325122833, + "learning_rate": 0.00019566763045981503, + "loss": 1.1958, + "step": 52790 + }, + { + "epoch": 0.11218056822219825, + "grad_norm": 0.4657466411590576, + "learning_rate": 0.00019566564719996908, + "loss": 1.1818, + "step": 52800 + }, + { + "epoch": 0.11220181454193731, + "grad_norm": 0.3310096263885498, + "learning_rate": 0.00019566366349633708, + "loss": 1.2193, + "step": 52810 + }, + { + "epoch": 0.11222306086167637, + "grad_norm": 0.3298216462135315, + "learning_rate": 0.00019566167934892834, + "loss": 1.2263, + "step": 52820 + }, + { + "epoch": 0.11224430718141543, + "grad_norm": 0.36154645681381226, + "learning_rate": 0.000195659694757752, + "loss": 1.2514, + "step": 52830 + }, + { + "epoch": 0.11226555350115447, + "grad_norm": 0.3424535393714905, + "learning_rate": 0.0001956577097228173, + "loss": 1.2116, + "step": 52840 + }, + { + "epoch": 0.11228679982089353, + "grad_norm": 0.5845991969108582, + "learning_rate": 0.00019565572424413343, + "loss": 1.2238, + "step": 52850 + }, + { + "epoch": 0.11230804614063258, + "grad_norm": 0.35310855507850647, + "learning_rate": 0.0001956537383217096, + "loss": 1.1922, + "step": 52860 + }, + { + "epoch": 0.11232929246037163, + "grad_norm": 0.34539446234703064, + "learning_rate": 0.000195651751955555, + "loss": 1.1934, + "step": 52870 + }, + { + "epoch": 0.11235053878011068, + "grad_norm": 0.5387718677520752, + "learning_rate": 0.0001956497651456789, + "loss": 1.2131, + "step": 52880 + }, + { + "epoch": 0.11237178509984974, + "grad_norm": 0.32694727182388306, + "learning_rate": 0.00019564777789209052, + "loss": 1.2213, + "step": 52890 + }, + { + "epoch": 0.11239303141958878, + "grad_norm": 0.4501783549785614, + "learning_rate": 0.000195645790194799, + "loss": 1.1893, + "step": 52900 + }, + { + "epoch": 0.11241427773932784, + "grad_norm": 0.36931270360946655, + "learning_rate": 0.00019564380205381362, + "loss": 1.1829, + "step": 52910 + }, + { + "epoch": 0.1124355240590669, + "grad_norm": 0.36938685178756714, + "learning_rate": 0.00019564181346914361, + "loss": 1.208, + "step": 52920 + }, + { + "epoch": 0.11245677037880594, + "grad_norm": 0.441194087266922, + "learning_rate": 0.00019563982444079816, + "loss": 1.179, + "step": 52930 + }, + { + "epoch": 0.112478016698545, + "grad_norm": 0.4371684491634369, + "learning_rate": 0.0001956378349687865, + "loss": 1.1994, + "step": 52940 + }, + { + "epoch": 0.11249926301828406, + "grad_norm": 0.37644192576408386, + "learning_rate": 0.0001956358450531179, + "loss": 1.2237, + "step": 52950 + }, + { + "epoch": 0.1125205093380231, + "grad_norm": 0.3401361405849457, + "learning_rate": 0.0001956338546938015, + "loss": 1.2216, + "step": 52960 + }, + { + "epoch": 0.11254175565776216, + "grad_norm": 0.3738355338573456, + "learning_rate": 0.00019563186389084667, + "loss": 1.2129, + "step": 52970 + }, + { + "epoch": 0.11256300197750121, + "grad_norm": 0.3367062509059906, + "learning_rate": 0.00019562987264426252, + "loss": 1.1898, + "step": 52980 + }, + { + "epoch": 0.11258424829724026, + "grad_norm": 0.4047057032585144, + "learning_rate": 0.00019562788095405838, + "loss": 1.224, + "step": 52990 + }, + { + "epoch": 0.11260549461697932, + "grad_norm": 0.7281494140625, + "learning_rate": 0.00019562588882024342, + "loss": 1.1986, + "step": 53000 + }, + { + "epoch": 0.11262674093671837, + "grad_norm": 0.4563693404197693, + "learning_rate": 0.0001956238962428269, + "loss": 1.2213, + "step": 53010 + }, + { + "epoch": 0.11264798725645742, + "grad_norm": 0.4165923595428467, + "learning_rate": 0.0001956219032218181, + "loss": 1.1943, + "step": 53020 + }, + { + "epoch": 0.11266923357619647, + "grad_norm": 0.3451806604862213, + "learning_rate": 0.00019561990975722622, + "loss": 1.1944, + "step": 53030 + }, + { + "epoch": 0.11269047989593553, + "grad_norm": 0.34885483980178833, + "learning_rate": 0.00019561791584906052, + "loss": 1.2339, + "step": 53040 + }, + { + "epoch": 0.11271172621567457, + "grad_norm": 0.4988696873188019, + "learning_rate": 0.00019561592149733027, + "loss": 1.2548, + "step": 53050 + }, + { + "epoch": 0.11273297253541363, + "grad_norm": 0.34822314977645874, + "learning_rate": 0.0001956139267020447, + "loss": 1.1902, + "step": 53060 + }, + { + "epoch": 0.11275421885515269, + "grad_norm": 0.39091333746910095, + "learning_rate": 0.0001956119314632131, + "loss": 1.2024, + "step": 53070 + }, + { + "epoch": 0.11277546517489173, + "grad_norm": 0.3410940170288086, + "learning_rate": 0.00019560993578084465, + "loss": 1.2282, + "step": 53080 + }, + { + "epoch": 0.11279671149463079, + "grad_norm": 0.39875349402427673, + "learning_rate": 0.00019560793965494867, + "loss": 1.2288, + "step": 53090 + }, + { + "epoch": 0.11281795781436985, + "grad_norm": 0.31855523586273193, + "learning_rate": 0.00019560594308553443, + "loss": 1.1921, + "step": 53100 + }, + { + "epoch": 0.11283920413410889, + "grad_norm": 0.4193689525127411, + "learning_rate": 0.00019560394607261113, + "loss": 1.1771, + "step": 53110 + }, + { + "epoch": 0.11286045045384795, + "grad_norm": 0.3677315413951874, + "learning_rate": 0.00019560194861618807, + "loss": 1.2331, + "step": 53120 + }, + { + "epoch": 0.112881696773587, + "grad_norm": 0.41497287154197693, + "learning_rate": 0.00019559995071627453, + "loss": 1.2315, + "step": 53130 + }, + { + "epoch": 0.11290294309332605, + "grad_norm": 0.3643295466899872, + "learning_rate": 0.00019559795237287974, + "loss": 1.201, + "step": 53140 + }, + { + "epoch": 0.1129241894130651, + "grad_norm": 0.47915518283843994, + "learning_rate": 0.00019559595358601304, + "loss": 1.2188, + "step": 53150 + }, + { + "epoch": 0.11294543573280416, + "grad_norm": 0.3859911859035492, + "learning_rate": 0.00019559395435568363, + "loss": 1.2243, + "step": 53160 + }, + { + "epoch": 0.1129666820525432, + "grad_norm": 0.34191247820854187, + "learning_rate": 0.0001955919546819008, + "loss": 1.2017, + "step": 53170 + }, + { + "epoch": 0.11298792837228226, + "grad_norm": 0.35261785984039307, + "learning_rate": 0.00019558995456467386, + "loss": 1.2366, + "step": 53180 + }, + { + "epoch": 0.11300917469202132, + "grad_norm": 0.4841579794883728, + "learning_rate": 0.00019558795400401204, + "loss": 1.1808, + "step": 53190 + }, + { + "epoch": 0.11303042101176036, + "grad_norm": 0.33633187413215637, + "learning_rate": 0.00019558595299992465, + "loss": 1.222, + "step": 53200 + }, + { + "epoch": 0.11305166733149942, + "grad_norm": 0.4744851589202881, + "learning_rate": 0.00019558395155242097, + "loss": 1.1979, + "step": 53210 + }, + { + "epoch": 0.11307291365123848, + "grad_norm": 0.46212202310562134, + "learning_rate": 0.0001955819496615103, + "loss": 1.2171, + "step": 53220 + }, + { + "epoch": 0.11309415997097753, + "grad_norm": 0.42361804842948914, + "learning_rate": 0.00019557994732720188, + "loss": 1.2284, + "step": 53230 + }, + { + "epoch": 0.11311540629071658, + "grad_norm": 0.43868401646614075, + "learning_rate": 0.00019557794454950507, + "loss": 1.216, + "step": 53240 + }, + { + "epoch": 0.11313665261045563, + "grad_norm": 0.35546067357063293, + "learning_rate": 0.0001955759413284291, + "loss": 1.1902, + "step": 53250 + }, + { + "epoch": 0.11315789893019469, + "grad_norm": 0.445808082818985, + "learning_rate": 0.00019557393766398326, + "loss": 1.2022, + "step": 53260 + }, + { + "epoch": 0.11317914524993374, + "grad_norm": 0.6122779250144958, + "learning_rate": 0.00019557193355617688, + "loss": 1.237, + "step": 53270 + }, + { + "epoch": 0.11320039156967279, + "grad_norm": 0.49818718433380127, + "learning_rate": 0.00019556992900501924, + "loss": 1.2109, + "step": 53280 + }, + { + "epoch": 0.11322163788941185, + "grad_norm": 0.3649202585220337, + "learning_rate": 0.00019556792401051966, + "loss": 1.2098, + "step": 53290 + }, + { + "epoch": 0.11324288420915089, + "grad_norm": 0.3951483368873596, + "learning_rate": 0.00019556591857268743, + "loss": 1.2052, + "step": 53300 + }, + { + "epoch": 0.11326413052888995, + "grad_norm": 0.3531395494937897, + "learning_rate": 0.0001955639126915318, + "loss": 1.2558, + "step": 53310 + }, + { + "epoch": 0.11328537684862901, + "grad_norm": 0.4870574176311493, + "learning_rate": 0.00019556190636706215, + "loss": 1.1857, + "step": 53320 + }, + { + "epoch": 0.11330662316836805, + "grad_norm": 0.5040104389190674, + "learning_rate": 0.00019555989959928777, + "loss": 1.2142, + "step": 53330 + }, + { + "epoch": 0.11332786948810711, + "grad_norm": 0.3357049524784088, + "learning_rate": 0.00019555789238821794, + "loss": 1.2317, + "step": 53340 + }, + { + "epoch": 0.11334911580784617, + "grad_norm": 0.3698482811450958, + "learning_rate": 0.000195555884733862, + "loss": 1.2271, + "step": 53350 + }, + { + "epoch": 0.11337036212758521, + "grad_norm": 0.3544113039970398, + "learning_rate": 0.00019555387663622924, + "loss": 1.2132, + "step": 53360 + }, + { + "epoch": 0.11339160844732427, + "grad_norm": 0.3608119487762451, + "learning_rate": 0.000195551868095329, + "loss": 1.2368, + "step": 53370 + }, + { + "epoch": 0.11341285476706332, + "grad_norm": 0.38019561767578125, + "learning_rate": 0.0001955498591111706, + "loss": 1.1853, + "step": 53380 + }, + { + "epoch": 0.11343410108680237, + "grad_norm": 0.356046199798584, + "learning_rate": 0.0001955478496837633, + "loss": 1.1574, + "step": 53390 + }, + { + "epoch": 0.11345534740654142, + "grad_norm": 0.40385541319847107, + "learning_rate": 0.0001955458398131165, + "loss": 1.2001, + "step": 53400 + }, + { + "epoch": 0.11347659372628048, + "grad_norm": 0.33576878905296326, + "learning_rate": 0.0001955438294992395, + "loss": 1.2368, + "step": 53410 + }, + { + "epoch": 0.11349784004601952, + "grad_norm": 0.3582116365432739, + "learning_rate": 0.00019554181874214158, + "loss": 1.2136, + "step": 53420 + }, + { + "epoch": 0.11351908636575858, + "grad_norm": 0.4070928394794464, + "learning_rate": 0.00019553980754183212, + "loss": 1.2318, + "step": 53430 + }, + { + "epoch": 0.11354033268549764, + "grad_norm": 0.30580392479896545, + "learning_rate": 0.00019553779589832044, + "loss": 1.2025, + "step": 53440 + }, + { + "epoch": 0.11356157900523668, + "grad_norm": 0.5917583107948303, + "learning_rate": 0.00019553578381161586, + "loss": 1.2325, + "step": 53450 + }, + { + "epoch": 0.11358282532497574, + "grad_norm": 0.4805757403373718, + "learning_rate": 0.0001955337712817277, + "loss": 1.218, + "step": 53460 + }, + { + "epoch": 0.1136040716447148, + "grad_norm": 0.47837328910827637, + "learning_rate": 0.00019553175830866538, + "loss": 1.1818, + "step": 53470 + }, + { + "epoch": 0.11362531796445384, + "grad_norm": 0.341991662979126, + "learning_rate": 0.00019552974489243813, + "loss": 1.2099, + "step": 53480 + }, + { + "epoch": 0.1136465642841929, + "grad_norm": 0.371693879365921, + "learning_rate": 0.0001955277310330553, + "loss": 1.2161, + "step": 53490 + }, + { + "epoch": 0.11366781060393195, + "grad_norm": 0.3335396349430084, + "learning_rate": 0.0001955257167305263, + "loss": 1.2277, + "step": 53500 + }, + { + "epoch": 0.113689056923671, + "grad_norm": 0.3993526101112366, + "learning_rate": 0.00019552370198486045, + "loss": 1.2237, + "step": 53510 + }, + { + "epoch": 0.11371030324341005, + "grad_norm": 0.6113518476486206, + "learning_rate": 0.00019552168679606705, + "loss": 1.1856, + "step": 53520 + }, + { + "epoch": 0.11373154956314911, + "grad_norm": 0.3400152325630188, + "learning_rate": 0.00019551967116415554, + "loss": 1.2014, + "step": 53530 + }, + { + "epoch": 0.11375279588288816, + "grad_norm": 0.35826629400253296, + "learning_rate": 0.00019551765508913518, + "loss": 1.2163, + "step": 53540 + }, + { + "epoch": 0.11377404220262721, + "grad_norm": 0.4800134599208832, + "learning_rate": 0.00019551563857101535, + "loss": 1.2734, + "step": 53550 + }, + { + "epoch": 0.11379528852236627, + "grad_norm": 0.3303671181201935, + "learning_rate": 0.00019551362160980543, + "loss": 1.2439, + "step": 53560 + }, + { + "epoch": 0.11381653484210531, + "grad_norm": 0.38981199264526367, + "learning_rate": 0.00019551160420551474, + "loss": 1.2198, + "step": 53570 + }, + { + "epoch": 0.11383778116184437, + "grad_norm": 0.36041200160980225, + "learning_rate": 0.0001955095863581527, + "loss": 1.2382, + "step": 53580 + }, + { + "epoch": 0.11385902748158343, + "grad_norm": 0.5970730781555176, + "learning_rate": 0.00019550756806772856, + "loss": 1.2438, + "step": 53590 + }, + { + "epoch": 0.11388027380132247, + "grad_norm": 0.4665132164955139, + "learning_rate": 0.00019550554933425179, + "loss": 1.1787, + "step": 53600 + }, + { + "epoch": 0.11390152012106153, + "grad_norm": 0.3691147565841675, + "learning_rate": 0.0001955035301577317, + "loss": 1.1771, + "step": 53610 + }, + { + "epoch": 0.11392276644080059, + "grad_norm": 0.3712714612483978, + "learning_rate": 0.0001955015105381777, + "loss": 1.222, + "step": 53620 + }, + { + "epoch": 0.11394401276053963, + "grad_norm": 0.41520389914512634, + "learning_rate": 0.00019549949047559912, + "loss": 1.2091, + "step": 53630 + }, + { + "epoch": 0.11396525908027869, + "grad_norm": 0.3576088547706604, + "learning_rate": 0.0001954974699700053, + "loss": 1.2047, + "step": 53640 + }, + { + "epoch": 0.11398650540001774, + "grad_norm": 0.32373368740081787, + "learning_rate": 0.0001954954490214057, + "loss": 1.1955, + "step": 53650 + }, + { + "epoch": 0.1140077517197568, + "grad_norm": 0.357520192861557, + "learning_rate": 0.00019549342762980963, + "loss": 1.223, + "step": 53660 + }, + { + "epoch": 0.11402899803949584, + "grad_norm": 0.4850322902202606, + "learning_rate": 0.00019549140579522653, + "loss": 1.1955, + "step": 53670 + }, + { + "epoch": 0.1140502443592349, + "grad_norm": 0.38825494050979614, + "learning_rate": 0.00019548938351766568, + "loss": 1.2657, + "step": 53680 + }, + { + "epoch": 0.11407149067897396, + "grad_norm": 0.3625427782535553, + "learning_rate": 0.00019548736079713655, + "loss": 1.2073, + "step": 53690 + }, + { + "epoch": 0.114092736998713, + "grad_norm": 0.3457599878311157, + "learning_rate": 0.00019548533763364846, + "loss": 1.2168, + "step": 53700 + }, + { + "epoch": 0.11411398331845206, + "grad_norm": 0.33901169896125793, + "learning_rate": 0.00019548331402721085, + "loss": 1.2063, + "step": 53710 + }, + { + "epoch": 0.11413522963819112, + "grad_norm": 0.33245784044265747, + "learning_rate": 0.00019548128997783307, + "loss": 1.2322, + "step": 53720 + }, + { + "epoch": 0.11415647595793016, + "grad_norm": 0.35787153244018555, + "learning_rate": 0.00019547926548552455, + "loss": 1.2143, + "step": 53730 + }, + { + "epoch": 0.11417772227766922, + "grad_norm": 0.3384595513343811, + "learning_rate": 0.0001954772405502946, + "loss": 1.2446, + "step": 53740 + }, + { + "epoch": 0.11419896859740827, + "grad_norm": 0.3217652142047882, + "learning_rate": 0.00019547521517215271, + "loss": 1.2254, + "step": 53750 + }, + { + "epoch": 0.11422021491714732, + "grad_norm": 0.4243943989276886, + "learning_rate": 0.00019547318935110824, + "loss": 1.2258, + "step": 53760 + }, + { + "epoch": 0.11424146123688637, + "grad_norm": 1.0074466466903687, + "learning_rate": 0.00019547116308717052, + "loss": 1.2186, + "step": 53770 + }, + { + "epoch": 0.11426270755662543, + "grad_norm": 0.5638347864151001, + "learning_rate": 0.00019546913638034908, + "loss": 1.2029, + "step": 53780 + }, + { + "epoch": 0.11428395387636447, + "grad_norm": 0.46554481983184814, + "learning_rate": 0.0001954671092306532, + "loss": 1.2355, + "step": 53790 + }, + { + "epoch": 0.11430520019610353, + "grad_norm": 0.6611311435699463, + "learning_rate": 0.0001954650816380924, + "loss": 1.2342, + "step": 53800 + }, + { + "epoch": 0.11432644651584259, + "grad_norm": 0.527286946773529, + "learning_rate": 0.00019546305360267595, + "loss": 1.2412, + "step": 53810 + }, + { + "epoch": 0.11434769283558163, + "grad_norm": 0.5203188061714172, + "learning_rate": 0.00019546102512441336, + "loss": 1.2314, + "step": 53820 + }, + { + "epoch": 0.11436893915532069, + "grad_norm": 0.3340308964252472, + "learning_rate": 0.00019545899620331404, + "loss": 1.1967, + "step": 53830 + }, + { + "epoch": 0.11439018547505975, + "grad_norm": 0.3525812327861786, + "learning_rate": 0.00019545696683938733, + "loss": 1.2342, + "step": 53840 + }, + { + "epoch": 0.11441143179479879, + "grad_norm": 0.44000694155693054, + "learning_rate": 0.0001954549370326427, + "loss": 1.2204, + "step": 53850 + }, + { + "epoch": 0.11443267811453785, + "grad_norm": 0.3833557963371277, + "learning_rate": 0.00019545290678308955, + "loss": 1.1892, + "step": 53860 + }, + { + "epoch": 0.1144539244342769, + "grad_norm": 0.3598296344280243, + "learning_rate": 0.0001954508760907373, + "loss": 1.1708, + "step": 53870 + }, + { + "epoch": 0.11447517075401595, + "grad_norm": 0.3284853398799896, + "learning_rate": 0.00019544884495559535, + "loss": 1.2089, + "step": 53880 + }, + { + "epoch": 0.114496417073755, + "grad_norm": 0.4143942892551422, + "learning_rate": 0.00019544681337767316, + "loss": 1.2019, + "step": 53890 + }, + { + "epoch": 0.11451766339349406, + "grad_norm": 0.40400728583335876, + "learning_rate": 0.00019544478135698014, + "loss": 1.2368, + "step": 53900 + }, + { + "epoch": 0.1145389097132331, + "grad_norm": 0.3911696970462799, + "learning_rate": 0.00019544274889352571, + "loss": 1.1908, + "step": 53910 + }, + { + "epoch": 0.11456015603297216, + "grad_norm": 0.5760965943336487, + "learning_rate": 0.00019544071598731933, + "loss": 1.1822, + "step": 53920 + }, + { + "epoch": 0.11458140235271122, + "grad_norm": 0.3594358265399933, + "learning_rate": 0.00019543868263837035, + "loss": 1.2175, + "step": 53930 + }, + { + "epoch": 0.11460264867245026, + "grad_norm": 0.3285655677318573, + "learning_rate": 0.0001954366488466883, + "loss": 1.2259, + "step": 53940 + }, + { + "epoch": 0.11462389499218932, + "grad_norm": 0.3380107283592224, + "learning_rate": 0.0001954346146122825, + "loss": 1.1805, + "step": 53950 + }, + { + "epoch": 0.11464514131192838, + "grad_norm": 0.5539560914039612, + "learning_rate": 0.00019543257993516252, + "loss": 1.1693, + "step": 53960 + }, + { + "epoch": 0.11466638763166742, + "grad_norm": 0.6331459879875183, + "learning_rate": 0.00019543054481533775, + "loss": 1.1944, + "step": 53970 + }, + { + "epoch": 0.11468763395140648, + "grad_norm": 0.4367080628871918, + "learning_rate": 0.00019542850925281758, + "loss": 1.1975, + "step": 53980 + }, + { + "epoch": 0.11470888027114554, + "grad_norm": 0.3680785894393921, + "learning_rate": 0.00019542647324761146, + "loss": 1.2099, + "step": 53990 + }, + { + "epoch": 0.11473012659088458, + "grad_norm": 0.4923422634601593, + "learning_rate": 0.00019542443679972893, + "loss": 1.2196, + "step": 54000 + }, + { + "epoch": 0.11475137291062364, + "grad_norm": 0.6230224370956421, + "learning_rate": 0.0001954223999091793, + "loss": 1.2322, + "step": 54010 + }, + { + "epoch": 0.1147726192303627, + "grad_norm": 0.40304994583129883, + "learning_rate": 0.00019542036257597217, + "loss": 1.2477, + "step": 54020 + }, + { + "epoch": 0.11479386555010174, + "grad_norm": 0.3699142038822174, + "learning_rate": 0.00019541832480011686, + "loss": 1.2019, + "step": 54030 + }, + { + "epoch": 0.1148151118698408, + "grad_norm": 0.47744515538215637, + "learning_rate": 0.00019541628658162287, + "loss": 1.2001, + "step": 54040 + }, + { + "epoch": 0.11483635818957985, + "grad_norm": 0.4764863848686218, + "learning_rate": 0.00019541424792049968, + "loss": 1.1893, + "step": 54050 + }, + { + "epoch": 0.1148576045093189, + "grad_norm": 0.37758275866508484, + "learning_rate": 0.0001954122088167567, + "loss": 1.2015, + "step": 54060 + }, + { + "epoch": 0.11487885082905795, + "grad_norm": 0.3432614803314209, + "learning_rate": 0.00019541016927040344, + "loss": 1.1981, + "step": 54070 + }, + { + "epoch": 0.11490009714879701, + "grad_norm": 0.45912644267082214, + "learning_rate": 0.0001954081292814493, + "loss": 1.213, + "step": 54080 + }, + { + "epoch": 0.11492134346853607, + "grad_norm": 0.35171619057655334, + "learning_rate": 0.0001954060888499038, + "loss": 1.1639, + "step": 54090 + }, + { + "epoch": 0.11494258978827511, + "grad_norm": 0.3382038176059723, + "learning_rate": 0.0001954040479757764, + "loss": 1.2364, + "step": 54100 + }, + { + "epoch": 0.11496383610801417, + "grad_norm": 0.4372880160808563, + "learning_rate": 0.00019540200665907654, + "loss": 1.2312, + "step": 54110 + }, + { + "epoch": 0.11498508242775322, + "grad_norm": 0.5412399768829346, + "learning_rate": 0.00019539996489981369, + "loss": 1.2262, + "step": 54120 + }, + { + "epoch": 0.11500632874749227, + "grad_norm": 0.39288586378097534, + "learning_rate": 0.00019539792269799734, + "loss": 1.222, + "step": 54130 + }, + { + "epoch": 0.11502757506723132, + "grad_norm": 0.38397178053855896, + "learning_rate": 0.000195395880053637, + "loss": 1.1997, + "step": 54140 + }, + { + "epoch": 0.11504882138697038, + "grad_norm": 0.4713166356086731, + "learning_rate": 0.00019539383696674204, + "loss": 1.2056, + "step": 54150 + }, + { + "epoch": 0.11507006770670943, + "grad_norm": 0.3917907774448395, + "learning_rate": 0.000195391793437322, + "loss": 1.1905, + "step": 54160 + }, + { + "epoch": 0.11509131402644848, + "grad_norm": 0.40717974305152893, + "learning_rate": 0.00019538974946538642, + "loss": 1.2169, + "step": 54170 + }, + { + "epoch": 0.11511256034618754, + "grad_norm": 0.3917825222015381, + "learning_rate": 0.00019538770505094467, + "loss": 1.1811, + "step": 54180 + }, + { + "epoch": 0.11513380666592658, + "grad_norm": 0.5506864190101624, + "learning_rate": 0.0001953856601940063, + "loss": 1.2298, + "step": 54190 + }, + { + "epoch": 0.11515505298566564, + "grad_norm": 0.4171656370162964, + "learning_rate": 0.0001953836148945808, + "loss": 1.224, + "step": 54200 + }, + { + "epoch": 0.1151762993054047, + "grad_norm": 0.42080405354499817, + "learning_rate": 0.0001953815691526776, + "loss": 1.1747, + "step": 54210 + }, + { + "epoch": 0.11519754562514374, + "grad_norm": 0.5838817954063416, + "learning_rate": 0.00019537952296830625, + "loss": 1.2245, + "step": 54220 + }, + { + "epoch": 0.1152187919448828, + "grad_norm": 0.3991452157497406, + "learning_rate": 0.00019537747634147624, + "loss": 1.2064, + "step": 54230 + }, + { + "epoch": 0.11524003826462186, + "grad_norm": 0.3193269371986389, + "learning_rate": 0.00019537542927219702, + "loss": 1.2375, + "step": 54240 + }, + { + "epoch": 0.1152612845843609, + "grad_norm": 0.4841330945491791, + "learning_rate": 0.00019537338176047814, + "loss": 1.1932, + "step": 54250 + }, + { + "epoch": 0.11528253090409996, + "grad_norm": 0.40842655301094055, + "learning_rate": 0.00019537133380632905, + "loss": 1.2201, + "step": 54260 + }, + { + "epoch": 0.11530377722383901, + "grad_norm": 0.45919346809387207, + "learning_rate": 0.00019536928540975928, + "loss": 1.1729, + "step": 54270 + }, + { + "epoch": 0.11532502354357806, + "grad_norm": 0.3333297669887543, + "learning_rate": 0.00019536723657077832, + "loss": 1.17, + "step": 54280 + }, + { + "epoch": 0.11534626986331711, + "grad_norm": 0.36678630113601685, + "learning_rate": 0.0001953651872893957, + "loss": 1.2058, + "step": 54290 + }, + { + "epoch": 0.11536751618305617, + "grad_norm": 0.46342289447784424, + "learning_rate": 0.00019536313756562086, + "loss": 1.2338, + "step": 54300 + }, + { + "epoch": 0.11538876250279521, + "grad_norm": 0.433482825756073, + "learning_rate": 0.0001953610873994634, + "loss": 1.2243, + "step": 54310 + }, + { + "epoch": 0.11541000882253427, + "grad_norm": 0.39634454250335693, + "learning_rate": 0.00019535903679093272, + "loss": 1.2258, + "step": 54320 + }, + { + "epoch": 0.11543125514227333, + "grad_norm": 0.6535974740982056, + "learning_rate": 0.00019535698574003844, + "loss": 1.2178, + "step": 54330 + }, + { + "epoch": 0.11545250146201237, + "grad_norm": 0.5164614915847778, + "learning_rate": 0.00019535493424679005, + "loss": 1.2054, + "step": 54340 + }, + { + "epoch": 0.11547374778175143, + "grad_norm": 0.4936073124408722, + "learning_rate": 0.000195352882311197, + "loss": 1.2583, + "step": 54350 + }, + { + "epoch": 0.11549499410149049, + "grad_norm": 0.6490308046340942, + "learning_rate": 0.00019535082993326887, + "loss": 1.2265, + "step": 54360 + }, + { + "epoch": 0.11551624042122953, + "grad_norm": 0.510177493095398, + "learning_rate": 0.0001953487771130152, + "loss": 1.2347, + "step": 54370 + }, + { + "epoch": 0.11553748674096859, + "grad_norm": 0.32660022377967834, + "learning_rate": 0.00019534672385044543, + "loss": 1.1838, + "step": 54380 + }, + { + "epoch": 0.11555873306070764, + "grad_norm": 0.3401787281036377, + "learning_rate": 0.00019534467014556918, + "loss": 1.199, + "step": 54390 + }, + { + "epoch": 0.11557997938044669, + "grad_norm": 0.3990958631038666, + "learning_rate": 0.0001953426159983959, + "loss": 1.2377, + "step": 54400 + }, + { + "epoch": 0.11560122570018574, + "grad_norm": 0.45196160674095154, + "learning_rate": 0.00019534056140893517, + "loss": 1.2232, + "step": 54410 + }, + { + "epoch": 0.1156224720199248, + "grad_norm": 0.44063809514045715, + "learning_rate": 0.0001953385063771965, + "loss": 1.187, + "step": 54420 + }, + { + "epoch": 0.11564371833966385, + "grad_norm": 0.3636045753955841, + "learning_rate": 0.0001953364509031894, + "loss": 1.2193, + "step": 54430 + }, + { + "epoch": 0.1156649646594029, + "grad_norm": 0.4621356129646301, + "learning_rate": 0.00019533439498692348, + "loss": 1.1788, + "step": 54440 + }, + { + "epoch": 0.11568621097914196, + "grad_norm": 0.45155930519104004, + "learning_rate": 0.00019533233862840818, + "loss": 1.209, + "step": 54450 + }, + { + "epoch": 0.115707457298881, + "grad_norm": 0.3768151104450226, + "learning_rate": 0.00019533028182765308, + "loss": 1.2421, + "step": 54460 + }, + { + "epoch": 0.11572870361862006, + "grad_norm": 0.37578797340393066, + "learning_rate": 0.00019532822458466777, + "loss": 1.2115, + "step": 54470 + }, + { + "epoch": 0.11574994993835912, + "grad_norm": 0.5751631855964661, + "learning_rate": 0.00019532616689946176, + "loss": 1.1979, + "step": 54480 + }, + { + "epoch": 0.11577119625809816, + "grad_norm": 0.4360728859901428, + "learning_rate": 0.00019532410877204454, + "loss": 1.2233, + "step": 54490 + }, + { + "epoch": 0.11579244257783722, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.00019532205020242574, + "loss": 1.243, + "step": 54500 + }, + { + "epoch": 0.11581368889757628, + "grad_norm": 0.359063059091568, + "learning_rate": 0.00019531999119061486, + "loss": 1.2158, + "step": 54510 + }, + { + "epoch": 0.11583493521731533, + "grad_norm": 0.32917606830596924, + "learning_rate": 0.00019531793173662145, + "loss": 1.196, + "step": 54520 + }, + { + "epoch": 0.11585618153705438, + "grad_norm": 0.40914440155029297, + "learning_rate": 0.00019531587184045508, + "loss": 1.2385, + "step": 54530 + }, + { + "epoch": 0.11587742785679343, + "grad_norm": 0.400953471660614, + "learning_rate": 0.00019531381150212534, + "loss": 1.2225, + "step": 54540 + }, + { + "epoch": 0.11589867417653249, + "grad_norm": 0.36018210649490356, + "learning_rate": 0.0001953117507216417, + "loss": 1.252, + "step": 54550 + }, + { + "epoch": 0.11591992049627153, + "grad_norm": 0.4216156005859375, + "learning_rate": 0.00019530968949901383, + "loss": 1.1933, + "step": 54560 + }, + { + "epoch": 0.11594116681601059, + "grad_norm": 0.47337251901626587, + "learning_rate": 0.0001953076278342512, + "loss": 1.2007, + "step": 54570 + }, + { + "epoch": 0.11596241313574965, + "grad_norm": 0.5441904664039612, + "learning_rate": 0.00019530556572736342, + "loss": 1.1982, + "step": 54580 + }, + { + "epoch": 0.11598365945548869, + "grad_norm": 0.47983381152153015, + "learning_rate": 0.00019530350317836006, + "loss": 1.2079, + "step": 54590 + }, + { + "epoch": 0.11600490577522775, + "grad_norm": 0.505972683429718, + "learning_rate": 0.00019530144018725066, + "loss": 1.2141, + "step": 54600 + }, + { + "epoch": 0.1160261520949668, + "grad_norm": 0.35405391454696655, + "learning_rate": 0.00019529937675404478, + "loss": 1.2192, + "step": 54610 + }, + { + "epoch": 0.11604739841470585, + "grad_norm": 0.3750385046005249, + "learning_rate": 0.00019529731287875202, + "loss": 1.1937, + "step": 54620 + }, + { + "epoch": 0.1160686447344449, + "grad_norm": 0.47415512800216675, + "learning_rate": 0.00019529524856138196, + "loss": 1.2524, + "step": 54630 + }, + { + "epoch": 0.11608989105418396, + "grad_norm": 0.4500731825828552, + "learning_rate": 0.00019529318380194418, + "loss": 1.2216, + "step": 54640 + }, + { + "epoch": 0.116111137373923, + "grad_norm": 0.33058133721351624, + "learning_rate": 0.00019529111860044823, + "loss": 1.1782, + "step": 54650 + }, + { + "epoch": 0.11613238369366206, + "grad_norm": 0.40907561779022217, + "learning_rate": 0.00019528905295690367, + "loss": 1.202, + "step": 54660 + }, + { + "epoch": 0.11615363001340112, + "grad_norm": 0.3906172811985016, + "learning_rate": 0.00019528698687132018, + "loss": 1.214, + "step": 54670 + }, + { + "epoch": 0.11617487633314016, + "grad_norm": 0.4033646583557129, + "learning_rate": 0.00019528492034370722, + "loss": 1.218, + "step": 54680 + }, + { + "epoch": 0.11619612265287922, + "grad_norm": 0.3490275740623474, + "learning_rate": 0.0001952828533740745, + "loss": 1.1844, + "step": 54690 + }, + { + "epoch": 0.11621736897261828, + "grad_norm": 0.3770236670970917, + "learning_rate": 0.0001952807859624315, + "loss": 1.2249, + "step": 54700 + }, + { + "epoch": 0.11623861529235732, + "grad_norm": 0.4502312242984772, + "learning_rate": 0.00019527871810878783, + "loss": 1.2192, + "step": 54710 + }, + { + "epoch": 0.11625986161209638, + "grad_norm": 0.4309649169445038, + "learning_rate": 0.00019527664981315316, + "loss": 1.1855, + "step": 54720 + }, + { + "epoch": 0.11628110793183544, + "grad_norm": 0.33513009548187256, + "learning_rate": 0.00019527458107553703, + "loss": 1.1836, + "step": 54730 + }, + { + "epoch": 0.11630235425157448, + "grad_norm": 0.3224586546421051, + "learning_rate": 0.000195272511895949, + "loss": 1.2306, + "step": 54740 + }, + { + "epoch": 0.11632360057131354, + "grad_norm": 0.3454630672931671, + "learning_rate": 0.00019527044227439874, + "loss": 1.2133, + "step": 54750 + }, + { + "epoch": 0.1163448468910526, + "grad_norm": 0.3371381461620331, + "learning_rate": 0.00019526837221089583, + "loss": 1.1776, + "step": 54760 + }, + { + "epoch": 0.11636609321079164, + "grad_norm": 0.45761775970458984, + "learning_rate": 0.00019526630170544983, + "loss": 1.2244, + "step": 54770 + }, + { + "epoch": 0.1163873395305307, + "grad_norm": 0.3522191047668457, + "learning_rate": 0.0001952642307580704, + "loss": 1.2138, + "step": 54780 + }, + { + "epoch": 0.11640858585026975, + "grad_norm": 0.37043043971061707, + "learning_rate": 0.00019526215936876713, + "loss": 1.1943, + "step": 54790 + }, + { + "epoch": 0.1164298321700088, + "grad_norm": 0.3368622064590454, + "learning_rate": 0.0001952600875375496, + "loss": 1.1892, + "step": 54800 + }, + { + "epoch": 0.11645107848974785, + "grad_norm": 0.33309704065322876, + "learning_rate": 0.00019525801526442744, + "loss": 1.201, + "step": 54810 + }, + { + "epoch": 0.11647232480948691, + "grad_norm": 0.38713446259498596, + "learning_rate": 0.00019525594254941032, + "loss": 1.1778, + "step": 54820 + }, + { + "epoch": 0.11649357112922595, + "grad_norm": 0.4879594147205353, + "learning_rate": 0.00019525386939250774, + "loss": 1.2146, + "step": 54830 + }, + { + "epoch": 0.11651481744896501, + "grad_norm": 0.3924001455307007, + "learning_rate": 0.0001952517957937294, + "loss": 1.1863, + "step": 54840 + }, + { + "epoch": 0.11653606376870407, + "grad_norm": 0.34864553809165955, + "learning_rate": 0.0001952497217530849, + "loss": 1.2117, + "step": 54850 + }, + { + "epoch": 0.11655731008844311, + "grad_norm": 0.4603098928928375, + "learning_rate": 0.0001952476472705839, + "loss": 1.263, + "step": 54860 + }, + { + "epoch": 0.11657855640818217, + "grad_norm": 0.5874302387237549, + "learning_rate": 0.00019524557234623592, + "loss": 1.2041, + "step": 54870 + }, + { + "epoch": 0.11659980272792123, + "grad_norm": 0.4732016921043396, + "learning_rate": 0.0001952434969800507, + "loss": 1.2062, + "step": 54880 + }, + { + "epoch": 0.11662104904766027, + "grad_norm": 0.3295927047729492, + "learning_rate": 0.00019524142117203778, + "loss": 1.1875, + "step": 54890 + }, + { + "epoch": 0.11664229536739933, + "grad_norm": 0.35447752475738525, + "learning_rate": 0.00019523934492220684, + "loss": 1.2234, + "step": 54900 + }, + { + "epoch": 0.11666354168713838, + "grad_norm": 0.36768701672554016, + "learning_rate": 0.0001952372682305675, + "loss": 1.1767, + "step": 54910 + }, + { + "epoch": 0.11668478800687743, + "grad_norm": 0.3898129463195801, + "learning_rate": 0.00019523519109712937, + "loss": 1.2191, + "step": 54920 + }, + { + "epoch": 0.11670603432661648, + "grad_norm": 0.3914327919483185, + "learning_rate": 0.00019523311352190214, + "loss": 1.1932, + "step": 54930 + }, + { + "epoch": 0.11672728064635554, + "grad_norm": 0.448100209236145, + "learning_rate": 0.0001952310355048954, + "loss": 1.221, + "step": 54940 + }, + { + "epoch": 0.1167485269660946, + "grad_norm": 0.4104442894458771, + "learning_rate": 0.00019522895704611882, + "loss": 1.2124, + "step": 54950 + }, + { + "epoch": 0.11676977328583364, + "grad_norm": 0.36180025339126587, + "learning_rate": 0.000195226878145582, + "loss": 1.1823, + "step": 54960 + }, + { + "epoch": 0.1167910196055727, + "grad_norm": 0.37649211287498474, + "learning_rate": 0.00019522479880329464, + "loss": 1.2238, + "step": 54970 + }, + { + "epoch": 0.11681226592531176, + "grad_norm": 0.6312360763549805, + "learning_rate": 0.00019522271901926632, + "loss": 1.2277, + "step": 54980 + }, + { + "epoch": 0.1168335122450508, + "grad_norm": 0.3807428181171417, + "learning_rate": 0.00019522063879350677, + "loss": 1.2233, + "step": 54990 + }, + { + "epoch": 0.11685475856478986, + "grad_norm": 0.37058866024017334, + "learning_rate": 0.00019521855812602555, + "loss": 1.2333, + "step": 55000 + }, + { + "epoch": 0.11687600488452891, + "grad_norm": 0.44926634430885315, + "learning_rate": 0.00019521647701683237, + "loss": 1.1948, + "step": 55010 + }, + { + "epoch": 0.11689725120426796, + "grad_norm": 0.36020052433013916, + "learning_rate": 0.00019521439546593689, + "loss": 1.2594, + "step": 55020 + }, + { + "epoch": 0.11691849752400701, + "grad_norm": 0.586170494556427, + "learning_rate": 0.0001952123134733487, + "loss": 1.2017, + "step": 55030 + }, + { + "epoch": 0.11693974384374607, + "grad_norm": 0.5733214616775513, + "learning_rate": 0.00019521023103907754, + "loss": 1.2041, + "step": 55040 + }, + { + "epoch": 0.11696099016348512, + "grad_norm": 0.5091713666915894, + "learning_rate": 0.00019520814816313304, + "loss": 1.1963, + "step": 55050 + }, + { + "epoch": 0.11698223648322417, + "grad_norm": 0.36049985885620117, + "learning_rate": 0.00019520606484552485, + "loss": 1.2095, + "step": 55060 + }, + { + "epoch": 0.11700348280296323, + "grad_norm": 0.46974417567253113, + "learning_rate": 0.00019520398108626265, + "loss": 1.2117, + "step": 55070 + }, + { + "epoch": 0.11702472912270227, + "grad_norm": 0.4105778932571411, + "learning_rate": 0.0001952018968853561, + "loss": 1.2043, + "step": 55080 + }, + { + "epoch": 0.11704597544244133, + "grad_norm": 0.5163971781730652, + "learning_rate": 0.00019519981224281482, + "loss": 1.2319, + "step": 55090 + }, + { + "epoch": 0.11706722176218039, + "grad_norm": 0.3474569320678711, + "learning_rate": 0.00019519772715864857, + "loss": 1.1946, + "step": 55100 + }, + { + "epoch": 0.11708846808191943, + "grad_norm": 0.5645756125450134, + "learning_rate": 0.00019519564163286699, + "loss": 1.2027, + "step": 55110 + }, + { + "epoch": 0.11710971440165849, + "grad_norm": 0.41389453411102295, + "learning_rate": 0.00019519355566547972, + "loss": 1.1872, + "step": 55120 + }, + { + "epoch": 0.11713096072139755, + "grad_norm": 0.4656020700931549, + "learning_rate": 0.00019519146925649645, + "loss": 1.2165, + "step": 55130 + }, + { + "epoch": 0.11715220704113659, + "grad_norm": 0.43345263600349426, + "learning_rate": 0.0001951893824059269, + "loss": 1.227, + "step": 55140 + }, + { + "epoch": 0.11717345336087565, + "grad_norm": 0.347557008266449, + "learning_rate": 0.0001951872951137807, + "loss": 1.2087, + "step": 55150 + }, + { + "epoch": 0.1171946996806147, + "grad_norm": 0.3369697332382202, + "learning_rate": 0.00019518520738006755, + "loss": 1.1814, + "step": 55160 + }, + { + "epoch": 0.11721594600035375, + "grad_norm": 0.8848709464073181, + "learning_rate": 0.00019518311920479716, + "loss": 1.2364, + "step": 55170 + }, + { + "epoch": 0.1172371923200928, + "grad_norm": 0.44551631808280945, + "learning_rate": 0.0001951810305879792, + "loss": 1.2419, + "step": 55180 + }, + { + "epoch": 0.11725843863983186, + "grad_norm": 0.47559618949890137, + "learning_rate": 0.00019517894152962332, + "loss": 1.1801, + "step": 55190 + }, + { + "epoch": 0.1172796849595709, + "grad_norm": 0.5542007684707642, + "learning_rate": 0.00019517685202973925, + "loss": 1.209, + "step": 55200 + }, + { + "epoch": 0.11730093127930996, + "grad_norm": 0.3457120358943939, + "learning_rate": 0.0001951747620883367, + "loss": 1.1985, + "step": 55210 + }, + { + "epoch": 0.11732217759904902, + "grad_norm": 0.43746238946914673, + "learning_rate": 0.00019517267170542532, + "loss": 1.1929, + "step": 55220 + }, + { + "epoch": 0.11734342391878806, + "grad_norm": 0.35478323698043823, + "learning_rate": 0.00019517058088101483, + "loss": 1.2, + "step": 55230 + }, + { + "epoch": 0.11736467023852712, + "grad_norm": 0.3715648949146271, + "learning_rate": 0.00019516848961511496, + "loss": 1.2052, + "step": 55240 + }, + { + "epoch": 0.11738591655826618, + "grad_norm": 0.4129120707511902, + "learning_rate": 0.00019516639790773533, + "loss": 1.1756, + "step": 55250 + }, + { + "epoch": 0.11740716287800522, + "grad_norm": 0.35028114914894104, + "learning_rate": 0.00019516430575888577, + "loss": 1.2362, + "step": 55260 + }, + { + "epoch": 0.11742840919774428, + "grad_norm": 0.301724910736084, + "learning_rate": 0.00019516221316857584, + "loss": 1.226, + "step": 55270 + }, + { + "epoch": 0.11744965551748333, + "grad_norm": 0.3815803825855255, + "learning_rate": 0.00019516012013681534, + "loss": 1.1994, + "step": 55280 + }, + { + "epoch": 0.11747090183722238, + "grad_norm": 0.5352025628089905, + "learning_rate": 0.00019515802666361396, + "loss": 1.2863, + "step": 55290 + }, + { + "epoch": 0.11749214815696143, + "grad_norm": 0.46694231033325195, + "learning_rate": 0.0001951559327489814, + "loss": 1.1929, + "step": 55300 + }, + { + "epoch": 0.11751339447670049, + "grad_norm": 0.36573588848114014, + "learning_rate": 0.0001951538383929274, + "loss": 1.2015, + "step": 55310 + }, + { + "epoch": 0.11753464079643954, + "grad_norm": 0.31696775555610657, + "learning_rate": 0.00019515174359546162, + "loss": 1.1931, + "step": 55320 + }, + { + "epoch": 0.11755588711617859, + "grad_norm": 0.42170658707618713, + "learning_rate": 0.00019514964835659385, + "loss": 1.2257, + "step": 55330 + }, + { + "epoch": 0.11757713343591765, + "grad_norm": 0.3393196761608124, + "learning_rate": 0.00019514755267633374, + "loss": 1.2187, + "step": 55340 + }, + { + "epoch": 0.1175983797556567, + "grad_norm": 0.33845654129981995, + "learning_rate": 0.00019514545655469108, + "loss": 1.1981, + "step": 55350 + }, + { + "epoch": 0.11761962607539575, + "grad_norm": 0.44299471378326416, + "learning_rate": 0.00019514335999167552, + "loss": 1.2087, + "step": 55360 + }, + { + "epoch": 0.11764087239513481, + "grad_norm": 0.3742339015007019, + "learning_rate": 0.00019514126298729684, + "loss": 1.2276, + "step": 55370 + }, + { + "epoch": 0.11766211871487386, + "grad_norm": 0.4332398474216461, + "learning_rate": 0.00019513916554156477, + "loss": 1.1882, + "step": 55380 + }, + { + "epoch": 0.11768336503461291, + "grad_norm": 0.34312644600868225, + "learning_rate": 0.00019513706765448904, + "loss": 1.2121, + "step": 55390 + }, + { + "epoch": 0.11770461135435197, + "grad_norm": 0.5086438059806824, + "learning_rate": 0.00019513496932607933, + "loss": 1.1774, + "step": 55400 + }, + { + "epoch": 0.11772585767409102, + "grad_norm": 0.49141961336135864, + "learning_rate": 0.00019513287055634542, + "loss": 1.1739, + "step": 55410 + }, + { + "epoch": 0.11774710399383007, + "grad_norm": 0.3511594831943512, + "learning_rate": 0.000195130771345297, + "loss": 1.167, + "step": 55420 + }, + { + "epoch": 0.11776835031356912, + "grad_norm": 0.43179234862327576, + "learning_rate": 0.0001951286716929439, + "loss": 1.1941, + "step": 55430 + }, + { + "epoch": 0.11778959663330818, + "grad_norm": 0.3220602869987488, + "learning_rate": 0.00019512657159929578, + "loss": 1.2192, + "step": 55440 + }, + { + "epoch": 0.11781084295304722, + "grad_norm": 0.3960776925086975, + "learning_rate": 0.0001951244710643624, + "loss": 1.2196, + "step": 55450 + }, + { + "epoch": 0.11783208927278628, + "grad_norm": 0.3452523648738861, + "learning_rate": 0.00019512237008815348, + "loss": 1.203, + "step": 55460 + }, + { + "epoch": 0.11785333559252534, + "grad_norm": 0.37004679441452026, + "learning_rate": 0.00019512026867067882, + "loss": 1.2053, + "step": 55470 + }, + { + "epoch": 0.11787458191226438, + "grad_norm": 0.4323633313179016, + "learning_rate": 0.00019511816681194816, + "loss": 1.2036, + "step": 55480 + }, + { + "epoch": 0.11789582823200344, + "grad_norm": 0.3439529538154602, + "learning_rate": 0.00019511606451197124, + "loss": 1.1969, + "step": 55490 + }, + { + "epoch": 0.1179170745517425, + "grad_norm": 0.35699936747550964, + "learning_rate": 0.0001951139617707578, + "loss": 1.2134, + "step": 55500 + }, + { + "epoch": 0.11793832087148154, + "grad_norm": 0.3292331397533417, + "learning_rate": 0.00019511185858831759, + "loss": 1.192, + "step": 55510 + }, + { + "epoch": 0.1179595671912206, + "grad_norm": 0.4003482758998871, + "learning_rate": 0.00019510975496466037, + "loss": 1.2232, + "step": 55520 + }, + { + "epoch": 0.11798081351095965, + "grad_norm": 0.489934504032135, + "learning_rate": 0.0001951076508997959, + "loss": 1.1751, + "step": 55530 + }, + { + "epoch": 0.1180020598306987, + "grad_norm": 0.35945838689804077, + "learning_rate": 0.000195105546393734, + "loss": 1.2101, + "step": 55540 + }, + { + "epoch": 0.11802330615043775, + "grad_norm": 0.4084795117378235, + "learning_rate": 0.00019510344144648434, + "loss": 1.2146, + "step": 55550 + }, + { + "epoch": 0.11804455247017681, + "grad_norm": 0.4558325707912445, + "learning_rate": 0.0001951013360580567, + "loss": 1.2375, + "step": 55560 + }, + { + "epoch": 0.11806579878991585, + "grad_norm": 0.3413698375225067, + "learning_rate": 0.0001950992302284609, + "loss": 1.1897, + "step": 55570 + }, + { + "epoch": 0.11808704510965491, + "grad_norm": 0.3439413011074066, + "learning_rate": 0.00019509712395770668, + "loss": 1.2048, + "step": 55580 + }, + { + "epoch": 0.11810829142939397, + "grad_norm": 0.394396036863327, + "learning_rate": 0.00019509501724580383, + "loss": 1.2214, + "step": 55590 + }, + { + "epoch": 0.11812953774913301, + "grad_norm": 0.44994568824768066, + "learning_rate": 0.00019509291009276204, + "loss": 1.211, + "step": 55600 + }, + { + "epoch": 0.11815078406887207, + "grad_norm": 0.34521445631980896, + "learning_rate": 0.0001950908024985912, + "loss": 1.1963, + "step": 55610 + }, + { + "epoch": 0.11817203038861113, + "grad_norm": 0.4296328127384186, + "learning_rate": 0.00019508869446330102, + "loss": 1.2188, + "step": 55620 + }, + { + "epoch": 0.11819327670835017, + "grad_norm": 0.3406585156917572, + "learning_rate": 0.00019508658598690133, + "loss": 1.1955, + "step": 55630 + }, + { + "epoch": 0.11821452302808923, + "grad_norm": 0.32416483759880066, + "learning_rate": 0.00019508447706940182, + "loss": 1.2478, + "step": 55640 + }, + { + "epoch": 0.11823576934782828, + "grad_norm": 0.3960822522640228, + "learning_rate": 0.00019508236771081236, + "loss": 1.1893, + "step": 55650 + }, + { + "epoch": 0.11825701566756733, + "grad_norm": 0.32886338233947754, + "learning_rate": 0.00019508025791114267, + "loss": 1.2397, + "step": 55660 + }, + { + "epoch": 0.11827826198730639, + "grad_norm": 0.4212195575237274, + "learning_rate": 0.00019507814767040263, + "loss": 1.2009, + "step": 55670 + }, + { + "epoch": 0.11829950830704544, + "grad_norm": 0.39308401942253113, + "learning_rate": 0.00019507603698860193, + "loss": 1.1921, + "step": 55680 + }, + { + "epoch": 0.11832075462678449, + "grad_norm": 0.3691920042037964, + "learning_rate": 0.00019507392586575042, + "loss": 1.2111, + "step": 55690 + }, + { + "epoch": 0.11834200094652354, + "grad_norm": 0.3614768981933594, + "learning_rate": 0.00019507181430185785, + "loss": 1.1887, + "step": 55700 + }, + { + "epoch": 0.1183632472662626, + "grad_norm": 0.3362428843975067, + "learning_rate": 0.00019506970229693407, + "loss": 1.211, + "step": 55710 + }, + { + "epoch": 0.11838449358600164, + "grad_norm": 0.3511494994163513, + "learning_rate": 0.00019506758985098882, + "loss": 1.1622, + "step": 55720 + }, + { + "epoch": 0.1184057399057407, + "grad_norm": 0.5070692300796509, + "learning_rate": 0.00019506547696403195, + "loss": 1.2068, + "step": 55730 + }, + { + "epoch": 0.11842698622547976, + "grad_norm": 0.4581919312477112, + "learning_rate": 0.0001950633636360732, + "loss": 1.21, + "step": 55740 + }, + { + "epoch": 0.1184482325452188, + "grad_norm": 0.40354210138320923, + "learning_rate": 0.00019506124986712244, + "loss": 1.2167, + "step": 55750 + }, + { + "epoch": 0.11846947886495786, + "grad_norm": 0.4821908473968506, + "learning_rate": 0.00019505913565718945, + "loss": 1.2037, + "step": 55760 + }, + { + "epoch": 0.11849072518469692, + "grad_norm": 0.44402259588241577, + "learning_rate": 0.00019505702100628406, + "loss": 1.1987, + "step": 55770 + }, + { + "epoch": 0.11851197150443597, + "grad_norm": 0.47485360503196716, + "learning_rate": 0.00019505490591441603, + "loss": 1.2123, + "step": 55780 + }, + { + "epoch": 0.11853321782417502, + "grad_norm": 0.38980039954185486, + "learning_rate": 0.0001950527903815952, + "loss": 1.1822, + "step": 55790 + }, + { + "epoch": 0.11855446414391407, + "grad_norm": 0.4689936637878418, + "learning_rate": 0.00019505067440783136, + "loss": 1.2283, + "step": 55800 + }, + { + "epoch": 0.11857571046365313, + "grad_norm": 0.7507191300392151, + "learning_rate": 0.00019504855799313436, + "loss": 1.2328, + "step": 55810 + }, + { + "epoch": 0.11859695678339217, + "grad_norm": 0.5786362290382385, + "learning_rate": 0.000195046441137514, + "loss": 1.2009, + "step": 55820 + }, + { + "epoch": 0.11861820310313123, + "grad_norm": 0.48257535696029663, + "learning_rate": 0.0001950443238409801, + "loss": 1.2343, + "step": 55830 + }, + { + "epoch": 0.11863944942287029, + "grad_norm": 0.5266129970550537, + "learning_rate": 0.00019504220610354252, + "loss": 1.2233, + "step": 55840 + }, + { + "epoch": 0.11866069574260933, + "grad_norm": 0.37359076738357544, + "learning_rate": 0.00019504008792521103, + "loss": 1.2125, + "step": 55850 + }, + { + "epoch": 0.11868194206234839, + "grad_norm": 0.35765624046325684, + "learning_rate": 0.00019503796930599548, + "loss": 1.2126, + "step": 55860 + }, + { + "epoch": 0.11870318838208745, + "grad_norm": 0.34345656633377075, + "learning_rate": 0.00019503585024590567, + "loss": 1.2131, + "step": 55870 + }, + { + "epoch": 0.11872443470182649, + "grad_norm": 0.3337790369987488, + "learning_rate": 0.00019503373074495148, + "loss": 1.2371, + "step": 55880 + }, + { + "epoch": 0.11874568102156555, + "grad_norm": 0.3797248899936676, + "learning_rate": 0.0001950316108031427, + "loss": 1.2147, + "step": 55890 + }, + { + "epoch": 0.1187669273413046, + "grad_norm": 0.35152971744537354, + "learning_rate": 0.0001950294904204892, + "loss": 1.2216, + "step": 55900 + }, + { + "epoch": 0.11878817366104365, + "grad_norm": 0.3644471764564514, + "learning_rate": 0.0001950273695970008, + "loss": 1.239, + "step": 55910 + }, + { + "epoch": 0.1188094199807827, + "grad_norm": 0.36819443106651306, + "learning_rate": 0.00019502524833268732, + "loss": 1.216, + "step": 55920 + }, + { + "epoch": 0.11883066630052176, + "grad_norm": 0.3397957384586334, + "learning_rate": 0.00019502312662755862, + "loss": 1.1809, + "step": 55930 + }, + { + "epoch": 0.1188519126202608, + "grad_norm": 0.4117399752140045, + "learning_rate": 0.00019502100448162455, + "loss": 1.2189, + "step": 55940 + }, + { + "epoch": 0.11887315893999986, + "grad_norm": 0.3388478457927704, + "learning_rate": 0.00019501888189489496, + "loss": 1.2165, + "step": 55950 + }, + { + "epoch": 0.11889440525973892, + "grad_norm": 0.36757034063339233, + "learning_rate": 0.00019501675886737964, + "loss": 1.1933, + "step": 55960 + }, + { + "epoch": 0.11891565157947796, + "grad_norm": 0.41912952065467834, + "learning_rate": 0.0001950146353990885, + "loss": 1.2148, + "step": 55970 + }, + { + "epoch": 0.11893689789921702, + "grad_norm": 0.3381246328353882, + "learning_rate": 0.00019501251149003134, + "loss": 1.2349, + "step": 55980 + }, + { + "epoch": 0.11895814421895608, + "grad_norm": 0.3961070477962494, + "learning_rate": 0.00019501038714021807, + "loss": 1.2232, + "step": 55990 + }, + { + "epoch": 0.11897939053869512, + "grad_norm": 0.3921176791191101, + "learning_rate": 0.00019500826234965853, + "loss": 1.2049, + "step": 56000 + }, + { + "epoch": 0.11900063685843418, + "grad_norm": 0.5206471085548401, + "learning_rate": 0.00019500613711836252, + "loss": 1.2272, + "step": 56010 + }, + { + "epoch": 0.11902188317817324, + "grad_norm": 0.4021837115287781, + "learning_rate": 0.00019500401144633997, + "loss": 1.2319, + "step": 56020 + }, + { + "epoch": 0.11904312949791228, + "grad_norm": 0.4697909355163574, + "learning_rate": 0.0001950018853336007, + "loss": 1.2271, + "step": 56030 + }, + { + "epoch": 0.11906437581765134, + "grad_norm": 0.39308398962020874, + "learning_rate": 0.0001949997587801546, + "loss": 1.1946, + "step": 56040 + }, + { + "epoch": 0.11908562213739039, + "grad_norm": 0.338173508644104, + "learning_rate": 0.0001949976317860115, + "loss": 1.2436, + "step": 56050 + }, + { + "epoch": 0.11910686845712944, + "grad_norm": 0.4472600519657135, + "learning_rate": 0.00019499550435118131, + "loss": 1.1863, + "step": 56060 + }, + { + "epoch": 0.1191281147768685, + "grad_norm": 0.45277726650238037, + "learning_rate": 0.00019499337647567387, + "loss": 1.165, + "step": 56070 + }, + { + "epoch": 0.11914936109660755, + "grad_norm": 0.4391416311264038, + "learning_rate": 0.00019499124815949904, + "loss": 1.1735, + "step": 56080 + }, + { + "epoch": 0.1191706074163466, + "grad_norm": 0.4451081156730652, + "learning_rate": 0.00019498911940266672, + "loss": 1.2122, + "step": 56090 + }, + { + "epoch": 0.11919185373608565, + "grad_norm": 0.3448525369167328, + "learning_rate": 0.0001949869902051868, + "loss": 1.1802, + "step": 56100 + }, + { + "epoch": 0.11921310005582471, + "grad_norm": 0.41522058844566345, + "learning_rate": 0.00019498486056706912, + "loss": 1.2084, + "step": 56110 + }, + { + "epoch": 0.11923434637556375, + "grad_norm": 0.3627246022224426, + "learning_rate": 0.00019498273048832357, + "loss": 1.2042, + "step": 56120 + }, + { + "epoch": 0.11925559269530281, + "grad_norm": 0.49863216280937195, + "learning_rate": 0.00019498059996896002, + "loss": 1.214, + "step": 56130 + }, + { + "epoch": 0.11927683901504187, + "grad_norm": 0.47496625781059265, + "learning_rate": 0.0001949784690089884, + "loss": 1.2332, + "step": 56140 + }, + { + "epoch": 0.11929808533478091, + "grad_norm": 0.704565703868866, + "learning_rate": 0.00019497633760841851, + "loss": 1.209, + "step": 56150 + }, + { + "epoch": 0.11931933165451997, + "grad_norm": 0.48758599162101746, + "learning_rate": 0.00019497420576726032, + "loss": 1.1613, + "step": 56160 + }, + { + "epoch": 0.11934057797425902, + "grad_norm": 0.38127148151397705, + "learning_rate": 0.0001949720734855237, + "loss": 1.2175, + "step": 56170 + }, + { + "epoch": 0.11936182429399807, + "grad_norm": 0.3474550247192383, + "learning_rate": 0.00019496994076321852, + "loss": 1.2203, + "step": 56180 + }, + { + "epoch": 0.11938307061373712, + "grad_norm": 0.5882521867752075, + "learning_rate": 0.00019496780760035467, + "loss": 1.2252, + "step": 56190 + }, + { + "epoch": 0.11940431693347618, + "grad_norm": 0.4622008204460144, + "learning_rate": 0.00019496567399694207, + "loss": 1.212, + "step": 56200 + }, + { + "epoch": 0.11942556325321524, + "grad_norm": 0.34853824973106384, + "learning_rate": 0.00019496353995299061, + "loss": 1.2013, + "step": 56210 + }, + { + "epoch": 0.11944680957295428, + "grad_norm": 0.44252243638038635, + "learning_rate": 0.00019496140546851017, + "loss": 1.197, + "step": 56220 + }, + { + "epoch": 0.11946805589269334, + "grad_norm": 0.36792242527008057, + "learning_rate": 0.00019495927054351072, + "loss": 1.2227, + "step": 56230 + }, + { + "epoch": 0.1194893022124324, + "grad_norm": 0.5170416235923767, + "learning_rate": 0.00019495713517800206, + "loss": 1.2011, + "step": 56240 + }, + { + "epoch": 0.11951054853217144, + "grad_norm": 0.3949393332004547, + "learning_rate": 0.00019495499937199418, + "loss": 1.2126, + "step": 56250 + }, + { + "epoch": 0.1195317948519105, + "grad_norm": 0.5259349942207336, + "learning_rate": 0.00019495286312549694, + "loss": 1.232, + "step": 56260 + }, + { + "epoch": 0.11955304117164955, + "grad_norm": 0.5687379240989685, + "learning_rate": 0.00019495072643852028, + "loss": 1.177, + "step": 56270 + }, + { + "epoch": 0.1195742874913886, + "grad_norm": 0.3803199827671051, + "learning_rate": 0.00019494858931107408, + "loss": 1.1626, + "step": 56280 + }, + { + "epoch": 0.11959553381112766, + "grad_norm": 0.4161173105239868, + "learning_rate": 0.00019494645174316828, + "loss": 1.2047, + "step": 56290 + }, + { + "epoch": 0.11961678013086671, + "grad_norm": 0.4628751277923584, + "learning_rate": 0.00019494431373481277, + "loss": 1.2697, + "step": 56300 + }, + { + "epoch": 0.11963802645060576, + "grad_norm": 0.8498740792274475, + "learning_rate": 0.00019494217528601753, + "loss": 1.2324, + "step": 56310 + }, + { + "epoch": 0.11965927277034481, + "grad_norm": 0.582740306854248, + "learning_rate": 0.0001949400363967924, + "loss": 1.2232, + "step": 56320 + }, + { + "epoch": 0.11968051909008387, + "grad_norm": 0.44246870279312134, + "learning_rate": 0.00019493789706714732, + "loss": 1.2302, + "step": 56330 + }, + { + "epoch": 0.11970176540982291, + "grad_norm": 0.34074288606643677, + "learning_rate": 0.00019493575729709228, + "loss": 1.211, + "step": 56340 + }, + { + "epoch": 0.11972301172956197, + "grad_norm": 0.4275619089603424, + "learning_rate": 0.00019493361708663713, + "loss": 1.2129, + "step": 56350 + }, + { + "epoch": 0.11974425804930103, + "grad_norm": 0.5572340488433838, + "learning_rate": 0.00019493147643579183, + "loss": 1.2091, + "step": 56360 + }, + { + "epoch": 0.11976550436904007, + "grad_norm": 0.44186899065971375, + "learning_rate": 0.00019492933534456631, + "loss": 1.2239, + "step": 56370 + }, + { + "epoch": 0.11978675068877913, + "grad_norm": 0.40897446870803833, + "learning_rate": 0.00019492719381297051, + "loss": 1.1908, + "step": 56380 + }, + { + "epoch": 0.11980799700851819, + "grad_norm": 0.3462803363800049, + "learning_rate": 0.00019492505184101435, + "loss": 1.1884, + "step": 56390 + }, + { + "epoch": 0.11982924332825723, + "grad_norm": 0.4503011703491211, + "learning_rate": 0.00019492290942870776, + "loss": 1.2188, + "step": 56400 + }, + { + "epoch": 0.11985048964799629, + "grad_norm": 0.3604753613471985, + "learning_rate": 0.0001949207665760607, + "loss": 1.1956, + "step": 56410 + }, + { + "epoch": 0.11987173596773534, + "grad_norm": 0.3297182321548462, + "learning_rate": 0.0001949186232830831, + "loss": 1.2064, + "step": 56420 + }, + { + "epoch": 0.11989298228747439, + "grad_norm": 0.40783995389938354, + "learning_rate": 0.00019491647954978492, + "loss": 1.2224, + "step": 56430 + }, + { + "epoch": 0.11991422860721344, + "grad_norm": 0.37798988819122314, + "learning_rate": 0.00019491433537617605, + "loss": 1.1993, + "step": 56440 + }, + { + "epoch": 0.1199354749269525, + "grad_norm": 0.4526877701282501, + "learning_rate": 0.0001949121907622665, + "loss": 1.2268, + "step": 56450 + }, + { + "epoch": 0.11995672124669154, + "grad_norm": 0.41643333435058594, + "learning_rate": 0.00019491004570806617, + "loss": 1.205, + "step": 56460 + }, + { + "epoch": 0.1199779675664306, + "grad_norm": 0.6187468767166138, + "learning_rate": 0.00019490790021358504, + "loss": 1.2428, + "step": 56470 + }, + { + "epoch": 0.11999921388616966, + "grad_norm": 0.34196534752845764, + "learning_rate": 0.00019490575427883307, + "loss": 1.1988, + "step": 56480 + }, + { + "epoch": 0.1200204602059087, + "grad_norm": 0.4105073809623718, + "learning_rate": 0.00019490360790382018, + "loss": 1.2117, + "step": 56490 + }, + { + "epoch": 0.12004170652564776, + "grad_norm": 0.36865243315696716, + "learning_rate": 0.00019490146108855637, + "loss": 1.1741, + "step": 56500 + }, + { + "epoch": 0.12006295284538682, + "grad_norm": 0.35204192996025085, + "learning_rate": 0.00019489931383305157, + "loss": 1.1878, + "step": 56510 + }, + { + "epoch": 0.12008419916512586, + "grad_norm": 0.48516741394996643, + "learning_rate": 0.0001948971661373157, + "loss": 1.1729, + "step": 56520 + }, + { + "epoch": 0.12010544548486492, + "grad_norm": 0.3738390803337097, + "learning_rate": 0.00019489501800135884, + "loss": 1.1946, + "step": 56530 + }, + { + "epoch": 0.12012669180460397, + "grad_norm": 0.42806828022003174, + "learning_rate": 0.00019489286942519086, + "loss": 1.2161, + "step": 56540 + }, + { + "epoch": 0.12014793812434302, + "grad_norm": 0.35682249069213867, + "learning_rate": 0.00019489072040882173, + "loss": 1.1665, + "step": 56550 + }, + { + "epoch": 0.12016918444408208, + "grad_norm": 0.3557257652282715, + "learning_rate": 0.00019488857095226148, + "loss": 1.1925, + "step": 56560 + }, + { + "epoch": 0.12019043076382113, + "grad_norm": 0.3413209915161133, + "learning_rate": 0.00019488642105552, + "loss": 1.2401, + "step": 56570 + }, + { + "epoch": 0.12021167708356018, + "grad_norm": 0.33425626158714294, + "learning_rate": 0.00019488427071860732, + "loss": 1.1799, + "step": 56580 + }, + { + "epoch": 0.12023292340329923, + "grad_norm": 0.47207725048065186, + "learning_rate": 0.0001948821199415334, + "loss": 1.2291, + "step": 56590 + }, + { + "epoch": 0.12025416972303829, + "grad_norm": 0.4241943955421448, + "learning_rate": 0.00019487996872430828, + "loss": 1.1829, + "step": 56600 + }, + { + "epoch": 0.12027541604277733, + "grad_norm": 0.3978102505207062, + "learning_rate": 0.0001948778170669418, + "loss": 1.2016, + "step": 56610 + }, + { + "epoch": 0.12029666236251639, + "grad_norm": 0.3442418873310089, + "learning_rate": 0.00019487566496944404, + "loss": 1.2328, + "step": 56620 + }, + { + "epoch": 0.12031790868225545, + "grad_norm": 0.3594733476638794, + "learning_rate": 0.000194873512431825, + "loss": 1.2128, + "step": 56630 + }, + { + "epoch": 0.1203391550019945, + "grad_norm": 0.4543834924697876, + "learning_rate": 0.00019487135945409458, + "loss": 1.1746, + "step": 56640 + }, + { + "epoch": 0.12036040132173355, + "grad_norm": 0.3414407968521118, + "learning_rate": 0.0001948692060362628, + "loss": 1.2275, + "step": 56650 + }, + { + "epoch": 0.1203816476414726, + "grad_norm": 0.36709755659103394, + "learning_rate": 0.00019486705217833974, + "loss": 1.2081, + "step": 56660 + }, + { + "epoch": 0.12040289396121166, + "grad_norm": 0.4309971332550049, + "learning_rate": 0.00019486489788033526, + "loss": 1.1914, + "step": 56670 + }, + { + "epoch": 0.1204241402809507, + "grad_norm": 0.3755400478839874, + "learning_rate": 0.00019486274314225942, + "loss": 1.2294, + "step": 56680 + }, + { + "epoch": 0.12044538660068976, + "grad_norm": 0.4984561502933502, + "learning_rate": 0.00019486058796412223, + "loss": 1.2226, + "step": 56690 + }, + { + "epoch": 0.12046663292042882, + "grad_norm": 0.4861827492713928, + "learning_rate": 0.00019485843234593364, + "loss": 1.2179, + "step": 56700 + }, + { + "epoch": 0.12048787924016786, + "grad_norm": 0.4899511933326721, + "learning_rate": 0.00019485627628770366, + "loss": 1.2581, + "step": 56710 + }, + { + "epoch": 0.12050912555990692, + "grad_norm": 0.5923722982406616, + "learning_rate": 0.00019485411978944234, + "loss": 1.194, + "step": 56720 + }, + { + "epoch": 0.12053037187964598, + "grad_norm": 0.31588858366012573, + "learning_rate": 0.00019485196285115962, + "loss": 1.2, + "step": 56730 + }, + { + "epoch": 0.12055161819938502, + "grad_norm": 0.46463218331336975, + "learning_rate": 0.00019484980547286557, + "loss": 1.2265, + "step": 56740 + }, + { + "epoch": 0.12057286451912408, + "grad_norm": 0.36574587225914, + "learning_rate": 0.00019484764765457016, + "loss": 1.2325, + "step": 56750 + }, + { + "epoch": 0.12059411083886314, + "grad_norm": 0.7421652674674988, + "learning_rate": 0.00019484548939628334, + "loss": 1.2431, + "step": 56760 + }, + { + "epoch": 0.12061535715860218, + "grad_norm": 0.48394855856895447, + "learning_rate": 0.00019484333069801527, + "loss": 1.1966, + "step": 56770 + }, + { + "epoch": 0.12063660347834124, + "grad_norm": 0.453742653131485, + "learning_rate": 0.0001948411715597758, + "loss": 1.2141, + "step": 56780 + }, + { + "epoch": 0.1206578497980803, + "grad_norm": 0.33880215883255005, + "learning_rate": 0.0001948390119815751, + "loss": 1.2113, + "step": 56790 + }, + { + "epoch": 0.12067909611781934, + "grad_norm": 0.33592453598976135, + "learning_rate": 0.00019483685196342308, + "loss": 1.1873, + "step": 56800 + }, + { + "epoch": 0.1207003424375584, + "grad_norm": 0.3917711675167084, + "learning_rate": 0.00019483469150532976, + "loss": 1.2241, + "step": 56810 + }, + { + "epoch": 0.12072158875729745, + "grad_norm": 0.3249267041683197, + "learning_rate": 0.00019483253060730525, + "loss": 1.1999, + "step": 56820 + }, + { + "epoch": 0.1207428350770365, + "grad_norm": 0.33806154131889343, + "learning_rate": 0.00019483036926935948, + "loss": 1.1923, + "step": 56830 + }, + { + "epoch": 0.12076408139677555, + "grad_norm": 0.4287015199661255, + "learning_rate": 0.00019482820749150252, + "loss": 1.1941, + "step": 56840 + }, + { + "epoch": 0.12078532771651461, + "grad_norm": 0.348627507686615, + "learning_rate": 0.00019482604527374444, + "loss": 1.2096, + "step": 56850 + }, + { + "epoch": 0.12080657403625365, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.00019482388261609517, + "loss": 1.2293, + "step": 56860 + }, + { + "epoch": 0.12082782035599271, + "grad_norm": 0.47283488512039185, + "learning_rate": 0.00019482171951856482, + "loss": 1.1945, + "step": 56870 + }, + { + "epoch": 0.12084906667573177, + "grad_norm": 0.4299822151660919, + "learning_rate": 0.00019481955598116342, + "loss": 1.1865, + "step": 56880 + }, + { + "epoch": 0.12087031299547081, + "grad_norm": 0.37171053886413574, + "learning_rate": 0.00019481739200390098, + "loss": 1.1777, + "step": 56890 + }, + { + "epoch": 0.12089155931520987, + "grad_norm": 0.3539045453071594, + "learning_rate": 0.0001948152275867875, + "loss": 1.2303, + "step": 56900 + }, + { + "epoch": 0.12091280563494892, + "grad_norm": 0.3087972402572632, + "learning_rate": 0.0001948130627298331, + "loss": 1.1899, + "step": 56910 + }, + { + "epoch": 0.12093405195468797, + "grad_norm": 0.4731426537036896, + "learning_rate": 0.0001948108974330478, + "loss": 1.2139, + "step": 56920 + }, + { + "epoch": 0.12095529827442703, + "grad_norm": 0.4078727662563324, + "learning_rate": 0.00019480873169644162, + "loss": 1.2495, + "step": 56930 + }, + { + "epoch": 0.12097654459416608, + "grad_norm": 0.4088551104068756, + "learning_rate": 0.00019480656552002464, + "loss": 1.2032, + "step": 56940 + }, + { + "epoch": 0.12099779091390513, + "grad_norm": 0.38544952869415283, + "learning_rate": 0.00019480439890380688, + "loss": 1.1705, + "step": 56950 + }, + { + "epoch": 0.12101903723364418, + "grad_norm": 0.47684645652770996, + "learning_rate": 0.0001948022318477984, + "loss": 1.1914, + "step": 56960 + }, + { + "epoch": 0.12104028355338324, + "grad_norm": 0.7335971593856812, + "learning_rate": 0.00019480006435200923, + "loss": 1.1744, + "step": 56970 + }, + { + "epoch": 0.12106152987312228, + "grad_norm": 0.33572283387184143, + "learning_rate": 0.0001947978964164495, + "loss": 1.2145, + "step": 56980 + }, + { + "epoch": 0.12108277619286134, + "grad_norm": 0.4340001046657562, + "learning_rate": 0.00019479572804112917, + "loss": 1.2239, + "step": 56990 + }, + { + "epoch": 0.1211040225126004, + "grad_norm": 0.3473954200744629, + "learning_rate": 0.00019479355922605838, + "loss": 1.1741, + "step": 57000 + }, + { + "epoch": 0.12112526883233944, + "grad_norm": 0.3430631458759308, + "learning_rate": 0.00019479138997124712, + "loss": 1.2099, + "step": 57010 + }, + { + "epoch": 0.1211465151520785, + "grad_norm": 0.385551393032074, + "learning_rate": 0.00019478922027670552, + "loss": 1.1914, + "step": 57020 + }, + { + "epoch": 0.12116776147181756, + "grad_norm": 0.3586744964122772, + "learning_rate": 0.00019478705014244359, + "loss": 1.1492, + "step": 57030 + }, + { + "epoch": 0.1211890077915566, + "grad_norm": 0.37098202109336853, + "learning_rate": 0.00019478487956847143, + "loss": 1.2344, + "step": 57040 + }, + { + "epoch": 0.12121025411129566, + "grad_norm": 0.31762954592704773, + "learning_rate": 0.0001947827085547991, + "loss": 1.2194, + "step": 57050 + }, + { + "epoch": 0.12123150043103471, + "grad_norm": 0.34016352891921997, + "learning_rate": 0.00019478053710143668, + "loss": 1.1952, + "step": 57060 + }, + { + "epoch": 0.12125274675077377, + "grad_norm": 0.502906858921051, + "learning_rate": 0.0001947783652083942, + "loss": 1.2154, + "step": 57070 + }, + { + "epoch": 0.12127399307051281, + "grad_norm": 0.4282011091709137, + "learning_rate": 0.0001947761928756818, + "loss": 1.2212, + "step": 57080 + }, + { + "epoch": 0.12129523939025187, + "grad_norm": 0.3749598562717438, + "learning_rate": 0.00019477402010330952, + "loss": 1.2304, + "step": 57090 + }, + { + "epoch": 0.12131648570999093, + "grad_norm": 0.4344439208507538, + "learning_rate": 0.00019477184689128745, + "loss": 1.2342, + "step": 57100 + }, + { + "epoch": 0.12133773202972997, + "grad_norm": 0.33373069763183594, + "learning_rate": 0.00019476967323962566, + "loss": 1.2286, + "step": 57110 + }, + { + "epoch": 0.12135897834946903, + "grad_norm": 0.36990365386009216, + "learning_rate": 0.00019476749914833422, + "loss": 1.224, + "step": 57120 + }, + { + "epoch": 0.12138022466920809, + "grad_norm": 0.4887876510620117, + "learning_rate": 0.00019476532461742328, + "loss": 1.2221, + "step": 57130 + }, + { + "epoch": 0.12140147098894713, + "grad_norm": 0.3813793957233429, + "learning_rate": 0.00019476314964690287, + "loss": 1.2184, + "step": 57140 + }, + { + "epoch": 0.12142271730868619, + "grad_norm": 0.31448981165885925, + "learning_rate": 0.00019476097423678308, + "loss": 1.2001, + "step": 57150 + }, + { + "epoch": 0.12144396362842524, + "grad_norm": 0.4813230335712433, + "learning_rate": 0.00019475879838707402, + "loss": 1.2, + "step": 57160 + }, + { + "epoch": 0.12146520994816429, + "grad_norm": 0.4293104410171509, + "learning_rate": 0.00019475662209778575, + "loss": 1.2128, + "step": 57170 + }, + { + "epoch": 0.12148645626790334, + "grad_norm": 0.3692798912525177, + "learning_rate": 0.00019475444536892845, + "loss": 1.2074, + "step": 57180 + }, + { + "epoch": 0.1215077025876424, + "grad_norm": 0.43577897548675537, + "learning_rate": 0.00019475226820051211, + "loss": 1.2291, + "step": 57190 + }, + { + "epoch": 0.12152894890738145, + "grad_norm": 0.3899393379688263, + "learning_rate": 0.0001947500905925469, + "loss": 1.1724, + "step": 57200 + }, + { + "epoch": 0.1215501952271205, + "grad_norm": 0.33434855937957764, + "learning_rate": 0.00019474791254504292, + "loss": 1.2138, + "step": 57210 + }, + { + "epoch": 0.12157144154685956, + "grad_norm": 0.39857563376426697, + "learning_rate": 0.00019474573405801025, + "loss": 1.2069, + "step": 57220 + }, + { + "epoch": 0.1215926878665986, + "grad_norm": 0.32966330647468567, + "learning_rate": 0.00019474355513145902, + "loss": 1.2156, + "step": 57230 + }, + { + "epoch": 0.12161393418633766, + "grad_norm": 0.35723012685775757, + "learning_rate": 0.00019474137576539925, + "loss": 1.1837, + "step": 57240 + }, + { + "epoch": 0.12163518050607672, + "grad_norm": 0.3452502489089966, + "learning_rate": 0.00019473919595984122, + "loss": 1.1779, + "step": 57250 + }, + { + "epoch": 0.12165642682581576, + "grad_norm": 0.39292436838150024, + "learning_rate": 0.00019473701571479487, + "loss": 1.1715, + "step": 57260 + }, + { + "epoch": 0.12167767314555482, + "grad_norm": 0.3977205753326416, + "learning_rate": 0.00019473483503027042, + "loss": 1.2028, + "step": 57270 + }, + { + "epoch": 0.12169891946529388, + "grad_norm": 0.4129056930541992, + "learning_rate": 0.00019473265390627793, + "loss": 1.225, + "step": 57280 + }, + { + "epoch": 0.12172016578503292, + "grad_norm": 0.3466992676258087, + "learning_rate": 0.00019473047234282756, + "loss": 1.1749, + "step": 57290 + }, + { + "epoch": 0.12174141210477198, + "grad_norm": 0.4052610397338867, + "learning_rate": 0.0001947282903399294, + "loss": 1.2599, + "step": 57300 + }, + { + "epoch": 0.12176265842451103, + "grad_norm": 0.40301644802093506, + "learning_rate": 0.00019472610789759358, + "loss": 1.2089, + "step": 57310 + }, + { + "epoch": 0.12178390474425008, + "grad_norm": 0.32102420926094055, + "learning_rate": 0.00019472392501583024, + "loss": 1.241, + "step": 57320 + }, + { + "epoch": 0.12180515106398913, + "grad_norm": 0.43691152334213257, + "learning_rate": 0.00019472174169464946, + "loss": 1.2046, + "step": 57330 + }, + { + "epoch": 0.12182639738372819, + "grad_norm": 0.4736135005950928, + "learning_rate": 0.00019471955793406144, + "loss": 1.2285, + "step": 57340 + }, + { + "epoch": 0.12184764370346723, + "grad_norm": 0.6419433951377869, + "learning_rate": 0.00019471737373407625, + "loss": 1.2006, + "step": 57350 + }, + { + "epoch": 0.12186889002320629, + "grad_norm": 0.4314805567264557, + "learning_rate": 0.00019471518909470404, + "loss": 1.2007, + "step": 57360 + }, + { + "epoch": 0.12189013634294535, + "grad_norm": 0.30584320425987244, + "learning_rate": 0.00019471300401595497, + "loss": 1.207, + "step": 57370 + }, + { + "epoch": 0.12191138266268439, + "grad_norm": 0.37229764461517334, + "learning_rate": 0.00019471081849783915, + "loss": 1.2418, + "step": 57380 + }, + { + "epoch": 0.12193262898242345, + "grad_norm": 0.3558831512928009, + "learning_rate": 0.00019470863254036674, + "loss": 1.2089, + "step": 57390 + }, + { + "epoch": 0.1219538753021625, + "grad_norm": 0.40367579460144043, + "learning_rate": 0.0001947064461435478, + "loss": 1.2205, + "step": 57400 + }, + { + "epoch": 0.12197512162190155, + "grad_norm": 0.590538501739502, + "learning_rate": 0.0001947042593073926, + "loss": 1.2224, + "step": 57410 + }, + { + "epoch": 0.12199636794164061, + "grad_norm": 0.32494691014289856, + "learning_rate": 0.0001947020720319112, + "loss": 1.2197, + "step": 57420 + }, + { + "epoch": 0.12201761426137966, + "grad_norm": 0.36051633954048157, + "learning_rate": 0.00019469988431711376, + "loss": 1.2433, + "step": 57430 + }, + { + "epoch": 0.12203886058111871, + "grad_norm": 0.3675197660923004, + "learning_rate": 0.00019469769616301045, + "loss": 1.219, + "step": 57440 + }, + { + "epoch": 0.12206010690085776, + "grad_norm": 0.33510124683380127, + "learning_rate": 0.0001946955075696114, + "loss": 1.1949, + "step": 57450 + }, + { + "epoch": 0.12208135322059682, + "grad_norm": 0.4652650058269501, + "learning_rate": 0.00019469331853692678, + "loss": 1.1881, + "step": 57460 + }, + { + "epoch": 0.12210259954033588, + "grad_norm": 0.4192184805870056, + "learning_rate": 0.0001946911290649667, + "loss": 1.2201, + "step": 57470 + }, + { + "epoch": 0.12212384586007492, + "grad_norm": 0.3309726119041443, + "learning_rate": 0.0001946889391537414, + "loss": 1.2008, + "step": 57480 + }, + { + "epoch": 0.12214509217981398, + "grad_norm": 0.35672351717948914, + "learning_rate": 0.00019468674880326097, + "loss": 1.2015, + "step": 57490 + }, + { + "epoch": 0.12216633849955304, + "grad_norm": 0.43826109170913696, + "learning_rate": 0.0001946845580135356, + "loss": 1.1922, + "step": 57500 + }, + { + "epoch": 0.12218758481929208, + "grad_norm": 0.4612908363342285, + "learning_rate": 0.00019468236678457543, + "loss": 1.2274, + "step": 57510 + }, + { + "epoch": 0.12220883113903114, + "grad_norm": 0.37018194794654846, + "learning_rate": 0.00019468017511639065, + "loss": 1.2169, + "step": 57520 + }, + { + "epoch": 0.1222300774587702, + "grad_norm": 0.3481687307357788, + "learning_rate": 0.0001946779830089914, + "loss": 1.2121, + "step": 57530 + }, + { + "epoch": 0.12225132377850924, + "grad_norm": 0.3205099403858185, + "learning_rate": 0.00019467579046238788, + "loss": 1.2315, + "step": 57540 + }, + { + "epoch": 0.1222725700982483, + "grad_norm": 0.42688730359077454, + "learning_rate": 0.00019467359747659023, + "loss": 1.211, + "step": 57550 + }, + { + "epoch": 0.12229381641798735, + "grad_norm": 0.33263787627220154, + "learning_rate": 0.00019467140405160865, + "loss": 1.1904, + "step": 57560 + }, + { + "epoch": 0.1223150627377264, + "grad_norm": 0.35647138953208923, + "learning_rate": 0.0001946692101874533, + "loss": 1.2301, + "step": 57570 + }, + { + "epoch": 0.12233630905746545, + "grad_norm": 0.5592330694198608, + "learning_rate": 0.00019466701588413435, + "loss": 1.1986, + "step": 57580 + }, + { + "epoch": 0.12235755537720451, + "grad_norm": 0.3395279347896576, + "learning_rate": 0.00019466482114166204, + "loss": 1.231, + "step": 57590 + }, + { + "epoch": 0.12237880169694355, + "grad_norm": 0.3633936941623688, + "learning_rate": 0.00019466262596004645, + "loss": 1.2295, + "step": 57600 + }, + { + "epoch": 0.12240004801668261, + "grad_norm": 0.2984895408153534, + "learning_rate": 0.00019466043033929782, + "loss": 1.181, + "step": 57610 + }, + { + "epoch": 0.12242129433642167, + "grad_norm": 0.4071109890937805, + "learning_rate": 0.00019465823427942635, + "loss": 1.2446, + "step": 57620 + }, + { + "epoch": 0.12244254065616071, + "grad_norm": 0.34661754965782166, + "learning_rate": 0.0001946560377804422, + "loss": 1.2314, + "step": 57630 + }, + { + "epoch": 0.12246378697589977, + "grad_norm": 0.4030947983264923, + "learning_rate": 0.00019465384084235555, + "loss": 1.2496, + "step": 57640 + }, + { + "epoch": 0.12248503329563883, + "grad_norm": 0.5961849689483643, + "learning_rate": 0.00019465164346517663, + "loss": 1.1856, + "step": 57650 + }, + { + "epoch": 0.12250627961537787, + "grad_norm": 0.325636088848114, + "learning_rate": 0.00019464944564891564, + "loss": 1.2243, + "step": 57660 + }, + { + "epoch": 0.12252752593511693, + "grad_norm": 0.34902235865592957, + "learning_rate": 0.00019464724739358272, + "loss": 1.1925, + "step": 57670 + }, + { + "epoch": 0.12254877225485598, + "grad_norm": 0.4962388873100281, + "learning_rate": 0.00019464504869918806, + "loss": 1.2441, + "step": 57680 + }, + { + "epoch": 0.12257001857459503, + "grad_norm": 0.4538576006889343, + "learning_rate": 0.00019464284956574192, + "loss": 1.2176, + "step": 57690 + }, + { + "epoch": 0.12259126489433408, + "grad_norm": 0.38431960344314575, + "learning_rate": 0.0001946406499932545, + "loss": 1.2158, + "step": 57700 + }, + { + "epoch": 0.12261251121407314, + "grad_norm": 0.30861973762512207, + "learning_rate": 0.00019463844998173597, + "loss": 1.1936, + "step": 57710 + }, + { + "epoch": 0.12263375753381218, + "grad_norm": 0.40994468331336975, + "learning_rate": 0.00019463624953119653, + "loss": 1.2249, + "step": 57720 + }, + { + "epoch": 0.12265500385355124, + "grad_norm": 0.3965461552143097, + "learning_rate": 0.0001946340486416464, + "loss": 1.1871, + "step": 57730 + }, + { + "epoch": 0.1226762501732903, + "grad_norm": 0.48390674591064453, + "learning_rate": 0.0001946318473130958, + "loss": 1.1818, + "step": 57740 + }, + { + "epoch": 0.12269749649302934, + "grad_norm": 0.38137200474739075, + "learning_rate": 0.00019462964554555496, + "loss": 1.2272, + "step": 57750 + }, + { + "epoch": 0.1227187428127684, + "grad_norm": 0.35434868931770325, + "learning_rate": 0.00019462744333903405, + "loss": 1.2357, + "step": 57760 + }, + { + "epoch": 0.12273998913250746, + "grad_norm": 0.3556872010231018, + "learning_rate": 0.00019462524069354328, + "loss": 1.2109, + "step": 57770 + }, + { + "epoch": 0.1227612354522465, + "grad_norm": 0.35309645533561707, + "learning_rate": 0.00019462303760909292, + "loss": 1.2438, + "step": 57780 + }, + { + "epoch": 0.12278248177198556, + "grad_norm": 0.32881760597229004, + "learning_rate": 0.00019462083408569315, + "loss": 1.1892, + "step": 57790 + }, + { + "epoch": 0.12280372809172461, + "grad_norm": 0.4359093904495239, + "learning_rate": 0.0001946186301233542, + "loss": 1.2264, + "step": 57800 + }, + { + "epoch": 0.12282497441146366, + "grad_norm": 0.3680889904499054, + "learning_rate": 0.00019461642572208635, + "loss": 1.2095, + "step": 57810 + }, + { + "epoch": 0.12284622073120272, + "grad_norm": 0.478940486907959, + "learning_rate": 0.00019461422088189973, + "loss": 1.2097, + "step": 57820 + }, + { + "epoch": 0.12286746705094177, + "grad_norm": 0.3547939956188202, + "learning_rate": 0.00019461201560280462, + "loss": 1.2027, + "step": 57830 + }, + { + "epoch": 0.12288871337068082, + "grad_norm": 0.6580577492713928, + "learning_rate": 0.00019460980988481125, + "loss": 1.1809, + "step": 57840 + }, + { + "epoch": 0.12290995969041987, + "grad_norm": 0.37116092443466187, + "learning_rate": 0.00019460760372792983, + "loss": 1.2044, + "step": 57850 + }, + { + "epoch": 0.12293120601015893, + "grad_norm": 0.3336234986782074, + "learning_rate": 0.00019460539713217064, + "loss": 1.1927, + "step": 57860 + }, + { + "epoch": 0.12295245232989797, + "grad_norm": 0.38333433866500854, + "learning_rate": 0.00019460319009754383, + "loss": 1.2053, + "step": 57870 + }, + { + "epoch": 0.12297369864963703, + "grad_norm": 0.34826403856277466, + "learning_rate": 0.00019460098262405974, + "loss": 1.2801, + "step": 57880 + }, + { + "epoch": 0.12299494496937609, + "grad_norm": 0.43851232528686523, + "learning_rate": 0.00019459877471172856, + "loss": 1.2285, + "step": 57890 + }, + { + "epoch": 0.12301619128911515, + "grad_norm": 0.3799078166484833, + "learning_rate": 0.0001945965663605605, + "loss": 1.2339, + "step": 57900 + }, + { + "epoch": 0.12303743760885419, + "grad_norm": 0.40113183856010437, + "learning_rate": 0.0001945943575705659, + "loss": 1.1797, + "step": 57910 + }, + { + "epoch": 0.12305868392859325, + "grad_norm": 0.564729630947113, + "learning_rate": 0.0001945921483417549, + "loss": 1.225, + "step": 57920 + }, + { + "epoch": 0.1230799302483323, + "grad_norm": 0.38708552718162537, + "learning_rate": 0.00019458993867413782, + "loss": 1.2071, + "step": 57930 + }, + { + "epoch": 0.12310117656807135, + "grad_norm": 0.44404834508895874, + "learning_rate": 0.00019458772856772487, + "loss": 1.2327, + "step": 57940 + }, + { + "epoch": 0.1231224228878104, + "grad_norm": 0.41768303513526917, + "learning_rate": 0.00019458551802252632, + "loss": 1.1602, + "step": 57950 + }, + { + "epoch": 0.12314366920754946, + "grad_norm": 0.49226808547973633, + "learning_rate": 0.00019458330703855242, + "loss": 1.2276, + "step": 57960 + }, + { + "epoch": 0.1231649155272885, + "grad_norm": 0.3199482560157776, + "learning_rate": 0.00019458109561581346, + "loss": 1.2093, + "step": 57970 + }, + { + "epoch": 0.12318616184702756, + "grad_norm": 0.3474564850330353, + "learning_rate": 0.00019457888375431966, + "loss": 1.1681, + "step": 57980 + }, + { + "epoch": 0.12320740816676662, + "grad_norm": 0.34787634015083313, + "learning_rate": 0.00019457667145408126, + "loss": 1.2057, + "step": 57990 + }, + { + "epoch": 0.12322865448650566, + "grad_norm": 0.3323448896408081, + "learning_rate": 0.00019457445871510855, + "loss": 1.2097, + "step": 58000 + }, + { + "epoch": 0.12324990080624472, + "grad_norm": 0.4460098147392273, + "learning_rate": 0.00019457224553741183, + "loss": 1.2159, + "step": 58010 + }, + { + "epoch": 0.12327114712598378, + "grad_norm": 0.37265482544898987, + "learning_rate": 0.00019457003192100133, + "loss": 1.1942, + "step": 58020 + }, + { + "epoch": 0.12329239344572282, + "grad_norm": 0.5240738987922668, + "learning_rate": 0.0001945678178658873, + "loss": 1.1906, + "step": 58030 + }, + { + "epoch": 0.12331363976546188, + "grad_norm": 0.3404009938240051, + "learning_rate": 0.00019456560337208006, + "loss": 1.1994, + "step": 58040 + }, + { + "epoch": 0.12333488608520093, + "grad_norm": 0.34347838163375854, + "learning_rate": 0.00019456338843958982, + "loss": 1.2363, + "step": 58050 + }, + { + "epoch": 0.12335613240493998, + "grad_norm": 0.37687617540359497, + "learning_rate": 0.0001945611730684269, + "loss": 1.1758, + "step": 58060 + }, + { + "epoch": 0.12337737872467903, + "grad_norm": 0.32821178436279297, + "learning_rate": 0.0001945589572586016, + "loss": 1.2056, + "step": 58070 + }, + { + "epoch": 0.12339862504441809, + "grad_norm": 0.460608571767807, + "learning_rate": 0.00019455674101012416, + "loss": 1.2006, + "step": 58080 + }, + { + "epoch": 0.12341987136415714, + "grad_norm": 0.39299261569976807, + "learning_rate": 0.00019455452432300482, + "loss": 1.1806, + "step": 58090 + }, + { + "epoch": 0.12344111768389619, + "grad_norm": 0.38317227363586426, + "learning_rate": 0.00019455230719725398, + "loss": 1.2144, + "step": 58100 + }, + { + "epoch": 0.12346236400363525, + "grad_norm": 0.363502562046051, + "learning_rate": 0.0001945500896328818, + "loss": 1.231, + "step": 58110 + }, + { + "epoch": 0.1234836103233743, + "grad_norm": 0.35741594433784485, + "learning_rate": 0.00019454787162989865, + "loss": 1.2239, + "step": 58120 + }, + { + "epoch": 0.12350485664311335, + "grad_norm": 0.5294461250305176, + "learning_rate": 0.00019454565318831479, + "loss": 1.1666, + "step": 58130 + }, + { + "epoch": 0.12352610296285241, + "grad_norm": 0.3228849172592163, + "learning_rate": 0.0001945434343081405, + "loss": 1.2227, + "step": 58140 + }, + { + "epoch": 0.12354734928259145, + "grad_norm": 0.38964465260505676, + "learning_rate": 0.00019454121498938608, + "loss": 1.2385, + "step": 58150 + }, + { + "epoch": 0.12356859560233051, + "grad_norm": 0.37790149450302124, + "learning_rate": 0.00019453899523206184, + "loss": 1.1878, + "step": 58160 + }, + { + "epoch": 0.12358984192206957, + "grad_norm": 0.5823348164558411, + "learning_rate": 0.0001945367750361781, + "loss": 1.1927, + "step": 58170 + }, + { + "epoch": 0.12361108824180861, + "grad_norm": 0.5046330094337463, + "learning_rate": 0.00019453455440174508, + "loss": 1.1808, + "step": 58180 + }, + { + "epoch": 0.12363233456154767, + "grad_norm": 0.40421155095100403, + "learning_rate": 0.0001945323333287732, + "loss": 1.192, + "step": 58190 + }, + { + "epoch": 0.12365358088128672, + "grad_norm": 0.3643091917037964, + "learning_rate": 0.00019453011181727261, + "loss": 1.1957, + "step": 58200 + }, + { + "epoch": 0.12367482720102577, + "grad_norm": 0.3249781131744385, + "learning_rate": 0.00019452788986725375, + "loss": 1.2178, + "step": 58210 + }, + { + "epoch": 0.12369607352076482, + "grad_norm": 0.38439348340034485, + "learning_rate": 0.00019452566747872686, + "loss": 1.1975, + "step": 58220 + }, + { + "epoch": 0.12371731984050388, + "grad_norm": 0.3493238091468811, + "learning_rate": 0.00019452344465170228, + "loss": 1.2244, + "step": 58230 + }, + { + "epoch": 0.12373856616024292, + "grad_norm": 0.32224857807159424, + "learning_rate": 0.0001945212213861903, + "loss": 1.2007, + "step": 58240 + }, + { + "epoch": 0.12375981247998198, + "grad_norm": 0.3703421652317047, + "learning_rate": 0.00019451899768220123, + "loss": 1.2136, + "step": 58250 + }, + { + "epoch": 0.12378105879972104, + "grad_norm": 0.45744192600250244, + "learning_rate": 0.00019451677353974538, + "loss": 1.2444, + "step": 58260 + }, + { + "epoch": 0.12380230511946008, + "grad_norm": 0.3919408321380615, + "learning_rate": 0.00019451454895883313, + "loss": 1.2037, + "step": 58270 + }, + { + "epoch": 0.12382355143919914, + "grad_norm": 0.3817576467990875, + "learning_rate": 0.00019451232393947473, + "loss": 1.1941, + "step": 58280 + }, + { + "epoch": 0.1238447977589382, + "grad_norm": 0.3752637207508087, + "learning_rate": 0.00019451009848168053, + "loss": 1.197, + "step": 58290 + }, + { + "epoch": 0.12386604407867724, + "grad_norm": 0.3744973838329315, + "learning_rate": 0.00019450787258546082, + "loss": 1.2165, + "step": 58300 + }, + { + "epoch": 0.1238872903984163, + "grad_norm": 0.34201815724372864, + "learning_rate": 0.000194505646250826, + "loss": 1.2141, + "step": 58310 + }, + { + "epoch": 0.12390853671815535, + "grad_norm": 0.5542741417884827, + "learning_rate": 0.00019450341947778633, + "loss": 1.2319, + "step": 58320 + }, + { + "epoch": 0.12392978303789441, + "grad_norm": 0.6045094728469849, + "learning_rate": 0.00019450119226635214, + "loss": 1.1746, + "step": 58330 + }, + { + "epoch": 0.12395102935763345, + "grad_norm": 0.43575042486190796, + "learning_rate": 0.00019449896461653382, + "loss": 1.2379, + "step": 58340 + }, + { + "epoch": 0.12397227567737251, + "grad_norm": 0.37534841895103455, + "learning_rate": 0.00019449673652834166, + "loss": 1.2059, + "step": 58350 + }, + { + "epoch": 0.12399352199711157, + "grad_norm": 0.3369283676147461, + "learning_rate": 0.000194494508001786, + "loss": 1.2086, + "step": 58360 + }, + { + "epoch": 0.12401476831685061, + "grad_norm": 0.3480965793132782, + "learning_rate": 0.0001944922790368772, + "loss": 1.224, + "step": 58370 + }, + { + "epoch": 0.12403601463658967, + "grad_norm": 0.4394851326942444, + "learning_rate": 0.00019449004963362554, + "loss": 1.2285, + "step": 58380 + }, + { + "epoch": 0.12405726095632873, + "grad_norm": 0.3588860034942627, + "learning_rate": 0.00019448781979204146, + "loss": 1.2388, + "step": 58390 + }, + { + "epoch": 0.12407850727606777, + "grad_norm": 0.43904978036880493, + "learning_rate": 0.0001944855895121352, + "loss": 1.1835, + "step": 58400 + }, + { + "epoch": 0.12409975359580683, + "grad_norm": 0.3500526547431946, + "learning_rate": 0.00019448335879391717, + "loss": 1.2206, + "step": 58410 + }, + { + "epoch": 0.12412099991554588, + "grad_norm": 0.42097708582878113, + "learning_rate": 0.00019448112763739773, + "loss": 1.193, + "step": 58420 + }, + { + "epoch": 0.12414224623528493, + "grad_norm": 0.33077266812324524, + "learning_rate": 0.00019447889604258715, + "loss": 1.2155, + "step": 58430 + }, + { + "epoch": 0.12416349255502399, + "grad_norm": 0.42959240078926086, + "learning_rate": 0.00019447666400949587, + "loss": 1.2345, + "step": 58440 + }, + { + "epoch": 0.12418473887476304, + "grad_norm": 0.3253169357776642, + "learning_rate": 0.0001944744315381342, + "loss": 1.2033, + "step": 58450 + }, + { + "epoch": 0.12420598519450209, + "grad_norm": 0.4381653964519501, + "learning_rate": 0.0001944721986285125, + "loss": 1.1682, + "step": 58460 + }, + { + "epoch": 0.12422723151424114, + "grad_norm": 0.4179016351699829, + "learning_rate": 0.00019446996528064113, + "loss": 1.2154, + "step": 58470 + }, + { + "epoch": 0.1242484778339802, + "grad_norm": 0.5461993217468262, + "learning_rate": 0.00019446773149453048, + "loss": 1.2141, + "step": 58480 + }, + { + "epoch": 0.12426972415371924, + "grad_norm": 0.36367300152778625, + "learning_rate": 0.00019446549727019087, + "loss": 1.2052, + "step": 58490 + }, + { + "epoch": 0.1242909704734583, + "grad_norm": 0.5014421939849854, + "learning_rate": 0.00019446326260763269, + "loss": 1.229, + "step": 58500 + }, + { + "epoch": 0.12431221679319736, + "grad_norm": 0.3344736099243164, + "learning_rate": 0.00019446102750686627, + "loss": 1.2298, + "step": 58510 + }, + { + "epoch": 0.1243334631129364, + "grad_norm": 0.3620796799659729, + "learning_rate": 0.00019445879196790204, + "loss": 1.2172, + "step": 58520 + }, + { + "epoch": 0.12435470943267546, + "grad_norm": 0.34936416149139404, + "learning_rate": 0.00019445655599075032, + "loss": 1.2298, + "step": 58530 + }, + { + "epoch": 0.12437595575241452, + "grad_norm": 0.41395968198776245, + "learning_rate": 0.0001944543195754215, + "loss": 1.1816, + "step": 58540 + }, + { + "epoch": 0.12439720207215356, + "grad_norm": 0.3344417214393616, + "learning_rate": 0.00019445208272192598, + "loss": 1.2141, + "step": 58550 + }, + { + "epoch": 0.12441844839189262, + "grad_norm": 0.3858461380004883, + "learning_rate": 0.00019444984543027407, + "loss": 1.1954, + "step": 58560 + }, + { + "epoch": 0.12443969471163167, + "grad_norm": 0.5166085958480835, + "learning_rate": 0.0001944476077004762, + "loss": 1.1845, + "step": 58570 + }, + { + "epoch": 0.12446094103137072, + "grad_norm": 0.37450864911079407, + "learning_rate": 0.00019444536953254273, + "loss": 1.1686, + "step": 58580 + }, + { + "epoch": 0.12448218735110977, + "grad_norm": 0.3347383141517639, + "learning_rate": 0.00019444313092648407, + "loss": 1.1651, + "step": 58590 + }, + { + "epoch": 0.12450343367084883, + "grad_norm": 0.3519822955131531, + "learning_rate": 0.0001944408918823106, + "loss": 1.1873, + "step": 58600 + }, + { + "epoch": 0.12452467999058787, + "grad_norm": 0.3628332316875458, + "learning_rate": 0.00019443865240003266, + "loss": 1.2084, + "step": 58610 + }, + { + "epoch": 0.12454592631032693, + "grad_norm": 0.4002545475959778, + "learning_rate": 0.00019443641247966068, + "loss": 1.2156, + "step": 58620 + }, + { + "epoch": 0.12456717263006599, + "grad_norm": 0.34292072057724, + "learning_rate": 0.00019443417212120504, + "loss": 1.2188, + "step": 58630 + }, + { + "epoch": 0.12458841894980503, + "grad_norm": 0.34579920768737793, + "learning_rate": 0.00019443193132467616, + "loss": 1.2283, + "step": 58640 + }, + { + "epoch": 0.12460966526954409, + "grad_norm": 0.34839606285095215, + "learning_rate": 0.0001944296900900844, + "loss": 1.2, + "step": 58650 + }, + { + "epoch": 0.12463091158928315, + "grad_norm": 0.44156312942504883, + "learning_rate": 0.00019442744841744014, + "loss": 1.2043, + "step": 58660 + }, + { + "epoch": 0.12465215790902219, + "grad_norm": 0.6548213362693787, + "learning_rate": 0.00019442520630675385, + "loss": 1.2007, + "step": 58670 + }, + { + "epoch": 0.12467340422876125, + "grad_norm": 0.6642503142356873, + "learning_rate": 0.00019442296375803586, + "loss": 1.1889, + "step": 58680 + }, + { + "epoch": 0.1246946505485003, + "grad_norm": 0.3545719385147095, + "learning_rate": 0.0001944207207712966, + "loss": 1.1973, + "step": 58690 + }, + { + "epoch": 0.12471589686823935, + "grad_norm": 0.41487589478492737, + "learning_rate": 0.0001944184773465465, + "loss": 1.2521, + "step": 58700 + }, + { + "epoch": 0.1247371431879784, + "grad_norm": 0.39215126633644104, + "learning_rate": 0.00019441623348379593, + "loss": 1.1946, + "step": 58710 + }, + { + "epoch": 0.12475838950771746, + "grad_norm": 0.48322904109954834, + "learning_rate": 0.0001944139891830553, + "loss": 1.1988, + "step": 58720 + }, + { + "epoch": 0.1247796358274565, + "grad_norm": 0.4501478672027588, + "learning_rate": 0.00019441174444433503, + "loss": 1.1982, + "step": 58730 + }, + { + "epoch": 0.12480088214719556, + "grad_norm": 0.39304256439208984, + "learning_rate": 0.00019440949926764555, + "loss": 1.2093, + "step": 58740 + }, + { + "epoch": 0.12482212846693462, + "grad_norm": 0.3428983688354492, + "learning_rate": 0.00019440725365299727, + "loss": 1.1993, + "step": 58750 + }, + { + "epoch": 0.12484337478667368, + "grad_norm": 0.3455224335193634, + "learning_rate": 0.00019440500760040057, + "loss": 1.2435, + "step": 58760 + }, + { + "epoch": 0.12486462110641272, + "grad_norm": 0.40272510051727295, + "learning_rate": 0.00019440276110986593, + "loss": 1.1832, + "step": 58770 + }, + { + "epoch": 0.12488586742615178, + "grad_norm": 0.3354950249195099, + "learning_rate": 0.00019440051418140374, + "loss": 1.1981, + "step": 58780 + }, + { + "epoch": 0.12490711374589084, + "grad_norm": 0.4093473553657532, + "learning_rate": 0.0001943982668150244, + "loss": 1.2488, + "step": 58790 + }, + { + "epoch": 0.12492836006562988, + "grad_norm": 0.35194769501686096, + "learning_rate": 0.00019439601901073835, + "loss": 1.1993, + "step": 58800 + }, + { + "epoch": 0.12494960638536894, + "grad_norm": 0.34237295389175415, + "learning_rate": 0.00019439377076855606, + "loss": 1.1805, + "step": 58810 + }, + { + "epoch": 0.124970852705108, + "grad_norm": 0.3332383632659912, + "learning_rate": 0.0001943915220884879, + "loss": 1.2288, + "step": 58820 + }, + { + "epoch": 0.12499209902484704, + "grad_norm": 0.391813725233078, + "learning_rate": 0.00019438927297054433, + "loss": 1.211, + "step": 58830 + }, + { + "epoch": 0.12501334534458608, + "grad_norm": 0.3375985622406006, + "learning_rate": 0.0001943870234147358, + "loss": 1.2054, + "step": 58840 + }, + { + "epoch": 0.12503459166432515, + "grad_norm": 0.3378129005432129, + "learning_rate": 0.00019438477342107268, + "loss": 1.2321, + "step": 58850 + }, + { + "epoch": 0.1250558379840642, + "grad_norm": 0.39712223410606384, + "learning_rate": 0.0001943825229895655, + "loss": 1.2032, + "step": 58860 + }, + { + "epoch": 0.12507708430380324, + "grad_norm": 0.31637194752693176, + "learning_rate": 0.00019438027212022462, + "loss": 1.2151, + "step": 58870 + }, + { + "epoch": 0.1250983306235423, + "grad_norm": 0.34580981731414795, + "learning_rate": 0.00019437802081306055, + "loss": 1.1903, + "step": 58880 + }, + { + "epoch": 0.12511957694328135, + "grad_norm": 0.47231996059417725, + "learning_rate": 0.00019437576906808368, + "loss": 1.2048, + "step": 58890 + }, + { + "epoch": 0.1251408232630204, + "grad_norm": 0.3966009318828583, + "learning_rate": 0.0001943735168853045, + "loss": 1.2033, + "step": 58900 + }, + { + "epoch": 0.12516206958275947, + "grad_norm": 0.36977526545524597, + "learning_rate": 0.00019437126426473335, + "loss": 1.2477, + "step": 58910 + }, + { + "epoch": 0.1251833159024985, + "grad_norm": 0.4301724135875702, + "learning_rate": 0.00019436901120638083, + "loss": 1.2056, + "step": 58920 + }, + { + "epoch": 0.12520456222223755, + "grad_norm": 0.49791771173477173, + "learning_rate": 0.0001943667577102573, + "loss": 1.2188, + "step": 58930 + }, + { + "epoch": 0.12522580854197662, + "grad_norm": 0.3512600064277649, + "learning_rate": 0.00019436450377637322, + "loss": 1.2067, + "step": 58940 + }, + { + "epoch": 0.12524705486171567, + "grad_norm": 0.3869268000125885, + "learning_rate": 0.0001943622494047391, + "loss": 1.173, + "step": 58950 + }, + { + "epoch": 0.12526830118145474, + "grad_norm": 0.32645076513290405, + "learning_rate": 0.00019435999459536536, + "loss": 1.2086, + "step": 58960 + }, + { + "epoch": 0.12528954750119378, + "grad_norm": 0.32957011461257935, + "learning_rate": 0.00019435773934826242, + "loss": 1.2424, + "step": 58970 + }, + { + "epoch": 0.12531079382093283, + "grad_norm": 0.3401363790035248, + "learning_rate": 0.00019435548366344082, + "loss": 1.2216, + "step": 58980 + }, + { + "epoch": 0.1253320401406719, + "grad_norm": 0.5165562033653259, + "learning_rate": 0.00019435322754091097, + "loss": 1.2466, + "step": 58990 + }, + { + "epoch": 0.12535328646041094, + "grad_norm": 0.40665873885154724, + "learning_rate": 0.00019435097098068334, + "loss": 1.2053, + "step": 59000 + }, + { + "epoch": 0.12537453278014998, + "grad_norm": 0.37538236379623413, + "learning_rate": 0.00019434871398276845, + "loss": 1.2066, + "step": 59010 + }, + { + "epoch": 0.12539577909988905, + "grad_norm": 0.3357439935207367, + "learning_rate": 0.0001943464565471767, + "loss": 1.2448, + "step": 59020 + }, + { + "epoch": 0.1254170254196281, + "grad_norm": 0.3250288963317871, + "learning_rate": 0.00019434419867391857, + "loss": 1.1688, + "step": 59030 + }, + { + "epoch": 0.12543827173936714, + "grad_norm": 0.42525869607925415, + "learning_rate": 0.0001943419403630046, + "loss": 1.2029, + "step": 59040 + }, + { + "epoch": 0.1254595180591062, + "grad_norm": 0.3995668292045593, + "learning_rate": 0.0001943396816144452, + "loss": 1.2105, + "step": 59050 + }, + { + "epoch": 0.12548076437884526, + "grad_norm": 0.3268483877182007, + "learning_rate": 0.00019433742242825085, + "loss": 1.2531, + "step": 59060 + }, + { + "epoch": 0.1255020106985843, + "grad_norm": 0.3492790460586548, + "learning_rate": 0.0001943351628044321, + "loss": 1.1845, + "step": 59070 + }, + { + "epoch": 0.12552325701832337, + "grad_norm": 0.3292267918586731, + "learning_rate": 0.00019433290274299935, + "loss": 1.1705, + "step": 59080 + }, + { + "epoch": 0.1255445033380624, + "grad_norm": 0.3449069857597351, + "learning_rate": 0.00019433064224396313, + "loss": 1.2238, + "step": 59090 + }, + { + "epoch": 0.12556574965780146, + "grad_norm": 0.3990594744682312, + "learning_rate": 0.0001943283813073339, + "loss": 1.2353, + "step": 59100 + }, + { + "epoch": 0.12558699597754053, + "grad_norm": 0.348417192697525, + "learning_rate": 0.0001943261199331222, + "loss": 1.2429, + "step": 59110 + }, + { + "epoch": 0.12560824229727957, + "grad_norm": 0.6344890594482422, + "learning_rate": 0.0001943238581213384, + "loss": 1.2366, + "step": 59120 + }, + { + "epoch": 0.12562948861701861, + "grad_norm": 0.3605632483959198, + "learning_rate": 0.00019432159587199317, + "loss": 1.1709, + "step": 59130 + }, + { + "epoch": 0.12565073493675769, + "grad_norm": 0.3910951614379883, + "learning_rate": 0.00019431933318509684, + "loss": 1.1633, + "step": 59140 + }, + { + "epoch": 0.12567198125649673, + "grad_norm": 0.3687137961387634, + "learning_rate": 0.00019431707006066002, + "loss": 1.2289, + "step": 59150 + }, + { + "epoch": 0.12569322757623577, + "grad_norm": 0.4100908041000366, + "learning_rate": 0.00019431480649869314, + "loss": 1.1973, + "step": 59160 + }, + { + "epoch": 0.12571447389597484, + "grad_norm": 0.34691357612609863, + "learning_rate": 0.00019431254249920675, + "loss": 1.2114, + "step": 59170 + }, + { + "epoch": 0.1257357202157139, + "grad_norm": 0.3709934949874878, + "learning_rate": 0.0001943102780622113, + "loss": 1.2239, + "step": 59180 + }, + { + "epoch": 0.12575696653545293, + "grad_norm": 0.4142650067806244, + "learning_rate": 0.00019430801318771735, + "loss": 1.2113, + "step": 59190 + }, + { + "epoch": 0.125778212855192, + "grad_norm": 0.33751893043518066, + "learning_rate": 0.00019430574787573536, + "loss": 1.1996, + "step": 59200 + }, + { + "epoch": 0.12579945917493104, + "grad_norm": 0.31955599784851074, + "learning_rate": 0.00019430348212627583, + "loss": 1.2098, + "step": 59210 + }, + { + "epoch": 0.1258207054946701, + "grad_norm": 0.38611242175102234, + "learning_rate": 0.00019430121593934932, + "loss": 1.2415, + "step": 59220 + }, + { + "epoch": 0.12584195181440916, + "grad_norm": 0.40382009744644165, + "learning_rate": 0.00019429894931496632, + "loss": 1.1877, + "step": 59230 + }, + { + "epoch": 0.1258631981341482, + "grad_norm": 0.6998656988143921, + "learning_rate": 0.00019429668225313737, + "loss": 1.2181, + "step": 59240 + }, + { + "epoch": 0.12588444445388725, + "grad_norm": 0.523800790309906, + "learning_rate": 0.00019429441475387292, + "loss": 1.2188, + "step": 59250 + }, + { + "epoch": 0.12590569077362632, + "grad_norm": 0.8336186408996582, + "learning_rate": 0.00019429214681718354, + "loss": 1.1631, + "step": 59260 + }, + { + "epoch": 0.12592693709336536, + "grad_norm": 0.7201374769210815, + "learning_rate": 0.00019428987844307972, + "loss": 1.2454, + "step": 59270 + }, + { + "epoch": 0.1259481834131044, + "grad_norm": 0.6818558573722839, + "learning_rate": 0.00019428760963157208, + "loss": 1.2062, + "step": 59280 + }, + { + "epoch": 0.12596942973284347, + "grad_norm": 0.5356825590133667, + "learning_rate": 0.00019428534038267098, + "loss": 1.2031, + "step": 59290 + }, + { + "epoch": 0.12599067605258252, + "grad_norm": 0.6013524532318115, + "learning_rate": 0.0001942830706963871, + "loss": 1.2238, + "step": 59300 + }, + { + "epoch": 0.12601192237232156, + "grad_norm": 0.49778860807418823, + "learning_rate": 0.00019428080057273087, + "loss": 1.1966, + "step": 59310 + }, + { + "epoch": 0.12603316869206063, + "grad_norm": 0.4150718152523041, + "learning_rate": 0.0001942785300117129, + "loss": 1.1856, + "step": 59320 + }, + { + "epoch": 0.12605441501179968, + "grad_norm": 0.3251405954360962, + "learning_rate": 0.0001942762590133436, + "loss": 1.1942, + "step": 59330 + }, + { + "epoch": 0.12607566133153872, + "grad_norm": 0.45957130193710327, + "learning_rate": 0.00019427398757763365, + "loss": 1.2031, + "step": 59340 + }, + { + "epoch": 0.1260969076512778, + "grad_norm": 0.38357412815093994, + "learning_rate": 0.0001942717157045935, + "loss": 1.1459, + "step": 59350 + }, + { + "epoch": 0.12611815397101683, + "grad_norm": 0.4793568253517151, + "learning_rate": 0.00019426944339423372, + "loss": 1.2248, + "step": 59360 + }, + { + "epoch": 0.12613940029075588, + "grad_norm": 0.3395139276981354, + "learning_rate": 0.0001942671706465648, + "loss": 1.2068, + "step": 59370 + }, + { + "epoch": 0.12616064661049495, + "grad_norm": 0.353083074092865, + "learning_rate": 0.00019426489746159734, + "loss": 1.2157, + "step": 59380 + }, + { + "epoch": 0.126181892930234, + "grad_norm": 0.41180145740509033, + "learning_rate": 0.0001942626238393419, + "loss": 1.2422, + "step": 59390 + }, + { + "epoch": 0.12620313924997303, + "grad_norm": 0.42119330167770386, + "learning_rate": 0.00019426034977980899, + "loss": 1.2193, + "step": 59400 + }, + { + "epoch": 0.1262243855697121, + "grad_norm": 0.32078441977500916, + "learning_rate": 0.00019425807528300915, + "loss": 1.2317, + "step": 59410 + }, + { + "epoch": 0.12624563188945115, + "grad_norm": 0.37144654989242554, + "learning_rate": 0.00019425580034895294, + "loss": 1.2297, + "step": 59420 + }, + { + "epoch": 0.1262668782091902, + "grad_norm": 0.3393357992172241, + "learning_rate": 0.00019425352497765092, + "loss": 1.1773, + "step": 59430 + }, + { + "epoch": 0.12628812452892926, + "grad_norm": 0.3968982398509979, + "learning_rate": 0.00019425124916911367, + "loss": 1.2121, + "step": 59440 + }, + { + "epoch": 0.1263093708486683, + "grad_norm": 0.42914873361587524, + "learning_rate": 0.0001942489729233517, + "loss": 1.2019, + "step": 59450 + }, + { + "epoch": 0.12633061716840735, + "grad_norm": 0.47081640362739563, + "learning_rate": 0.0001942466962403756, + "loss": 1.2082, + "step": 59460 + }, + { + "epoch": 0.12635186348814642, + "grad_norm": 0.5352026224136353, + "learning_rate": 0.00019424441912019593, + "loss": 1.2009, + "step": 59470 + }, + { + "epoch": 0.12637310980788546, + "grad_norm": 0.33472076058387756, + "learning_rate": 0.00019424214156282322, + "loss": 1.2155, + "step": 59480 + }, + { + "epoch": 0.1263943561276245, + "grad_norm": 0.3675887882709503, + "learning_rate": 0.00019423986356826808, + "loss": 1.2153, + "step": 59490 + }, + { + "epoch": 0.12641560244736358, + "grad_norm": 0.3845188021659851, + "learning_rate": 0.00019423758513654104, + "loss": 1.1974, + "step": 59500 + }, + { + "epoch": 0.12643684876710262, + "grad_norm": 0.4076785743236542, + "learning_rate": 0.00019423530626765274, + "loss": 1.2419, + "step": 59510 + }, + { + "epoch": 0.12645809508684167, + "grad_norm": 0.37727418541908264, + "learning_rate": 0.00019423302696161364, + "loss": 1.1971, + "step": 59520 + }, + { + "epoch": 0.12647934140658074, + "grad_norm": 0.40817710757255554, + "learning_rate": 0.00019423074721843441, + "loss": 1.2361, + "step": 59530 + }, + { + "epoch": 0.12650058772631978, + "grad_norm": 0.32063770294189453, + "learning_rate": 0.00019422846703812558, + "loss": 1.2189, + "step": 59540 + }, + { + "epoch": 0.12652183404605882, + "grad_norm": 0.3282710015773773, + "learning_rate": 0.00019422618642069773, + "loss": 1.1743, + "step": 59550 + }, + { + "epoch": 0.1265430803657979, + "grad_norm": 0.31668731570243835, + "learning_rate": 0.00019422390536616145, + "loss": 1.2045, + "step": 59560 + }, + { + "epoch": 0.12656432668553694, + "grad_norm": 0.3442654311656952, + "learning_rate": 0.00019422162387452733, + "loss": 1.1741, + "step": 59570 + }, + { + "epoch": 0.12658557300527598, + "grad_norm": 0.37405943870544434, + "learning_rate": 0.00019421934194580593, + "loss": 1.2202, + "step": 59580 + }, + { + "epoch": 0.12660681932501505, + "grad_norm": 0.42330634593963623, + "learning_rate": 0.00019421705958000784, + "loss": 1.1752, + "step": 59590 + }, + { + "epoch": 0.1266280656447541, + "grad_norm": 0.34989696741104126, + "learning_rate": 0.00019421477677714365, + "loss": 1.246, + "step": 59600 + }, + { + "epoch": 0.12664931196449314, + "grad_norm": 0.3778626620769501, + "learning_rate": 0.000194212493537224, + "loss": 1.2081, + "step": 59610 + }, + { + "epoch": 0.1266705582842322, + "grad_norm": 0.34858036041259766, + "learning_rate": 0.0001942102098602594, + "loss": 1.2175, + "step": 59620 + }, + { + "epoch": 0.12669180460397125, + "grad_norm": 0.3215596675872803, + "learning_rate": 0.00019420792574626047, + "loss": 1.1821, + "step": 59630 + }, + { + "epoch": 0.1267130509237103, + "grad_norm": 0.35784289240837097, + "learning_rate": 0.00019420564119523786, + "loss": 1.2039, + "step": 59640 + }, + { + "epoch": 0.12673429724344937, + "grad_norm": 0.3815777599811554, + "learning_rate": 0.0001942033562072021, + "loss": 1.2138, + "step": 59650 + }, + { + "epoch": 0.1267555435631884, + "grad_norm": 0.3423871099948883, + "learning_rate": 0.00019420107078216381, + "loss": 1.228, + "step": 59660 + }, + { + "epoch": 0.12677678988292745, + "grad_norm": 0.3582225739955902, + "learning_rate": 0.00019419878492013358, + "loss": 1.1974, + "step": 59670 + }, + { + "epoch": 0.12679803620266653, + "grad_norm": 0.3885154724121094, + "learning_rate": 0.00019419649862112206, + "loss": 1.2155, + "step": 59680 + }, + { + "epoch": 0.12681928252240557, + "grad_norm": 0.3450356125831604, + "learning_rate": 0.00019419421188513982, + "loss": 1.2119, + "step": 59690 + }, + { + "epoch": 0.1268405288421446, + "grad_norm": 0.36947083473205566, + "learning_rate": 0.00019419192471219744, + "loss": 1.2533, + "step": 59700 + }, + { + "epoch": 0.12686177516188368, + "grad_norm": 0.40868812799453735, + "learning_rate": 0.0001941896371023056, + "loss": 1.1931, + "step": 59710 + }, + { + "epoch": 0.12688302148162273, + "grad_norm": 0.3611011207103729, + "learning_rate": 0.0001941873490554749, + "loss": 1.1867, + "step": 59720 + }, + { + "epoch": 0.12690426780136177, + "grad_norm": 0.42699599266052246, + "learning_rate": 0.0001941850605717159, + "loss": 1.2085, + "step": 59730 + }, + { + "epoch": 0.12692551412110084, + "grad_norm": 0.42022377252578735, + "learning_rate": 0.00019418277165103925, + "loss": 1.2175, + "step": 59740 + }, + { + "epoch": 0.12694676044083988, + "grad_norm": 0.3156929314136505, + "learning_rate": 0.00019418048229345554, + "loss": 1.1747, + "step": 59750 + }, + { + "epoch": 0.12696800676057893, + "grad_norm": 0.3390989303588867, + "learning_rate": 0.00019417819249897543, + "loss": 1.208, + "step": 59760 + }, + { + "epoch": 0.126989253080318, + "grad_norm": 0.3847326636314392, + "learning_rate": 0.00019417590226760952, + "loss": 1.1942, + "step": 59770 + }, + { + "epoch": 0.12701049940005704, + "grad_norm": 0.329957515001297, + "learning_rate": 0.00019417361159936847, + "loss": 1.2473, + "step": 59780 + }, + { + "epoch": 0.12703174571979609, + "grad_norm": 0.43825262784957886, + "learning_rate": 0.00019417132049426286, + "loss": 1.1832, + "step": 59790 + }, + { + "epoch": 0.12705299203953516, + "grad_norm": 0.3846834599971771, + "learning_rate": 0.00019416902895230334, + "loss": 1.2011, + "step": 59800 + }, + { + "epoch": 0.1270742383592742, + "grad_norm": 0.4990527331829071, + "learning_rate": 0.00019416673697350052, + "loss": 1.2037, + "step": 59810 + }, + { + "epoch": 0.12709548467901327, + "grad_norm": 0.3421323895454407, + "learning_rate": 0.00019416444455786505, + "loss": 1.1786, + "step": 59820 + }, + { + "epoch": 0.12711673099875231, + "grad_norm": 0.3091193437576294, + "learning_rate": 0.00019416215170540757, + "loss": 1.1751, + "step": 59830 + }, + { + "epoch": 0.12713797731849136, + "grad_norm": 0.33380839228630066, + "learning_rate": 0.00019415985841613872, + "loss": 1.1975, + "step": 59840 + }, + { + "epoch": 0.12715922363823043, + "grad_norm": 0.32366251945495605, + "learning_rate": 0.00019415756469006912, + "loss": 1.1905, + "step": 59850 + }, + { + "epoch": 0.12718046995796947, + "grad_norm": 0.5683107972145081, + "learning_rate": 0.00019415527052720944, + "loss": 1.1958, + "step": 59860 + }, + { + "epoch": 0.12720171627770852, + "grad_norm": 0.3890274465084076, + "learning_rate": 0.00019415297592757024, + "loss": 1.2201, + "step": 59870 + }, + { + "epoch": 0.1272229625974476, + "grad_norm": 0.5046582221984863, + "learning_rate": 0.00019415068089116225, + "loss": 1.2343, + "step": 59880 + }, + { + "epoch": 0.12724420891718663, + "grad_norm": 0.383796364068985, + "learning_rate": 0.00019414838541799613, + "loss": 1.1923, + "step": 59890 + }, + { + "epoch": 0.12726545523692567, + "grad_norm": 0.39058995246887207, + "learning_rate": 0.00019414608950808245, + "loss": 1.1789, + "step": 59900 + }, + { + "epoch": 0.12728670155666474, + "grad_norm": 0.34469446539878845, + "learning_rate": 0.0001941437931614319, + "loss": 1.2022, + "step": 59910 + }, + { + "epoch": 0.1273079478764038, + "grad_norm": 0.31541159749031067, + "learning_rate": 0.00019414149637805517, + "loss": 1.2327, + "step": 59920 + }, + { + "epoch": 0.12732919419614283, + "grad_norm": 0.3698410987854004, + "learning_rate": 0.00019413919915796285, + "loss": 1.2363, + "step": 59930 + }, + { + "epoch": 0.1273504405158819, + "grad_norm": 0.355816125869751, + "learning_rate": 0.0001941369015011656, + "loss": 1.1917, + "step": 59940 + }, + { + "epoch": 0.12737168683562095, + "grad_norm": 0.42653727531433105, + "learning_rate": 0.00019413460340767414, + "loss": 1.2127, + "step": 59950 + }, + { + "epoch": 0.12739293315536, + "grad_norm": 0.5257704854011536, + "learning_rate": 0.0001941323048774991, + "loss": 1.2207, + "step": 59960 + }, + { + "epoch": 0.12741417947509906, + "grad_norm": 0.42798805236816406, + "learning_rate": 0.0001941300059106511, + "loss": 1.2183, + "step": 59970 + }, + { + "epoch": 0.1274354257948381, + "grad_norm": 0.38968831300735474, + "learning_rate": 0.00019412770650714087, + "loss": 1.2023, + "step": 59980 + }, + { + "epoch": 0.12745667211457715, + "grad_norm": 0.40161213278770447, + "learning_rate": 0.00019412540666697902, + "loss": 1.2099, + "step": 59990 + }, + { + "epoch": 0.12747791843431622, + "grad_norm": 0.3563777208328247, + "learning_rate": 0.00019412310639017628, + "loss": 1.2281, + "step": 60000 + }, + { + "epoch": 0.12749916475405526, + "grad_norm": 0.34803506731987, + "learning_rate": 0.00019412080567674328, + "loss": 1.2007, + "step": 60010 + }, + { + "epoch": 0.1275204110737943, + "grad_norm": 0.4243672490119934, + "learning_rate": 0.00019411850452669068, + "loss": 1.2067, + "step": 60020 + }, + { + "epoch": 0.12754165739353338, + "grad_norm": 0.42808327078819275, + "learning_rate": 0.00019411620294002918, + "loss": 1.2438, + "step": 60030 + }, + { + "epoch": 0.12756290371327242, + "grad_norm": 0.4687354862689972, + "learning_rate": 0.00019411390091676945, + "loss": 1.2413, + "step": 60040 + }, + { + "epoch": 0.12758415003301146, + "grad_norm": 0.49331533908843994, + "learning_rate": 0.00019411159845692216, + "loss": 1.1972, + "step": 60050 + }, + { + "epoch": 0.12760539635275053, + "grad_norm": 0.38423818349838257, + "learning_rate": 0.00019410929556049804, + "loss": 1.2168, + "step": 60060 + }, + { + "epoch": 0.12762664267248958, + "grad_norm": 0.4334772527217865, + "learning_rate": 0.00019410699222750772, + "loss": 1.1901, + "step": 60070 + }, + { + "epoch": 0.12764788899222862, + "grad_norm": 0.35555437207221985, + "learning_rate": 0.0001941046884579619, + "loss": 1.1942, + "step": 60080 + }, + { + "epoch": 0.1276691353119677, + "grad_norm": 0.4332692623138428, + "learning_rate": 0.00019410238425187127, + "loss": 1.2445, + "step": 60090 + }, + { + "epoch": 0.12769038163170673, + "grad_norm": 0.3463904857635498, + "learning_rate": 0.00019410007960924647, + "loss": 1.1935, + "step": 60100 + }, + { + "epoch": 0.12771162795144578, + "grad_norm": 0.4149988889694214, + "learning_rate": 0.0001940977745300983, + "loss": 1.1943, + "step": 60110 + }, + { + "epoch": 0.12773287427118485, + "grad_norm": 0.36345282196998596, + "learning_rate": 0.00019409546901443737, + "loss": 1.2076, + "step": 60120 + }, + { + "epoch": 0.1277541205909239, + "grad_norm": 0.5162017345428467, + "learning_rate": 0.00019409316306227438, + "loss": 1.2391, + "step": 60130 + }, + { + "epoch": 0.12777536691066294, + "grad_norm": 0.3363488018512726, + "learning_rate": 0.00019409085667362007, + "loss": 1.2196, + "step": 60140 + }, + { + "epoch": 0.127796613230402, + "grad_norm": 0.3584454655647278, + "learning_rate": 0.00019408854984848505, + "loss": 1.2112, + "step": 60150 + }, + { + "epoch": 0.12781785955014105, + "grad_norm": 0.341301292181015, + "learning_rate": 0.00019408624258688016, + "loss": 1.1885, + "step": 60160 + }, + { + "epoch": 0.1278391058698801, + "grad_norm": 0.3439040184020996, + "learning_rate": 0.00019408393488881597, + "loss": 1.2307, + "step": 60170 + }, + { + "epoch": 0.12786035218961916, + "grad_norm": 0.42885175347328186, + "learning_rate": 0.00019408162675430327, + "loss": 1.2642, + "step": 60180 + }, + { + "epoch": 0.1278815985093582, + "grad_norm": 0.4453258216381073, + "learning_rate": 0.0001940793181833527, + "loss": 1.1896, + "step": 60190 + }, + { + "epoch": 0.12790284482909725, + "grad_norm": 0.4648436903953552, + "learning_rate": 0.00019407700917597508, + "loss": 1.2084, + "step": 60200 + }, + { + "epoch": 0.12792409114883632, + "grad_norm": 0.39669308066368103, + "learning_rate": 0.000194074699732181, + "loss": 1.2061, + "step": 60210 + }, + { + "epoch": 0.12794533746857537, + "grad_norm": 0.39501842856407166, + "learning_rate": 0.00019407238985198123, + "loss": 1.2011, + "step": 60220 + }, + { + "epoch": 0.1279665837883144, + "grad_norm": 0.36398136615753174, + "learning_rate": 0.00019407007953538649, + "loss": 1.1898, + "step": 60230 + }, + { + "epoch": 0.12798783010805348, + "grad_norm": 0.5449216365814209, + "learning_rate": 0.00019406776878240744, + "loss": 1.2037, + "step": 60240 + }, + { + "epoch": 0.12800907642779252, + "grad_norm": 0.3668470084667206, + "learning_rate": 0.00019406545759305488, + "loss": 1.1987, + "step": 60250 + }, + { + "epoch": 0.12803032274753157, + "grad_norm": 0.4616071283817291, + "learning_rate": 0.0001940631459673395, + "loss": 1.2029, + "step": 60260 + }, + { + "epoch": 0.12805156906727064, + "grad_norm": 0.5261346697807312, + "learning_rate": 0.000194060833905272, + "loss": 1.1997, + "step": 60270 + }, + { + "epoch": 0.12807281538700968, + "grad_norm": 0.37584149837493896, + "learning_rate": 0.00019405852140686312, + "loss": 1.1832, + "step": 60280 + }, + { + "epoch": 0.12809406170674872, + "grad_norm": 0.617038905620575, + "learning_rate": 0.0001940562084721236, + "loss": 1.1972, + "step": 60290 + }, + { + "epoch": 0.1281153080264878, + "grad_norm": 0.3493243455886841, + "learning_rate": 0.0001940538951010642, + "loss": 1.1798, + "step": 60300 + }, + { + "epoch": 0.12813655434622684, + "grad_norm": 0.35019204020500183, + "learning_rate": 0.0001940515812936956, + "loss": 1.2503, + "step": 60310 + }, + { + "epoch": 0.12815780066596588, + "grad_norm": 0.39608630537986755, + "learning_rate": 0.0001940492670500285, + "loss": 1.1685, + "step": 60320 + }, + { + "epoch": 0.12817904698570495, + "grad_norm": 0.6565674543380737, + "learning_rate": 0.00019404695237007372, + "loss": 1.1915, + "step": 60330 + }, + { + "epoch": 0.128200293305444, + "grad_norm": 0.3603017330169678, + "learning_rate": 0.00019404463725384194, + "loss": 1.2145, + "step": 60340 + }, + { + "epoch": 0.12822153962518304, + "grad_norm": 0.48575806617736816, + "learning_rate": 0.00019404232170134396, + "loss": 1.2057, + "step": 60350 + }, + { + "epoch": 0.1282427859449221, + "grad_norm": 0.3237549960613251, + "learning_rate": 0.00019404000571259045, + "loss": 1.213, + "step": 60360 + }, + { + "epoch": 0.12826403226466115, + "grad_norm": 0.4311726689338684, + "learning_rate": 0.00019403768928759219, + "loss": 1.1744, + "step": 60370 + }, + { + "epoch": 0.1282852785844002, + "grad_norm": 0.41441911458969116, + "learning_rate": 0.00019403537242635991, + "loss": 1.2405, + "step": 60380 + }, + { + "epoch": 0.12830652490413927, + "grad_norm": 0.3950975835323334, + "learning_rate": 0.00019403305512890438, + "loss": 1.2044, + "step": 60390 + }, + { + "epoch": 0.1283277712238783, + "grad_norm": 0.33545657992362976, + "learning_rate": 0.00019403073739523633, + "loss": 1.231, + "step": 60400 + }, + { + "epoch": 0.12834901754361736, + "grad_norm": 0.4297516644001007, + "learning_rate": 0.00019402841922536653, + "loss": 1.2052, + "step": 60410 + }, + { + "epoch": 0.12837026386335643, + "grad_norm": 0.3244020640850067, + "learning_rate": 0.00019402610061930572, + "loss": 1.2312, + "step": 60420 + }, + { + "epoch": 0.12839151018309547, + "grad_norm": 0.3646220564842224, + "learning_rate": 0.0001940237815770647, + "loss": 1.2306, + "step": 60430 + }, + { + "epoch": 0.1284127565028345, + "grad_norm": 0.3469895124435425, + "learning_rate": 0.00019402146209865413, + "loss": 1.2016, + "step": 60440 + }, + { + "epoch": 0.12843400282257358, + "grad_norm": 0.3496251702308655, + "learning_rate": 0.00019401914218408485, + "loss": 1.1633, + "step": 60450 + }, + { + "epoch": 0.12845524914231263, + "grad_norm": 0.4831198751926422, + "learning_rate": 0.0001940168218333676, + "loss": 1.2545, + "step": 60460 + }, + { + "epoch": 0.12847649546205167, + "grad_norm": 0.423785924911499, + "learning_rate": 0.00019401450104651312, + "loss": 1.2028, + "step": 60470 + }, + { + "epoch": 0.12849774178179074, + "grad_norm": 0.7841302752494812, + "learning_rate": 0.00019401217982353223, + "loss": 1.1936, + "step": 60480 + }, + { + "epoch": 0.12851898810152979, + "grad_norm": 0.9052339196205139, + "learning_rate": 0.00019400985816443568, + "loss": 1.2253, + "step": 60490 + }, + { + "epoch": 0.12854023442126883, + "grad_norm": 0.47754618525505066, + "learning_rate": 0.0001940075360692342, + "loss": 1.1994, + "step": 60500 + }, + { + "epoch": 0.1285614807410079, + "grad_norm": 0.4089285731315613, + "learning_rate": 0.00019400521353793858, + "loss": 1.2158, + "step": 60510 + }, + { + "epoch": 0.12858272706074694, + "grad_norm": 0.33018356561660767, + "learning_rate": 0.00019400289057055963, + "loss": 1.1948, + "step": 60520 + }, + { + "epoch": 0.128603973380486, + "grad_norm": 0.378875732421875, + "learning_rate": 0.0001940005671671081, + "loss": 1.2061, + "step": 60530 + }, + { + "epoch": 0.12862521970022506, + "grad_norm": 0.45920759439468384, + "learning_rate": 0.00019399824332759472, + "loss": 1.2159, + "step": 60540 + }, + { + "epoch": 0.1286464660199641, + "grad_norm": 0.36719009280204773, + "learning_rate": 0.00019399591905203036, + "loss": 1.2205, + "step": 60550 + }, + { + "epoch": 0.12866771233970314, + "grad_norm": 0.3631909489631653, + "learning_rate": 0.00019399359434042578, + "loss": 1.1803, + "step": 60560 + }, + { + "epoch": 0.12868895865944222, + "grad_norm": 0.35585686564445496, + "learning_rate": 0.0001939912691927917, + "loss": 1.161, + "step": 60570 + }, + { + "epoch": 0.12871020497918126, + "grad_norm": 0.37130439281463623, + "learning_rate": 0.000193988943609139, + "loss": 1.1822, + "step": 60580 + }, + { + "epoch": 0.1287314512989203, + "grad_norm": 0.4674611985683441, + "learning_rate": 0.00019398661758947836, + "loss": 1.1733, + "step": 60590 + }, + { + "epoch": 0.12875269761865937, + "grad_norm": 0.35302767157554626, + "learning_rate": 0.00019398429113382066, + "loss": 1.1943, + "step": 60600 + }, + { + "epoch": 0.12877394393839842, + "grad_norm": 0.3351539075374603, + "learning_rate": 0.00019398196424217666, + "loss": 1.1801, + "step": 60610 + }, + { + "epoch": 0.12879519025813746, + "grad_norm": 0.32760411500930786, + "learning_rate": 0.00019397963691455714, + "loss": 1.1706, + "step": 60620 + }, + { + "epoch": 0.12881643657787653, + "grad_norm": 0.46459630131721497, + "learning_rate": 0.00019397730915097293, + "loss": 1.1662, + "step": 60630 + }, + { + "epoch": 0.12883768289761557, + "grad_norm": 0.6489558815956116, + "learning_rate": 0.0001939749809514348, + "loss": 1.2092, + "step": 60640 + }, + { + "epoch": 0.12885892921735465, + "grad_norm": 0.3361892104148865, + "learning_rate": 0.00019397265231595358, + "loss": 1.2021, + "step": 60650 + }, + { + "epoch": 0.1288801755370937, + "grad_norm": 0.5529860258102417, + "learning_rate": 0.00019397032324454006, + "loss": 1.23, + "step": 60660 + }, + { + "epoch": 0.12890142185683273, + "grad_norm": 0.5254616737365723, + "learning_rate": 0.00019396799373720499, + "loss": 1.2244, + "step": 60670 + }, + { + "epoch": 0.1289226681765718, + "grad_norm": 0.5127575993537903, + "learning_rate": 0.00019396566379395927, + "loss": 1.2204, + "step": 60680 + }, + { + "epoch": 0.12894391449631085, + "grad_norm": 0.5374829769134521, + "learning_rate": 0.00019396333341481362, + "loss": 1.1734, + "step": 60690 + }, + { + "epoch": 0.1289651608160499, + "grad_norm": 0.8726638555526733, + "learning_rate": 0.0001939610025997789, + "loss": 1.2458, + "step": 60700 + }, + { + "epoch": 0.12898640713578896, + "grad_norm": 0.40610572695732117, + "learning_rate": 0.00019395867134886597, + "loss": 1.2542, + "step": 60710 + }, + { + "epoch": 0.129007653455528, + "grad_norm": 0.6065261363983154, + "learning_rate": 0.00019395633966208552, + "loss": 1.1971, + "step": 60720 + }, + { + "epoch": 0.12902889977526705, + "grad_norm": 0.4271472096443176, + "learning_rate": 0.00019395400753944846, + "loss": 1.2229, + "step": 60730 + }, + { + "epoch": 0.12905014609500612, + "grad_norm": 0.3322129547595978, + "learning_rate": 0.0001939516749809656, + "loss": 1.1461, + "step": 60740 + }, + { + "epoch": 0.12907139241474516, + "grad_norm": 0.36353546380996704, + "learning_rate": 0.00019394934198664775, + "loss": 1.2192, + "step": 60750 + }, + { + "epoch": 0.1290926387344842, + "grad_norm": 0.45688730478286743, + "learning_rate": 0.0001939470085565057, + "loss": 1.2127, + "step": 60760 + }, + { + "epoch": 0.12911388505422328, + "grad_norm": 0.3300446569919586, + "learning_rate": 0.00019394467469055032, + "loss": 1.2082, + "step": 60770 + }, + { + "epoch": 0.12913513137396232, + "grad_norm": 0.43864884972572327, + "learning_rate": 0.0001939423403887924, + "loss": 1.2181, + "step": 60780 + }, + { + "epoch": 0.12915637769370136, + "grad_norm": 0.34949228167533875, + "learning_rate": 0.00019394000565124282, + "loss": 1.2236, + "step": 60790 + }, + { + "epoch": 0.12917762401344043, + "grad_norm": 0.3752736449241638, + "learning_rate": 0.00019393767047791235, + "loss": 1.1855, + "step": 60800 + }, + { + "epoch": 0.12919887033317948, + "grad_norm": 0.3390258848667145, + "learning_rate": 0.00019393533486881186, + "loss": 1.2014, + "step": 60810 + }, + { + "epoch": 0.12922011665291852, + "grad_norm": 0.3998483419418335, + "learning_rate": 0.00019393299882395216, + "loss": 1.198, + "step": 60820 + }, + { + "epoch": 0.1292413629726576, + "grad_norm": 0.37786662578582764, + "learning_rate": 0.0001939306623433441, + "loss": 1.2245, + "step": 60830 + }, + { + "epoch": 0.12926260929239664, + "grad_norm": 0.3279555141925812, + "learning_rate": 0.00019392832542699856, + "loss": 1.1969, + "step": 60840 + }, + { + "epoch": 0.12928385561213568, + "grad_norm": 0.34978151321411133, + "learning_rate": 0.00019392598807492633, + "loss": 1.1758, + "step": 60850 + }, + { + "epoch": 0.12930510193187475, + "grad_norm": 0.3737647831439972, + "learning_rate": 0.00019392365028713827, + "loss": 1.1993, + "step": 60860 + }, + { + "epoch": 0.1293263482516138, + "grad_norm": 0.3910844624042511, + "learning_rate": 0.0001939213120636452, + "loss": 1.1931, + "step": 60870 + }, + { + "epoch": 0.12934759457135284, + "grad_norm": 0.3962837755680084, + "learning_rate": 0.00019391897340445796, + "loss": 1.1792, + "step": 60880 + }, + { + "epoch": 0.1293688408910919, + "grad_norm": 0.3734790086746216, + "learning_rate": 0.00019391663430958747, + "loss": 1.1968, + "step": 60890 + }, + { + "epoch": 0.12939008721083095, + "grad_norm": 0.4704481065273285, + "learning_rate": 0.00019391429477904452, + "loss": 1.2106, + "step": 60900 + }, + { + "epoch": 0.12941133353057, + "grad_norm": 0.6845141649246216, + "learning_rate": 0.00019391195481284, + "loss": 1.2555, + "step": 60910 + }, + { + "epoch": 0.12943257985030907, + "grad_norm": 0.41931018233299255, + "learning_rate": 0.0001939096144109847, + "loss": 1.2034, + "step": 60920 + }, + { + "epoch": 0.1294538261700481, + "grad_norm": 0.4028439521789551, + "learning_rate": 0.00019390727357348957, + "loss": 1.2545, + "step": 60930 + }, + { + "epoch": 0.12947507248978715, + "grad_norm": 0.37428736686706543, + "learning_rate": 0.00019390493230036539, + "loss": 1.2365, + "step": 60940 + }, + { + "epoch": 0.12949631880952622, + "grad_norm": 0.34532833099365234, + "learning_rate": 0.00019390259059162305, + "loss": 1.2045, + "step": 60950 + }, + { + "epoch": 0.12951756512926527, + "grad_norm": 0.48213931918144226, + "learning_rate": 0.00019390024844727344, + "loss": 1.2575, + "step": 60960 + }, + { + "epoch": 0.1295388114490043, + "grad_norm": 0.407177209854126, + "learning_rate": 0.00019389790586732736, + "loss": 1.187, + "step": 60970 + }, + { + "epoch": 0.12956005776874338, + "grad_norm": 0.3999790549278259, + "learning_rate": 0.00019389556285179577, + "loss": 1.1907, + "step": 60980 + }, + { + "epoch": 0.12958130408848242, + "grad_norm": 0.34523946046829224, + "learning_rate": 0.00019389321940068945, + "loss": 1.2109, + "step": 60990 + }, + { + "epoch": 0.12960255040822147, + "grad_norm": 0.4934353232383728, + "learning_rate": 0.00019389087551401933, + "loss": 1.221, + "step": 61000 + }, + { + "epoch": 0.12962379672796054, + "grad_norm": 0.3590502440929413, + "learning_rate": 0.00019388853119179622, + "loss": 1.187, + "step": 61010 + }, + { + "epoch": 0.12964504304769958, + "grad_norm": 0.3414711356163025, + "learning_rate": 0.00019388618643403105, + "loss": 1.2142, + "step": 61020 + }, + { + "epoch": 0.12966628936743863, + "grad_norm": 0.3445853888988495, + "learning_rate": 0.00019388384124073468, + "loss": 1.1976, + "step": 61030 + }, + { + "epoch": 0.1296875356871777, + "grad_norm": 0.3358880281448364, + "learning_rate": 0.000193881495611918, + "loss": 1.2398, + "step": 61040 + }, + { + "epoch": 0.12970878200691674, + "grad_norm": 0.3386763036251068, + "learning_rate": 0.00019387914954759189, + "loss": 1.2196, + "step": 61050 + }, + { + "epoch": 0.12973002832665578, + "grad_norm": 0.33596399426460266, + "learning_rate": 0.0001938768030477672, + "loss": 1.2125, + "step": 61060 + }, + { + "epoch": 0.12975127464639485, + "grad_norm": 0.3559757173061371, + "learning_rate": 0.00019387445611245485, + "loss": 1.2331, + "step": 61070 + }, + { + "epoch": 0.1297725209661339, + "grad_norm": 0.38681313395500183, + "learning_rate": 0.00019387210874166573, + "loss": 1.2428, + "step": 61080 + }, + { + "epoch": 0.12979376728587294, + "grad_norm": 0.3471197783946991, + "learning_rate": 0.0001938697609354107, + "loss": 1.2267, + "step": 61090 + }, + { + "epoch": 0.129815013605612, + "grad_norm": 0.4370884597301483, + "learning_rate": 0.0001938674126937007, + "loss": 1.2038, + "step": 61100 + }, + { + "epoch": 0.12983625992535106, + "grad_norm": 0.3460918962955475, + "learning_rate": 0.00019386506401654655, + "loss": 1.2451, + "step": 61110 + }, + { + "epoch": 0.1298575062450901, + "grad_norm": 0.31062692403793335, + "learning_rate": 0.00019386271490395921, + "loss": 1.2221, + "step": 61120 + }, + { + "epoch": 0.12987875256482917, + "grad_norm": 0.3183495104312897, + "learning_rate": 0.00019386036535594956, + "loss": 1.2234, + "step": 61130 + }, + { + "epoch": 0.1298999988845682, + "grad_norm": 0.5091956853866577, + "learning_rate": 0.00019385801537252849, + "loss": 1.2209, + "step": 61140 + }, + { + "epoch": 0.12992124520430726, + "grad_norm": 0.3532818853855133, + "learning_rate": 0.0001938556649537069, + "loss": 1.1786, + "step": 61150 + }, + { + "epoch": 0.12994249152404633, + "grad_norm": 0.4447973072528839, + "learning_rate": 0.0001938533140994957, + "loss": 1.1948, + "step": 61160 + }, + { + "epoch": 0.12996373784378537, + "grad_norm": 0.6579145789146423, + "learning_rate": 0.0001938509628099058, + "loss": 1.1747, + "step": 61170 + }, + { + "epoch": 0.12998498416352441, + "grad_norm": 0.45715492963790894, + "learning_rate": 0.0001938486110849481, + "loss": 1.1769, + "step": 61180 + }, + { + "epoch": 0.13000623048326349, + "grad_norm": 0.5091846585273743, + "learning_rate": 0.00019384625892463348, + "loss": 1.2145, + "step": 61190 + }, + { + "epoch": 0.13002747680300253, + "grad_norm": 0.6231058239936829, + "learning_rate": 0.0001938439063289729, + "loss": 1.2196, + "step": 61200 + }, + { + "epoch": 0.13004872312274157, + "grad_norm": 0.5825707316398621, + "learning_rate": 0.00019384155329797728, + "loss": 1.1881, + "step": 61210 + }, + { + "epoch": 0.13006996944248064, + "grad_norm": 0.3600819408893585, + "learning_rate": 0.0001938391998316575, + "loss": 1.1808, + "step": 61220 + }, + { + "epoch": 0.1300912157622197, + "grad_norm": 0.3724936544895172, + "learning_rate": 0.00019383684593002448, + "loss": 1.1854, + "step": 61230 + }, + { + "epoch": 0.13011246208195873, + "grad_norm": 0.3939109146595001, + "learning_rate": 0.00019383449159308914, + "loss": 1.1776, + "step": 61240 + }, + { + "epoch": 0.1301337084016978, + "grad_norm": 0.4276266396045685, + "learning_rate": 0.00019383213682086243, + "loss": 1.2171, + "step": 61250 + }, + { + "epoch": 0.13015495472143684, + "grad_norm": 0.3267381489276886, + "learning_rate": 0.00019382978161335525, + "loss": 1.1942, + "step": 61260 + }, + { + "epoch": 0.1301762010411759, + "grad_norm": 0.407179057598114, + "learning_rate": 0.0001938274259705785, + "loss": 1.2101, + "step": 61270 + }, + { + "epoch": 0.13019744736091496, + "grad_norm": 0.3218834102153778, + "learning_rate": 0.00019382506989254318, + "loss": 1.1776, + "step": 61280 + }, + { + "epoch": 0.130218693680654, + "grad_norm": 0.3485143482685089, + "learning_rate": 0.0001938227133792602, + "loss": 1.2027, + "step": 61290 + }, + { + "epoch": 0.13023994000039305, + "grad_norm": 0.392577201128006, + "learning_rate": 0.0001938203564307404, + "loss": 1.1888, + "step": 61300 + }, + { + "epoch": 0.13026118632013212, + "grad_norm": 0.41514772176742554, + "learning_rate": 0.00019381799904699484, + "loss": 1.2263, + "step": 61310 + }, + { + "epoch": 0.13028243263987116, + "grad_norm": 0.44022077322006226, + "learning_rate": 0.00019381564122803436, + "loss": 1.2483, + "step": 61320 + }, + { + "epoch": 0.1303036789596102, + "grad_norm": 0.38921838998794556, + "learning_rate": 0.00019381328297386996, + "loss": 1.2167, + "step": 61330 + }, + { + "epoch": 0.13032492527934927, + "grad_norm": 0.3538103997707367, + "learning_rate": 0.00019381092428451252, + "loss": 1.223, + "step": 61340 + }, + { + "epoch": 0.13034617159908832, + "grad_norm": 0.32938310503959656, + "learning_rate": 0.00019380856515997303, + "loss": 1.1985, + "step": 61350 + }, + { + "epoch": 0.13036741791882736, + "grad_norm": 0.6257529258728027, + "learning_rate": 0.00019380620560026244, + "loss": 1.1986, + "step": 61360 + }, + { + "epoch": 0.13038866423856643, + "grad_norm": 0.7201675176620483, + "learning_rate": 0.00019380384560539166, + "loss": 1.2018, + "step": 61370 + }, + { + "epoch": 0.13040991055830548, + "grad_norm": 0.3406963646411896, + "learning_rate": 0.00019380148517537168, + "loss": 1.2064, + "step": 61380 + }, + { + "epoch": 0.13043115687804452, + "grad_norm": 0.3710666000843048, + "learning_rate": 0.00019379912431021341, + "loss": 1.2071, + "step": 61390 + }, + { + "epoch": 0.1304524031977836, + "grad_norm": 0.46031454205513, + "learning_rate": 0.00019379676300992782, + "loss": 1.1908, + "step": 61400 + }, + { + "epoch": 0.13047364951752263, + "grad_norm": 0.35366272926330566, + "learning_rate": 0.00019379440127452585, + "loss": 1.1671, + "step": 61410 + }, + { + "epoch": 0.13049489583726168, + "grad_norm": 0.33172720670700073, + "learning_rate": 0.00019379203910401848, + "loss": 1.2152, + "step": 61420 + }, + { + "epoch": 0.13051614215700075, + "grad_norm": 0.38589227199554443, + "learning_rate": 0.00019378967649841665, + "loss": 1.1983, + "step": 61430 + }, + { + "epoch": 0.1305373884767398, + "grad_norm": 0.4571869969367981, + "learning_rate": 0.00019378731345773132, + "loss": 1.1967, + "step": 61440 + }, + { + "epoch": 0.13055863479647883, + "grad_norm": 0.34038254618644714, + "learning_rate": 0.00019378494998197344, + "loss": 1.2338, + "step": 61450 + }, + { + "epoch": 0.1305798811162179, + "grad_norm": 0.33444517850875854, + "learning_rate": 0.000193782586071154, + "loss": 1.2227, + "step": 61460 + }, + { + "epoch": 0.13060112743595695, + "grad_norm": 0.3688046932220459, + "learning_rate": 0.00019378022172528397, + "loss": 1.2009, + "step": 61470 + }, + { + "epoch": 0.130622373755696, + "grad_norm": 0.32616838812828064, + "learning_rate": 0.0001937778569443743, + "loss": 1.2071, + "step": 61480 + }, + { + "epoch": 0.13064362007543506, + "grad_norm": 0.3709123730659485, + "learning_rate": 0.00019377549172843597, + "loss": 1.2289, + "step": 61490 + }, + { + "epoch": 0.1306648663951741, + "grad_norm": 0.4596889019012451, + "learning_rate": 0.00019377312607747992, + "loss": 1.2262, + "step": 61500 + }, + { + "epoch": 0.13068611271491318, + "grad_norm": 0.3887590765953064, + "learning_rate": 0.00019377075999151716, + "loss": 1.1819, + "step": 61510 + }, + { + "epoch": 0.13070735903465222, + "grad_norm": 0.37237218022346497, + "learning_rate": 0.00019376839347055869, + "loss": 1.1713, + "step": 61520 + }, + { + "epoch": 0.13072860535439126, + "grad_norm": 0.39838868379592896, + "learning_rate": 0.00019376602651461543, + "loss": 1.1729, + "step": 61530 + }, + { + "epoch": 0.13074985167413034, + "grad_norm": 0.35363349318504333, + "learning_rate": 0.0001937636591236984, + "loss": 1.1744, + "step": 61540 + }, + { + "epoch": 0.13077109799386938, + "grad_norm": 0.608910858631134, + "learning_rate": 0.00019376129129781853, + "loss": 1.201, + "step": 61550 + }, + { + "epoch": 0.13079234431360842, + "grad_norm": 0.4621765911579132, + "learning_rate": 0.00019375892303698686, + "loss": 1.1799, + "step": 61560 + }, + { + "epoch": 0.1308135906333475, + "grad_norm": 0.38059431314468384, + "learning_rate": 0.00019375655434121435, + "loss": 1.2267, + "step": 61570 + }, + { + "epoch": 0.13083483695308654, + "grad_norm": 0.41318538784980774, + "learning_rate": 0.00019375418521051202, + "loss": 1.1919, + "step": 61580 + }, + { + "epoch": 0.13085608327282558, + "grad_norm": 0.5594647526741028, + "learning_rate": 0.0001937518156448908, + "loss": 1.1855, + "step": 61590 + }, + { + "epoch": 0.13087732959256465, + "grad_norm": 0.3842316269874573, + "learning_rate": 0.00019374944564436178, + "loss": 1.2133, + "step": 61600 + }, + { + "epoch": 0.1308985759123037, + "grad_norm": 0.3143360912799835, + "learning_rate": 0.0001937470752089358, + "loss": 1.2403, + "step": 61610 + }, + { + "epoch": 0.13091982223204274, + "grad_norm": 0.4596409201622009, + "learning_rate": 0.00019374470433862403, + "loss": 1.1922, + "step": 61620 + }, + { + "epoch": 0.1309410685517818, + "grad_norm": 0.39505070447921753, + "learning_rate": 0.00019374233303343736, + "loss": 1.1906, + "step": 61630 + }, + { + "epoch": 0.13096231487152085, + "grad_norm": 0.40439873933792114, + "learning_rate": 0.0001937399612933868, + "loss": 1.1942, + "step": 61640 + }, + { + "epoch": 0.1309835611912599, + "grad_norm": 0.3715982735157013, + "learning_rate": 0.00019373758911848339, + "loss": 1.1624, + "step": 61650 + }, + { + "epoch": 0.13100480751099897, + "grad_norm": 0.38072991371154785, + "learning_rate": 0.0001937352165087381, + "loss": 1.2235, + "step": 61660 + }, + { + "epoch": 0.131026053830738, + "grad_norm": 0.33043673634529114, + "learning_rate": 0.00019373284346416197, + "loss": 1.211, + "step": 61670 + }, + { + "epoch": 0.13104730015047705, + "grad_norm": 0.3969142735004425, + "learning_rate": 0.00019373046998476598, + "loss": 1.2244, + "step": 61680 + }, + { + "epoch": 0.13106854647021612, + "grad_norm": 0.33608779311180115, + "learning_rate": 0.00019372809607056115, + "loss": 1.1968, + "step": 61690 + }, + { + "epoch": 0.13108979278995517, + "grad_norm": 0.34869834780693054, + "learning_rate": 0.0001937257217215585, + "loss": 1.1961, + "step": 61700 + }, + { + "epoch": 0.1311110391096942, + "grad_norm": 0.4597533047199249, + "learning_rate": 0.00019372334693776902, + "loss": 1.2101, + "step": 61710 + }, + { + "epoch": 0.13113228542943328, + "grad_norm": 0.45453789830207825, + "learning_rate": 0.00019372097171920373, + "loss": 1.1898, + "step": 61720 + }, + { + "epoch": 0.13115353174917233, + "grad_norm": 0.39236757159233093, + "learning_rate": 0.0001937185960658737, + "loss": 1.1968, + "step": 61730 + }, + { + "epoch": 0.13117477806891137, + "grad_norm": 0.3512004017829895, + "learning_rate": 0.00019371621997778986, + "loss": 1.2204, + "step": 61740 + }, + { + "epoch": 0.13119602438865044, + "grad_norm": 0.7545173168182373, + "learning_rate": 0.00019371384345496332, + "loss": 1.2029, + "step": 61750 + }, + { + "epoch": 0.13121727070838948, + "grad_norm": 0.33767879009246826, + "learning_rate": 0.00019371146649740505, + "loss": 1.2245, + "step": 61760 + }, + { + "epoch": 0.13123851702812853, + "grad_norm": 0.3578895330429077, + "learning_rate": 0.0001937090891051261, + "loss": 1.2411, + "step": 61770 + }, + { + "epoch": 0.1312597633478676, + "grad_norm": 0.5305561423301697, + "learning_rate": 0.00019370671127813752, + "loss": 1.2025, + "step": 61780 + }, + { + "epoch": 0.13128100966760664, + "grad_norm": 0.3474670946598053, + "learning_rate": 0.00019370433301645027, + "loss": 1.238, + "step": 61790 + }, + { + "epoch": 0.13130225598734568, + "grad_norm": 0.3762388527393341, + "learning_rate": 0.00019370195432007545, + "loss": 1.1818, + "step": 61800 + }, + { + "epoch": 0.13132350230708476, + "grad_norm": 0.3552713394165039, + "learning_rate": 0.00019369957518902407, + "loss": 1.1794, + "step": 61810 + }, + { + "epoch": 0.1313447486268238, + "grad_norm": 0.3421344459056854, + "learning_rate": 0.00019369719562330716, + "loss": 1.2022, + "step": 61820 + }, + { + "epoch": 0.13136599494656284, + "grad_norm": 0.5324142575263977, + "learning_rate": 0.0001936948156229358, + "loss": 1.2202, + "step": 61830 + }, + { + "epoch": 0.1313872412663019, + "grad_norm": 0.6637104153633118, + "learning_rate": 0.00019369243518792097, + "loss": 1.2283, + "step": 61840 + }, + { + "epoch": 0.13140848758604096, + "grad_norm": 0.46398720145225525, + "learning_rate": 0.00019369005431827373, + "loss": 1.1858, + "step": 61850 + }, + { + "epoch": 0.13142973390578, + "grad_norm": 0.3558966815471649, + "learning_rate": 0.00019368767301400515, + "loss": 1.2073, + "step": 61860 + }, + { + "epoch": 0.13145098022551907, + "grad_norm": 0.32970210909843445, + "learning_rate": 0.00019368529127512628, + "loss": 1.203, + "step": 61870 + }, + { + "epoch": 0.13147222654525811, + "grad_norm": 0.40211382508277893, + "learning_rate": 0.0001936829091016481, + "loss": 1.2492, + "step": 61880 + }, + { + "epoch": 0.13149347286499716, + "grad_norm": 0.3809279501438141, + "learning_rate": 0.00019368052649358175, + "loss": 1.2, + "step": 61890 + }, + { + "epoch": 0.13151471918473623, + "grad_norm": 0.3548700511455536, + "learning_rate": 0.00019367814345093822, + "loss": 1.2187, + "step": 61900 + }, + { + "epoch": 0.13153596550447527, + "grad_norm": 0.3642164170742035, + "learning_rate": 0.00019367575997372857, + "loss": 1.2046, + "step": 61910 + }, + { + "epoch": 0.13155721182421432, + "grad_norm": 0.3394564688205719, + "learning_rate": 0.0001936733760619639, + "loss": 1.1662, + "step": 61920 + }, + { + "epoch": 0.1315784581439534, + "grad_norm": 0.3641751706600189, + "learning_rate": 0.00019367099171565527, + "loss": 1.2082, + "step": 61930 + }, + { + "epoch": 0.13159970446369243, + "grad_norm": 0.3289339244365692, + "learning_rate": 0.0001936686069348137, + "loss": 1.1881, + "step": 61940 + }, + { + "epoch": 0.13162095078343147, + "grad_norm": 0.4025399088859558, + "learning_rate": 0.00019366622171945025, + "loss": 1.2007, + "step": 61950 + }, + { + "epoch": 0.13164219710317054, + "grad_norm": 0.3859935402870178, + "learning_rate": 0.00019366383606957602, + "loss": 1.1957, + "step": 61960 + }, + { + "epoch": 0.1316634434229096, + "grad_norm": 0.3612760901451111, + "learning_rate": 0.00019366144998520204, + "loss": 1.2135, + "step": 61970 + }, + { + "epoch": 0.13168468974264863, + "grad_norm": 0.6279416680335999, + "learning_rate": 0.0001936590634663394, + "loss": 1.2026, + "step": 61980 + }, + { + "epoch": 0.1317059360623877, + "grad_norm": 0.36846911907196045, + "learning_rate": 0.0001936566765129992, + "loss": 1.2102, + "step": 61990 + }, + { + "epoch": 0.13172718238212675, + "grad_norm": 0.46803489327430725, + "learning_rate": 0.00019365428912519243, + "loss": 1.199, + "step": 62000 + }, + { + "epoch": 0.1317484287018658, + "grad_norm": 0.5571045279502869, + "learning_rate": 0.00019365190130293025, + "loss": 1.2224, + "step": 62010 + }, + { + "epoch": 0.13176967502160486, + "grad_norm": 0.33946332335472107, + "learning_rate": 0.0001936495130462237, + "loss": 1.1638, + "step": 62020 + }, + { + "epoch": 0.1317909213413439, + "grad_norm": 0.42721158266067505, + "learning_rate": 0.00019364712435508385, + "loss": 1.2408, + "step": 62030 + }, + { + "epoch": 0.13181216766108295, + "grad_norm": 0.40523386001586914, + "learning_rate": 0.0001936447352295218, + "loss": 1.1886, + "step": 62040 + }, + { + "epoch": 0.13183341398082202, + "grad_norm": 0.40677931904792786, + "learning_rate": 0.00019364234566954865, + "loss": 1.2184, + "step": 62050 + }, + { + "epoch": 0.13185466030056106, + "grad_norm": 0.34658539295196533, + "learning_rate": 0.00019363995567517542, + "loss": 1.242, + "step": 62060 + }, + { + "epoch": 0.1318759066203001, + "grad_norm": 0.3591732680797577, + "learning_rate": 0.00019363756524641326, + "loss": 1.1919, + "step": 62070 + }, + { + "epoch": 0.13189715294003918, + "grad_norm": 0.44371598958969116, + "learning_rate": 0.00019363517438327323, + "loss": 1.2152, + "step": 62080 + }, + { + "epoch": 0.13191839925977822, + "grad_norm": 0.3139362931251526, + "learning_rate": 0.00019363278308576646, + "loss": 1.2317, + "step": 62090 + }, + { + "epoch": 0.13193964557951726, + "grad_norm": 0.33662083745002747, + "learning_rate": 0.00019363039135390398, + "loss": 1.2143, + "step": 62100 + }, + { + "epoch": 0.13196089189925633, + "grad_norm": 0.5276017785072327, + "learning_rate": 0.0001936279991876969, + "loss": 1.2007, + "step": 62110 + }, + { + "epoch": 0.13198213821899538, + "grad_norm": 0.350008487701416, + "learning_rate": 0.00019362560658715635, + "loss": 1.2262, + "step": 62120 + }, + { + "epoch": 0.13200338453873442, + "grad_norm": 0.3965320885181427, + "learning_rate": 0.00019362321355229342, + "loss": 1.1814, + "step": 62130 + }, + { + "epoch": 0.1320246308584735, + "grad_norm": 0.46125200390815735, + "learning_rate": 0.0001936208200831192, + "loss": 1.2317, + "step": 62140 + }, + { + "epoch": 0.13204587717821253, + "grad_norm": 0.547206461429596, + "learning_rate": 0.0001936184261796448, + "loss": 1.2395, + "step": 62150 + }, + { + "epoch": 0.13206712349795158, + "grad_norm": 0.44253939390182495, + "learning_rate": 0.00019361603184188132, + "loss": 1.2063, + "step": 62160 + }, + { + "epoch": 0.13208836981769065, + "grad_norm": 0.3687870502471924, + "learning_rate": 0.0001936136370698399, + "loss": 1.2456, + "step": 62170 + }, + { + "epoch": 0.1321096161374297, + "grad_norm": 0.3348683714866638, + "learning_rate": 0.0001936112418635316, + "loss": 1.215, + "step": 62180 + }, + { + "epoch": 0.13213086245716874, + "grad_norm": 0.33853423595428467, + "learning_rate": 0.00019360884622296752, + "loss": 1.2315, + "step": 62190 + }, + { + "epoch": 0.1321521087769078, + "grad_norm": 0.4770808517932892, + "learning_rate": 0.00019360645014815882, + "loss": 1.2326, + "step": 62200 + }, + { + "epoch": 0.13217335509664685, + "grad_norm": 0.32996490597724915, + "learning_rate": 0.0001936040536391166, + "loss": 1.2289, + "step": 62210 + }, + { + "epoch": 0.1321946014163859, + "grad_norm": 0.3323967456817627, + "learning_rate": 0.00019360165669585197, + "loss": 1.2178, + "step": 62220 + }, + { + "epoch": 0.13221584773612496, + "grad_norm": 0.4128265976905823, + "learning_rate": 0.00019359925931837605, + "loss": 1.1938, + "step": 62230 + }, + { + "epoch": 0.132237094055864, + "grad_norm": 0.49674472212791443, + "learning_rate": 0.00019359686150669997, + "loss": 1.1929, + "step": 62240 + }, + { + "epoch": 0.13225834037560305, + "grad_norm": 0.47989752888679504, + "learning_rate": 0.00019359446326083486, + "loss": 1.2138, + "step": 62250 + }, + { + "epoch": 0.13227958669534212, + "grad_norm": 0.4854673147201538, + "learning_rate": 0.00019359206458079183, + "loss": 1.1925, + "step": 62260 + }, + { + "epoch": 0.13230083301508117, + "grad_norm": 0.37083640694618225, + "learning_rate": 0.00019358966546658202, + "loss": 1.2158, + "step": 62270 + }, + { + "epoch": 0.1323220793348202, + "grad_norm": 0.3828258514404297, + "learning_rate": 0.0001935872659182165, + "loss": 1.1814, + "step": 62280 + }, + { + "epoch": 0.13234332565455928, + "grad_norm": 0.3613477349281311, + "learning_rate": 0.00019358486593570653, + "loss": 1.2502, + "step": 62290 + }, + { + "epoch": 0.13236457197429832, + "grad_norm": 0.36151888966560364, + "learning_rate": 0.00019358246551906313, + "loss": 1.1964, + "step": 62300 + }, + { + "epoch": 0.13238581829403737, + "grad_norm": 0.36045902967453003, + "learning_rate": 0.0001935800646682975, + "loss": 1.1887, + "step": 62310 + }, + { + "epoch": 0.13240706461377644, + "grad_norm": 0.39765670895576477, + "learning_rate": 0.00019357766338342066, + "loss": 1.2189, + "step": 62320 + }, + { + "epoch": 0.13242831093351548, + "grad_norm": 0.47626447677612305, + "learning_rate": 0.0001935752616644439, + "loss": 1.2295, + "step": 62330 + }, + { + "epoch": 0.13244955725325452, + "grad_norm": 0.43835213780403137, + "learning_rate": 0.0001935728595113783, + "loss": 1.2166, + "step": 62340 + }, + { + "epoch": 0.1324708035729936, + "grad_norm": 0.3204367160797119, + "learning_rate": 0.000193570456924235, + "loss": 1.2748, + "step": 62350 + }, + { + "epoch": 0.13249204989273264, + "grad_norm": 0.3696579039096832, + "learning_rate": 0.00019356805390302516, + "loss": 1.2212, + "step": 62360 + }, + { + "epoch": 0.1325132962124717, + "grad_norm": 0.3741706609725952, + "learning_rate": 0.00019356565044775992, + "loss": 1.2036, + "step": 62370 + }, + { + "epoch": 0.13253454253221075, + "grad_norm": 0.3273124694824219, + "learning_rate": 0.0001935632465584504, + "loss": 1.2563, + "step": 62380 + }, + { + "epoch": 0.1325557888519498, + "grad_norm": 0.38024184107780457, + "learning_rate": 0.00019356084223510776, + "loss": 1.2003, + "step": 62390 + }, + { + "epoch": 0.13257703517168887, + "grad_norm": 0.5569319128990173, + "learning_rate": 0.0001935584374777432, + "loss": 1.2326, + "step": 62400 + }, + { + "epoch": 0.1325982814914279, + "grad_norm": 0.40011531114578247, + "learning_rate": 0.00019355603228636784, + "loss": 1.1944, + "step": 62410 + }, + { + "epoch": 0.13261952781116695, + "grad_norm": 0.3729434907436371, + "learning_rate": 0.00019355362666099283, + "loss": 1.2014, + "step": 62420 + }, + { + "epoch": 0.13264077413090603, + "grad_norm": 0.3881126642227173, + "learning_rate": 0.00019355122060162936, + "loss": 1.1892, + "step": 62430 + }, + { + "epoch": 0.13266202045064507, + "grad_norm": 0.31840527057647705, + "learning_rate": 0.00019354881410828858, + "loss": 1.2627, + "step": 62440 + }, + { + "epoch": 0.1326832667703841, + "grad_norm": 0.5377467274665833, + "learning_rate": 0.00019354640718098162, + "loss": 1.2063, + "step": 62450 + }, + { + "epoch": 0.13270451309012318, + "grad_norm": 0.4361065924167633, + "learning_rate": 0.00019354399981971968, + "loss": 1.2008, + "step": 62460 + }, + { + "epoch": 0.13272575940986223, + "grad_norm": 0.38856038451194763, + "learning_rate": 0.00019354159202451395, + "loss": 1.1812, + "step": 62470 + }, + { + "epoch": 0.13274700572960127, + "grad_norm": 0.462958425283432, + "learning_rate": 0.00019353918379537554, + "loss": 1.2076, + "step": 62480 + }, + { + "epoch": 0.13276825204934034, + "grad_norm": 0.349252313375473, + "learning_rate": 0.00019353677513231566, + "loss": 1.2122, + "step": 62490 + }, + { + "epoch": 0.13278949836907938, + "grad_norm": 0.3547405004501343, + "learning_rate": 0.00019353436603534546, + "loss": 1.2165, + "step": 62500 + }, + { + "epoch": 0.13281074468881843, + "grad_norm": 0.313536137342453, + "learning_rate": 0.00019353195650447614, + "loss": 1.22, + "step": 62510 + }, + { + "epoch": 0.1328319910085575, + "grad_norm": 0.41542884707450867, + "learning_rate": 0.00019352954653971887, + "loss": 1.1909, + "step": 62520 + }, + { + "epoch": 0.13285323732829654, + "grad_norm": 0.4307152032852173, + "learning_rate": 0.00019352713614108487, + "loss": 1.1922, + "step": 62530 + }, + { + "epoch": 0.13287448364803559, + "grad_norm": 0.3744797706604004, + "learning_rate": 0.00019352472530858523, + "loss": 1.2255, + "step": 62540 + }, + { + "epoch": 0.13289572996777466, + "grad_norm": 0.32843485474586487, + "learning_rate": 0.00019352231404223117, + "loss": 1.2263, + "step": 62550 + }, + { + "epoch": 0.1329169762875137, + "grad_norm": 0.34931033849716187, + "learning_rate": 0.00019351990234203395, + "loss": 1.2197, + "step": 62560 + }, + { + "epoch": 0.13293822260725274, + "grad_norm": 0.5599557757377625, + "learning_rate": 0.00019351749020800465, + "loss": 1.2181, + "step": 62570 + }, + { + "epoch": 0.13295946892699181, + "grad_norm": 0.5091832280158997, + "learning_rate": 0.0001935150776401545, + "loss": 1.2133, + "step": 62580 + }, + { + "epoch": 0.13298071524673086, + "grad_norm": 0.4873722195625305, + "learning_rate": 0.00019351266463849473, + "loss": 1.2162, + "step": 62590 + }, + { + "epoch": 0.1330019615664699, + "grad_norm": 0.3213942050933838, + "learning_rate": 0.0001935102512030365, + "loss": 1.2118, + "step": 62600 + }, + { + "epoch": 0.13302320788620897, + "grad_norm": 0.3735823929309845, + "learning_rate": 0.000193507837333791, + "loss": 1.191, + "step": 62610 + }, + { + "epoch": 0.13304445420594802, + "grad_norm": 0.3896942734718323, + "learning_rate": 0.00019350542303076944, + "loss": 1.1975, + "step": 62620 + }, + { + "epoch": 0.13306570052568706, + "grad_norm": 0.35228458046913147, + "learning_rate": 0.000193503008293983, + "loss": 1.1599, + "step": 62630 + }, + { + "epoch": 0.13308694684542613, + "grad_norm": 0.38204893469810486, + "learning_rate": 0.0001935005931234429, + "loss": 1.2082, + "step": 62640 + }, + { + "epoch": 0.13310819316516517, + "grad_norm": 0.3921249806880951, + "learning_rate": 0.00019349817751916035, + "loss": 1.1812, + "step": 62650 + }, + { + "epoch": 0.13312943948490422, + "grad_norm": 0.33623185753822327, + "learning_rate": 0.00019349576148114657, + "loss": 1.2184, + "step": 62660 + }, + { + "epoch": 0.1331506858046433, + "grad_norm": 0.5257618427276611, + "learning_rate": 0.0001934933450094127, + "loss": 1.1963, + "step": 62670 + }, + { + "epoch": 0.13317193212438233, + "grad_norm": 0.38962242007255554, + "learning_rate": 0.00019349092810397003, + "loss": 1.2248, + "step": 62680 + }, + { + "epoch": 0.13319317844412137, + "grad_norm": 0.5284601449966431, + "learning_rate": 0.0001934885107648297, + "loss": 1.241, + "step": 62690 + }, + { + "epoch": 0.13321442476386045, + "grad_norm": 0.523084282875061, + "learning_rate": 0.000193486092992003, + "loss": 1.1947, + "step": 62700 + }, + { + "epoch": 0.1332356710835995, + "grad_norm": 0.37586790323257446, + "learning_rate": 0.00019348367478550107, + "loss": 1.1821, + "step": 62710 + }, + { + "epoch": 0.13325691740333853, + "grad_norm": 0.6152288317680359, + "learning_rate": 0.00019348125614533518, + "loss": 1.1853, + "step": 62720 + }, + { + "epoch": 0.1332781637230776, + "grad_norm": 0.32934847474098206, + "learning_rate": 0.00019347883707151656, + "loss": 1.2075, + "step": 62730 + }, + { + "epoch": 0.13329941004281665, + "grad_norm": 0.3122413754463196, + "learning_rate": 0.00019347641756405635, + "loss": 1.2038, + "step": 62740 + }, + { + "epoch": 0.1333206563625557, + "grad_norm": 0.35514065623283386, + "learning_rate": 0.00019347399762296584, + "loss": 1.2064, + "step": 62750 + }, + { + "epoch": 0.13334190268229476, + "grad_norm": 0.5796473026275635, + "learning_rate": 0.00019347157724825632, + "loss": 1.2076, + "step": 62760 + }, + { + "epoch": 0.1333631490020338, + "grad_norm": 0.5571184158325195, + "learning_rate": 0.00019346915643993887, + "loss": 1.2166, + "step": 62770 + }, + { + "epoch": 0.13338439532177285, + "grad_norm": 0.4952801764011383, + "learning_rate": 0.0001934667351980248, + "loss": 1.1976, + "step": 62780 + }, + { + "epoch": 0.13340564164151192, + "grad_norm": 0.34734445810317993, + "learning_rate": 0.00019346431352252537, + "loss": 1.2213, + "step": 62790 + }, + { + "epoch": 0.13342688796125096, + "grad_norm": 0.42299845814704895, + "learning_rate": 0.00019346189141345176, + "loss": 1.2258, + "step": 62800 + }, + { + "epoch": 0.13344813428099, + "grad_norm": 0.7000302076339722, + "learning_rate": 0.0001934594688708152, + "loss": 1.2067, + "step": 62810 + }, + { + "epoch": 0.13346938060072908, + "grad_norm": 0.44560110569000244, + "learning_rate": 0.00019345704589462698, + "loss": 1.2038, + "step": 62820 + }, + { + "epoch": 0.13349062692046812, + "grad_norm": 0.33804526925086975, + "learning_rate": 0.0001934546224848983, + "loss": 1.1873, + "step": 62830 + }, + { + "epoch": 0.13351187324020716, + "grad_norm": 0.325439989566803, + "learning_rate": 0.00019345219864164043, + "loss": 1.2373, + "step": 62840 + }, + { + "epoch": 0.13353311955994623, + "grad_norm": 0.37796691060066223, + "learning_rate": 0.00019344977436486463, + "loss": 1.2214, + "step": 62850 + }, + { + "epoch": 0.13355436587968528, + "grad_norm": 0.3902392089366913, + "learning_rate": 0.00019344734965458206, + "loss": 1.1977, + "step": 62860 + }, + { + "epoch": 0.13357561219942432, + "grad_norm": 0.4691033661365509, + "learning_rate": 0.00019344492451080404, + "loss": 1.2191, + "step": 62870 + }, + { + "epoch": 0.1335968585191634, + "grad_norm": 0.5724672675132751, + "learning_rate": 0.0001934424989335418, + "loss": 1.2368, + "step": 62880 + }, + { + "epoch": 0.13361810483890244, + "grad_norm": 0.7898520231246948, + "learning_rate": 0.00019344007292280662, + "loss": 1.2257, + "step": 62890 + }, + { + "epoch": 0.13363935115864148, + "grad_norm": 0.3805020749568939, + "learning_rate": 0.0001934376464786097, + "loss": 1.224, + "step": 62900 + }, + { + "epoch": 0.13366059747838055, + "grad_norm": 0.34977152943611145, + "learning_rate": 0.00019343521960096235, + "loss": 1.2125, + "step": 62910 + }, + { + "epoch": 0.1336818437981196, + "grad_norm": 0.5182188153266907, + "learning_rate": 0.0001934327922898758, + "loss": 1.2623, + "step": 62920 + }, + { + "epoch": 0.13370309011785864, + "grad_norm": 0.3107888698577881, + "learning_rate": 0.00019343036454536131, + "loss": 1.2447, + "step": 62930 + }, + { + "epoch": 0.1337243364375977, + "grad_norm": 0.3639746904373169, + "learning_rate": 0.00019342793636743016, + "loss": 1.1857, + "step": 62940 + }, + { + "epoch": 0.13374558275733675, + "grad_norm": 0.4500874876976013, + "learning_rate": 0.00019342550775609357, + "loss": 1.2073, + "step": 62950 + }, + { + "epoch": 0.1337668290770758, + "grad_norm": 0.3491474986076355, + "learning_rate": 0.00019342307871136286, + "loss": 1.188, + "step": 62960 + }, + { + "epoch": 0.13378807539681487, + "grad_norm": 0.40493130683898926, + "learning_rate": 0.00019342064923324925, + "loss": 1.2383, + "step": 62970 + }, + { + "epoch": 0.1338093217165539, + "grad_norm": 0.41096231341362, + "learning_rate": 0.00019341821932176406, + "loss": 1.2602, + "step": 62980 + }, + { + "epoch": 0.13383056803629295, + "grad_norm": 0.3292977511882782, + "learning_rate": 0.00019341578897691855, + "loss": 1.1772, + "step": 62990 + }, + { + "epoch": 0.13385181435603202, + "grad_norm": 0.34835493564605713, + "learning_rate": 0.00019341335819872394, + "loss": 1.2045, + "step": 63000 + }, + { + "epoch": 0.13387306067577107, + "grad_norm": 0.5300415754318237, + "learning_rate": 0.00019341092698719158, + "loss": 1.1624, + "step": 63010 + }, + { + "epoch": 0.1338943069955101, + "grad_norm": 0.6936304569244385, + "learning_rate": 0.0001934084953423327, + "loss": 1.2165, + "step": 63020 + }, + { + "epoch": 0.13391555331524918, + "grad_norm": 0.48787787556648254, + "learning_rate": 0.0001934060632641586, + "loss": 1.1646, + "step": 63030 + }, + { + "epoch": 0.13393679963498822, + "grad_norm": 0.5067340135574341, + "learning_rate": 0.00019340363075268056, + "loss": 1.2444, + "step": 63040 + }, + { + "epoch": 0.13395804595472727, + "grad_norm": 0.43742451071739197, + "learning_rate": 0.00019340119780790986, + "loss": 1.2218, + "step": 63050 + }, + { + "epoch": 0.13397929227446634, + "grad_norm": 0.3763223886489868, + "learning_rate": 0.00019339876442985777, + "loss": 1.2021, + "step": 63060 + }, + { + "epoch": 0.13400053859420538, + "grad_norm": 0.3520102798938751, + "learning_rate": 0.00019339633061853562, + "loss": 1.1998, + "step": 63070 + }, + { + "epoch": 0.13402178491394443, + "grad_norm": 0.33826521039009094, + "learning_rate": 0.00019339389637395467, + "loss": 1.1868, + "step": 63080 + }, + { + "epoch": 0.1340430312336835, + "grad_norm": 0.3757309019565582, + "learning_rate": 0.00019339146169612624, + "loss": 1.1874, + "step": 63090 + }, + { + "epoch": 0.13406427755342254, + "grad_norm": 0.3362612724304199, + "learning_rate": 0.00019338902658506157, + "loss": 1.1805, + "step": 63100 + }, + { + "epoch": 0.13408552387316158, + "grad_norm": 0.32557159662246704, + "learning_rate": 0.00019338659104077202, + "loss": 1.1868, + "step": 63110 + }, + { + "epoch": 0.13410677019290065, + "grad_norm": 0.36192020773887634, + "learning_rate": 0.00019338415506326887, + "loss": 1.2311, + "step": 63120 + }, + { + "epoch": 0.1341280165126397, + "grad_norm": 0.3509942889213562, + "learning_rate": 0.00019338171865256336, + "loss": 1.1996, + "step": 63130 + }, + { + "epoch": 0.13414926283237874, + "grad_norm": 0.3784002959728241, + "learning_rate": 0.00019337928180866688, + "loss": 1.2016, + "step": 63140 + }, + { + "epoch": 0.1341705091521178, + "grad_norm": 0.5121545791625977, + "learning_rate": 0.00019337684453159067, + "loss": 1.1994, + "step": 63150 + }, + { + "epoch": 0.13419175547185686, + "grad_norm": 0.3382102847099304, + "learning_rate": 0.00019337440682134607, + "loss": 1.1985, + "step": 63160 + }, + { + "epoch": 0.1342130017915959, + "grad_norm": 0.3870947062969208, + "learning_rate": 0.0001933719686779444, + "loss": 1.2387, + "step": 63170 + }, + { + "epoch": 0.13423424811133497, + "grad_norm": 0.3285522162914276, + "learning_rate": 0.0001933695301013969, + "loss": 1.1965, + "step": 63180 + }, + { + "epoch": 0.134255494431074, + "grad_norm": 0.3274693191051483, + "learning_rate": 0.00019336709109171502, + "loss": 1.2202, + "step": 63190 + }, + { + "epoch": 0.13427674075081308, + "grad_norm": 0.36633193492889404, + "learning_rate": 0.00019336465164890992, + "loss": 1.2051, + "step": 63200 + }, + { + "epoch": 0.13429798707055213, + "grad_norm": 0.4043840765953064, + "learning_rate": 0.00019336221177299295, + "loss": 1.181, + "step": 63210 + }, + { + "epoch": 0.13431923339029117, + "grad_norm": 0.43166834115982056, + "learning_rate": 0.00019335977146397554, + "loss": 1.1993, + "step": 63220 + }, + { + "epoch": 0.13434047971003024, + "grad_norm": 0.3221701681613922, + "learning_rate": 0.0001933573307218689, + "loss": 1.1688, + "step": 63230 + }, + { + "epoch": 0.13436172602976929, + "grad_norm": 0.33191952109336853, + "learning_rate": 0.00019335488954668438, + "loss": 1.2133, + "step": 63240 + }, + { + "epoch": 0.13438297234950833, + "grad_norm": 0.3566425144672394, + "learning_rate": 0.00019335244793843332, + "loss": 1.2127, + "step": 63250 + }, + { + "epoch": 0.1344042186692474, + "grad_norm": 0.3680981993675232, + "learning_rate": 0.00019335000589712704, + "loss": 1.2165, + "step": 63260 + }, + { + "epoch": 0.13442546498898644, + "grad_norm": 0.34498831629753113, + "learning_rate": 0.00019334756342277688, + "loss": 1.2449, + "step": 63270 + }, + { + "epoch": 0.1344467113087255, + "grad_norm": 0.32378149032592773, + "learning_rate": 0.00019334512051539412, + "loss": 1.2059, + "step": 63280 + }, + { + "epoch": 0.13446795762846456, + "grad_norm": 0.6339020729064941, + "learning_rate": 0.00019334267717499012, + "loss": 1.2008, + "step": 63290 + }, + { + "epoch": 0.1344892039482036, + "grad_norm": 0.39248356223106384, + "learning_rate": 0.00019334023340157624, + "loss": 1.1917, + "step": 63300 + }, + { + "epoch": 0.13451045026794264, + "grad_norm": 0.32721415162086487, + "learning_rate": 0.0001933377891951638, + "loss": 1.1976, + "step": 63310 + }, + { + "epoch": 0.13453169658768172, + "grad_norm": 0.3639782667160034, + "learning_rate": 0.00019333534455576415, + "loss": 1.21, + "step": 63320 + }, + { + "epoch": 0.13455294290742076, + "grad_norm": 0.42431777715682983, + "learning_rate": 0.0001933328994833886, + "loss": 1.1943, + "step": 63330 + }, + { + "epoch": 0.1345741892271598, + "grad_norm": 0.5767587423324585, + "learning_rate": 0.0001933304539780485, + "loss": 1.1966, + "step": 63340 + }, + { + "epoch": 0.13459543554689887, + "grad_norm": 0.3422752320766449, + "learning_rate": 0.0001933280080397552, + "loss": 1.2135, + "step": 63350 + }, + { + "epoch": 0.13461668186663792, + "grad_norm": 0.41941606998443604, + "learning_rate": 0.0001933255616685201, + "loss": 1.2245, + "step": 63360 + }, + { + "epoch": 0.13463792818637696, + "grad_norm": 0.4482526183128357, + "learning_rate": 0.00019332311486435446, + "loss": 1.2246, + "step": 63370 + }, + { + "epoch": 0.13465917450611603, + "grad_norm": 0.6148274540901184, + "learning_rate": 0.00019332066762726968, + "loss": 1.1837, + "step": 63380 + }, + { + "epoch": 0.13468042082585507, + "grad_norm": 0.7985623478889465, + "learning_rate": 0.00019331821995727708, + "loss": 1.2396, + "step": 63390 + }, + { + "epoch": 0.13470166714559412, + "grad_norm": 0.4416220784187317, + "learning_rate": 0.00019331577185438808, + "loss": 1.2166, + "step": 63400 + }, + { + "epoch": 0.1347229134653332, + "grad_norm": 0.33691275119781494, + "learning_rate": 0.00019331332331861396, + "loss": 1.176, + "step": 63410 + }, + { + "epoch": 0.13474415978507223, + "grad_norm": 0.3462012708187103, + "learning_rate": 0.00019331087434996613, + "loss": 1.2093, + "step": 63420 + }, + { + "epoch": 0.13476540610481128, + "grad_norm": 0.33128371834754944, + "learning_rate": 0.0001933084249484559, + "loss": 1.2193, + "step": 63430 + }, + { + "epoch": 0.13478665242455035, + "grad_norm": 0.3540799915790558, + "learning_rate": 0.0001933059751140947, + "loss": 1.2008, + "step": 63440 + }, + { + "epoch": 0.1348078987442894, + "grad_norm": 0.39937686920166016, + "learning_rate": 0.00019330352484689387, + "loss": 1.2487, + "step": 63450 + }, + { + "epoch": 0.13482914506402843, + "grad_norm": 0.4712822437286377, + "learning_rate": 0.00019330107414686473, + "loss": 1.2364, + "step": 63460 + }, + { + "epoch": 0.1348503913837675, + "grad_norm": 0.35802748799324036, + "learning_rate": 0.00019329862301401872, + "loss": 1.2225, + "step": 63470 + }, + { + "epoch": 0.13487163770350655, + "grad_norm": 0.38922929763793945, + "learning_rate": 0.00019329617144836714, + "loss": 1.2332, + "step": 63480 + }, + { + "epoch": 0.1348928840232456, + "grad_norm": 0.4515446424484253, + "learning_rate": 0.0001932937194499214, + "loss": 1.2029, + "step": 63490 + }, + { + "epoch": 0.13491413034298466, + "grad_norm": 0.3162612020969391, + "learning_rate": 0.0001932912670186929, + "loss": 1.166, + "step": 63500 + }, + { + "epoch": 0.1349353766627237, + "grad_norm": 0.5243152976036072, + "learning_rate": 0.00019328881415469297, + "loss": 1.217, + "step": 63510 + }, + { + "epoch": 0.13495662298246275, + "grad_norm": 0.3533020317554474, + "learning_rate": 0.00019328636085793303, + "loss": 1.2357, + "step": 63520 + }, + { + "epoch": 0.13497786930220182, + "grad_norm": 0.41540926694869995, + "learning_rate": 0.00019328390712842445, + "loss": 1.184, + "step": 63530 + }, + { + "epoch": 0.13499911562194086, + "grad_norm": 0.3385264575481415, + "learning_rate": 0.00019328145296617854, + "loss": 1.2033, + "step": 63540 + }, + { + "epoch": 0.1350203619416799, + "grad_norm": 0.32275390625, + "learning_rate": 0.0001932789983712068, + "loss": 1.2262, + "step": 63550 + }, + { + "epoch": 0.13504160826141898, + "grad_norm": 0.32725611329078674, + "learning_rate": 0.00019327654334352055, + "loss": 1.1818, + "step": 63560 + }, + { + "epoch": 0.13506285458115802, + "grad_norm": 0.4213753640651703, + "learning_rate": 0.00019327408788313117, + "loss": 1.2224, + "step": 63570 + }, + { + "epoch": 0.13508410090089706, + "grad_norm": 0.5133394002914429, + "learning_rate": 0.0001932716319900501, + "loss": 1.1909, + "step": 63580 + }, + { + "epoch": 0.13510534722063613, + "grad_norm": 0.47741934657096863, + "learning_rate": 0.00019326917566428868, + "loss": 1.2176, + "step": 63590 + }, + { + "epoch": 0.13512659354037518, + "grad_norm": 0.33613619208335876, + "learning_rate": 0.00019326671890585834, + "loss": 1.2234, + "step": 63600 + }, + { + "epoch": 0.13514783986011422, + "grad_norm": 0.34730011224746704, + "learning_rate": 0.0001932642617147705, + "loss": 1.2006, + "step": 63610 + }, + { + "epoch": 0.1351690861798533, + "grad_norm": 0.33271855115890503, + "learning_rate": 0.0001932618040910365, + "loss": 1.2058, + "step": 63620 + }, + { + "epoch": 0.13519033249959234, + "grad_norm": 0.5796521306037903, + "learning_rate": 0.00019325934603466776, + "loss": 1.1947, + "step": 63630 + }, + { + "epoch": 0.13521157881933138, + "grad_norm": 0.6297010779380798, + "learning_rate": 0.00019325688754567567, + "loss": 1.218, + "step": 63640 + }, + { + "epoch": 0.13523282513907045, + "grad_norm": 0.5490739345550537, + "learning_rate": 0.0001932544286240717, + "loss": 1.1907, + "step": 63650 + }, + { + "epoch": 0.1352540714588095, + "grad_norm": 0.3328763246536255, + "learning_rate": 0.00019325196926986716, + "loss": 1.1923, + "step": 63660 + }, + { + "epoch": 0.13527531777854854, + "grad_norm": 0.337618350982666, + "learning_rate": 0.00019324950948307356, + "loss": 1.2426, + "step": 63670 + }, + { + "epoch": 0.1352965640982876, + "grad_norm": 0.4034671485424042, + "learning_rate": 0.00019324704926370224, + "loss": 1.2432, + "step": 63680 + }, + { + "epoch": 0.13531781041802665, + "grad_norm": 0.33718228340148926, + "learning_rate": 0.0001932445886117646, + "loss": 1.2222, + "step": 63690 + }, + { + "epoch": 0.1353390567377657, + "grad_norm": 0.3218410909175873, + "learning_rate": 0.00019324212752727213, + "loss": 1.2273, + "step": 63700 + }, + { + "epoch": 0.13536030305750477, + "grad_norm": 0.3468562960624695, + "learning_rate": 0.0001932396660102362, + "loss": 1.1947, + "step": 63710 + }, + { + "epoch": 0.1353815493772438, + "grad_norm": 0.3392552435398102, + "learning_rate": 0.0001932372040606682, + "loss": 1.245, + "step": 63720 + }, + { + "epoch": 0.13540279569698285, + "grad_norm": 0.31721141934394836, + "learning_rate": 0.00019323474167857964, + "loss": 1.1978, + "step": 63730 + }, + { + "epoch": 0.13542404201672192, + "grad_norm": 0.4063226878643036, + "learning_rate": 0.00019323227886398184, + "loss": 1.2273, + "step": 63740 + }, + { + "epoch": 0.13544528833646097, + "grad_norm": 0.35920411348342896, + "learning_rate": 0.00019322981561688626, + "loss": 1.1938, + "step": 63750 + }, + { + "epoch": 0.1354665346562, + "grad_norm": 0.34377142786979675, + "learning_rate": 0.00019322735193730436, + "loss": 1.2047, + "step": 63760 + }, + { + "epoch": 0.13548778097593908, + "grad_norm": 0.3436441123485565, + "learning_rate": 0.00019322488782524757, + "loss": 1.1633, + "step": 63770 + }, + { + "epoch": 0.13550902729567812, + "grad_norm": 0.3247266113758087, + "learning_rate": 0.00019322242328072728, + "loss": 1.2056, + "step": 63780 + }, + { + "epoch": 0.13553027361541717, + "grad_norm": 0.3730684220790863, + "learning_rate": 0.00019321995830375493, + "loss": 1.2219, + "step": 63790 + }, + { + "epoch": 0.13555151993515624, + "grad_norm": 0.39595121145248413, + "learning_rate": 0.000193217492894342, + "loss": 1.2055, + "step": 63800 + }, + { + "epoch": 0.13557276625489528, + "grad_norm": 0.4101845622062683, + "learning_rate": 0.00019321502705249986, + "loss": 1.1832, + "step": 63810 + }, + { + "epoch": 0.13559401257463433, + "grad_norm": 0.3209364116191864, + "learning_rate": 0.00019321256077823996, + "loss": 1.1994, + "step": 63820 + }, + { + "epoch": 0.1356152588943734, + "grad_norm": 0.39616695046424866, + "learning_rate": 0.0001932100940715738, + "loss": 1.1875, + "step": 63830 + }, + { + "epoch": 0.13563650521411244, + "grad_norm": 0.3148147165775299, + "learning_rate": 0.00019320762693251278, + "loss": 1.1797, + "step": 63840 + }, + { + "epoch": 0.13565775153385148, + "grad_norm": 0.4062342643737793, + "learning_rate": 0.00019320515936106835, + "loss": 1.1921, + "step": 63850 + }, + { + "epoch": 0.13567899785359055, + "grad_norm": 0.4152895212173462, + "learning_rate": 0.00019320269135725195, + "loss": 1.2024, + "step": 63860 + }, + { + "epoch": 0.1357002441733296, + "grad_norm": 0.4005035161972046, + "learning_rate": 0.00019320022292107503, + "loss": 1.185, + "step": 63870 + }, + { + "epoch": 0.13572149049306864, + "grad_norm": 0.3773094117641449, + "learning_rate": 0.00019319775405254904, + "loss": 1.1966, + "step": 63880 + }, + { + "epoch": 0.1357427368128077, + "grad_norm": 0.34163689613342285, + "learning_rate": 0.00019319528475168546, + "loss": 1.2047, + "step": 63890 + }, + { + "epoch": 0.13576398313254676, + "grad_norm": 0.5309532284736633, + "learning_rate": 0.00019319281501849574, + "loss": 1.2283, + "step": 63900 + }, + { + "epoch": 0.1357852294522858, + "grad_norm": 0.33282509446144104, + "learning_rate": 0.0001931903448529913, + "loss": 1.2066, + "step": 63910 + }, + { + "epoch": 0.13580647577202487, + "grad_norm": 0.351519376039505, + "learning_rate": 0.00019318787425518362, + "loss": 1.1991, + "step": 63920 + }, + { + "epoch": 0.1358277220917639, + "grad_norm": 0.3679982125759125, + "learning_rate": 0.00019318540322508417, + "loss": 1.2212, + "step": 63930 + }, + { + "epoch": 0.13584896841150296, + "grad_norm": 0.3613322973251343, + "learning_rate": 0.0001931829317627044, + "loss": 1.2426, + "step": 63940 + }, + { + "epoch": 0.13587021473124203, + "grad_norm": 0.31162357330322266, + "learning_rate": 0.00019318045986805576, + "loss": 1.1952, + "step": 63950 + }, + { + "epoch": 0.13589146105098107, + "grad_norm": 0.3174651265144348, + "learning_rate": 0.00019317798754114974, + "loss": 1.1985, + "step": 63960 + }, + { + "epoch": 0.13591270737072011, + "grad_norm": 0.38899946212768555, + "learning_rate": 0.0001931755147819978, + "loss": 1.1888, + "step": 63970 + }, + { + "epoch": 0.13593395369045919, + "grad_norm": 0.4280236065387726, + "learning_rate": 0.00019317304159061145, + "loss": 1.2422, + "step": 63980 + }, + { + "epoch": 0.13595520001019823, + "grad_norm": 0.37407201528549194, + "learning_rate": 0.0001931705679670021, + "loss": 1.1779, + "step": 63990 + }, + { + "epoch": 0.13597644632993727, + "grad_norm": 0.36457231640815735, + "learning_rate": 0.00019316809391118126, + "loss": 1.1925, + "step": 64000 + }, + { + "epoch": 0.13599769264967634, + "grad_norm": 0.40774670243263245, + "learning_rate": 0.0001931656194231604, + "loss": 1.2203, + "step": 64010 + }, + { + "epoch": 0.1360189389694154, + "grad_norm": 0.3777037560939789, + "learning_rate": 0.00019316314450295102, + "loss": 1.1951, + "step": 64020 + }, + { + "epoch": 0.13604018528915443, + "grad_norm": 0.32758381962776184, + "learning_rate": 0.00019316066915056458, + "loss": 1.2081, + "step": 64030 + }, + { + "epoch": 0.1360614316088935, + "grad_norm": 0.33662644028663635, + "learning_rate": 0.00019315819336601254, + "loss": 1.1912, + "step": 64040 + }, + { + "epoch": 0.13608267792863254, + "grad_norm": 0.37793874740600586, + "learning_rate": 0.00019315571714930641, + "loss": 1.227, + "step": 64050 + }, + { + "epoch": 0.13610392424837162, + "grad_norm": 0.34472256898880005, + "learning_rate": 0.00019315324050045768, + "loss": 1.1733, + "step": 64060 + }, + { + "epoch": 0.13612517056811066, + "grad_norm": 0.37439852952957153, + "learning_rate": 0.00019315076341947784, + "loss": 1.1912, + "step": 64070 + }, + { + "epoch": 0.1361464168878497, + "grad_norm": 0.35760697722435, + "learning_rate": 0.00019314828590637838, + "loss": 1.1891, + "step": 64080 + }, + { + "epoch": 0.13616766320758877, + "grad_norm": 0.37689611315727234, + "learning_rate": 0.00019314580796117077, + "loss": 1.2846, + "step": 64090 + }, + { + "epoch": 0.13618890952732782, + "grad_norm": 0.38948148488998413, + "learning_rate": 0.00019314332958386655, + "loss": 1.1947, + "step": 64100 + }, + { + "epoch": 0.13621015584706686, + "grad_norm": 0.4113651514053345, + "learning_rate": 0.00019314085077447716, + "loss": 1.1983, + "step": 64110 + }, + { + "epoch": 0.13623140216680593, + "grad_norm": 0.34839147329330444, + "learning_rate": 0.00019313837153301419, + "loss": 1.2003, + "step": 64120 + }, + { + "epoch": 0.13625264848654497, + "grad_norm": 0.3442058861255646, + "learning_rate": 0.000193135891859489, + "loss": 1.2412, + "step": 64130 + }, + { + "epoch": 0.13627389480628402, + "grad_norm": 0.3203839659690857, + "learning_rate": 0.00019313341175391323, + "loss": 1.1931, + "step": 64140 + }, + { + "epoch": 0.1362951411260231, + "grad_norm": 0.3284285068511963, + "learning_rate": 0.0001931309312162983, + "loss": 1.2394, + "step": 64150 + }, + { + "epoch": 0.13631638744576213, + "grad_norm": 0.369831919670105, + "learning_rate": 0.00019312845024665575, + "loss": 1.1937, + "step": 64160 + }, + { + "epoch": 0.13633763376550118, + "grad_norm": 0.3121185898780823, + "learning_rate": 0.00019312596884499709, + "loss": 1.1996, + "step": 64170 + }, + { + "epoch": 0.13635888008524025, + "grad_norm": 0.3452396094799042, + "learning_rate": 0.00019312348701133384, + "loss": 1.1899, + "step": 64180 + }, + { + "epoch": 0.1363801264049793, + "grad_norm": 0.3418006896972656, + "learning_rate": 0.00019312100474567747, + "loss": 1.1944, + "step": 64190 + }, + { + "epoch": 0.13640137272471833, + "grad_norm": 0.34023284912109375, + "learning_rate": 0.00019311852204803955, + "loss": 1.2027, + "step": 64200 + }, + { + "epoch": 0.1364226190444574, + "grad_norm": 0.718639612197876, + "learning_rate": 0.00019311603891843156, + "loss": 1.178, + "step": 64210 + }, + { + "epoch": 0.13644386536419645, + "grad_norm": 0.3613215386867523, + "learning_rate": 0.00019311355535686503, + "loss": 1.1986, + "step": 64220 + }, + { + "epoch": 0.1364651116839355, + "grad_norm": 0.3968524634838104, + "learning_rate": 0.00019311107136335147, + "loss": 1.1675, + "step": 64230 + }, + { + "epoch": 0.13648635800367456, + "grad_norm": 0.5130131244659424, + "learning_rate": 0.00019310858693790244, + "loss": 1.2096, + "step": 64240 + }, + { + "epoch": 0.1365076043234136, + "grad_norm": 0.3718901574611664, + "learning_rate": 0.0001931061020805294, + "loss": 1.187, + "step": 64250 + }, + { + "epoch": 0.13652885064315265, + "grad_norm": 0.5280067920684814, + "learning_rate": 0.00019310361679124394, + "loss": 1.1703, + "step": 64260 + }, + { + "epoch": 0.13655009696289172, + "grad_norm": 0.465008944272995, + "learning_rate": 0.00019310113107005758, + "loss": 1.1715, + "step": 64270 + }, + { + "epoch": 0.13657134328263076, + "grad_norm": 0.34703609347343445, + "learning_rate": 0.00019309864491698182, + "loss": 1.2179, + "step": 64280 + }, + { + "epoch": 0.1365925896023698, + "grad_norm": 0.3456876575946808, + "learning_rate": 0.00019309615833202823, + "loss": 1.1854, + "step": 64290 + }, + { + "epoch": 0.13661383592210888, + "grad_norm": 0.4095401465892792, + "learning_rate": 0.00019309367131520831, + "loss": 1.1511, + "step": 64300 + }, + { + "epoch": 0.13663508224184792, + "grad_norm": 0.32733702659606934, + "learning_rate": 0.0001930911838665336, + "loss": 1.2091, + "step": 64310 + }, + { + "epoch": 0.13665632856158696, + "grad_norm": 0.3475979268550873, + "learning_rate": 0.00019308869598601565, + "loss": 1.2235, + "step": 64320 + }, + { + "epoch": 0.13667757488132604, + "grad_norm": 0.3861352503299713, + "learning_rate": 0.000193086207673666, + "loss": 1.2229, + "step": 64330 + }, + { + "epoch": 0.13669882120106508, + "grad_norm": 0.5289942026138306, + "learning_rate": 0.00019308371892949624, + "loss": 1.2306, + "step": 64340 + }, + { + "epoch": 0.13672006752080412, + "grad_norm": 0.6578383445739746, + "learning_rate": 0.00019308122975351784, + "loss": 1.1998, + "step": 64350 + }, + { + "epoch": 0.1367413138405432, + "grad_norm": 0.7131315469741821, + "learning_rate": 0.00019307874014574237, + "loss": 1.2319, + "step": 64360 + }, + { + "epoch": 0.13676256016028224, + "grad_norm": 0.35252997279167175, + "learning_rate": 0.00019307625010618138, + "loss": 1.2176, + "step": 64370 + }, + { + "epoch": 0.13678380648002128, + "grad_norm": 0.4465612769126892, + "learning_rate": 0.00019307375963484644, + "loss": 1.2333, + "step": 64380 + }, + { + "epoch": 0.13680505279976035, + "grad_norm": 0.36689868569374084, + "learning_rate": 0.00019307126873174907, + "loss": 1.2597, + "step": 64390 + }, + { + "epoch": 0.1368262991194994, + "grad_norm": 0.3520098626613617, + "learning_rate": 0.00019306877739690088, + "loss": 1.203, + "step": 64400 + }, + { + "epoch": 0.13684754543923844, + "grad_norm": 0.43162253499031067, + "learning_rate": 0.00019306628563031336, + "loss": 1.1972, + "step": 64410 + }, + { + "epoch": 0.1368687917589775, + "grad_norm": 0.3241070806980133, + "learning_rate": 0.00019306379343199815, + "loss": 1.2279, + "step": 64420 + }, + { + "epoch": 0.13689003807871655, + "grad_norm": 0.34317296743392944, + "learning_rate": 0.00019306130080196674, + "loss": 1.1941, + "step": 64430 + }, + { + "epoch": 0.1369112843984556, + "grad_norm": 0.38821038603782654, + "learning_rate": 0.0001930588077402307, + "loss": 1.1961, + "step": 64440 + }, + { + "epoch": 0.13693253071819467, + "grad_norm": 0.48076605796813965, + "learning_rate": 0.0001930563142468016, + "loss": 1.2063, + "step": 64450 + }, + { + "epoch": 0.1369537770379337, + "grad_norm": 0.37118715047836304, + "learning_rate": 0.00019305382032169105, + "loss": 1.2139, + "step": 64460 + }, + { + "epoch": 0.13697502335767275, + "grad_norm": 0.3938905596733093, + "learning_rate": 0.00019305132596491057, + "loss": 1.2087, + "step": 64470 + }, + { + "epoch": 0.13699626967741182, + "grad_norm": 0.4092769920825958, + "learning_rate": 0.00019304883117647178, + "loss": 1.2218, + "step": 64480 + }, + { + "epoch": 0.13701751599715087, + "grad_norm": 0.379719078540802, + "learning_rate": 0.00019304633595638619, + "loss": 1.2113, + "step": 64490 + }, + { + "epoch": 0.1370387623168899, + "grad_norm": 0.5553067922592163, + "learning_rate": 0.0001930438403046654, + "loss": 1.1693, + "step": 64500 + }, + { + "epoch": 0.13706000863662898, + "grad_norm": 0.36400261521339417, + "learning_rate": 0.00019304134422132104, + "loss": 1.2075, + "step": 64510 + }, + { + "epoch": 0.13708125495636803, + "grad_norm": 0.3879847228527069, + "learning_rate": 0.00019303884770636458, + "loss": 1.2198, + "step": 64520 + }, + { + "epoch": 0.13710250127610707, + "grad_norm": 0.7053675055503845, + "learning_rate": 0.00019303635075980773, + "loss": 1.2151, + "step": 64530 + }, + { + "epoch": 0.13712374759584614, + "grad_norm": 0.435568630695343, + "learning_rate": 0.00019303385338166195, + "loss": 1.203, + "step": 64540 + }, + { + "epoch": 0.13714499391558518, + "grad_norm": 0.5236890316009521, + "learning_rate": 0.00019303135557193892, + "loss": 1.2234, + "step": 64550 + }, + { + "epoch": 0.13716624023532423, + "grad_norm": 0.4501417577266693, + "learning_rate": 0.00019302885733065017, + "loss": 1.2066, + "step": 64560 + }, + { + "epoch": 0.1371874865550633, + "grad_norm": 0.33698034286499023, + "learning_rate": 0.00019302635865780732, + "loss": 1.2193, + "step": 64570 + }, + { + "epoch": 0.13720873287480234, + "grad_norm": 0.341897189617157, + "learning_rate": 0.00019302385955342194, + "loss": 1.2053, + "step": 64580 + }, + { + "epoch": 0.13722997919454138, + "grad_norm": 0.33755871653556824, + "learning_rate": 0.00019302136001750567, + "loss": 1.188, + "step": 64590 + }, + { + "epoch": 0.13725122551428046, + "grad_norm": 0.3757224977016449, + "learning_rate": 0.00019301886005007002, + "loss": 1.2283, + "step": 64600 + }, + { + "epoch": 0.1372724718340195, + "grad_norm": 0.36324742436408997, + "learning_rate": 0.00019301635965112665, + "loss": 1.1714, + "step": 64610 + }, + { + "epoch": 0.13729371815375854, + "grad_norm": 0.3459022641181946, + "learning_rate": 0.00019301385882068716, + "loss": 1.2536, + "step": 64620 + }, + { + "epoch": 0.1373149644734976, + "grad_norm": 0.33945101499557495, + "learning_rate": 0.00019301135755876313, + "loss": 1.2032, + "step": 64630 + }, + { + "epoch": 0.13733621079323666, + "grad_norm": 0.32872211933135986, + "learning_rate": 0.00019300885586536617, + "loss": 1.2226, + "step": 64640 + }, + { + "epoch": 0.1373574571129757, + "grad_norm": 0.3306592106819153, + "learning_rate": 0.00019300635374050785, + "loss": 1.2038, + "step": 64650 + }, + { + "epoch": 0.13737870343271477, + "grad_norm": 0.3537338078022003, + "learning_rate": 0.00019300385118419986, + "loss": 1.2193, + "step": 64660 + }, + { + "epoch": 0.13739994975245381, + "grad_norm": 0.33030861616134644, + "learning_rate": 0.00019300134819645376, + "loss": 1.2258, + "step": 64670 + }, + { + "epoch": 0.13742119607219286, + "grad_norm": 0.442554771900177, + "learning_rate": 0.0001929988447772811, + "loss": 1.157, + "step": 64680 + }, + { + "epoch": 0.13744244239193193, + "grad_norm": 0.4413556158542633, + "learning_rate": 0.00019299634092669363, + "loss": 1.2123, + "step": 64690 + }, + { + "epoch": 0.13746368871167097, + "grad_norm": 0.3511594235897064, + "learning_rate": 0.00019299383664470284, + "loss": 1.2083, + "step": 64700 + }, + { + "epoch": 0.13748493503141002, + "grad_norm": 0.4068877696990967, + "learning_rate": 0.00019299133193132045, + "loss": 1.2167, + "step": 64710 + }, + { + "epoch": 0.1375061813511491, + "grad_norm": 0.33539068698883057, + "learning_rate": 0.00019298882678655798, + "loss": 1.2177, + "step": 64720 + }, + { + "epoch": 0.13752742767088813, + "grad_norm": 0.5292364358901978, + "learning_rate": 0.0001929863212104271, + "loss": 1.216, + "step": 64730 + }, + { + "epoch": 0.13754867399062717, + "grad_norm": 0.542367696762085, + "learning_rate": 0.00019298381520293945, + "loss": 1.1539, + "step": 64740 + }, + { + "epoch": 0.13756992031036624, + "grad_norm": 0.32363125681877136, + "learning_rate": 0.00019298130876410662, + "loss": 1.2063, + "step": 64750 + }, + { + "epoch": 0.1375911666301053, + "grad_norm": 0.3539910316467285, + "learning_rate": 0.00019297880189394028, + "loss": 1.2056, + "step": 64760 + }, + { + "epoch": 0.13761241294984433, + "grad_norm": 0.3190124034881592, + "learning_rate": 0.000192976294592452, + "loss": 1.2018, + "step": 64770 + }, + { + "epoch": 0.1376336592695834, + "grad_norm": 0.34340545535087585, + "learning_rate": 0.00019297378685965345, + "loss": 1.1944, + "step": 64780 + }, + { + "epoch": 0.13765490558932245, + "grad_norm": 0.46981510519981384, + "learning_rate": 0.00019297127869555628, + "loss": 1.2224, + "step": 64790 + }, + { + "epoch": 0.1376761519090615, + "grad_norm": 0.3783242702484131, + "learning_rate": 0.00019296877010017212, + "loss": 1.2143, + "step": 64800 + }, + { + "epoch": 0.13769739822880056, + "grad_norm": 0.346821129322052, + "learning_rate": 0.00019296626107351254, + "loss": 1.215, + "step": 64810 + }, + { + "epoch": 0.1377186445485396, + "grad_norm": 0.3328743577003479, + "learning_rate": 0.00019296375161558923, + "loss": 1.2229, + "step": 64820 + }, + { + "epoch": 0.13773989086827865, + "grad_norm": 0.37695106863975525, + "learning_rate": 0.00019296124172641385, + "loss": 1.1875, + "step": 64830 + }, + { + "epoch": 0.13776113718801772, + "grad_norm": 0.31302645802497864, + "learning_rate": 0.00019295873140599804, + "loss": 1.2109, + "step": 64840 + }, + { + "epoch": 0.13778238350775676, + "grad_norm": 0.3724821209907532, + "learning_rate": 0.00019295622065435342, + "loss": 1.1949, + "step": 64850 + }, + { + "epoch": 0.1378036298274958, + "grad_norm": 0.32957637310028076, + "learning_rate": 0.00019295370947149164, + "loss": 1.1852, + "step": 64860 + }, + { + "epoch": 0.13782487614723488, + "grad_norm": 0.3876308798789978, + "learning_rate": 0.00019295119785742434, + "loss": 1.2266, + "step": 64870 + }, + { + "epoch": 0.13784612246697392, + "grad_norm": 0.427807480096817, + "learning_rate": 0.00019294868581216319, + "loss": 1.1487, + "step": 64880 + }, + { + "epoch": 0.13786736878671296, + "grad_norm": 0.31726065278053284, + "learning_rate": 0.00019294617333571982, + "loss": 1.1782, + "step": 64890 + }, + { + "epoch": 0.13788861510645203, + "grad_norm": 0.3385744094848633, + "learning_rate": 0.00019294366042810595, + "loss": 1.1953, + "step": 64900 + }, + { + "epoch": 0.13790986142619108, + "grad_norm": 0.3182094693183899, + "learning_rate": 0.00019294114708933314, + "loss": 1.1837, + "step": 64910 + }, + { + "epoch": 0.13793110774593015, + "grad_norm": 0.3336630165576935, + "learning_rate": 0.00019293863331941316, + "loss": 1.1939, + "step": 64920 + }, + { + "epoch": 0.1379523540656692, + "grad_norm": 0.33548954129219055, + "learning_rate": 0.0001929361191183576, + "loss": 1.2025, + "step": 64930 + }, + { + "epoch": 0.13797360038540823, + "grad_norm": 0.4053560793399811, + "learning_rate": 0.00019293360448617808, + "loss": 1.2278, + "step": 64940 + }, + { + "epoch": 0.1379948467051473, + "grad_norm": 0.32944250106811523, + "learning_rate": 0.00019293108942288636, + "loss": 1.2016, + "step": 64950 + }, + { + "epoch": 0.13801609302488635, + "grad_norm": 0.4024416506290436, + "learning_rate": 0.00019292857392849406, + "loss": 1.2009, + "step": 64960 + }, + { + "epoch": 0.1380373393446254, + "grad_norm": 0.3208698034286499, + "learning_rate": 0.00019292605800301288, + "loss": 1.184, + "step": 64970 + }, + { + "epoch": 0.13805858566436446, + "grad_norm": 0.3791919946670532, + "learning_rate": 0.00019292354164645442, + "loss": 1.1949, + "step": 64980 + }, + { + "epoch": 0.1380798319841035, + "grad_norm": 0.35923638939857483, + "learning_rate": 0.00019292102485883047, + "loss": 1.1744, + "step": 64990 + }, + { + "epoch": 0.13810107830384255, + "grad_norm": 0.38732847571372986, + "learning_rate": 0.00019291850764015256, + "loss": 1.2208, + "step": 65000 + }, + { + "epoch": 0.13812232462358162, + "grad_norm": 0.4487999379634857, + "learning_rate": 0.00019291598999043248, + "loss": 1.2318, + "step": 65010 + }, + { + "epoch": 0.13814357094332066, + "grad_norm": 0.36316511034965515, + "learning_rate": 0.00019291347190968186, + "loss": 1.2029, + "step": 65020 + }, + { + "epoch": 0.1381648172630597, + "grad_norm": 0.3738231956958771, + "learning_rate": 0.0001929109533979124, + "loss": 1.1834, + "step": 65030 + }, + { + "epoch": 0.13818606358279878, + "grad_norm": 0.3864080011844635, + "learning_rate": 0.0001929084344551358, + "loss": 1.1977, + "step": 65040 + }, + { + "epoch": 0.13820730990253782, + "grad_norm": 0.3638046085834503, + "learning_rate": 0.0001929059150813637, + "loss": 1.1941, + "step": 65050 + }, + { + "epoch": 0.13822855622227687, + "grad_norm": 0.31789976358413696, + "learning_rate": 0.00019290339527660782, + "loss": 1.2397, + "step": 65060 + }, + { + "epoch": 0.13824980254201594, + "grad_norm": 0.3894723653793335, + "learning_rate": 0.00019290087504087984, + "loss": 1.2051, + "step": 65070 + }, + { + "epoch": 0.13827104886175498, + "grad_norm": 0.3337412476539612, + "learning_rate": 0.00019289835437419144, + "loss": 1.1998, + "step": 65080 + }, + { + "epoch": 0.13829229518149402, + "grad_norm": 0.41162386536598206, + "learning_rate": 0.00019289583327655434, + "loss": 1.2388, + "step": 65090 + }, + { + "epoch": 0.1383135415012331, + "grad_norm": 0.3504777252674103, + "learning_rate": 0.00019289331174798018, + "loss": 1.2089, + "step": 65100 + }, + { + "epoch": 0.13833478782097214, + "grad_norm": 0.3833545744419098, + "learning_rate": 0.00019289078978848071, + "loss": 1.2015, + "step": 65110 + }, + { + "epoch": 0.13835603414071118, + "grad_norm": 0.393144428730011, + "learning_rate": 0.00019288826739806764, + "loss": 1.1971, + "step": 65120 + }, + { + "epoch": 0.13837728046045025, + "grad_norm": 0.4040599763393402, + "learning_rate": 0.00019288574457675264, + "loss": 1.2094, + "step": 65130 + }, + { + "epoch": 0.1383985267801893, + "grad_norm": 0.3821299076080322, + "learning_rate": 0.0001928832213245474, + "loss": 1.166, + "step": 65140 + }, + { + "epoch": 0.13841977309992834, + "grad_norm": 0.3347020745277405, + "learning_rate": 0.00019288069764146365, + "loss": 1.1751, + "step": 65150 + }, + { + "epoch": 0.1384410194196674, + "grad_norm": 0.5540316700935364, + "learning_rate": 0.00019287817352751308, + "loss": 1.1995, + "step": 65160 + }, + { + "epoch": 0.13846226573940645, + "grad_norm": 0.3923502564430237, + "learning_rate": 0.00019287564898270743, + "loss": 1.2198, + "step": 65170 + }, + { + "epoch": 0.1384835120591455, + "grad_norm": 0.34937959909439087, + "learning_rate": 0.0001928731240070584, + "loss": 1.2027, + "step": 65180 + }, + { + "epoch": 0.13850475837888457, + "grad_norm": 0.36579179763793945, + "learning_rate": 0.00019287059860057765, + "loss": 1.1953, + "step": 65190 + }, + { + "epoch": 0.1385260046986236, + "grad_norm": 0.357659250497818, + "learning_rate": 0.00019286807276327697, + "loss": 1.2163, + "step": 65200 + }, + { + "epoch": 0.13854725101836265, + "grad_norm": 0.3865721523761749, + "learning_rate": 0.00019286554649516807, + "loss": 1.1652, + "step": 65210 + }, + { + "epoch": 0.13856849733810173, + "grad_norm": 0.8214536905288696, + "learning_rate": 0.0001928630197962626, + "loss": 1.1914, + "step": 65220 + }, + { + "epoch": 0.13858974365784077, + "grad_norm": 0.4104212522506714, + "learning_rate": 0.00019286049266657235, + "loss": 1.1995, + "step": 65230 + }, + { + "epoch": 0.1386109899775798, + "grad_norm": 0.42224669456481934, + "learning_rate": 0.00019285796510610902, + "loss": 1.1836, + "step": 65240 + }, + { + "epoch": 0.13863223629731888, + "grad_norm": 0.32464951276779175, + "learning_rate": 0.00019285543711488435, + "loss": 1.2088, + "step": 65250 + }, + { + "epoch": 0.13865348261705793, + "grad_norm": 0.34422433376312256, + "learning_rate": 0.00019285290869291, + "loss": 1.249, + "step": 65260 + }, + { + "epoch": 0.13867472893679697, + "grad_norm": 0.3105955123901367, + "learning_rate": 0.0001928503798401978, + "loss": 1.1709, + "step": 65270 + }, + { + "epoch": 0.13869597525653604, + "grad_norm": 0.36017686128616333, + "learning_rate": 0.00019284785055675944, + "loss": 1.2116, + "step": 65280 + }, + { + "epoch": 0.13871722157627508, + "grad_norm": 0.3757770359516144, + "learning_rate": 0.00019284532084260663, + "loss": 1.2452, + "step": 65290 + }, + { + "epoch": 0.13873846789601413, + "grad_norm": 0.34055033326148987, + "learning_rate": 0.00019284279069775112, + "loss": 1.1693, + "step": 65300 + }, + { + "epoch": 0.1387597142157532, + "grad_norm": 0.3396454155445099, + "learning_rate": 0.00019284026012220465, + "loss": 1.2798, + "step": 65310 + }, + { + "epoch": 0.13878096053549224, + "grad_norm": 0.3242254853248596, + "learning_rate": 0.00019283772911597897, + "loss": 1.1621, + "step": 65320 + }, + { + "epoch": 0.13880220685523129, + "grad_norm": 0.32467252016067505, + "learning_rate": 0.0001928351976790858, + "loss": 1.2228, + "step": 65330 + }, + { + "epoch": 0.13882345317497036, + "grad_norm": 0.37492164969444275, + "learning_rate": 0.00019283266581153689, + "loss": 1.1868, + "step": 65340 + }, + { + "epoch": 0.1388446994947094, + "grad_norm": 0.3280298113822937, + "learning_rate": 0.000192830133513344, + "loss": 1.1788, + "step": 65350 + }, + { + "epoch": 0.13886594581444844, + "grad_norm": 0.37604671716690063, + "learning_rate": 0.00019282760078451886, + "loss": 1.1927, + "step": 65360 + }, + { + "epoch": 0.13888719213418751, + "grad_norm": 0.3750140368938446, + "learning_rate": 0.0001928250676250732, + "loss": 1.2168, + "step": 65370 + }, + { + "epoch": 0.13890843845392656, + "grad_norm": 0.3255991041660309, + "learning_rate": 0.0001928225340350188, + "loss": 1.2169, + "step": 65380 + }, + { + "epoch": 0.1389296847736656, + "grad_norm": 0.3454228937625885, + "learning_rate": 0.00019282000001436743, + "loss": 1.2028, + "step": 65390 + }, + { + "epoch": 0.13895093109340467, + "grad_norm": 0.3169562518596649, + "learning_rate": 0.0001928174655631308, + "loss": 1.1807, + "step": 65400 + }, + { + "epoch": 0.13897217741314372, + "grad_norm": 0.33361899852752686, + "learning_rate": 0.0001928149306813207, + "loss": 1.2323, + "step": 65410 + }, + { + "epoch": 0.13899342373288276, + "grad_norm": 0.3389819264411926, + "learning_rate": 0.0001928123953689489, + "loss": 1.2359, + "step": 65420 + }, + { + "epoch": 0.13901467005262183, + "grad_norm": 0.4275556206703186, + "learning_rate": 0.0001928098596260271, + "loss": 1.2491, + "step": 65430 + }, + { + "epoch": 0.13903591637236087, + "grad_norm": 0.39210939407348633, + "learning_rate": 0.00019280732345256714, + "loss": 1.1752, + "step": 65440 + }, + { + "epoch": 0.13905716269209992, + "grad_norm": 0.36504656076431274, + "learning_rate": 0.0001928047868485807, + "loss": 1.2342, + "step": 65450 + }, + { + "epoch": 0.139078409011839, + "grad_norm": 0.45631396770477295, + "learning_rate": 0.00019280224981407966, + "loss": 1.1886, + "step": 65460 + }, + { + "epoch": 0.13909965533157803, + "grad_norm": 0.352655827999115, + "learning_rate": 0.00019279971234907568, + "loss": 1.1753, + "step": 65470 + }, + { + "epoch": 0.13912090165131707, + "grad_norm": 0.34131601452827454, + "learning_rate": 0.0001927971744535806, + "loss": 1.1897, + "step": 65480 + }, + { + "epoch": 0.13914214797105615, + "grad_norm": 0.36272379755973816, + "learning_rate": 0.00019279463612760614, + "loss": 1.1817, + "step": 65490 + }, + { + "epoch": 0.1391633942907952, + "grad_norm": 0.3405938744544983, + "learning_rate": 0.00019279209737116414, + "loss": 1.2053, + "step": 65500 + }, + { + "epoch": 0.13918464061053423, + "grad_norm": 0.37171417474746704, + "learning_rate": 0.00019278955818426635, + "loss": 1.2072, + "step": 65510 + }, + { + "epoch": 0.1392058869302733, + "grad_norm": 0.31477418541908264, + "learning_rate": 0.00019278701856692448, + "loss": 1.2006, + "step": 65520 + }, + { + "epoch": 0.13922713325001235, + "grad_norm": 0.34930431842803955, + "learning_rate": 0.00019278447851915043, + "loss": 1.2041, + "step": 65530 + }, + { + "epoch": 0.1392483795697514, + "grad_norm": 0.3527923822402954, + "learning_rate": 0.0001927819380409559, + "loss": 1.2087, + "step": 65540 + }, + { + "epoch": 0.13926962588949046, + "grad_norm": 0.3704983592033386, + "learning_rate": 0.00019277939713235268, + "loss": 1.2175, + "step": 65550 + }, + { + "epoch": 0.1392908722092295, + "grad_norm": 0.3322116434574127, + "learning_rate": 0.00019277685579335263, + "loss": 1.1841, + "step": 65560 + }, + { + "epoch": 0.13931211852896855, + "grad_norm": 0.3318660259246826, + "learning_rate": 0.00019277431402396747, + "loss": 1.2266, + "step": 65570 + }, + { + "epoch": 0.13933336484870762, + "grad_norm": 0.36267441511154175, + "learning_rate": 0.00019277177182420898, + "loss": 1.2251, + "step": 65580 + }, + { + "epoch": 0.13935461116844666, + "grad_norm": 0.3302595019340515, + "learning_rate": 0.00019276922919408897, + "loss": 1.2002, + "step": 65590 + }, + { + "epoch": 0.1393758574881857, + "grad_norm": 0.3870958983898163, + "learning_rate": 0.00019276668613361927, + "loss": 1.1956, + "step": 65600 + }, + { + "epoch": 0.13939710380792478, + "grad_norm": 0.4548105001449585, + "learning_rate": 0.00019276414264281165, + "loss": 1.2297, + "step": 65610 + }, + { + "epoch": 0.13941835012766382, + "grad_norm": 0.33682143688201904, + "learning_rate": 0.00019276159872167793, + "loss": 1.186, + "step": 65620 + }, + { + "epoch": 0.13943959644740286, + "grad_norm": 0.31228822469711304, + "learning_rate": 0.00019275905437022986, + "loss": 1.1782, + "step": 65630 + }, + { + "epoch": 0.13946084276714193, + "grad_norm": 0.39600539207458496, + "learning_rate": 0.0001927565095884793, + "loss": 1.2066, + "step": 65640 + }, + { + "epoch": 0.13948208908688098, + "grad_norm": 0.33865609765052795, + "learning_rate": 0.000192753964376438, + "loss": 1.1712, + "step": 65650 + }, + { + "epoch": 0.13950333540662002, + "grad_norm": 0.3193287253379822, + "learning_rate": 0.00019275141873411784, + "loss": 1.2152, + "step": 65660 + }, + { + "epoch": 0.1395245817263591, + "grad_norm": 0.6372979283332825, + "learning_rate": 0.00019274887266153057, + "loss": 1.2421, + "step": 65670 + }, + { + "epoch": 0.13954582804609814, + "grad_norm": 0.41160932183265686, + "learning_rate": 0.000192746326158688, + "loss": 1.1866, + "step": 65680 + }, + { + "epoch": 0.13956707436583718, + "grad_norm": 0.4424721300601959, + "learning_rate": 0.00019274377922560198, + "loss": 1.215, + "step": 65690 + }, + { + "epoch": 0.13958832068557625, + "grad_norm": 0.32858043909072876, + "learning_rate": 0.00019274123186228427, + "loss": 1.1824, + "step": 65700 + }, + { + "epoch": 0.1396095670053153, + "grad_norm": 0.3469431698322296, + "learning_rate": 0.00019273868406874677, + "loss": 1.1984, + "step": 65710 + }, + { + "epoch": 0.13963081332505434, + "grad_norm": 0.40177667140960693, + "learning_rate": 0.00019273613584500123, + "loss": 1.1788, + "step": 65720 + }, + { + "epoch": 0.1396520596447934, + "grad_norm": 0.43917518854141235, + "learning_rate": 0.00019273358719105953, + "loss": 1.2154, + "step": 65730 + }, + { + "epoch": 0.13967330596453245, + "grad_norm": 0.498374879360199, + "learning_rate": 0.00019273103810693338, + "loss": 1.1766, + "step": 65740 + }, + { + "epoch": 0.13969455228427152, + "grad_norm": 0.34869083762168884, + "learning_rate": 0.00019272848859263472, + "loss": 1.2484, + "step": 65750 + }, + { + "epoch": 0.13971579860401057, + "grad_norm": 0.37197503447532654, + "learning_rate": 0.00019272593864817536, + "loss": 1.1983, + "step": 65760 + }, + { + "epoch": 0.1397370449237496, + "grad_norm": 0.36904191970825195, + "learning_rate": 0.00019272338827356708, + "loss": 1.1741, + "step": 65770 + }, + { + "epoch": 0.13975829124348868, + "grad_norm": 0.34560808539390564, + "learning_rate": 0.00019272083746882176, + "loss": 1.2302, + "step": 65780 + }, + { + "epoch": 0.13977953756322772, + "grad_norm": 0.41022351384162903, + "learning_rate": 0.00019271828623395118, + "loss": 1.1904, + "step": 65790 + }, + { + "epoch": 0.13980078388296677, + "grad_norm": 0.3278406858444214, + "learning_rate": 0.00019271573456896725, + "loss": 1.1774, + "step": 65800 + }, + { + "epoch": 0.13982203020270584, + "grad_norm": 0.35743194818496704, + "learning_rate": 0.0001927131824738817, + "loss": 1.226, + "step": 65810 + }, + { + "epoch": 0.13984327652244488, + "grad_norm": 0.612693190574646, + "learning_rate": 0.00019271062994870647, + "loss": 1.1976, + "step": 65820 + }, + { + "epoch": 0.13986452284218392, + "grad_norm": 0.3998452126979828, + "learning_rate": 0.00019270807699345337, + "loss": 1.1823, + "step": 65830 + }, + { + "epoch": 0.139885769161923, + "grad_norm": 0.44860655069351196, + "learning_rate": 0.00019270552360813422, + "loss": 1.2321, + "step": 65840 + }, + { + "epoch": 0.13990701548166204, + "grad_norm": 0.5352202653884888, + "learning_rate": 0.00019270296979276088, + "loss": 1.181, + "step": 65850 + }, + { + "epoch": 0.13992826180140108, + "grad_norm": 0.5124374032020569, + "learning_rate": 0.00019270041554734523, + "loss": 1.2064, + "step": 65860 + }, + { + "epoch": 0.13994950812114015, + "grad_norm": 0.37401720881462097, + "learning_rate": 0.00019269786087189905, + "loss": 1.1934, + "step": 65870 + }, + { + "epoch": 0.1399707544408792, + "grad_norm": 0.3724496364593506, + "learning_rate": 0.00019269530576643425, + "loss": 1.1679, + "step": 65880 + }, + { + "epoch": 0.13999200076061824, + "grad_norm": 0.39528098702430725, + "learning_rate": 0.00019269275023096264, + "loss": 1.1663, + "step": 65890 + }, + { + "epoch": 0.1400132470803573, + "grad_norm": 0.3784645199775696, + "learning_rate": 0.0001926901942654961, + "loss": 1.2387, + "step": 65900 + }, + { + "epoch": 0.14003449340009635, + "grad_norm": 0.43514546751976013, + "learning_rate": 0.00019268763787004648, + "loss": 1.2048, + "step": 65910 + }, + { + "epoch": 0.1400557397198354, + "grad_norm": 0.32291775941848755, + "learning_rate": 0.00019268508104462566, + "loss": 1.257, + "step": 65920 + }, + { + "epoch": 0.14007698603957447, + "grad_norm": 0.3782608211040497, + "learning_rate": 0.00019268252378924547, + "loss": 1.2192, + "step": 65930 + }, + { + "epoch": 0.1400982323593135, + "grad_norm": 0.38053980469703674, + "learning_rate": 0.00019267996610391778, + "loss": 1.1901, + "step": 65940 + }, + { + "epoch": 0.14011947867905256, + "grad_norm": 0.3389507234096527, + "learning_rate": 0.00019267740798865448, + "loss": 1.2093, + "step": 65950 + }, + { + "epoch": 0.14014072499879163, + "grad_norm": 0.3751298487186432, + "learning_rate": 0.0001926748494434674, + "loss": 1.2396, + "step": 65960 + }, + { + "epoch": 0.14016197131853067, + "grad_norm": 0.4080202579498291, + "learning_rate": 0.00019267229046836842, + "loss": 1.1713, + "step": 65970 + }, + { + "epoch": 0.1401832176382697, + "grad_norm": 0.4090007245540619, + "learning_rate": 0.00019266973106336942, + "loss": 1.1911, + "step": 65980 + }, + { + "epoch": 0.14020446395800878, + "grad_norm": 0.4714829921722412, + "learning_rate": 0.0001926671712284823, + "loss": 1.2137, + "step": 65990 + }, + { + "epoch": 0.14022571027774783, + "grad_norm": 0.43501320481300354, + "learning_rate": 0.00019266461096371885, + "loss": 1.2273, + "step": 66000 + }, + { + "epoch": 0.14024695659748687, + "grad_norm": 0.39744845032691956, + "learning_rate": 0.00019266205026909105, + "loss": 1.1954, + "step": 66010 + }, + { + "epoch": 0.14026820291722594, + "grad_norm": 0.5677372813224792, + "learning_rate": 0.00019265948914461072, + "loss": 1.1789, + "step": 66020 + }, + { + "epoch": 0.14028944923696499, + "grad_norm": 0.32980161905288696, + "learning_rate": 0.00019265692759028974, + "loss": 1.2036, + "step": 66030 + }, + { + "epoch": 0.14031069555670403, + "grad_norm": 0.37356695532798767, + "learning_rate": 0.00019265436560614, + "loss": 1.2255, + "step": 66040 + }, + { + "epoch": 0.1403319418764431, + "grad_norm": 0.3461434841156006, + "learning_rate": 0.0001926518031921734, + "loss": 1.1936, + "step": 66050 + }, + { + "epoch": 0.14035318819618214, + "grad_norm": 0.37314197421073914, + "learning_rate": 0.00019264924034840183, + "loss": 1.2016, + "step": 66060 + }, + { + "epoch": 0.1403744345159212, + "grad_norm": 0.4205023944377899, + "learning_rate": 0.00019264667707483715, + "loss": 1.2107, + "step": 66070 + }, + { + "epoch": 0.14039568083566026, + "grad_norm": 0.32981228828430176, + "learning_rate": 0.00019264411337149127, + "loss": 1.1947, + "step": 66080 + }, + { + "epoch": 0.1404169271553993, + "grad_norm": 0.3149762451648712, + "learning_rate": 0.00019264154923837604, + "loss": 1.2351, + "step": 66090 + }, + { + "epoch": 0.14043817347513834, + "grad_norm": 0.35376426577568054, + "learning_rate": 0.00019263898467550346, + "loss": 1.2101, + "step": 66100 + }, + { + "epoch": 0.14045941979487742, + "grad_norm": 0.37753599882125854, + "learning_rate": 0.00019263641968288533, + "loss": 1.2097, + "step": 66110 + }, + { + "epoch": 0.14048066611461646, + "grad_norm": 0.413405179977417, + "learning_rate": 0.00019263385426053356, + "loss": 1.1945, + "step": 66120 + }, + { + "epoch": 0.1405019124343555, + "grad_norm": 0.4960145652294159, + "learning_rate": 0.0001926312884084601, + "loss": 1.2135, + "step": 66130 + }, + { + "epoch": 0.14052315875409457, + "grad_norm": 0.348333477973938, + "learning_rate": 0.0001926287221266768, + "loss": 1.2592, + "step": 66140 + }, + { + "epoch": 0.14054440507383362, + "grad_norm": 0.3465229570865631, + "learning_rate": 0.00019262615541519557, + "loss": 1.2262, + "step": 66150 + }, + { + "epoch": 0.14056565139357266, + "grad_norm": 0.4351942241191864, + "learning_rate": 0.00019262358827402837, + "loss": 1.1912, + "step": 66160 + }, + { + "epoch": 0.14058689771331173, + "grad_norm": 0.3205869793891907, + "learning_rate": 0.00019262102070318707, + "loss": 1.2126, + "step": 66170 + }, + { + "epoch": 0.14060814403305077, + "grad_norm": 0.3465080261230469, + "learning_rate": 0.00019261845270268356, + "loss": 1.2198, + "step": 66180 + }, + { + "epoch": 0.14062939035278982, + "grad_norm": 0.6415172219276428, + "learning_rate": 0.0001926158842725298, + "loss": 1.2268, + "step": 66190 + }, + { + "epoch": 0.1406506366725289, + "grad_norm": 0.4938334822654724, + "learning_rate": 0.00019261331541273768, + "loss": 1.1977, + "step": 66200 + }, + { + "epoch": 0.14067188299226793, + "grad_norm": 0.47465553879737854, + "learning_rate": 0.0001926107461233191, + "loss": 1.2015, + "step": 66210 + }, + { + "epoch": 0.14069312931200698, + "grad_norm": 0.351751446723938, + "learning_rate": 0.00019260817640428602, + "loss": 1.2075, + "step": 66220 + }, + { + "epoch": 0.14071437563174605, + "grad_norm": 0.38788333535194397, + "learning_rate": 0.00019260560625565033, + "loss": 1.1719, + "step": 66230 + }, + { + "epoch": 0.1407356219514851, + "grad_norm": 0.4109528958797455, + "learning_rate": 0.00019260303567742395, + "loss": 1.2202, + "step": 66240 + }, + { + "epoch": 0.14075686827122413, + "grad_norm": 0.3579079806804657, + "learning_rate": 0.0001926004646696188, + "loss": 1.192, + "step": 66250 + }, + { + "epoch": 0.1407781145909632, + "grad_norm": 0.3647270202636719, + "learning_rate": 0.00019259789323224687, + "loss": 1.193, + "step": 66260 + }, + { + "epoch": 0.14079936091070225, + "grad_norm": 0.4899335503578186, + "learning_rate": 0.00019259532136531998, + "loss": 1.216, + "step": 66270 + }, + { + "epoch": 0.1408206072304413, + "grad_norm": 0.34961605072021484, + "learning_rate": 0.00019259274906885016, + "loss": 1.1927, + "step": 66280 + }, + { + "epoch": 0.14084185355018036, + "grad_norm": 0.3594662547111511, + "learning_rate": 0.0001925901763428493, + "loss": 1.1945, + "step": 66290 + }, + { + "epoch": 0.1408630998699194, + "grad_norm": 0.40192317962646484, + "learning_rate": 0.00019258760318732935, + "loss": 1.2077, + "step": 66300 + }, + { + "epoch": 0.14088434618965845, + "grad_norm": 0.36067482829093933, + "learning_rate": 0.00019258502960230222, + "loss": 1.1867, + "step": 66310 + }, + { + "epoch": 0.14090559250939752, + "grad_norm": 0.3828514516353607, + "learning_rate": 0.00019258245558777986, + "loss": 1.2363, + "step": 66320 + }, + { + "epoch": 0.14092683882913656, + "grad_norm": 0.3737475872039795, + "learning_rate": 0.00019257988114377422, + "loss": 1.235, + "step": 66330 + }, + { + "epoch": 0.1409480851488756, + "grad_norm": 0.32303741574287415, + "learning_rate": 0.00019257730627029722, + "loss": 1.2137, + "step": 66340 + }, + { + "epoch": 0.14096933146861468, + "grad_norm": 0.3364577293395996, + "learning_rate": 0.00019257473096736085, + "loss": 1.2335, + "step": 66350 + }, + { + "epoch": 0.14099057778835372, + "grad_norm": 0.3607352375984192, + "learning_rate": 0.000192572155234977, + "loss": 1.2259, + "step": 66360 + }, + { + "epoch": 0.14101182410809276, + "grad_norm": 0.3003743588924408, + "learning_rate": 0.0001925695790731577, + "loss": 1.2182, + "step": 66370 + }, + { + "epoch": 0.14103307042783184, + "grad_norm": 0.3439536988735199, + "learning_rate": 0.00019256700248191478, + "loss": 1.1886, + "step": 66380 + }, + { + "epoch": 0.14105431674757088, + "grad_norm": 0.3409028947353363, + "learning_rate": 0.0001925644254612603, + "loss": 1.1932, + "step": 66390 + }, + { + "epoch": 0.14107556306730992, + "grad_norm": 0.32744723558425903, + "learning_rate": 0.00019256184801120615, + "loss": 1.2104, + "step": 66400 + }, + { + "epoch": 0.141096809387049, + "grad_norm": 0.33130428194999695, + "learning_rate": 0.00019255927013176434, + "loss": 1.2368, + "step": 66410 + }, + { + "epoch": 0.14111805570678804, + "grad_norm": 0.3581821024417877, + "learning_rate": 0.0001925566918229468, + "loss": 1.2113, + "step": 66420 + }, + { + "epoch": 0.14113930202652708, + "grad_norm": 0.34705594182014465, + "learning_rate": 0.00019255411308476546, + "loss": 1.167, + "step": 66430 + }, + { + "epoch": 0.14116054834626615, + "grad_norm": 0.394647479057312, + "learning_rate": 0.00019255153391723232, + "loss": 1.2275, + "step": 66440 + }, + { + "epoch": 0.1411817946660052, + "grad_norm": 0.345625638961792, + "learning_rate": 0.00019254895432035935, + "loss": 1.196, + "step": 66450 + }, + { + "epoch": 0.14120304098574424, + "grad_norm": 0.3355083167552948, + "learning_rate": 0.00019254637429415853, + "loss": 1.2147, + "step": 66460 + }, + { + "epoch": 0.1412242873054833, + "grad_norm": 0.3303545415401459, + "learning_rate": 0.00019254379383864176, + "loss": 1.2161, + "step": 66470 + }, + { + "epoch": 0.14124553362522235, + "grad_norm": 0.3994872570037842, + "learning_rate": 0.00019254121295382107, + "loss": 1.1876, + "step": 66480 + }, + { + "epoch": 0.1412667799449614, + "grad_norm": 0.4182440936565399, + "learning_rate": 0.0001925386316397084, + "loss": 1.2157, + "step": 66490 + }, + { + "epoch": 0.14128802626470047, + "grad_norm": 0.365170955657959, + "learning_rate": 0.00019253604989631577, + "loss": 1.2194, + "step": 66500 + }, + { + "epoch": 0.1413092725844395, + "grad_norm": 0.42147597670555115, + "learning_rate": 0.00019253346772365513, + "loss": 1.2238, + "step": 66510 + }, + { + "epoch": 0.14133051890417855, + "grad_norm": 0.3571195900440216, + "learning_rate": 0.00019253088512173844, + "loss": 1.2152, + "step": 66520 + }, + { + "epoch": 0.14135176522391762, + "grad_norm": 0.3798890709877014, + "learning_rate": 0.00019252830209057767, + "loss": 1.1547, + "step": 66530 + }, + { + "epoch": 0.14137301154365667, + "grad_norm": 0.374010294675827, + "learning_rate": 0.00019252571863018485, + "loss": 1.2147, + "step": 66540 + }, + { + "epoch": 0.1413942578633957, + "grad_norm": 0.5278629064559937, + "learning_rate": 0.00019252313474057194, + "loss": 1.2568, + "step": 66550 + }, + { + "epoch": 0.14141550418313478, + "grad_norm": 0.45162680745124817, + "learning_rate": 0.00019252055042175094, + "loss": 1.2107, + "step": 66560 + }, + { + "epoch": 0.14143675050287383, + "grad_norm": 0.3504185676574707, + "learning_rate": 0.00019251796567373383, + "loss": 1.1893, + "step": 66570 + }, + { + "epoch": 0.14145799682261287, + "grad_norm": 0.3178842067718506, + "learning_rate": 0.0001925153804965326, + "loss": 1.2268, + "step": 66580 + }, + { + "epoch": 0.14147924314235194, + "grad_norm": 0.3947225511074066, + "learning_rate": 0.00019251279489015924, + "loss": 1.2068, + "step": 66590 + }, + { + "epoch": 0.14150048946209098, + "grad_norm": 0.3643922209739685, + "learning_rate": 0.00019251020885462574, + "loss": 1.1796, + "step": 66600 + }, + { + "epoch": 0.14152173578183005, + "grad_norm": 0.5624138116836548, + "learning_rate": 0.0001925076223899441, + "loss": 1.169, + "step": 66610 + }, + { + "epoch": 0.1415429821015691, + "grad_norm": 0.344124972820282, + "learning_rate": 0.00019250503549612632, + "loss": 1.1977, + "step": 66620 + }, + { + "epoch": 0.14156422842130814, + "grad_norm": 0.35605302453041077, + "learning_rate": 0.0001925024481731844, + "loss": 1.2266, + "step": 66630 + }, + { + "epoch": 0.1415854747410472, + "grad_norm": 0.371127188205719, + "learning_rate": 0.00019249986042113034, + "loss": 1.2239, + "step": 66640 + }, + { + "epoch": 0.14160672106078626, + "grad_norm": 0.3982927203178406, + "learning_rate": 0.00019249727223997614, + "loss": 1.1797, + "step": 66650 + }, + { + "epoch": 0.1416279673805253, + "grad_norm": 0.3339950144290924, + "learning_rate": 0.00019249468362973385, + "loss": 1.2172, + "step": 66660 + }, + { + "epoch": 0.14164921370026437, + "grad_norm": 0.349131315946579, + "learning_rate": 0.0001924920945904154, + "loss": 1.2183, + "step": 66670 + }, + { + "epoch": 0.1416704600200034, + "grad_norm": 0.34099337458610535, + "learning_rate": 0.00019248950512203286, + "loss": 1.1826, + "step": 66680 + }, + { + "epoch": 0.14169170633974246, + "grad_norm": 0.3610743582248688, + "learning_rate": 0.00019248691522459824, + "loss": 1.1908, + "step": 66690 + }, + { + "epoch": 0.14171295265948153, + "grad_norm": 0.3183841407299042, + "learning_rate": 0.0001924843248981235, + "loss": 1.2056, + "step": 66700 + }, + { + "epoch": 0.14173419897922057, + "grad_norm": 0.35441988706588745, + "learning_rate": 0.0001924817341426207, + "loss": 1.2423, + "step": 66710 + }, + { + "epoch": 0.14175544529895961, + "grad_norm": 0.33189335465431213, + "learning_rate": 0.00019247914295810188, + "loss": 1.1827, + "step": 66720 + }, + { + "epoch": 0.14177669161869869, + "grad_norm": 0.35105305910110474, + "learning_rate": 0.00019247655134457903, + "loss": 1.1589, + "step": 66730 + }, + { + "epoch": 0.14179793793843773, + "grad_norm": 0.5904812216758728, + "learning_rate": 0.0001924739593020642, + "loss": 1.2157, + "step": 66740 + }, + { + "epoch": 0.14181918425817677, + "grad_norm": 0.47213658690452576, + "learning_rate": 0.00019247136683056933, + "loss": 1.1992, + "step": 66750 + }, + { + "epoch": 0.14184043057791584, + "grad_norm": 0.36569565534591675, + "learning_rate": 0.00019246877393010655, + "loss": 1.1894, + "step": 66760 + }, + { + "epoch": 0.1418616768976549, + "grad_norm": 0.36075690388679504, + "learning_rate": 0.00019246618060068786, + "loss": 1.2242, + "step": 66770 + }, + { + "epoch": 0.14188292321739393, + "grad_norm": 0.6903988718986511, + "learning_rate": 0.00019246358684232523, + "loss": 1.1992, + "step": 66780 + }, + { + "epoch": 0.141904169537133, + "grad_norm": 0.5756995677947998, + "learning_rate": 0.00019246099265503078, + "loss": 1.2455, + "step": 66790 + }, + { + "epoch": 0.14192541585687204, + "grad_norm": 0.47157010436058044, + "learning_rate": 0.00019245839803881646, + "loss": 1.2221, + "step": 66800 + }, + { + "epoch": 0.1419466621766111, + "grad_norm": 0.6757717132568359, + "learning_rate": 0.0001924558029936944, + "loss": 1.2222, + "step": 66810 + }, + { + "epoch": 0.14196790849635016, + "grad_norm": 0.39962026476860046, + "learning_rate": 0.00019245320751967655, + "loss": 1.1946, + "step": 66820 + }, + { + "epoch": 0.1419891548160892, + "grad_norm": 0.3325173854827881, + "learning_rate": 0.00019245061161677498, + "loss": 1.2338, + "step": 66830 + }, + { + "epoch": 0.14201040113582825, + "grad_norm": 0.4026639759540558, + "learning_rate": 0.00019244801528500176, + "loss": 1.2049, + "step": 66840 + }, + { + "epoch": 0.14203164745556732, + "grad_norm": 0.4116179943084717, + "learning_rate": 0.0001924454185243689, + "loss": 1.1724, + "step": 66850 + }, + { + "epoch": 0.14205289377530636, + "grad_norm": 0.5349782705307007, + "learning_rate": 0.00019244282133488847, + "loss": 1.2043, + "step": 66860 + }, + { + "epoch": 0.1420741400950454, + "grad_norm": 0.40292230248451233, + "learning_rate": 0.0001924402237165725, + "loss": 1.2184, + "step": 66870 + }, + { + "epoch": 0.14209538641478447, + "grad_norm": 0.3185567557811737, + "learning_rate": 0.00019243762566943304, + "loss": 1.1961, + "step": 66880 + }, + { + "epoch": 0.14211663273452352, + "grad_norm": 0.397219717502594, + "learning_rate": 0.0001924350271934822, + "loss": 1.2054, + "step": 66890 + }, + { + "epoch": 0.14213787905426256, + "grad_norm": 0.37078335881233215, + "learning_rate": 0.00019243242828873193, + "loss": 1.2081, + "step": 66900 + }, + { + "epoch": 0.14215912537400163, + "grad_norm": 0.3229777216911316, + "learning_rate": 0.00019242982895519435, + "loss": 1.2115, + "step": 66910 + }, + { + "epoch": 0.14218037169374068, + "grad_norm": 0.3928599953651428, + "learning_rate": 0.00019242722919288153, + "loss": 1.1826, + "step": 66920 + }, + { + "epoch": 0.14220161801347972, + "grad_norm": 0.3364781141281128, + "learning_rate": 0.00019242462900180546, + "loss": 1.2267, + "step": 66930 + }, + { + "epoch": 0.1422228643332188, + "grad_norm": 0.3170740604400635, + "learning_rate": 0.00019242202838197828, + "loss": 1.2046, + "step": 66940 + }, + { + "epoch": 0.14224411065295783, + "grad_norm": 0.49633702635765076, + "learning_rate": 0.00019241942733341203, + "loss": 1.2248, + "step": 66950 + }, + { + "epoch": 0.14226535697269688, + "grad_norm": 0.3724694848060608, + "learning_rate": 0.00019241682585611878, + "loss": 1.1873, + "step": 66960 + }, + { + "epoch": 0.14228660329243595, + "grad_norm": 0.505253255367279, + "learning_rate": 0.00019241422395011056, + "loss": 1.2187, + "step": 66970 + }, + { + "epoch": 0.142307849612175, + "grad_norm": 0.4070241451263428, + "learning_rate": 0.0001924116216153995, + "loss": 1.2113, + "step": 66980 + }, + { + "epoch": 0.14232909593191403, + "grad_norm": 0.3566206693649292, + "learning_rate": 0.00019240901885199762, + "loss": 1.2304, + "step": 66990 + }, + { + "epoch": 0.1423503422516531, + "grad_norm": 0.39033985137939453, + "learning_rate": 0.00019240641565991702, + "loss": 1.1976, + "step": 67000 + }, + { + "epoch": 0.14237158857139215, + "grad_norm": 0.3934558629989624, + "learning_rate": 0.0001924038120391698, + "loss": 1.1633, + "step": 67010 + }, + { + "epoch": 0.1423928348911312, + "grad_norm": 0.41479331254959106, + "learning_rate": 0.00019240120798976796, + "loss": 1.2276, + "step": 67020 + }, + { + "epoch": 0.14241408121087026, + "grad_norm": 0.3368047773838043, + "learning_rate": 0.00019239860351172365, + "loss": 1.2104, + "step": 67030 + }, + { + "epoch": 0.1424353275306093, + "grad_norm": 0.45681440830230713, + "learning_rate": 0.00019239599860504893, + "loss": 1.2076, + "step": 67040 + }, + { + "epoch": 0.14245657385034835, + "grad_norm": 0.33773136138916016, + "learning_rate": 0.00019239339326975588, + "loss": 1.205, + "step": 67050 + }, + { + "epoch": 0.14247782017008742, + "grad_norm": 0.31413644552230835, + "learning_rate": 0.0001923907875058566, + "loss": 1.1895, + "step": 67060 + }, + { + "epoch": 0.14249906648982646, + "grad_norm": 0.41882291436195374, + "learning_rate": 0.00019238818131336318, + "loss": 1.1993, + "step": 67070 + }, + { + "epoch": 0.1425203128095655, + "grad_norm": 0.432901531457901, + "learning_rate": 0.00019238557469228765, + "loss": 1.2317, + "step": 67080 + }, + { + "epoch": 0.14254155912930458, + "grad_norm": 0.4377076029777527, + "learning_rate": 0.0001923829676426422, + "loss": 1.2158, + "step": 67090 + }, + { + "epoch": 0.14256280544904362, + "grad_norm": 0.41825613379478455, + "learning_rate": 0.00019238036016443885, + "loss": 1.2316, + "step": 67100 + }, + { + "epoch": 0.14258405176878267, + "grad_norm": 0.343149334192276, + "learning_rate": 0.00019237775225768974, + "loss": 1.21, + "step": 67110 + }, + { + "epoch": 0.14260529808852174, + "grad_norm": 0.4206697344779968, + "learning_rate": 0.00019237514392240695, + "loss": 1.2238, + "step": 67120 + }, + { + "epoch": 0.14262654440826078, + "grad_norm": 0.6214691996574402, + "learning_rate": 0.00019237253515860255, + "loss": 1.206, + "step": 67130 + }, + { + "epoch": 0.14264779072799982, + "grad_norm": 0.3577651083469391, + "learning_rate": 0.00019236992596628869, + "loss": 1.2082, + "step": 67140 + }, + { + "epoch": 0.1426690370477389, + "grad_norm": 0.37119317054748535, + "learning_rate": 0.00019236731634547744, + "loss": 1.215, + "step": 67150 + }, + { + "epoch": 0.14269028336747794, + "grad_norm": 0.365482896566391, + "learning_rate": 0.00019236470629618095, + "loss": 1.2072, + "step": 67160 + }, + { + "epoch": 0.14271152968721698, + "grad_norm": 0.6385962963104248, + "learning_rate": 0.00019236209581841127, + "loss": 1.2163, + "step": 67170 + }, + { + "epoch": 0.14273277600695605, + "grad_norm": 0.3574247360229492, + "learning_rate": 0.00019235948491218055, + "loss": 1.216, + "step": 67180 + }, + { + "epoch": 0.1427540223266951, + "grad_norm": 0.3437846899032593, + "learning_rate": 0.00019235687357750086, + "loss": 1.2112, + "step": 67190 + }, + { + "epoch": 0.14277526864643414, + "grad_norm": 0.33989417552948, + "learning_rate": 0.00019235426181438435, + "loss": 1.2228, + "step": 67200 + }, + { + "epoch": 0.1427965149661732, + "grad_norm": 0.35253408551216125, + "learning_rate": 0.00019235164962284315, + "loss": 1.2237, + "step": 67210 + }, + { + "epoch": 0.14281776128591225, + "grad_norm": 0.36474865674972534, + "learning_rate": 0.00019234903700288938, + "loss": 1.2035, + "step": 67220 + }, + { + "epoch": 0.1428390076056513, + "grad_norm": 0.36945632100105286, + "learning_rate": 0.0001923464239545351, + "loss": 1.1984, + "step": 67230 + }, + { + "epoch": 0.14286025392539037, + "grad_norm": 0.4502631723880768, + "learning_rate": 0.00019234381047779248, + "loss": 1.2237, + "step": 67240 + }, + { + "epoch": 0.1428815002451294, + "grad_norm": 0.43486759066581726, + "learning_rate": 0.00019234119657267362, + "loss": 1.209, + "step": 67250 + }, + { + "epoch": 0.14290274656486845, + "grad_norm": 0.3952382504940033, + "learning_rate": 0.00019233858223919069, + "loss": 1.1581, + "step": 67260 + }, + { + "epoch": 0.14292399288460753, + "grad_norm": 0.32268792390823364, + "learning_rate": 0.00019233596747735574, + "loss": 1.1962, + "step": 67270 + }, + { + "epoch": 0.14294523920434657, + "grad_norm": 0.3757917881011963, + "learning_rate": 0.00019233335228718097, + "loss": 1.2267, + "step": 67280 + }, + { + "epoch": 0.1429664855240856, + "grad_norm": 0.45910367369651794, + "learning_rate": 0.00019233073666867847, + "loss": 1.2293, + "step": 67290 + }, + { + "epoch": 0.14298773184382468, + "grad_norm": 0.3405468463897705, + "learning_rate": 0.0001923281206218604, + "loss": 1.2187, + "step": 67300 + }, + { + "epoch": 0.14300897816356373, + "grad_norm": 0.7527291178703308, + "learning_rate": 0.0001923255041467389, + "loss": 1.1911, + "step": 67310 + }, + { + "epoch": 0.14303022448330277, + "grad_norm": 0.3186609447002411, + "learning_rate": 0.00019232288724332607, + "loss": 1.1523, + "step": 67320 + }, + { + "epoch": 0.14305147080304184, + "grad_norm": 0.3476082980632782, + "learning_rate": 0.0001923202699116341, + "loss": 1.202, + "step": 67330 + }, + { + "epoch": 0.14307271712278088, + "grad_norm": 0.3268408477306366, + "learning_rate": 0.00019231765215167506, + "loss": 1.2298, + "step": 67340 + }, + { + "epoch": 0.14309396344251993, + "grad_norm": 0.36047688126564026, + "learning_rate": 0.00019231503396346117, + "loss": 1.2235, + "step": 67350 + }, + { + "epoch": 0.143115209762259, + "grad_norm": 0.3791401982307434, + "learning_rate": 0.00019231241534700455, + "loss": 1.2034, + "step": 67360 + }, + { + "epoch": 0.14313645608199804, + "grad_norm": 0.32970288395881653, + "learning_rate": 0.00019230979630231734, + "loss": 1.2119, + "step": 67370 + }, + { + "epoch": 0.14315770240173709, + "grad_norm": 0.48503994941711426, + "learning_rate": 0.0001923071768294117, + "loss": 1.2482, + "step": 67380 + }, + { + "epoch": 0.14317894872147616, + "grad_norm": 0.3727063238620758, + "learning_rate": 0.00019230455692829975, + "loss": 1.1862, + "step": 67390 + }, + { + "epoch": 0.1432001950412152, + "grad_norm": 0.5931296348571777, + "learning_rate": 0.00019230193659899367, + "loss": 1.2428, + "step": 67400 + }, + { + "epoch": 0.14322144136095424, + "grad_norm": 0.4757171869277954, + "learning_rate": 0.0001922993158415056, + "loss": 1.2114, + "step": 67410 + }, + { + "epoch": 0.14324268768069331, + "grad_norm": 0.5082051157951355, + "learning_rate": 0.0001922966946558477, + "loss": 1.1898, + "step": 67420 + }, + { + "epoch": 0.14326393400043236, + "grad_norm": 0.340548038482666, + "learning_rate": 0.00019229407304203217, + "loss": 1.219, + "step": 67430 + }, + { + "epoch": 0.14328518032017143, + "grad_norm": 0.4320313632488251, + "learning_rate": 0.00019229145100007114, + "loss": 1.1996, + "step": 67440 + }, + { + "epoch": 0.14330642663991047, + "grad_norm": 0.4128219783306122, + "learning_rate": 0.0001922888285299768, + "loss": 1.195, + "step": 67450 + }, + { + "epoch": 0.14332767295964952, + "grad_norm": 0.4695760905742645, + "learning_rate": 0.00019228620563176122, + "loss": 1.2337, + "step": 67460 + }, + { + "epoch": 0.1433489192793886, + "grad_norm": 0.47719091176986694, + "learning_rate": 0.00019228358230543666, + "loss": 1.2297, + "step": 67470 + }, + { + "epoch": 0.14337016559912763, + "grad_norm": 0.3176884055137634, + "learning_rate": 0.00019228095855101528, + "loss": 1.222, + "step": 67480 + }, + { + "epoch": 0.14339141191886667, + "grad_norm": 0.3307338356971741, + "learning_rate": 0.00019227833436850924, + "loss": 1.2118, + "step": 67490 + }, + { + "epoch": 0.14341265823860574, + "grad_norm": 0.36831602454185486, + "learning_rate": 0.00019227570975793073, + "loss": 1.2058, + "step": 67500 + }, + { + "epoch": 0.1434339045583448, + "grad_norm": 0.3449885845184326, + "learning_rate": 0.00019227308471929185, + "loss": 1.217, + "step": 67510 + }, + { + "epoch": 0.14345515087808383, + "grad_norm": 0.33818402886390686, + "learning_rate": 0.00019227045925260489, + "loss": 1.2488, + "step": 67520 + }, + { + "epoch": 0.1434763971978229, + "grad_norm": 0.5227258205413818, + "learning_rate": 0.00019226783335788192, + "loss": 1.1946, + "step": 67530 + }, + { + "epoch": 0.14349764351756195, + "grad_norm": 0.5177890062332153, + "learning_rate": 0.00019226520703513521, + "loss": 1.2061, + "step": 67540 + }, + { + "epoch": 0.143518889837301, + "grad_norm": 0.5707379579544067, + "learning_rate": 0.0001922625802843769, + "loss": 1.245, + "step": 67550 + }, + { + "epoch": 0.14354013615704006, + "grad_norm": 0.3525550067424774, + "learning_rate": 0.00019225995310561918, + "loss": 1.1958, + "step": 67560 + }, + { + "epoch": 0.1435613824767791, + "grad_norm": 0.38052454590797424, + "learning_rate": 0.00019225732549887426, + "loss": 1.2199, + "step": 67570 + }, + { + "epoch": 0.14358262879651815, + "grad_norm": 0.34609827399253845, + "learning_rate": 0.00019225469746415427, + "loss": 1.2521, + "step": 67580 + }, + { + "epoch": 0.14360387511625722, + "grad_norm": 0.5508667826652527, + "learning_rate": 0.00019225206900147146, + "loss": 1.2418, + "step": 67590 + }, + { + "epoch": 0.14362512143599626, + "grad_norm": 0.5943878293037415, + "learning_rate": 0.00019224944011083801, + "loss": 1.2129, + "step": 67600 + }, + { + "epoch": 0.1436463677557353, + "grad_norm": 0.4629979729652405, + "learning_rate": 0.0001922468107922661, + "loss": 1.1935, + "step": 67610 + }, + { + "epoch": 0.14366761407547438, + "grad_norm": 0.4251401126384735, + "learning_rate": 0.00019224418104576795, + "loss": 1.2051, + "step": 67620 + }, + { + "epoch": 0.14368886039521342, + "grad_norm": 0.6565160751342773, + "learning_rate": 0.00019224155087135572, + "loss": 1.2461, + "step": 67630 + }, + { + "epoch": 0.14371010671495246, + "grad_norm": 0.6728334426879883, + "learning_rate": 0.00019223892026904165, + "loss": 1.193, + "step": 67640 + }, + { + "epoch": 0.14373135303469153, + "grad_norm": 0.39194726943969727, + "learning_rate": 0.00019223628923883794, + "loss": 1.1791, + "step": 67650 + }, + { + "epoch": 0.14375259935443058, + "grad_norm": 0.3430050313472748, + "learning_rate": 0.00019223365778075677, + "loss": 1.1887, + "step": 67660 + }, + { + "epoch": 0.14377384567416962, + "grad_norm": 0.4272966682910919, + "learning_rate": 0.00019223102589481037, + "loss": 1.2128, + "step": 67670 + }, + { + "epoch": 0.1437950919939087, + "grad_norm": 0.4846493601799011, + "learning_rate": 0.00019222839358101095, + "loss": 1.2284, + "step": 67680 + }, + { + "epoch": 0.14381633831364773, + "grad_norm": 0.34343186020851135, + "learning_rate": 0.0001922257608393707, + "loss": 1.2153, + "step": 67690 + }, + { + "epoch": 0.14383758463338678, + "grad_norm": 0.3350347578525543, + "learning_rate": 0.00019222312766990185, + "loss": 1.2217, + "step": 67700 + }, + { + "epoch": 0.14385883095312585, + "grad_norm": 0.3220061957836151, + "learning_rate": 0.00019222049407261659, + "loss": 1.1556, + "step": 67710 + }, + { + "epoch": 0.1438800772728649, + "grad_norm": 0.3187692165374756, + "learning_rate": 0.0001922178600475272, + "loss": 1.1516, + "step": 67720 + }, + { + "epoch": 0.14390132359260394, + "grad_norm": 0.4369345009326935, + "learning_rate": 0.00019221522559464583, + "loss": 1.2392, + "step": 67730 + }, + { + "epoch": 0.143922569912343, + "grad_norm": 0.6796104311943054, + "learning_rate": 0.0001922125907139847, + "loss": 1.194, + "step": 67740 + }, + { + "epoch": 0.14394381623208205, + "grad_norm": 0.6693904995918274, + "learning_rate": 0.00019220995540555612, + "loss": 1.2195, + "step": 67750 + }, + { + "epoch": 0.1439650625518211, + "grad_norm": 0.34348416328430176, + "learning_rate": 0.0001922073196693722, + "loss": 1.207, + "step": 67760 + }, + { + "epoch": 0.14398630887156016, + "grad_norm": 0.32131826877593994, + "learning_rate": 0.00019220468350544527, + "loss": 1.1888, + "step": 67770 + }, + { + "epoch": 0.1440075551912992, + "grad_norm": 0.46444904804229736, + "learning_rate": 0.00019220204691378746, + "loss": 1.2778, + "step": 67780 + }, + { + "epoch": 0.14402880151103825, + "grad_norm": 0.33468446135520935, + "learning_rate": 0.0001921994098944111, + "loss": 1.2196, + "step": 67790 + }, + { + "epoch": 0.14405004783077732, + "grad_norm": 0.5699175596237183, + "learning_rate": 0.00019219677244732838, + "loss": 1.2128, + "step": 67800 + }, + { + "epoch": 0.14407129415051637, + "grad_norm": 0.5891767144203186, + "learning_rate": 0.00019219413457255148, + "loss": 1.2468, + "step": 67810 + }, + { + "epoch": 0.1440925404702554, + "grad_norm": 0.6071411371231079, + "learning_rate": 0.00019219149627009273, + "loss": 1.1917, + "step": 67820 + }, + { + "epoch": 0.14411378678999448, + "grad_norm": 0.41697391867637634, + "learning_rate": 0.00019218885753996431, + "loss": 1.2091, + "step": 67830 + }, + { + "epoch": 0.14413503310973352, + "grad_norm": 0.39768868684768677, + "learning_rate": 0.0001921862183821785, + "loss": 1.2423, + "step": 67840 + }, + { + "epoch": 0.14415627942947257, + "grad_norm": 0.37140852212905884, + "learning_rate": 0.0001921835787967475, + "loss": 1.1778, + "step": 67850 + }, + { + "epoch": 0.14417752574921164, + "grad_norm": 0.3780539631843567, + "learning_rate": 0.00019218093878368357, + "loss": 1.2182, + "step": 67860 + }, + { + "epoch": 0.14419877206895068, + "grad_norm": 0.3811364471912384, + "learning_rate": 0.00019217829834299895, + "loss": 1.2157, + "step": 67870 + }, + { + "epoch": 0.14422001838868972, + "grad_norm": 0.40059682726860046, + "learning_rate": 0.0001921756574747059, + "loss": 1.2188, + "step": 67880 + }, + { + "epoch": 0.1442412647084288, + "grad_norm": 0.32294243574142456, + "learning_rate": 0.00019217301617881671, + "loss": 1.2065, + "step": 67890 + }, + { + "epoch": 0.14426251102816784, + "grad_norm": 0.3904600739479065, + "learning_rate": 0.00019217037445534358, + "loss": 1.1729, + "step": 67900 + }, + { + "epoch": 0.14428375734790688, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.00019216773230429878, + "loss": 1.2204, + "step": 67910 + }, + { + "epoch": 0.14430500366764595, + "grad_norm": 0.37160080671310425, + "learning_rate": 0.00019216508972569454, + "loss": 1.2026, + "step": 67920 + }, + { + "epoch": 0.144326249987385, + "grad_norm": 0.309749573469162, + "learning_rate": 0.00019216244671954317, + "loss": 1.2209, + "step": 67930 + }, + { + "epoch": 0.14434749630712404, + "grad_norm": 0.4327913820743561, + "learning_rate": 0.00019215980328585687, + "loss": 1.1971, + "step": 67940 + }, + { + "epoch": 0.1443687426268631, + "grad_norm": 0.3667871952056885, + "learning_rate": 0.00019215715942464797, + "loss": 1.2092, + "step": 67950 + }, + { + "epoch": 0.14438998894660215, + "grad_norm": 0.3441128730773926, + "learning_rate": 0.0001921545151359287, + "loss": 1.2271, + "step": 67960 + }, + { + "epoch": 0.1444112352663412, + "grad_norm": 0.4269813299179077, + "learning_rate": 0.00019215187041971135, + "loss": 1.2516, + "step": 67970 + }, + { + "epoch": 0.14443248158608027, + "grad_norm": 0.521017849445343, + "learning_rate": 0.00019214922527600813, + "loss": 1.168, + "step": 67980 + }, + { + "epoch": 0.1444537279058193, + "grad_norm": 0.4103893041610718, + "learning_rate": 0.00019214657970483137, + "loss": 1.212, + "step": 67990 + }, + { + "epoch": 0.14447497422555836, + "grad_norm": 0.3420412540435791, + "learning_rate": 0.00019214393370619332, + "loss": 1.2125, + "step": 68000 + }, + { + "epoch": 0.14449622054529743, + "grad_norm": 0.3383365273475647, + "learning_rate": 0.00019214128728010623, + "loss": 1.244, + "step": 68010 + }, + { + "epoch": 0.14451746686503647, + "grad_norm": 0.35436201095581055, + "learning_rate": 0.00019213864042658243, + "loss": 1.1927, + "step": 68020 + }, + { + "epoch": 0.1445387131847755, + "grad_norm": 0.33968400955200195, + "learning_rate": 0.00019213599314563416, + "loss": 1.1686, + "step": 68030 + }, + { + "epoch": 0.14455995950451458, + "grad_norm": 0.4236471652984619, + "learning_rate": 0.00019213334543727371, + "loss": 1.2012, + "step": 68040 + }, + { + "epoch": 0.14458120582425363, + "grad_norm": 0.3768179416656494, + "learning_rate": 0.0001921306973015134, + "loss": 1.2207, + "step": 68050 + }, + { + "epoch": 0.14460245214399267, + "grad_norm": 0.35545459389686584, + "learning_rate": 0.00019212804873836544, + "loss": 1.1729, + "step": 68060 + }, + { + "epoch": 0.14462369846373174, + "grad_norm": 0.35519540309906006, + "learning_rate": 0.00019212539974784214, + "loss": 1.2366, + "step": 68070 + }, + { + "epoch": 0.14464494478347079, + "grad_norm": 0.4052969813346863, + "learning_rate": 0.0001921227503299558, + "loss": 1.2026, + "step": 68080 + }, + { + "epoch": 0.14466619110320983, + "grad_norm": 0.36010634899139404, + "learning_rate": 0.00019212010048471877, + "loss": 1.1962, + "step": 68090 + }, + { + "epoch": 0.1446874374229489, + "grad_norm": 0.6500228047370911, + "learning_rate": 0.00019211745021214323, + "loss": 1.1954, + "step": 68100 + }, + { + "epoch": 0.14470868374268794, + "grad_norm": 0.33653756976127625, + "learning_rate": 0.00019211479951224155, + "loss": 1.2008, + "step": 68110 + }, + { + "epoch": 0.144729930062427, + "grad_norm": 0.3448184132575989, + "learning_rate": 0.00019211214838502603, + "loss": 1.187, + "step": 68120 + }, + { + "epoch": 0.14475117638216606, + "grad_norm": 0.4096815288066864, + "learning_rate": 0.00019210949683050893, + "loss": 1.2095, + "step": 68130 + }, + { + "epoch": 0.1447724227019051, + "grad_norm": 0.33310019969940186, + "learning_rate": 0.00019210684484870255, + "loss": 1.2429, + "step": 68140 + }, + { + "epoch": 0.14479366902164414, + "grad_norm": 0.5435060262680054, + "learning_rate": 0.00019210419243961921, + "loss": 1.2023, + "step": 68150 + }, + { + "epoch": 0.14481491534138322, + "grad_norm": 0.39706510305404663, + "learning_rate": 0.00019210153960327121, + "loss": 1.222, + "step": 68160 + }, + { + "epoch": 0.14483616166112226, + "grad_norm": 0.3912816643714905, + "learning_rate": 0.00019209888633967088, + "loss": 1.2252, + "step": 68170 + }, + { + "epoch": 0.1448574079808613, + "grad_norm": 0.4419354200363159, + "learning_rate": 0.0001920962326488305, + "loss": 1.1915, + "step": 68180 + }, + { + "epoch": 0.14487865430060037, + "grad_norm": 0.5957629680633545, + "learning_rate": 0.0001920935785307624, + "loss": 1.204, + "step": 68190 + }, + { + "epoch": 0.14489990062033942, + "grad_norm": 0.5303277373313904, + "learning_rate": 0.00019209092398547885, + "loss": 1.1727, + "step": 68200 + }, + { + "epoch": 0.14492114694007846, + "grad_norm": 0.6837072372436523, + "learning_rate": 0.0001920882690129922, + "loss": 1.2242, + "step": 68210 + }, + { + "epoch": 0.14494239325981753, + "grad_norm": 0.6801193952560425, + "learning_rate": 0.00019208561361331477, + "loss": 1.2061, + "step": 68220 + }, + { + "epoch": 0.14496363957955657, + "grad_norm": 0.3447994887828827, + "learning_rate": 0.00019208295778645886, + "loss": 1.1479, + "step": 68230 + }, + { + "epoch": 0.14498488589929562, + "grad_norm": 0.35376694798469543, + "learning_rate": 0.00019208030153243684, + "loss": 1.2297, + "step": 68240 + }, + { + "epoch": 0.1450061322190347, + "grad_norm": 0.46443817019462585, + "learning_rate": 0.00019207764485126093, + "loss": 1.1896, + "step": 68250 + }, + { + "epoch": 0.14502737853877373, + "grad_norm": 0.3926984667778015, + "learning_rate": 0.00019207498774294354, + "loss": 1.2124, + "step": 68260 + }, + { + "epoch": 0.14504862485851278, + "grad_norm": 0.5688468217849731, + "learning_rate": 0.00019207233020749698, + "loss": 1.1718, + "step": 68270 + }, + { + "epoch": 0.14506987117825185, + "grad_norm": 0.3447321951389313, + "learning_rate": 0.00019206967224493355, + "loss": 1.1921, + "step": 68280 + }, + { + "epoch": 0.1450911174979909, + "grad_norm": 0.34563830494880676, + "learning_rate": 0.0001920670138552656, + "loss": 1.182, + "step": 68290 + }, + { + "epoch": 0.14511236381772996, + "grad_norm": 0.4194674789905548, + "learning_rate": 0.0001920643550385055, + "loss": 1.2183, + "step": 68300 + }, + { + "epoch": 0.145133610137469, + "grad_norm": 0.3205513060092926, + "learning_rate": 0.0001920616957946655, + "loss": 1.1805, + "step": 68310 + }, + { + "epoch": 0.14515485645720805, + "grad_norm": 0.3702497184276581, + "learning_rate": 0.000192059036123758, + "loss": 1.1914, + "step": 68320 + }, + { + "epoch": 0.14517610277694712, + "grad_norm": 0.4651612639427185, + "learning_rate": 0.00019205637602579532, + "loss": 1.2064, + "step": 68330 + }, + { + "epoch": 0.14519734909668616, + "grad_norm": 0.37382936477661133, + "learning_rate": 0.0001920537155007898, + "loss": 1.1837, + "step": 68340 + }, + { + "epoch": 0.1452185954164252, + "grad_norm": 0.366173654794693, + "learning_rate": 0.00019205105454875377, + "loss": 1.1596, + "step": 68350 + }, + { + "epoch": 0.14523984173616428, + "grad_norm": 0.3482207953929901, + "learning_rate": 0.00019204839316969957, + "loss": 1.199, + "step": 68360 + }, + { + "epoch": 0.14526108805590332, + "grad_norm": 0.3381294012069702, + "learning_rate": 0.00019204573136363958, + "loss": 1.1927, + "step": 68370 + }, + { + "epoch": 0.14528233437564236, + "grad_norm": 0.32687562704086304, + "learning_rate": 0.00019204306913058616, + "loss": 1.1616, + "step": 68380 + }, + { + "epoch": 0.14530358069538143, + "grad_norm": 0.42876479029655457, + "learning_rate": 0.0001920404064705516, + "loss": 1.1945, + "step": 68390 + }, + { + "epoch": 0.14532482701512048, + "grad_norm": 0.4787624180316925, + "learning_rate": 0.00019203774338354829, + "loss": 1.2306, + "step": 68400 + }, + { + "epoch": 0.14534607333485952, + "grad_norm": 0.35019707679748535, + "learning_rate": 0.00019203507986958855, + "loss": 1.2223, + "step": 68410 + }, + { + "epoch": 0.1453673196545986, + "grad_norm": 0.3619852364063263, + "learning_rate": 0.00019203241592868477, + "loss": 1.1966, + "step": 68420 + }, + { + "epoch": 0.14538856597433764, + "grad_norm": 0.35725051164627075, + "learning_rate": 0.0001920297515608493, + "loss": 1.1901, + "step": 68430 + }, + { + "epoch": 0.14540981229407668, + "grad_norm": 0.34927067160606384, + "learning_rate": 0.00019202708676609452, + "loss": 1.1798, + "step": 68440 + }, + { + "epoch": 0.14543105861381575, + "grad_norm": 0.37502601742744446, + "learning_rate": 0.00019202442154443277, + "loss": 1.1848, + "step": 68450 + }, + { + "epoch": 0.1454523049335548, + "grad_norm": 0.3833561837673187, + "learning_rate": 0.0001920217558958764, + "loss": 1.1674, + "step": 68460 + }, + { + "epoch": 0.14547355125329384, + "grad_norm": 0.31555283069610596, + "learning_rate": 0.00019201908982043777, + "loss": 1.1987, + "step": 68470 + }, + { + "epoch": 0.1454947975730329, + "grad_norm": 0.33981791138648987, + "learning_rate": 0.0001920164233181293, + "loss": 1.1939, + "step": 68480 + }, + { + "epoch": 0.14551604389277195, + "grad_norm": 0.3805835545063019, + "learning_rate": 0.00019201375638896328, + "loss": 1.2279, + "step": 68490 + }, + { + "epoch": 0.145537290212511, + "grad_norm": 0.3921438157558441, + "learning_rate": 0.00019201108903295217, + "loss": 1.1895, + "step": 68500 + }, + { + "epoch": 0.14555853653225007, + "grad_norm": 0.45190486311912537, + "learning_rate": 0.0001920084212501083, + "loss": 1.2044, + "step": 68510 + }, + { + "epoch": 0.1455797828519891, + "grad_norm": 0.3366224467754364, + "learning_rate": 0.00019200575304044402, + "loss": 1.2334, + "step": 68520 + }, + { + "epoch": 0.14560102917172815, + "grad_norm": 0.3193846642971039, + "learning_rate": 0.0001920030844039718, + "loss": 1.2545, + "step": 68530 + }, + { + "epoch": 0.14562227549146722, + "grad_norm": 0.32697629928588867, + "learning_rate": 0.00019200041534070386, + "loss": 1.221, + "step": 68540 + }, + { + "epoch": 0.14564352181120627, + "grad_norm": 0.32959654927253723, + "learning_rate": 0.00019199774585065276, + "loss": 1.2188, + "step": 68550 + }, + { + "epoch": 0.1456647681309453, + "grad_norm": 0.5078536868095398, + "learning_rate": 0.00019199507593383074, + "loss": 1.1801, + "step": 68560 + }, + { + "epoch": 0.14568601445068438, + "grad_norm": 0.6021037697792053, + "learning_rate": 0.0001919924055902503, + "loss": 1.1797, + "step": 68570 + }, + { + "epoch": 0.14570726077042342, + "grad_norm": 0.3488413393497467, + "learning_rate": 0.00019198973481992376, + "loss": 1.209, + "step": 68580 + }, + { + "epoch": 0.14572850709016247, + "grad_norm": 0.3449622392654419, + "learning_rate": 0.0001919870636228635, + "loss": 1.1798, + "step": 68590 + }, + { + "epoch": 0.14574975340990154, + "grad_norm": 0.5749824047088623, + "learning_rate": 0.00019198439199908193, + "loss": 1.2138, + "step": 68600 + }, + { + "epoch": 0.14577099972964058, + "grad_norm": 0.49169063568115234, + "learning_rate": 0.00019198171994859147, + "loss": 1.1985, + "step": 68610 + }, + { + "epoch": 0.14579224604937963, + "grad_norm": 0.5927749276161194, + "learning_rate": 0.0001919790474714045, + "loss": 1.2074, + "step": 68620 + }, + { + "epoch": 0.1458134923691187, + "grad_norm": 0.33826202154159546, + "learning_rate": 0.00019197637456753338, + "loss": 1.181, + "step": 68630 + }, + { + "epoch": 0.14583473868885774, + "grad_norm": 0.3503311276435852, + "learning_rate": 0.00019197370123699055, + "loss": 1.2089, + "step": 68640 + }, + { + "epoch": 0.14585598500859678, + "grad_norm": 0.3426028788089752, + "learning_rate": 0.00019197102747978845, + "loss": 1.2073, + "step": 68650 + }, + { + "epoch": 0.14587723132833585, + "grad_norm": 0.3315054774284363, + "learning_rate": 0.00019196835329593937, + "loss": 1.206, + "step": 68660 + }, + { + "epoch": 0.1458984776480749, + "grad_norm": 0.5163542628288269, + "learning_rate": 0.0001919656786854558, + "loss": 1.1576, + "step": 68670 + }, + { + "epoch": 0.14591972396781394, + "grad_norm": 0.36389604210853577, + "learning_rate": 0.00019196300364835014, + "loss": 1.195, + "step": 68680 + }, + { + "epoch": 0.145940970287553, + "grad_norm": 0.36515939235687256, + "learning_rate": 0.0001919603281846348, + "loss": 1.2122, + "step": 68690 + }, + { + "epoch": 0.14596221660729206, + "grad_norm": 0.36944469809532166, + "learning_rate": 0.00019195765229432213, + "loss": 1.2339, + "step": 68700 + }, + { + "epoch": 0.1459834629270311, + "grad_norm": 0.3381350040435791, + "learning_rate": 0.00019195497597742462, + "loss": 1.2209, + "step": 68710 + }, + { + "epoch": 0.14600470924677017, + "grad_norm": 0.46917736530303955, + "learning_rate": 0.00019195229923395468, + "loss": 1.1861, + "step": 68720 + }, + { + "epoch": 0.1460259555665092, + "grad_norm": 0.3579905927181244, + "learning_rate": 0.00019194962206392466, + "loss": 1.2122, + "step": 68730 + }, + { + "epoch": 0.14604720188624826, + "grad_norm": 0.5049021244049072, + "learning_rate": 0.00019194694446734704, + "loss": 1.1954, + "step": 68740 + }, + { + "epoch": 0.14606844820598733, + "grad_norm": 0.3274136185646057, + "learning_rate": 0.00019194426644423423, + "loss": 1.1939, + "step": 68750 + }, + { + "epoch": 0.14608969452572637, + "grad_norm": 0.3254965841770172, + "learning_rate": 0.00019194158799459867, + "loss": 1.1921, + "step": 68760 + }, + { + "epoch": 0.14611094084546541, + "grad_norm": 0.333126962184906, + "learning_rate": 0.00019193890911845272, + "loss": 1.1697, + "step": 68770 + }, + { + "epoch": 0.14613218716520449, + "grad_norm": 0.306079626083374, + "learning_rate": 0.00019193622981580885, + "loss": 1.2165, + "step": 68780 + }, + { + "epoch": 0.14615343348494353, + "grad_norm": 0.3504839837551117, + "learning_rate": 0.00019193355008667953, + "loss": 1.1958, + "step": 68790 + }, + { + "epoch": 0.14617467980468257, + "grad_norm": 0.36338236927986145, + "learning_rate": 0.0001919308699310771, + "loss": 1.2068, + "step": 68800 + }, + { + "epoch": 0.14619592612442164, + "grad_norm": 0.3287404179573059, + "learning_rate": 0.0001919281893490141, + "loss": 1.1939, + "step": 68810 + }, + { + "epoch": 0.1462171724441607, + "grad_norm": 0.3251025974750519, + "learning_rate": 0.00019192550834050287, + "loss": 1.1881, + "step": 68820 + }, + { + "epoch": 0.14623841876389973, + "grad_norm": 0.2877750098705292, + "learning_rate": 0.0001919228269055559, + "loss": 1.2372, + "step": 68830 + }, + { + "epoch": 0.1462596650836388, + "grad_norm": 0.3319435715675354, + "learning_rate": 0.0001919201450441856, + "loss": 1.2045, + "step": 68840 + }, + { + "epoch": 0.14628091140337784, + "grad_norm": 0.4615893065929413, + "learning_rate": 0.00019191746275640443, + "loss": 1.255, + "step": 68850 + }, + { + "epoch": 0.1463021577231169, + "grad_norm": 0.3631077706813812, + "learning_rate": 0.00019191478004222483, + "loss": 1.1886, + "step": 68860 + }, + { + "epoch": 0.14632340404285596, + "grad_norm": 0.5420201420783997, + "learning_rate": 0.00019191209690165922, + "loss": 1.1963, + "step": 68870 + }, + { + "epoch": 0.146344650362595, + "grad_norm": 0.3301151394844055, + "learning_rate": 0.0001919094133347201, + "loss": 1.2042, + "step": 68880 + }, + { + "epoch": 0.14636589668233405, + "grad_norm": 0.29680803418159485, + "learning_rate": 0.0001919067293414199, + "loss": 1.1994, + "step": 68890 + }, + { + "epoch": 0.14638714300207312, + "grad_norm": 0.31799477338790894, + "learning_rate": 0.00019190404492177104, + "loss": 1.2435, + "step": 68900 + }, + { + "epoch": 0.14640838932181216, + "grad_norm": 0.3180469274520874, + "learning_rate": 0.000191901360075786, + "loss": 1.1985, + "step": 68910 + }, + { + "epoch": 0.1464296356415512, + "grad_norm": 0.3573299050331116, + "learning_rate": 0.00019189867480347717, + "loss": 1.2049, + "step": 68920 + }, + { + "epoch": 0.14645088196129027, + "grad_norm": 0.3592850863933563, + "learning_rate": 0.00019189598910485716, + "loss": 1.1693, + "step": 68930 + }, + { + "epoch": 0.14647212828102932, + "grad_norm": 0.40471333265304565, + "learning_rate": 0.00019189330297993826, + "loss": 1.1554, + "step": 68940 + }, + { + "epoch": 0.14649337460076836, + "grad_norm": 0.34098702669143677, + "learning_rate": 0.00019189061642873302, + "loss": 1.1894, + "step": 68950 + }, + { + "epoch": 0.14651462092050743, + "grad_norm": 0.5778279304504395, + "learning_rate": 0.0001918879294512539, + "loss": 1.195, + "step": 68960 + }, + { + "epoch": 0.14653586724024648, + "grad_norm": 0.47212454676628113, + "learning_rate": 0.00019188524204751336, + "loss": 1.1939, + "step": 68970 + }, + { + "epoch": 0.14655711355998552, + "grad_norm": 0.36650943756103516, + "learning_rate": 0.00019188255421752383, + "loss": 1.1736, + "step": 68980 + }, + { + "epoch": 0.1465783598797246, + "grad_norm": 0.36658206582069397, + "learning_rate": 0.00019187986596129784, + "loss": 1.23, + "step": 68990 + }, + { + "epoch": 0.14659960619946363, + "grad_norm": 0.608676552772522, + "learning_rate": 0.00019187717727884776, + "loss": 1.2063, + "step": 69000 + }, + { + "epoch": 0.14662085251920268, + "grad_norm": 0.37656816840171814, + "learning_rate": 0.0001918744881701862, + "loss": 1.221, + "step": 69010 + }, + { + "epoch": 0.14664209883894175, + "grad_norm": 0.3108048737049103, + "learning_rate": 0.00019187179863532555, + "loss": 1.2134, + "step": 69020 + }, + { + "epoch": 0.1466633451586808, + "grad_norm": 0.33220481872558594, + "learning_rate": 0.00019186910867427826, + "loss": 1.2427, + "step": 69030 + }, + { + "epoch": 0.14668459147841983, + "grad_norm": 0.3912070691585541, + "learning_rate": 0.0001918664182870569, + "loss": 1.2139, + "step": 69040 + }, + { + "epoch": 0.1467058377981589, + "grad_norm": 0.3323987126350403, + "learning_rate": 0.00019186372747367387, + "loss": 1.2499, + "step": 69050 + }, + { + "epoch": 0.14672708411789795, + "grad_norm": 0.35090747475624084, + "learning_rate": 0.0001918610362341417, + "loss": 1.2105, + "step": 69060 + }, + { + "epoch": 0.146748330437637, + "grad_norm": 0.33434057235717773, + "learning_rate": 0.0001918583445684728, + "loss": 1.1973, + "step": 69070 + }, + { + "epoch": 0.14676957675737606, + "grad_norm": 0.34730979800224304, + "learning_rate": 0.00019185565247667978, + "loss": 1.206, + "step": 69080 + }, + { + "epoch": 0.1467908230771151, + "grad_norm": 0.3852848410606384, + "learning_rate": 0.00019185295995877505, + "loss": 1.2459, + "step": 69090 + }, + { + "epoch": 0.14681206939685415, + "grad_norm": 0.342034250497818, + "learning_rate": 0.0001918502670147711, + "loss": 1.2498, + "step": 69100 + }, + { + "epoch": 0.14683331571659322, + "grad_norm": 0.34822461009025574, + "learning_rate": 0.00019184757364468046, + "loss": 1.2298, + "step": 69110 + }, + { + "epoch": 0.14685456203633226, + "grad_norm": 0.42186519503593445, + "learning_rate": 0.00019184487984851557, + "loss": 1.1984, + "step": 69120 + }, + { + "epoch": 0.1468758083560713, + "grad_norm": 0.3571203947067261, + "learning_rate": 0.00019184218562628895, + "loss": 1.1998, + "step": 69130 + }, + { + "epoch": 0.14689705467581038, + "grad_norm": 0.43503159284591675, + "learning_rate": 0.0001918394909780131, + "loss": 1.238, + "step": 69140 + }, + { + "epoch": 0.14691830099554942, + "grad_norm": 0.3614442050457001, + "learning_rate": 0.00019183679590370053, + "loss": 1.164, + "step": 69150 + }, + { + "epoch": 0.1469395473152885, + "grad_norm": 0.4352841377258301, + "learning_rate": 0.00019183410040336373, + "loss": 1.2033, + "step": 69160 + }, + { + "epoch": 0.14696079363502754, + "grad_norm": 0.4322657585144043, + "learning_rate": 0.00019183140447701525, + "loss": 1.1785, + "step": 69170 + }, + { + "epoch": 0.14698203995476658, + "grad_norm": 0.3388819098472595, + "learning_rate": 0.0001918287081246675, + "loss": 1.2255, + "step": 69180 + }, + { + "epoch": 0.14700328627450565, + "grad_norm": 0.44199591875076294, + "learning_rate": 0.00019182601134633308, + "loss": 1.2207, + "step": 69190 + }, + { + "epoch": 0.1470245325942447, + "grad_norm": 0.45202401280403137, + "learning_rate": 0.00019182331414202447, + "loss": 1.2002, + "step": 69200 + }, + { + "epoch": 0.14704577891398374, + "grad_norm": 0.43392613530158997, + "learning_rate": 0.00019182061651175415, + "loss": 1.1944, + "step": 69210 + }, + { + "epoch": 0.1470670252337228, + "grad_norm": 0.3933318257331848, + "learning_rate": 0.00019181791845553469, + "loss": 1.2247, + "step": 69220 + }, + { + "epoch": 0.14708827155346185, + "grad_norm": 0.37108170986175537, + "learning_rate": 0.00019181521997337853, + "loss": 1.2201, + "step": 69230 + }, + { + "epoch": 0.1471095178732009, + "grad_norm": 0.4813520610332489, + "learning_rate": 0.00019181252106529825, + "loss": 1.2108, + "step": 69240 + }, + { + "epoch": 0.14713076419293997, + "grad_norm": 0.3295329809188843, + "learning_rate": 0.00019180982173130638, + "loss": 1.2181, + "step": 69250 + }, + { + "epoch": 0.147152010512679, + "grad_norm": 0.3843383491039276, + "learning_rate": 0.0001918071219714154, + "loss": 1.2221, + "step": 69260 + }, + { + "epoch": 0.14717325683241805, + "grad_norm": 0.41452088952064514, + "learning_rate": 0.00019180442178563786, + "loss": 1.2413, + "step": 69270 + }, + { + "epoch": 0.14719450315215712, + "grad_norm": 0.4176523685455322, + "learning_rate": 0.00019180172117398627, + "loss": 1.2182, + "step": 69280 + }, + { + "epoch": 0.14721574947189617, + "grad_norm": 0.4242675304412842, + "learning_rate": 0.00019179902013647318, + "loss": 1.1939, + "step": 69290 + }, + { + "epoch": 0.1472369957916352, + "grad_norm": 0.4064296782016754, + "learning_rate": 0.00019179631867311108, + "loss": 1.1959, + "step": 69300 + }, + { + "epoch": 0.14725824211137428, + "grad_norm": 0.40918585658073425, + "learning_rate": 0.00019179361678391254, + "loss": 1.1851, + "step": 69310 + }, + { + "epoch": 0.14727948843111333, + "grad_norm": 0.3635931611061096, + "learning_rate": 0.00019179091446889008, + "loss": 1.1862, + "step": 69320 + }, + { + "epoch": 0.14730073475085237, + "grad_norm": 0.4050203859806061, + "learning_rate": 0.00019178821172805623, + "loss": 1.2257, + "step": 69330 + }, + { + "epoch": 0.14732198107059144, + "grad_norm": 0.34451693296432495, + "learning_rate": 0.00019178550856142353, + "loss": 1.197, + "step": 69340 + }, + { + "epoch": 0.14734322739033048, + "grad_norm": 0.34840860962867737, + "learning_rate": 0.00019178280496900453, + "loss": 1.2072, + "step": 69350 + }, + { + "epoch": 0.14736447371006953, + "grad_norm": 0.464589387178421, + "learning_rate": 0.0001917801009508118, + "loss": 1.1992, + "step": 69360 + }, + { + "epoch": 0.1473857200298086, + "grad_norm": 0.42086032032966614, + "learning_rate": 0.0001917773965068578, + "loss": 1.2324, + "step": 69370 + }, + { + "epoch": 0.14740696634954764, + "grad_norm": 0.3568006753921509, + "learning_rate": 0.00019177469163715512, + "loss": 1.1976, + "step": 69380 + }, + { + "epoch": 0.14742821266928668, + "grad_norm": 0.6134538054466248, + "learning_rate": 0.00019177198634171637, + "loss": 1.2072, + "step": 69390 + }, + { + "epoch": 0.14744945898902576, + "grad_norm": 0.3865511417388916, + "learning_rate": 0.000191769280620554, + "loss": 1.1937, + "step": 69400 + }, + { + "epoch": 0.1474707053087648, + "grad_norm": 0.37884747982025146, + "learning_rate": 0.00019176657447368057, + "loss": 1.1895, + "step": 69410 + }, + { + "epoch": 0.14749195162850384, + "grad_norm": 0.34140753746032715, + "learning_rate": 0.00019176386790110871, + "loss": 1.204, + "step": 69420 + }, + { + "epoch": 0.1475131979482429, + "grad_norm": 0.3758402466773987, + "learning_rate": 0.00019176116090285095, + "loss": 1.1924, + "step": 69430 + }, + { + "epoch": 0.14753444426798196, + "grad_norm": 0.32395002245903015, + "learning_rate": 0.0001917584534789198, + "loss": 1.2201, + "step": 69440 + }, + { + "epoch": 0.147555690587721, + "grad_norm": 0.42840853333473206, + "learning_rate": 0.0001917557456293279, + "loss": 1.2114, + "step": 69450 + }, + { + "epoch": 0.14757693690746007, + "grad_norm": 0.3051045835018158, + "learning_rate": 0.0001917530373540877, + "loss": 1.2087, + "step": 69460 + }, + { + "epoch": 0.14759818322719911, + "grad_norm": 0.43763720989227295, + "learning_rate": 0.00019175032865321185, + "loss": 1.2093, + "step": 69470 + }, + { + "epoch": 0.14761942954693816, + "grad_norm": 0.5129753947257996, + "learning_rate": 0.0001917476195267129, + "loss": 1.1945, + "step": 69480 + }, + { + "epoch": 0.14764067586667723, + "grad_norm": 0.4614788591861725, + "learning_rate": 0.00019174490997460336, + "loss": 1.2119, + "step": 69490 + }, + { + "epoch": 0.14766192218641627, + "grad_norm": 0.3178160786628723, + "learning_rate": 0.0001917421999968959, + "loss": 1.2105, + "step": 69500 + }, + { + "epoch": 0.14768316850615532, + "grad_norm": 0.3773491680622101, + "learning_rate": 0.000191739489593603, + "loss": 1.2351, + "step": 69510 + }, + { + "epoch": 0.1477044148258944, + "grad_norm": 0.36195096373558044, + "learning_rate": 0.00019173677876473732, + "loss": 1.2089, + "step": 69520 + }, + { + "epoch": 0.14772566114563343, + "grad_norm": 0.3334718644618988, + "learning_rate": 0.00019173406751031138, + "loss": 1.2048, + "step": 69530 + }, + { + "epoch": 0.14774690746537247, + "grad_norm": 0.3718147575855255, + "learning_rate": 0.00019173135583033774, + "loss": 1.1626, + "step": 69540 + }, + { + "epoch": 0.14776815378511154, + "grad_norm": 0.33322039246559143, + "learning_rate": 0.000191728643724829, + "loss": 1.1752, + "step": 69550 + }, + { + "epoch": 0.1477894001048506, + "grad_norm": 0.3191688656806946, + "learning_rate": 0.00019172593119379778, + "loss": 1.191, + "step": 69560 + }, + { + "epoch": 0.14781064642458963, + "grad_norm": 0.3678285479545593, + "learning_rate": 0.00019172321823725662, + "loss": 1.2011, + "step": 69570 + }, + { + "epoch": 0.1478318927443287, + "grad_norm": 0.3503290116786957, + "learning_rate": 0.00019172050485521808, + "loss": 1.1942, + "step": 69580 + }, + { + "epoch": 0.14785313906406775, + "grad_norm": 0.41474777460098267, + "learning_rate": 0.00019171779104769482, + "loss": 1.2106, + "step": 69590 + }, + { + "epoch": 0.1478743853838068, + "grad_norm": 0.4519716799259186, + "learning_rate": 0.00019171507681469937, + "loss": 1.2233, + "step": 69600 + }, + { + "epoch": 0.14789563170354586, + "grad_norm": 0.3494243025779724, + "learning_rate": 0.00019171236215624436, + "loss": 1.2423, + "step": 69610 + }, + { + "epoch": 0.1479168780232849, + "grad_norm": 0.36382240056991577, + "learning_rate": 0.00019170964707234233, + "loss": 1.1846, + "step": 69620 + }, + { + "epoch": 0.14793812434302395, + "grad_norm": 0.35578203201293945, + "learning_rate": 0.00019170693156300592, + "loss": 1.2033, + "step": 69630 + }, + { + "epoch": 0.14795937066276302, + "grad_norm": 0.3417872190475464, + "learning_rate": 0.00019170421562824774, + "loss": 1.209, + "step": 69640 + }, + { + "epoch": 0.14798061698250206, + "grad_norm": 0.31988394260406494, + "learning_rate": 0.00019170149926808034, + "loss": 1.1723, + "step": 69650 + }, + { + "epoch": 0.1480018633022411, + "grad_norm": 0.4689073860645294, + "learning_rate": 0.00019169878248251638, + "loss": 1.2148, + "step": 69660 + }, + { + "epoch": 0.14802310962198018, + "grad_norm": 0.3390752375125885, + "learning_rate": 0.0001916960652715684, + "loss": 1.2085, + "step": 69670 + }, + { + "epoch": 0.14804435594171922, + "grad_norm": 0.40191298723220825, + "learning_rate": 0.00019169334763524906, + "loss": 1.1914, + "step": 69680 + }, + { + "epoch": 0.14806560226145826, + "grad_norm": 0.3164561688899994, + "learning_rate": 0.00019169062957357093, + "loss": 1.2102, + "step": 69690 + }, + { + "epoch": 0.14808684858119733, + "grad_norm": 0.33667469024658203, + "learning_rate": 0.0001916879110865466, + "loss": 1.1824, + "step": 69700 + }, + { + "epoch": 0.14810809490093638, + "grad_norm": 0.38840946555137634, + "learning_rate": 0.00019168519217418876, + "loss": 1.1908, + "step": 69710 + }, + { + "epoch": 0.14812934122067542, + "grad_norm": 0.37163588404655457, + "learning_rate": 0.00019168247283650995, + "loss": 1.1999, + "step": 69720 + }, + { + "epoch": 0.1481505875404145, + "grad_norm": 0.3400324881076813, + "learning_rate": 0.0001916797530735228, + "loss": 1.1908, + "step": 69730 + }, + { + "epoch": 0.14817183386015353, + "grad_norm": 0.3561813533306122, + "learning_rate": 0.00019167703288523996, + "loss": 1.1801, + "step": 69740 + }, + { + "epoch": 0.14819308017989258, + "grad_norm": 0.3752083480358124, + "learning_rate": 0.000191674312271674, + "loss": 1.264, + "step": 69750 + }, + { + "epoch": 0.14821432649963165, + "grad_norm": 0.3273826241493225, + "learning_rate": 0.00019167159123283756, + "loss": 1.2449, + "step": 69760 + }, + { + "epoch": 0.1482355728193707, + "grad_norm": 0.37116605043411255, + "learning_rate": 0.0001916688697687433, + "loss": 1.1984, + "step": 69770 + }, + { + "epoch": 0.14825681913910974, + "grad_norm": 0.33089250326156616, + "learning_rate": 0.0001916661478794038, + "loss": 1.2288, + "step": 69780 + }, + { + "epoch": 0.1482780654588488, + "grad_norm": 0.31253567337989807, + "learning_rate": 0.0001916634255648317, + "loss": 1.2432, + "step": 69790 + }, + { + "epoch": 0.14829931177858785, + "grad_norm": 0.46986672282218933, + "learning_rate": 0.00019166070282503964, + "loss": 1.2317, + "step": 69800 + }, + { + "epoch": 0.1483205580983269, + "grad_norm": 0.45851755142211914, + "learning_rate": 0.0001916579796600402, + "loss": 1.213, + "step": 69810 + }, + { + "epoch": 0.14834180441806596, + "grad_norm": 0.3915728032588959, + "learning_rate": 0.0001916552560698461, + "loss": 1.213, + "step": 69820 + }, + { + "epoch": 0.148363050737805, + "grad_norm": 0.36028170585632324, + "learning_rate": 0.0001916525320544699, + "loss": 1.193, + "step": 69830 + }, + { + "epoch": 0.14838429705754405, + "grad_norm": 0.4048469662666321, + "learning_rate": 0.00019164980761392423, + "loss": 1.1896, + "step": 69840 + }, + { + "epoch": 0.14840554337728312, + "grad_norm": 0.4214281141757965, + "learning_rate": 0.0001916470827482218, + "loss": 1.2089, + "step": 69850 + }, + { + "epoch": 0.14842678969702217, + "grad_norm": 0.550811767578125, + "learning_rate": 0.0001916443574573752, + "loss": 1.2162, + "step": 69860 + }, + { + "epoch": 0.1484480360167612, + "grad_norm": 0.4148653447628021, + "learning_rate": 0.0001916416317413971, + "loss": 1.2446, + "step": 69870 + }, + { + "epoch": 0.14846928233650028, + "grad_norm": 0.30524763464927673, + "learning_rate": 0.00019163890560030012, + "loss": 1.2085, + "step": 69880 + }, + { + "epoch": 0.14849052865623932, + "grad_norm": 0.384115070104599, + "learning_rate": 0.0001916361790340969, + "loss": 1.2086, + "step": 69890 + }, + { + "epoch": 0.14851177497597837, + "grad_norm": 0.341869056224823, + "learning_rate": 0.0001916334520428001, + "loss": 1.1809, + "step": 69900 + }, + { + "epoch": 0.14853302129571744, + "grad_norm": 0.33904829621315, + "learning_rate": 0.0001916307246264224, + "loss": 1.1962, + "step": 69910 + }, + { + "epoch": 0.14855426761545648, + "grad_norm": 0.373246431350708, + "learning_rate": 0.0001916279967849764, + "loss": 1.2164, + "step": 69920 + }, + { + "epoch": 0.14857551393519552, + "grad_norm": 0.33874714374542236, + "learning_rate": 0.0001916252685184748, + "loss": 1.2102, + "step": 69930 + }, + { + "epoch": 0.1485967602549346, + "grad_norm": 0.3305111527442932, + "learning_rate": 0.00019162253982693024, + "loss": 1.2085, + "step": 69940 + }, + { + "epoch": 0.14861800657467364, + "grad_norm": 0.4105394780635834, + "learning_rate": 0.00019161981071035532, + "loss": 1.2185, + "step": 69950 + }, + { + "epoch": 0.14863925289441268, + "grad_norm": 0.3933675289154053, + "learning_rate": 0.0001916170811687628, + "loss": 1.2629, + "step": 69960 + }, + { + "epoch": 0.14866049921415175, + "grad_norm": 0.5725981593132019, + "learning_rate": 0.0001916143512021653, + "loss": 1.1925, + "step": 69970 + }, + { + "epoch": 0.1486817455338908, + "grad_norm": 0.46858537197113037, + "learning_rate": 0.00019161162081057548, + "loss": 1.2143, + "step": 69980 + }, + { + "epoch": 0.14870299185362987, + "grad_norm": 0.3622988760471344, + "learning_rate": 0.000191608889994006, + "loss": 1.248, + "step": 69990 + }, + { + "epoch": 0.1487242381733689, + "grad_norm": 0.38341259956359863, + "learning_rate": 0.0001916061587524695, + "loss": 1.2162, + "step": 70000 + }, + { + "epoch": 0.14874548449310795, + "grad_norm": 0.3586738407611847, + "learning_rate": 0.00019160342708597874, + "loss": 1.1535, + "step": 70010 + }, + { + "epoch": 0.14876673081284703, + "grad_norm": 0.33946672081947327, + "learning_rate": 0.0001916006949945463, + "loss": 1.1905, + "step": 70020 + }, + { + "epoch": 0.14878797713258607, + "grad_norm": 0.4573322832584381, + "learning_rate": 0.0001915979624781849, + "loss": 1.2042, + "step": 70030 + }, + { + "epoch": 0.1488092234523251, + "grad_norm": 0.6372725367546082, + "learning_rate": 0.00019159522953690722, + "loss": 1.1694, + "step": 70040 + }, + { + "epoch": 0.14883046977206418, + "grad_norm": 0.3470112979412079, + "learning_rate": 0.00019159249617072593, + "loss": 1.2459, + "step": 70050 + }, + { + "epoch": 0.14885171609180323, + "grad_norm": 0.30895256996154785, + "learning_rate": 0.00019158976237965367, + "loss": 1.2086, + "step": 70060 + }, + { + "epoch": 0.14887296241154227, + "grad_norm": 0.33671680092811584, + "learning_rate": 0.00019158702816370315, + "loss": 1.2125, + "step": 70070 + }, + { + "epoch": 0.14889420873128134, + "grad_norm": 0.4222698211669922, + "learning_rate": 0.00019158429352288708, + "loss": 1.2144, + "step": 70080 + }, + { + "epoch": 0.14891545505102038, + "grad_norm": 0.4289410412311554, + "learning_rate": 0.00019158155845721815, + "loss": 1.1718, + "step": 70090 + }, + { + "epoch": 0.14893670137075943, + "grad_norm": 0.37230879068374634, + "learning_rate": 0.000191578822966709, + "loss": 1.2072, + "step": 70100 + }, + { + "epoch": 0.1489579476904985, + "grad_norm": 0.35391542315483093, + "learning_rate": 0.00019157608705137235, + "loss": 1.2516, + "step": 70110 + }, + { + "epoch": 0.14897919401023754, + "grad_norm": 0.325692743062973, + "learning_rate": 0.00019157335071122086, + "loss": 1.2403, + "step": 70120 + }, + { + "epoch": 0.14900044032997659, + "grad_norm": 0.34891802072525024, + "learning_rate": 0.00019157061394626726, + "loss": 1.1938, + "step": 70130 + }, + { + "epoch": 0.14902168664971566, + "grad_norm": 0.4768945276737213, + "learning_rate": 0.00019156787675652428, + "loss": 1.1799, + "step": 70140 + }, + { + "epoch": 0.1490429329694547, + "grad_norm": 0.3289676606655121, + "learning_rate": 0.00019156513914200455, + "loss": 1.2126, + "step": 70150 + }, + { + "epoch": 0.14906417928919374, + "grad_norm": 0.5510098934173584, + "learning_rate": 0.00019156240110272075, + "loss": 1.2079, + "step": 70160 + }, + { + "epoch": 0.14908542560893281, + "grad_norm": 0.3261242210865021, + "learning_rate": 0.00019155966263868565, + "loss": 1.1842, + "step": 70170 + }, + { + "epoch": 0.14910667192867186, + "grad_norm": 0.41846227645874023, + "learning_rate": 0.0001915569237499119, + "loss": 1.2102, + "step": 70180 + }, + { + "epoch": 0.1491279182484109, + "grad_norm": 0.34554070234298706, + "learning_rate": 0.0001915541844364123, + "loss": 1.1827, + "step": 70190 + }, + { + "epoch": 0.14914916456814997, + "grad_norm": 0.3338988423347473, + "learning_rate": 0.00019155144469819944, + "loss": 1.1947, + "step": 70200 + }, + { + "epoch": 0.14917041088788902, + "grad_norm": 0.37211769819259644, + "learning_rate": 0.00019154870453528608, + "loss": 1.2027, + "step": 70210 + }, + { + "epoch": 0.14919165720762806, + "grad_norm": 0.3843705654144287, + "learning_rate": 0.00019154596394768493, + "loss": 1.2012, + "step": 70220 + }, + { + "epoch": 0.14921290352736713, + "grad_norm": 0.33020836114883423, + "learning_rate": 0.00019154322293540873, + "loss": 1.208, + "step": 70230 + }, + { + "epoch": 0.14923414984710617, + "grad_norm": 0.3640719950199127, + "learning_rate": 0.00019154048149847014, + "loss": 1.1899, + "step": 70240 + }, + { + "epoch": 0.14925539616684522, + "grad_norm": 0.3650561571121216, + "learning_rate": 0.0001915377396368819, + "loss": 1.2033, + "step": 70250 + }, + { + "epoch": 0.1492766424865843, + "grad_norm": 0.3141230046749115, + "learning_rate": 0.00019153499735065677, + "loss": 1.1924, + "step": 70260 + }, + { + "epoch": 0.14929788880632333, + "grad_norm": 0.40734395384788513, + "learning_rate": 0.00019153225463980743, + "loss": 1.1937, + "step": 70270 + }, + { + "epoch": 0.14931913512606237, + "grad_norm": 0.29300013184547424, + "learning_rate": 0.0001915295115043466, + "loss": 1.2331, + "step": 70280 + }, + { + "epoch": 0.14934038144580145, + "grad_norm": 0.3169440031051636, + "learning_rate": 0.00019152676794428701, + "loss": 1.2112, + "step": 70290 + }, + { + "epoch": 0.1493616277655405, + "grad_norm": 0.3633112907409668, + "learning_rate": 0.0001915240239596414, + "loss": 1.1998, + "step": 70300 + }, + { + "epoch": 0.14938287408527953, + "grad_norm": 0.3845054507255554, + "learning_rate": 0.0001915212795504225, + "loss": 1.1914, + "step": 70310 + }, + { + "epoch": 0.1494041204050186, + "grad_norm": 0.3525099456310272, + "learning_rate": 0.000191518534716643, + "loss": 1.1827, + "step": 70320 + }, + { + "epoch": 0.14942536672475765, + "grad_norm": 0.33822405338287354, + "learning_rate": 0.0001915157894583157, + "loss": 1.1947, + "step": 70330 + }, + { + "epoch": 0.1494466130444967, + "grad_norm": 0.3540651202201843, + "learning_rate": 0.0001915130437754533, + "loss": 1.2036, + "step": 70340 + }, + { + "epoch": 0.14946785936423576, + "grad_norm": 0.3127554655075073, + "learning_rate": 0.00019151029766806854, + "loss": 1.2045, + "step": 70350 + }, + { + "epoch": 0.1494891056839748, + "grad_norm": 0.3349510729312897, + "learning_rate": 0.00019150755113617413, + "loss": 1.1961, + "step": 70360 + }, + { + "epoch": 0.14951035200371385, + "grad_norm": 0.4496714174747467, + "learning_rate": 0.00019150480417978285, + "loss": 1.2221, + "step": 70370 + }, + { + "epoch": 0.14953159832345292, + "grad_norm": 0.32303401827812195, + "learning_rate": 0.00019150205679890743, + "loss": 1.1969, + "step": 70380 + }, + { + "epoch": 0.14955284464319196, + "grad_norm": 0.31887301802635193, + "learning_rate": 0.00019149930899356061, + "loss": 1.1983, + "step": 70390 + }, + { + "epoch": 0.149574090962931, + "grad_norm": 0.5368044972419739, + "learning_rate": 0.00019149656076375518, + "loss": 1.2406, + "step": 70400 + }, + { + "epoch": 0.14959533728267008, + "grad_norm": 0.35883593559265137, + "learning_rate": 0.00019149381210950382, + "loss": 1.17, + "step": 70410 + }, + { + "epoch": 0.14961658360240912, + "grad_norm": 0.3838318884372711, + "learning_rate": 0.00019149106303081932, + "loss": 1.2331, + "step": 70420 + }, + { + "epoch": 0.14963782992214816, + "grad_norm": 0.32163187861442566, + "learning_rate": 0.00019148831352771442, + "loss": 1.2306, + "step": 70430 + }, + { + "epoch": 0.14965907624188723, + "grad_norm": 0.38768574595451355, + "learning_rate": 0.00019148556360020188, + "loss": 1.2226, + "step": 70440 + }, + { + "epoch": 0.14968032256162628, + "grad_norm": 0.353572279214859, + "learning_rate": 0.00019148281324829448, + "loss": 1.2024, + "step": 70450 + }, + { + "epoch": 0.14970156888136532, + "grad_norm": 0.4398868978023529, + "learning_rate": 0.00019148006247200492, + "loss": 1.1998, + "step": 70460 + }, + { + "epoch": 0.1497228152011044, + "grad_norm": 0.3495461940765381, + "learning_rate": 0.000191477311271346, + "loss": 1.2179, + "step": 70470 + }, + { + "epoch": 0.14974406152084344, + "grad_norm": 0.32613879442214966, + "learning_rate": 0.0001914745596463305, + "loss": 1.1969, + "step": 70480 + }, + { + "epoch": 0.14976530784058248, + "grad_norm": 0.3103106617927551, + "learning_rate": 0.00019147180759697117, + "loss": 1.2299, + "step": 70490 + }, + { + "epoch": 0.14978655416032155, + "grad_norm": 0.35458019375801086, + "learning_rate": 0.00019146905512328072, + "loss": 1.2085, + "step": 70500 + }, + { + "epoch": 0.1498078004800606, + "grad_norm": 0.4290529191493988, + "learning_rate": 0.00019146630222527202, + "loss": 1.1936, + "step": 70510 + }, + { + "epoch": 0.14982904679979964, + "grad_norm": 0.48328903317451477, + "learning_rate": 0.00019146354890295776, + "loss": 1.2106, + "step": 70520 + }, + { + "epoch": 0.1498502931195387, + "grad_norm": 0.7136117219924927, + "learning_rate": 0.00019146079515635077, + "loss": 1.2024, + "step": 70530 + }, + { + "epoch": 0.14987153943927775, + "grad_norm": 0.4034753143787384, + "learning_rate": 0.00019145804098546376, + "loss": 1.2172, + "step": 70540 + }, + { + "epoch": 0.1498927857590168, + "grad_norm": 0.32233312726020813, + "learning_rate": 0.00019145528639030954, + "loss": 1.1686, + "step": 70550 + }, + { + "epoch": 0.14991403207875587, + "grad_norm": 0.4020121097564697, + "learning_rate": 0.00019145253137090094, + "loss": 1.1942, + "step": 70560 + }, + { + "epoch": 0.1499352783984949, + "grad_norm": 0.3920692205429077, + "learning_rate": 0.00019144977592725065, + "loss": 1.2135, + "step": 70570 + }, + { + "epoch": 0.14995652471823395, + "grad_norm": 0.31288906931877136, + "learning_rate": 0.0001914470200593715, + "loss": 1.2168, + "step": 70580 + }, + { + "epoch": 0.14997777103797302, + "grad_norm": 0.359939306974411, + "learning_rate": 0.00019144426376727626, + "loss": 1.2655, + "step": 70590 + }, + { + "epoch": 0.14999901735771207, + "grad_norm": 0.3354300856590271, + "learning_rate": 0.00019144150705097774, + "loss": 1.1842, + "step": 70600 + }, + { + "epoch": 0.1500202636774511, + "grad_norm": 0.304050475358963, + "learning_rate": 0.0001914387499104887, + "loss": 1.211, + "step": 70610 + }, + { + "epoch": 0.15004150999719018, + "grad_norm": 0.3156076967716217, + "learning_rate": 0.00019143599234582194, + "loss": 1.2137, + "step": 70620 + }, + { + "epoch": 0.15006275631692922, + "grad_norm": 0.4004659056663513, + "learning_rate": 0.00019143323435699027, + "loss": 1.1859, + "step": 70630 + }, + { + "epoch": 0.15008400263666827, + "grad_norm": 0.3382870554924011, + "learning_rate": 0.00019143047594400648, + "loss": 1.1855, + "step": 70640 + }, + { + "epoch": 0.15010524895640734, + "grad_norm": 0.3250700831413269, + "learning_rate": 0.00019142771710688332, + "loss": 1.2039, + "step": 70650 + }, + { + "epoch": 0.15012649527614638, + "grad_norm": 0.3334566056728363, + "learning_rate": 0.00019142495784563364, + "loss": 1.2098, + "step": 70660 + }, + { + "epoch": 0.15014774159588543, + "grad_norm": 0.4093467593193054, + "learning_rate": 0.00019142219816027024, + "loss": 1.178, + "step": 70670 + }, + { + "epoch": 0.1501689879156245, + "grad_norm": 0.39429178833961487, + "learning_rate": 0.00019141943805080588, + "loss": 1.1969, + "step": 70680 + }, + { + "epoch": 0.15019023423536354, + "grad_norm": 0.3954247534275055, + "learning_rate": 0.00019141667751725337, + "loss": 1.1945, + "step": 70690 + }, + { + "epoch": 0.15021148055510258, + "grad_norm": 0.3369598686695099, + "learning_rate": 0.00019141391655962554, + "loss": 1.1745, + "step": 70700 + }, + { + "epoch": 0.15023272687484165, + "grad_norm": 0.3759360909461975, + "learning_rate": 0.00019141115517793523, + "loss": 1.2161, + "step": 70710 + }, + { + "epoch": 0.1502539731945807, + "grad_norm": 0.4451068341732025, + "learning_rate": 0.00019140839337219518, + "loss": 1.21, + "step": 70720 + }, + { + "epoch": 0.15027521951431974, + "grad_norm": 0.4031313955783844, + "learning_rate": 0.00019140563114241826, + "loss": 1.1902, + "step": 70730 + }, + { + "epoch": 0.1502964658340588, + "grad_norm": 0.3581078350543976, + "learning_rate": 0.00019140286848861723, + "loss": 1.2154, + "step": 70740 + }, + { + "epoch": 0.15031771215379786, + "grad_norm": 0.30775704979896545, + "learning_rate": 0.00019140010541080494, + "loss": 1.2029, + "step": 70750 + }, + { + "epoch": 0.1503389584735369, + "grad_norm": 0.33254700899124146, + "learning_rate": 0.0001913973419089942, + "loss": 1.1926, + "step": 70760 + }, + { + "epoch": 0.15036020479327597, + "grad_norm": 0.3639298677444458, + "learning_rate": 0.00019139457798319785, + "loss": 1.2121, + "step": 70770 + }, + { + "epoch": 0.150381451113015, + "grad_norm": 0.32301124930381775, + "learning_rate": 0.00019139181363342868, + "loss": 1.2184, + "step": 70780 + }, + { + "epoch": 0.15040269743275406, + "grad_norm": 0.3509587347507477, + "learning_rate": 0.00019138904885969952, + "loss": 1.2332, + "step": 70790 + }, + { + "epoch": 0.15042394375249313, + "grad_norm": 0.33127638697624207, + "learning_rate": 0.0001913862836620232, + "loss": 1.2053, + "step": 70800 + }, + { + "epoch": 0.15044519007223217, + "grad_norm": 0.5852491855621338, + "learning_rate": 0.00019138351804041256, + "loss": 1.2231, + "step": 70810 + }, + { + "epoch": 0.15046643639197121, + "grad_norm": 0.49306192994117737, + "learning_rate": 0.0001913807519948804, + "loss": 1.2028, + "step": 70820 + }, + { + "epoch": 0.15048768271171029, + "grad_norm": 0.33615389466285706, + "learning_rate": 0.00019137798552543958, + "loss": 1.1845, + "step": 70830 + }, + { + "epoch": 0.15050892903144933, + "grad_norm": 0.3715398907661438, + "learning_rate": 0.00019137521863210296, + "loss": 1.1925, + "step": 70840 + }, + { + "epoch": 0.1505301753511884, + "grad_norm": 0.34590598940849304, + "learning_rate": 0.0001913724513148833, + "loss": 1.2241, + "step": 70850 + }, + { + "epoch": 0.15055142167092744, + "grad_norm": 0.3174773156642914, + "learning_rate": 0.00019136968357379348, + "loss": 1.1997, + "step": 70860 + }, + { + "epoch": 0.1505726679906665, + "grad_norm": 0.3336026966571808, + "learning_rate": 0.00019136691540884632, + "loss": 1.1737, + "step": 70870 + }, + { + "epoch": 0.15059391431040556, + "grad_norm": 0.35592639446258545, + "learning_rate": 0.0001913641468200547, + "loss": 1.2237, + "step": 70880 + }, + { + "epoch": 0.1506151606301446, + "grad_norm": 0.5294058322906494, + "learning_rate": 0.00019136137780743142, + "loss": 1.1925, + "step": 70890 + }, + { + "epoch": 0.15063640694988364, + "grad_norm": 0.3817299008369446, + "learning_rate": 0.0001913586083709894, + "loss": 1.1587, + "step": 70900 + }, + { + "epoch": 0.15065765326962272, + "grad_norm": 0.35278066992759705, + "learning_rate": 0.00019135583851074138, + "loss": 1.221, + "step": 70910 + }, + { + "epoch": 0.15067889958936176, + "grad_norm": 0.33197230100631714, + "learning_rate": 0.00019135306822670026, + "loss": 1.2347, + "step": 70920 + }, + { + "epoch": 0.1507001459091008, + "grad_norm": 0.3379369080066681, + "learning_rate": 0.0001913502975188789, + "loss": 1.2015, + "step": 70930 + }, + { + "epoch": 0.15072139222883987, + "grad_norm": 0.5739507675170898, + "learning_rate": 0.00019134752638729017, + "loss": 1.1993, + "step": 70940 + }, + { + "epoch": 0.15074263854857892, + "grad_norm": 0.47630760073661804, + "learning_rate": 0.00019134475483194687, + "loss": 1.1924, + "step": 70950 + }, + { + "epoch": 0.15076388486831796, + "grad_norm": 0.4398469924926758, + "learning_rate": 0.0001913419828528619, + "loss": 1.2219, + "step": 70960 + }, + { + "epoch": 0.15078513118805703, + "grad_norm": 0.3969334363937378, + "learning_rate": 0.00019133921045004813, + "loss": 1.2284, + "step": 70970 + }, + { + "epoch": 0.15080637750779607, + "grad_norm": 0.34447747468948364, + "learning_rate": 0.00019133643762351836, + "loss": 1.201, + "step": 70980 + }, + { + "epoch": 0.15082762382753512, + "grad_norm": 0.38939887285232544, + "learning_rate": 0.00019133366437328552, + "loss": 1.1752, + "step": 70990 + }, + { + "epoch": 0.1508488701472742, + "grad_norm": 0.7436483502388, + "learning_rate": 0.00019133089069936246, + "loss": 1.1545, + "step": 71000 + }, + { + "epoch": 0.15087011646701323, + "grad_norm": 0.31707167625427246, + "learning_rate": 0.000191328116601762, + "loss": 1.212, + "step": 71010 + }, + { + "epoch": 0.15089136278675228, + "grad_norm": 0.34078654646873474, + "learning_rate": 0.00019132534208049707, + "loss": 1.2212, + "step": 71020 + }, + { + "epoch": 0.15091260910649135, + "grad_norm": 0.3376050591468811, + "learning_rate": 0.00019132256713558045, + "loss": 1.2109, + "step": 71030 + }, + { + "epoch": 0.1509338554262304, + "grad_norm": 0.388810932636261, + "learning_rate": 0.00019131979176702511, + "loss": 1.2195, + "step": 71040 + }, + { + "epoch": 0.15095510174596943, + "grad_norm": 0.36422914266586304, + "learning_rate": 0.00019131701597484394, + "loss": 1.2241, + "step": 71050 + }, + { + "epoch": 0.1509763480657085, + "grad_norm": 0.35593777894973755, + "learning_rate": 0.0001913142397590497, + "loss": 1.2206, + "step": 71060 + }, + { + "epoch": 0.15099759438544755, + "grad_norm": 0.35092592239379883, + "learning_rate": 0.0001913114631196554, + "loss": 1.2056, + "step": 71070 + }, + { + "epoch": 0.1510188407051866, + "grad_norm": 0.4729214906692505, + "learning_rate": 0.0001913086860566738, + "loss": 1.1866, + "step": 71080 + }, + { + "epoch": 0.15104008702492566, + "grad_norm": 0.3095354735851288, + "learning_rate": 0.00019130590857011786, + "loss": 1.2106, + "step": 71090 + }, + { + "epoch": 0.1510613333446647, + "grad_norm": 0.33843496441841125, + "learning_rate": 0.00019130313066000043, + "loss": 1.2104, + "step": 71100 + }, + { + "epoch": 0.15108257966440375, + "grad_norm": 0.3199734687805176, + "learning_rate": 0.00019130035232633444, + "loss": 1.1833, + "step": 71110 + }, + { + "epoch": 0.15110382598414282, + "grad_norm": 0.3684735596179962, + "learning_rate": 0.00019129757356913272, + "loss": 1.2059, + "step": 71120 + }, + { + "epoch": 0.15112507230388186, + "grad_norm": 0.33041444420814514, + "learning_rate": 0.0001912947943884082, + "loss": 1.1983, + "step": 71130 + }, + { + "epoch": 0.1511463186236209, + "grad_norm": 0.46261486411094666, + "learning_rate": 0.00019129201478417375, + "loss": 1.2421, + "step": 71140 + }, + { + "epoch": 0.15116756494335998, + "grad_norm": 0.3548736870288849, + "learning_rate": 0.0001912892347564423, + "loss": 1.1949, + "step": 71150 + }, + { + "epoch": 0.15118881126309902, + "grad_norm": 0.3634767234325409, + "learning_rate": 0.0001912864543052267, + "loss": 1.2376, + "step": 71160 + }, + { + "epoch": 0.15121005758283806, + "grad_norm": 0.3300638496875763, + "learning_rate": 0.00019128367343053987, + "loss": 1.2006, + "step": 71170 + }, + { + "epoch": 0.15123130390257714, + "grad_norm": 0.3262075185775757, + "learning_rate": 0.0001912808921323947, + "loss": 1.2051, + "step": 71180 + }, + { + "epoch": 0.15125255022231618, + "grad_norm": 0.3430345058441162, + "learning_rate": 0.0001912781104108041, + "loss": 1.199, + "step": 71190 + }, + { + "epoch": 0.15127379654205522, + "grad_norm": 0.3316657841205597, + "learning_rate": 0.00019127532826578104, + "loss": 1.1993, + "step": 71200 + }, + { + "epoch": 0.1512950428617943, + "grad_norm": 0.34799912571907043, + "learning_rate": 0.00019127254569733832, + "loss": 1.1963, + "step": 71210 + }, + { + "epoch": 0.15131628918153334, + "grad_norm": 0.31641465425491333, + "learning_rate": 0.0001912697627054889, + "loss": 1.2137, + "step": 71220 + }, + { + "epoch": 0.15133753550127238, + "grad_norm": 0.4784099757671356, + "learning_rate": 0.00019126697929024565, + "loss": 1.1948, + "step": 71230 + }, + { + "epoch": 0.15135878182101145, + "grad_norm": 0.41707471013069153, + "learning_rate": 0.00019126419545162154, + "loss": 1.2134, + "step": 71240 + }, + { + "epoch": 0.1513800281407505, + "grad_norm": 0.3883388042449951, + "learning_rate": 0.00019126141118962946, + "loss": 1.2516, + "step": 71250 + }, + { + "epoch": 0.15140127446048954, + "grad_norm": 0.4296300709247589, + "learning_rate": 0.0001912586265042823, + "loss": 1.2019, + "step": 71260 + }, + { + "epoch": 0.1514225207802286, + "grad_norm": 0.3606847822666168, + "learning_rate": 0.00019125584139559302, + "loss": 1.2243, + "step": 71270 + }, + { + "epoch": 0.15144376709996765, + "grad_norm": 0.32203155755996704, + "learning_rate": 0.00019125305586357452, + "loss": 1.2393, + "step": 71280 + }, + { + "epoch": 0.1514650134197067, + "grad_norm": 0.390855997800827, + "learning_rate": 0.00019125026990823973, + "loss": 1.194, + "step": 71290 + }, + { + "epoch": 0.15148625973944577, + "grad_norm": 0.35131868720054626, + "learning_rate": 0.00019124748352960155, + "loss": 1.1525, + "step": 71300 + }, + { + "epoch": 0.1515075060591848, + "grad_norm": 0.3450160622596741, + "learning_rate": 0.00019124469672767295, + "loss": 1.2178, + "step": 71310 + }, + { + "epoch": 0.15152875237892385, + "grad_norm": 0.4709353446960449, + "learning_rate": 0.0001912419095024668, + "loss": 1.1444, + "step": 71320 + }, + { + "epoch": 0.15154999869866292, + "grad_norm": 0.3670584559440613, + "learning_rate": 0.00019123912185399608, + "loss": 1.18, + "step": 71330 + }, + { + "epoch": 0.15157124501840197, + "grad_norm": 0.43671491742134094, + "learning_rate": 0.0001912363337822737, + "loss": 1.2303, + "step": 71340 + }, + { + "epoch": 0.151592491338141, + "grad_norm": 0.6639800071716309, + "learning_rate": 0.00019123354528731258, + "loss": 1.1864, + "step": 71350 + }, + { + "epoch": 0.15161373765788008, + "grad_norm": 0.3451612889766693, + "learning_rate": 0.0001912307563691257, + "loss": 1.2314, + "step": 71360 + }, + { + "epoch": 0.15163498397761913, + "grad_norm": 0.3870805501937866, + "learning_rate": 0.00019122796702772594, + "loss": 1.1835, + "step": 71370 + }, + { + "epoch": 0.15165623029735817, + "grad_norm": 0.32357874512672424, + "learning_rate": 0.00019122517726312627, + "loss": 1.2215, + "step": 71380 + }, + { + "epoch": 0.15167747661709724, + "grad_norm": 0.4207703769207001, + "learning_rate": 0.00019122238707533967, + "loss": 1.2247, + "step": 71390 + }, + { + "epoch": 0.15169872293683628, + "grad_norm": 0.3159569501876831, + "learning_rate": 0.000191219596464379, + "loss": 1.2024, + "step": 71400 + }, + { + "epoch": 0.15171996925657533, + "grad_norm": 0.3767338693141937, + "learning_rate": 0.00019121680543025726, + "loss": 1.2123, + "step": 71410 + }, + { + "epoch": 0.1517412155763144, + "grad_norm": 0.3789723813533783, + "learning_rate": 0.0001912140139729874, + "loss": 1.1819, + "step": 71420 + }, + { + "epoch": 0.15176246189605344, + "grad_norm": 0.5111535787582397, + "learning_rate": 0.0001912112220925823, + "loss": 1.2115, + "step": 71430 + }, + { + "epoch": 0.15178370821579248, + "grad_norm": 0.32391470670700073, + "learning_rate": 0.00019120842978905502, + "loss": 1.1827, + "step": 71440 + }, + { + "epoch": 0.15180495453553156, + "grad_norm": 0.3655300736427307, + "learning_rate": 0.00019120563706241845, + "loss": 1.1983, + "step": 71450 + }, + { + "epoch": 0.1518262008552706, + "grad_norm": 0.5828658938407898, + "learning_rate": 0.00019120284391268557, + "loss": 1.1752, + "step": 71460 + }, + { + "epoch": 0.15184744717500964, + "grad_norm": 0.3711400330066681, + "learning_rate": 0.0001912000503398693, + "loss": 1.1857, + "step": 71470 + }, + { + "epoch": 0.1518686934947487, + "grad_norm": 0.3461199998855591, + "learning_rate": 0.00019119725634398265, + "loss": 1.2178, + "step": 71480 + }, + { + "epoch": 0.15188993981448776, + "grad_norm": 0.3435879051685333, + "learning_rate": 0.00019119446192503853, + "loss": 1.2159, + "step": 71490 + }, + { + "epoch": 0.1519111861342268, + "grad_norm": 0.418956458568573, + "learning_rate": 0.00019119166708304994, + "loss": 1.2016, + "step": 71500 + }, + { + "epoch": 0.15193243245396587, + "grad_norm": 0.436808705329895, + "learning_rate": 0.0001911888718180298, + "loss": 1.1923, + "step": 71510 + }, + { + "epoch": 0.15195367877370491, + "grad_norm": 0.4510968029499054, + "learning_rate": 0.00019118607612999113, + "loss": 1.1903, + "step": 71520 + }, + { + "epoch": 0.15197492509344396, + "grad_norm": 0.3357973098754883, + "learning_rate": 0.0001911832800189469, + "loss": 1.1599, + "step": 71530 + }, + { + "epoch": 0.15199617141318303, + "grad_norm": 0.4202280044555664, + "learning_rate": 0.00019118048348491004, + "loss": 1.1987, + "step": 71540 + }, + { + "epoch": 0.15201741773292207, + "grad_norm": 0.3284028172492981, + "learning_rate": 0.0001911776865278935, + "loss": 1.2448, + "step": 71550 + }, + { + "epoch": 0.15203866405266112, + "grad_norm": 0.31390976905822754, + "learning_rate": 0.00019117488914791037, + "loss": 1.2233, + "step": 71560 + }, + { + "epoch": 0.1520599103724002, + "grad_norm": 0.3287726640701294, + "learning_rate": 0.0001911720913449735, + "loss": 1.1626, + "step": 71570 + }, + { + "epoch": 0.15208115669213923, + "grad_norm": 0.3268052935600281, + "learning_rate": 0.00019116929311909593, + "loss": 1.1798, + "step": 71580 + }, + { + "epoch": 0.15210240301187827, + "grad_norm": 0.3881189823150635, + "learning_rate": 0.00019116649447029063, + "loss": 1.1587, + "step": 71590 + }, + { + "epoch": 0.15212364933161734, + "grad_norm": 0.30829545855522156, + "learning_rate": 0.00019116369539857062, + "loss": 1.2167, + "step": 71600 + }, + { + "epoch": 0.1521448956513564, + "grad_norm": 0.3449328541755676, + "learning_rate": 0.0001911608959039488, + "loss": 1.1802, + "step": 71610 + }, + { + "epoch": 0.15216614197109543, + "grad_norm": 0.3557949662208557, + "learning_rate": 0.00019115809598643823, + "loss": 1.2091, + "step": 71620 + }, + { + "epoch": 0.1521873882908345, + "grad_norm": 0.339866578578949, + "learning_rate": 0.0001911552956460519, + "loss": 1.2155, + "step": 71630 + }, + { + "epoch": 0.15220863461057355, + "grad_norm": 0.36759957671165466, + "learning_rate": 0.00019115249488280272, + "loss": 1.2212, + "step": 71640 + }, + { + "epoch": 0.1522298809303126, + "grad_norm": 0.3606128692626953, + "learning_rate": 0.00019114969369670378, + "loss": 1.1984, + "step": 71650 + }, + { + "epoch": 0.15225112725005166, + "grad_norm": 0.3553573787212372, + "learning_rate": 0.00019114689208776803, + "loss": 1.194, + "step": 71660 + }, + { + "epoch": 0.1522723735697907, + "grad_norm": 0.3201541304588318, + "learning_rate": 0.00019114409005600847, + "loss": 1.1814, + "step": 71670 + }, + { + "epoch": 0.15229361988952977, + "grad_norm": 0.4376092553138733, + "learning_rate": 0.00019114128760143809, + "loss": 1.1679, + "step": 71680 + }, + { + "epoch": 0.15231486620926882, + "grad_norm": 0.3767891824245453, + "learning_rate": 0.0001911384847240699, + "loss": 1.1961, + "step": 71690 + }, + { + "epoch": 0.15233611252900786, + "grad_norm": 0.38861751556396484, + "learning_rate": 0.00019113568142391688, + "loss": 1.2269, + "step": 71700 + }, + { + "epoch": 0.15235735884874693, + "grad_norm": 0.36348819732666016, + "learning_rate": 0.0001911328777009921, + "loss": 1.1602, + "step": 71710 + }, + { + "epoch": 0.15237860516848598, + "grad_norm": 0.40932920575141907, + "learning_rate": 0.00019113007355530849, + "loss": 1.1975, + "step": 71720 + }, + { + "epoch": 0.15239985148822502, + "grad_norm": 0.4927641749382019, + "learning_rate": 0.0001911272689868791, + "loss": 1.1993, + "step": 71730 + }, + { + "epoch": 0.1524210978079641, + "grad_norm": 0.33380720019340515, + "learning_rate": 0.0001911244639957169, + "loss": 1.2005, + "step": 71740 + }, + { + "epoch": 0.15244234412770313, + "grad_norm": 0.33996057510375977, + "learning_rate": 0.00019112165858183494, + "loss": 1.1889, + "step": 71750 + }, + { + "epoch": 0.15246359044744218, + "grad_norm": 0.45646190643310547, + "learning_rate": 0.00019111885274524626, + "loss": 1.173, + "step": 71760 + }, + { + "epoch": 0.15248483676718125, + "grad_norm": 0.4009057581424713, + "learning_rate": 0.0001911160464859638, + "loss": 1.2041, + "step": 71770 + }, + { + "epoch": 0.1525060830869203, + "grad_norm": 0.6224416494369507, + "learning_rate": 0.00019111323980400066, + "loss": 1.163, + "step": 71780 + }, + { + "epoch": 0.15252732940665933, + "grad_norm": 0.4104729890823364, + "learning_rate": 0.0001911104326993698, + "loss": 1.2111, + "step": 71790 + }, + { + "epoch": 0.1525485757263984, + "grad_norm": 0.3900916576385498, + "learning_rate": 0.00019110762517208425, + "loss": 1.2201, + "step": 71800 + }, + { + "epoch": 0.15256982204613745, + "grad_norm": 0.3316819965839386, + "learning_rate": 0.00019110481722215704, + "loss": 1.1815, + "step": 71810 + }, + { + "epoch": 0.1525910683658765, + "grad_norm": 0.39579278230667114, + "learning_rate": 0.00019110200884960124, + "loss": 1.1773, + "step": 71820 + }, + { + "epoch": 0.15261231468561556, + "grad_norm": 0.36119386553764343, + "learning_rate": 0.00019109920005442983, + "loss": 1.2172, + "step": 71830 + }, + { + "epoch": 0.1526335610053546, + "grad_norm": 0.4427950978279114, + "learning_rate": 0.00019109639083665582, + "loss": 1.186, + "step": 71840 + }, + { + "epoch": 0.15265480732509365, + "grad_norm": 0.4224362373352051, + "learning_rate": 0.0001910935811962923, + "loss": 1.2215, + "step": 71850 + }, + { + "epoch": 0.15267605364483272, + "grad_norm": 0.327705442905426, + "learning_rate": 0.00019109077113335228, + "loss": 1.2102, + "step": 71860 + }, + { + "epoch": 0.15269729996457176, + "grad_norm": 0.3431737720966339, + "learning_rate": 0.00019108796064784875, + "loss": 1.2072, + "step": 71870 + }, + { + "epoch": 0.1527185462843108, + "grad_norm": 0.34007880091667175, + "learning_rate": 0.0001910851497397948, + "loss": 1.1648, + "step": 71880 + }, + { + "epoch": 0.15273979260404988, + "grad_norm": 0.3769775629043579, + "learning_rate": 0.0001910823384092035, + "loss": 1.2023, + "step": 71890 + }, + { + "epoch": 0.15276103892378892, + "grad_norm": 0.39168933033943176, + "learning_rate": 0.0001910795266560878, + "loss": 1.2328, + "step": 71900 + }, + { + "epoch": 0.15278228524352797, + "grad_norm": 0.33007875084877014, + "learning_rate": 0.00019107671448046084, + "loss": 1.2098, + "step": 71910 + }, + { + "epoch": 0.15280353156326704, + "grad_norm": 0.47996142506599426, + "learning_rate": 0.00019107390188233557, + "loss": 1.1683, + "step": 71920 + }, + { + "epoch": 0.15282477788300608, + "grad_norm": 0.3383987247943878, + "learning_rate": 0.0001910710888617251, + "loss": 1.1679, + "step": 71930 + }, + { + "epoch": 0.15284602420274512, + "grad_norm": 0.4295121729373932, + "learning_rate": 0.00019106827541864245, + "loss": 1.2243, + "step": 71940 + }, + { + "epoch": 0.1528672705224842, + "grad_norm": 0.3935948312282562, + "learning_rate": 0.00019106546155310073, + "loss": 1.2057, + "step": 71950 + }, + { + "epoch": 0.15288851684222324, + "grad_norm": 0.33184388279914856, + "learning_rate": 0.0001910626472651129, + "loss": 1.2218, + "step": 71960 + }, + { + "epoch": 0.15290976316196228, + "grad_norm": 0.3576926290988922, + "learning_rate": 0.00019105983255469209, + "loss": 1.162, + "step": 71970 + }, + { + "epoch": 0.15293100948170135, + "grad_norm": 0.368064284324646, + "learning_rate": 0.00019105701742185133, + "loss": 1.1948, + "step": 71980 + }, + { + "epoch": 0.1529522558014404, + "grad_norm": 0.3674444258213043, + "learning_rate": 0.00019105420186660366, + "loss": 1.1862, + "step": 71990 + }, + { + "epoch": 0.15297350212117944, + "grad_norm": 0.5495532751083374, + "learning_rate": 0.00019105138588896222, + "loss": 1.2527, + "step": 72000 + }, + { + "epoch": 0.1529947484409185, + "grad_norm": 0.4320761561393738, + "learning_rate": 0.00019104856948893998, + "loss": 1.2136, + "step": 72010 + }, + { + "epoch": 0.15301599476065755, + "grad_norm": 0.5529170036315918, + "learning_rate": 0.00019104575266655003, + "loss": 1.2173, + "step": 72020 + }, + { + "epoch": 0.1530372410803966, + "grad_norm": 0.3669241666793823, + "learning_rate": 0.00019104293542180546, + "loss": 1.2038, + "step": 72030 + }, + { + "epoch": 0.15305848740013567, + "grad_norm": 0.4675973951816559, + "learning_rate": 0.00019104011775471932, + "loss": 1.1999, + "step": 72040 + }, + { + "epoch": 0.1530797337198747, + "grad_norm": 0.36208751797676086, + "learning_rate": 0.0001910372996653047, + "loss": 1.1688, + "step": 72050 + }, + { + "epoch": 0.15310098003961375, + "grad_norm": 0.4237922132015228, + "learning_rate": 0.00019103448115357464, + "loss": 1.1937, + "step": 72060 + }, + { + "epoch": 0.15312222635935283, + "grad_norm": 0.4297255277633667, + "learning_rate": 0.00019103166221954229, + "loss": 1.1815, + "step": 72070 + }, + { + "epoch": 0.15314347267909187, + "grad_norm": 0.323624849319458, + "learning_rate": 0.0001910288428632206, + "loss": 1.218, + "step": 72080 + }, + { + "epoch": 0.1531647189988309, + "grad_norm": 0.3468819856643677, + "learning_rate": 0.00019102602308462277, + "loss": 1.2552, + "step": 72090 + }, + { + "epoch": 0.15318596531856998, + "grad_norm": 0.31554892659187317, + "learning_rate": 0.00019102320288376178, + "loss": 1.1914, + "step": 72100 + }, + { + "epoch": 0.15320721163830903, + "grad_norm": 0.31534576416015625, + "learning_rate": 0.0001910203822606508, + "loss": 1.2084, + "step": 72110 + }, + { + "epoch": 0.15322845795804807, + "grad_norm": 0.3823298513889313, + "learning_rate": 0.00019101756121530289, + "loss": 1.186, + "step": 72120 + }, + { + "epoch": 0.15324970427778714, + "grad_norm": 0.3680235743522644, + "learning_rate": 0.00019101473974773113, + "loss": 1.1803, + "step": 72130 + }, + { + "epoch": 0.15327095059752618, + "grad_norm": 0.39848899841308594, + "learning_rate": 0.00019101191785794857, + "loss": 1.1759, + "step": 72140 + }, + { + "epoch": 0.15329219691726523, + "grad_norm": 0.3467482030391693, + "learning_rate": 0.00019100909554596834, + "loss": 1.2183, + "step": 72150 + }, + { + "epoch": 0.1533134432370043, + "grad_norm": 0.3487699329853058, + "learning_rate": 0.00019100627281180358, + "loss": 1.2178, + "step": 72160 + }, + { + "epoch": 0.15333468955674334, + "grad_norm": 0.35589754581451416, + "learning_rate": 0.00019100344965546727, + "loss": 1.1951, + "step": 72170 + }, + { + "epoch": 0.15335593587648239, + "grad_norm": 0.3423006236553192, + "learning_rate": 0.00019100062607697258, + "loss": 1.1914, + "step": 72180 + }, + { + "epoch": 0.15337718219622146, + "grad_norm": 0.38810163736343384, + "learning_rate": 0.0001909978020763326, + "loss": 1.2008, + "step": 72190 + }, + { + "epoch": 0.1533984285159605, + "grad_norm": 0.34651023149490356, + "learning_rate": 0.00019099497765356045, + "loss": 1.23, + "step": 72200 + }, + { + "epoch": 0.15341967483569954, + "grad_norm": 0.3569962978363037, + "learning_rate": 0.0001909921528086692, + "loss": 1.21, + "step": 72210 + }, + { + "epoch": 0.15344092115543861, + "grad_norm": 0.39641109108924866, + "learning_rate": 0.00019098932754167192, + "loss": 1.1926, + "step": 72220 + }, + { + "epoch": 0.15346216747517766, + "grad_norm": 0.3571265637874603, + "learning_rate": 0.00019098650185258182, + "loss": 1.1727, + "step": 72230 + }, + { + "epoch": 0.1534834137949167, + "grad_norm": 0.3155643939971924, + "learning_rate": 0.00019098367574141192, + "loss": 1.2315, + "step": 72240 + }, + { + "epoch": 0.15350466011465577, + "grad_norm": 0.34573671221733093, + "learning_rate": 0.00019098084920817537, + "loss": 1.2217, + "step": 72250 + }, + { + "epoch": 0.15352590643439482, + "grad_norm": 0.3445620834827423, + "learning_rate": 0.00019097802225288523, + "loss": 1.1907, + "step": 72260 + }, + { + "epoch": 0.15354715275413386, + "grad_norm": 0.33625051379203796, + "learning_rate": 0.0001909751948755547, + "loss": 1.222, + "step": 72270 + }, + { + "epoch": 0.15356839907387293, + "grad_norm": 0.4089798331260681, + "learning_rate": 0.00019097236707619683, + "loss": 1.1676, + "step": 72280 + }, + { + "epoch": 0.15358964539361197, + "grad_norm": 0.3922787308692932, + "learning_rate": 0.00019096953885482475, + "loss": 1.2064, + "step": 72290 + }, + { + "epoch": 0.15361089171335102, + "grad_norm": 0.3511156737804413, + "learning_rate": 0.00019096671021145162, + "loss": 1.1972, + "step": 72300 + }, + { + "epoch": 0.1536321380330901, + "grad_norm": 0.3378687798976898, + "learning_rate": 0.00019096388114609052, + "loss": 1.2045, + "step": 72310 + }, + { + "epoch": 0.15365338435282913, + "grad_norm": 0.4219684898853302, + "learning_rate": 0.00019096105165875455, + "loss": 1.2397, + "step": 72320 + }, + { + "epoch": 0.15367463067256817, + "grad_norm": 0.3821120858192444, + "learning_rate": 0.0001909582217494569, + "loss": 1.2141, + "step": 72330 + }, + { + "epoch": 0.15369587699230725, + "grad_norm": 0.40089961886405945, + "learning_rate": 0.00019095539141821067, + "loss": 1.1734, + "step": 72340 + }, + { + "epoch": 0.1537171233120463, + "grad_norm": 0.43146654963493347, + "learning_rate": 0.00019095256066502898, + "loss": 1.2075, + "step": 72350 + }, + { + "epoch": 0.15373836963178533, + "grad_norm": 0.43825286626815796, + "learning_rate": 0.00019094972948992495, + "loss": 1.2222, + "step": 72360 + }, + { + "epoch": 0.1537596159515244, + "grad_norm": 0.4638819992542267, + "learning_rate": 0.0001909468978929118, + "loss": 1.1991, + "step": 72370 + }, + { + "epoch": 0.15378086227126345, + "grad_norm": 0.4201925992965698, + "learning_rate": 0.0001909440658740025, + "loss": 1.1529, + "step": 72380 + }, + { + "epoch": 0.1538021085910025, + "grad_norm": 0.4601966142654419, + "learning_rate": 0.00019094123343321034, + "loss": 1.2131, + "step": 72390 + }, + { + "epoch": 0.15382335491074156, + "grad_norm": 0.4584645926952362, + "learning_rate": 0.0001909384005705484, + "loss": 1.1679, + "step": 72400 + }, + { + "epoch": 0.1538446012304806, + "grad_norm": 0.32270950078964233, + "learning_rate": 0.00019093556728602982, + "loss": 1.1951, + "step": 72410 + }, + { + "epoch": 0.15386584755021965, + "grad_norm": 0.3109826147556305, + "learning_rate": 0.00019093273357966774, + "loss": 1.2496, + "step": 72420 + }, + { + "epoch": 0.15388709386995872, + "grad_norm": 0.3608633279800415, + "learning_rate": 0.00019092989945147535, + "loss": 1.2162, + "step": 72430 + }, + { + "epoch": 0.15390834018969776, + "grad_norm": 0.3413093388080597, + "learning_rate": 0.00019092706490146574, + "loss": 1.2175, + "step": 72440 + }, + { + "epoch": 0.1539295865094368, + "grad_norm": 0.3496948480606079, + "learning_rate": 0.0001909242299296521, + "loss": 1.2042, + "step": 72450 + }, + { + "epoch": 0.15395083282917588, + "grad_norm": 0.3420344591140747, + "learning_rate": 0.0001909213945360475, + "loss": 1.2484, + "step": 72460 + }, + { + "epoch": 0.15397207914891492, + "grad_norm": 0.3560081124305725, + "learning_rate": 0.0001909185587206652, + "loss": 1.2228, + "step": 72470 + }, + { + "epoch": 0.15399332546865396, + "grad_norm": 0.3119393587112427, + "learning_rate": 0.00019091572248351835, + "loss": 1.1932, + "step": 72480 + }, + { + "epoch": 0.15401457178839303, + "grad_norm": 0.353609561920166, + "learning_rate": 0.00019091288582462001, + "loss": 1.1859, + "step": 72490 + }, + { + "epoch": 0.15403581810813208, + "grad_norm": 0.31408196687698364, + "learning_rate": 0.0001909100487439834, + "loss": 1.2058, + "step": 72500 + }, + { + "epoch": 0.15405706442787112, + "grad_norm": 0.3346222937107086, + "learning_rate": 0.0001909072112416217, + "loss": 1.2078, + "step": 72510 + }, + { + "epoch": 0.1540783107476102, + "grad_norm": 0.3368677794933319, + "learning_rate": 0.00019090437331754806, + "loss": 1.207, + "step": 72520 + }, + { + "epoch": 0.15409955706734924, + "grad_norm": 0.5678977370262146, + "learning_rate": 0.00019090153497177564, + "loss": 1.2162, + "step": 72530 + }, + { + "epoch": 0.1541208033870883, + "grad_norm": 0.3265738785266876, + "learning_rate": 0.00019089869620431756, + "loss": 1.1897, + "step": 72540 + }, + { + "epoch": 0.15414204970682735, + "grad_norm": 0.3203190565109253, + "learning_rate": 0.00019089585701518707, + "loss": 1.1672, + "step": 72550 + }, + { + "epoch": 0.1541632960265664, + "grad_norm": 0.32052287459373474, + "learning_rate": 0.0001908930174043973, + "loss": 1.2129, + "step": 72560 + }, + { + "epoch": 0.15418454234630546, + "grad_norm": 0.44547152519226074, + "learning_rate": 0.0001908901773719614, + "loss": 1.2025, + "step": 72570 + }, + { + "epoch": 0.1542057886660445, + "grad_norm": 0.41468846797943115, + "learning_rate": 0.0001908873369178926, + "loss": 1.1752, + "step": 72580 + }, + { + "epoch": 0.15422703498578355, + "grad_norm": 0.4808143079280853, + "learning_rate": 0.00019088449604220403, + "loss": 1.2351, + "step": 72590 + }, + { + "epoch": 0.15424828130552262, + "grad_norm": 0.3551034927368164, + "learning_rate": 0.00019088165474490887, + "loss": 1.1731, + "step": 72600 + }, + { + "epoch": 0.15426952762526167, + "grad_norm": 0.3699586093425751, + "learning_rate": 0.00019087881302602035, + "loss": 1.1846, + "step": 72610 + }, + { + "epoch": 0.1542907739450007, + "grad_norm": 0.32895076274871826, + "learning_rate": 0.0001908759708855516, + "loss": 1.199, + "step": 72620 + }, + { + "epoch": 0.15431202026473978, + "grad_norm": 0.34158867597579956, + "learning_rate": 0.0001908731283235158, + "loss": 1.2119, + "step": 72630 + }, + { + "epoch": 0.15433326658447882, + "grad_norm": 0.3442084789276123, + "learning_rate": 0.0001908702853399262, + "loss": 1.2329, + "step": 72640 + }, + { + "epoch": 0.15435451290421787, + "grad_norm": 0.32613101601600647, + "learning_rate": 0.00019086744193479594, + "loss": 1.1682, + "step": 72650 + }, + { + "epoch": 0.15437575922395694, + "grad_norm": 0.5150797367095947, + "learning_rate": 0.0001908645981081382, + "loss": 1.2375, + "step": 72660 + }, + { + "epoch": 0.15439700554369598, + "grad_norm": 0.3485882580280304, + "learning_rate": 0.0001908617538599662, + "loss": 1.2318, + "step": 72670 + }, + { + "epoch": 0.15441825186343502, + "grad_norm": 0.4124588668346405, + "learning_rate": 0.00019085890919029312, + "loss": 1.1873, + "step": 72680 + }, + { + "epoch": 0.1544394981831741, + "grad_norm": 0.7097911834716797, + "learning_rate": 0.00019085606409913216, + "loss": 1.219, + "step": 72690 + }, + { + "epoch": 0.15446074450291314, + "grad_norm": 0.4379788339138031, + "learning_rate": 0.00019085321858649653, + "loss": 1.2072, + "step": 72700 + }, + { + "epoch": 0.15448199082265218, + "grad_norm": 0.32537776231765747, + "learning_rate": 0.00019085037265239942, + "loss": 1.159, + "step": 72710 + }, + { + "epoch": 0.15450323714239125, + "grad_norm": 0.43690600991249084, + "learning_rate": 0.00019084752629685404, + "loss": 1.2071, + "step": 72720 + }, + { + "epoch": 0.1545244834621303, + "grad_norm": 0.419170081615448, + "learning_rate": 0.00019084467951987355, + "loss": 1.1749, + "step": 72730 + }, + { + "epoch": 0.15454572978186934, + "grad_norm": 0.3678804636001587, + "learning_rate": 0.0001908418323214712, + "loss": 1.2081, + "step": 72740 + }, + { + "epoch": 0.1545669761016084, + "grad_norm": 0.3618285655975342, + "learning_rate": 0.00019083898470166025, + "loss": 1.2173, + "step": 72750 + }, + { + "epoch": 0.15458822242134745, + "grad_norm": 0.3080029785633087, + "learning_rate": 0.00019083613666045376, + "loss": 1.2223, + "step": 72760 + }, + { + "epoch": 0.1546094687410865, + "grad_norm": 0.3582988381385803, + "learning_rate": 0.00019083328819786508, + "loss": 1.1798, + "step": 72770 + }, + { + "epoch": 0.15463071506082557, + "grad_norm": 0.34265395998954773, + "learning_rate": 0.0001908304393139074, + "loss": 1.199, + "step": 72780 + }, + { + "epoch": 0.1546519613805646, + "grad_norm": 0.441554456949234, + "learning_rate": 0.00019082759000859384, + "loss": 1.1944, + "step": 72790 + }, + { + "epoch": 0.15467320770030366, + "grad_norm": 0.3979502022266388, + "learning_rate": 0.00019082474028193772, + "loss": 1.1841, + "step": 72800 + }, + { + "epoch": 0.15469445402004273, + "grad_norm": 0.4929479956626892, + "learning_rate": 0.00019082189013395223, + "loss": 1.18, + "step": 72810 + }, + { + "epoch": 0.15471570033978177, + "grad_norm": 0.3127864897251129, + "learning_rate": 0.0001908190395646506, + "loss": 1.2201, + "step": 72820 + }, + { + "epoch": 0.1547369466595208, + "grad_norm": 0.32028812170028687, + "learning_rate": 0.00019081618857404606, + "loss": 1.233, + "step": 72830 + }, + { + "epoch": 0.15475819297925988, + "grad_norm": 0.3984890580177307, + "learning_rate": 0.00019081333716215179, + "loss": 1.2321, + "step": 72840 + }, + { + "epoch": 0.15477943929899893, + "grad_norm": 0.34912678599357605, + "learning_rate": 0.00019081048532898108, + "loss": 1.1965, + "step": 72850 + }, + { + "epoch": 0.15480068561873797, + "grad_norm": 0.3567503094673157, + "learning_rate": 0.00019080763307454708, + "loss": 1.1789, + "step": 72860 + }, + { + "epoch": 0.15482193193847704, + "grad_norm": 0.36893290281295776, + "learning_rate": 0.00019080478039886308, + "loss": 1.2493, + "step": 72870 + }, + { + "epoch": 0.15484317825821609, + "grad_norm": 0.42813676595687866, + "learning_rate": 0.0001908019273019423, + "loss": 1.192, + "step": 72880 + }, + { + "epoch": 0.15486442457795513, + "grad_norm": 0.36415895819664, + "learning_rate": 0.000190799073783798, + "loss": 1.2431, + "step": 72890 + }, + { + "epoch": 0.1548856708976942, + "grad_norm": 0.34259289503097534, + "learning_rate": 0.00019079621984444337, + "loss": 1.2222, + "step": 72900 + }, + { + "epoch": 0.15490691721743324, + "grad_norm": 0.4477185010910034, + "learning_rate": 0.0001907933654838917, + "loss": 1.2478, + "step": 72910 + }, + { + "epoch": 0.1549281635371723, + "grad_norm": 0.47893714904785156, + "learning_rate": 0.00019079051070215614, + "loss": 1.2365, + "step": 72920 + }, + { + "epoch": 0.15494940985691136, + "grad_norm": 0.3144942820072174, + "learning_rate": 0.00019078765549925003, + "loss": 1.2297, + "step": 72930 + }, + { + "epoch": 0.1549706561766504, + "grad_norm": 0.3927861154079437, + "learning_rate": 0.00019078479987518662, + "loss": 1.2051, + "step": 72940 + }, + { + "epoch": 0.15499190249638944, + "grad_norm": 0.3434321880340576, + "learning_rate": 0.00019078194382997907, + "loss": 1.1924, + "step": 72950 + }, + { + "epoch": 0.15501314881612852, + "grad_norm": 0.34518852829933167, + "learning_rate": 0.0001907790873636407, + "loss": 1.1996, + "step": 72960 + }, + { + "epoch": 0.15503439513586756, + "grad_norm": 0.44912633299827576, + "learning_rate": 0.00019077623047618475, + "loss": 1.2005, + "step": 72970 + }, + { + "epoch": 0.1550556414556066, + "grad_norm": 0.3678351938724518, + "learning_rate": 0.00019077337316762442, + "loss": 1.1785, + "step": 72980 + }, + { + "epoch": 0.15507688777534567, + "grad_norm": 0.3515671193599701, + "learning_rate": 0.00019077051543797303, + "loss": 1.139, + "step": 72990 + }, + { + "epoch": 0.15509813409508472, + "grad_norm": 0.39694738388061523, + "learning_rate": 0.0001907676572872438, + "loss": 1.21, + "step": 73000 + }, + { + "epoch": 0.15511938041482376, + "grad_norm": 0.32010477781295776, + "learning_rate": 0.00019076479871545004, + "loss": 1.197, + "step": 73010 + }, + { + "epoch": 0.15514062673456283, + "grad_norm": 0.4236551821231842, + "learning_rate": 0.00019076193972260495, + "loss": 1.1473, + "step": 73020 + }, + { + "epoch": 0.15516187305430187, + "grad_norm": 0.38045069575309753, + "learning_rate": 0.0001907590803087218, + "loss": 1.188, + "step": 73030 + }, + { + "epoch": 0.15518311937404092, + "grad_norm": 0.5868270993232727, + "learning_rate": 0.0001907562204738139, + "loss": 1.1798, + "step": 73040 + }, + { + "epoch": 0.15520436569378, + "grad_norm": 0.38532519340515137, + "learning_rate": 0.00019075336021789446, + "loss": 1.1936, + "step": 73050 + }, + { + "epoch": 0.15522561201351903, + "grad_norm": 0.3978126049041748, + "learning_rate": 0.00019075049954097678, + "loss": 1.2449, + "step": 73060 + }, + { + "epoch": 0.15524685833325808, + "grad_norm": 0.3749487102031708, + "learning_rate": 0.00019074763844307416, + "loss": 1.2116, + "step": 73070 + }, + { + "epoch": 0.15526810465299715, + "grad_norm": 0.4327516257762909, + "learning_rate": 0.0001907447769241998, + "loss": 1.2225, + "step": 73080 + }, + { + "epoch": 0.1552893509727362, + "grad_norm": 0.5121191740036011, + "learning_rate": 0.000190741914984367, + "loss": 1.2231, + "step": 73090 + }, + { + "epoch": 0.15531059729247523, + "grad_norm": 0.42899566888809204, + "learning_rate": 0.0001907390526235891, + "loss": 1.2461, + "step": 73100 + }, + { + "epoch": 0.1553318436122143, + "grad_norm": 0.3393835127353668, + "learning_rate": 0.00019073618984187928, + "loss": 1.2295, + "step": 73110 + }, + { + "epoch": 0.15535308993195335, + "grad_norm": 0.40219351649284363, + "learning_rate": 0.00019073332663925086, + "loss": 1.1796, + "step": 73120 + }, + { + "epoch": 0.1553743362516924, + "grad_norm": 0.3297693729400635, + "learning_rate": 0.00019073046301571717, + "loss": 1.2217, + "step": 73130 + }, + { + "epoch": 0.15539558257143146, + "grad_norm": 0.528430163860321, + "learning_rate": 0.00019072759897129142, + "loss": 1.2417, + "step": 73140 + }, + { + "epoch": 0.1554168288911705, + "grad_norm": 0.3474425971508026, + "learning_rate": 0.00019072473450598696, + "loss": 1.1797, + "step": 73150 + }, + { + "epoch": 0.15543807521090955, + "grad_norm": 0.3906219005584717, + "learning_rate": 0.00019072186961981703, + "loss": 1.1939, + "step": 73160 + }, + { + "epoch": 0.15545932153064862, + "grad_norm": 0.32716941833496094, + "learning_rate": 0.00019071900431279491, + "loss": 1.2302, + "step": 73170 + }, + { + "epoch": 0.15548056785038766, + "grad_norm": 0.43320921063423157, + "learning_rate": 0.00019071613858493398, + "loss": 1.2284, + "step": 73180 + }, + { + "epoch": 0.1555018141701267, + "grad_norm": 0.4058789908885956, + "learning_rate": 0.00019071327243624743, + "loss": 1.2196, + "step": 73190 + }, + { + "epoch": 0.15552306048986578, + "grad_norm": 0.5767582654953003, + "learning_rate": 0.00019071040586674865, + "loss": 1.1952, + "step": 73200 + }, + { + "epoch": 0.15554430680960482, + "grad_norm": 0.3486863970756531, + "learning_rate": 0.00019070753887645084, + "loss": 1.2062, + "step": 73210 + }, + { + "epoch": 0.15556555312934386, + "grad_norm": 0.3281555473804474, + "learning_rate": 0.00019070467146536734, + "loss": 1.2139, + "step": 73220 + }, + { + "epoch": 0.15558679944908294, + "grad_norm": 0.6078891754150391, + "learning_rate": 0.00019070180363351148, + "loss": 1.2076, + "step": 73230 + }, + { + "epoch": 0.15560804576882198, + "grad_norm": 0.3370426893234253, + "learning_rate": 0.00019069893538089654, + "loss": 1.2473, + "step": 73240 + }, + { + "epoch": 0.15562929208856102, + "grad_norm": 0.33160659670829773, + "learning_rate": 0.00019069606670753584, + "loss": 1.2026, + "step": 73250 + }, + { + "epoch": 0.1556505384083001, + "grad_norm": 0.37002432346343994, + "learning_rate": 0.00019069319761344264, + "loss": 1.2367, + "step": 73260 + }, + { + "epoch": 0.15567178472803914, + "grad_norm": 0.4452435374259949, + "learning_rate": 0.0001906903280986303, + "loss": 1.1933, + "step": 73270 + }, + { + "epoch": 0.15569303104777818, + "grad_norm": 0.35648107528686523, + "learning_rate": 0.00019068745816311214, + "loss": 1.1915, + "step": 73280 + }, + { + "epoch": 0.15571427736751725, + "grad_norm": 0.3892822861671448, + "learning_rate": 0.00019068458780690143, + "loss": 1.2049, + "step": 73290 + }, + { + "epoch": 0.1557355236872563, + "grad_norm": 0.34342753887176514, + "learning_rate": 0.0001906817170300115, + "loss": 1.1791, + "step": 73300 + }, + { + "epoch": 0.15575677000699534, + "grad_norm": 0.39494389295578003, + "learning_rate": 0.00019067884583245568, + "loss": 1.2165, + "step": 73310 + }, + { + "epoch": 0.1557780163267344, + "grad_norm": 0.6547341346740723, + "learning_rate": 0.00019067597421424726, + "loss": 1.2115, + "step": 73320 + }, + { + "epoch": 0.15579926264647345, + "grad_norm": 0.5799471735954285, + "learning_rate": 0.00019067310217539963, + "loss": 1.1921, + "step": 73330 + }, + { + "epoch": 0.1558205089662125, + "grad_norm": 0.3992749750614166, + "learning_rate": 0.00019067022971592604, + "loss": 1.2246, + "step": 73340 + }, + { + "epoch": 0.15584175528595157, + "grad_norm": 0.43284910917282104, + "learning_rate": 0.00019066735683583982, + "loss": 1.2054, + "step": 73350 + }, + { + "epoch": 0.1558630016056906, + "grad_norm": 0.33387935161590576, + "learning_rate": 0.00019066448353515436, + "loss": 1.2091, + "step": 73360 + }, + { + "epoch": 0.15588424792542965, + "grad_norm": 0.314340740442276, + "learning_rate": 0.0001906616098138829, + "loss": 1.2086, + "step": 73370 + }, + { + "epoch": 0.15590549424516872, + "grad_norm": 0.37107592821121216, + "learning_rate": 0.00019065873567203887, + "loss": 1.2, + "step": 73380 + }, + { + "epoch": 0.15592674056490777, + "grad_norm": 0.3270666301250458, + "learning_rate": 0.00019065586110963553, + "loss": 1.1943, + "step": 73390 + }, + { + "epoch": 0.15594798688464684, + "grad_norm": 0.38415855169296265, + "learning_rate": 0.00019065298612668623, + "loss": 1.1739, + "step": 73400 + }, + { + "epoch": 0.15596923320438588, + "grad_norm": 0.3808116912841797, + "learning_rate": 0.0001906501107232043, + "loss": 1.2118, + "step": 73410 + }, + { + "epoch": 0.15599047952412493, + "grad_norm": 0.494716614484787, + "learning_rate": 0.00019064723489920312, + "loss": 1.185, + "step": 73420 + }, + { + "epoch": 0.156011725843864, + "grad_norm": 0.42262402176856995, + "learning_rate": 0.000190644358654696, + "loss": 1.2243, + "step": 73430 + }, + { + "epoch": 0.15603297216360304, + "grad_norm": 0.347709983587265, + "learning_rate": 0.00019064148198969628, + "loss": 1.182, + "step": 73440 + }, + { + "epoch": 0.15605421848334208, + "grad_norm": 0.35601717233657837, + "learning_rate": 0.0001906386049042173, + "loss": 1.2269, + "step": 73450 + }, + { + "epoch": 0.15607546480308115, + "grad_norm": 0.34533774852752686, + "learning_rate": 0.0001906357273982724, + "loss": 1.1571, + "step": 73460 + }, + { + "epoch": 0.1560967111228202, + "grad_norm": 0.35767483711242676, + "learning_rate": 0.000190632849471875, + "loss": 1.223, + "step": 73470 + }, + { + "epoch": 0.15611795744255924, + "grad_norm": 0.36172544956207275, + "learning_rate": 0.00019062997112503834, + "loss": 1.2079, + "step": 73480 + }, + { + "epoch": 0.1561392037622983, + "grad_norm": 0.3793455958366394, + "learning_rate": 0.00019062709235777585, + "loss": 1.2059, + "step": 73490 + }, + { + "epoch": 0.15616045008203736, + "grad_norm": 0.4261929988861084, + "learning_rate": 0.00019062421317010087, + "loss": 1.189, + "step": 73500 + }, + { + "epoch": 0.1561816964017764, + "grad_norm": 0.3355870842933655, + "learning_rate": 0.00019062133356202674, + "loss": 1.191, + "step": 73510 + }, + { + "epoch": 0.15620294272151547, + "grad_norm": 0.3600960969924927, + "learning_rate": 0.00019061845353356683, + "loss": 1.2114, + "step": 73520 + }, + { + "epoch": 0.1562241890412545, + "grad_norm": 0.32562172412872314, + "learning_rate": 0.0001906155730847345, + "loss": 1.183, + "step": 73530 + }, + { + "epoch": 0.15624543536099356, + "grad_norm": 0.37308138608932495, + "learning_rate": 0.00019061269221554313, + "loss": 1.185, + "step": 73540 + }, + { + "epoch": 0.15626668168073263, + "grad_norm": 0.3513392210006714, + "learning_rate": 0.00019060981092600604, + "loss": 1.193, + "step": 73550 + }, + { + "epoch": 0.15628792800047167, + "grad_norm": 0.325904905796051, + "learning_rate": 0.00019060692921613658, + "loss": 1.1853, + "step": 73560 + }, + { + "epoch": 0.15630917432021071, + "grad_norm": 0.3600985109806061, + "learning_rate": 0.0001906040470859482, + "loss": 1.229, + "step": 73570 + }, + { + "epoch": 0.15633042063994979, + "grad_norm": 0.3561075031757355, + "learning_rate": 0.00019060116453545426, + "loss": 1.1862, + "step": 73580 + }, + { + "epoch": 0.15635166695968883, + "grad_norm": 0.3691885769367218, + "learning_rate": 0.00019059828156466803, + "loss": 1.2251, + "step": 73590 + }, + { + "epoch": 0.15637291327942787, + "grad_norm": 0.3361085057258606, + "learning_rate": 0.000190595398173603, + "loss": 1.207, + "step": 73600 + }, + { + "epoch": 0.15639415959916694, + "grad_norm": 0.3435630202293396, + "learning_rate": 0.0001905925143622725, + "loss": 1.1846, + "step": 73610 + }, + { + "epoch": 0.156415405918906, + "grad_norm": 0.429837703704834, + "learning_rate": 0.00019058963013068988, + "loss": 1.1733, + "step": 73620 + }, + { + "epoch": 0.15643665223864503, + "grad_norm": 0.3306243419647217, + "learning_rate": 0.00019058674547886857, + "loss": 1.1683, + "step": 73630 + }, + { + "epoch": 0.1564578985583841, + "grad_norm": 0.5333802103996277, + "learning_rate": 0.00019058386040682194, + "loss": 1.1906, + "step": 73640 + }, + { + "epoch": 0.15647914487812314, + "grad_norm": 0.3444839119911194, + "learning_rate": 0.00019058097491456334, + "loss": 1.1581, + "step": 73650 + }, + { + "epoch": 0.1565003911978622, + "grad_norm": 0.45993414521217346, + "learning_rate": 0.00019057808900210617, + "loss": 1.2329, + "step": 73660 + }, + { + "epoch": 0.15652163751760126, + "grad_norm": 0.41235294938087463, + "learning_rate": 0.00019057520266946382, + "loss": 1.1967, + "step": 73670 + }, + { + "epoch": 0.1565428838373403, + "grad_norm": 0.3975154161453247, + "learning_rate": 0.0001905723159166497, + "loss": 1.1718, + "step": 73680 + }, + { + "epoch": 0.15656413015707935, + "grad_norm": 0.4170859754085541, + "learning_rate": 0.00019056942874367717, + "loss": 1.2097, + "step": 73690 + }, + { + "epoch": 0.15658537647681842, + "grad_norm": 0.3281058967113495, + "learning_rate": 0.00019056654115055967, + "loss": 1.2185, + "step": 73700 + }, + { + "epoch": 0.15660662279655746, + "grad_norm": 0.365788996219635, + "learning_rate": 0.00019056365313731055, + "loss": 1.2037, + "step": 73710 + }, + { + "epoch": 0.1566278691162965, + "grad_norm": 0.36207354068756104, + "learning_rate": 0.00019056076470394325, + "loss": 1.1851, + "step": 73720 + }, + { + "epoch": 0.15664911543603557, + "grad_norm": 0.3352060317993164, + "learning_rate": 0.00019055787585047111, + "loss": 1.2351, + "step": 73730 + }, + { + "epoch": 0.15667036175577462, + "grad_norm": 0.4274722933769226, + "learning_rate": 0.00019055498657690758, + "loss": 1.1881, + "step": 73740 + }, + { + "epoch": 0.15669160807551366, + "grad_norm": 0.5387140512466431, + "learning_rate": 0.000190552096883266, + "loss": 1.1955, + "step": 73750 + }, + { + "epoch": 0.15671285439525273, + "grad_norm": 0.3341343104839325, + "learning_rate": 0.00019054920676955987, + "loss": 1.1631, + "step": 73760 + }, + { + "epoch": 0.15673410071499178, + "grad_norm": 0.40224120020866394, + "learning_rate": 0.00019054631623580253, + "loss": 1.2511, + "step": 73770 + }, + { + "epoch": 0.15675534703473082, + "grad_norm": 0.34294646978378296, + "learning_rate": 0.0001905434252820074, + "loss": 1.2137, + "step": 73780 + }, + { + "epoch": 0.1567765933544699, + "grad_norm": 0.35757583379745483, + "learning_rate": 0.0001905405339081879, + "loss": 1.1738, + "step": 73790 + }, + { + "epoch": 0.15679783967420893, + "grad_norm": 0.346462607383728, + "learning_rate": 0.00019053764211435746, + "loss": 1.2298, + "step": 73800 + }, + { + "epoch": 0.15681908599394798, + "grad_norm": 0.3422798216342926, + "learning_rate": 0.0001905347499005295, + "loss": 1.1474, + "step": 73810 + }, + { + "epoch": 0.15684033231368705, + "grad_norm": 0.41216233372688293, + "learning_rate": 0.00019053185726671735, + "loss": 1.1957, + "step": 73820 + }, + { + "epoch": 0.1568615786334261, + "grad_norm": 0.3743588328361511, + "learning_rate": 0.00019052896421293451, + "loss": 1.2191, + "step": 73830 + }, + { + "epoch": 0.15688282495316513, + "grad_norm": 0.3453773260116577, + "learning_rate": 0.0001905260707391944, + "loss": 1.2145, + "step": 73840 + }, + { + "epoch": 0.1569040712729042, + "grad_norm": 0.48155272006988525, + "learning_rate": 0.0001905231768455104, + "loss": 1.2075, + "step": 73850 + }, + { + "epoch": 0.15692531759264325, + "grad_norm": 0.9350757598876953, + "learning_rate": 0.000190520282531896, + "loss": 1.1735, + "step": 73860 + }, + { + "epoch": 0.1569465639123823, + "grad_norm": 0.33079731464385986, + "learning_rate": 0.00019051738779836454, + "loss": 1.2489, + "step": 73870 + }, + { + "epoch": 0.15696781023212136, + "grad_norm": 0.33121949434280396, + "learning_rate": 0.00019051449264492952, + "loss": 1.1932, + "step": 73880 + }, + { + "epoch": 0.1569890565518604, + "grad_norm": 0.36686232686042786, + "learning_rate": 0.00019051159707160435, + "loss": 1.1721, + "step": 73890 + }, + { + "epoch": 0.15701030287159945, + "grad_norm": 0.3187958002090454, + "learning_rate": 0.00019050870107840243, + "loss": 1.2158, + "step": 73900 + }, + { + "epoch": 0.15703154919133852, + "grad_norm": 0.3199542164802551, + "learning_rate": 0.00019050580466533723, + "loss": 1.2557, + "step": 73910 + }, + { + "epoch": 0.15705279551107756, + "grad_norm": 0.3451758027076721, + "learning_rate": 0.00019050290783242217, + "loss": 1.2152, + "step": 73920 + }, + { + "epoch": 0.1570740418308166, + "grad_norm": 0.3307511508464813, + "learning_rate": 0.00019050001057967073, + "loss": 1.2234, + "step": 73930 + }, + { + "epoch": 0.15709528815055568, + "grad_norm": 0.4195064306259155, + "learning_rate": 0.00019049711290709627, + "loss": 1.2178, + "step": 73940 + }, + { + "epoch": 0.15711653447029472, + "grad_norm": 0.35167205333709717, + "learning_rate": 0.00019049421481471233, + "loss": 1.2021, + "step": 73950 + }, + { + "epoch": 0.15713778079003377, + "grad_norm": 0.3287941813468933, + "learning_rate": 0.00019049131630253225, + "loss": 1.1614, + "step": 73960 + }, + { + "epoch": 0.15715902710977284, + "grad_norm": 0.4161126911640167, + "learning_rate": 0.00019048841737056955, + "loss": 1.2008, + "step": 73970 + }, + { + "epoch": 0.15718027342951188, + "grad_norm": 0.5665867328643799, + "learning_rate": 0.00019048551801883768, + "loss": 1.1885, + "step": 73980 + }, + { + "epoch": 0.15720151974925092, + "grad_norm": 0.4273276627063751, + "learning_rate": 0.00019048261824735003, + "loss": 1.1897, + "step": 73990 + }, + { + "epoch": 0.15722276606899, + "grad_norm": 0.48293375968933105, + "learning_rate": 0.0001904797180561201, + "loss": 1.1977, + "step": 74000 + }, + { + "epoch": 0.15724401238872904, + "grad_norm": 0.433146595954895, + "learning_rate": 0.0001904768174451613, + "loss": 1.2224, + "step": 74010 + }, + { + "epoch": 0.15726525870846808, + "grad_norm": 0.3634732961654663, + "learning_rate": 0.00019047391641448716, + "loss": 1.1913, + "step": 74020 + }, + { + "epoch": 0.15728650502820715, + "grad_norm": 0.45016515254974365, + "learning_rate": 0.00019047101496411109, + "loss": 1.235, + "step": 74030 + }, + { + "epoch": 0.1573077513479462, + "grad_norm": 0.3978499174118042, + "learning_rate": 0.00019046811309404653, + "loss": 1.2103, + "step": 74040 + }, + { + "epoch": 0.15732899766768524, + "grad_norm": 0.3448773920536041, + "learning_rate": 0.00019046521080430697, + "loss": 1.2117, + "step": 74050 + }, + { + "epoch": 0.1573502439874243, + "grad_norm": 0.29934725165367126, + "learning_rate": 0.0001904623080949059, + "loss": 1.1811, + "step": 74060 + }, + { + "epoch": 0.15737149030716335, + "grad_norm": 0.33272087574005127, + "learning_rate": 0.00019045940496585672, + "loss": 1.2185, + "step": 74070 + }, + { + "epoch": 0.1573927366269024, + "grad_norm": 0.3759908080101013, + "learning_rate": 0.00019045650141717295, + "loss": 1.1986, + "step": 74080 + }, + { + "epoch": 0.15741398294664147, + "grad_norm": 0.31674060225486755, + "learning_rate": 0.00019045359744886803, + "loss": 1.2077, + "step": 74090 + }, + { + "epoch": 0.1574352292663805, + "grad_norm": 0.3190078139305115, + "learning_rate": 0.00019045069306095544, + "loss": 1.2348, + "step": 74100 + }, + { + "epoch": 0.15745647558611955, + "grad_norm": 0.3784289062023163, + "learning_rate": 0.00019044778825344868, + "loss": 1.2102, + "step": 74110 + }, + { + "epoch": 0.15747772190585863, + "grad_norm": 0.32383832335472107, + "learning_rate": 0.00019044488302636116, + "loss": 1.2023, + "step": 74120 + }, + { + "epoch": 0.15749896822559767, + "grad_norm": 0.3191031515598297, + "learning_rate": 0.00019044197737970645, + "loss": 1.2082, + "step": 74130 + }, + { + "epoch": 0.1575202145453367, + "grad_norm": 0.3136880695819855, + "learning_rate": 0.00019043907131349795, + "loss": 1.1984, + "step": 74140 + }, + { + "epoch": 0.15754146086507578, + "grad_norm": 0.34523889422416687, + "learning_rate": 0.00019043616482774915, + "loss": 1.1774, + "step": 74150 + }, + { + "epoch": 0.15756270718481483, + "grad_norm": 0.3149208426475525, + "learning_rate": 0.0001904332579224736, + "loss": 1.2011, + "step": 74160 + }, + { + "epoch": 0.15758395350455387, + "grad_norm": 0.3883579969406128, + "learning_rate": 0.00019043035059768468, + "loss": 1.2105, + "step": 74170 + }, + { + "epoch": 0.15760519982429294, + "grad_norm": 0.535163938999176, + "learning_rate": 0.00019042744285339597, + "loss": 1.2289, + "step": 74180 + }, + { + "epoch": 0.15762644614403198, + "grad_norm": 0.44346991181373596, + "learning_rate": 0.0001904245346896209, + "loss": 1.2202, + "step": 74190 + }, + { + "epoch": 0.15764769246377103, + "grad_norm": 0.34198397397994995, + "learning_rate": 0.00019042162610637302, + "loss": 1.1799, + "step": 74200 + }, + { + "epoch": 0.1576689387835101, + "grad_norm": 0.4147062599658966, + "learning_rate": 0.00019041871710366576, + "loss": 1.2136, + "step": 74210 + }, + { + "epoch": 0.15769018510324914, + "grad_norm": 0.3462279438972473, + "learning_rate": 0.00019041580768151264, + "loss": 1.177, + "step": 74220 + }, + { + "epoch": 0.1577114314229882, + "grad_norm": 0.41350215673446655, + "learning_rate": 0.00019041289783992715, + "loss": 1.1516, + "step": 74230 + }, + { + "epoch": 0.15773267774272726, + "grad_norm": 0.34493234753608704, + "learning_rate": 0.0001904099875789228, + "loss": 1.1966, + "step": 74240 + }, + { + "epoch": 0.1577539240624663, + "grad_norm": 0.3511230945587158, + "learning_rate": 0.0001904070768985131, + "loss": 1.1835, + "step": 74250 + }, + { + "epoch": 0.15777517038220537, + "grad_norm": 0.36466583609580994, + "learning_rate": 0.0001904041657987115, + "loss": 1.233, + "step": 74260 + }, + { + "epoch": 0.15779641670194441, + "grad_norm": 0.48229891061782837, + "learning_rate": 0.0001904012542795316, + "loss": 1.2096, + "step": 74270 + }, + { + "epoch": 0.15781766302168346, + "grad_norm": 0.393523246049881, + "learning_rate": 0.0001903983423409868, + "loss": 1.2411, + "step": 74280 + }, + { + "epoch": 0.15783890934142253, + "grad_norm": 0.32579872012138367, + "learning_rate": 0.0001903954299830907, + "loss": 1.1624, + "step": 74290 + }, + { + "epoch": 0.15786015566116157, + "grad_norm": 0.37493982911109924, + "learning_rate": 0.0001903925172058567, + "loss": 1.2225, + "step": 74300 + }, + { + "epoch": 0.15788140198090062, + "grad_norm": 0.44234952330589294, + "learning_rate": 0.00019038960400929845, + "loss": 1.1945, + "step": 74310 + }, + { + "epoch": 0.1579026483006397, + "grad_norm": 0.34088626503944397, + "learning_rate": 0.00019038669039342935, + "loss": 1.1971, + "step": 74320 + }, + { + "epoch": 0.15792389462037873, + "grad_norm": 0.35000357031822205, + "learning_rate": 0.000190383776358263, + "loss": 1.2325, + "step": 74330 + }, + { + "epoch": 0.15794514094011777, + "grad_norm": 0.3439798653125763, + "learning_rate": 0.00019038086190381284, + "loss": 1.2217, + "step": 74340 + }, + { + "epoch": 0.15796638725985684, + "grad_norm": 0.32817766070365906, + "learning_rate": 0.00019037794703009243, + "loss": 1.2377, + "step": 74350 + }, + { + "epoch": 0.1579876335795959, + "grad_norm": 0.3425509035587311, + "learning_rate": 0.0001903750317371153, + "loss": 1.1985, + "step": 74360 + }, + { + "epoch": 0.15800887989933493, + "grad_norm": 0.3384675979614258, + "learning_rate": 0.00019037211602489497, + "loss": 1.245, + "step": 74370 + }, + { + "epoch": 0.158030126219074, + "grad_norm": 0.32760491967201233, + "learning_rate": 0.00019036919989344497, + "loss": 1.1973, + "step": 74380 + }, + { + "epoch": 0.15805137253881305, + "grad_norm": 0.33829638361930847, + "learning_rate": 0.0001903662833427788, + "loss": 1.199, + "step": 74390 + }, + { + "epoch": 0.1580726188585521, + "grad_norm": 0.33738452196121216, + "learning_rate": 0.00019036336637291, + "loss": 1.1965, + "step": 74400 + }, + { + "epoch": 0.15809386517829116, + "grad_norm": 0.3922808766365051, + "learning_rate": 0.0001903604489838521, + "loss": 1.2223, + "step": 74410 + }, + { + "epoch": 0.1581151114980302, + "grad_norm": 0.6151635646820068, + "learning_rate": 0.00019035753117561865, + "loss": 1.197, + "step": 74420 + }, + { + "epoch": 0.15813635781776925, + "grad_norm": 0.4553794860839844, + "learning_rate": 0.00019035461294822318, + "loss": 1.2492, + "step": 74430 + }, + { + "epoch": 0.15815760413750832, + "grad_norm": 0.38325175642967224, + "learning_rate": 0.00019035169430167923, + "loss": 1.1729, + "step": 74440 + }, + { + "epoch": 0.15817885045724736, + "grad_norm": 0.3351958096027374, + "learning_rate": 0.00019034877523600035, + "loss": 1.2341, + "step": 74450 + }, + { + "epoch": 0.1582000967769864, + "grad_norm": 0.44061586260795593, + "learning_rate": 0.00019034585575120004, + "loss": 1.1557, + "step": 74460 + }, + { + "epoch": 0.15822134309672548, + "grad_norm": 0.4165131747722626, + "learning_rate": 0.00019034293584729186, + "loss": 1.2304, + "step": 74470 + }, + { + "epoch": 0.15824258941646452, + "grad_norm": 0.4084938168525696, + "learning_rate": 0.00019034001552428937, + "loss": 1.1848, + "step": 74480 + }, + { + "epoch": 0.15826383573620356, + "grad_norm": 0.5313873887062073, + "learning_rate": 0.00019033709478220612, + "loss": 1.2127, + "step": 74490 + }, + { + "epoch": 0.15828508205594263, + "grad_norm": 0.2958909869194031, + "learning_rate": 0.00019033417362105567, + "loss": 1.2276, + "step": 74500 + }, + { + "epoch": 0.15830632837568168, + "grad_norm": 0.35004016757011414, + "learning_rate": 0.0001903312520408515, + "loss": 1.188, + "step": 74510 + }, + { + "epoch": 0.15832757469542072, + "grad_norm": 0.3160194456577301, + "learning_rate": 0.00019032833004160727, + "loss": 1.1796, + "step": 74520 + }, + { + "epoch": 0.1583488210151598, + "grad_norm": 0.33580392599105835, + "learning_rate": 0.00019032540762333646, + "loss": 1.2028, + "step": 74530 + }, + { + "epoch": 0.15837006733489883, + "grad_norm": 0.3351803123950958, + "learning_rate": 0.00019032248478605262, + "loss": 1.2235, + "step": 74540 + }, + { + "epoch": 0.15839131365463788, + "grad_norm": 0.30256298184394836, + "learning_rate": 0.00019031956152976934, + "loss": 1.2436, + "step": 74550 + }, + { + "epoch": 0.15841255997437695, + "grad_norm": 0.3779975473880768, + "learning_rate": 0.00019031663785450023, + "loss": 1.2093, + "step": 74560 + }, + { + "epoch": 0.158433806294116, + "grad_norm": 0.5153021216392517, + "learning_rate": 0.00019031371376025875, + "loss": 1.2549, + "step": 74570 + }, + { + "epoch": 0.15845505261385504, + "grad_norm": 0.3338010013103485, + "learning_rate": 0.0001903107892470585, + "loss": 1.2563, + "step": 74580 + }, + { + "epoch": 0.1584762989335941, + "grad_norm": 0.3655741810798645, + "learning_rate": 0.0001903078643149131, + "loss": 1.2166, + "step": 74590 + }, + { + "epoch": 0.15849754525333315, + "grad_norm": 0.38975805044174194, + "learning_rate": 0.00019030493896383604, + "loss": 1.22, + "step": 74600 + }, + { + "epoch": 0.1585187915730722, + "grad_norm": 0.3397411108016968, + "learning_rate": 0.00019030201319384095, + "loss": 1.2076, + "step": 74610 + }, + { + "epoch": 0.15854003789281126, + "grad_norm": 0.37066036462783813, + "learning_rate": 0.0001902990870049414, + "loss": 1.1752, + "step": 74620 + }, + { + "epoch": 0.1585612842125503, + "grad_norm": 0.32178518176078796, + "learning_rate": 0.00019029616039715092, + "loss": 1.163, + "step": 74630 + }, + { + "epoch": 0.15858253053228935, + "grad_norm": 0.32246655225753784, + "learning_rate": 0.0001902932333704831, + "loss": 1.2139, + "step": 74640 + }, + { + "epoch": 0.15860377685202842, + "grad_norm": 0.3422689437866211, + "learning_rate": 0.00019029030592495155, + "loss": 1.2367, + "step": 74650 + }, + { + "epoch": 0.15862502317176747, + "grad_norm": 0.35830822587013245, + "learning_rate": 0.00019028737806056983, + "loss": 1.2044, + "step": 74660 + }, + { + "epoch": 0.1586462694915065, + "grad_norm": 0.3753519058227539, + "learning_rate": 0.00019028444977735152, + "loss": 1.1942, + "step": 74670 + }, + { + "epoch": 0.15866751581124558, + "grad_norm": 0.5325493216514587, + "learning_rate": 0.00019028152107531022, + "loss": 1.209, + "step": 74680 + }, + { + "epoch": 0.15868876213098462, + "grad_norm": 0.43508195877075195, + "learning_rate": 0.0001902785919544595, + "loss": 1.2088, + "step": 74690 + }, + { + "epoch": 0.15871000845072367, + "grad_norm": 0.3664546310901642, + "learning_rate": 0.0001902756624148129, + "loss": 1.1733, + "step": 74700 + }, + { + "epoch": 0.15873125477046274, + "grad_norm": 0.3427225947380066, + "learning_rate": 0.0001902727324563841, + "loss": 1.1996, + "step": 74710 + }, + { + "epoch": 0.15875250109020178, + "grad_norm": 0.4796276092529297, + "learning_rate": 0.00019026980207918664, + "loss": 1.215, + "step": 74720 + }, + { + "epoch": 0.15877374740994082, + "grad_norm": 0.36288413405418396, + "learning_rate": 0.00019026687128323414, + "loss": 1.1885, + "step": 74730 + }, + { + "epoch": 0.1587949937296799, + "grad_norm": 0.3601849377155304, + "learning_rate": 0.00019026394006854021, + "loss": 1.2392, + "step": 74740 + }, + { + "epoch": 0.15881624004941894, + "grad_norm": 0.3199496567249298, + "learning_rate": 0.00019026100843511835, + "loss": 1.2212, + "step": 74750 + }, + { + "epoch": 0.15883748636915798, + "grad_norm": 0.3338780105113983, + "learning_rate": 0.00019025807638298228, + "loss": 1.1978, + "step": 74760 + }, + { + "epoch": 0.15885873268889705, + "grad_norm": 0.39791014790534973, + "learning_rate": 0.0001902551439121455, + "loss": 1.2076, + "step": 74770 + }, + { + "epoch": 0.1588799790086361, + "grad_norm": 0.35883399844169617, + "learning_rate": 0.00019025221102262173, + "loss": 1.1687, + "step": 74780 + }, + { + "epoch": 0.15890122532837514, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.00019024927771442448, + "loss": 1.2308, + "step": 74790 + }, + { + "epoch": 0.1589224716481142, + "grad_norm": 0.4482017755508423, + "learning_rate": 0.0001902463439875674, + "loss": 1.2434, + "step": 74800 + }, + { + "epoch": 0.15894371796785325, + "grad_norm": 0.44218796491622925, + "learning_rate": 0.00019024340984206406, + "loss": 1.1881, + "step": 74810 + }, + { + "epoch": 0.1589649642875923, + "grad_norm": 0.3554952144622803, + "learning_rate": 0.00019024047527792812, + "loss": 1.2221, + "step": 74820 + }, + { + "epoch": 0.15898621060733137, + "grad_norm": 0.3143041729927063, + "learning_rate": 0.00019023754029517319, + "loss": 1.1902, + "step": 74830 + }, + { + "epoch": 0.1590074569270704, + "grad_norm": 0.33546993136405945, + "learning_rate": 0.0001902346048938128, + "loss": 1.2144, + "step": 74840 + }, + { + "epoch": 0.15902870324680946, + "grad_norm": 0.4293029010295868, + "learning_rate": 0.00019023166907386068, + "loss": 1.2166, + "step": 74850 + }, + { + "epoch": 0.15904994956654853, + "grad_norm": 0.4819267988204956, + "learning_rate": 0.0001902287328353304, + "loss": 1.1949, + "step": 74860 + }, + { + "epoch": 0.15907119588628757, + "grad_norm": 0.4817706048488617, + "learning_rate": 0.00019022579617823557, + "loss": 1.2156, + "step": 74870 + }, + { + "epoch": 0.1590924422060266, + "grad_norm": 0.4878651201725006, + "learning_rate": 0.00019022285910258986, + "loss": 1.1818, + "step": 74880 + }, + { + "epoch": 0.15911368852576568, + "grad_norm": 0.36689162254333496, + "learning_rate": 0.0001902199216084068, + "loss": 1.2362, + "step": 74890 + }, + { + "epoch": 0.15913493484550473, + "grad_norm": 0.5347386002540588, + "learning_rate": 0.00019021698369570014, + "loss": 1.2246, + "step": 74900 + }, + { + "epoch": 0.15915618116524377, + "grad_norm": 0.3676193952560425, + "learning_rate": 0.0001902140453644834, + "loss": 1.1895, + "step": 74910 + }, + { + "epoch": 0.15917742748498284, + "grad_norm": 0.4801021218299866, + "learning_rate": 0.0001902111066147703, + "loss": 1.1906, + "step": 74920 + }, + { + "epoch": 0.15919867380472189, + "grad_norm": 0.33144518733024597, + "learning_rate": 0.00019020816744657438, + "loss": 1.2101, + "step": 74930 + }, + { + "epoch": 0.15921992012446093, + "grad_norm": 0.3483010232448578, + "learning_rate": 0.00019020522785990934, + "loss": 1.22, + "step": 74940 + }, + { + "epoch": 0.1592411664442, + "grad_norm": 0.3230571746826172, + "learning_rate": 0.00019020228785478885, + "loss": 1.221, + "step": 74950 + }, + { + "epoch": 0.15926241276393904, + "grad_norm": 0.3723294734954834, + "learning_rate": 0.00019019934743122645, + "loss": 1.187, + "step": 74960 + }, + { + "epoch": 0.1592836590836781, + "grad_norm": 0.3772444725036621, + "learning_rate": 0.00019019640658923584, + "loss": 1.2176, + "step": 74970 + }, + { + "epoch": 0.15930490540341716, + "grad_norm": 0.370932012796402, + "learning_rate": 0.00019019346532883066, + "loss": 1.2564, + "step": 74980 + }, + { + "epoch": 0.1593261517231562, + "grad_norm": 0.3627120852470398, + "learning_rate": 0.00019019052365002454, + "loss": 1.22, + "step": 74990 + }, + { + "epoch": 0.15934739804289524, + "grad_norm": 0.4533821940422058, + "learning_rate": 0.00019018758155283112, + "loss": 1.1574, + "step": 75000 + }, + { + "epoch": 0.15936864436263432, + "grad_norm": 0.3331935703754425, + "learning_rate": 0.00019018463903726408, + "loss": 1.203, + "step": 75010 + }, + { + "epoch": 0.15938989068237336, + "grad_norm": 0.3821139931678772, + "learning_rate": 0.00019018169610333704, + "loss": 1.1865, + "step": 75020 + }, + { + "epoch": 0.1594111370021124, + "grad_norm": 0.3633139431476593, + "learning_rate": 0.00019017875275106368, + "loss": 1.2336, + "step": 75030 + }, + { + "epoch": 0.15943238332185147, + "grad_norm": 0.4283011853694916, + "learning_rate": 0.00019017580898045763, + "loss": 1.2, + "step": 75040 + }, + { + "epoch": 0.15945362964159052, + "grad_norm": 0.35416075587272644, + "learning_rate": 0.00019017286479153255, + "loss": 1.2083, + "step": 75050 + }, + { + "epoch": 0.15947487596132956, + "grad_norm": 0.3276880383491516, + "learning_rate": 0.00019016992018430212, + "loss": 1.2294, + "step": 75060 + }, + { + "epoch": 0.15949612228106863, + "grad_norm": 0.30037960410118103, + "learning_rate": 0.00019016697515877995, + "loss": 1.194, + "step": 75070 + }, + { + "epoch": 0.15951736860080767, + "grad_norm": 0.45528101921081543, + "learning_rate": 0.00019016402971497973, + "loss": 1.2024, + "step": 75080 + }, + { + "epoch": 0.15953861492054675, + "grad_norm": 0.33911508321762085, + "learning_rate": 0.00019016108385291517, + "loss": 1.2011, + "step": 75090 + }, + { + "epoch": 0.1595598612402858, + "grad_norm": 0.3187280595302582, + "learning_rate": 0.00019015813757259986, + "loss": 1.2019, + "step": 75100 + }, + { + "epoch": 0.15958110756002483, + "grad_norm": 0.3484994173049927, + "learning_rate": 0.00019015519087404748, + "loss": 1.2107, + "step": 75110 + }, + { + "epoch": 0.1596023538797639, + "grad_norm": 0.3835626542568207, + "learning_rate": 0.00019015224375727176, + "loss": 1.166, + "step": 75120 + }, + { + "epoch": 0.15962360019950295, + "grad_norm": 0.33788830041885376, + "learning_rate": 0.0001901492962222863, + "loss": 1.2078, + "step": 75130 + }, + { + "epoch": 0.159644846519242, + "grad_norm": 0.3464348018169403, + "learning_rate": 0.00019014634826910483, + "loss": 1.2299, + "step": 75140 + }, + { + "epoch": 0.15966609283898106, + "grad_norm": 0.32495981454849243, + "learning_rate": 0.00019014339989774096, + "loss": 1.2297, + "step": 75150 + }, + { + "epoch": 0.1596873391587201, + "grad_norm": 0.5164275765419006, + "learning_rate": 0.00019014045110820845, + "loss": 1.2134, + "step": 75160 + }, + { + "epoch": 0.15970858547845915, + "grad_norm": 0.7217769026756287, + "learning_rate": 0.00019013750190052093, + "loss": 1.2115, + "step": 75170 + }, + { + "epoch": 0.15972983179819822, + "grad_norm": 0.554804265499115, + "learning_rate": 0.00019013455227469205, + "loss": 1.2096, + "step": 75180 + }, + { + "epoch": 0.15975107811793726, + "grad_norm": 0.30358147621154785, + "learning_rate": 0.00019013160223073556, + "loss": 1.2266, + "step": 75190 + }, + { + "epoch": 0.1597723244376763, + "grad_norm": 0.4596715569496155, + "learning_rate": 0.00019012865176866513, + "loss": 1.1766, + "step": 75200 + }, + { + "epoch": 0.15979357075741538, + "grad_norm": 0.33068421483039856, + "learning_rate": 0.0001901257008884944, + "loss": 1.21, + "step": 75210 + }, + { + "epoch": 0.15981481707715442, + "grad_norm": 0.489717036485672, + "learning_rate": 0.0001901227495902371, + "loss": 1.2099, + "step": 75220 + }, + { + "epoch": 0.15983606339689346, + "grad_norm": 0.3930895924568176, + "learning_rate": 0.00019011979787390693, + "loss": 1.2178, + "step": 75230 + }, + { + "epoch": 0.15985730971663253, + "grad_norm": 0.32957834005355835, + "learning_rate": 0.00019011684573951753, + "loss": 1.2407, + "step": 75240 + }, + { + "epoch": 0.15987855603637158, + "grad_norm": 0.34011542797088623, + "learning_rate": 0.00019011389318708265, + "loss": 1.2315, + "step": 75250 + }, + { + "epoch": 0.15989980235611062, + "grad_norm": 0.32182806730270386, + "learning_rate": 0.00019011094021661597, + "loss": 1.1805, + "step": 75260 + }, + { + "epoch": 0.1599210486758497, + "grad_norm": 0.30836135149002075, + "learning_rate": 0.0001901079868281312, + "loss": 1.2485, + "step": 75270 + }, + { + "epoch": 0.15994229499558874, + "grad_norm": 0.32257404923439026, + "learning_rate": 0.00019010503302164196, + "loss": 1.2268, + "step": 75280 + }, + { + "epoch": 0.15996354131532778, + "grad_norm": 0.35217562317848206, + "learning_rate": 0.0001901020787971621, + "loss": 1.1783, + "step": 75290 + }, + { + "epoch": 0.15998478763506685, + "grad_norm": 0.32531285285949707, + "learning_rate": 0.0001900991241547052, + "loss": 1.2049, + "step": 75300 + }, + { + "epoch": 0.1600060339548059, + "grad_norm": 0.3518708348274231, + "learning_rate": 0.00019009616909428502, + "loss": 1.1961, + "step": 75310 + }, + { + "epoch": 0.16002728027454494, + "grad_norm": 0.3071639835834503, + "learning_rate": 0.00019009321361591525, + "loss": 1.2135, + "step": 75320 + }, + { + "epoch": 0.160048526594284, + "grad_norm": 0.4064239263534546, + "learning_rate": 0.0001900902577196096, + "loss": 1.1962, + "step": 75330 + }, + { + "epoch": 0.16006977291402305, + "grad_norm": 0.3817395865917206, + "learning_rate": 0.0001900873014053818, + "loss": 1.1936, + "step": 75340 + }, + { + "epoch": 0.1600910192337621, + "grad_norm": 0.5121039152145386, + "learning_rate": 0.00019008434467324554, + "loss": 1.1932, + "step": 75350 + }, + { + "epoch": 0.16011226555350117, + "grad_norm": 0.5498211979866028, + "learning_rate": 0.0001900813875232146, + "loss": 1.2187, + "step": 75360 + }, + { + "epoch": 0.1601335118732402, + "grad_norm": 0.4077077805995941, + "learning_rate": 0.00019007842995530258, + "loss": 1.1918, + "step": 75370 + }, + { + "epoch": 0.16015475819297925, + "grad_norm": 0.3520781099796295, + "learning_rate": 0.00019007547196952328, + "loss": 1.1877, + "step": 75380 + }, + { + "epoch": 0.16017600451271832, + "grad_norm": 0.3157672882080078, + "learning_rate": 0.00019007251356589042, + "loss": 1.2115, + "step": 75390 + }, + { + "epoch": 0.16019725083245737, + "grad_norm": 0.33911821246147156, + "learning_rate": 0.00019006955474441773, + "loss": 1.2072, + "step": 75400 + }, + { + "epoch": 0.1602184971521964, + "grad_norm": 0.3542759120464325, + "learning_rate": 0.0001900665955051189, + "loss": 1.1918, + "step": 75410 + }, + { + "epoch": 0.16023974347193548, + "grad_norm": 0.32280433177948, + "learning_rate": 0.00019006363584800768, + "loss": 1.2465, + "step": 75420 + }, + { + "epoch": 0.16026098979167452, + "grad_norm": 0.49564483761787415, + "learning_rate": 0.0001900606757730978, + "loss": 1.1974, + "step": 75430 + }, + { + "epoch": 0.16028223611141357, + "grad_norm": 0.5168460011482239, + "learning_rate": 0.00019005771528040298, + "loss": 1.2173, + "step": 75440 + }, + { + "epoch": 0.16030348243115264, + "grad_norm": 0.327903687953949, + "learning_rate": 0.00019005475436993698, + "loss": 1.1909, + "step": 75450 + }, + { + "epoch": 0.16032472875089168, + "grad_norm": 0.38787269592285156, + "learning_rate": 0.0001900517930417135, + "loss": 1.1873, + "step": 75460 + }, + { + "epoch": 0.16034597507063073, + "grad_norm": 0.3739088475704193, + "learning_rate": 0.00019004883129574628, + "loss": 1.1998, + "step": 75470 + }, + { + "epoch": 0.1603672213903698, + "grad_norm": 0.35072118043899536, + "learning_rate": 0.00019004586913204911, + "loss": 1.2242, + "step": 75480 + }, + { + "epoch": 0.16038846771010884, + "grad_norm": 0.34186312556266785, + "learning_rate": 0.0001900429065506357, + "loss": 1.2087, + "step": 75490 + }, + { + "epoch": 0.16040971402984788, + "grad_norm": 0.33106398582458496, + "learning_rate": 0.00019003994355151974, + "loss": 1.2447, + "step": 75500 + }, + { + "epoch": 0.16043096034958695, + "grad_norm": 0.38484570384025574, + "learning_rate": 0.00019003698013471504, + "loss": 1.2158, + "step": 75510 + }, + { + "epoch": 0.160452206669326, + "grad_norm": 0.3602566421031952, + "learning_rate": 0.00019003401630023534, + "loss": 1.1899, + "step": 75520 + }, + { + "epoch": 0.16047345298906504, + "grad_norm": 0.4025369882583618, + "learning_rate": 0.0001900310520480944, + "loss": 1.2014, + "step": 75530 + }, + { + "epoch": 0.1604946993088041, + "grad_norm": 0.3326391875743866, + "learning_rate": 0.00019002808737830594, + "loss": 1.206, + "step": 75540 + }, + { + "epoch": 0.16051594562854316, + "grad_norm": 0.40381813049316406, + "learning_rate": 0.00019002512229088366, + "loss": 1.2121, + "step": 75550 + }, + { + "epoch": 0.1605371919482822, + "grad_norm": 0.33510276675224304, + "learning_rate": 0.00019002215678584145, + "loss": 1.1826, + "step": 75560 + }, + { + "epoch": 0.16055843826802127, + "grad_norm": 0.45090925693511963, + "learning_rate": 0.00019001919086319296, + "loss": 1.1861, + "step": 75570 + }, + { + "epoch": 0.1605796845877603, + "grad_norm": 0.3107559382915497, + "learning_rate": 0.00019001622452295203, + "loss": 1.1962, + "step": 75580 + }, + { + "epoch": 0.16060093090749936, + "grad_norm": 0.35123392939567566, + "learning_rate": 0.0001900132577651323, + "loss": 1.2048, + "step": 75590 + }, + { + "epoch": 0.16062217722723843, + "grad_norm": 0.32223406434059143, + "learning_rate": 0.00019001029058974766, + "loss": 1.1868, + "step": 75600 + }, + { + "epoch": 0.16064342354697747, + "grad_norm": 0.3315802216529846, + "learning_rate": 0.00019000732299681182, + "loss": 1.2341, + "step": 75610 + }, + { + "epoch": 0.16066466986671651, + "grad_norm": 0.3863195478916168, + "learning_rate": 0.00019000435498633852, + "loss": 1.1852, + "step": 75620 + }, + { + "epoch": 0.16068591618645559, + "grad_norm": 0.47598353028297424, + "learning_rate": 0.00019000138655834157, + "loss": 1.1864, + "step": 75630 + }, + { + "epoch": 0.16070716250619463, + "grad_norm": 0.3932102620601654, + "learning_rate": 0.00018999841771283473, + "loss": 1.1993, + "step": 75640 + }, + { + "epoch": 0.16072840882593367, + "grad_norm": 0.33738136291503906, + "learning_rate": 0.00018999544844983178, + "loss": 1.1957, + "step": 75650 + }, + { + "epoch": 0.16074965514567274, + "grad_norm": 0.32903817296028137, + "learning_rate": 0.00018999247876934645, + "loss": 1.1709, + "step": 75660 + }, + { + "epoch": 0.1607709014654118, + "grad_norm": 0.34358975291252136, + "learning_rate": 0.00018998950867139258, + "loss": 1.2057, + "step": 75670 + }, + { + "epoch": 0.16079214778515083, + "grad_norm": 0.38163456320762634, + "learning_rate": 0.0001899865381559839, + "loss": 1.1902, + "step": 75680 + }, + { + "epoch": 0.1608133941048899, + "grad_norm": 0.3654797077178955, + "learning_rate": 0.00018998356722313423, + "loss": 1.1925, + "step": 75690 + }, + { + "epoch": 0.16083464042462894, + "grad_norm": 0.34574180841445923, + "learning_rate": 0.0001899805958728573, + "loss": 1.1902, + "step": 75700 + }, + { + "epoch": 0.160855886744368, + "grad_norm": 0.372376948595047, + "learning_rate": 0.0001899776241051669, + "loss": 1.2447, + "step": 75710 + }, + { + "epoch": 0.16087713306410706, + "grad_norm": 0.465036004781723, + "learning_rate": 0.00018997465192007688, + "loss": 1.1395, + "step": 75720 + }, + { + "epoch": 0.1608983793838461, + "grad_norm": 0.5675719380378723, + "learning_rate": 0.00018997167931760097, + "loss": 1.2111, + "step": 75730 + }, + { + "epoch": 0.16091962570358515, + "grad_norm": 0.38592439889907837, + "learning_rate": 0.000189968706297753, + "loss": 1.1771, + "step": 75740 + }, + { + "epoch": 0.16094087202332422, + "grad_norm": 0.3405519127845764, + "learning_rate": 0.00018996573286054672, + "loss": 1.206, + "step": 75750 + }, + { + "epoch": 0.16096211834306326, + "grad_norm": 0.32827815413475037, + "learning_rate": 0.00018996275900599595, + "loss": 1.1976, + "step": 75760 + }, + { + "epoch": 0.1609833646628023, + "grad_norm": 0.3778216540813446, + "learning_rate": 0.00018995978473411446, + "loss": 1.1961, + "step": 75770 + }, + { + "epoch": 0.16100461098254137, + "grad_norm": 0.36537009477615356, + "learning_rate": 0.00018995681004491607, + "loss": 1.185, + "step": 75780 + }, + { + "epoch": 0.16102585730228042, + "grad_norm": 0.5034687519073486, + "learning_rate": 0.00018995383493841455, + "loss": 1.1847, + "step": 75790 + }, + { + "epoch": 0.16104710362201946, + "grad_norm": 0.5252525210380554, + "learning_rate": 0.00018995085941462376, + "loss": 1.2037, + "step": 75800 + }, + { + "epoch": 0.16106834994175853, + "grad_norm": 0.34367576241493225, + "learning_rate": 0.00018994788347355746, + "loss": 1.2205, + "step": 75810 + }, + { + "epoch": 0.16108959626149758, + "grad_norm": 0.3444961905479431, + "learning_rate": 0.00018994490711522944, + "loss": 1.2113, + "step": 75820 + }, + { + "epoch": 0.16111084258123662, + "grad_norm": 0.47542065382003784, + "learning_rate": 0.00018994193033965354, + "loss": 1.1775, + "step": 75830 + }, + { + "epoch": 0.1611320889009757, + "grad_norm": 0.36840179562568665, + "learning_rate": 0.00018993895314684358, + "loss": 1.2425, + "step": 75840 + }, + { + "epoch": 0.16115333522071473, + "grad_norm": 0.3082975447177887, + "learning_rate": 0.00018993597553681335, + "loss": 1.2302, + "step": 75850 + }, + { + "epoch": 0.16117458154045378, + "grad_norm": 0.34816572070121765, + "learning_rate": 0.00018993299750957664, + "loss": 1.2162, + "step": 75860 + }, + { + "epoch": 0.16119582786019285, + "grad_norm": 0.3498188555240631, + "learning_rate": 0.00018993001906514732, + "loss": 1.1634, + "step": 75870 + }, + { + "epoch": 0.1612170741799319, + "grad_norm": 0.3702709972858429, + "learning_rate": 0.00018992704020353912, + "loss": 1.2057, + "step": 75880 + }, + { + "epoch": 0.16123832049967093, + "grad_norm": 0.40814825892448425, + "learning_rate": 0.00018992406092476597, + "loss": 1.193, + "step": 75890 + }, + { + "epoch": 0.16125956681941, + "grad_norm": 0.4503466784954071, + "learning_rate": 0.00018992108122884162, + "loss": 1.2456, + "step": 75900 + }, + { + "epoch": 0.16128081313914905, + "grad_norm": 0.3193706274032593, + "learning_rate": 0.00018991810111577989, + "loss": 1.2038, + "step": 75910 + }, + { + "epoch": 0.16130205945888812, + "grad_norm": 0.40721967816352844, + "learning_rate": 0.00018991512058559465, + "loss": 1.2771, + "step": 75920 + }, + { + "epoch": 0.16132330577862716, + "grad_norm": 0.34305593371391296, + "learning_rate": 0.00018991213963829966, + "loss": 1.1832, + "step": 75930 + }, + { + "epoch": 0.1613445520983662, + "grad_norm": 0.3408035933971405, + "learning_rate": 0.0001899091582739088, + "loss": 1.2164, + "step": 75940 + }, + { + "epoch": 0.16136579841810528, + "grad_norm": 0.4028477966785431, + "learning_rate": 0.0001899061764924359, + "loss": 1.1708, + "step": 75950 + }, + { + "epoch": 0.16138704473784432, + "grad_norm": 0.37482985854148865, + "learning_rate": 0.00018990319429389478, + "loss": 1.187, + "step": 75960 + }, + { + "epoch": 0.16140829105758336, + "grad_norm": 0.30684909224510193, + "learning_rate": 0.00018990021167829925, + "loss": 1.1632, + "step": 75970 + }, + { + "epoch": 0.16142953737732244, + "grad_norm": 0.43313080072402954, + "learning_rate": 0.00018989722864566318, + "loss": 1.2281, + "step": 75980 + }, + { + "epoch": 0.16145078369706148, + "grad_norm": 0.3302537798881531, + "learning_rate": 0.0001898942451960004, + "loss": 1.2083, + "step": 75990 + }, + { + "epoch": 0.16147203001680052, + "grad_norm": 0.32494616508483887, + "learning_rate": 0.00018989126132932475, + "loss": 1.1845, + "step": 76000 + }, + { + "epoch": 0.1614932763365396, + "grad_norm": 0.3725050091743469, + "learning_rate": 0.00018988827704565007, + "loss": 1.1641, + "step": 76010 + }, + { + "epoch": 0.16151452265627864, + "grad_norm": 0.3260229229927063, + "learning_rate": 0.00018988529234499016, + "loss": 1.2306, + "step": 76020 + }, + { + "epoch": 0.16153576897601768, + "grad_norm": 0.3318823277950287, + "learning_rate": 0.00018988230722735897, + "loss": 1.1606, + "step": 76030 + }, + { + "epoch": 0.16155701529575675, + "grad_norm": 0.38250279426574707, + "learning_rate": 0.00018987932169277027, + "loss": 1.2061, + "step": 76040 + }, + { + "epoch": 0.1615782616154958, + "grad_norm": 0.46871447563171387, + "learning_rate": 0.0001898763357412379, + "loss": 1.1876, + "step": 76050 + }, + { + "epoch": 0.16159950793523484, + "grad_norm": 0.3492083251476288, + "learning_rate": 0.00018987334937277576, + "loss": 1.2293, + "step": 76060 + }, + { + "epoch": 0.1616207542549739, + "grad_norm": 0.3259833753108978, + "learning_rate": 0.0001898703625873977, + "loss": 1.2288, + "step": 76070 + }, + { + "epoch": 0.16164200057471295, + "grad_norm": 0.4082067310810089, + "learning_rate": 0.00018986737538511753, + "loss": 1.1933, + "step": 76080 + }, + { + "epoch": 0.161663246894452, + "grad_norm": 0.3424185812473297, + "learning_rate": 0.00018986438776594913, + "loss": 1.2119, + "step": 76090 + }, + { + "epoch": 0.16168449321419107, + "grad_norm": 0.3352445662021637, + "learning_rate": 0.00018986139972990636, + "loss": 1.2114, + "step": 76100 + }, + { + "epoch": 0.1617057395339301, + "grad_norm": 0.31112751364707947, + "learning_rate": 0.00018985841127700312, + "loss": 1.1822, + "step": 76110 + }, + { + "epoch": 0.16172698585366915, + "grad_norm": 0.359834223985672, + "learning_rate": 0.0001898554224072532, + "loss": 1.1849, + "step": 76120 + }, + { + "epoch": 0.16174823217340822, + "grad_norm": 0.37668657302856445, + "learning_rate": 0.00018985243312067052, + "loss": 1.1973, + "step": 76130 + }, + { + "epoch": 0.16176947849314727, + "grad_norm": 0.30390632152557373, + "learning_rate": 0.00018984944341726893, + "loss": 1.2077, + "step": 76140 + }, + { + "epoch": 0.1617907248128863, + "grad_norm": 0.3190004527568817, + "learning_rate": 0.0001898464532970623, + "loss": 1.2306, + "step": 76150 + }, + { + "epoch": 0.16181197113262538, + "grad_norm": 0.3558428883552551, + "learning_rate": 0.0001898434627600645, + "loss": 1.1831, + "step": 76160 + }, + { + "epoch": 0.16183321745236443, + "grad_norm": 0.35324352979660034, + "learning_rate": 0.00018984047180628942, + "loss": 1.2251, + "step": 76170 + }, + { + "epoch": 0.16185446377210347, + "grad_norm": 0.37335675954818726, + "learning_rate": 0.00018983748043575088, + "loss": 1.2365, + "step": 76180 + }, + { + "epoch": 0.16187571009184254, + "grad_norm": 0.32533636689186096, + "learning_rate": 0.00018983448864846284, + "loss": 1.2358, + "step": 76190 + }, + { + "epoch": 0.16189695641158158, + "grad_norm": 0.3107481896877289, + "learning_rate": 0.00018983149644443908, + "loss": 1.2021, + "step": 76200 + }, + { + "epoch": 0.16191820273132063, + "grad_norm": 0.37059828639030457, + "learning_rate": 0.00018982850382369358, + "loss": 1.2249, + "step": 76210 + }, + { + "epoch": 0.1619394490510597, + "grad_norm": 0.44948968291282654, + "learning_rate": 0.00018982551078624015, + "loss": 1.2302, + "step": 76220 + }, + { + "epoch": 0.16196069537079874, + "grad_norm": 0.3457394242286682, + "learning_rate": 0.00018982251733209272, + "loss": 1.2116, + "step": 76230 + }, + { + "epoch": 0.16198194169053778, + "grad_norm": 0.33929336071014404, + "learning_rate": 0.00018981952346126514, + "loss": 1.1823, + "step": 76240 + }, + { + "epoch": 0.16200318801027686, + "grad_norm": 0.3466425836086273, + "learning_rate": 0.00018981652917377133, + "loss": 1.1991, + "step": 76250 + }, + { + "epoch": 0.1620244343300159, + "grad_norm": 0.3216744661331177, + "learning_rate": 0.00018981353446962515, + "loss": 1.217, + "step": 76260 + }, + { + "epoch": 0.16204568064975494, + "grad_norm": 0.3391377627849579, + "learning_rate": 0.00018981053934884052, + "loss": 1.1783, + "step": 76270 + }, + { + "epoch": 0.162066926969494, + "grad_norm": 0.4042370021343231, + "learning_rate": 0.00018980754381143133, + "loss": 1.2482, + "step": 76280 + }, + { + "epoch": 0.16208817328923306, + "grad_norm": 0.4866418242454529, + "learning_rate": 0.00018980454785741144, + "loss": 1.1896, + "step": 76290 + }, + { + "epoch": 0.1621094196089721, + "grad_norm": 0.40688174962997437, + "learning_rate": 0.00018980155148679478, + "loss": 1.2024, + "step": 76300 + }, + { + "epoch": 0.16213066592871117, + "grad_norm": 0.36647292971611023, + "learning_rate": 0.00018979855469959529, + "loss": 1.1679, + "step": 76310 + }, + { + "epoch": 0.16215191224845021, + "grad_norm": 0.378343403339386, + "learning_rate": 0.00018979555749582677, + "loss": 1.1807, + "step": 76320 + }, + { + "epoch": 0.16217315856818926, + "grad_norm": 0.3388229012489319, + "learning_rate": 0.00018979255987550323, + "loss": 1.1934, + "step": 76330 + }, + { + "epoch": 0.16219440488792833, + "grad_norm": 0.46499311923980713, + "learning_rate": 0.00018978956183863852, + "loss": 1.2161, + "step": 76340 + }, + { + "epoch": 0.16221565120766737, + "grad_norm": 0.6739134192466736, + "learning_rate": 0.00018978656338524655, + "loss": 1.2085, + "step": 76350 + }, + { + "epoch": 0.16223689752740642, + "grad_norm": 0.35225710272789, + "learning_rate": 0.00018978356451534122, + "loss": 1.2094, + "step": 76360 + }, + { + "epoch": 0.1622581438471455, + "grad_norm": 0.3375568985939026, + "learning_rate": 0.00018978056522893648, + "loss": 1.1837, + "step": 76370 + }, + { + "epoch": 0.16227939016688453, + "grad_norm": 0.3929598033428192, + "learning_rate": 0.0001897775655260462, + "loss": 1.1935, + "step": 76380 + }, + { + "epoch": 0.16230063648662357, + "grad_norm": 0.38114893436431885, + "learning_rate": 0.0001897745654066843, + "loss": 1.1959, + "step": 76390 + }, + { + "epoch": 0.16232188280636264, + "grad_norm": 0.35508161783218384, + "learning_rate": 0.00018977156487086475, + "loss": 1.2084, + "step": 76400 + }, + { + "epoch": 0.1623431291261017, + "grad_norm": 0.4014102816581726, + "learning_rate": 0.00018976856391860142, + "loss": 1.1827, + "step": 76410 + }, + { + "epoch": 0.16236437544584073, + "grad_norm": 0.3351564109325409, + "learning_rate": 0.00018976556254990824, + "loss": 1.2032, + "step": 76420 + }, + { + "epoch": 0.1623856217655798, + "grad_norm": 0.40406620502471924, + "learning_rate": 0.00018976256076479916, + "loss": 1.2373, + "step": 76430 + }, + { + "epoch": 0.16240686808531885, + "grad_norm": 0.36916229128837585, + "learning_rate": 0.00018975955856328805, + "loss": 1.2107, + "step": 76440 + }, + { + "epoch": 0.1624281144050579, + "grad_norm": 0.4732893109321594, + "learning_rate": 0.0001897565559453889, + "loss": 1.208, + "step": 76450 + }, + { + "epoch": 0.16244936072479696, + "grad_norm": 0.33843815326690674, + "learning_rate": 0.00018975355291111557, + "loss": 1.213, + "step": 76460 + }, + { + "epoch": 0.162470607044536, + "grad_norm": 0.355246365070343, + "learning_rate": 0.00018975054946048205, + "loss": 1.2243, + "step": 76470 + }, + { + "epoch": 0.16249185336427505, + "grad_norm": 0.30408158898353577, + "learning_rate": 0.00018974754559350222, + "loss": 1.2118, + "step": 76480 + }, + { + "epoch": 0.16251309968401412, + "grad_norm": 0.338474839925766, + "learning_rate": 0.0001897445413101901, + "loss": 1.1876, + "step": 76490 + }, + { + "epoch": 0.16253434600375316, + "grad_norm": 0.37419557571411133, + "learning_rate": 0.00018974153661055953, + "loss": 1.1933, + "step": 76500 + } + ], + "logging_steps": 10, + "max_steps": 470669, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 9.571013687771136e+19, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}