{ "best_metric": 0.26086652278900146, "best_model_checkpoint": "/content/drive/MyDrive/LLMproject/model_outputs/checkpoint-3250", "epoch": 0.6577615867233354, "eval_steps": 50, "global_step": 3250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020238818053025704, "grad_norm": 14.387454986572266, "learning_rate": 4e-05, "loss": 1.9191, "step": 1 }, { "epoch": 0.0004047763610605141, "grad_norm": 14.54423713684082, "learning_rate": 8e-05, "loss": 1.9108, "step": 2 }, { "epoch": 0.0006071645415907711, "grad_norm": 1.4354013204574585, "learning_rate": 0.00012, "loss": 1.7398, "step": 3 }, { "epoch": 0.0008095527221210282, "grad_norm": 0.9231274127960205, "learning_rate": 0.00016, "loss": 1.678, "step": 4 }, { "epoch": 0.0010119409026512851, "grad_norm": 1.798851728439331, "learning_rate": 0.0002, "loss": 1.6315, "step": 5 }, { "epoch": 0.0012143290831815423, "grad_norm": 0.8043261170387268, "learning_rate": 0.00019999997974559766, "loss": 1.5256, "step": 6 }, { "epoch": 0.0014167172637117992, "grad_norm": 0.6839005351066589, "learning_rate": 0.0001999999189823988, "loss": 1.5078, "step": 7 }, { "epoch": 0.0016191054442420564, "grad_norm": 0.8023523092269897, "learning_rate": 0.00019999981771042804, "loss": 1.4957, "step": 8 }, { "epoch": 0.0018214936247723133, "grad_norm": 0.6323018074035645, "learning_rate": 0.0001999996759297264, "loss": 1.404, "step": 9 }, { "epoch": 0.0020238818053025702, "grad_norm": 0.6342468857765198, "learning_rate": 0.00019999949364035138, "loss": 1.3995, "step": 10 }, { "epoch": 0.002226269985832827, "grad_norm": 0.5285978317260742, "learning_rate": 0.00019999927084237676, "loss": 1.3504, "step": 11 }, { "epoch": 0.0024286581663630845, "grad_norm": 0.8137556314468384, "learning_rate": 0.00019999900753589275, "loss": 1.3175, "step": 12 }, { "epoch": 0.0026310463468933415, "grad_norm": 0.6766992211341858, "learning_rate": 0.00019999870372100614, "loss": 1.3945, "step": 13 }, { "epoch": 0.0028334345274235984, "grad_norm": 0.6511569023132324, "learning_rate": 0.00019999835939783986, "loss": 1.3075, "step": 14 }, { "epoch": 0.0030358227079538553, "grad_norm": 0.6129029393196106, "learning_rate": 0.00019999797456653347, "loss": 1.3322, "step": 15 }, { "epoch": 0.0032382108884841127, "grad_norm": 0.6257243752479553, "learning_rate": 0.00019999754922724288, "loss": 1.2804, "step": 16 }, { "epoch": 0.0034405990690143697, "grad_norm": 0.7010571956634521, "learning_rate": 0.00019999708338014035, "loss": 1.2925, "step": 17 }, { "epoch": 0.0036429872495446266, "grad_norm": 0.6821787357330322, "learning_rate": 0.0001999965770254146, "loss": 1.2984, "step": 18 }, { "epoch": 0.0038453754300748835, "grad_norm": 0.6434952020645142, "learning_rate": 0.00019999603016327073, "loss": 1.2494, "step": 19 }, { "epoch": 0.0040477636106051405, "grad_norm": 0.6330264210700989, "learning_rate": 0.0001999954427939303, "loss": 1.2269, "step": 20 }, { "epoch": 0.004250151791135397, "grad_norm": 0.7511361241340637, "learning_rate": 0.0001999948149176312, "loss": 1.1629, "step": 21 }, { "epoch": 0.004452539971665654, "grad_norm": 0.7411013841629028, "learning_rate": 0.00019999414653462785, "loss": 1.2053, "step": 22 }, { "epoch": 0.004654928152195912, "grad_norm": 0.7284002304077148, "learning_rate": 0.00019999343764519093, "loss": 1.2111, "step": 23 }, { "epoch": 0.004857316332726169, "grad_norm": 0.8637538552284241, "learning_rate": 0.00019999268824960763, "loss": 1.1704, "step": 24 }, { "epoch": 0.005059704513256426, "grad_norm": 0.7000638842582703, "learning_rate": 0.00019999189834818155, "loss": 1.1803, "step": 25 }, { "epoch": 0.005262092693786683, "grad_norm": 0.6226357817649841, "learning_rate": 0.00019999106794123264, "loss": 1.1526, "step": 26 }, { "epoch": 0.00546448087431694, "grad_norm": 0.652762234210968, "learning_rate": 0.00019999019702909724, "loss": 1.1265, "step": 27 }, { "epoch": 0.005666869054847197, "grad_norm": 0.5238654613494873, "learning_rate": 0.00019998928561212824, "loss": 1.1542, "step": 28 }, { "epoch": 0.005869257235377454, "grad_norm": 0.6347120404243469, "learning_rate": 0.0001999883336906948, "loss": 1.1418, "step": 29 }, { "epoch": 0.006071645415907711, "grad_norm": 0.5927608013153076, "learning_rate": 0.00019998734126518253, "loss": 1.1415, "step": 30 }, { "epoch": 0.006274033596437968, "grad_norm": 0.5574004054069519, "learning_rate": 0.00019998630833599346, "loss": 1.1118, "step": 31 }, { "epoch": 0.0064764217769682254, "grad_norm": 0.6282877326011658, "learning_rate": 0.000199985234903546, "loss": 1.0813, "step": 32 }, { "epoch": 0.006678809957498482, "grad_norm": 0.5187103748321533, "learning_rate": 0.000199984120968275, "loss": 1.0808, "step": 33 }, { "epoch": 0.006881198138028739, "grad_norm": 0.6009660959243774, "learning_rate": 0.00019998296653063168, "loss": 1.099, "step": 34 }, { "epoch": 0.007083586318558996, "grad_norm": 0.6281402707099915, "learning_rate": 0.0001999817715910837, "loss": 1.0743, "step": 35 }, { "epoch": 0.007285974499089253, "grad_norm": 0.7426573038101196, "learning_rate": 0.00019998053615011513, "loss": 1.0673, "step": 36 }, { "epoch": 0.00748836267961951, "grad_norm": 0.5853238701820374, "learning_rate": 0.00019997926020822644, "loss": 1.1277, "step": 37 }, { "epoch": 0.007690750860149767, "grad_norm": 0.6665608882904053, "learning_rate": 0.00019997794376593446, "loss": 1.0492, "step": 38 }, { "epoch": 0.007893139040680024, "grad_norm": 0.5596454739570618, "learning_rate": 0.0001999765868237725, "loss": 1.1303, "step": 39 }, { "epoch": 0.008095527221210281, "grad_norm": 0.6165941953659058, "learning_rate": 0.00019997518938229023, "loss": 1.1246, "step": 40 }, { "epoch": 0.008297915401740538, "grad_norm": 0.5157654285430908, "learning_rate": 0.00019997375144205373, "loss": 1.0574, "step": 41 }, { "epoch": 0.008500303582270795, "grad_norm": 0.5184974670410156, "learning_rate": 0.0001999722730036455, "loss": 1.11, "step": 42 }, { "epoch": 0.008702691762801052, "grad_norm": 0.5431533455848694, "learning_rate": 0.00019997075406766443, "loss": 1.0863, "step": 43 }, { "epoch": 0.008905079943331309, "grad_norm": 0.521007239818573, "learning_rate": 0.00019996919463472583, "loss": 1.0665, "step": 44 }, { "epoch": 0.009107468123861567, "grad_norm": 0.6022652387619019, "learning_rate": 0.0001999675947054614, "loss": 1.1089, "step": 45 }, { "epoch": 0.009309856304391824, "grad_norm": 0.536115288734436, "learning_rate": 0.00019996595428051927, "loss": 1.0642, "step": 46 }, { "epoch": 0.009512244484922081, "grad_norm": 0.5365018844604492, "learning_rate": 0.00019996427336056393, "loss": 1.0322, "step": 47 }, { "epoch": 0.009714632665452338, "grad_norm": 0.5249005556106567, "learning_rate": 0.00019996255194627634, "loss": 1.0731, "step": 48 }, { "epoch": 0.009917020845982595, "grad_norm": 0.5967987179756165, "learning_rate": 0.00019996079003835378, "loss": 1.0311, "step": 49 }, { "epoch": 0.010119409026512852, "grad_norm": 0.5689749121665955, "learning_rate": 0.00019995898763751, "loss": 1.0439, "step": 50 }, { "epoch": 0.010119409026512852, "eval_loss": 1.093253254890442, "eval_runtime": 1.3322, "eval_samples_per_second": 3.753, "eval_steps_per_second": 0.751, "step": 50 }, { "epoch": 0.010321797207043109, "grad_norm": 0.5671619772911072, "learning_rate": 0.00019995714474447512, "loss": 0.9692, "step": 51 }, { "epoch": 0.010524185387573366, "grad_norm": 0.5940735340118408, "learning_rate": 0.0001999552613599957, "loss": 1.0246, "step": 52 }, { "epoch": 0.010726573568103623, "grad_norm": 0.6614010334014893, "learning_rate": 0.00019995333748483465, "loss": 0.9914, "step": 53 }, { "epoch": 0.01092896174863388, "grad_norm": 0.4534197151660919, "learning_rate": 0.0001999513731197713, "loss": 1.0236, "step": 54 }, { "epoch": 0.011131349929164137, "grad_norm": 0.5843859910964966, "learning_rate": 0.0001999493682656014, "loss": 0.9769, "step": 55 }, { "epoch": 0.011333738109694394, "grad_norm": 0.4622138738632202, "learning_rate": 0.00019994732292313717, "loss": 1.0538, "step": 56 }, { "epoch": 0.01153612629022465, "grad_norm": 0.49927034974098206, "learning_rate": 0.00019994523709320703, "loss": 1.0295, "step": 57 }, { "epoch": 0.011738514470754908, "grad_norm": 0.49757030606269836, "learning_rate": 0.00019994311077665598, "loss": 1.1059, "step": 58 }, { "epoch": 0.011940902651285164, "grad_norm": 0.5178074836730957, "learning_rate": 0.00019994094397434537, "loss": 1.0039, "step": 59 }, { "epoch": 0.012143290831815421, "grad_norm": 0.599951446056366, "learning_rate": 0.00019993873668715297, "loss": 1.1223, "step": 60 }, { "epoch": 0.012345679012345678, "grad_norm": 0.6430076956748962, "learning_rate": 0.00019993648891597284, "loss": 1.0171, "step": 61 }, { "epoch": 0.012548067192875935, "grad_norm": 0.552902340888977, "learning_rate": 0.00019993420066171562, "loss": 1.0158, "step": 62 }, { "epoch": 0.012750455373406194, "grad_norm": 0.5770806074142456, "learning_rate": 0.00019993187192530822, "loss": 1.0185, "step": 63 }, { "epoch": 0.012952843553936451, "grad_norm": 0.4967328906059265, "learning_rate": 0.00019992950270769394, "loss": 1.0636, "step": 64 }, { "epoch": 0.013155231734466708, "grad_norm": 0.6269118189811707, "learning_rate": 0.0001999270930098326, "loss": 0.9498, "step": 65 }, { "epoch": 0.013357619914996965, "grad_norm": 0.5689034461975098, "learning_rate": 0.0001999246428327003, "loss": 1.0054, "step": 66 }, { "epoch": 0.013560008095527222, "grad_norm": 0.5415133833885193, "learning_rate": 0.00019992215217728957, "loss": 0.9241, "step": 67 }, { "epoch": 0.013762396276057479, "grad_norm": 0.6960447430610657, "learning_rate": 0.00019991962104460936, "loss": 0.9892, "step": 68 }, { "epoch": 0.013964784456587736, "grad_norm": 0.5459744334220886, "learning_rate": 0.00019991704943568496, "loss": 0.9765, "step": 69 }, { "epoch": 0.014167172637117992, "grad_norm": 0.7261528372764587, "learning_rate": 0.00019991443735155814, "loss": 0.9914, "step": 70 }, { "epoch": 0.01436956081764825, "grad_norm": 0.601763129234314, "learning_rate": 0.00019991178479328704, "loss": 1.0302, "step": 71 }, { "epoch": 0.014571948998178506, "grad_norm": 0.6255568861961365, "learning_rate": 0.00019990909176194617, "loss": 1.0223, "step": 72 }, { "epoch": 0.014774337178708763, "grad_norm": 0.5464239120483398, "learning_rate": 0.0001999063582586264, "loss": 1.0139, "step": 73 }, { "epoch": 0.01497672535923902, "grad_norm": 0.6552570462226868, "learning_rate": 0.00019990358428443507, "loss": 0.9722, "step": 74 }, { "epoch": 0.015179113539769277, "grad_norm": 0.5881670713424683, "learning_rate": 0.0001999007698404959, "loss": 0.9769, "step": 75 }, { "epoch": 0.015381501720299534, "grad_norm": 0.6116035580635071, "learning_rate": 0.00019989791492794896, "loss": 0.9665, "step": 76 }, { "epoch": 0.015583889900829791, "grad_norm": 0.647977888584137, "learning_rate": 0.00019989501954795075, "loss": 0.997, "step": 77 }, { "epoch": 0.015786278081360048, "grad_norm": 0.6350228190422058, "learning_rate": 0.0001998920837016742, "loss": 0.9935, "step": 78 }, { "epoch": 0.015988666261890307, "grad_norm": 0.6319441795349121, "learning_rate": 0.0001998891073903085, "loss": 0.9594, "step": 79 }, { "epoch": 0.016191054442420562, "grad_norm": 0.5678344964981079, "learning_rate": 0.00019988609061505935, "loss": 0.968, "step": 80 }, { "epoch": 0.01639344262295082, "grad_norm": 0.5742438435554504, "learning_rate": 0.00019988303337714887, "loss": 1.0413, "step": 81 }, { "epoch": 0.016595830803481076, "grad_norm": 0.584251344203949, "learning_rate": 0.00019987993567781542, "loss": 0.9842, "step": 82 }, { "epoch": 0.016798218984011334, "grad_norm": 0.5619480013847351, "learning_rate": 0.00019987679751831388, "loss": 0.9695, "step": 83 }, { "epoch": 0.01700060716454159, "grad_norm": 0.5736238956451416, "learning_rate": 0.0001998736188999155, "loss": 0.9442, "step": 84 }, { "epoch": 0.01720299534507185, "grad_norm": 0.7481099367141724, "learning_rate": 0.0001998703998239079, "loss": 1.0483, "step": 85 }, { "epoch": 0.017405383525602103, "grad_norm": 0.549994945526123, "learning_rate": 0.00019986714029159502, "loss": 0.9963, "step": 86 }, { "epoch": 0.017607771706132362, "grad_norm": 0.6068609952926636, "learning_rate": 0.00019986384030429736, "loss": 0.9184, "step": 87 }, { "epoch": 0.017810159886662617, "grad_norm": 0.5920400023460388, "learning_rate": 0.0001998604998633516, "loss": 0.9513, "step": 88 }, { "epoch": 0.018012548067192876, "grad_norm": 0.6460102200508118, "learning_rate": 0.00019985711897011103, "loss": 0.8895, "step": 89 }, { "epoch": 0.018214936247723135, "grad_norm": 0.5728783011436462, "learning_rate": 0.00019985369762594513, "loss": 0.9637, "step": 90 }, { "epoch": 0.01841732442825339, "grad_norm": 0.6623243093490601, "learning_rate": 0.00019985023583223986, "loss": 0.9549, "step": 91 }, { "epoch": 0.01861971260878365, "grad_norm": 0.6371002793312073, "learning_rate": 0.00019984673359039757, "loss": 0.9168, "step": 92 }, { "epoch": 0.018822100789313904, "grad_norm": 0.5405300259590149, "learning_rate": 0.00019984319090183693, "loss": 1.0345, "step": 93 }, { "epoch": 0.019024488969844162, "grad_norm": 0.6232616305351257, "learning_rate": 0.00019983960776799307, "loss": 0.9681, "step": 94 }, { "epoch": 0.019226877150374418, "grad_norm": 0.5972675681114197, "learning_rate": 0.0001998359841903175, "loss": 0.8218, "step": 95 }, { "epoch": 0.019429265330904676, "grad_norm": 0.5877737998962402, "learning_rate": 0.00019983232017027803, "loss": 0.9205, "step": 96 }, { "epoch": 0.01963165351143493, "grad_norm": 0.5897297859191895, "learning_rate": 0.000199828615709359, "loss": 0.9431, "step": 97 }, { "epoch": 0.01983404169196519, "grad_norm": 0.5962453484535217, "learning_rate": 0.00019982487080906096, "loss": 0.9437, "step": 98 }, { "epoch": 0.020036429872495445, "grad_norm": 0.5518609881401062, "learning_rate": 0.00019982108547090093, "loss": 0.9639, "step": 99 }, { "epoch": 0.020238818053025704, "grad_norm": 0.5038187503814697, "learning_rate": 0.00019981725969641235, "loss": 0.975, "step": 100 }, { "epoch": 0.020238818053025704, "eval_loss": 1.005001187324524, "eval_runtime": 1.3203, "eval_samples_per_second": 3.787, "eval_steps_per_second": 0.757, "step": 100 }, { "epoch": 0.02044120623355596, "grad_norm": 0.6186122894287109, "learning_rate": 0.000199813393487145, "loss": 0.9775, "step": 101 }, { "epoch": 0.020643594414086218, "grad_norm": 0.5972813367843628, "learning_rate": 0.00019980948684466493, "loss": 0.8724, "step": 102 }, { "epoch": 0.020845982594616473, "grad_norm": 0.6249723434448242, "learning_rate": 0.00019980553977055483, "loss": 0.9211, "step": 103 }, { "epoch": 0.021048370775146732, "grad_norm": 0.6906153559684753, "learning_rate": 0.00019980155226641347, "loss": 1.0598, "step": 104 }, { "epoch": 0.021250758955676987, "grad_norm": 0.6543189287185669, "learning_rate": 0.00019979752433385627, "loss": 1.0285, "step": 105 }, { "epoch": 0.021453147136207246, "grad_norm": 0.6009184718132019, "learning_rate": 0.0001997934559745148, "loss": 0.9117, "step": 106 }, { "epoch": 0.0216555353167375, "grad_norm": 0.7261719703674316, "learning_rate": 0.00019978934719003716, "loss": 0.9212, "step": 107 }, { "epoch": 0.02185792349726776, "grad_norm": 0.5727847218513489, "learning_rate": 0.0001997851979820877, "loss": 0.8583, "step": 108 }, { "epoch": 0.022060311677798018, "grad_norm": 0.6676685810089111, "learning_rate": 0.0001997810083523473, "loss": 0.9655, "step": 109 }, { "epoch": 0.022262699858328273, "grad_norm": 0.6750203967094421, "learning_rate": 0.00019977677830251307, "loss": 1.0024, "step": 110 }, { "epoch": 0.022465088038858532, "grad_norm": 0.646692156791687, "learning_rate": 0.00019977250783429856, "loss": 0.9552, "step": 111 }, { "epoch": 0.022667476219388787, "grad_norm": 0.6072238087654114, "learning_rate": 0.00019976819694943375, "loss": 0.9803, "step": 112 }, { "epoch": 0.022869864399919046, "grad_norm": 0.5723574757575989, "learning_rate": 0.00019976384564966481, "loss": 0.8494, "step": 113 }, { "epoch": 0.0230722525804493, "grad_norm": 0.6598263382911682, "learning_rate": 0.00019975945393675451, "loss": 0.9903, "step": 114 }, { "epoch": 0.02327464076097956, "grad_norm": 0.5655977129936218, "learning_rate": 0.00019975502181248183, "loss": 0.9088, "step": 115 }, { "epoch": 0.023477028941509815, "grad_norm": 0.5673314929008484, "learning_rate": 0.00019975054927864213, "loss": 0.8747, "step": 116 }, { "epoch": 0.023679417122040074, "grad_norm": 0.5771192312240601, "learning_rate": 0.00019974603633704727, "loss": 0.9461, "step": 117 }, { "epoch": 0.02388180530257033, "grad_norm": 0.607169508934021, "learning_rate": 0.00019974148298952535, "loss": 0.9328, "step": 118 }, { "epoch": 0.024084193483100588, "grad_norm": 0.5910905599594116, "learning_rate": 0.00019973688923792087, "loss": 0.9596, "step": 119 }, { "epoch": 0.024286581663630843, "grad_norm": 0.5878623127937317, "learning_rate": 0.00019973225508409468, "loss": 0.9234, "step": 120 }, { "epoch": 0.0244889698441611, "grad_norm": 0.5838086009025574, "learning_rate": 0.00019972758052992404, "loss": 0.8643, "step": 121 }, { "epoch": 0.024691358024691357, "grad_norm": 0.5974692106246948, "learning_rate": 0.0001997228655773026, "loss": 0.962, "step": 122 }, { "epoch": 0.024893746205221615, "grad_norm": 0.5902374982833862, "learning_rate": 0.00019971811022814027, "loss": 0.8958, "step": 123 }, { "epoch": 0.02509613438575187, "grad_norm": 0.5953021049499512, "learning_rate": 0.0001997133144843634, "loss": 0.9473, "step": 124 }, { "epoch": 0.02529852256628213, "grad_norm": 0.6714699268341064, "learning_rate": 0.0001997084783479147, "loss": 0.9178, "step": 125 }, { "epoch": 0.025500910746812388, "grad_norm": 0.6421539187431335, "learning_rate": 0.00019970360182075325, "loss": 0.9202, "step": 126 }, { "epoch": 0.025703298927342643, "grad_norm": 0.5653443336486816, "learning_rate": 0.00019969868490485443, "loss": 0.9563, "step": 127 }, { "epoch": 0.025905687107872902, "grad_norm": 0.6230595707893372, "learning_rate": 0.00019969372760221004, "loss": 0.9789, "step": 128 }, { "epoch": 0.026108075288403157, "grad_norm": 0.6035506129264832, "learning_rate": 0.00019968872991482824, "loss": 0.8771, "step": 129 }, { "epoch": 0.026310463468933416, "grad_norm": 0.6146412491798401, "learning_rate": 0.00019968369184473353, "loss": 0.9315, "step": 130 }, { "epoch": 0.02651285164946367, "grad_norm": 0.6470977067947388, "learning_rate": 0.00019967861339396673, "loss": 0.9055, "step": 131 }, { "epoch": 0.02671523982999393, "grad_norm": 0.6507331728935242, "learning_rate": 0.0001996734945645851, "loss": 0.9258, "step": 132 }, { "epoch": 0.026917628010524185, "grad_norm": 0.5733680129051208, "learning_rate": 0.00019966833535866224, "loss": 0.934, "step": 133 }, { "epoch": 0.027120016191054443, "grad_norm": 0.6250080466270447, "learning_rate": 0.00019966313577828802, "loss": 0.8813, "step": 134 }, { "epoch": 0.0273224043715847, "grad_norm": 0.5467213988304138, "learning_rate": 0.00019965789582556878, "loss": 0.9385, "step": 135 }, { "epoch": 0.027524792552114957, "grad_norm": 0.5897491574287415, "learning_rate": 0.00019965261550262712, "loss": 0.9174, "step": 136 }, { "epoch": 0.027727180732645212, "grad_norm": 0.658240556716919, "learning_rate": 0.00019964729481160209, "loss": 0.9221, "step": 137 }, { "epoch": 0.02792956891317547, "grad_norm": 0.5418585538864136, "learning_rate": 0.00019964193375464898, "loss": 0.9339, "step": 138 }, { "epoch": 0.028131957093705726, "grad_norm": 0.590020477771759, "learning_rate": 0.00019963653233393954, "loss": 0.8821, "step": 139 }, { "epoch": 0.028334345274235985, "grad_norm": 0.5390585064888, "learning_rate": 0.0001996310905516618, "loss": 1.0112, "step": 140 }, { "epoch": 0.02853673345476624, "grad_norm": 0.558049201965332, "learning_rate": 0.0001996256084100201, "loss": 0.9541, "step": 141 }, { "epoch": 0.0287391216352965, "grad_norm": 0.6206382513046265, "learning_rate": 0.0001996200859112353, "loss": 0.8687, "step": 142 }, { "epoch": 0.028941509815826754, "grad_norm": 0.6574843525886536, "learning_rate": 0.00019961452305754444, "loss": 0.8893, "step": 143 }, { "epoch": 0.029143897996357013, "grad_norm": 0.6227236986160278, "learning_rate": 0.00019960891985120095, "loss": 0.9509, "step": 144 }, { "epoch": 0.02934628617688727, "grad_norm": 0.6090421676635742, "learning_rate": 0.00019960327629447468, "loss": 0.9742, "step": 145 }, { "epoch": 0.029548674357417527, "grad_norm": 0.6087396740913391, "learning_rate": 0.00019959759238965172, "loss": 0.9793, "step": 146 }, { "epoch": 0.029751062537947785, "grad_norm": 0.6357349157333374, "learning_rate": 0.00019959186813903456, "loss": 0.9473, "step": 147 }, { "epoch": 0.02995345071847804, "grad_norm": 0.6962687373161316, "learning_rate": 0.00019958610354494204, "loss": 0.903, "step": 148 }, { "epoch": 0.0301558388990083, "grad_norm": 0.6479822993278503, "learning_rate": 0.0001995802986097093, "loss": 0.8944, "step": 149 }, { "epoch": 0.030358227079538554, "grad_norm": 0.6951386332511902, "learning_rate": 0.00019957445333568786, "loss": 0.858, "step": 150 }, { "epoch": 0.030358227079538554, "eval_loss": 0.9309841394424438, "eval_runtime": 1.3151, "eval_samples_per_second": 3.802, "eval_steps_per_second": 0.76, "step": 150 }, { "epoch": 0.030560615260068813, "grad_norm": 0.7290776371955872, "learning_rate": 0.0001995685677252456, "loss": 0.9445, "step": 151 }, { "epoch": 0.030763003440599068, "grad_norm": 0.6124785542488098, "learning_rate": 0.00019956264178076666, "loss": 0.9573, "step": 152 }, { "epoch": 0.030965391621129327, "grad_norm": 0.6602783799171448, "learning_rate": 0.00019955667550465163, "loss": 0.8799, "step": 153 }, { "epoch": 0.031167779801659582, "grad_norm": 0.6001726984977722, "learning_rate": 0.0001995506688993173, "loss": 0.8796, "step": 154 }, { "epoch": 0.03137016798218984, "grad_norm": 0.6315905451774597, "learning_rate": 0.00019954462196719697, "loss": 0.935, "step": 155 }, { "epoch": 0.031572556162720096, "grad_norm": 0.6410464644432068, "learning_rate": 0.0001995385347107401, "loss": 0.9021, "step": 156 }, { "epoch": 0.03177494434325035, "grad_norm": 0.6680133938789368, "learning_rate": 0.0001995324071324126, "loss": 0.9685, "step": 157 }, { "epoch": 0.03197733252378061, "grad_norm": 0.6096596121788025, "learning_rate": 0.00019952623923469667, "loss": 0.9313, "step": 158 }, { "epoch": 0.03217972070431087, "grad_norm": 0.5973380208015442, "learning_rate": 0.00019952003102009083, "loss": 0.9305, "step": 159 }, { "epoch": 0.032382108884841124, "grad_norm": 0.5799363255500793, "learning_rate": 0.00019951378249111, "loss": 0.8796, "step": 160 }, { "epoch": 0.03258449706537138, "grad_norm": 0.6738299131393433, "learning_rate": 0.0001995074936502854, "loss": 0.8991, "step": 161 }, { "epoch": 0.03278688524590164, "grad_norm": 0.6220852136611938, "learning_rate": 0.00019950116450016445, "loss": 0.851, "step": 162 }, { "epoch": 0.032989273426431896, "grad_norm": 0.6165013909339905, "learning_rate": 0.00019949479504331112, "loss": 0.8693, "step": 163 }, { "epoch": 0.03319166160696215, "grad_norm": 0.7656255960464478, "learning_rate": 0.0001994883852823056, "loss": 0.8686, "step": 164 }, { "epoch": 0.033394049787492414, "grad_norm": 0.7385483384132385, "learning_rate": 0.00019948193521974434, "loss": 0.9033, "step": 165 }, { "epoch": 0.03359643796802267, "grad_norm": 0.6244580745697021, "learning_rate": 0.00019947544485824023, "loss": 0.8866, "step": 166 }, { "epoch": 0.033798826148552924, "grad_norm": 0.6161398887634277, "learning_rate": 0.00019946891420042243, "loss": 0.8675, "step": 167 }, { "epoch": 0.03400121432908318, "grad_norm": 0.6833418607711792, "learning_rate": 0.00019946234324893646, "loss": 0.8278, "step": 168 }, { "epoch": 0.03420360250961344, "grad_norm": 0.6483539938926697, "learning_rate": 0.00019945573200644406, "loss": 0.8773, "step": 169 }, { "epoch": 0.0344059906901437, "grad_norm": 0.6794087290763855, "learning_rate": 0.0001994490804756234, "loss": 0.9054, "step": 170 }, { "epoch": 0.03460837887067395, "grad_norm": 0.6649914383888245, "learning_rate": 0.00019944238865916898, "loss": 0.8592, "step": 171 }, { "epoch": 0.03481076705120421, "grad_norm": 0.6133947372436523, "learning_rate": 0.00019943565655979154, "loss": 0.9038, "step": 172 }, { "epoch": 0.03501315523173447, "grad_norm": 0.6577770113945007, "learning_rate": 0.00019942888418021814, "loss": 0.8177, "step": 173 }, { "epoch": 0.035215543412264724, "grad_norm": 0.6792947053909302, "learning_rate": 0.00019942207152319225, "loss": 0.811, "step": 174 }, { "epoch": 0.03541793159279498, "grad_norm": 0.6069537997245789, "learning_rate": 0.00019941521859147358, "loss": 0.8304, "step": 175 }, { "epoch": 0.035620319773325235, "grad_norm": 0.7489471435546875, "learning_rate": 0.00019940832538783813, "loss": 0.8619, "step": 176 }, { "epoch": 0.0358227079538555, "grad_norm": 0.6735177636146545, "learning_rate": 0.00019940139191507828, "loss": 0.8761, "step": 177 }, { "epoch": 0.03602509613438575, "grad_norm": 0.6483293175697327, "learning_rate": 0.00019939441817600273, "loss": 0.8032, "step": 178 }, { "epoch": 0.03622748431491601, "grad_norm": 0.7031853795051575, "learning_rate": 0.0001993874041734364, "loss": 0.7786, "step": 179 }, { "epoch": 0.03642987249544627, "grad_norm": 0.7454729676246643, "learning_rate": 0.00019938034991022062, "loss": 0.8408, "step": 180 }, { "epoch": 0.036632260675976525, "grad_norm": 0.629190981388092, "learning_rate": 0.00019937325538921298, "loss": 0.7784, "step": 181 }, { "epoch": 0.03683464885650678, "grad_norm": 0.7828189134597778, "learning_rate": 0.0001993661206132874, "loss": 0.841, "step": 182 }, { "epoch": 0.037037037037037035, "grad_norm": 0.7269335985183716, "learning_rate": 0.000199358945585334, "loss": 0.8545, "step": 183 }, { "epoch": 0.0372394252175673, "grad_norm": 0.6851218342781067, "learning_rate": 0.00019935173030825943, "loss": 0.7988, "step": 184 }, { "epoch": 0.03744181339809755, "grad_norm": 0.716194748878479, "learning_rate": 0.00019934447478498645, "loss": 0.8469, "step": 185 }, { "epoch": 0.03764420157862781, "grad_norm": 0.8001633882522583, "learning_rate": 0.0001993371790184542, "loss": 0.709, "step": 186 }, { "epoch": 0.03784658975915806, "grad_norm": 0.9564402103424072, "learning_rate": 0.00019932984301161807, "loss": 0.7762, "step": 187 }, { "epoch": 0.038048977939688325, "grad_norm": 1.243919849395752, "learning_rate": 0.00019932246676744985, "loss": 0.7994, "step": 188 }, { "epoch": 0.03825136612021858, "grad_norm": 0.6786725521087646, "learning_rate": 0.0001993150502889375, "loss": 0.8775, "step": 189 }, { "epoch": 0.038453754300748835, "grad_norm": 1.0258820056915283, "learning_rate": 0.0001993075935790854, "loss": 0.8347, "step": 190 }, { "epoch": 0.03865614248127909, "grad_norm": 0.6894447803497314, "learning_rate": 0.00019930009664091412, "loss": 0.8879, "step": 191 }, { "epoch": 0.03885853066180935, "grad_norm": 0.8886065483093262, "learning_rate": 0.00019929255947746067, "loss": 0.7962, "step": 192 }, { "epoch": 0.03906091884233961, "grad_norm": 0.7580958604812622, "learning_rate": 0.00019928498209177817, "loss": 0.8185, "step": 193 }, { "epoch": 0.03926330702286986, "grad_norm": 0.9674456715583801, "learning_rate": 0.00019927736448693617, "loss": 0.8535, "step": 194 }, { "epoch": 0.03946569520340012, "grad_norm": 0.9685388207435608, "learning_rate": 0.00019926970666602048, "loss": 0.875, "step": 195 }, { "epoch": 0.03966808338393038, "grad_norm": 0.8533955812454224, "learning_rate": 0.0001992620086321332, "loss": 0.9014, "step": 196 }, { "epoch": 0.039870471564460636, "grad_norm": 0.9767157435417175, "learning_rate": 0.00019925427038839267, "loss": 0.9269, "step": 197 }, { "epoch": 0.04007285974499089, "grad_norm": 0.941107988357544, "learning_rate": 0.0001992464919379336, "loss": 0.7302, "step": 198 }, { "epoch": 0.04027524792552115, "grad_norm": 0.8190636038780212, "learning_rate": 0.0001992386732839069, "loss": 0.8963, "step": 199 }, { "epoch": 0.04047763610605141, "grad_norm": 0.9992904663085938, "learning_rate": 0.00019923081442947983, "loss": 0.7371, "step": 200 }, { "epoch": 0.04047763610605141, "eval_loss": 0.8278228640556335, "eval_runtime": 1.3128, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.762, "step": 200 }, { "epoch": 0.04068002428658166, "grad_norm": 1.1904680728912354, "learning_rate": 0.000199222915377836, "loss": 0.8449, "step": 201 }, { "epoch": 0.04088241246711192, "grad_norm": 1.2412151098251343, "learning_rate": 0.0001992149761321751, "loss": 0.8524, "step": 202 }, { "epoch": 0.04108480064764218, "grad_norm": 0.8818174004554749, "learning_rate": 0.00019920699669571327, "loss": 0.7295, "step": 203 }, { "epoch": 0.041287188828172436, "grad_norm": 1.1238662004470825, "learning_rate": 0.0001991989770716829, "loss": 0.7717, "step": 204 }, { "epoch": 0.04148957700870269, "grad_norm": 0.9506884813308716, "learning_rate": 0.00019919091726333267, "loss": 0.8635, "step": 205 }, { "epoch": 0.041691965189232946, "grad_norm": 0.9091785550117493, "learning_rate": 0.00019918281727392744, "loss": 0.8792, "step": 206 }, { "epoch": 0.04189435336976321, "grad_norm": 1.376017689704895, "learning_rate": 0.00019917467710674845, "loss": 0.8006, "step": 207 }, { "epoch": 0.042096741550293464, "grad_norm": 1.0959804058074951, "learning_rate": 0.0001991664967650932, "loss": 0.7992, "step": 208 }, { "epoch": 0.04229912973082372, "grad_norm": 0.9853761196136475, "learning_rate": 0.00019915827625227546, "loss": 0.7778, "step": 209 }, { "epoch": 0.042501517911353974, "grad_norm": 1.0245534181594849, "learning_rate": 0.0001991500155716252, "loss": 0.645, "step": 210 }, { "epoch": 0.042703906091884236, "grad_norm": 1.2277005910873413, "learning_rate": 0.0001991417147264888, "loss": 0.7919, "step": 211 }, { "epoch": 0.04290629427241449, "grad_norm": 1.0931971073150635, "learning_rate": 0.00019913337372022877, "loss": 0.7563, "step": 212 }, { "epoch": 0.04310868245294475, "grad_norm": 1.6419724225997925, "learning_rate": 0.00019912499255622396, "loss": 0.8459, "step": 213 }, { "epoch": 0.043311070633475, "grad_norm": 1.1451297998428345, "learning_rate": 0.00019911657123786953, "loss": 0.7238, "step": 214 }, { "epoch": 0.043513458814005264, "grad_norm": 1.3817908763885498, "learning_rate": 0.00019910810976857683, "loss": 0.7197, "step": 215 }, { "epoch": 0.04371584699453552, "grad_norm": 0.8958253860473633, "learning_rate": 0.0001990996081517735, "loss": 0.6527, "step": 216 }, { "epoch": 0.043918235175065774, "grad_norm": 1.8711469173431396, "learning_rate": 0.0001990910663909034, "loss": 0.72, "step": 217 }, { "epoch": 0.044120623355596036, "grad_norm": 1.3028295040130615, "learning_rate": 0.00019908248448942674, "loss": 0.698, "step": 218 }, { "epoch": 0.04432301153612629, "grad_norm": 1.1462165117263794, "learning_rate": 0.00019907386245081993, "loss": 0.6602, "step": 219 }, { "epoch": 0.04452539971665655, "grad_norm": 1.4850302934646606, "learning_rate": 0.00019906520027857566, "loss": 0.6217, "step": 220 }, { "epoch": 0.0447277878971868, "grad_norm": 1.1862238645553589, "learning_rate": 0.0001990564979762029, "loss": 0.6577, "step": 221 }, { "epoch": 0.044930176077717064, "grad_norm": 1.1340328454971313, "learning_rate": 0.0001990477555472268, "loss": 0.591, "step": 222 }, { "epoch": 0.04513256425824732, "grad_norm": 1.2525830268859863, "learning_rate": 0.00019903897299518886, "loss": 0.624, "step": 223 }, { "epoch": 0.045334952438777575, "grad_norm": 1.3443201780319214, "learning_rate": 0.00019903015032364673, "loss": 0.6209, "step": 224 }, { "epoch": 0.04553734061930783, "grad_norm": 1.2040412425994873, "learning_rate": 0.0001990212875361744, "loss": 0.527, "step": 225 }, { "epoch": 0.04573972879983809, "grad_norm": 1.094304084777832, "learning_rate": 0.0001990123846363621, "loss": 0.522, "step": 226 }, { "epoch": 0.04594211698036835, "grad_norm": 1.0972617864608765, "learning_rate": 0.00019900344162781627, "loss": 0.4912, "step": 227 }, { "epoch": 0.0461445051608986, "grad_norm": 0.9556677341461182, "learning_rate": 0.00019899445851415966, "loss": 0.4907, "step": 228 }, { "epoch": 0.04634689334142886, "grad_norm": 0.9740906953811646, "learning_rate": 0.0001989854352990311, "loss": 0.5065, "step": 229 }, { "epoch": 0.04654928152195912, "grad_norm": 1.0087182521820068, "learning_rate": 0.0001989763719860859, "loss": 0.4701, "step": 230 }, { "epoch": 0.046751669702489375, "grad_norm": 0.9313437342643738, "learning_rate": 0.00019896726857899545, "loss": 0.4311, "step": 231 }, { "epoch": 0.04695405788301963, "grad_norm": 0.9619327783584595, "learning_rate": 0.00019895812508144745, "loss": 0.4388, "step": 232 }, { "epoch": 0.047156446063549885, "grad_norm": 0.9713581800460815, "learning_rate": 0.00019894894149714585, "loss": 0.5709, "step": 233 }, { "epoch": 0.04735883424408015, "grad_norm": 0.9759028553962708, "learning_rate": 0.00019893971782981073, "loss": 0.433, "step": 234 }, { "epoch": 0.0475612224246104, "grad_norm": 1.1055574417114258, "learning_rate": 0.00019893045408317855, "loss": 0.4481, "step": 235 }, { "epoch": 0.04776361060514066, "grad_norm": 0.9863013029098511, "learning_rate": 0.00019892115026100193, "loss": 0.4446, "step": 236 }, { "epoch": 0.04796599878567092, "grad_norm": 0.8422528505325317, "learning_rate": 0.00019891180636704973, "loss": 0.4416, "step": 237 }, { "epoch": 0.048168386966201175, "grad_norm": 0.9537456631660461, "learning_rate": 0.00019890242240510705, "loss": 0.3621, "step": 238 }, { "epoch": 0.04837077514673143, "grad_norm": 1.1801033020019531, "learning_rate": 0.00019889299837897524, "loss": 0.43, "step": 239 }, { "epoch": 0.048573163327261686, "grad_norm": 0.8404889106750488, "learning_rate": 0.00019888353429247185, "loss": 0.3996, "step": 240 }, { "epoch": 0.04877555150779195, "grad_norm": 1.1369571685791016, "learning_rate": 0.00019887403014943062, "loss": 0.4145, "step": 241 }, { "epoch": 0.0489779396883222, "grad_norm": 0.7849044799804688, "learning_rate": 0.00019886448595370163, "loss": 0.4066, "step": 242 }, { "epoch": 0.04918032786885246, "grad_norm": 0.9194380640983582, "learning_rate": 0.00019885490170915113, "loss": 0.395, "step": 243 }, { "epoch": 0.04938271604938271, "grad_norm": 0.7878439426422119, "learning_rate": 0.0001988452774196615, "loss": 0.3941, "step": 244 }, { "epoch": 0.049585104229912975, "grad_norm": 0.8196842670440674, "learning_rate": 0.00019883561308913154, "loss": 0.3547, "step": 245 }, { "epoch": 0.04978749241044323, "grad_norm": 1.0760830640792847, "learning_rate": 0.00019882590872147602, "loss": 0.373, "step": 246 }, { "epoch": 0.049989880590973486, "grad_norm": 0.7927395105361938, "learning_rate": 0.00019881616432062619, "loss": 0.4096, "step": 247 }, { "epoch": 0.05019226877150374, "grad_norm": 0.7793158292770386, "learning_rate": 0.0001988063798905293, "loss": 0.3739, "step": 248 }, { "epoch": 0.050394656952034, "grad_norm": 0.7429131269454956, "learning_rate": 0.00019879655543514894, "loss": 0.3881, "step": 249 }, { "epoch": 0.05059704513256426, "grad_norm": 0.758611261844635, "learning_rate": 0.0001987866909584649, "loss": 0.4018, "step": 250 }, { "epoch": 0.05059704513256426, "eval_loss": 0.4100117087364197, "eval_runtime": 1.3156, "eval_samples_per_second": 3.8, "eval_steps_per_second": 0.76, "step": 250 }, { "epoch": 0.050799433313094514, "grad_norm": 0.7496142387390137, "learning_rate": 0.00019877678646447312, "loss": 0.3526, "step": 251 }, { "epoch": 0.051001821493624776, "grad_norm": 0.7281865477561951, "learning_rate": 0.0001987668419571858, "loss": 0.3601, "step": 252 }, { "epoch": 0.05120420967415503, "grad_norm": 0.7838665246963501, "learning_rate": 0.00019875685744063137, "loss": 0.3698, "step": 253 }, { "epoch": 0.051406597854685286, "grad_norm": 0.9405313730239868, "learning_rate": 0.00019874683291885444, "loss": 0.4373, "step": 254 }, { "epoch": 0.05160898603521554, "grad_norm": 0.5769254565238953, "learning_rate": 0.00019873676839591577, "loss": 0.3632, "step": 255 }, { "epoch": 0.051811374215745803, "grad_norm": 0.6190117001533508, "learning_rate": 0.00019872666387589245, "loss": 0.368, "step": 256 }, { "epoch": 0.05201376239627606, "grad_norm": 0.6636627316474915, "learning_rate": 0.00019871651936287764, "loss": 0.3578, "step": 257 }, { "epoch": 0.052216150576806314, "grad_norm": 0.7890253067016602, "learning_rate": 0.0001987063348609808, "loss": 0.3578, "step": 258 }, { "epoch": 0.05241853875733657, "grad_norm": 0.6671871542930603, "learning_rate": 0.00019869611037432754, "loss": 0.3109, "step": 259 }, { "epoch": 0.05262092693786683, "grad_norm": 0.7174453735351562, "learning_rate": 0.00019868584590705966, "loss": 0.3767, "step": 260 }, { "epoch": 0.052823315118397086, "grad_norm": 0.8265895843505859, "learning_rate": 0.00019867554146333518, "loss": 0.3931, "step": 261 }, { "epoch": 0.05302570329892734, "grad_norm": 0.6119767427444458, "learning_rate": 0.00019866519704732832, "loss": 0.3255, "step": 262 }, { "epoch": 0.0532280914794576, "grad_norm": 0.6065880656242371, "learning_rate": 0.00019865481266322947, "loss": 0.325, "step": 263 }, { "epoch": 0.05343047965998786, "grad_norm": 0.8321945667266846, "learning_rate": 0.0001986443883152452, "loss": 0.3791, "step": 264 }, { "epoch": 0.053632867840518114, "grad_norm": 0.5443018674850464, "learning_rate": 0.00019863392400759836, "loss": 0.3363, "step": 265 }, { "epoch": 0.05383525602104837, "grad_norm": 0.546472430229187, "learning_rate": 0.00019862341974452786, "loss": 0.3166, "step": 266 }, { "epoch": 0.054037644201578625, "grad_norm": 0.6580498814582825, "learning_rate": 0.00019861287553028881, "loss": 0.3272, "step": 267 }, { "epoch": 0.05424003238210889, "grad_norm": 0.6845491528511047, "learning_rate": 0.00019860229136915263, "loss": 0.2889, "step": 268 }, { "epoch": 0.05444242056263914, "grad_norm": 0.6450967192649841, "learning_rate": 0.0001985916672654068, "loss": 0.3289, "step": 269 }, { "epoch": 0.0546448087431694, "grad_norm": 0.5730056762695312, "learning_rate": 0.000198581003223355, "loss": 0.2763, "step": 270 }, { "epoch": 0.05484719692369966, "grad_norm": 0.6887822151184082, "learning_rate": 0.0001985702992473171, "loss": 0.3047, "step": 271 }, { "epoch": 0.055049585104229914, "grad_norm": 0.5276957154273987, "learning_rate": 0.00019855955534162924, "loss": 0.3341, "step": 272 }, { "epoch": 0.05525197328476017, "grad_norm": 0.5062105059623718, "learning_rate": 0.00019854877151064356, "loss": 0.3397, "step": 273 }, { "epoch": 0.055454361465290425, "grad_norm": 0.7230908870697021, "learning_rate": 0.00019853794775872845, "loss": 0.3279, "step": 274 }, { "epoch": 0.05565674964582069, "grad_norm": 0.6312658786773682, "learning_rate": 0.00019852708409026857, "loss": 0.324, "step": 275 }, { "epoch": 0.05585913782635094, "grad_norm": 0.6563535928726196, "learning_rate": 0.00019851618050966457, "loss": 0.3666, "step": 276 }, { "epoch": 0.0560615260068812, "grad_norm": 0.5921420454978943, "learning_rate": 0.00019850523702133338, "loss": 0.3438, "step": 277 }, { "epoch": 0.05626391418741145, "grad_norm": 0.6209537386894226, "learning_rate": 0.00019849425362970818, "loss": 0.3137, "step": 278 }, { "epoch": 0.056466302367941715, "grad_norm": 0.6404590010643005, "learning_rate": 0.00019848323033923806, "loss": 0.3234, "step": 279 }, { "epoch": 0.05666869054847197, "grad_norm": 0.5893368124961853, "learning_rate": 0.0001984721671543885, "loss": 0.3398, "step": 280 }, { "epoch": 0.056871078729002225, "grad_norm": 0.6329981088638306, "learning_rate": 0.00019846106407964107, "loss": 0.3718, "step": 281 }, { "epoch": 0.05707346690953248, "grad_norm": 0.5510638952255249, "learning_rate": 0.00019844992111949346, "loss": 0.3017, "step": 282 }, { "epoch": 0.05727585509006274, "grad_norm": 0.5538240075111389, "learning_rate": 0.00019843873827845955, "loss": 0.264, "step": 283 }, { "epoch": 0.057478243270593, "grad_norm": 0.5221415162086487, "learning_rate": 0.00019842751556106944, "loss": 0.3574, "step": 284 }, { "epoch": 0.05768063145112325, "grad_norm": 0.6255350112915039, "learning_rate": 0.00019841625297186926, "loss": 0.3096, "step": 285 }, { "epoch": 0.05788301963165351, "grad_norm": 0.6035017967224121, "learning_rate": 0.00019840495051542134, "loss": 0.3362, "step": 286 }, { "epoch": 0.05808540781218377, "grad_norm": 0.5840286612510681, "learning_rate": 0.0001983936081963042, "loss": 0.347, "step": 287 }, { "epoch": 0.058287795992714025, "grad_norm": 0.5255894660949707, "learning_rate": 0.00019838222601911248, "loss": 0.3418, "step": 288 }, { "epoch": 0.05849018417324428, "grad_norm": 0.5202164053916931, "learning_rate": 0.0001983708039884569, "loss": 0.3649, "step": 289 }, { "epoch": 0.05869257235377454, "grad_norm": 0.45187389850616455, "learning_rate": 0.0001983593421089645, "loss": 0.25, "step": 290 }, { "epoch": 0.0588949605343048, "grad_norm": 0.4174633026123047, "learning_rate": 0.00019834784038527827, "loss": 0.2832, "step": 291 }, { "epoch": 0.05909734871483505, "grad_norm": 0.6519940495491028, "learning_rate": 0.00019833629882205745, "loss": 0.279, "step": 292 }, { "epoch": 0.05929973689536531, "grad_norm": 0.475143164396286, "learning_rate": 0.00019832471742397737, "loss": 0.2811, "step": 293 }, { "epoch": 0.05950212507589557, "grad_norm": 0.489053338766098, "learning_rate": 0.00019831309619572953, "loss": 0.3084, "step": 294 }, { "epoch": 0.059704513256425826, "grad_norm": 0.5794592499732971, "learning_rate": 0.00019830143514202155, "loss": 0.3191, "step": 295 }, { "epoch": 0.05990690143695608, "grad_norm": 0.5311474800109863, "learning_rate": 0.00019828973426757717, "loss": 0.3007, "step": 296 }, { "epoch": 0.060109289617486336, "grad_norm": 0.43288078904151917, "learning_rate": 0.00019827799357713632, "loss": 0.279, "step": 297 }, { "epoch": 0.0603116777980166, "grad_norm": 0.6474811434745789, "learning_rate": 0.00019826621307545496, "loss": 0.3149, "step": 298 }, { "epoch": 0.060514065978546853, "grad_norm": 0.7450769543647766, "learning_rate": 0.00019825439276730525, "loss": 0.3231, "step": 299 }, { "epoch": 0.06071645415907711, "grad_norm": 0.5388215184211731, "learning_rate": 0.00019824253265747545, "loss": 0.2864, "step": 300 }, { "epoch": 0.06071645415907711, "eval_loss": 0.3300868272781372, "eval_runtime": 1.3126, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.762, "step": 300 }, { "epoch": 0.060918842339607364, "grad_norm": 0.5626067519187927, "learning_rate": 0.00019823063275076997, "loss": 0.2918, "step": 301 }, { "epoch": 0.061121230520137626, "grad_norm": 0.612052321434021, "learning_rate": 0.0001982186930520093, "loss": 0.3608, "step": 302 }, { "epoch": 0.06132361870066788, "grad_norm": 0.5095520615577698, "learning_rate": 0.00019820671356603005, "loss": 0.329, "step": 303 }, { "epoch": 0.061526006881198136, "grad_norm": 0.5314645767211914, "learning_rate": 0.000198194694297685, "loss": 0.2432, "step": 304 }, { "epoch": 0.06172839506172839, "grad_norm": 0.43283140659332275, "learning_rate": 0.000198182635251843, "loss": 0.2811, "step": 305 }, { "epoch": 0.061930783242258654, "grad_norm": 0.544320285320282, "learning_rate": 0.00019817053643338906, "loss": 0.3281, "step": 306 }, { "epoch": 0.06213317142278891, "grad_norm": 0.5179933309555054, "learning_rate": 0.0001981583978472242, "loss": 0.3093, "step": 307 }, { "epoch": 0.062335559603319164, "grad_norm": 0.5187379717826843, "learning_rate": 0.00019814621949826563, "loss": 0.3001, "step": 308 }, { "epoch": 0.06253794778384943, "grad_norm": 0.4313293695449829, "learning_rate": 0.0001981340013914467, "loss": 0.2986, "step": 309 }, { "epoch": 0.06274033596437968, "grad_norm": 0.7100459933280945, "learning_rate": 0.00019812174353171682, "loss": 0.2993, "step": 310 }, { "epoch": 0.06294272414490994, "grad_norm": 0.4955151081085205, "learning_rate": 0.00019810944592404145, "loss": 0.3333, "step": 311 }, { "epoch": 0.06314511232544019, "grad_norm": 0.792910635471344, "learning_rate": 0.0001980971085734022, "loss": 0.3206, "step": 312 }, { "epoch": 0.06334750050597045, "grad_norm": 0.6413151621818542, "learning_rate": 0.00019808473148479684, "loss": 0.3617, "step": 313 }, { "epoch": 0.0635498886865007, "grad_norm": 0.4688740074634552, "learning_rate": 0.00019807231466323916, "loss": 0.2874, "step": 314 }, { "epoch": 0.06375227686703097, "grad_norm": 0.567500650882721, "learning_rate": 0.00019805985811375906, "loss": 0.3266, "step": 315 }, { "epoch": 0.06395466504756123, "grad_norm": 0.5868168473243713, "learning_rate": 0.00019804736184140254, "loss": 0.3257, "step": 316 }, { "epoch": 0.06415705322809148, "grad_norm": 0.5254923105239868, "learning_rate": 0.00019803482585123165, "loss": 0.3435, "step": 317 }, { "epoch": 0.06435944140862174, "grad_norm": 0.4791468679904938, "learning_rate": 0.00019802225014832464, "loss": 0.2934, "step": 318 }, { "epoch": 0.06456182958915199, "grad_norm": 0.6540399193763733, "learning_rate": 0.00019800963473777573, "loss": 0.2645, "step": 319 }, { "epoch": 0.06476421776968225, "grad_norm": 0.5253922939300537, "learning_rate": 0.0001979969796246953, "loss": 0.339, "step": 320 }, { "epoch": 0.0649666059502125, "grad_norm": 0.5550575256347656, "learning_rate": 0.0001979842848142098, "loss": 0.2993, "step": 321 }, { "epoch": 0.06516899413074276, "grad_norm": 0.48955628275871277, "learning_rate": 0.00019797155031146168, "loss": 0.2494, "step": 322 }, { "epoch": 0.06537138231127303, "grad_norm": 0.4756161868572235, "learning_rate": 0.00019795877612160958, "loss": 0.3264, "step": 323 }, { "epoch": 0.06557377049180328, "grad_norm": 0.6048707962036133, "learning_rate": 0.00019794596224982817, "loss": 0.2771, "step": 324 }, { "epoch": 0.06577615867233354, "grad_norm": 0.6601628065109253, "learning_rate": 0.0001979331087013082, "loss": 0.3563, "step": 325 }, { "epoch": 0.06597854685286379, "grad_norm": 0.4659283459186554, "learning_rate": 0.00019792021548125646, "loss": 0.2808, "step": 326 }, { "epoch": 0.06618093503339405, "grad_norm": 0.7773919105529785, "learning_rate": 0.00019790728259489585, "loss": 0.3156, "step": 327 }, { "epoch": 0.0663833232139243, "grad_norm": 0.5949811935424805, "learning_rate": 0.00019789431004746538, "loss": 0.2825, "step": 328 }, { "epoch": 0.06658571139445456, "grad_norm": 0.4546753466129303, "learning_rate": 0.00019788129784421998, "loss": 0.2758, "step": 329 }, { "epoch": 0.06678809957498483, "grad_norm": 0.4924231767654419, "learning_rate": 0.00019786824599043082, "loss": 0.2967, "step": 330 }, { "epoch": 0.06699048775551508, "grad_norm": 0.4607464373111725, "learning_rate": 0.000197855154491385, "loss": 0.2849, "step": 331 }, { "epoch": 0.06719287593604534, "grad_norm": 0.42234212160110474, "learning_rate": 0.0001978420233523858, "loss": 0.2947, "step": 332 }, { "epoch": 0.06739526411657559, "grad_norm": 0.4296901226043701, "learning_rate": 0.00019782885257875238, "loss": 0.2809, "step": 333 }, { "epoch": 0.06759765229710585, "grad_norm": 0.7048768401145935, "learning_rate": 0.00019781564217582015, "loss": 0.2581, "step": 334 }, { "epoch": 0.0678000404776361, "grad_norm": 0.6850839853286743, "learning_rate": 0.00019780239214894042, "loss": 0.2787, "step": 335 }, { "epoch": 0.06800242865816636, "grad_norm": 0.5054475665092468, "learning_rate": 0.0001977891025034807, "loss": 0.2938, "step": 336 }, { "epoch": 0.06820481683869661, "grad_norm": 0.4846641719341278, "learning_rate": 0.0001977757732448244, "loss": 0.2563, "step": 337 }, { "epoch": 0.06840720501922688, "grad_norm": 0.37315550446510315, "learning_rate": 0.00019776240437837105, "loss": 0.2856, "step": 338 }, { "epoch": 0.06860959319975714, "grad_norm": 0.7993639707565308, "learning_rate": 0.00019774899590953624, "loss": 0.2839, "step": 339 }, { "epoch": 0.0688119813802874, "grad_norm": 0.5315250754356384, "learning_rate": 0.00019773554784375155, "loss": 0.2567, "step": 340 }, { "epoch": 0.06901436956081765, "grad_norm": 0.5073575377464294, "learning_rate": 0.0001977220601864647, "loss": 0.3166, "step": 341 }, { "epoch": 0.0692167577413479, "grad_norm": 0.4915961027145386, "learning_rate": 0.00019770853294313933, "loss": 0.3005, "step": 342 }, { "epoch": 0.06941914592187816, "grad_norm": 0.6062594056129456, "learning_rate": 0.00019769496611925512, "loss": 0.3304, "step": 343 }, { "epoch": 0.06962153410240841, "grad_norm": 0.4515087902545929, "learning_rate": 0.00019768135972030788, "loss": 0.318, "step": 344 }, { "epoch": 0.06982392228293868, "grad_norm": 0.5379495620727539, "learning_rate": 0.0001976677137518094, "loss": 0.3571, "step": 345 }, { "epoch": 0.07002631046346894, "grad_norm": 0.4913288950920105, "learning_rate": 0.0001976540282192875, "loss": 0.2629, "step": 346 }, { "epoch": 0.0702286986439992, "grad_norm": 0.42056629061698914, "learning_rate": 0.00019764030312828602, "loss": 0.2337, "step": 347 }, { "epoch": 0.07043108682452945, "grad_norm": 0.5364342331886292, "learning_rate": 0.00019762653848436483, "loss": 0.2978, "step": 348 }, { "epoch": 0.0706334750050597, "grad_norm": 0.4940585196018219, "learning_rate": 0.0001976127342930998, "loss": 0.2657, "step": 349 }, { "epoch": 0.07083586318558996, "grad_norm": 0.49181851744651794, "learning_rate": 0.00019759889056008287, "loss": 0.2972, "step": 350 }, { "epoch": 0.07083586318558996, "eval_loss": 0.32093870639801025, "eval_runtime": 1.323, "eval_samples_per_second": 3.779, "eval_steps_per_second": 0.756, "step": 350 }, { "epoch": 0.07103825136612021, "grad_norm": 0.5329145193099976, "learning_rate": 0.000197585007290922, "loss": 0.2985, "step": 351 }, { "epoch": 0.07124063954665047, "grad_norm": 0.5908301472663879, "learning_rate": 0.00019757108449124105, "loss": 0.3073, "step": 352 }, { "epoch": 0.07144302772718074, "grad_norm": 0.5759735107421875, "learning_rate": 0.00019755712216668004, "loss": 0.2985, "step": 353 }, { "epoch": 0.071645415907711, "grad_norm": 0.41807422041893005, "learning_rate": 0.00019754312032289496, "loss": 0.269, "step": 354 }, { "epoch": 0.07184780408824125, "grad_norm": 0.4687816798686981, "learning_rate": 0.00019752907896555778, "loss": 0.3287, "step": 355 }, { "epoch": 0.0720501922687715, "grad_norm": 0.4629111886024475, "learning_rate": 0.0001975149981003564, "loss": 0.2961, "step": 356 }, { "epoch": 0.07225258044930176, "grad_norm": 0.506994903087616, "learning_rate": 0.00019750087773299493, "loss": 0.2845, "step": 357 }, { "epoch": 0.07245496862983201, "grad_norm": 0.4481952488422394, "learning_rate": 0.00019748671786919326, "loss": 0.29, "step": 358 }, { "epoch": 0.07265735681036227, "grad_norm": 0.5964845418930054, "learning_rate": 0.00019747251851468745, "loss": 0.3084, "step": 359 }, { "epoch": 0.07285974499089254, "grad_norm": 0.3764006793498993, "learning_rate": 0.0001974582796752295, "loss": 0.2746, "step": 360 }, { "epoch": 0.0730621331714228, "grad_norm": 0.5635024309158325, "learning_rate": 0.00019744400135658733, "loss": 0.307, "step": 361 }, { "epoch": 0.07326452135195305, "grad_norm": 0.5087900757789612, "learning_rate": 0.00019742968356454497, "loss": 0.3083, "step": 362 }, { "epoch": 0.0734669095324833, "grad_norm": 0.4915212094783783, "learning_rate": 0.00019741532630490233, "loss": 0.3161, "step": 363 }, { "epoch": 0.07366929771301356, "grad_norm": 0.39608627557754517, "learning_rate": 0.00019740092958347544, "loss": 0.2822, "step": 364 }, { "epoch": 0.07387168589354381, "grad_norm": 0.3772413432598114, "learning_rate": 0.0001973864934060962, "loss": 0.2656, "step": 365 }, { "epoch": 0.07407407407407407, "grad_norm": 0.4396105706691742, "learning_rate": 0.0001973720177786125, "loss": 0.2768, "step": 366 }, { "epoch": 0.07427646225460433, "grad_norm": 0.4604906439781189, "learning_rate": 0.0001973575027068883, "loss": 0.2732, "step": 367 }, { "epoch": 0.0744788504351346, "grad_norm": 0.4070630669593811, "learning_rate": 0.00019734294819680345, "loss": 0.3195, "step": 368 }, { "epoch": 0.07468123861566485, "grad_norm": 0.3994045555591583, "learning_rate": 0.0001973283542542538, "loss": 0.25, "step": 369 }, { "epoch": 0.0748836267961951, "grad_norm": 0.5679988265037537, "learning_rate": 0.00019731372088515123, "loss": 0.3029, "step": 370 }, { "epoch": 0.07508601497672536, "grad_norm": 0.3986324369907379, "learning_rate": 0.0001972990480954235, "loss": 0.2559, "step": 371 }, { "epoch": 0.07528840315725562, "grad_norm": 0.43111109733581543, "learning_rate": 0.00019728433589101437, "loss": 0.2291, "step": 372 }, { "epoch": 0.07549079133778587, "grad_norm": 0.38930702209472656, "learning_rate": 0.00019726958427788364, "loss": 0.2562, "step": 373 }, { "epoch": 0.07569317951831613, "grad_norm": 0.4614977240562439, "learning_rate": 0.00019725479326200692, "loss": 0.2628, "step": 374 }, { "epoch": 0.07589556769884638, "grad_norm": 0.4017693102359772, "learning_rate": 0.00019723996284937598, "loss": 0.2828, "step": 375 }, { "epoch": 0.07609795587937665, "grad_norm": 0.5743600130081177, "learning_rate": 0.00019722509304599838, "loss": 0.3018, "step": 376 }, { "epoch": 0.0763003440599069, "grad_norm": 0.5052629113197327, "learning_rate": 0.0001972101838578977, "loss": 0.2877, "step": 377 }, { "epoch": 0.07650273224043716, "grad_norm": 0.5050588250160217, "learning_rate": 0.00019719523529111347, "loss": 0.2838, "step": 378 }, { "epoch": 0.07670512042096742, "grad_norm": 0.4845387637615204, "learning_rate": 0.00019718024735170122, "loss": 0.2871, "step": 379 }, { "epoch": 0.07690750860149767, "grad_norm": 0.43115630745887756, "learning_rate": 0.0001971652200457323, "loss": 0.2738, "step": 380 }, { "epoch": 0.07710989678202793, "grad_norm": 0.3670971393585205, "learning_rate": 0.00019715015337929418, "loss": 0.201, "step": 381 }, { "epoch": 0.07731228496255818, "grad_norm": 0.556155264377594, "learning_rate": 0.00019713504735849018, "loss": 0.2442, "step": 382 }, { "epoch": 0.07751467314308845, "grad_norm": 0.7867807149887085, "learning_rate": 0.0001971199019894395, "loss": 0.349, "step": 383 }, { "epoch": 0.0777170613236187, "grad_norm": 0.4595763683319092, "learning_rate": 0.00019710471727827738, "loss": 0.2517, "step": 384 }, { "epoch": 0.07791944950414896, "grad_norm": 0.48216989636421204, "learning_rate": 0.000197089493231155, "loss": 0.2999, "step": 385 }, { "epoch": 0.07812183768467922, "grad_norm": 0.4103691279888153, "learning_rate": 0.00019707422985423945, "loss": 0.2945, "step": 386 }, { "epoch": 0.07832422586520947, "grad_norm": 0.6067521572113037, "learning_rate": 0.00019705892715371362, "loss": 0.3065, "step": 387 }, { "epoch": 0.07852661404573973, "grad_norm": 0.47684332728385925, "learning_rate": 0.0001970435851357766, "loss": 0.2862, "step": 388 }, { "epoch": 0.07872900222626998, "grad_norm": 0.4243808388710022, "learning_rate": 0.00019702820380664316, "loss": 0.2764, "step": 389 }, { "epoch": 0.07893139040680024, "grad_norm": 0.31469157338142395, "learning_rate": 0.00019701278317254417, "loss": 0.287, "step": 390 }, { "epoch": 0.0791337785873305, "grad_norm": 0.3231123983860016, "learning_rate": 0.00019699732323972627, "loss": 0.2595, "step": 391 }, { "epoch": 0.07933616676786076, "grad_norm": 0.39723944664001465, "learning_rate": 0.00019698182401445212, "loss": 0.2392, "step": 392 }, { "epoch": 0.07953855494839102, "grad_norm": 0.3718653917312622, "learning_rate": 0.00019696628550300028, "loss": 0.2894, "step": 393 }, { "epoch": 0.07974094312892127, "grad_norm": 0.41565394401550293, "learning_rate": 0.00019695070771166522, "loss": 0.2823, "step": 394 }, { "epoch": 0.07994333130945153, "grad_norm": 0.4772963225841522, "learning_rate": 0.0001969350906467573, "loss": 0.2777, "step": 395 }, { "epoch": 0.08014571948998178, "grad_norm": 0.523183286190033, "learning_rate": 0.00019691943431460283, "loss": 0.317, "step": 396 }, { "epoch": 0.08034810767051204, "grad_norm": 0.5802732706069946, "learning_rate": 0.00019690373872154396, "loss": 0.273, "step": 397 }, { "epoch": 0.0805504958510423, "grad_norm": 0.5425595641136169, "learning_rate": 0.0001968880038739388, "loss": 0.3069, "step": 398 }, { "epoch": 0.08075288403157256, "grad_norm": 0.645439624786377, "learning_rate": 0.00019687222977816143, "loss": 0.3343, "step": 399 }, { "epoch": 0.08095527221210282, "grad_norm": 0.4414001703262329, "learning_rate": 0.00019685641644060165, "loss": 0.2902, "step": 400 }, { "epoch": 0.08095527221210282, "eval_loss": 0.31833919882774353, "eval_runtime": 1.3164, "eval_samples_per_second": 3.798, "eval_steps_per_second": 0.76, "step": 400 }, { "epoch": 0.08115766039263307, "grad_norm": 0.4709251821041107, "learning_rate": 0.00019684056386766528, "loss": 0.2659, "step": 401 }, { "epoch": 0.08136004857316333, "grad_norm": 0.3637852668762207, "learning_rate": 0.000196824672065774, "loss": 0.2652, "step": 402 }, { "epoch": 0.08156243675369358, "grad_norm": 0.4023182690143585, "learning_rate": 0.00019680874104136545, "loss": 0.2593, "step": 403 }, { "epoch": 0.08176482493422384, "grad_norm": 0.5218076705932617, "learning_rate": 0.00019679277080089297, "loss": 0.2657, "step": 404 }, { "epoch": 0.08196721311475409, "grad_norm": 0.3712092936038971, "learning_rate": 0.00019677676135082607, "loss": 0.2651, "step": 405 }, { "epoch": 0.08216960129528436, "grad_norm": 0.38520634174346924, "learning_rate": 0.00019676071269764987, "loss": 0.2255, "step": 406 }, { "epoch": 0.08237198947581462, "grad_norm": 0.34115347266197205, "learning_rate": 0.00019674462484786553, "loss": 0.2668, "step": 407 }, { "epoch": 0.08257437765634487, "grad_norm": 0.4922421872615814, "learning_rate": 0.00019672849780799005, "loss": 0.2798, "step": 408 }, { "epoch": 0.08277676583687513, "grad_norm": 0.4661756753921509, "learning_rate": 0.00019671233158455632, "loss": 0.2623, "step": 409 }, { "epoch": 0.08297915401740538, "grad_norm": 0.3546546697616577, "learning_rate": 0.00019669612618411302, "loss": 0.2872, "step": 410 }, { "epoch": 0.08318154219793564, "grad_norm": 0.45860913395881653, "learning_rate": 0.00019667988161322478, "loss": 0.2601, "step": 411 }, { "epoch": 0.08338393037846589, "grad_norm": 0.3420308530330658, "learning_rate": 0.00019666359787847215, "loss": 0.2599, "step": 412 }, { "epoch": 0.08358631855899615, "grad_norm": 0.319474458694458, "learning_rate": 0.00019664727498645143, "loss": 0.2211, "step": 413 }, { "epoch": 0.08378870673952642, "grad_norm": 0.3286161720752716, "learning_rate": 0.00019663091294377482, "loss": 0.2685, "step": 414 }, { "epoch": 0.08399109492005667, "grad_norm": 0.5573515295982361, "learning_rate": 0.0001966145117570704, "loss": 0.2707, "step": 415 }, { "epoch": 0.08419348310058693, "grad_norm": 0.4362538158893585, "learning_rate": 0.00019659807143298206, "loss": 0.2345, "step": 416 }, { "epoch": 0.08439587128111718, "grad_norm": 0.5010119080543518, "learning_rate": 0.00019658159197816964, "loss": 0.3341, "step": 417 }, { "epoch": 0.08459825946164744, "grad_norm": 0.4639767110347748, "learning_rate": 0.00019656507339930874, "loss": 0.2443, "step": 418 }, { "epoch": 0.08480064764217769, "grad_norm": 0.3297119140625, "learning_rate": 0.00019654851570309086, "loss": 0.2478, "step": 419 }, { "epoch": 0.08500303582270795, "grad_norm": 0.5071136951446533, "learning_rate": 0.00019653191889622328, "loss": 0.2541, "step": 420 }, { "epoch": 0.08520542400323822, "grad_norm": 0.31538739800453186, "learning_rate": 0.0001965152829854292, "loss": 0.2428, "step": 421 }, { "epoch": 0.08540781218376847, "grad_norm": 0.4691992402076721, "learning_rate": 0.00019649860797744758, "loss": 0.2861, "step": 422 }, { "epoch": 0.08561020036429873, "grad_norm": 0.34454119205474854, "learning_rate": 0.00019648189387903332, "loss": 0.2465, "step": 423 }, { "epoch": 0.08581258854482898, "grad_norm": 0.36330515146255493, "learning_rate": 0.00019646514069695712, "loss": 0.2621, "step": 424 }, { "epoch": 0.08601497672535924, "grad_norm": 0.32474184036254883, "learning_rate": 0.00019644834843800545, "loss": 0.277, "step": 425 }, { "epoch": 0.0862173649058895, "grad_norm": 0.45646318793296814, "learning_rate": 0.00019643151710898064, "loss": 0.2763, "step": 426 }, { "epoch": 0.08641975308641975, "grad_norm": 0.3294435441493988, "learning_rate": 0.00019641464671670092, "loss": 0.2518, "step": 427 }, { "epoch": 0.08662214126695, "grad_norm": 0.4083794355392456, "learning_rate": 0.00019639773726800024, "loss": 0.2662, "step": 428 }, { "epoch": 0.08682452944748027, "grad_norm": 0.3710835576057434, "learning_rate": 0.00019638078876972842, "loss": 0.2752, "step": 429 }, { "epoch": 0.08702691762801053, "grad_norm": 0.5596700310707092, "learning_rate": 0.00019636380122875111, "loss": 0.2607, "step": 430 }, { "epoch": 0.08722930580854078, "grad_norm": 0.4794429540634155, "learning_rate": 0.00019634677465194976, "loss": 0.2603, "step": 431 }, { "epoch": 0.08743169398907104, "grad_norm": 0.40568777918815613, "learning_rate": 0.00019632970904622158, "loss": 0.2724, "step": 432 }, { "epoch": 0.0876340821696013, "grad_norm": 1.0941001176834106, "learning_rate": 0.00019631260441847973, "loss": 0.2656, "step": 433 }, { "epoch": 0.08783647035013155, "grad_norm": 0.35679563879966736, "learning_rate": 0.00019629546077565302, "loss": 0.295, "step": 434 }, { "epoch": 0.0880388585306618, "grad_norm": 0.3402407169342041, "learning_rate": 0.00019627827812468618, "loss": 0.269, "step": 435 }, { "epoch": 0.08824124671119207, "grad_norm": 0.3904549181461334, "learning_rate": 0.00019626105647253965, "loss": 0.2392, "step": 436 }, { "epoch": 0.08844363489172233, "grad_norm": 0.33952316641807556, "learning_rate": 0.00019624379582618975, "loss": 0.2213, "step": 437 }, { "epoch": 0.08864602307225258, "grad_norm": 0.45735833048820496, "learning_rate": 0.00019622649619262857, "loss": 0.257, "step": 438 }, { "epoch": 0.08884841125278284, "grad_norm": 0.5346881151199341, "learning_rate": 0.00019620915757886394, "loss": 0.2786, "step": 439 }, { "epoch": 0.0890507994333131, "grad_norm": 0.33781683444976807, "learning_rate": 0.00019619177999191954, "loss": 0.2764, "step": 440 }, { "epoch": 0.08925318761384335, "grad_norm": 0.383535772562027, "learning_rate": 0.00019617436343883488, "loss": 0.2595, "step": 441 }, { "epoch": 0.0894555757943736, "grad_norm": 0.525073766708374, "learning_rate": 0.00019615690792666515, "loss": 0.2826, "step": 442 }, { "epoch": 0.08965796397490386, "grad_norm": 0.5851255655288696, "learning_rate": 0.00019613941346248136, "loss": 0.2664, "step": 443 }, { "epoch": 0.08986035215543413, "grad_norm": 0.5176846981048584, "learning_rate": 0.00019612188005337032, "loss": 0.2741, "step": 444 }, { "epoch": 0.09006274033596438, "grad_norm": 0.48070332407951355, "learning_rate": 0.0001961043077064346, "loss": 0.2554, "step": 445 }, { "epoch": 0.09026512851649464, "grad_norm": 0.5157674551010132, "learning_rate": 0.00019608669642879258, "loss": 0.2691, "step": 446 }, { "epoch": 0.0904675166970249, "grad_norm": 0.42131873965263367, "learning_rate": 0.00019606904622757835, "loss": 0.2881, "step": 447 }, { "epoch": 0.09066990487755515, "grad_norm": 0.38773128390312195, "learning_rate": 0.00019605135710994178, "loss": 0.2971, "step": 448 }, { "epoch": 0.0908722930580854, "grad_norm": 0.42365771532058716, "learning_rate": 0.00019603362908304854, "loss": 0.2545, "step": 449 }, { "epoch": 0.09107468123861566, "grad_norm": 0.3761996924877167, "learning_rate": 0.00019601586215408002, "loss": 0.2801, "step": 450 }, { "epoch": 0.09107468123861566, "eval_loss": 0.31783393025398254, "eval_runtime": 1.3162, "eval_samples_per_second": 3.799, "eval_steps_per_second": 0.76, "step": 450 }, { "epoch": 0.09127706941914593, "grad_norm": 0.41931861639022827, "learning_rate": 0.00019599805633023344, "loss": 0.2677, "step": 451 }, { "epoch": 0.09147945759967618, "grad_norm": 0.47299373149871826, "learning_rate": 0.00019598021161872168, "loss": 0.2887, "step": 452 }, { "epoch": 0.09168184578020644, "grad_norm": 0.5986862182617188, "learning_rate": 0.00019596232802677346, "loss": 0.2361, "step": 453 }, { "epoch": 0.0918842339607367, "grad_norm": 0.532763659954071, "learning_rate": 0.00019594440556163317, "loss": 0.2476, "step": 454 }, { "epoch": 0.09208662214126695, "grad_norm": 0.4764944911003113, "learning_rate": 0.00019592644423056097, "loss": 0.2659, "step": 455 }, { "epoch": 0.0922890103217972, "grad_norm": 0.30676013231277466, "learning_rate": 0.00019590844404083284, "loss": 0.2274, "step": 456 }, { "epoch": 0.09249139850232746, "grad_norm": 0.38060346245765686, "learning_rate": 0.00019589040499974042, "loss": 0.2659, "step": 457 }, { "epoch": 0.09269378668285772, "grad_norm": 0.3842496871948242, "learning_rate": 0.0001958723271145911, "loss": 0.2528, "step": 458 }, { "epoch": 0.09289617486338798, "grad_norm": 0.3631954789161682, "learning_rate": 0.000195854210392708, "loss": 0.2852, "step": 459 }, { "epoch": 0.09309856304391824, "grad_norm": 0.4610200822353363, "learning_rate": 0.00019583605484143002, "loss": 0.2136, "step": 460 }, { "epoch": 0.0933009512244485, "grad_norm": 0.45114290714263916, "learning_rate": 0.00019581786046811173, "loss": 0.3453, "step": 461 }, { "epoch": 0.09350333940497875, "grad_norm": 0.47917625308036804, "learning_rate": 0.00019579962728012351, "loss": 0.2575, "step": 462 }, { "epoch": 0.093705727585509, "grad_norm": 0.5544509887695312, "learning_rate": 0.0001957813552848513, "loss": 0.2788, "step": 463 }, { "epoch": 0.09390811576603926, "grad_norm": 0.3755500316619873, "learning_rate": 0.00019576304448969698, "loss": 0.2687, "step": 464 }, { "epoch": 0.09411050394656952, "grad_norm": 0.2686128616333008, "learning_rate": 0.00019574469490207797, "loss": 0.2263, "step": 465 }, { "epoch": 0.09431289212709977, "grad_norm": 0.45375293493270874, "learning_rate": 0.00019572630652942748, "loss": 0.3131, "step": 466 }, { "epoch": 0.09451528030763004, "grad_norm": 0.7846113443374634, "learning_rate": 0.0001957078793791944, "loss": 0.2867, "step": 467 }, { "epoch": 0.0947176684881603, "grad_norm": 0.5866795778274536, "learning_rate": 0.0001956894134588434, "loss": 0.2642, "step": 468 }, { "epoch": 0.09492005666869055, "grad_norm": 0.5000519156455994, "learning_rate": 0.00019567090877585476, "loss": 0.2704, "step": 469 }, { "epoch": 0.0951224448492208, "grad_norm": 0.3544883131980896, "learning_rate": 0.0001956523653377245, "loss": 0.2596, "step": 470 }, { "epoch": 0.09532483302975106, "grad_norm": 0.39905259013175964, "learning_rate": 0.0001956337831519644, "loss": 0.2126, "step": 471 }, { "epoch": 0.09552722121028132, "grad_norm": 0.3646845519542694, "learning_rate": 0.00019561516222610186, "loss": 0.264, "step": 472 }, { "epoch": 0.09572960939081157, "grad_norm": 0.40496543049812317, "learning_rate": 0.00019559650256767994, "loss": 0.2616, "step": 473 }, { "epoch": 0.09593199757134184, "grad_norm": 0.36601021885871887, "learning_rate": 0.00019557780418425752, "loss": 0.2807, "step": 474 }, { "epoch": 0.0961343857518721, "grad_norm": 0.3899361193180084, "learning_rate": 0.00019555906708340902, "loss": 0.2679, "step": 475 }, { "epoch": 0.09633677393240235, "grad_norm": 0.4096783399581909, "learning_rate": 0.00019554029127272465, "loss": 0.3052, "step": 476 }, { "epoch": 0.0965391621129326, "grad_norm": 0.4588966369628906, "learning_rate": 0.00019552147675981033, "loss": 0.3057, "step": 477 }, { "epoch": 0.09674155029346286, "grad_norm": 0.564294695854187, "learning_rate": 0.00019550262355228748, "loss": 0.3157, "step": 478 }, { "epoch": 0.09694393847399312, "grad_norm": 0.39877036213874817, "learning_rate": 0.00019548373165779336, "loss": 0.3048, "step": 479 }, { "epoch": 0.09714632665452337, "grad_norm": 0.46616441011428833, "learning_rate": 0.0001954648010839809, "loss": 0.2435, "step": 480 }, { "epoch": 0.09734871483505363, "grad_norm": 0.3962508738040924, "learning_rate": 0.00019544583183851857, "loss": 0.2978, "step": 481 }, { "epoch": 0.0975511030155839, "grad_norm": 0.47796082496643066, "learning_rate": 0.00019542682392909063, "loss": 0.3236, "step": 482 }, { "epoch": 0.09775349119611415, "grad_norm": 0.4678974449634552, "learning_rate": 0.00019540777736339691, "loss": 0.2821, "step": 483 }, { "epoch": 0.0979558793766444, "grad_norm": 0.6112771034240723, "learning_rate": 0.000195388692149153, "loss": 0.2528, "step": 484 }, { "epoch": 0.09815826755717466, "grad_norm": 0.5008243322372437, "learning_rate": 0.00019536956829409007, "loss": 0.241, "step": 485 }, { "epoch": 0.09836065573770492, "grad_norm": 0.3432110846042633, "learning_rate": 0.00019535040580595498, "loss": 0.2794, "step": 486 }, { "epoch": 0.09856304391823517, "grad_norm": 0.345851868391037, "learning_rate": 0.0001953312046925102, "loss": 0.2466, "step": 487 }, { "epoch": 0.09876543209876543, "grad_norm": 0.5199732780456543, "learning_rate": 0.0001953119649615339, "loss": 0.2889, "step": 488 }, { "epoch": 0.0989678202792957, "grad_norm": 0.2812657952308655, "learning_rate": 0.00019529268662081984, "loss": 0.2636, "step": 489 }, { "epoch": 0.09917020845982595, "grad_norm": 0.3841879963874817, "learning_rate": 0.00019527336967817743, "loss": 0.3325, "step": 490 }, { "epoch": 0.0993725966403562, "grad_norm": 0.35691508650779724, "learning_rate": 0.0001952540141414318, "loss": 0.2585, "step": 491 }, { "epoch": 0.09957498482088646, "grad_norm": 0.43971186876296997, "learning_rate": 0.0001952346200184236, "loss": 0.2513, "step": 492 }, { "epoch": 0.09977737300141672, "grad_norm": 0.34127867221832275, "learning_rate": 0.00019521518731700912, "loss": 0.277, "step": 493 }, { "epoch": 0.09997976118194697, "grad_norm": 0.596693217754364, "learning_rate": 0.00019519571604506038, "loss": 0.2852, "step": 494 }, { "epoch": 0.10018214936247723, "grad_norm": 0.3105694055557251, "learning_rate": 0.0001951762062104649, "loss": 0.2484, "step": 495 }, { "epoch": 0.10038453754300748, "grad_norm": 0.32709312438964844, "learning_rate": 0.00019515665782112598, "loss": 0.2622, "step": 496 }, { "epoch": 0.10058692572353775, "grad_norm": 0.35804563760757446, "learning_rate": 0.00019513707088496233, "loss": 0.239, "step": 497 }, { "epoch": 0.100789313904068, "grad_norm": 0.48923608660697937, "learning_rate": 0.00019511744540990847, "loss": 0.3016, "step": 498 }, { "epoch": 0.10099170208459826, "grad_norm": 0.3978324234485626, "learning_rate": 0.00019509778140391437, "loss": 0.2705, "step": 499 }, { "epoch": 0.10119409026512852, "grad_norm": 0.3775447607040405, "learning_rate": 0.00019507807887494572, "loss": 0.275, "step": 500 }, { "epoch": 0.10119409026512852, "eval_loss": 0.2983620762825012, "eval_runtime": 1.3199, "eval_samples_per_second": 3.788, "eval_steps_per_second": 0.758, "step": 500 }, { "epoch": 0.10139647844565877, "grad_norm": 0.36164119839668274, "learning_rate": 0.00019505833783098377, "loss": 0.2606, "step": 501 }, { "epoch": 0.10159886662618903, "grad_norm": 0.37158089876174927, "learning_rate": 0.00019503855828002542, "loss": 0.2078, "step": 502 }, { "epoch": 0.10180125480671928, "grad_norm": 0.31535255908966064, "learning_rate": 0.00019501874023008308, "loss": 0.2363, "step": 503 }, { "epoch": 0.10200364298724955, "grad_norm": 0.3091668486595154, "learning_rate": 0.0001949988836891848, "loss": 0.2262, "step": 504 }, { "epoch": 0.1022060311677798, "grad_norm": 0.4352635145187378, "learning_rate": 0.00019497898866537426, "loss": 0.2504, "step": 505 }, { "epoch": 0.10240841934831006, "grad_norm": 0.553655207157135, "learning_rate": 0.0001949590551667107, "loss": 0.3011, "step": 506 }, { "epoch": 0.10261080752884032, "grad_norm": 0.34165793657302856, "learning_rate": 0.0001949390832012689, "loss": 0.2539, "step": 507 }, { "epoch": 0.10281319570937057, "grad_norm": 0.34486615657806396, "learning_rate": 0.00019491907277713934, "loss": 0.2556, "step": 508 }, { "epoch": 0.10301558388990083, "grad_norm": 0.35744449496269226, "learning_rate": 0.00019489902390242795, "loss": 0.2398, "step": 509 }, { "epoch": 0.10321797207043108, "grad_norm": 0.418155312538147, "learning_rate": 0.00019487893658525626, "loss": 0.303, "step": 510 }, { "epoch": 0.10342036025096134, "grad_norm": 0.3624366223812103, "learning_rate": 0.00019485881083376147, "loss": 0.2557, "step": 511 }, { "epoch": 0.10362274843149161, "grad_norm": 0.3640274405479431, "learning_rate": 0.00019483864665609623, "loss": 0.313, "step": 512 }, { "epoch": 0.10382513661202186, "grad_norm": 0.31960329413414, "learning_rate": 0.00019481844406042884, "loss": 0.259, "step": 513 }, { "epoch": 0.10402752479255212, "grad_norm": 0.31938621401786804, "learning_rate": 0.00019479820305494313, "loss": 0.2361, "step": 514 }, { "epoch": 0.10422991297308237, "grad_norm": 0.3560287654399872, "learning_rate": 0.00019477792364783843, "loss": 0.2414, "step": 515 }, { "epoch": 0.10443230115361263, "grad_norm": 0.4755111336708069, "learning_rate": 0.00019475760584732976, "loss": 0.2405, "step": 516 }, { "epoch": 0.10463468933414288, "grad_norm": 0.3549703359603882, "learning_rate": 0.00019473724966164759, "loss": 0.2572, "step": 517 }, { "epoch": 0.10483707751467314, "grad_norm": 0.34324032068252563, "learning_rate": 0.00019471685509903796, "loss": 0.2619, "step": 518 }, { "epoch": 0.1050394656952034, "grad_norm": 0.3671707212924957, "learning_rate": 0.00019469642216776248, "loss": 0.2641, "step": 519 }, { "epoch": 0.10524185387573366, "grad_norm": 0.44061991572380066, "learning_rate": 0.0001946759508760983, "loss": 0.2578, "step": 520 }, { "epoch": 0.10544424205626392, "grad_norm": 0.35979196429252625, "learning_rate": 0.00019465544123233803, "loss": 0.2298, "step": 521 }, { "epoch": 0.10564663023679417, "grad_norm": 0.43028783798217773, "learning_rate": 0.00019463489324478994, "loss": 0.2567, "step": 522 }, { "epoch": 0.10584901841732443, "grad_norm": 0.35531121492385864, "learning_rate": 0.00019461430692177775, "loss": 0.2647, "step": 523 }, { "epoch": 0.10605140659785468, "grad_norm": 0.49430280923843384, "learning_rate": 0.00019459368227164075, "loss": 0.215, "step": 524 }, { "epoch": 0.10625379477838494, "grad_norm": 0.34892547130584717, "learning_rate": 0.00019457301930273375, "loss": 0.2543, "step": 525 }, { "epoch": 0.1064561829589152, "grad_norm": 0.31647083163261414, "learning_rate": 0.00019455231802342703, "loss": 0.2228, "step": 526 }, { "epoch": 0.10665857113944546, "grad_norm": 0.3485078811645508, "learning_rate": 0.00019453157844210645, "loss": 0.2564, "step": 527 }, { "epoch": 0.10686095931997572, "grad_norm": 0.4160056710243225, "learning_rate": 0.00019451080056717338, "loss": 0.2863, "step": 528 }, { "epoch": 0.10706334750050597, "grad_norm": 0.4739069640636444, "learning_rate": 0.00019448998440704468, "loss": 0.2743, "step": 529 }, { "epoch": 0.10726573568103623, "grad_norm": 0.3276217579841614, "learning_rate": 0.0001944691299701527, "loss": 0.2196, "step": 530 }, { "epoch": 0.10746812386156648, "grad_norm": 0.4787321388721466, "learning_rate": 0.00019444823726494538, "loss": 0.257, "step": 531 }, { "epoch": 0.10767051204209674, "grad_norm": 0.3252486288547516, "learning_rate": 0.0001944273062998861, "loss": 0.2183, "step": 532 }, { "epoch": 0.107872900222627, "grad_norm": 0.5177460312843323, "learning_rate": 0.00019440633708345365, "loss": 0.3001, "step": 533 }, { "epoch": 0.10807528840315725, "grad_norm": 0.2944439649581909, "learning_rate": 0.0001943853296241425, "loss": 0.2236, "step": 534 }, { "epoch": 0.10827767658368752, "grad_norm": 0.33791181445121765, "learning_rate": 0.0001943642839304625, "loss": 0.2712, "step": 535 }, { "epoch": 0.10848006476421777, "grad_norm": 0.4525901973247528, "learning_rate": 0.00019434320001093905, "loss": 0.2528, "step": 536 }, { "epoch": 0.10868245294474803, "grad_norm": 0.40618807077407837, "learning_rate": 0.00019432207787411292, "loss": 0.2332, "step": 537 }, { "epoch": 0.10888484112527828, "grad_norm": 0.3275223970413208, "learning_rate": 0.00019430091752854046, "loss": 0.2442, "step": 538 }, { "epoch": 0.10908722930580854, "grad_norm": 0.4409460425376892, "learning_rate": 0.00019427971898279347, "loss": 0.2524, "step": 539 }, { "epoch": 0.1092896174863388, "grad_norm": 0.3441372215747833, "learning_rate": 0.00019425848224545927, "loss": 0.2523, "step": 540 }, { "epoch": 0.10949200566686905, "grad_norm": 0.3794746398925781, "learning_rate": 0.00019423720732514053, "loss": 0.29, "step": 541 }, { "epoch": 0.10969439384739932, "grad_norm": 0.2813130021095276, "learning_rate": 0.00019421589423045555, "loss": 0.2422, "step": 542 }, { "epoch": 0.10989678202792957, "grad_norm": 0.3874035179615021, "learning_rate": 0.00019419454297003793, "loss": 0.2647, "step": 543 }, { "epoch": 0.11009917020845983, "grad_norm": 0.39327943325042725, "learning_rate": 0.00019417315355253688, "loss": 0.2793, "step": 544 }, { "epoch": 0.11030155838899008, "grad_norm": 0.37798020243644714, "learning_rate": 0.00019415172598661698, "loss": 0.2908, "step": 545 }, { "epoch": 0.11050394656952034, "grad_norm": 0.5571022033691406, "learning_rate": 0.00019413026028095823, "loss": 0.2744, "step": 546 }, { "epoch": 0.1107063347500506, "grad_norm": 0.3673059046268463, "learning_rate": 0.0001941087564442562, "loss": 0.234, "step": 547 }, { "epoch": 0.11090872293058085, "grad_norm": 0.2958654463291168, "learning_rate": 0.0001940872144852218, "loss": 0.2019, "step": 548 }, { "epoch": 0.1111111111111111, "grad_norm": 0.4359016716480255, "learning_rate": 0.00019406563441258143, "loss": 0.2482, "step": 549 }, { "epoch": 0.11131349929164137, "grad_norm": 0.3692166805267334, "learning_rate": 0.00019404401623507694, "loss": 0.2765, "step": 550 }, { "epoch": 0.11131349929164137, "eval_loss": 0.30828970670700073, "eval_runtime": 1.3171, "eval_samples_per_second": 3.796, "eval_steps_per_second": 0.759, "step": 550 }, { "epoch": 0.11151588747217163, "grad_norm": 0.8261103630065918, "learning_rate": 0.00019402235996146555, "loss": 0.2185, "step": 551 }, { "epoch": 0.11171827565270188, "grad_norm": 0.4357093572616577, "learning_rate": 0.00019400066560051996, "loss": 0.2512, "step": 552 }, { "epoch": 0.11192066383323214, "grad_norm": 0.33879590034484863, "learning_rate": 0.00019397893316102833, "loss": 0.243, "step": 553 }, { "epoch": 0.1121230520137624, "grad_norm": 0.4153684079647064, "learning_rate": 0.0001939571626517942, "loss": 0.2813, "step": 554 }, { "epoch": 0.11232544019429265, "grad_norm": 0.38233983516693115, "learning_rate": 0.00019393535408163655, "loss": 0.2565, "step": 555 }, { "epoch": 0.1125278283748229, "grad_norm": 0.4811713397502899, "learning_rate": 0.00019391350745938976, "loss": 0.2712, "step": 556 }, { "epoch": 0.11273021655535317, "grad_norm": 0.3775161802768707, "learning_rate": 0.00019389162279390362, "loss": 0.2539, "step": 557 }, { "epoch": 0.11293260473588343, "grad_norm": 0.31863147020339966, "learning_rate": 0.00019386970009404336, "loss": 0.2194, "step": 558 }, { "epoch": 0.11313499291641368, "grad_norm": 0.4357392489910126, "learning_rate": 0.00019384773936868962, "loss": 0.2306, "step": 559 }, { "epoch": 0.11333738109694394, "grad_norm": 0.40994343161582947, "learning_rate": 0.00019382574062673844, "loss": 0.3032, "step": 560 }, { "epoch": 0.1135397692774742, "grad_norm": 0.3298352062702179, "learning_rate": 0.00019380370387710118, "loss": 0.2774, "step": 561 }, { "epoch": 0.11374215745800445, "grad_norm": 0.27224934101104736, "learning_rate": 0.00019378162912870472, "loss": 0.2582, "step": 562 }, { "epoch": 0.1139445456385347, "grad_norm": 0.3737615942955017, "learning_rate": 0.0001937595163904913, "loss": 0.2871, "step": 563 }, { "epoch": 0.11414693381906496, "grad_norm": 0.48551228642463684, "learning_rate": 0.00019373736567141844, "loss": 0.2759, "step": 564 }, { "epoch": 0.11434932199959523, "grad_norm": 0.29210370779037476, "learning_rate": 0.0001937151769804592, "loss": 0.2389, "step": 565 }, { "epoch": 0.11455171018012549, "grad_norm": 0.23588493466377258, "learning_rate": 0.00019369295032660192, "loss": 0.205, "step": 566 }, { "epoch": 0.11475409836065574, "grad_norm": 0.4304031729698181, "learning_rate": 0.0001936706857188504, "loss": 0.2421, "step": 567 }, { "epoch": 0.114956486541186, "grad_norm": 0.3817498981952667, "learning_rate": 0.0001936483831662237, "loss": 0.2402, "step": 568 }, { "epoch": 0.11515887472171625, "grad_norm": 0.3602907359600067, "learning_rate": 0.00019362604267775634, "loss": 0.2593, "step": 569 }, { "epoch": 0.1153612629022465, "grad_norm": 0.3670409023761749, "learning_rate": 0.00019360366426249823, "loss": 0.2866, "step": 570 }, { "epoch": 0.11556365108277676, "grad_norm": 0.3887392580509186, "learning_rate": 0.00019358124792951453, "loss": 0.232, "step": 571 }, { "epoch": 0.11576603926330702, "grad_norm": 0.43573734164237976, "learning_rate": 0.00019355879368788587, "loss": 0.2382, "step": 572 }, { "epoch": 0.11596842744383729, "grad_norm": 0.3740786015987396, "learning_rate": 0.00019353630154670822, "loss": 0.2882, "step": 573 }, { "epoch": 0.11617081562436754, "grad_norm": 0.4014526307582855, "learning_rate": 0.0001935137715150928, "loss": 0.2531, "step": 574 }, { "epoch": 0.1163732038048978, "grad_norm": 0.3006138503551483, "learning_rate": 0.0001934912036021663, "loss": 0.2616, "step": 575 }, { "epoch": 0.11657559198542805, "grad_norm": 0.3588373363018036, "learning_rate": 0.00019346859781707072, "loss": 0.2491, "step": 576 }, { "epoch": 0.1167779801659583, "grad_norm": 0.34524407982826233, "learning_rate": 0.0001934459541689634, "loss": 0.2642, "step": 577 }, { "epoch": 0.11698036834648856, "grad_norm": 0.4407418966293335, "learning_rate": 0.000193423272667017, "loss": 0.2627, "step": 578 }, { "epoch": 0.11718275652701882, "grad_norm": 0.3426596522331238, "learning_rate": 0.0001934005533204195, "loss": 0.2335, "step": 579 }, { "epoch": 0.11738514470754909, "grad_norm": 0.3014998137950897, "learning_rate": 0.00019337779613837428, "loss": 0.2504, "step": 580 }, { "epoch": 0.11758753288807934, "grad_norm": 0.2892261743545532, "learning_rate": 0.00019335500113009995, "loss": 0.2396, "step": 581 }, { "epoch": 0.1177899210686096, "grad_norm": 0.3431570529937744, "learning_rate": 0.00019333216830483057, "loss": 0.2845, "step": 582 }, { "epoch": 0.11799230924913985, "grad_norm": 0.3361996114253998, "learning_rate": 0.00019330929767181535, "loss": 0.2599, "step": 583 }, { "epoch": 0.1181946974296701, "grad_norm": 0.2910197079181671, "learning_rate": 0.000193286389240319, "loss": 0.2397, "step": 584 }, { "epoch": 0.11839708561020036, "grad_norm": 0.3843308985233307, "learning_rate": 0.0001932634430196214, "loss": 0.2703, "step": 585 }, { "epoch": 0.11859947379073062, "grad_norm": 0.4046321213245392, "learning_rate": 0.0001932404590190178, "loss": 0.2509, "step": 586 }, { "epoch": 0.11880186197126087, "grad_norm": 0.34260809421539307, "learning_rate": 0.00019321743724781874, "loss": 0.2494, "step": 587 }, { "epoch": 0.11900425015179114, "grad_norm": 0.3353549540042877, "learning_rate": 0.00019319437771535007, "loss": 0.2573, "step": 588 }, { "epoch": 0.1192066383323214, "grad_norm": 0.41693922877311707, "learning_rate": 0.00019317128043095294, "loss": 0.2375, "step": 589 }, { "epoch": 0.11940902651285165, "grad_norm": 0.4338701367378235, "learning_rate": 0.00019314814540398375, "loss": 0.3063, "step": 590 }, { "epoch": 0.1196114146933819, "grad_norm": 0.3591492474079132, "learning_rate": 0.0001931249726438143, "loss": 0.2869, "step": 591 }, { "epoch": 0.11981380287391216, "grad_norm": 0.35468024015426636, "learning_rate": 0.0001931017621598315, "loss": 0.2972, "step": 592 }, { "epoch": 0.12001619105444242, "grad_norm": 0.3192785382270813, "learning_rate": 0.00019307851396143768, "loss": 0.2267, "step": 593 }, { "epoch": 0.12021857923497267, "grad_norm": 0.2555534839630127, "learning_rate": 0.00019305522805805042, "loss": 0.2395, "step": 594 }, { "epoch": 0.12042096741550294, "grad_norm": 0.3443443775177002, "learning_rate": 0.00019303190445910259, "loss": 0.2822, "step": 595 }, { "epoch": 0.1206233555960332, "grad_norm": 0.300624817609787, "learning_rate": 0.00019300854317404222, "loss": 0.2413, "step": 596 }, { "epoch": 0.12082574377656345, "grad_norm": 0.4178803861141205, "learning_rate": 0.00019298514421233276, "loss": 0.2695, "step": 597 }, { "epoch": 0.12102813195709371, "grad_norm": 0.3329545557498932, "learning_rate": 0.00019296170758345283, "loss": 0.243, "step": 598 }, { "epoch": 0.12123052013762396, "grad_norm": 0.2844451367855072, "learning_rate": 0.00019293823329689632, "loss": 0.2433, "step": 599 }, { "epoch": 0.12143290831815422, "grad_norm": 0.3378356695175171, "learning_rate": 0.00019291472136217234, "loss": 0.225, "step": 600 }, { "epoch": 0.12143290831815422, "eval_loss": 0.2922983467578888, "eval_runtime": 1.3195, "eval_samples_per_second": 3.789, "eval_steps_per_second": 0.758, "step": 600 }, { "epoch": 0.12163529649868447, "grad_norm": 0.5775878429412842, "learning_rate": 0.00019289117178880537, "loss": 0.3115, "step": 601 }, { "epoch": 0.12183768467921473, "grad_norm": 0.44308537244796753, "learning_rate": 0.00019286758458633504, "loss": 0.2846, "step": 602 }, { "epoch": 0.122040072859745, "grad_norm": 0.35728394985198975, "learning_rate": 0.0001928439597643162, "loss": 0.2521, "step": 603 }, { "epoch": 0.12224246104027525, "grad_norm": 0.30003228783607483, "learning_rate": 0.00019282029733231904, "loss": 0.2512, "step": 604 }, { "epoch": 0.12244484922080551, "grad_norm": 0.3294042944908142, "learning_rate": 0.00019279659729992888, "loss": 0.2773, "step": 605 }, { "epoch": 0.12264723740133576, "grad_norm": 0.35016027092933655, "learning_rate": 0.00019277285967674634, "loss": 0.247, "step": 606 }, { "epoch": 0.12284962558186602, "grad_norm": 0.3923855125904083, "learning_rate": 0.00019274908447238726, "loss": 0.3014, "step": 607 }, { "epoch": 0.12305201376239627, "grad_norm": 0.3739917576313019, "learning_rate": 0.00019272527169648268, "loss": 0.2624, "step": 608 }, { "epoch": 0.12325440194292653, "grad_norm": 0.3266221582889557, "learning_rate": 0.0001927014213586789, "loss": 0.2391, "step": 609 }, { "epoch": 0.12345679012345678, "grad_norm": 0.34201592206954956, "learning_rate": 0.00019267753346863736, "loss": 0.2453, "step": 610 }, { "epoch": 0.12365917830398705, "grad_norm": 1.0092198848724365, "learning_rate": 0.00019265360803603473, "loss": 0.2855, "step": 611 }, { "epoch": 0.12386156648451731, "grad_norm": 0.3326072096824646, "learning_rate": 0.00019262964507056303, "loss": 0.2604, "step": 612 }, { "epoch": 0.12406395466504756, "grad_norm": 0.37959495186805725, "learning_rate": 0.00019260564458192927, "loss": 0.2593, "step": 613 }, { "epoch": 0.12426634284557782, "grad_norm": 0.4499305486679077, "learning_rate": 0.00019258160657985583, "loss": 0.2396, "step": 614 }, { "epoch": 0.12446873102610807, "grad_norm": 0.26959168910980225, "learning_rate": 0.00019255753107408013, "loss": 0.2266, "step": 615 }, { "epoch": 0.12467111920663833, "grad_norm": 0.35351112484931946, "learning_rate": 0.000192533418074355, "loss": 0.2734, "step": 616 }, { "epoch": 0.12487350738716858, "grad_norm": 0.4333021938800812, "learning_rate": 0.0001925092675904482, "loss": 0.24, "step": 617 }, { "epoch": 0.12507589556769885, "grad_norm": 0.4381181001663208, "learning_rate": 0.00019248507963214284, "loss": 0.2236, "step": 618 }, { "epoch": 0.1252782837482291, "grad_norm": 0.44631966948509216, "learning_rate": 0.0001924608542092372, "loss": 0.2646, "step": 619 }, { "epoch": 0.12548067192875936, "grad_norm": 0.36171311140060425, "learning_rate": 0.0001924365913315447, "loss": 0.2264, "step": 620 }, { "epoch": 0.12568306010928962, "grad_norm": 0.35419216752052307, "learning_rate": 0.00019241229100889394, "loss": 0.2365, "step": 621 }, { "epoch": 0.12588544828981987, "grad_norm": 0.45207804441452026, "learning_rate": 0.0001923879532511287, "loss": 0.3144, "step": 622 }, { "epoch": 0.12608783647035013, "grad_norm": 0.42940232157707214, "learning_rate": 0.00019236357806810787, "loss": 0.2606, "step": 623 }, { "epoch": 0.12629022465088038, "grad_norm": 0.31150850653648376, "learning_rate": 0.0001923391654697056, "loss": 0.2079, "step": 624 }, { "epoch": 0.12649261283141064, "grad_norm": 0.29540133476257324, "learning_rate": 0.0001923147154658111, "loss": 0.2394, "step": 625 }, { "epoch": 0.1266950010119409, "grad_norm": 0.3295978009700775, "learning_rate": 0.0001922902280663288, "loss": 0.265, "step": 626 }, { "epoch": 0.12689738919247115, "grad_norm": 0.38124343752861023, "learning_rate": 0.00019226570328117828, "loss": 0.2438, "step": 627 }, { "epoch": 0.1270997773730014, "grad_norm": 0.2848641574382782, "learning_rate": 0.00019224114112029417, "loss": 0.2412, "step": 628 }, { "epoch": 0.1273021655535317, "grad_norm": 0.39890938997268677, "learning_rate": 0.00019221654159362636, "loss": 0.2911, "step": 629 }, { "epoch": 0.12750455373406194, "grad_norm": 0.4254617393016815, "learning_rate": 0.0001921919047111398, "loss": 0.2832, "step": 630 }, { "epoch": 0.1277069419145922, "grad_norm": 0.42490291595458984, "learning_rate": 0.0001921672304828146, "loss": 0.2412, "step": 631 }, { "epoch": 0.12790933009512245, "grad_norm": 0.31237921118736267, "learning_rate": 0.00019214251891864597, "loss": 0.2522, "step": 632 }, { "epoch": 0.1281117182756527, "grad_norm": 0.35776185989379883, "learning_rate": 0.00019211777002864434, "loss": 0.2401, "step": 633 }, { "epoch": 0.12831410645618296, "grad_norm": 0.5008077025413513, "learning_rate": 0.00019209298382283512, "loss": 0.2629, "step": 634 }, { "epoch": 0.12851649463671322, "grad_norm": 0.37600621581077576, "learning_rate": 0.00019206816031125894, "loss": 0.2554, "step": 635 }, { "epoch": 0.12871888281724347, "grad_norm": 0.3486710786819458, "learning_rate": 0.00019204329950397151, "loss": 0.239, "step": 636 }, { "epoch": 0.12892127099777373, "grad_norm": 0.2802092432975769, "learning_rate": 0.0001920184014110436, "loss": 0.2339, "step": 637 }, { "epoch": 0.12912365917830398, "grad_norm": 0.3463604748249054, "learning_rate": 0.0001919934660425612, "loss": 0.2218, "step": 638 }, { "epoch": 0.12932604735883424, "grad_norm": 0.3298332095146179, "learning_rate": 0.0001919684934086253, "loss": 0.2466, "step": 639 }, { "epoch": 0.1295284355393645, "grad_norm": 0.33739981055259705, "learning_rate": 0.00019194348351935197, "loss": 0.2376, "step": 640 }, { "epoch": 0.12973082371989475, "grad_norm": 0.42447808384895325, "learning_rate": 0.0001919184363848725, "loss": 0.2802, "step": 641 }, { "epoch": 0.129933211900425, "grad_norm": 0.6273435950279236, "learning_rate": 0.00019189335201533312, "loss": 0.2582, "step": 642 }, { "epoch": 0.13013560008095526, "grad_norm": 0.3449552655220032, "learning_rate": 0.00019186823042089524, "loss": 0.2735, "step": 643 }, { "epoch": 0.13033798826148552, "grad_norm": 0.40999501943588257, "learning_rate": 0.0001918430716117353, "loss": 0.2311, "step": 644 }, { "epoch": 0.1305403764420158, "grad_norm": 0.3628951907157898, "learning_rate": 0.00019181787559804486, "loss": 0.2798, "step": 645 }, { "epoch": 0.13074276462254605, "grad_norm": 0.36851662397384644, "learning_rate": 0.0001917926423900305, "loss": 0.2335, "step": 646 }, { "epoch": 0.1309451528030763, "grad_norm": 0.46718528866767883, "learning_rate": 0.00019176737199791389, "loss": 0.2749, "step": 647 }, { "epoch": 0.13114754098360656, "grad_norm": 0.377085417509079, "learning_rate": 0.00019174206443193178, "loss": 0.2571, "step": 648 }, { "epoch": 0.13134992916413682, "grad_norm": 0.3306112587451935, "learning_rate": 0.0001917167197023359, "loss": 0.2574, "step": 649 }, { "epoch": 0.13155231734466707, "grad_norm": 0.3243536055088043, "learning_rate": 0.00019169133781939322, "loss": 0.2364, "step": 650 }, { "epoch": 0.13155231734466707, "eval_loss": 0.3029465973377228, "eval_runtime": 1.3223, "eval_samples_per_second": 3.781, "eval_steps_per_second": 0.756, "step": 650 }, { "epoch": 0.13175470552519733, "grad_norm": 0.3559458553791046, "learning_rate": 0.00019166591879338555, "loss": 0.2571, "step": 651 }, { "epoch": 0.13195709370572759, "grad_norm": 0.4021930992603302, "learning_rate": 0.00019164046263460982, "loss": 0.241, "step": 652 }, { "epoch": 0.13215948188625784, "grad_norm": 0.41974547505378723, "learning_rate": 0.00019161496935337808, "loss": 0.2645, "step": 653 }, { "epoch": 0.1323618700667881, "grad_norm": 0.3117694556713104, "learning_rate": 0.0001915894389600173, "loss": 0.224, "step": 654 }, { "epoch": 0.13256425824731835, "grad_norm": 0.333473801612854, "learning_rate": 0.00019156387146486957, "loss": 0.2349, "step": 655 }, { "epoch": 0.1327666464278486, "grad_norm": 0.3041480481624603, "learning_rate": 0.00019153826687829195, "loss": 0.2392, "step": 656 }, { "epoch": 0.13296903460837886, "grad_norm": 0.5042875409126282, "learning_rate": 0.0001915126252106566, "loss": 0.2712, "step": 657 }, { "epoch": 0.13317142278890912, "grad_norm": 0.3337761163711548, "learning_rate": 0.00019148694647235057, "loss": 0.2603, "step": 658 }, { "epoch": 0.13337381096943937, "grad_norm": 0.4858247637748718, "learning_rate": 0.00019146123067377606, "loss": 0.2233, "step": 659 }, { "epoch": 0.13357619914996965, "grad_norm": 0.3900105655193329, "learning_rate": 0.00019143547782535025, "loss": 0.2509, "step": 660 }, { "epoch": 0.1337785873304999, "grad_norm": 0.4188336431980133, "learning_rate": 0.0001914096879375053, "loss": 0.2604, "step": 661 }, { "epoch": 0.13398097551103016, "grad_norm": 0.4977852404117584, "learning_rate": 0.0001913838610206884, "loss": 0.2632, "step": 662 }, { "epoch": 0.13418336369156042, "grad_norm": 0.3657325506210327, "learning_rate": 0.00019135799708536167, "loss": 0.2945, "step": 663 }, { "epoch": 0.13438575187209068, "grad_norm": 0.36988165974617004, "learning_rate": 0.0001913320961420023, "loss": 0.2461, "step": 664 }, { "epoch": 0.13458814005262093, "grad_norm": 0.30742794275283813, "learning_rate": 0.0001913061582011025, "loss": 0.246, "step": 665 }, { "epoch": 0.13479052823315119, "grad_norm": 0.29840725660324097, "learning_rate": 0.00019128018327316934, "loss": 0.2437, "step": 666 }, { "epoch": 0.13499291641368144, "grad_norm": 0.37174108624458313, "learning_rate": 0.00019125417136872506, "loss": 0.2725, "step": 667 }, { "epoch": 0.1351953045942117, "grad_norm": 0.33392372727394104, "learning_rate": 0.00019122812249830666, "loss": 0.2631, "step": 668 }, { "epoch": 0.13539769277474195, "grad_norm": 0.3134192228317261, "learning_rate": 0.00019120203667246632, "loss": 0.2789, "step": 669 }, { "epoch": 0.1356000809552722, "grad_norm": 0.3190394341945648, "learning_rate": 0.00019117591390177102, "loss": 0.231, "step": 670 }, { "epoch": 0.13580246913580246, "grad_norm": 0.36286187171936035, "learning_rate": 0.00019114975419680282, "loss": 0.2732, "step": 671 }, { "epoch": 0.13600485731633272, "grad_norm": 0.3986504375934601, "learning_rate": 0.0001911235575681587, "loss": 0.267, "step": 672 }, { "epoch": 0.13620724549686297, "grad_norm": 0.31040704250335693, "learning_rate": 0.00019109732402645063, "loss": 0.2296, "step": 673 }, { "epoch": 0.13640963367739323, "grad_norm": 0.36109480261802673, "learning_rate": 0.00019107105358230544, "loss": 0.2395, "step": 674 }, { "epoch": 0.1366120218579235, "grad_norm": 0.38045042753219604, "learning_rate": 0.00019104474624636502, "loss": 0.2553, "step": 675 }, { "epoch": 0.13681441003845377, "grad_norm": 0.3555276095867157, "learning_rate": 0.00019101840202928613, "loss": 0.2866, "step": 676 }, { "epoch": 0.13701679821898402, "grad_norm": 0.3541220724582672, "learning_rate": 0.00019099202094174055, "loss": 0.2615, "step": 677 }, { "epoch": 0.13721918639951428, "grad_norm": 0.39211398363113403, "learning_rate": 0.00019096560299441487, "loss": 0.27, "step": 678 }, { "epoch": 0.13742157458004453, "grad_norm": 0.5050557255744934, "learning_rate": 0.00019093914819801074, "loss": 0.3031, "step": 679 }, { "epoch": 0.1376239627605748, "grad_norm": 0.31856054067611694, "learning_rate": 0.00019091265656324464, "loss": 0.308, "step": 680 }, { "epoch": 0.13782635094110504, "grad_norm": 0.3063257038593292, "learning_rate": 0.00019088612810084805, "loss": 0.2694, "step": 681 }, { "epoch": 0.1380287391216353, "grad_norm": 0.3525073826313019, "learning_rate": 0.0001908595628215673, "loss": 0.248, "step": 682 }, { "epoch": 0.13823112730216555, "grad_norm": 0.34887996315956116, "learning_rate": 0.0001908329607361637, "loss": 0.2433, "step": 683 }, { "epoch": 0.1384335154826958, "grad_norm": 0.3819187581539154, "learning_rate": 0.00019080632185541343, "loss": 0.2623, "step": 684 }, { "epoch": 0.13863590366322606, "grad_norm": 0.40141919255256653, "learning_rate": 0.0001907796461901076, "loss": 0.2791, "step": 685 }, { "epoch": 0.13883829184375632, "grad_norm": 0.3135068714618683, "learning_rate": 0.00019075293375105213, "loss": 0.2513, "step": 686 }, { "epoch": 0.13904068002428657, "grad_norm": 0.3952779471874237, "learning_rate": 0.00019072618454906798, "loss": 0.2411, "step": 687 }, { "epoch": 0.13924306820481683, "grad_norm": 0.31623515486717224, "learning_rate": 0.0001906993985949909, "loss": 0.2721, "step": 688 }, { "epoch": 0.13944545638534708, "grad_norm": 0.3655812442302704, "learning_rate": 0.0001906725758996716, "loss": 0.2342, "step": 689 }, { "epoch": 0.13964784456587737, "grad_norm": 0.35465773940086365, "learning_rate": 0.00019064571647397558, "loss": 0.2954, "step": 690 }, { "epoch": 0.13985023274640762, "grad_norm": 0.3672182261943817, "learning_rate": 0.00019061882032878332, "loss": 0.2809, "step": 691 }, { "epoch": 0.14005262092693788, "grad_norm": 0.34542813897132874, "learning_rate": 0.00019059188747499005, "loss": 0.2534, "step": 692 }, { "epoch": 0.14025500910746813, "grad_norm": 0.36235249042510986, "learning_rate": 0.00019056491792350606, "loss": 0.255, "step": 693 }, { "epoch": 0.1404573972879984, "grad_norm": 0.2668914496898651, "learning_rate": 0.00019053791168525627, "loss": 0.2262, "step": 694 }, { "epoch": 0.14065978546852864, "grad_norm": 0.2793848514556885, "learning_rate": 0.0001905108687711807, "loss": 0.2464, "step": 695 }, { "epoch": 0.1408621736490589, "grad_norm": 0.29764464497566223, "learning_rate": 0.00019048378919223405, "loss": 0.2443, "step": 696 }, { "epoch": 0.14106456182958915, "grad_norm": 0.33268776535987854, "learning_rate": 0.00019045667295938592, "loss": 0.2225, "step": 697 }, { "epoch": 0.1412669500101194, "grad_norm": 0.28696081042289734, "learning_rate": 0.0001904295200836208, "loss": 0.2389, "step": 698 }, { "epoch": 0.14146933819064966, "grad_norm": 0.6153262257575989, "learning_rate": 0.000190402330575938, "loss": 0.3124, "step": 699 }, { "epoch": 0.14167172637117992, "grad_norm": 0.39108186960220337, "learning_rate": 0.00019037510444735168, "loss": 0.2041, "step": 700 }, { "epoch": 0.14167172637117992, "eval_loss": 0.29775145649909973, "eval_runtime": 1.3196, "eval_samples_per_second": 3.789, "eval_steps_per_second": 0.758, "step": 700 }, { "epoch": 0.14187411455171017, "grad_norm": 0.3963056802749634, "learning_rate": 0.00019034784170889076, "loss": 0.2441, "step": 701 }, { "epoch": 0.14207650273224043, "grad_norm": 0.37787678837776184, "learning_rate": 0.00019032054237159906, "loss": 0.251, "step": 702 }, { "epoch": 0.14227889091277068, "grad_norm": 0.2585795521736145, "learning_rate": 0.00019029320644653528, "loss": 0.2365, "step": 703 }, { "epoch": 0.14248127909330094, "grad_norm": 0.3206770718097687, "learning_rate": 0.0001902658339447728, "loss": 0.2393, "step": 704 }, { "epoch": 0.14268366727383122, "grad_norm": 0.3604775071144104, "learning_rate": 0.0001902384248774, "loss": 0.2828, "step": 705 }, { "epoch": 0.14288605545436148, "grad_norm": 0.3225979804992676, "learning_rate": 0.0001902109792555198, "loss": 0.2286, "step": 706 }, { "epoch": 0.14308844363489173, "grad_norm": 0.43546780943870544, "learning_rate": 0.00019018349709025022, "loss": 0.2312, "step": 707 }, { "epoch": 0.143290831815422, "grad_norm": 0.3137240409851074, "learning_rate": 0.00019015597839272395, "loss": 0.268, "step": 708 }, { "epoch": 0.14349321999595224, "grad_norm": 0.27973780035972595, "learning_rate": 0.00019012842317408843, "loss": 0.2311, "step": 709 }, { "epoch": 0.1436956081764825, "grad_norm": 0.3511960804462433, "learning_rate": 0.000190100831445506, "loss": 0.2807, "step": 710 }, { "epoch": 0.14389799635701275, "grad_norm": 0.3491211235523224, "learning_rate": 0.0001900732032181537, "loss": 0.2282, "step": 711 }, { "epoch": 0.144100384537543, "grad_norm": 0.4287636876106262, "learning_rate": 0.00019004553850322342, "loss": 0.2409, "step": 712 }, { "epoch": 0.14430277271807326, "grad_norm": 0.3858319818973541, "learning_rate": 0.0001900178373119218, "loss": 0.231, "step": 713 }, { "epoch": 0.14450516089860352, "grad_norm": 0.42482271790504456, "learning_rate": 0.00018999009965547025, "loss": 0.2282, "step": 714 }, { "epoch": 0.14470754907913377, "grad_norm": 0.32496002316474915, "learning_rate": 0.000189962325545105, "loss": 0.2864, "step": 715 }, { "epoch": 0.14490993725966403, "grad_norm": 0.47039979696273804, "learning_rate": 0.00018993451499207695, "loss": 0.2997, "step": 716 }, { "epoch": 0.14511232544019428, "grad_norm": 1.0757962465286255, "learning_rate": 0.00018990666800765187, "loss": 0.2351, "step": 717 }, { "epoch": 0.14531471362072454, "grad_norm": 0.5151010155677795, "learning_rate": 0.00018987878460311023, "loss": 0.2369, "step": 718 }, { "epoch": 0.1455171018012548, "grad_norm": 0.43854132294654846, "learning_rate": 0.00018985086478974722, "loss": 0.2331, "step": 719 }, { "epoch": 0.14571948998178508, "grad_norm": 0.30598852038383484, "learning_rate": 0.00018982290857887292, "loss": 0.2623, "step": 720 }, { "epoch": 0.14592187816231533, "grad_norm": 0.4506291449069977, "learning_rate": 0.00018979491598181193, "loss": 0.2512, "step": 721 }, { "epoch": 0.1461242663428456, "grad_norm": 0.3413534164428711, "learning_rate": 0.00018976688700990385, "loss": 0.215, "step": 722 }, { "epoch": 0.14632665452337584, "grad_norm": 0.3063417971134186, "learning_rate": 0.0001897388216745028, "loss": 0.246, "step": 723 }, { "epoch": 0.1465290427039061, "grad_norm": 0.34487882256507874, "learning_rate": 0.00018971071998697772, "loss": 0.2527, "step": 724 }, { "epoch": 0.14673143088443635, "grad_norm": 0.5452083349227905, "learning_rate": 0.0001896825819587123, "loss": 0.2356, "step": 725 }, { "epoch": 0.1469338190649666, "grad_norm": 0.6690550446510315, "learning_rate": 0.00018965440760110487, "loss": 0.2783, "step": 726 }, { "epoch": 0.14713620724549686, "grad_norm": 0.2755454480648041, "learning_rate": 0.00018962619692556855, "loss": 0.2364, "step": 727 }, { "epoch": 0.14733859542602712, "grad_norm": 0.281415730714798, "learning_rate": 0.00018959794994353115, "loss": 0.2227, "step": 728 }, { "epoch": 0.14754098360655737, "grad_norm": 0.38511228561401367, "learning_rate": 0.00018956966666643522, "loss": 0.2559, "step": 729 }, { "epoch": 0.14774337178708763, "grad_norm": 0.414522647857666, "learning_rate": 0.0001895413471057379, "loss": 0.2604, "step": 730 }, { "epoch": 0.14794575996761788, "grad_norm": 0.2914217710494995, "learning_rate": 0.00018951299127291116, "loss": 0.2379, "step": 731 }, { "epoch": 0.14814814814814814, "grad_norm": 0.3874073326587677, "learning_rate": 0.0001894845991794416, "loss": 0.2986, "step": 732 }, { "epoch": 0.1483505363286784, "grad_norm": 0.47619831562042236, "learning_rate": 0.0001894561708368305, "loss": 0.2646, "step": 733 }, { "epoch": 0.14855292450920865, "grad_norm": 0.28893476724624634, "learning_rate": 0.00018942770625659386, "loss": 0.2317, "step": 734 }, { "epoch": 0.1487553126897389, "grad_norm": 0.3063059151172638, "learning_rate": 0.00018939920545026233, "loss": 0.2565, "step": 735 }, { "epoch": 0.1489577008702692, "grad_norm": 0.3602577745914459, "learning_rate": 0.00018937066842938127, "loss": 0.2286, "step": 736 }, { "epoch": 0.14916008905079944, "grad_norm": 0.3898904621601105, "learning_rate": 0.00018934209520551066, "loss": 0.2831, "step": 737 }, { "epoch": 0.1493624772313297, "grad_norm": 0.47818851470947266, "learning_rate": 0.0001893134857902252, "loss": 0.2559, "step": 738 }, { "epoch": 0.14956486541185995, "grad_norm": 0.3992469906806946, "learning_rate": 0.00018928484019511418, "loss": 0.2681, "step": 739 }, { "epoch": 0.1497672535923902, "grad_norm": 0.4363305866718292, "learning_rate": 0.0001892561584317816, "loss": 0.2736, "step": 740 }, { "epoch": 0.14996964177292046, "grad_norm": 0.5279491543769836, "learning_rate": 0.0001892274405118461, "loss": 0.2606, "step": 741 }, { "epoch": 0.15017202995345072, "grad_norm": 0.45467397570610046, "learning_rate": 0.000189198686446941, "loss": 0.3291, "step": 742 }, { "epoch": 0.15037441813398097, "grad_norm": 0.3572303056716919, "learning_rate": 0.0001891698962487142, "loss": 0.2622, "step": 743 }, { "epoch": 0.15057680631451123, "grad_norm": 0.318308562040329, "learning_rate": 0.00018914106992882828, "loss": 0.2204, "step": 744 }, { "epoch": 0.15077919449504149, "grad_norm": 0.3323497176170349, "learning_rate": 0.0001891122074989604, "loss": 0.2198, "step": 745 }, { "epoch": 0.15098158267557174, "grad_norm": 0.2885453402996063, "learning_rate": 0.00018908330897080244, "loss": 0.2323, "step": 746 }, { "epoch": 0.151183970856102, "grad_norm": 0.3950914144515991, "learning_rate": 0.0001890543743560608, "loss": 0.2595, "step": 747 }, { "epoch": 0.15138635903663225, "grad_norm": 0.392425537109375, "learning_rate": 0.00018902540366645657, "loss": 0.2291, "step": 748 }, { "epoch": 0.1515887472171625, "grad_norm": 0.30079102516174316, "learning_rate": 0.00018899639691372544, "loss": 0.2691, "step": 749 }, { "epoch": 0.15179113539769276, "grad_norm": 0.3596877157688141, "learning_rate": 0.00018896735410961767, "loss": 0.2602, "step": 750 }, { "epoch": 0.15179113539769276, "eval_loss": 0.2997572422027588, "eval_runtime": 1.3148, "eval_samples_per_second": 3.803, "eval_steps_per_second": 0.761, "step": 750 }, { "epoch": 0.15199352357822304, "grad_norm": 0.5068015456199646, "learning_rate": 0.00018893827526589817, "loss": 0.2434, "step": 751 }, { "epoch": 0.1521959117587533, "grad_norm": 0.4442897439002991, "learning_rate": 0.00018890916039434642, "loss": 0.2891, "step": 752 }, { "epoch": 0.15239829993928355, "grad_norm": 0.7166479229927063, "learning_rate": 0.00018888000950675654, "loss": 0.3013, "step": 753 }, { "epoch": 0.1526006881198138, "grad_norm": 0.30901750922203064, "learning_rate": 0.0001888508226149372, "loss": 0.2498, "step": 754 }, { "epoch": 0.15280307630034407, "grad_norm": 0.38305988907814026, "learning_rate": 0.00018882159973071159, "loss": 0.2535, "step": 755 }, { "epoch": 0.15300546448087432, "grad_norm": 0.29901936650276184, "learning_rate": 0.00018879234086591765, "loss": 0.2526, "step": 756 }, { "epoch": 0.15320785266140458, "grad_norm": 0.5786696672439575, "learning_rate": 0.00018876304603240773, "loss": 0.3119, "step": 757 }, { "epoch": 0.15341024084193483, "grad_norm": 0.25216397643089294, "learning_rate": 0.00018873371524204884, "loss": 0.2254, "step": 758 }, { "epoch": 0.15361262902246509, "grad_norm": 0.3706039488315582, "learning_rate": 0.00018870434850672255, "loss": 0.2592, "step": 759 }, { "epoch": 0.15381501720299534, "grad_norm": 0.3494788408279419, "learning_rate": 0.00018867494583832493, "loss": 0.2651, "step": 760 }, { "epoch": 0.1540174053835256, "grad_norm": 0.3870294988155365, "learning_rate": 0.00018864550724876668, "loss": 0.2523, "step": 761 }, { "epoch": 0.15421979356405585, "grad_norm": 0.3510667383670807, "learning_rate": 0.00018861603274997302, "loss": 0.244, "step": 762 }, { "epoch": 0.1544221817445861, "grad_norm": 0.323993444442749, "learning_rate": 0.0001885865223538837, "loss": 0.2627, "step": 763 }, { "epoch": 0.15462456992511636, "grad_norm": 0.42742663621902466, "learning_rate": 0.00018855697607245305, "loss": 0.2603, "step": 764 }, { "epoch": 0.15482695810564662, "grad_norm": 0.33221808075904846, "learning_rate": 0.0001885273939176499, "loss": 0.2471, "step": 765 }, { "epoch": 0.1550293462861769, "grad_norm": 0.3855549693107605, "learning_rate": 0.00018849777590145762, "loss": 0.2225, "step": 766 }, { "epoch": 0.15523173446670716, "grad_norm": 0.2925729751586914, "learning_rate": 0.00018846812203587415, "loss": 0.2569, "step": 767 }, { "epoch": 0.1554341226472374, "grad_norm": 0.3755352199077606, "learning_rate": 0.0001884384323329119, "loss": 0.259, "step": 768 }, { "epoch": 0.15563651082776767, "grad_norm": 0.4056079387664795, "learning_rate": 0.0001884087068045978, "loss": 0.2784, "step": 769 }, { "epoch": 0.15583889900829792, "grad_norm": 0.36607983708381653, "learning_rate": 0.00018837894546297332, "loss": 0.2652, "step": 770 }, { "epoch": 0.15604128718882818, "grad_norm": 0.360861599445343, "learning_rate": 0.00018834914832009442, "loss": 0.249, "step": 771 }, { "epoch": 0.15624367536935843, "grad_norm": 0.3167110085487366, "learning_rate": 0.00018831931538803155, "loss": 0.2492, "step": 772 }, { "epoch": 0.1564460635498887, "grad_norm": 0.4494132995605469, "learning_rate": 0.00018828944667886969, "loss": 0.2661, "step": 773 }, { "epoch": 0.15664845173041894, "grad_norm": 0.39809274673461914, "learning_rate": 0.0001882595422047083, "loss": 0.278, "step": 774 }, { "epoch": 0.1568508399109492, "grad_norm": 0.3770066499710083, "learning_rate": 0.0001882296019776613, "loss": 0.2887, "step": 775 }, { "epoch": 0.15705322809147945, "grad_norm": 0.3856280744075775, "learning_rate": 0.00018819962600985716, "loss": 0.2386, "step": 776 }, { "epoch": 0.1572556162720097, "grad_norm": 0.2342846691608429, "learning_rate": 0.0001881696143134388, "loss": 0.2156, "step": 777 }, { "epoch": 0.15745800445253996, "grad_norm": 0.23688459396362305, "learning_rate": 0.0001881395669005635, "loss": 0.2066, "step": 778 }, { "epoch": 0.15766039263307022, "grad_norm": 0.4036184251308441, "learning_rate": 0.0001881094837834032, "loss": 0.2606, "step": 779 }, { "epoch": 0.15786278081360047, "grad_norm": 0.33522829413414, "learning_rate": 0.00018807936497414418, "loss": 0.2492, "step": 780 }, { "epoch": 0.15806516899413076, "grad_norm": 0.3788508474826813, "learning_rate": 0.00018804921048498722, "loss": 0.2635, "step": 781 }, { "epoch": 0.158267557174661, "grad_norm": 0.3290668725967407, "learning_rate": 0.00018801902032814755, "loss": 0.2498, "step": 782 }, { "epoch": 0.15846994535519127, "grad_norm": 0.3567562997341156, "learning_rate": 0.00018798879451585482, "loss": 0.2591, "step": 783 }, { "epoch": 0.15867233353572152, "grad_norm": 0.3006361722946167, "learning_rate": 0.00018795853306035313, "loss": 0.2465, "step": 784 }, { "epoch": 0.15887472171625178, "grad_norm": 0.3369651436805725, "learning_rate": 0.00018792823597390106, "loss": 0.254, "step": 785 }, { "epoch": 0.15907710989678203, "grad_norm": 0.42409297823905945, "learning_rate": 0.00018789790326877162, "loss": 0.2662, "step": 786 }, { "epoch": 0.1592794980773123, "grad_norm": 0.3042981028556824, "learning_rate": 0.00018786753495725217, "loss": 0.2422, "step": 787 }, { "epoch": 0.15948188625784254, "grad_norm": 0.3036056160926819, "learning_rate": 0.0001878371310516446, "loss": 0.2375, "step": 788 }, { "epoch": 0.1596842744383728, "grad_norm": 0.812958836555481, "learning_rate": 0.00018780669156426516, "loss": 0.2167, "step": 789 }, { "epoch": 0.15988666261890305, "grad_norm": 0.42303574085235596, "learning_rate": 0.0001877762165074445, "loss": 0.2515, "step": 790 }, { "epoch": 0.1600890507994333, "grad_norm": 0.4357164800167084, "learning_rate": 0.0001877457058935277, "loss": 0.2393, "step": 791 }, { "epoch": 0.16029143897996356, "grad_norm": 0.41273993253707886, "learning_rate": 0.00018771515973487428, "loss": 0.2227, "step": 792 }, { "epoch": 0.16049382716049382, "grad_norm": 0.550197184085846, "learning_rate": 0.0001876845780438581, "loss": 0.2627, "step": 793 }, { "epoch": 0.16069621534102407, "grad_norm": 0.3392764925956726, "learning_rate": 0.00018765396083286743, "loss": 0.2402, "step": 794 }, { "epoch": 0.16089860352155433, "grad_norm": 0.36948779225349426, "learning_rate": 0.00018762330811430492, "loss": 0.2188, "step": 795 }, { "epoch": 0.1611009917020846, "grad_norm": 0.8481617569923401, "learning_rate": 0.0001875926199005877, "loss": 0.2409, "step": 796 }, { "epoch": 0.16130337988261487, "grad_norm": 0.4211081266403198, "learning_rate": 0.00018756189620414712, "loss": 0.2714, "step": 797 }, { "epoch": 0.16150576806314512, "grad_norm": 0.34100785851478577, "learning_rate": 0.000187531137037429, "loss": 0.2319, "step": 798 }, { "epoch": 0.16170815624367538, "grad_norm": 0.3640136122703552, "learning_rate": 0.00018750034241289353, "loss": 0.2436, "step": 799 }, { "epoch": 0.16191054442420563, "grad_norm": 0.2910507917404175, "learning_rate": 0.00018746951234301524, "loss": 0.2555, "step": 800 }, { "epoch": 0.16191054442420563, "eval_loss": 0.2940669059753418, "eval_runtime": 1.3189, "eval_samples_per_second": 3.791, "eval_steps_per_second": 0.758, "step": 800 }, { "epoch": 0.1621129326047359, "grad_norm": 0.31314074993133545, "learning_rate": 0.00018743864684028297, "loss": 0.2745, "step": 801 }, { "epoch": 0.16231532078526614, "grad_norm": 0.3645711839199066, "learning_rate": 0.00018740774591720008, "loss": 0.2562, "step": 802 }, { "epoch": 0.1625177089657964, "grad_norm": 0.33595263957977295, "learning_rate": 0.00018737680958628403, "loss": 0.2465, "step": 803 }, { "epoch": 0.16272009714632665, "grad_norm": 0.3377053439617157, "learning_rate": 0.00018734583786006687, "loss": 0.2513, "step": 804 }, { "epoch": 0.1629224853268569, "grad_norm": 0.4379747807979584, "learning_rate": 0.0001873148307510948, "loss": 0.2694, "step": 805 }, { "epoch": 0.16312487350738716, "grad_norm": 0.30942919850349426, "learning_rate": 0.00018728378827192845, "loss": 0.2606, "step": 806 }, { "epoch": 0.16332726168791742, "grad_norm": 0.4087422788143158, "learning_rate": 0.0001872527104351428, "loss": 0.2442, "step": 807 }, { "epoch": 0.16352964986844767, "grad_norm": 0.35748982429504395, "learning_rate": 0.00018722159725332706, "loss": 0.2549, "step": 808 }, { "epoch": 0.16373203804897793, "grad_norm": 0.459611177444458, "learning_rate": 0.0001871904487390848, "loss": 0.2352, "step": 809 }, { "epoch": 0.16393442622950818, "grad_norm": 0.3697472810745239, "learning_rate": 0.00018715926490503397, "loss": 0.2657, "step": 810 }, { "epoch": 0.16413681441003847, "grad_norm": 0.37559008598327637, "learning_rate": 0.00018712804576380673, "loss": 0.2392, "step": 811 }, { "epoch": 0.16433920259056872, "grad_norm": 0.35550206899642944, "learning_rate": 0.00018709679132804957, "loss": 0.2573, "step": 812 }, { "epoch": 0.16454159077109898, "grad_norm": 0.29014283418655396, "learning_rate": 0.0001870655016104233, "loss": 0.2345, "step": 813 }, { "epoch": 0.16474397895162923, "grad_norm": 0.48033180832862854, "learning_rate": 0.00018703417662360302, "loss": 0.2828, "step": 814 }, { "epoch": 0.1649463671321595, "grad_norm": 0.3042934834957123, "learning_rate": 0.0001870028163802781, "loss": 0.2466, "step": 815 }, { "epoch": 0.16514875531268974, "grad_norm": 0.3336636424064636, "learning_rate": 0.0001869714208931522, "loss": 0.2274, "step": 816 }, { "epoch": 0.16535114349322, "grad_norm": 0.363098680973053, "learning_rate": 0.00018693999017494327, "loss": 0.2539, "step": 817 }, { "epoch": 0.16555353167375025, "grad_norm": 0.2703571617603302, "learning_rate": 0.0001869085242383835, "loss": 0.2496, "step": 818 }, { "epoch": 0.1657559198542805, "grad_norm": 0.3858727216720581, "learning_rate": 0.00018687702309621936, "loss": 0.2733, "step": 819 }, { "epoch": 0.16595830803481076, "grad_norm": 0.3601354658603668, "learning_rate": 0.0001868454867612116, "loss": 0.2588, "step": 820 }, { "epoch": 0.16616069621534102, "grad_norm": 0.4197833836078644, "learning_rate": 0.00018681391524613518, "loss": 0.2548, "step": 821 }, { "epoch": 0.16636308439587127, "grad_norm": 0.36976855993270874, "learning_rate": 0.00018678230856377943, "loss": 0.2276, "step": 822 }, { "epoch": 0.16656547257640153, "grad_norm": 0.4233555197715759, "learning_rate": 0.00018675066672694775, "loss": 0.2663, "step": 823 }, { "epoch": 0.16676786075693179, "grad_norm": 0.384177565574646, "learning_rate": 0.00018671898974845788, "loss": 0.2999, "step": 824 }, { "epoch": 0.16697024893746204, "grad_norm": 0.26830950379371643, "learning_rate": 0.00018668727764114187, "loss": 0.2577, "step": 825 }, { "epoch": 0.1671726371179923, "grad_norm": 0.4031645655632019, "learning_rate": 0.0001866555304178458, "loss": 0.2756, "step": 826 }, { "epoch": 0.16737502529852258, "grad_norm": 0.2974866032600403, "learning_rate": 0.00018662374809143013, "loss": 0.2069, "step": 827 }, { "epoch": 0.16757741347905283, "grad_norm": 0.3027338981628418, "learning_rate": 0.00018659193067476957, "loss": 0.2517, "step": 828 }, { "epoch": 0.1677798016595831, "grad_norm": 0.3020446002483368, "learning_rate": 0.00018656007818075286, "loss": 0.2479, "step": 829 }, { "epoch": 0.16798218984011334, "grad_norm": 0.4435567557811737, "learning_rate": 0.00018652819062228316, "loss": 0.232, "step": 830 }, { "epoch": 0.1681845780206436, "grad_norm": 0.29276329278945923, "learning_rate": 0.0001864962680122777, "loss": 0.2839, "step": 831 }, { "epoch": 0.16838696620117385, "grad_norm": 0.287499338388443, "learning_rate": 0.00018646431036366793, "loss": 0.243, "step": 832 }, { "epoch": 0.1685893543817041, "grad_norm": 0.3262856900691986, "learning_rate": 0.00018643231768939955, "loss": 0.235, "step": 833 }, { "epoch": 0.16879174256223436, "grad_norm": 0.6079496145248413, "learning_rate": 0.00018640029000243236, "loss": 0.3112, "step": 834 }, { "epoch": 0.16899413074276462, "grad_norm": 0.3428683578968048, "learning_rate": 0.0001863682273157405, "loss": 0.238, "step": 835 }, { "epoch": 0.16919651892329488, "grad_norm": 0.40579506754875183, "learning_rate": 0.00018633612964231205, "loss": 0.2815, "step": 836 }, { "epoch": 0.16939890710382513, "grad_norm": 0.2931835651397705, "learning_rate": 0.00018630399699514944, "loss": 0.2198, "step": 837 }, { "epoch": 0.16960129528435539, "grad_norm": 0.3209609091281891, "learning_rate": 0.00018627182938726926, "loss": 0.2834, "step": 838 }, { "epoch": 0.16980368346488564, "grad_norm": 0.4252403974533081, "learning_rate": 0.00018623962683170215, "loss": 0.2967, "step": 839 }, { "epoch": 0.1700060716454159, "grad_norm": 0.5377660393714905, "learning_rate": 0.00018620738934149307, "loss": 0.2343, "step": 840 }, { "epoch": 0.17020845982594615, "grad_norm": 0.3427773714065552, "learning_rate": 0.000186175116929701, "loss": 0.2734, "step": 841 }, { "epoch": 0.17041084800647643, "grad_norm": 0.3581143021583557, "learning_rate": 0.00018614280960939906, "loss": 0.2922, "step": 842 }, { "epoch": 0.1706132361870067, "grad_norm": 0.36788514256477356, "learning_rate": 0.00018611046739367462, "loss": 0.246, "step": 843 }, { "epoch": 0.17081562436753694, "grad_norm": 0.2968742549419403, "learning_rate": 0.00018607809029562913, "loss": 0.2496, "step": 844 }, { "epoch": 0.1710180125480672, "grad_norm": 0.4073772132396698, "learning_rate": 0.0001860456783283781, "loss": 0.2727, "step": 845 }, { "epoch": 0.17122040072859745, "grad_norm": 0.43219760060310364, "learning_rate": 0.00018601323150505133, "loss": 0.2767, "step": 846 }, { "epoch": 0.1714227889091277, "grad_norm": 0.29775089025497437, "learning_rate": 0.00018598074983879257, "loss": 0.2638, "step": 847 }, { "epoch": 0.17162517708965797, "grad_norm": 0.35835182666778564, "learning_rate": 0.00018594823334275974, "loss": 0.2577, "step": 848 }, { "epoch": 0.17182756527018822, "grad_norm": 0.32153385877609253, "learning_rate": 0.00018591568203012496, "loss": 0.2324, "step": 849 }, { "epoch": 0.17202995345071848, "grad_norm": 0.37278905510902405, "learning_rate": 0.0001858830959140743, "loss": 0.2763, "step": 850 }, { "epoch": 0.17202995345071848, "eval_loss": 0.2905246913433075, "eval_runtime": 1.3191, "eval_samples_per_second": 3.79, "eval_steps_per_second": 0.758, "step": 850 }, { "epoch": 0.17223234163124873, "grad_norm": 0.33816516399383545, "learning_rate": 0.00018585047500780806, "loss": 0.2454, "step": 851 }, { "epoch": 0.172434729811779, "grad_norm": 0.5656778812408447, "learning_rate": 0.00018581781932454052, "loss": 0.2572, "step": 852 }, { "epoch": 0.17263711799230924, "grad_norm": 0.6452876329421997, "learning_rate": 0.00018578512887750017, "loss": 0.2222, "step": 853 }, { "epoch": 0.1728395061728395, "grad_norm": 0.38986819982528687, "learning_rate": 0.0001857524036799295, "loss": 0.2477, "step": 854 }, { "epoch": 0.17304189435336975, "grad_norm": 0.3929271996021271, "learning_rate": 0.00018571964374508506, "loss": 0.2838, "step": 855 }, { "epoch": 0.1732442825339, "grad_norm": 0.42379677295684814, "learning_rate": 0.00018568684908623756, "loss": 0.2551, "step": 856 }, { "epoch": 0.1734466707144303, "grad_norm": 0.5375417470932007, "learning_rate": 0.00018565401971667168, "loss": 0.239, "step": 857 }, { "epoch": 0.17364905889496055, "grad_norm": 0.345939576625824, "learning_rate": 0.00018562115564968625, "loss": 0.2745, "step": 858 }, { "epoch": 0.1738514470754908, "grad_norm": 0.3439200222492218, "learning_rate": 0.0001855882568985941, "loss": 0.2631, "step": 859 }, { "epoch": 0.17405383525602106, "grad_norm": 0.41948434710502625, "learning_rate": 0.0001855553234767221, "loss": 0.2336, "step": 860 }, { "epoch": 0.1742562234365513, "grad_norm": 0.29237958788871765, "learning_rate": 0.00018552235539741116, "loss": 0.2909, "step": 861 }, { "epoch": 0.17445861161708157, "grad_norm": 0.3038657307624817, "learning_rate": 0.00018548935267401632, "loss": 0.2268, "step": 862 }, { "epoch": 0.17466099979761182, "grad_norm": 0.5614389777183533, "learning_rate": 0.00018545631531990652, "loss": 0.2511, "step": 863 }, { "epoch": 0.17486338797814208, "grad_norm": 0.3933039605617523, "learning_rate": 0.00018542324334846488, "loss": 0.2543, "step": 864 }, { "epoch": 0.17506577615867233, "grad_norm": 0.4585455358028412, "learning_rate": 0.00018539013677308842, "loss": 0.2655, "step": 865 }, { "epoch": 0.1752681643392026, "grad_norm": 0.642288327217102, "learning_rate": 0.0001853569956071882, "loss": 0.2931, "step": 866 }, { "epoch": 0.17547055251973284, "grad_norm": 0.40841567516326904, "learning_rate": 0.0001853238198641893, "loss": 0.278, "step": 867 }, { "epoch": 0.1756729407002631, "grad_norm": 0.4422517716884613, "learning_rate": 0.00018529060955753087, "loss": 0.2441, "step": 868 }, { "epoch": 0.17587532888079335, "grad_norm": 0.4614473283290863, "learning_rate": 0.00018525736470066594, "loss": 0.2701, "step": 869 }, { "epoch": 0.1760777170613236, "grad_norm": 0.3251465857028961, "learning_rate": 0.00018522408530706168, "loss": 0.2687, "step": 870 }, { "epoch": 0.17628010524185386, "grad_norm": 0.46950843930244446, "learning_rate": 0.00018519077139019915, "loss": 0.2503, "step": 871 }, { "epoch": 0.17648249342238415, "grad_norm": 0.8243887424468994, "learning_rate": 0.00018515742296357338, "loss": 0.2954, "step": 872 }, { "epoch": 0.1766848816029144, "grad_norm": 0.35600486397743225, "learning_rate": 0.00018512404004069342, "loss": 0.2442, "step": 873 }, { "epoch": 0.17688726978344466, "grad_norm": 0.5414007306098938, "learning_rate": 0.00018509062263508236, "loss": 0.2595, "step": 874 }, { "epoch": 0.1770896579639749, "grad_norm": 0.5169119238853455, "learning_rate": 0.00018505717076027712, "loss": 0.2675, "step": 875 }, { "epoch": 0.17729204614450517, "grad_norm": 0.4278315603733063, "learning_rate": 0.0001850236844298287, "loss": 0.2525, "step": 876 }, { "epoch": 0.17749443432503542, "grad_norm": 0.2882659137248993, "learning_rate": 0.00018499016365730201, "loss": 0.2177, "step": 877 }, { "epoch": 0.17769682250556568, "grad_norm": 0.3621360659599304, "learning_rate": 0.0001849566084562759, "loss": 0.2618, "step": 878 }, { "epoch": 0.17789921068609593, "grad_norm": 0.47867634892463684, "learning_rate": 0.00018492301884034316, "loss": 0.2465, "step": 879 }, { "epoch": 0.1781015988666262, "grad_norm": 0.37105533480644226, "learning_rate": 0.0001848893948231106, "loss": 0.2395, "step": 880 }, { "epoch": 0.17830398704715644, "grad_norm": 0.3731389045715332, "learning_rate": 0.00018485573641819887, "loss": 0.2275, "step": 881 }, { "epoch": 0.1785063752276867, "grad_norm": 0.3359118402004242, "learning_rate": 0.00018482204363924254, "loss": 0.2504, "step": 882 }, { "epoch": 0.17870876340821695, "grad_norm": 0.3526201844215393, "learning_rate": 0.00018478831649989025, "loss": 0.2524, "step": 883 }, { "epoch": 0.1789111515887472, "grad_norm": 0.3005167245864868, "learning_rate": 0.0001847545550138044, "loss": 0.2194, "step": 884 }, { "epoch": 0.17911353976927746, "grad_norm": 0.3063315451145172, "learning_rate": 0.00018472075919466135, "loss": 0.2284, "step": 885 }, { "epoch": 0.17931592794980772, "grad_norm": 0.5195674300193787, "learning_rate": 0.00018468692905615144, "loss": 0.2289, "step": 886 }, { "epoch": 0.179518316130338, "grad_norm": 0.3882855474948883, "learning_rate": 0.0001846530646119788, "loss": 0.2584, "step": 887 }, { "epoch": 0.17972070431086826, "grad_norm": 0.7658414840698242, "learning_rate": 0.00018461916587586156, "loss": 0.2657, "step": 888 }, { "epoch": 0.1799230924913985, "grad_norm": 0.4395122528076172, "learning_rate": 0.00018458523286153167, "loss": 0.2575, "step": 889 }, { "epoch": 0.18012548067192877, "grad_norm": 0.3886345624923706, "learning_rate": 0.00018455126558273498, "loss": 0.2847, "step": 890 }, { "epoch": 0.18032786885245902, "grad_norm": 0.4332709312438965, "learning_rate": 0.00018451726405323122, "loss": 0.2842, "step": 891 }, { "epoch": 0.18053025703298928, "grad_norm": 0.3239903151988983, "learning_rate": 0.00018448322828679405, "loss": 0.2428, "step": 892 }, { "epoch": 0.18073264521351953, "grad_norm": 0.35239177942276, "learning_rate": 0.0001844491582972109, "loss": 0.2949, "step": 893 }, { "epoch": 0.1809350333940498, "grad_norm": 0.40151652693748474, "learning_rate": 0.00018441505409828312, "loss": 0.2483, "step": 894 }, { "epoch": 0.18113742157458004, "grad_norm": 0.37485212087631226, "learning_rate": 0.00018438091570382596, "loss": 0.2501, "step": 895 }, { "epoch": 0.1813398097551103, "grad_norm": 0.4656164348125458, "learning_rate": 0.00018434674312766842, "loss": 0.238, "step": 896 }, { "epoch": 0.18154219793564055, "grad_norm": 0.28423821926116943, "learning_rate": 0.00018431253638365346, "loss": 0.2475, "step": 897 }, { "epoch": 0.1817445861161708, "grad_norm": 0.44694411754608154, "learning_rate": 0.00018427829548563776, "loss": 0.2438, "step": 898 }, { "epoch": 0.18194697429670106, "grad_norm": 0.3071068525314331, "learning_rate": 0.00018424402044749192, "loss": 0.2675, "step": 899 }, { "epoch": 0.18214936247723132, "grad_norm": 0.34001922607421875, "learning_rate": 0.0001842097112831004, "loss": 0.2399, "step": 900 }, { "epoch": 0.18214936247723132, "eval_loss": 0.2879277467727661, "eval_runtime": 1.3154, "eval_samples_per_second": 3.801, "eval_steps_per_second": 0.76, "step": 900 }, { "epoch": 0.18235175065776157, "grad_norm": 0.3974973261356354, "learning_rate": 0.00018417536800636138, "loss": 0.2675, "step": 901 }, { "epoch": 0.18255413883829186, "grad_norm": 0.38737133145332336, "learning_rate": 0.00018414099063118687, "loss": 0.2768, "step": 902 }, { "epoch": 0.1827565270188221, "grad_norm": 0.39120012521743774, "learning_rate": 0.00018410657917150282, "loss": 0.2739, "step": 903 }, { "epoch": 0.18295891519935237, "grad_norm": 0.34539973735809326, "learning_rate": 0.0001840721336412489, "loss": 0.2394, "step": 904 }, { "epoch": 0.18316130337988262, "grad_norm": 0.37100082635879517, "learning_rate": 0.0001840376540543785, "loss": 0.2801, "step": 905 }, { "epoch": 0.18336369156041288, "grad_norm": 0.3418366014957428, "learning_rate": 0.00018400314042485896, "loss": 0.2236, "step": 906 }, { "epoch": 0.18356607974094313, "grad_norm": 0.43396133184432983, "learning_rate": 0.00018396859276667132, "loss": 0.1779, "step": 907 }, { "epoch": 0.1837684679214734, "grad_norm": 0.2989194989204407, "learning_rate": 0.00018393401109381038, "loss": 0.2456, "step": 908 }, { "epoch": 0.18397085610200364, "grad_norm": 0.4158579707145691, "learning_rate": 0.00018389939542028482, "loss": 0.2646, "step": 909 }, { "epoch": 0.1841732442825339, "grad_norm": 0.3321724534034729, "learning_rate": 0.00018386474576011702, "loss": 0.2355, "step": 910 }, { "epoch": 0.18437563246306415, "grad_norm": 0.33564624190330505, "learning_rate": 0.00018383006212734315, "loss": 0.2525, "step": 911 }, { "epoch": 0.1845780206435944, "grad_norm": 0.4092387855052948, "learning_rate": 0.00018379534453601313, "loss": 0.2424, "step": 912 }, { "epoch": 0.18478040882412466, "grad_norm": 0.29904553294181824, "learning_rate": 0.0001837605930001906, "loss": 0.2474, "step": 913 }, { "epoch": 0.18498279700465492, "grad_norm": 0.4696289002895355, "learning_rate": 0.00018372580753395307, "loss": 0.2864, "step": 914 }, { "epoch": 0.18518518518518517, "grad_norm": 0.3313215672969818, "learning_rate": 0.00018369098815139166, "loss": 0.267, "step": 915 }, { "epoch": 0.18538757336571543, "grad_norm": 0.3044814467430115, "learning_rate": 0.00018365613486661132, "loss": 0.2334, "step": 916 }, { "epoch": 0.1855899615462457, "grad_norm": 0.3326167166233063, "learning_rate": 0.00018362124769373065, "loss": 0.2451, "step": 917 }, { "epoch": 0.18579234972677597, "grad_norm": 0.5224262475967407, "learning_rate": 0.00018358632664688203, "loss": 0.3243, "step": 918 }, { "epoch": 0.18599473790730622, "grad_norm": 0.34398549795150757, "learning_rate": 0.00018355137174021162, "loss": 0.22, "step": 919 }, { "epoch": 0.18619712608783648, "grad_norm": 0.42548784613609314, "learning_rate": 0.0001835163829878792, "loss": 0.2866, "step": 920 }, { "epoch": 0.18639951426836673, "grad_norm": 0.34959691762924194, "learning_rate": 0.0001834813604040583, "loss": 0.2306, "step": 921 }, { "epoch": 0.186601902448897, "grad_norm": 0.38658004999160767, "learning_rate": 0.0001834463040029361, "loss": 0.2586, "step": 922 }, { "epoch": 0.18680429062942724, "grad_norm": 0.42646870017051697, "learning_rate": 0.00018341121379871362, "loss": 0.2344, "step": 923 }, { "epoch": 0.1870066788099575, "grad_norm": 0.30141687393188477, "learning_rate": 0.00018337608980560543, "loss": 0.2686, "step": 924 }, { "epoch": 0.18720906699048775, "grad_norm": 0.4940946102142334, "learning_rate": 0.00018334093203783985, "loss": 0.3285, "step": 925 }, { "epoch": 0.187411455171018, "grad_norm": 0.34840384125709534, "learning_rate": 0.00018330574050965886, "loss": 0.2136, "step": 926 }, { "epoch": 0.18761384335154827, "grad_norm": 0.3788219392299652, "learning_rate": 0.00018327051523531814, "loss": 0.2497, "step": 927 }, { "epoch": 0.18781623153207852, "grad_norm": 0.36543574929237366, "learning_rate": 0.000183235256229087, "loss": 0.2708, "step": 928 }, { "epoch": 0.18801861971260878, "grad_norm": 0.520186185836792, "learning_rate": 0.0001831999635052485, "loss": 0.2312, "step": 929 }, { "epoch": 0.18822100789313903, "grad_norm": 0.5425753593444824, "learning_rate": 0.00018316463707809924, "loss": 0.2398, "step": 930 }, { "epoch": 0.18842339607366929, "grad_norm": 0.3049781322479248, "learning_rate": 0.00018312927696194958, "loss": 0.2381, "step": 931 }, { "epoch": 0.18862578425419954, "grad_norm": 0.3273719549179077, "learning_rate": 0.00018309388317112345, "loss": 0.2672, "step": 932 }, { "epoch": 0.18882817243472982, "grad_norm": 0.31122419238090515, "learning_rate": 0.00018305845571995843, "loss": 0.2719, "step": 933 }, { "epoch": 0.18903056061526008, "grad_norm": 0.30341628193855286, "learning_rate": 0.00018302299462280583, "loss": 0.2615, "step": 934 }, { "epoch": 0.18923294879579033, "grad_norm": 0.32557210326194763, "learning_rate": 0.00018298749989403045, "loss": 0.2309, "step": 935 }, { "epoch": 0.1894353369763206, "grad_norm": 0.34578633308410645, "learning_rate": 0.00018295197154801081, "loss": 0.239, "step": 936 }, { "epoch": 0.18963772515685084, "grad_norm": 0.2989829182624817, "learning_rate": 0.000182916409599139, "loss": 0.2094, "step": 937 }, { "epoch": 0.1898401133373811, "grad_norm": 0.27314433455467224, "learning_rate": 0.00018288081406182079, "loss": 0.2504, "step": 938 }, { "epoch": 0.19004250151791136, "grad_norm": 0.33700141310691833, "learning_rate": 0.00018284518495047546, "loss": 0.2619, "step": 939 }, { "epoch": 0.1902448896984416, "grad_norm": 0.28188449144363403, "learning_rate": 0.00018280952227953592, "loss": 0.2533, "step": 940 }, { "epoch": 0.19044727787897187, "grad_norm": 0.3926515579223633, "learning_rate": 0.00018277382606344872, "loss": 0.313, "step": 941 }, { "epoch": 0.19064966605950212, "grad_norm": 0.3274219334125519, "learning_rate": 0.00018273809631667398, "loss": 0.2121, "step": 942 }, { "epoch": 0.19085205424003238, "grad_norm": 0.38629379868507385, "learning_rate": 0.0001827023330536854, "loss": 0.2597, "step": 943 }, { "epoch": 0.19105444242056263, "grad_norm": 0.4779243767261505, "learning_rate": 0.0001826665362889702, "loss": 0.2626, "step": 944 }, { "epoch": 0.1912568306010929, "grad_norm": 0.2860121726989746, "learning_rate": 0.00018263070603702927, "loss": 0.2328, "step": 945 }, { "epoch": 0.19145921878162314, "grad_norm": 0.3092181384563446, "learning_rate": 0.00018259484231237703, "loss": 0.2593, "step": 946 }, { "epoch": 0.1916616069621534, "grad_norm": 0.30320075154304504, "learning_rate": 0.00018255894512954137, "loss": 0.2597, "step": 947 }, { "epoch": 0.19186399514268368, "grad_norm": 0.3562355637550354, "learning_rate": 0.00018252301450306387, "loss": 0.2481, "step": 948 }, { "epoch": 0.19206638332321394, "grad_norm": 0.6523458361625671, "learning_rate": 0.00018248705044749961, "loss": 0.2578, "step": 949 }, { "epoch": 0.1922687715037442, "grad_norm": 0.29616525769233704, "learning_rate": 0.00018245105297741714, "loss": 0.2422, "step": 950 }, { "epoch": 0.1922687715037442, "eval_loss": 0.290086954832077, "eval_runtime": 1.3128, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.762, "step": 950 }, { "epoch": 0.19247115968427445, "grad_norm": 0.3146413564682007, "learning_rate": 0.00018241502210739868, "loss": 0.2218, "step": 951 }, { "epoch": 0.1926735478648047, "grad_norm": 0.33451956510543823, "learning_rate": 0.0001823789578520398, "loss": 0.2603, "step": 952 }, { "epoch": 0.19287593604533496, "grad_norm": 0.294619619846344, "learning_rate": 0.00018234286022594982, "loss": 0.2666, "step": 953 }, { "epoch": 0.1930783242258652, "grad_norm": 0.4188424050807953, "learning_rate": 0.00018230672924375138, "loss": 0.26, "step": 954 }, { "epoch": 0.19328071240639547, "grad_norm": 0.3397003412246704, "learning_rate": 0.00018227056492008071, "loss": 0.2663, "step": 955 }, { "epoch": 0.19348310058692572, "grad_norm": 0.30348098278045654, "learning_rate": 0.0001822343672695876, "loss": 0.2344, "step": 956 }, { "epoch": 0.19368548876745598, "grad_norm": 0.35058358311653137, "learning_rate": 0.0001821981363069352, "loss": 0.257, "step": 957 }, { "epoch": 0.19388787694798623, "grad_norm": 0.31035804748535156, "learning_rate": 0.00018216187204680032, "loss": 0.25, "step": 958 }, { "epoch": 0.1940902651285165, "grad_norm": 0.3813909590244293, "learning_rate": 0.00018212557450387315, "loss": 0.2641, "step": 959 }, { "epoch": 0.19429265330904674, "grad_norm": 0.3305506110191345, "learning_rate": 0.00018208924369285737, "loss": 0.2589, "step": 960 }, { "epoch": 0.194495041489577, "grad_norm": 0.328037291765213, "learning_rate": 0.0001820528796284702, "loss": 0.2698, "step": 961 }, { "epoch": 0.19469742967010725, "grad_norm": 0.3886243999004364, "learning_rate": 0.00018201648232544227, "loss": 0.2497, "step": 962 }, { "epoch": 0.19489981785063754, "grad_norm": 0.34645378589630127, "learning_rate": 0.00018198005179851764, "loss": 0.2615, "step": 963 }, { "epoch": 0.1951022060311678, "grad_norm": 0.28150102496147156, "learning_rate": 0.00018194358806245396, "loss": 0.2604, "step": 964 }, { "epoch": 0.19530459421169805, "grad_norm": 0.40966495871543884, "learning_rate": 0.0001819070911320222, "loss": 0.249, "step": 965 }, { "epoch": 0.1955069823922283, "grad_norm": 0.3797634541988373, "learning_rate": 0.00018187056102200684, "loss": 0.2585, "step": 966 }, { "epoch": 0.19570937057275856, "grad_norm": 0.4229888319969177, "learning_rate": 0.00018183399774720585, "loss": 0.2804, "step": 967 }, { "epoch": 0.1959117587532888, "grad_norm": 0.30038386583328247, "learning_rate": 0.00018179740132243048, "loss": 0.2315, "step": 968 }, { "epoch": 0.19611414693381907, "grad_norm": 0.253451943397522, "learning_rate": 0.00018176077176250557, "loss": 0.2399, "step": 969 }, { "epoch": 0.19631653511434932, "grad_norm": 0.3240930140018463, "learning_rate": 0.00018172410908226926, "loss": 0.3197, "step": 970 }, { "epoch": 0.19651892329487958, "grad_norm": 0.29480859637260437, "learning_rate": 0.00018168741329657327, "loss": 0.2108, "step": 971 }, { "epoch": 0.19672131147540983, "grad_norm": 0.3405677378177643, "learning_rate": 0.0001816506844202825, "loss": 0.2713, "step": 972 }, { "epoch": 0.1969236996559401, "grad_norm": 0.34222930669784546, "learning_rate": 0.00018161392246827544, "loss": 0.2651, "step": 973 }, { "epoch": 0.19712608783647034, "grad_norm": 0.285862535238266, "learning_rate": 0.00018157712745544394, "loss": 0.2215, "step": 974 }, { "epoch": 0.1973284760170006, "grad_norm": 0.3621199429035187, "learning_rate": 0.00018154029939669317, "loss": 0.25, "step": 975 }, { "epoch": 0.19753086419753085, "grad_norm": 0.285997599363327, "learning_rate": 0.00018150343830694178, "loss": 0.2507, "step": 976 }, { "epoch": 0.1977332523780611, "grad_norm": 0.279920756816864, "learning_rate": 0.00018146654420112172, "loss": 0.2642, "step": 977 }, { "epoch": 0.1979356405585914, "grad_norm": 0.6209318041801453, "learning_rate": 0.00018142961709417837, "loss": 0.2111, "step": 978 }, { "epoch": 0.19813802873912165, "grad_norm": 0.3013761341571808, "learning_rate": 0.00018139265700107044, "loss": 0.2557, "step": 979 }, { "epoch": 0.1983404169196519, "grad_norm": 0.35234320163726807, "learning_rate": 0.00018135566393677002, "loss": 0.2692, "step": 980 }, { "epoch": 0.19854280510018216, "grad_norm": 0.39161357283592224, "learning_rate": 0.00018131863791626262, "loss": 0.2345, "step": 981 }, { "epoch": 0.1987451932807124, "grad_norm": 0.6262965798377991, "learning_rate": 0.00018128157895454697, "loss": 0.2138, "step": 982 }, { "epoch": 0.19894758146124267, "grad_norm": 0.5214985609054565, "learning_rate": 0.00018124448706663526, "loss": 0.2619, "step": 983 }, { "epoch": 0.19914996964177292, "grad_norm": 0.3108718693256378, "learning_rate": 0.00018120736226755294, "loss": 0.2246, "step": 984 }, { "epoch": 0.19935235782230318, "grad_norm": 0.31983840465545654, "learning_rate": 0.0001811702045723388, "loss": 0.2502, "step": 985 }, { "epoch": 0.19955474600283343, "grad_norm": 0.24009887874126434, "learning_rate": 0.00018113301399604503, "loss": 0.219, "step": 986 }, { "epoch": 0.1997571341833637, "grad_norm": 0.288682758808136, "learning_rate": 0.00018109579055373707, "loss": 0.2221, "step": 987 }, { "epoch": 0.19995952236389394, "grad_norm": 0.4582456052303314, "learning_rate": 0.00018105853426049367, "loss": 0.2786, "step": 988 }, { "epoch": 0.2001619105444242, "grad_norm": 0.3323560655117035, "learning_rate": 0.0001810212451314069, "loss": 0.262, "step": 989 }, { "epoch": 0.20036429872495445, "grad_norm": 0.32303088903427124, "learning_rate": 0.00018098392318158224, "loss": 0.2113, "step": 990 }, { "epoch": 0.2005666869054847, "grad_norm": 0.3702344596385956, "learning_rate": 0.00018094656842613827, "loss": 0.2518, "step": 991 }, { "epoch": 0.20076907508601496, "grad_norm": 0.3023280203342438, "learning_rate": 0.00018090918088020693, "loss": 0.2359, "step": 992 }, { "epoch": 0.20097146326654525, "grad_norm": 0.28567495942115784, "learning_rate": 0.00018087176055893355, "loss": 0.223, "step": 993 }, { "epoch": 0.2011738514470755, "grad_norm": 0.32796812057495117, "learning_rate": 0.00018083430747747664, "loss": 0.2579, "step": 994 }, { "epoch": 0.20137623962760576, "grad_norm": 0.4312690496444702, "learning_rate": 0.00018079682165100794, "loss": 0.2478, "step": 995 }, { "epoch": 0.201578627808136, "grad_norm": 0.2939644455909729, "learning_rate": 0.00018075930309471256, "loss": 0.2663, "step": 996 }, { "epoch": 0.20178101598866627, "grad_norm": 0.39846959710121155, "learning_rate": 0.00018072175182378878, "loss": 0.2577, "step": 997 }, { "epoch": 0.20198340416919652, "grad_norm": 0.34113451838493347, "learning_rate": 0.00018068416785344823, "loss": 0.2599, "step": 998 }, { "epoch": 0.20218579234972678, "grad_norm": 0.26005908846855164, "learning_rate": 0.00018064655119891566, "loss": 0.2189, "step": 999 }, { "epoch": 0.20238818053025703, "grad_norm": 0.39042559266090393, "learning_rate": 0.0001806089018754292, "loss": 0.2409, "step": 1000 }, { "epoch": 0.20238818053025703, "eval_loss": 0.2912660241127014, "eval_runtime": 1.3202, "eval_samples_per_second": 3.787, "eval_steps_per_second": 0.757, "step": 1000 }, { "epoch": 0.2025905687107873, "grad_norm": 0.5236013531684875, "learning_rate": 0.00018057121989824006, "loss": 0.2353, "step": 1001 }, { "epoch": 0.20279295689131754, "grad_norm": 0.2592727243900299, "learning_rate": 0.00018053350528261284, "loss": 0.2327, "step": 1002 }, { "epoch": 0.2029953450718478, "grad_norm": 0.4260639548301697, "learning_rate": 0.00018049575804382522, "loss": 0.2337, "step": 1003 }, { "epoch": 0.20319773325237805, "grad_norm": 0.32424455881118774, "learning_rate": 0.00018045797819716817, "loss": 0.268, "step": 1004 }, { "epoch": 0.2034001214329083, "grad_norm": 0.33731651306152344, "learning_rate": 0.00018042016575794585, "loss": 0.298, "step": 1005 }, { "epoch": 0.20360250961343856, "grad_norm": 0.3271861672401428, "learning_rate": 0.00018038232074147565, "loss": 0.2464, "step": 1006 }, { "epoch": 0.20380489779396882, "grad_norm": 0.4066775143146515, "learning_rate": 0.00018034444316308813, "loss": 0.2825, "step": 1007 }, { "epoch": 0.2040072859744991, "grad_norm": 0.2421562224626541, "learning_rate": 0.00018030653303812702, "loss": 0.2376, "step": 1008 }, { "epoch": 0.20420967415502936, "grad_norm": 0.40023431181907654, "learning_rate": 0.00018026859038194925, "loss": 0.2157, "step": 1009 }, { "epoch": 0.2044120623355596, "grad_norm": 0.3185890018939972, "learning_rate": 0.000180230615209925, "loss": 0.2239, "step": 1010 }, { "epoch": 0.20461445051608987, "grad_norm": 0.28344491124153137, "learning_rate": 0.00018019260753743748, "loss": 0.2594, "step": 1011 }, { "epoch": 0.20481683869662012, "grad_norm": 0.3082351088523865, "learning_rate": 0.0001801545673798832, "loss": 0.253, "step": 1012 }, { "epoch": 0.20501922687715038, "grad_norm": 0.4255550503730774, "learning_rate": 0.00018011649475267177, "loss": 0.287, "step": 1013 }, { "epoch": 0.20522161505768063, "grad_norm": 0.5091785788536072, "learning_rate": 0.00018007838967122592, "loss": 0.2593, "step": 1014 }, { "epoch": 0.2054240032382109, "grad_norm": 0.3073277771472931, "learning_rate": 0.0001800402521509816, "loss": 0.2323, "step": 1015 }, { "epoch": 0.20562639141874114, "grad_norm": 0.261557936668396, "learning_rate": 0.00018000208220738783, "loss": 0.2371, "step": 1016 }, { "epoch": 0.2058287795992714, "grad_norm": 0.4510318636894226, "learning_rate": 0.00017996387985590683, "loss": 0.2937, "step": 1017 }, { "epoch": 0.20603116777980165, "grad_norm": 0.41151154041290283, "learning_rate": 0.00017992564511201388, "loss": 0.2626, "step": 1018 }, { "epoch": 0.2062335559603319, "grad_norm": 0.4155992269515991, "learning_rate": 0.00017988737799119745, "loss": 0.2857, "step": 1019 }, { "epoch": 0.20643594414086217, "grad_norm": 0.3613973557949066, "learning_rate": 0.0001798490785089591, "loss": 0.2214, "step": 1020 }, { "epoch": 0.20663833232139242, "grad_norm": 0.3269294500350952, "learning_rate": 0.00017981074668081343, "loss": 0.2456, "step": 1021 }, { "epoch": 0.20684072050192268, "grad_norm": 0.309458464384079, "learning_rate": 0.00017977238252228829, "loss": 0.2329, "step": 1022 }, { "epoch": 0.20704310868245293, "grad_norm": 0.489417165517807, "learning_rate": 0.0001797339860489245, "loss": 0.2977, "step": 1023 }, { "epoch": 0.20724549686298321, "grad_norm": 0.30292201042175293, "learning_rate": 0.000179695557276276, "loss": 0.2402, "step": 1024 }, { "epoch": 0.20744788504351347, "grad_norm": 0.3333778381347656, "learning_rate": 0.00017965709621990984, "loss": 0.2165, "step": 1025 }, { "epoch": 0.20765027322404372, "grad_norm": 0.35148313641548157, "learning_rate": 0.0001796186028954061, "loss": 0.2736, "step": 1026 }, { "epoch": 0.20785266140457398, "grad_norm": 0.3211592435836792, "learning_rate": 0.00017958007731835805, "loss": 0.2485, "step": 1027 }, { "epoch": 0.20805504958510423, "grad_norm": 0.28974542021751404, "learning_rate": 0.0001795415195043719, "loss": 0.212, "step": 1028 }, { "epoch": 0.2082574377656345, "grad_norm": 0.273147314786911, "learning_rate": 0.00017950292946906693, "loss": 0.2349, "step": 1029 }, { "epoch": 0.20845982594616475, "grad_norm": 0.2560424208641052, "learning_rate": 0.00017946430722807553, "loss": 0.2088, "step": 1030 }, { "epoch": 0.208662214126695, "grad_norm": 0.2953382730484009, "learning_rate": 0.00017942565279704308, "loss": 0.271, "step": 1031 }, { "epoch": 0.20886460230722526, "grad_norm": 0.4164333939552307, "learning_rate": 0.00017938696619162808, "loss": 0.2603, "step": 1032 }, { "epoch": 0.2090669904877555, "grad_norm": 0.30831435322761536, "learning_rate": 0.00017934824742750193, "loss": 0.243, "step": 1033 }, { "epoch": 0.20926937866828577, "grad_norm": 0.42307373881340027, "learning_rate": 0.00017930949652034922, "loss": 0.2473, "step": 1034 }, { "epoch": 0.20947176684881602, "grad_norm": 0.32177865505218506, "learning_rate": 0.00017927071348586746, "loss": 0.2441, "step": 1035 }, { "epoch": 0.20967415502934628, "grad_norm": 0.2849270701408386, "learning_rate": 0.00017923189833976718, "loss": 0.25, "step": 1036 }, { "epoch": 0.20987654320987653, "grad_norm": 0.27396008372306824, "learning_rate": 0.00017919305109777195, "loss": 0.2457, "step": 1037 }, { "epoch": 0.2100789313904068, "grad_norm": 0.2789502441883087, "learning_rate": 0.00017915417177561825, "loss": 0.2573, "step": 1038 }, { "epoch": 0.21028131957093707, "grad_norm": 0.31958284974098206, "learning_rate": 0.00017911526038905576, "loss": 0.2563, "step": 1039 }, { "epoch": 0.21048370775146732, "grad_norm": 0.39583903551101685, "learning_rate": 0.00017907631695384687, "loss": 0.2079, "step": 1040 }, { "epoch": 0.21068609593199758, "grad_norm": 0.3569786250591278, "learning_rate": 0.0001790373414857672, "loss": 0.2852, "step": 1041 }, { "epoch": 0.21088848411252784, "grad_norm": 0.37313446402549744, "learning_rate": 0.00017899833400060526, "loss": 0.2237, "step": 1042 }, { "epoch": 0.2110908722930581, "grad_norm": 0.3469915986061096, "learning_rate": 0.00017895929451416243, "loss": 0.2726, "step": 1043 }, { "epoch": 0.21129326047358835, "grad_norm": 0.24449874460697174, "learning_rate": 0.0001789202230422532, "loss": 0.2081, "step": 1044 }, { "epoch": 0.2114956486541186, "grad_norm": 0.2736341953277588, "learning_rate": 0.00017888111960070492, "loss": 0.228, "step": 1045 }, { "epoch": 0.21169803683464886, "grad_norm": 0.3459782302379608, "learning_rate": 0.00017884198420535798, "loss": 0.2203, "step": 1046 }, { "epoch": 0.2119004250151791, "grad_norm": 0.45452389121055603, "learning_rate": 0.0001788028168720656, "loss": 0.2491, "step": 1047 }, { "epoch": 0.21210281319570937, "grad_norm": 0.3582545518875122, "learning_rate": 0.00017876361761669406, "loss": 0.229, "step": 1048 }, { "epoch": 0.21230520137623962, "grad_norm": 0.40121760964393616, "learning_rate": 0.00017872438645512247, "loss": 0.2442, "step": 1049 }, { "epoch": 0.21250758955676988, "grad_norm": 0.3111826479434967, "learning_rate": 0.0001786851234032429, "loss": 0.2488, "step": 1050 }, { "epoch": 0.21250758955676988, "eval_loss": 0.29152265191078186, "eval_runtime": 1.3119, "eval_samples_per_second": 3.811, "eval_steps_per_second": 0.762, "step": 1050 }, { "epoch": 0.21270997773730013, "grad_norm": 0.27258729934692383, "learning_rate": 0.00017864582847696036, "loss": 0.2583, "step": 1051 }, { "epoch": 0.2129123659178304, "grad_norm": 0.5324286818504333, "learning_rate": 0.00017860650169219272, "loss": 0.2668, "step": 1052 }, { "epoch": 0.21311475409836064, "grad_norm": 0.44438406825065613, "learning_rate": 0.00017856714306487085, "loss": 0.2498, "step": 1053 }, { "epoch": 0.21331714227889093, "grad_norm": 0.3728419542312622, "learning_rate": 0.00017852775261093843, "loss": 0.2392, "step": 1054 }, { "epoch": 0.21351953045942118, "grad_norm": 0.30213701725006104, "learning_rate": 0.00017848833034635208, "loss": 0.2365, "step": 1055 }, { "epoch": 0.21372191863995144, "grad_norm": 0.26849064230918884, "learning_rate": 0.00017844887628708125, "loss": 0.2407, "step": 1056 }, { "epoch": 0.2139243068204817, "grad_norm": 0.28937771916389465, "learning_rate": 0.00017840939044910833, "loss": 0.2559, "step": 1057 }, { "epoch": 0.21412669500101195, "grad_norm": 0.32687801122665405, "learning_rate": 0.00017836987284842857, "loss": 0.2269, "step": 1058 }, { "epoch": 0.2143290831815422, "grad_norm": 0.33478304743766785, "learning_rate": 0.00017833032350105004, "loss": 0.2353, "step": 1059 }, { "epoch": 0.21453147136207246, "grad_norm": 0.31630390882492065, "learning_rate": 0.00017829074242299372, "loss": 0.2668, "step": 1060 }, { "epoch": 0.2147338595426027, "grad_norm": 0.37771493196487427, "learning_rate": 0.00017825112963029352, "loss": 0.253, "step": 1061 }, { "epoch": 0.21493624772313297, "grad_norm": 0.2708573341369629, "learning_rate": 0.00017821148513899596, "loss": 0.2617, "step": 1062 }, { "epoch": 0.21513863590366322, "grad_norm": 0.29544270038604736, "learning_rate": 0.00017817180896516067, "loss": 0.217, "step": 1063 }, { "epoch": 0.21534102408419348, "grad_norm": 0.36647096276283264, "learning_rate": 0.0001781321011248599, "loss": 0.2747, "step": 1064 }, { "epoch": 0.21554341226472373, "grad_norm": 0.4009073078632355, "learning_rate": 0.00017809236163417893, "loss": 0.2353, "step": 1065 }, { "epoch": 0.215745800445254, "grad_norm": 0.31906449794769287, "learning_rate": 0.00017805259050921569, "loss": 0.2303, "step": 1066 }, { "epoch": 0.21594818862578424, "grad_norm": 0.32114338874816895, "learning_rate": 0.00017801278776608093, "loss": 0.2625, "step": 1067 }, { "epoch": 0.2161505768063145, "grad_norm": 0.3272583484649658, "learning_rate": 0.00017797295342089837, "loss": 0.2805, "step": 1068 }, { "epoch": 0.21635296498684478, "grad_norm": 0.2592332363128662, "learning_rate": 0.00017793308748980437, "loss": 0.1797, "step": 1069 }, { "epoch": 0.21655535316737504, "grad_norm": 0.39050546288490295, "learning_rate": 0.00017789318998894817, "loss": 0.2704, "step": 1070 }, { "epoch": 0.2167577413479053, "grad_norm": 0.3129369914531708, "learning_rate": 0.00017785326093449174, "loss": 0.2498, "step": 1071 }, { "epoch": 0.21696012952843555, "grad_norm": 0.3162764608860016, "learning_rate": 0.00017781330034260986, "loss": 0.2537, "step": 1072 }, { "epoch": 0.2171625177089658, "grad_norm": 0.2931753695011139, "learning_rate": 0.00017777330822949013, "loss": 0.2742, "step": 1073 }, { "epoch": 0.21736490588949606, "grad_norm": 0.3201962113380432, "learning_rate": 0.00017773328461133286, "loss": 0.2375, "step": 1074 }, { "epoch": 0.2175672940700263, "grad_norm": 0.3062599003314972, "learning_rate": 0.0001776932295043511, "loss": 0.26, "step": 1075 }, { "epoch": 0.21776968225055657, "grad_norm": 0.3314260244369507, "learning_rate": 0.00017765314292477076, "loss": 0.2614, "step": 1076 }, { "epoch": 0.21797207043108682, "grad_norm": 0.3045271039009094, "learning_rate": 0.0001776130248888304, "loss": 0.2154, "step": 1077 }, { "epoch": 0.21817445861161708, "grad_norm": 0.2882177233695984, "learning_rate": 0.00017757287541278135, "loss": 0.2524, "step": 1078 }, { "epoch": 0.21837684679214733, "grad_norm": 0.29515182971954346, "learning_rate": 0.00017753269451288768, "loss": 0.2322, "step": 1079 }, { "epoch": 0.2185792349726776, "grad_norm": 0.29339510202407837, "learning_rate": 0.0001774924822054262, "loss": 0.252, "step": 1080 }, { "epoch": 0.21878162315320784, "grad_norm": 0.3157486021518707, "learning_rate": 0.00017745223850668647, "loss": 0.2357, "step": 1081 }, { "epoch": 0.2189840113337381, "grad_norm": 0.28650447726249695, "learning_rate": 0.00017741196343297068, "loss": 0.2669, "step": 1082 }, { "epoch": 0.21918639951426835, "grad_norm": 0.30057305097579956, "learning_rate": 0.0001773716570005938, "loss": 0.2297, "step": 1083 }, { "epoch": 0.21938878769479864, "grad_norm": 0.381552517414093, "learning_rate": 0.0001773313192258835, "loss": 0.2856, "step": 1084 }, { "epoch": 0.2195911758753289, "grad_norm": 0.5149953365325928, "learning_rate": 0.0001772909501251801, "loss": 0.2708, "step": 1085 }, { "epoch": 0.21979356405585915, "grad_norm": 0.5274503827095032, "learning_rate": 0.00017725054971483666, "loss": 0.2394, "step": 1086 }, { "epoch": 0.2199959522363894, "grad_norm": 0.30883169174194336, "learning_rate": 0.0001772101180112189, "loss": 0.2634, "step": 1087 }, { "epoch": 0.22019834041691966, "grad_norm": 0.3409402668476105, "learning_rate": 0.0001771696550307052, "loss": 0.2389, "step": 1088 }, { "epoch": 0.2204007285974499, "grad_norm": 0.3443385064601898, "learning_rate": 0.00017712916078968668, "loss": 0.2375, "step": 1089 }, { "epoch": 0.22060311677798017, "grad_norm": 0.40337422490119934, "learning_rate": 0.00017708863530456703, "loss": 0.2711, "step": 1090 }, { "epoch": 0.22080550495851042, "grad_norm": 0.3459715247154236, "learning_rate": 0.00017704807859176264, "loss": 0.2424, "step": 1091 }, { "epoch": 0.22100789313904068, "grad_norm": 0.3014639616012573, "learning_rate": 0.00017700749066770256, "loss": 0.2294, "step": 1092 }, { "epoch": 0.22121028131957093, "grad_norm": 0.2968370020389557, "learning_rate": 0.0001769668715488285, "loss": 0.2429, "step": 1093 }, { "epoch": 0.2214126695001012, "grad_norm": 0.23571616411209106, "learning_rate": 0.00017692622125159473, "loss": 0.2104, "step": 1094 }, { "epoch": 0.22161505768063144, "grad_norm": 0.2732604444026947, "learning_rate": 0.00017688553979246824, "loss": 0.2354, "step": 1095 }, { "epoch": 0.2218174458611617, "grad_norm": 0.2672936022281647, "learning_rate": 0.00017684482718792856, "loss": 0.2296, "step": 1096 }, { "epoch": 0.22201983404169195, "grad_norm": 0.2975200414657593, "learning_rate": 0.00017680408345446792, "loss": 0.2282, "step": 1097 }, { "epoch": 0.2222222222222222, "grad_norm": 0.3423837125301361, "learning_rate": 0.0001767633086085911, "loss": 0.2534, "step": 1098 }, { "epoch": 0.2224246104027525, "grad_norm": 0.3439367413520813, "learning_rate": 0.0001767225026668155, "loss": 0.2393, "step": 1099 }, { "epoch": 0.22262699858328275, "grad_norm": 0.28630560636520386, "learning_rate": 0.00017668166564567114, "loss": 0.2493, "step": 1100 }, { "epoch": 0.22262699858328275, "eval_loss": 0.2887337803840637, "eval_runtime": 1.3178, "eval_samples_per_second": 3.794, "eval_steps_per_second": 0.759, "step": 1100 }, { "epoch": 0.222829386763813, "grad_norm": 0.30221977829933167, "learning_rate": 0.0001766407975617006, "loss": 0.2246, "step": 1101 }, { "epoch": 0.22303177494434326, "grad_norm": 0.3510182201862335, "learning_rate": 0.00017659989843145905, "loss": 0.23, "step": 1102 }, { "epoch": 0.2232341631248735, "grad_norm": 0.2726145088672638, "learning_rate": 0.0001765589682715142, "loss": 0.2277, "step": 1103 }, { "epoch": 0.22343655130540377, "grad_norm": 0.5233666300773621, "learning_rate": 0.00017651800709844647, "loss": 0.2276, "step": 1104 }, { "epoch": 0.22363893948593402, "grad_norm": 0.42945125699043274, "learning_rate": 0.0001764770149288486, "loss": 0.2681, "step": 1105 }, { "epoch": 0.22384132766646428, "grad_norm": 0.3593583405017853, "learning_rate": 0.00017643599177932616, "loss": 0.2641, "step": 1106 }, { "epoch": 0.22404371584699453, "grad_norm": 0.26907968521118164, "learning_rate": 0.00017639493766649707, "loss": 0.2359, "step": 1107 }, { "epoch": 0.2242461040275248, "grad_norm": 0.42594459652900696, "learning_rate": 0.00017635385260699184, "loss": 0.2257, "step": 1108 }, { "epoch": 0.22444849220805504, "grad_norm": 0.8036966323852539, "learning_rate": 0.0001763127366174536, "loss": 0.2409, "step": 1109 }, { "epoch": 0.2246508803885853, "grad_norm": 0.3021351397037506, "learning_rate": 0.00017627158971453792, "loss": 0.2472, "step": 1110 }, { "epoch": 0.22485326856911556, "grad_norm": 0.30910512804985046, "learning_rate": 0.00017623041191491292, "loss": 0.1991, "step": 1111 }, { "epoch": 0.2250556567496458, "grad_norm": 0.3571963310241699, "learning_rate": 0.00017618920323525923, "loss": 0.2822, "step": 1112 }, { "epoch": 0.22525804493017607, "grad_norm": 0.28929024934768677, "learning_rate": 0.00017614796369226995, "loss": 0.2298, "step": 1113 }, { "epoch": 0.22546043311070635, "grad_norm": 0.28346583247184753, "learning_rate": 0.0001761066933026508, "loss": 0.2427, "step": 1114 }, { "epoch": 0.2256628212912366, "grad_norm": 0.2847822606563568, "learning_rate": 0.00017606539208311987, "loss": 0.2629, "step": 1115 }, { "epoch": 0.22586520947176686, "grad_norm": 0.32049399614334106, "learning_rate": 0.00017602406005040784, "loss": 0.2501, "step": 1116 }, { "epoch": 0.22606759765229711, "grad_norm": 0.5602374076843262, "learning_rate": 0.00017598269722125776, "loss": 0.2357, "step": 1117 }, { "epoch": 0.22626998583282737, "grad_norm": 0.3128582835197449, "learning_rate": 0.00017594130361242526, "loss": 0.2465, "step": 1118 }, { "epoch": 0.22647237401335762, "grad_norm": 0.3063986003398895, "learning_rate": 0.0001758998792406784, "loss": 0.238, "step": 1119 }, { "epoch": 0.22667476219388788, "grad_norm": 0.3415353298187256, "learning_rate": 0.00017585842412279768, "loss": 0.2226, "step": 1120 }, { "epoch": 0.22687715037441814, "grad_norm": 0.3886197805404663, "learning_rate": 0.00017581693827557605, "loss": 0.2709, "step": 1121 }, { "epoch": 0.2270795385549484, "grad_norm": 0.42613688111305237, "learning_rate": 0.00017577542171581895, "loss": 0.2506, "step": 1122 }, { "epoch": 0.22728192673547865, "grad_norm": 0.2579197883605957, "learning_rate": 0.00017573387446034427, "loss": 0.234, "step": 1123 }, { "epoch": 0.2274843149160089, "grad_norm": 0.4238468408584595, "learning_rate": 0.00017569229652598231, "loss": 0.2633, "step": 1124 }, { "epoch": 0.22768670309653916, "grad_norm": 0.4641907215118408, "learning_rate": 0.00017565068792957576, "loss": 0.2214, "step": 1125 }, { "epoch": 0.2278890912770694, "grad_norm": 0.45199456810951233, "learning_rate": 0.00017560904868797977, "loss": 0.2907, "step": 1126 }, { "epoch": 0.22809147945759967, "grad_norm": 0.31280794739723206, "learning_rate": 0.0001755673788180619, "loss": 0.2305, "step": 1127 }, { "epoch": 0.22829386763812992, "grad_norm": 0.6617363691329956, "learning_rate": 0.00017552567833670213, "loss": 0.2879, "step": 1128 }, { "epoch": 0.22849625581866018, "grad_norm": 0.33732813596725464, "learning_rate": 0.0001754839472607928, "loss": 0.2419, "step": 1129 }, { "epoch": 0.22869864399919046, "grad_norm": 0.2739343047142029, "learning_rate": 0.0001754421856072387, "loss": 0.2329, "step": 1130 }, { "epoch": 0.22890103217972071, "grad_norm": 0.25780314207077026, "learning_rate": 0.00017540039339295692, "loss": 0.2016, "step": 1131 }, { "epoch": 0.22910342036025097, "grad_norm": 0.4358406066894531, "learning_rate": 0.00017535857063487708, "loss": 0.243, "step": 1132 }, { "epoch": 0.22930580854078123, "grad_norm": 0.32524991035461426, "learning_rate": 0.00017531671734994102, "loss": 0.2474, "step": 1133 }, { "epoch": 0.22950819672131148, "grad_norm": 0.33677685260772705, "learning_rate": 0.000175274833555103, "loss": 0.2841, "step": 1134 }, { "epoch": 0.22971058490184174, "grad_norm": 0.3155839741230011, "learning_rate": 0.0001752329192673297, "loss": 0.2511, "step": 1135 }, { "epoch": 0.229912973082372, "grad_norm": 0.41307759284973145, "learning_rate": 0.0001751909745036, "loss": 0.2821, "step": 1136 }, { "epoch": 0.23011536126290225, "grad_norm": 0.34991124272346497, "learning_rate": 0.0001751489992809053, "loss": 0.2824, "step": 1137 }, { "epoch": 0.2303177494434325, "grad_norm": 0.28543153405189514, "learning_rate": 0.00017510699361624927, "loss": 0.2141, "step": 1138 }, { "epoch": 0.23052013762396276, "grad_norm": 0.331800252199173, "learning_rate": 0.00017506495752664785, "loss": 0.2635, "step": 1139 }, { "epoch": 0.230722525804493, "grad_norm": 0.40567854046821594, "learning_rate": 0.00017502289102912938, "loss": 0.2545, "step": 1140 }, { "epoch": 0.23092491398502327, "grad_norm": 0.3930947482585907, "learning_rate": 0.0001749807941407345, "loss": 0.2693, "step": 1141 }, { "epoch": 0.23112730216555352, "grad_norm": 0.2990710139274597, "learning_rate": 0.00017493866687851614, "loss": 0.2528, "step": 1142 }, { "epoch": 0.23132969034608378, "grad_norm": 0.42789512872695923, "learning_rate": 0.00017489650925953955, "loss": 0.2972, "step": 1143 }, { "epoch": 0.23153207852661403, "grad_norm": 0.3303764760494232, "learning_rate": 0.00017485432130088233, "loss": 0.2885, "step": 1144 }, { "epoch": 0.23173446670714432, "grad_norm": 0.35476845502853394, "learning_rate": 0.00017481210301963427, "loss": 0.2585, "step": 1145 }, { "epoch": 0.23193685488767457, "grad_norm": 0.3026008903980255, "learning_rate": 0.00017476985443289747, "loss": 0.227, "step": 1146 }, { "epoch": 0.23213924306820483, "grad_norm": 0.2617722153663635, "learning_rate": 0.00017472757555778636, "loss": 0.2556, "step": 1147 }, { "epoch": 0.23234163124873508, "grad_norm": 0.3646996319293976, "learning_rate": 0.0001746852664114276, "loss": 0.2611, "step": 1148 }, { "epoch": 0.23254401942926534, "grad_norm": 0.3172960579395294, "learning_rate": 0.00017464292701096015, "loss": 0.2336, "step": 1149 }, { "epoch": 0.2327464076097956, "grad_norm": 0.28832510113716125, "learning_rate": 0.00017460055737353515, "loss": 0.2231, "step": 1150 }, { "epoch": 0.2327464076097956, "eval_loss": 0.2939348816871643, "eval_runtime": 1.3229, "eval_samples_per_second": 3.78, "eval_steps_per_second": 0.756, "step": 1150 }, { "epoch": 0.23294879579032585, "grad_norm": 0.3201945722103119, "learning_rate": 0.00017455815751631608, "loss": 0.237, "step": 1151 }, { "epoch": 0.2331511839708561, "grad_norm": 0.3089236319065094, "learning_rate": 0.00017451572745647855, "loss": 0.2592, "step": 1152 }, { "epoch": 0.23335357215138636, "grad_norm": 0.26715022325515747, "learning_rate": 0.0001744732672112105, "loss": 0.2217, "step": 1153 }, { "epoch": 0.2335559603319166, "grad_norm": 0.25825124979019165, "learning_rate": 0.00017443077679771211, "loss": 0.2327, "step": 1154 }, { "epoch": 0.23375834851244687, "grad_norm": 0.3765757083892822, "learning_rate": 0.00017438825623319567, "loss": 0.2734, "step": 1155 }, { "epoch": 0.23396073669297712, "grad_norm": 0.31510287523269653, "learning_rate": 0.00017434570553488582, "loss": 0.2538, "step": 1156 }, { "epoch": 0.23416312487350738, "grad_norm": 0.2595655918121338, "learning_rate": 0.00017430312472001928, "loss": 0.2352, "step": 1157 }, { "epoch": 0.23436551305403763, "grad_norm": 0.3281400203704834, "learning_rate": 0.00017426051380584507, "loss": 0.2314, "step": 1158 }, { "epoch": 0.2345679012345679, "grad_norm": 0.4035564064979553, "learning_rate": 0.00017421787280962433, "loss": 0.251, "step": 1159 }, { "epoch": 0.23477028941509817, "grad_norm": 0.28015413880348206, "learning_rate": 0.00017417520174863043, "loss": 0.246, "step": 1160 }, { "epoch": 0.23497267759562843, "grad_norm": 0.27035999298095703, "learning_rate": 0.00017413250064014893, "loss": 0.2262, "step": 1161 }, { "epoch": 0.23517506577615868, "grad_norm": 0.3146262466907501, "learning_rate": 0.0001740897695014775, "loss": 0.2272, "step": 1162 }, { "epoch": 0.23537745395668894, "grad_norm": 0.29210802912712097, "learning_rate": 0.00017404700834992607, "loss": 0.208, "step": 1163 }, { "epoch": 0.2355798421372192, "grad_norm": 0.48875606060028076, "learning_rate": 0.0001740042172028166, "loss": 0.2314, "step": 1164 }, { "epoch": 0.23578223031774945, "grad_norm": 0.3455335199832916, "learning_rate": 0.0001739613960774833, "loss": 0.2719, "step": 1165 }, { "epoch": 0.2359846184982797, "grad_norm": 0.38722291588783264, "learning_rate": 0.00017391854499127253, "loss": 0.2968, "step": 1166 }, { "epoch": 0.23618700667880996, "grad_norm": 0.4133739471435547, "learning_rate": 0.00017387566396154269, "loss": 0.24, "step": 1167 }, { "epoch": 0.2363893948593402, "grad_norm": 0.32769665122032166, "learning_rate": 0.00017383275300566443, "loss": 0.2273, "step": 1168 }, { "epoch": 0.23659178303987047, "grad_norm": 0.2910957336425781, "learning_rate": 0.00017378981214102046, "loss": 0.2515, "step": 1169 }, { "epoch": 0.23679417122040072, "grad_norm": 0.417803555727005, "learning_rate": 0.00017374684138500557, "loss": 0.3014, "step": 1170 }, { "epoch": 0.23699655940093098, "grad_norm": 0.335295706987381, "learning_rate": 0.00017370384075502673, "loss": 0.2657, "step": 1171 }, { "epoch": 0.23719894758146123, "grad_norm": 0.29605528712272644, "learning_rate": 0.00017366081026850297, "loss": 0.2529, "step": 1172 }, { "epoch": 0.2374013357619915, "grad_norm": 0.2679653763771057, "learning_rate": 0.00017361774994286545, "loss": 0.2602, "step": 1173 }, { "epoch": 0.23760372394252174, "grad_norm": 0.28809428215026855, "learning_rate": 0.00017357465979555734, "loss": 0.2452, "step": 1174 }, { "epoch": 0.23780611212305203, "grad_norm": 0.2955229878425598, "learning_rate": 0.00017353153984403402, "loss": 0.2336, "step": 1175 }, { "epoch": 0.23800850030358228, "grad_norm": 0.39755091071128845, "learning_rate": 0.0001734883901057628, "loss": 0.2709, "step": 1176 }, { "epoch": 0.23821088848411254, "grad_norm": 0.28155839443206787, "learning_rate": 0.00017344521059822315, "loss": 0.2276, "step": 1177 }, { "epoch": 0.2384132766646428, "grad_norm": 0.2997364401817322, "learning_rate": 0.00017340200133890657, "loss": 0.2255, "step": 1178 }, { "epoch": 0.23861566484517305, "grad_norm": 0.30834028124809265, "learning_rate": 0.0001733587623453166, "loss": 0.2517, "step": 1179 }, { "epoch": 0.2388180530257033, "grad_norm": 0.3276856541633606, "learning_rate": 0.0001733154936349689, "loss": 0.2564, "step": 1180 }, { "epoch": 0.23902044120623356, "grad_norm": 0.3279306888580322, "learning_rate": 0.000173272195225391, "loss": 0.2806, "step": 1181 }, { "epoch": 0.2392228293867638, "grad_norm": 0.4740404784679413, "learning_rate": 0.0001732288671341227, "loss": 0.275, "step": 1182 }, { "epoch": 0.23942521756729407, "grad_norm": 0.42953193187713623, "learning_rate": 0.00017318550937871555, "loss": 0.2529, "step": 1183 }, { "epoch": 0.23962760574782432, "grad_norm": 0.24740366637706757, "learning_rate": 0.00017314212197673334, "loss": 0.2276, "step": 1184 }, { "epoch": 0.23982999392835458, "grad_norm": 0.2679139971733093, "learning_rate": 0.0001730987049457518, "loss": 0.2718, "step": 1185 }, { "epoch": 0.24003238210888483, "grad_norm": 0.8186133503913879, "learning_rate": 0.00017305525830335866, "loss": 0.2473, "step": 1186 }, { "epoch": 0.2402347702894151, "grad_norm": 0.34955456852912903, "learning_rate": 0.00017301178206715356, "loss": 0.2734, "step": 1187 }, { "epoch": 0.24043715846994534, "grad_norm": 0.3177298307418823, "learning_rate": 0.00017296827625474826, "loss": 0.2353, "step": 1188 }, { "epoch": 0.2406395466504756, "grad_norm": 0.3289826512336731, "learning_rate": 0.00017292474088376642, "loss": 0.2457, "step": 1189 }, { "epoch": 0.24084193483100588, "grad_norm": 0.4080020785331726, "learning_rate": 0.00017288117597184373, "loss": 0.2762, "step": 1190 }, { "epoch": 0.24104432301153614, "grad_norm": 0.3012888431549072, "learning_rate": 0.00017283758153662778, "loss": 0.2291, "step": 1191 }, { "epoch": 0.2412467111920664, "grad_norm": 0.3433528244495392, "learning_rate": 0.00017279395759577817, "loss": 0.2583, "step": 1192 }, { "epoch": 0.24144909937259665, "grad_norm": 0.3287104070186615, "learning_rate": 0.00017275030416696646, "loss": 0.2391, "step": 1193 }, { "epoch": 0.2416514875531269, "grad_norm": 0.3012438118457794, "learning_rate": 0.00017270662126787608, "loss": 0.2232, "step": 1194 }, { "epoch": 0.24185387573365716, "grad_norm": 0.2747318744659424, "learning_rate": 0.0001726629089162025, "loss": 0.2373, "step": 1195 }, { "epoch": 0.24205626391418741, "grad_norm": 0.32882562279701233, "learning_rate": 0.00017261916712965305, "loss": 0.2296, "step": 1196 }, { "epoch": 0.24225865209471767, "grad_norm": 0.40864628553390503, "learning_rate": 0.00017257539592594698, "loss": 0.2556, "step": 1197 }, { "epoch": 0.24246104027524792, "grad_norm": 0.2906353175640106, "learning_rate": 0.00017253159532281553, "loss": 0.2255, "step": 1198 }, { "epoch": 0.24266342845577818, "grad_norm": 0.2991185486316681, "learning_rate": 0.00017248776533800177, "loss": 0.2, "step": 1199 }, { "epoch": 0.24286581663630843, "grad_norm": 0.3152303695678711, "learning_rate": 0.00017244390598926073, "loss": 0.238, "step": 1200 }, { "epoch": 0.24286581663630843, "eval_loss": 0.2918797433376312, "eval_runtime": 1.3217, "eval_samples_per_second": 3.783, "eval_steps_per_second": 0.757, "step": 1200 }, { "epoch": 0.2430682048168387, "grad_norm": 0.2977762520313263, "learning_rate": 0.00017240001729435927, "loss": 0.2684, "step": 1201 }, { "epoch": 0.24327059299736895, "grad_norm": 0.38705986738204956, "learning_rate": 0.0001723560992710762, "loss": 0.2528, "step": 1202 }, { "epoch": 0.2434729811778992, "grad_norm": 0.2904474139213562, "learning_rate": 0.00017231215193720217, "loss": 0.2514, "step": 1203 }, { "epoch": 0.24367536935842946, "grad_norm": 0.4413079023361206, "learning_rate": 0.00017226817531053974, "loss": 0.2484, "step": 1204 }, { "epoch": 0.24387775753895974, "grad_norm": 0.31725379824638367, "learning_rate": 0.00017222416940890328, "loss": 0.277, "step": 1205 }, { "epoch": 0.24408014571949, "grad_norm": 0.2761150896549225, "learning_rate": 0.00017218013425011913, "loss": 0.2294, "step": 1206 }, { "epoch": 0.24428253390002025, "grad_norm": 0.26313307881355286, "learning_rate": 0.00017213606985202535, "loss": 0.2303, "step": 1207 }, { "epoch": 0.2444849220805505, "grad_norm": 0.32208874821662903, "learning_rate": 0.0001720919762324719, "loss": 0.2211, "step": 1208 }, { "epoch": 0.24468731026108076, "grad_norm": 0.2977825403213501, "learning_rate": 0.00017204785340932058, "loss": 0.2331, "step": 1209 }, { "epoch": 0.24488969844161101, "grad_norm": 0.2714790105819702, "learning_rate": 0.00017200370140044502, "loss": 0.276, "step": 1210 }, { "epoch": 0.24509208662214127, "grad_norm": 0.3142950236797333, "learning_rate": 0.00017195952022373064, "loss": 0.2679, "step": 1211 }, { "epoch": 0.24529447480267152, "grad_norm": 0.2710469961166382, "learning_rate": 0.0001719153098970748, "loss": 0.2236, "step": 1212 }, { "epoch": 0.24549686298320178, "grad_norm": 0.3323655426502228, "learning_rate": 0.0001718710704383865, "loss": 0.2287, "step": 1213 }, { "epoch": 0.24569925116373204, "grad_norm": 0.32181984186172485, "learning_rate": 0.00017182680186558663, "loss": 0.2312, "step": 1214 }, { "epoch": 0.2459016393442623, "grad_norm": 0.3159317672252655, "learning_rate": 0.00017178250419660788, "loss": 0.2473, "step": 1215 }, { "epoch": 0.24610402752479255, "grad_norm": 0.31007179617881775, "learning_rate": 0.00017173817744939467, "loss": 0.2026, "step": 1216 }, { "epoch": 0.2463064157053228, "grad_norm": 0.29636502265930176, "learning_rate": 0.00017169382164190325, "loss": 0.2542, "step": 1217 }, { "epoch": 0.24650880388585306, "grad_norm": 0.33488592505455017, "learning_rate": 0.00017164943679210163, "loss": 0.2396, "step": 1218 }, { "epoch": 0.2467111920663833, "grad_norm": 0.3274032771587372, "learning_rate": 0.00017160502291796963, "loss": 0.2291, "step": 1219 }, { "epoch": 0.24691358024691357, "grad_norm": 0.5109665989875793, "learning_rate": 0.00017156058003749868, "loss": 0.2515, "step": 1220 }, { "epoch": 0.24711596842744385, "grad_norm": 0.3878520727157593, "learning_rate": 0.00017151610816869216, "loss": 0.2422, "step": 1221 }, { "epoch": 0.2473183566079741, "grad_norm": 0.45315369963645935, "learning_rate": 0.000171471607329565, "loss": 0.2497, "step": 1222 }, { "epoch": 0.24752074478850436, "grad_norm": 0.3440694212913513, "learning_rate": 0.00017142707753814402, "loss": 0.2622, "step": 1223 }, { "epoch": 0.24772313296903462, "grad_norm": 0.328428715467453, "learning_rate": 0.00017138251881246768, "loss": 0.225, "step": 1224 }, { "epoch": 0.24792552114956487, "grad_norm": 0.2521361708641052, "learning_rate": 0.00017133793117058622, "loss": 0.2267, "step": 1225 }, { "epoch": 0.24812790933009513, "grad_norm": 0.3897157907485962, "learning_rate": 0.00017129331463056153, "loss": 0.2255, "step": 1226 }, { "epoch": 0.24833029751062538, "grad_norm": 0.3402661979198456, "learning_rate": 0.00017124866921046724, "loss": 0.2579, "step": 1227 }, { "epoch": 0.24853268569115564, "grad_norm": 0.38770735263824463, "learning_rate": 0.0001712039949283887, "loss": 0.3088, "step": 1228 }, { "epoch": 0.2487350738716859, "grad_norm": 0.2749958038330078, "learning_rate": 0.0001711592918024229, "loss": 0.2251, "step": 1229 }, { "epoch": 0.24893746205221615, "grad_norm": 0.29723653197288513, "learning_rate": 0.00017111455985067853, "loss": 0.2368, "step": 1230 }, { "epoch": 0.2491398502327464, "grad_norm": 0.4661113917827606, "learning_rate": 0.00017106979909127602, "loss": 0.2493, "step": 1231 }, { "epoch": 0.24934223841327666, "grad_norm": 0.2877221703529358, "learning_rate": 0.00017102500954234738, "loss": 0.2664, "step": 1232 }, { "epoch": 0.2495446265938069, "grad_norm": 0.3117177188396454, "learning_rate": 0.0001709801912220363, "loss": 0.2599, "step": 1233 }, { "epoch": 0.24974701477433717, "grad_norm": 0.4992692172527313, "learning_rate": 0.0001709353441484982, "loss": 0.2803, "step": 1234 }, { "epoch": 0.24994940295486742, "grad_norm": 0.39757898449897766, "learning_rate": 0.00017089046833990008, "loss": 0.2201, "step": 1235 }, { "epoch": 0.2501517911353977, "grad_norm": 0.40783241391181946, "learning_rate": 0.0001708455638144206, "loss": 0.2528, "step": 1236 }, { "epoch": 0.25035417931592796, "grad_norm": 0.28234609961509705, "learning_rate": 0.00017080063059024994, "loss": 0.1999, "step": 1237 }, { "epoch": 0.2505565674964582, "grad_norm": 0.26824888586997986, "learning_rate": 0.0001707556686855902, "loss": 0.2731, "step": 1238 }, { "epoch": 0.25075895567698847, "grad_norm": 0.3187876343727112, "learning_rate": 0.00017071067811865476, "loss": 0.27, "step": 1239 }, { "epoch": 0.2509613438575187, "grad_norm": 0.2885381877422333, "learning_rate": 0.00017066565890766884, "loss": 0.2263, "step": 1240 }, { "epoch": 0.251163732038049, "grad_norm": 0.32039493322372437, "learning_rate": 0.00017062061107086913, "loss": 0.2245, "step": 1241 }, { "epoch": 0.25136612021857924, "grad_norm": 0.6021535992622375, "learning_rate": 0.00017057553462650402, "loss": 0.2354, "step": 1242 }, { "epoch": 0.2515685083991095, "grad_norm": 0.31638115644454956, "learning_rate": 0.00017053042959283337, "loss": 0.2384, "step": 1243 }, { "epoch": 0.25177089657963975, "grad_norm": 0.3874339759349823, "learning_rate": 0.0001704852959881288, "loss": 0.2973, "step": 1244 }, { "epoch": 0.25197328476017, "grad_norm": 0.27009493112564087, "learning_rate": 0.00017044013383067327, "loss": 0.2894, "step": 1245 }, { "epoch": 0.25217567294070026, "grad_norm": 0.33196914196014404, "learning_rate": 0.00017039494313876151, "loss": 0.2289, "step": 1246 }, { "epoch": 0.2523780611212305, "grad_norm": 0.27076539397239685, "learning_rate": 0.00017034972393069973, "loss": 0.2494, "step": 1247 }, { "epoch": 0.25258044930176077, "grad_norm": 0.646059513092041, "learning_rate": 0.00017030447622480566, "loss": 0.2549, "step": 1248 }, { "epoch": 0.252782837482291, "grad_norm": 0.35935306549072266, "learning_rate": 0.0001702592000394086, "loss": 0.2615, "step": 1249 }, { "epoch": 0.2529852256628213, "grad_norm": 0.4167482852935791, "learning_rate": 0.00017021389539284942, "loss": 0.2418, "step": 1250 }, { "epoch": 0.2529852256628213, "eval_loss": 0.2969488799571991, "eval_runtime": 1.3187, "eval_samples_per_second": 3.792, "eval_steps_per_second": 0.758, "step": 1250 }, { "epoch": 0.25318761384335153, "grad_norm": 0.3723834156990051, "learning_rate": 0.00017016856230348047, "loss": 0.2547, "step": 1251 }, { "epoch": 0.2533900020238818, "grad_norm": 0.38796353340148926, "learning_rate": 0.00017012320078966563, "loss": 0.2598, "step": 1252 }, { "epoch": 0.25359239020441204, "grad_norm": 0.34247612953186035, "learning_rate": 0.00017007781086978037, "loss": 0.2872, "step": 1253 }, { "epoch": 0.2537947783849423, "grad_norm": 0.3610890507698059, "learning_rate": 0.0001700323925622115, "loss": 0.2365, "step": 1254 }, { "epoch": 0.25399716656547255, "grad_norm": 0.34484267234802246, "learning_rate": 0.00016998694588535754, "loss": 0.2698, "step": 1255 }, { "epoch": 0.2541995547460028, "grad_norm": 0.29228222370147705, "learning_rate": 0.00016994147085762834, "loss": 0.2647, "step": 1256 }, { "epoch": 0.25440194292653306, "grad_norm": 0.33437371253967285, "learning_rate": 0.0001698959674974453, "loss": 0.241, "step": 1257 }, { "epoch": 0.2546043311070634, "grad_norm": 0.3573373854160309, "learning_rate": 0.00016985043582324127, "loss": 0.2885, "step": 1258 }, { "epoch": 0.25480671928759363, "grad_norm": 0.3468393087387085, "learning_rate": 0.00016980487585346063, "loss": 0.257, "step": 1259 }, { "epoch": 0.2550091074681239, "grad_norm": 0.3223399519920349, "learning_rate": 0.00016975928760655913, "loss": 0.2743, "step": 1260 }, { "epoch": 0.25521149564865414, "grad_norm": 0.2661367356777191, "learning_rate": 0.00016971367110100405, "loss": 0.2864, "step": 1261 }, { "epoch": 0.2554138838291844, "grad_norm": 0.2897075414657593, "learning_rate": 0.00016966802635527407, "loss": 0.2443, "step": 1262 }, { "epoch": 0.25561627200971465, "grad_norm": 0.3161570429801941, "learning_rate": 0.0001696223533878594, "loss": 0.2621, "step": 1263 }, { "epoch": 0.2558186601902449, "grad_norm": 0.2701908349990845, "learning_rate": 0.00016957665221726154, "loss": 0.1951, "step": 1264 }, { "epoch": 0.25602104837077516, "grad_norm": 0.2720428705215454, "learning_rate": 0.00016953092286199351, "loss": 0.2261, "step": 1265 }, { "epoch": 0.2562234365513054, "grad_norm": 0.29421889781951904, "learning_rate": 0.0001694851653405797, "loss": 0.2698, "step": 1266 }, { "epoch": 0.25642582473183567, "grad_norm": 0.34430429339408875, "learning_rate": 0.00016943937967155596, "loss": 0.2548, "step": 1267 }, { "epoch": 0.2566282129123659, "grad_norm": 0.28240466117858887, "learning_rate": 0.00016939356587346955, "loss": 0.2362, "step": 1268 }, { "epoch": 0.2568306010928962, "grad_norm": 0.29789406061172485, "learning_rate": 0.00016934772396487904, "loss": 0.2792, "step": 1269 }, { "epoch": 0.25703298927342644, "grad_norm": 0.2889864444732666, "learning_rate": 0.00016930185396435448, "loss": 0.2513, "step": 1270 }, { "epoch": 0.2572353774539567, "grad_norm": 0.35167181491851807, "learning_rate": 0.0001692559558904772, "loss": 0.2556, "step": 1271 }, { "epoch": 0.25743776563448695, "grad_norm": 0.3764891028404236, "learning_rate": 0.00016921002976184, "loss": 0.2581, "step": 1272 }, { "epoch": 0.2576401538150172, "grad_norm": 0.2878571152687073, "learning_rate": 0.00016916407559704704, "loss": 0.2765, "step": 1273 }, { "epoch": 0.25784254199554746, "grad_norm": 0.3229897916316986, "learning_rate": 0.00016911809341471377, "loss": 0.2598, "step": 1274 }, { "epoch": 0.2580449301760777, "grad_norm": 0.28680092096328735, "learning_rate": 0.000169072083233467, "loss": 0.2635, "step": 1275 }, { "epoch": 0.25824731835660797, "grad_norm": 0.3871955871582031, "learning_rate": 0.0001690260450719449, "loss": 0.2678, "step": 1276 }, { "epoch": 0.2584497065371382, "grad_norm": 0.2831651270389557, "learning_rate": 0.00016897997894879704, "loss": 0.2316, "step": 1277 }, { "epoch": 0.2586520947176685, "grad_norm": 0.42714688181877136, "learning_rate": 0.0001689338848826842, "loss": 0.2715, "step": 1278 }, { "epoch": 0.25885448289819873, "grad_norm": 0.2766859531402588, "learning_rate": 0.00016888776289227856, "loss": 0.2382, "step": 1279 }, { "epoch": 0.259056871078729, "grad_norm": 0.38155597448349, "learning_rate": 0.00016884161299626356, "loss": 0.2645, "step": 1280 }, { "epoch": 0.25925925925925924, "grad_norm": 0.32789286971092224, "learning_rate": 0.00016879543521333403, "loss": 0.2465, "step": 1281 }, { "epoch": 0.2594616474397895, "grad_norm": 0.32755446434020996, "learning_rate": 0.000168749229562196, "loss": 0.2651, "step": 1282 }, { "epoch": 0.25966403562031976, "grad_norm": 0.3243027329444885, "learning_rate": 0.0001687029960615668, "loss": 0.2361, "step": 1283 }, { "epoch": 0.25986642380085, "grad_norm": 0.28720352053642273, "learning_rate": 0.0001686567347301751, "loss": 0.2393, "step": 1284 }, { "epoch": 0.26006881198138027, "grad_norm": 0.30660369992256165, "learning_rate": 0.0001686104455867608, "loss": 0.2756, "step": 1285 }, { "epoch": 0.2602712001619105, "grad_norm": 0.31535613536834717, "learning_rate": 0.0001685641286500751, "loss": 0.2731, "step": 1286 }, { "epoch": 0.2604735883424408, "grad_norm": 0.27085864543914795, "learning_rate": 0.00016851778393888044, "loss": 0.2616, "step": 1287 }, { "epoch": 0.26067597652297103, "grad_norm": 0.2863300144672394, "learning_rate": 0.00016847141147195049, "loss": 0.1959, "step": 1288 }, { "epoch": 0.26087836470350134, "grad_norm": 0.2789638936519623, "learning_rate": 0.00016842501126807018, "loss": 0.2378, "step": 1289 }, { "epoch": 0.2610807528840316, "grad_norm": 0.38301560282707214, "learning_rate": 0.00016837858334603568, "loss": 0.2145, "step": 1290 }, { "epoch": 0.26128314106456185, "grad_norm": 0.28795748949050903, "learning_rate": 0.00016833212772465435, "loss": 0.2111, "step": 1291 }, { "epoch": 0.2614855292450921, "grad_norm": 0.29287293553352356, "learning_rate": 0.0001682856444227449, "loss": 0.2656, "step": 1292 }, { "epoch": 0.26168791742562236, "grad_norm": 0.2658687233924866, "learning_rate": 0.0001682391334591371, "loss": 0.2559, "step": 1293 }, { "epoch": 0.2618903056061526, "grad_norm": 0.32528966665267944, "learning_rate": 0.00016819259485267195, "loss": 0.258, "step": 1294 }, { "epoch": 0.2620926937866829, "grad_norm": 0.24630677700042725, "learning_rate": 0.00016814602862220176, "loss": 0.201, "step": 1295 }, { "epoch": 0.26229508196721313, "grad_norm": 0.27742281556129456, "learning_rate": 0.00016809943478658993, "loss": 0.2176, "step": 1296 }, { "epoch": 0.2624974701477434, "grad_norm": 0.5506246089935303, "learning_rate": 0.000168052813364711, "loss": 0.2505, "step": 1297 }, { "epoch": 0.26269985832827364, "grad_norm": 0.28451213240623474, "learning_rate": 0.00016800616437545086, "loss": 0.2358, "step": 1298 }, { "epoch": 0.2629022465088039, "grad_norm": 0.37538620829582214, "learning_rate": 0.0001679594878377064, "loss": 0.2341, "step": 1299 }, { "epoch": 0.26310463468933415, "grad_norm": 0.2859930992126465, "learning_rate": 0.00016791278377038568, "loss": 0.203, "step": 1300 }, { "epoch": 0.26310463468933415, "eval_loss": 0.28687986731529236, "eval_runtime": 1.3162, "eval_samples_per_second": 3.799, "eval_steps_per_second": 0.76, "step": 1300 }, { "epoch": 0.2633070228698644, "grad_norm": 0.3527231812477112, "learning_rate": 0.00016786605219240808, "loss": 0.3076, "step": 1301 }, { "epoch": 0.26350941105039466, "grad_norm": 0.6578617691993713, "learning_rate": 0.00016781929312270388, "loss": 0.2387, "step": 1302 }, { "epoch": 0.2637117992309249, "grad_norm": 0.3532882034778595, "learning_rate": 0.0001677725065802147, "loss": 0.2583, "step": 1303 }, { "epoch": 0.26391418741145517, "grad_norm": 0.4073423147201538, "learning_rate": 0.00016772569258389315, "loss": 0.251, "step": 1304 }, { "epoch": 0.2641165755919854, "grad_norm": 0.315077543258667, "learning_rate": 0.00016767885115270308, "loss": 0.2323, "step": 1305 }, { "epoch": 0.2643189637725157, "grad_norm": 0.2608642578125, "learning_rate": 0.00016763198230561934, "loss": 0.2237, "step": 1306 }, { "epoch": 0.26452135195304594, "grad_norm": 0.3275552988052368, "learning_rate": 0.00016758508606162803, "loss": 0.2455, "step": 1307 }, { "epoch": 0.2647237401335762, "grad_norm": 0.33860984444618225, "learning_rate": 0.00016753816243972612, "loss": 0.2349, "step": 1308 }, { "epoch": 0.26492612831410645, "grad_norm": 0.25315356254577637, "learning_rate": 0.00016749121145892193, "loss": 0.2389, "step": 1309 }, { "epoch": 0.2651285164946367, "grad_norm": 0.42971888184547424, "learning_rate": 0.00016744423313823465, "loss": 0.2222, "step": 1310 }, { "epoch": 0.26533090467516696, "grad_norm": 0.4363480806350708, "learning_rate": 0.00016739722749669472, "loss": 0.2668, "step": 1311 }, { "epoch": 0.2655332928556972, "grad_norm": 0.3004481792449951, "learning_rate": 0.00016735019455334352, "loss": 0.2564, "step": 1312 }, { "epoch": 0.26573568103622747, "grad_norm": 0.8771609663963318, "learning_rate": 0.00016730313432723355, "loss": 0.2621, "step": 1313 }, { "epoch": 0.2659380692167577, "grad_norm": 0.2608064115047455, "learning_rate": 0.0001672560468374283, "loss": 0.2722, "step": 1314 }, { "epoch": 0.266140457397288, "grad_norm": 0.32929351925849915, "learning_rate": 0.00016720893210300236, "loss": 0.2411, "step": 1315 }, { "epoch": 0.26634284557781823, "grad_norm": 0.28313860297203064, "learning_rate": 0.00016716179014304142, "loss": 0.2037, "step": 1316 }, { "epoch": 0.2665452337583485, "grad_norm": 0.30115577578544617, "learning_rate": 0.00016711462097664207, "loss": 0.2362, "step": 1317 }, { "epoch": 0.26674762193887874, "grad_norm": 0.3461308181285858, "learning_rate": 0.00016706742462291194, "loss": 0.2196, "step": 1318 }, { "epoch": 0.26695001011940905, "grad_norm": 0.2715943157672882, "learning_rate": 0.00016702020110096974, "loss": 0.266, "step": 1319 }, { "epoch": 0.2671523982999393, "grad_norm": 0.2957078516483307, "learning_rate": 0.0001669729504299452, "loss": 0.1989, "step": 1320 }, { "epoch": 0.26735478648046956, "grad_norm": 0.3063766062259674, "learning_rate": 0.00016692567262897897, "loss": 0.2227, "step": 1321 }, { "epoch": 0.2675571746609998, "grad_norm": 0.2916683554649353, "learning_rate": 0.00016687836771722266, "loss": 0.2346, "step": 1322 }, { "epoch": 0.2677595628415301, "grad_norm": 0.43019410967826843, "learning_rate": 0.000166831035713839, "loss": 0.2373, "step": 1323 }, { "epoch": 0.26796195102206033, "grad_norm": 0.35000717639923096, "learning_rate": 0.00016678367663800162, "loss": 0.2468, "step": 1324 }, { "epoch": 0.2681643392025906, "grad_norm": 0.4278392493724823, "learning_rate": 0.00016673629050889508, "loss": 0.2235, "step": 1325 }, { "epoch": 0.26836672738312084, "grad_norm": 0.2858755588531494, "learning_rate": 0.00016668887734571495, "loss": 0.2308, "step": 1326 }, { "epoch": 0.2685691155636511, "grad_norm": 0.2804993689060211, "learning_rate": 0.0001666414371676677, "loss": 0.249, "step": 1327 }, { "epoch": 0.26877150374418135, "grad_norm": 0.3359355628490448, "learning_rate": 0.00016659396999397084, "loss": 0.2765, "step": 1328 }, { "epoch": 0.2689738919247116, "grad_norm": 0.26137131452560425, "learning_rate": 0.00016654647584385274, "loss": 0.2398, "step": 1329 }, { "epoch": 0.26917628010524186, "grad_norm": 0.2753427028656006, "learning_rate": 0.0001664989547365527, "loss": 0.243, "step": 1330 }, { "epoch": 0.2693786682857721, "grad_norm": 0.3281339406967163, "learning_rate": 0.00016645140669132094, "loss": 0.232, "step": 1331 }, { "epoch": 0.26958105646630237, "grad_norm": 0.27060410380363464, "learning_rate": 0.0001664038317274186, "loss": 0.238, "step": 1332 }, { "epoch": 0.2697834446468326, "grad_norm": 0.28284206986427307, "learning_rate": 0.00016635622986411778, "loss": 0.2342, "step": 1333 }, { "epoch": 0.2699858328273629, "grad_norm": 0.25805503129959106, "learning_rate": 0.00016630860112070136, "loss": 0.2436, "step": 1334 }, { "epoch": 0.27018822100789314, "grad_norm": 0.2467060387134552, "learning_rate": 0.00016626094551646322, "loss": 0.2068, "step": 1335 }, { "epoch": 0.2703906091884234, "grad_norm": 0.3367370069026947, "learning_rate": 0.00016621326307070806, "loss": 0.2686, "step": 1336 }, { "epoch": 0.27059299736895365, "grad_norm": 0.30159375071525574, "learning_rate": 0.00016616555380275148, "loss": 0.253, "step": 1337 }, { "epoch": 0.2707953855494839, "grad_norm": 0.27586883306503296, "learning_rate": 0.0001661178177319199, "loss": 0.225, "step": 1338 }, { "epoch": 0.27099777373001416, "grad_norm": 0.3171173930168152, "learning_rate": 0.00016607005487755068, "loss": 0.2536, "step": 1339 }, { "epoch": 0.2712001619105444, "grad_norm": 0.27477142214775085, "learning_rate": 0.00016602226525899195, "loss": 0.2451, "step": 1340 }, { "epoch": 0.27140255009107467, "grad_norm": 0.2661517262458801, "learning_rate": 0.00016597444889560273, "loss": 0.2491, "step": 1341 }, { "epoch": 0.2716049382716049, "grad_norm": 0.327012300491333, "learning_rate": 0.00016592660580675282, "loss": 0.2054, "step": 1342 }, { "epoch": 0.2718073264521352, "grad_norm": 0.30578696727752686, "learning_rate": 0.00016587873601182294, "loss": 0.2406, "step": 1343 }, { "epoch": 0.27200971463266543, "grad_norm": 0.29754316806793213, "learning_rate": 0.00016583083953020453, "loss": 0.2638, "step": 1344 }, { "epoch": 0.2722121028131957, "grad_norm": 0.32589009404182434, "learning_rate": 0.0001657829163812999, "loss": 0.2644, "step": 1345 }, { "epoch": 0.27241449099372594, "grad_norm": 0.35984236001968384, "learning_rate": 0.00016573496658452218, "loss": 0.2598, "step": 1346 }, { "epoch": 0.2726168791742562, "grad_norm": 0.2969356179237366, "learning_rate": 0.00016568699015929514, "loss": 0.2193, "step": 1347 }, { "epoch": 0.27281926735478645, "grad_norm": 0.33946526050567627, "learning_rate": 0.0001656389871250536, "loss": 0.2502, "step": 1348 }, { "epoch": 0.27302165553531677, "grad_norm": 0.27091342210769653, "learning_rate": 0.00016559095750124294, "loss": 0.2558, "step": 1349 }, { "epoch": 0.273224043715847, "grad_norm": 0.2503189444541931, "learning_rate": 0.00016554290130731938, "loss": 0.2113, "step": 1350 }, { "epoch": 0.273224043715847, "eval_loss": 0.2814163267612457, "eval_runtime": 1.3128, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.762, "step": 1350 }, { "epoch": 0.2734264318963773, "grad_norm": 0.3225633203983307, "learning_rate": 0.00016549481856274994, "loss": 0.2428, "step": 1351 }, { "epoch": 0.27362882007690753, "grad_norm": 0.29037073254585266, "learning_rate": 0.00016544670928701237, "loss": 0.265, "step": 1352 }, { "epoch": 0.2738312082574378, "grad_norm": 0.32522687315940857, "learning_rate": 0.00016539857349959512, "loss": 0.2485, "step": 1353 }, { "epoch": 0.27403359643796804, "grad_norm": 0.27587494254112244, "learning_rate": 0.00016535041121999743, "loss": 0.2072, "step": 1354 }, { "epoch": 0.2742359846184983, "grad_norm": 0.3260543942451477, "learning_rate": 0.00016530222246772933, "loss": 0.2559, "step": 1355 }, { "epoch": 0.27443837279902855, "grad_norm": 0.31061992049217224, "learning_rate": 0.00016525400726231143, "loss": 0.2388, "step": 1356 }, { "epoch": 0.2746407609795588, "grad_norm": 0.2436290830373764, "learning_rate": 0.00016520576562327517, "loss": 0.2055, "step": 1357 }, { "epoch": 0.27484314916008906, "grad_norm": 0.3192553222179413, "learning_rate": 0.00016515749757016265, "loss": 0.2574, "step": 1358 }, { "epoch": 0.2750455373406193, "grad_norm": 0.3148309886455536, "learning_rate": 0.0001651092031225267, "loss": 0.2499, "step": 1359 }, { "epoch": 0.2752479255211496, "grad_norm": 0.31500867009162903, "learning_rate": 0.0001650608822999308, "loss": 0.2186, "step": 1360 }, { "epoch": 0.2754503137016798, "grad_norm": 0.30355751514434814, "learning_rate": 0.00016501253512194914, "loss": 0.2535, "step": 1361 }, { "epoch": 0.2756527018822101, "grad_norm": 0.2818071246147156, "learning_rate": 0.0001649641616081666, "loss": 0.2418, "step": 1362 }, { "epoch": 0.27585509006274034, "grad_norm": 0.3359229266643524, "learning_rate": 0.0001649157617781787, "loss": 0.2686, "step": 1363 }, { "epoch": 0.2760574782432706, "grad_norm": 0.3475838303565979, "learning_rate": 0.00016486733565159164, "loss": 0.257, "step": 1364 }, { "epoch": 0.27625986642380085, "grad_norm": 0.41123494505882263, "learning_rate": 0.00016481888324802223, "loss": 0.2623, "step": 1365 }, { "epoch": 0.2764622546043311, "grad_norm": 0.32068777084350586, "learning_rate": 0.00016477040458709805, "loss": 0.2662, "step": 1366 }, { "epoch": 0.27666464278486136, "grad_norm": 0.27015361189842224, "learning_rate": 0.00016472189968845711, "loss": 0.2286, "step": 1367 }, { "epoch": 0.2768670309653916, "grad_norm": 0.38609763979911804, "learning_rate": 0.00016467336857174826, "loss": 0.2354, "step": 1368 }, { "epoch": 0.27706941914592187, "grad_norm": 0.37728220224380493, "learning_rate": 0.00016462481125663082, "loss": 0.2081, "step": 1369 }, { "epoch": 0.2772718073264521, "grad_norm": 0.31664031744003296, "learning_rate": 0.0001645762277627748, "loss": 0.291, "step": 1370 }, { "epoch": 0.2774741955069824, "grad_norm": 0.33264803886413574, "learning_rate": 0.00016452761810986077, "loss": 0.2285, "step": 1371 }, { "epoch": 0.27767658368751263, "grad_norm": 0.30719199776649475, "learning_rate": 0.00016447898231757995, "loss": 0.2427, "step": 1372 }, { "epoch": 0.2778789718680429, "grad_norm": 0.28520432114601135, "learning_rate": 0.0001644303204056341, "loss": 0.2675, "step": 1373 }, { "epoch": 0.27808136004857315, "grad_norm": 0.35676461458206177, "learning_rate": 0.0001643816323937356, "loss": 0.2834, "step": 1374 }, { "epoch": 0.2782837482291034, "grad_norm": 0.36278045177459717, "learning_rate": 0.0001643329183016073, "loss": 0.2757, "step": 1375 }, { "epoch": 0.27848613640963366, "grad_norm": 0.30848854780197144, "learning_rate": 0.00016428417814898282, "loss": 0.2192, "step": 1376 }, { "epoch": 0.2786885245901639, "grad_norm": 0.25591447949409485, "learning_rate": 0.00016423541195560617, "loss": 0.2407, "step": 1377 }, { "epoch": 0.27889091277069417, "grad_norm": 0.3188348710536957, "learning_rate": 0.0001641866197412319, "loss": 0.2721, "step": 1378 }, { "epoch": 0.2790933009512244, "grad_norm": 0.30868473649024963, "learning_rate": 0.00016413780152562519, "loss": 0.2694, "step": 1379 }, { "epoch": 0.27929568913175473, "grad_norm": 0.32113486528396606, "learning_rate": 0.00016408895732856173, "loss": 0.2923, "step": 1380 }, { "epoch": 0.279498077312285, "grad_norm": 0.4016645550727844, "learning_rate": 0.0001640400871698277, "loss": 0.2184, "step": 1381 }, { "epoch": 0.27970046549281524, "grad_norm": 0.344021737575531, "learning_rate": 0.00016399119106921982, "loss": 0.2588, "step": 1382 }, { "epoch": 0.2799028536733455, "grad_norm": 0.2822872996330261, "learning_rate": 0.00016394226904654533, "loss": 0.249, "step": 1383 }, { "epoch": 0.28010524185387575, "grad_norm": 0.27933451533317566, "learning_rate": 0.0001638933211216219, "loss": 0.2369, "step": 1384 }, { "epoch": 0.280307630034406, "grad_norm": 0.28503698110580444, "learning_rate": 0.00016384434731427785, "loss": 0.2171, "step": 1385 }, { "epoch": 0.28051001821493626, "grad_norm": 0.3234362304210663, "learning_rate": 0.0001637953476443518, "loss": 0.2938, "step": 1386 }, { "epoch": 0.2807124063954665, "grad_norm": 0.27071040868759155, "learning_rate": 0.000163746322131693, "loss": 0.2293, "step": 1387 }, { "epoch": 0.2809147945759968, "grad_norm": 0.2438916116952896, "learning_rate": 0.000163697270796161, "loss": 0.2232, "step": 1388 }, { "epoch": 0.28111718275652703, "grad_norm": 0.309865802526474, "learning_rate": 0.000163648193657626, "loss": 0.2224, "step": 1389 }, { "epoch": 0.2813195709370573, "grad_norm": 0.28574180603027344, "learning_rate": 0.00016359909073596852, "loss": 0.2597, "step": 1390 }, { "epoch": 0.28152195911758754, "grad_norm": 0.3140735328197479, "learning_rate": 0.00016354996205107956, "loss": 0.2313, "step": 1391 }, { "epoch": 0.2817243472981178, "grad_norm": 0.3212859034538269, "learning_rate": 0.0001635008076228606, "loss": 0.2433, "step": 1392 }, { "epoch": 0.28192673547864805, "grad_norm": 0.29252833127975464, "learning_rate": 0.0001634516274712235, "loss": 0.2589, "step": 1393 }, { "epoch": 0.2821291236591783, "grad_norm": 0.3086389899253845, "learning_rate": 0.00016340242161609053, "loss": 0.2404, "step": 1394 }, { "epoch": 0.28233151183970856, "grad_norm": 0.282094806432724, "learning_rate": 0.0001633531900773944, "loss": 0.2709, "step": 1395 }, { "epoch": 0.2825339000202388, "grad_norm": 0.42968419194221497, "learning_rate": 0.00016330393287507824, "loss": 0.2401, "step": 1396 }, { "epoch": 0.28273628820076907, "grad_norm": 0.5299389362335205, "learning_rate": 0.00016325465002909554, "loss": 0.2348, "step": 1397 }, { "epoch": 0.2829386763812993, "grad_norm": 0.29599180817604065, "learning_rate": 0.00016320534155941018, "loss": 0.2518, "step": 1398 }, { "epoch": 0.2831410645618296, "grad_norm": 0.3073466718196869, "learning_rate": 0.00016315600748599644, "loss": 0.2009, "step": 1399 }, { "epoch": 0.28334345274235984, "grad_norm": 0.27249675989151, "learning_rate": 0.000163106647828839, "loss": 0.24, "step": 1400 }, { "epoch": 0.28334345274235984, "eval_loss": 0.28701725602149963, "eval_runtime": 1.3143, "eval_samples_per_second": 3.804, "eval_steps_per_second": 0.761, "step": 1400 }, { "epoch": 0.2835458409228901, "grad_norm": 0.3398053050041199, "learning_rate": 0.0001630572626079328, "loss": 0.2573, "step": 1401 }, { "epoch": 0.28374822910342035, "grad_norm": 0.4803030788898468, "learning_rate": 0.0001630078518432832, "loss": 0.2881, "step": 1402 }, { "epoch": 0.2839506172839506, "grad_norm": 0.33604663610458374, "learning_rate": 0.000162958415554906, "loss": 0.2653, "step": 1403 }, { "epoch": 0.28415300546448086, "grad_norm": 0.2845475375652313, "learning_rate": 0.00016290895376282717, "loss": 0.2153, "step": 1404 }, { "epoch": 0.2843553936450111, "grad_norm": 0.3457709848880768, "learning_rate": 0.0001628594664870831, "loss": 0.2722, "step": 1405 }, { "epoch": 0.28455778182554137, "grad_norm": 0.3690558969974518, "learning_rate": 0.00016280995374772054, "loss": 0.2625, "step": 1406 }, { "epoch": 0.2847601700060716, "grad_norm": 0.30499541759490967, "learning_rate": 0.00016276041556479646, "loss": 0.2326, "step": 1407 }, { "epoch": 0.2849625581866019, "grad_norm": 0.25210294127464294, "learning_rate": 0.0001627108519583782, "loss": 0.206, "step": 1408 }, { "epoch": 0.28516494636713213, "grad_norm": 0.32703089714050293, "learning_rate": 0.00016266126294854343, "loss": 0.2489, "step": 1409 }, { "epoch": 0.28536733454766244, "grad_norm": 0.38408195972442627, "learning_rate": 0.00016261164855538, "loss": 0.2433, "step": 1410 }, { "epoch": 0.2855697227281927, "grad_norm": 0.319349080324173, "learning_rate": 0.0001625620087989861, "loss": 0.2447, "step": 1411 }, { "epoch": 0.28577211090872295, "grad_norm": 0.3926481008529663, "learning_rate": 0.00016251234369947027, "loss": 0.2488, "step": 1412 }, { "epoch": 0.2859744990892532, "grad_norm": 0.26722252368927, "learning_rate": 0.0001624626532769512, "loss": 0.2405, "step": 1413 }, { "epoch": 0.28617688726978346, "grad_norm": 0.25793546438217163, "learning_rate": 0.00016241293755155792, "loss": 0.25, "step": 1414 }, { "epoch": 0.2863792754503137, "grad_norm": 0.2604754865169525, "learning_rate": 0.0001623631965434296, "loss": 0.2533, "step": 1415 }, { "epoch": 0.286581663630844, "grad_norm": 0.3502312898635864, "learning_rate": 0.00016231343027271582, "loss": 0.2526, "step": 1416 }, { "epoch": 0.28678405181137423, "grad_norm": 0.45335817337036133, "learning_rate": 0.00016226363875957624, "loss": 0.2506, "step": 1417 }, { "epoch": 0.2869864399919045, "grad_norm": 0.36080202460289, "learning_rate": 0.00016221382202418085, "loss": 0.2397, "step": 1418 }, { "epoch": 0.28718882817243474, "grad_norm": 0.27753639221191406, "learning_rate": 0.00016216398008670976, "loss": 0.277, "step": 1419 }, { "epoch": 0.287391216352965, "grad_norm": 0.2525143623352051, "learning_rate": 0.00016211411296735345, "loss": 0.2282, "step": 1420 }, { "epoch": 0.28759360453349525, "grad_norm": 0.24571165442466736, "learning_rate": 0.00016206422068631236, "loss": 0.2077, "step": 1421 }, { "epoch": 0.2877959927140255, "grad_norm": 0.38463279604911804, "learning_rate": 0.00016201430326379733, "loss": 0.2872, "step": 1422 }, { "epoch": 0.28799838089455576, "grad_norm": 0.2610633671283722, "learning_rate": 0.00016196436072002932, "loss": 0.2268, "step": 1423 }, { "epoch": 0.288200769075086, "grad_norm": 0.2668552100658417, "learning_rate": 0.0001619143930752394, "loss": 0.2367, "step": 1424 }, { "epoch": 0.28840315725561627, "grad_norm": 0.27416011691093445, "learning_rate": 0.00016186440034966892, "loss": 0.2618, "step": 1425 }, { "epoch": 0.2886055454361465, "grad_norm": 0.2385905534029007, "learning_rate": 0.00016181438256356932, "loss": 0.2244, "step": 1426 }, { "epoch": 0.2888079336166768, "grad_norm": 0.2840398848056793, "learning_rate": 0.0001617643397372022, "loss": 0.2625, "step": 1427 }, { "epoch": 0.28901032179720704, "grad_norm": 0.3690309226512909, "learning_rate": 0.00016171427189083933, "loss": 0.258, "step": 1428 }, { "epoch": 0.2892127099777373, "grad_norm": 0.23191918432712555, "learning_rate": 0.00016166417904476255, "loss": 0.2012, "step": 1429 }, { "epoch": 0.28941509815826755, "grad_norm": 0.27311259508132935, "learning_rate": 0.00016161406121926395, "loss": 0.2262, "step": 1430 }, { "epoch": 0.2896174863387978, "grad_norm": 0.3256944715976715, "learning_rate": 0.00016156391843464557, "loss": 0.2411, "step": 1431 }, { "epoch": 0.28981987451932806, "grad_norm": 0.31852561235427856, "learning_rate": 0.00016151375071121973, "loss": 0.2258, "step": 1432 }, { "epoch": 0.2900222626998583, "grad_norm": 0.352746844291687, "learning_rate": 0.00016146355806930874, "loss": 0.3067, "step": 1433 }, { "epoch": 0.29022465088038857, "grad_norm": 0.26716965436935425, "learning_rate": 0.00016141334052924503, "loss": 0.2405, "step": 1434 }, { "epoch": 0.2904270390609188, "grad_norm": 0.2332156002521515, "learning_rate": 0.00016136309811137117, "loss": 0.2004, "step": 1435 }, { "epoch": 0.2906294272414491, "grad_norm": 0.42516130208969116, "learning_rate": 0.00016131283083603969, "loss": 0.2798, "step": 1436 }, { "epoch": 0.29083181542197933, "grad_norm": 0.25368016958236694, "learning_rate": 0.00016126253872361334, "loss": 0.2439, "step": 1437 }, { "epoch": 0.2910342036025096, "grad_norm": 0.39688053727149963, "learning_rate": 0.0001612122217944648, "loss": 0.2332, "step": 1438 }, { "epoch": 0.29123659178303984, "grad_norm": 0.28587251901626587, "learning_rate": 0.00016116188006897687, "loss": 0.2387, "step": 1439 }, { "epoch": 0.29143897996357016, "grad_norm": 0.35700151324272156, "learning_rate": 0.0001611115135675424, "loss": 0.2517, "step": 1440 }, { "epoch": 0.2916413681441004, "grad_norm": 0.2821785509586334, "learning_rate": 0.00016106112231056426, "loss": 0.2269, "step": 1441 }, { "epoch": 0.29184375632463067, "grad_norm": 0.2888050973415375, "learning_rate": 0.0001610107063184553, "loss": 0.242, "step": 1442 }, { "epoch": 0.2920461445051609, "grad_norm": 0.3453981876373291, "learning_rate": 0.00016096026561163845, "loss": 0.2443, "step": 1443 }, { "epoch": 0.2922485326856912, "grad_norm": 0.28724777698516846, "learning_rate": 0.00016090980021054668, "loss": 0.2835, "step": 1444 }, { "epoch": 0.29245092086622143, "grad_norm": 0.2799629271030426, "learning_rate": 0.00016085931013562292, "loss": 0.2375, "step": 1445 }, { "epoch": 0.2926533090467517, "grad_norm": 0.2970954179763794, "learning_rate": 0.00016080879540732003, "loss": 0.2445, "step": 1446 }, { "epoch": 0.29285569722728194, "grad_norm": 0.4151013493537903, "learning_rate": 0.00016075825604610098, "loss": 0.2533, "step": 1447 }, { "epoch": 0.2930580854078122, "grad_norm": 0.34075066447257996, "learning_rate": 0.00016070769207243864, "loss": 0.2549, "step": 1448 }, { "epoch": 0.29326047358834245, "grad_norm": 0.44362327456474304, "learning_rate": 0.00016065710350681586, "loss": 0.2735, "step": 1449 }, { "epoch": 0.2934628617688727, "grad_norm": 0.2781960070133209, "learning_rate": 0.00016060649036972547, "loss": 0.2596, "step": 1450 }, { "epoch": 0.2934628617688727, "eval_loss": 0.288575679063797, "eval_runtime": 1.3206, "eval_samples_per_second": 3.786, "eval_steps_per_second": 0.757, "step": 1450 }, { "epoch": 0.29366524994940296, "grad_norm": 0.3316075801849365, "learning_rate": 0.0001605558526816703, "loss": 0.2437, "step": 1451 }, { "epoch": 0.2938676381299332, "grad_norm": 0.3345779776573181, "learning_rate": 0.00016050519046316298, "loss": 0.2549, "step": 1452 }, { "epoch": 0.2940700263104635, "grad_norm": 0.2846747934818268, "learning_rate": 0.00016045450373472625, "loss": 0.254, "step": 1453 }, { "epoch": 0.29427241449099373, "grad_norm": 0.3424137532711029, "learning_rate": 0.00016040379251689265, "loss": 0.2824, "step": 1454 }, { "epoch": 0.294474802671524, "grad_norm": 0.40482378005981445, "learning_rate": 0.0001603530568302047, "loss": 0.2851, "step": 1455 }, { "epoch": 0.29467719085205424, "grad_norm": 0.29035884141921997, "learning_rate": 0.00016030229669521484, "loss": 0.2269, "step": 1456 }, { "epoch": 0.2948795790325845, "grad_norm": 0.33798131346702576, "learning_rate": 0.00016025151213248535, "loss": 0.3265, "step": 1457 }, { "epoch": 0.29508196721311475, "grad_norm": 0.343180388212204, "learning_rate": 0.0001602007031625885, "loss": 0.2504, "step": 1458 }, { "epoch": 0.295284355393645, "grad_norm": 0.24138867855072021, "learning_rate": 0.00016014986980610635, "loss": 0.2133, "step": 1459 }, { "epoch": 0.29548674357417526, "grad_norm": 0.24454712867736816, "learning_rate": 0.00016009901208363092, "loss": 0.2342, "step": 1460 }, { "epoch": 0.2956891317547055, "grad_norm": 0.45114409923553467, "learning_rate": 0.00016004813001576403, "loss": 0.219, "step": 1461 }, { "epoch": 0.29589151993523577, "grad_norm": 0.390631765127182, "learning_rate": 0.00015999722362311744, "loss": 0.2436, "step": 1462 }, { "epoch": 0.296093908115766, "grad_norm": 0.41036346554756165, "learning_rate": 0.00015994629292631269, "loss": 0.2502, "step": 1463 }, { "epoch": 0.2962962962962963, "grad_norm": 0.2816285789012909, "learning_rate": 0.00015989533794598124, "loss": 0.2381, "step": 1464 }, { "epoch": 0.29649868447682654, "grad_norm": 0.5021857023239136, "learning_rate": 0.00015984435870276427, "loss": 0.2872, "step": 1465 }, { "epoch": 0.2967010726573568, "grad_norm": 0.3329884707927704, "learning_rate": 0.00015979335521731293, "loss": 0.2547, "step": 1466 }, { "epoch": 0.29690346083788705, "grad_norm": 0.31609731912612915, "learning_rate": 0.00015974232751028808, "loss": 0.2252, "step": 1467 }, { "epoch": 0.2971058490184173, "grad_norm": 0.26723894476890564, "learning_rate": 0.00015969127560236049, "loss": 0.2453, "step": 1468 }, { "epoch": 0.29730823719894756, "grad_norm": 0.2766818106174469, "learning_rate": 0.00015964019951421058, "loss": 0.2011, "step": 1469 }, { "epoch": 0.2975106253794778, "grad_norm": 0.26280006766319275, "learning_rate": 0.00015958909926652877, "loss": 0.2706, "step": 1470 }, { "epoch": 0.2977130135600081, "grad_norm": 0.3110751807689667, "learning_rate": 0.00015953797488001508, "loss": 0.2472, "step": 1471 }, { "epoch": 0.2979154017405384, "grad_norm": 0.28234657645225525, "learning_rate": 0.0001594868263753794, "loss": 0.2541, "step": 1472 }, { "epoch": 0.29811778992106863, "grad_norm": 0.27136537432670593, "learning_rate": 0.00015943565377334143, "loss": 0.2604, "step": 1473 }, { "epoch": 0.2983201781015989, "grad_norm": 0.3160214424133301, "learning_rate": 0.00015938445709463053, "loss": 0.2297, "step": 1474 }, { "epoch": 0.29852256628212914, "grad_norm": 0.3060542643070221, "learning_rate": 0.0001593332363599859, "loss": 0.2328, "step": 1475 }, { "epoch": 0.2987249544626594, "grad_norm": 0.252983421087265, "learning_rate": 0.0001592819915901564, "loss": 0.2218, "step": 1476 }, { "epoch": 0.29892734264318965, "grad_norm": 0.2582453191280365, "learning_rate": 0.0001592307228059007, "loss": 0.2613, "step": 1477 }, { "epoch": 0.2991297308237199, "grad_norm": 0.2648583650588989, "learning_rate": 0.0001591794300279872, "loss": 0.2632, "step": 1478 }, { "epoch": 0.29933211900425016, "grad_norm": 0.2925315201282501, "learning_rate": 0.0001591281132771939, "loss": 0.2693, "step": 1479 }, { "epoch": 0.2995345071847804, "grad_norm": 0.37666764855384827, "learning_rate": 0.00015907677257430872, "loss": 0.2272, "step": 1480 }, { "epoch": 0.2997368953653107, "grad_norm": 0.2659248113632202, "learning_rate": 0.00015902540794012908, "loss": 0.2337, "step": 1481 }, { "epoch": 0.29993928354584093, "grad_norm": 0.3361140787601471, "learning_rate": 0.00015897401939546222, "loss": 0.2413, "step": 1482 }, { "epoch": 0.3001416717263712, "grad_norm": 0.3773367404937744, "learning_rate": 0.00015892260696112503, "loss": 0.2754, "step": 1483 }, { "epoch": 0.30034405990690144, "grad_norm": 0.3029126524925232, "learning_rate": 0.00015887117065794404, "loss": 0.2598, "step": 1484 }, { "epoch": 0.3005464480874317, "grad_norm": 0.3077636659145355, "learning_rate": 0.0001588197105067555, "loss": 0.228, "step": 1485 }, { "epoch": 0.30074883626796195, "grad_norm": 0.2995539903640747, "learning_rate": 0.0001587682265284053, "loss": 0.2418, "step": 1486 }, { "epoch": 0.3009512244484922, "grad_norm": 0.2672542929649353, "learning_rate": 0.00015871671874374897, "loss": 0.2487, "step": 1487 }, { "epoch": 0.30115361262902246, "grad_norm": 0.37025803327560425, "learning_rate": 0.0001586651871736517, "loss": 0.2484, "step": 1488 }, { "epoch": 0.3013560008095527, "grad_norm": 0.38283589482307434, "learning_rate": 0.00015861363183898835, "loss": 0.2297, "step": 1489 }, { "epoch": 0.30155838899008297, "grad_norm": 0.3386457860469818, "learning_rate": 0.00015856205276064335, "loss": 0.2895, "step": 1490 }, { "epoch": 0.3017607771706132, "grad_norm": 0.35246574878692627, "learning_rate": 0.00015851044995951073, "loss": 0.2571, "step": 1491 }, { "epoch": 0.3019631653511435, "grad_norm": 0.33270493149757385, "learning_rate": 0.00015845882345649426, "loss": 0.2516, "step": 1492 }, { "epoch": 0.30216555353167374, "grad_norm": 0.4603479206562042, "learning_rate": 0.0001584071732725071, "loss": 0.2273, "step": 1493 }, { "epoch": 0.302367941712204, "grad_norm": 0.31603896617889404, "learning_rate": 0.00015835549942847224, "loss": 0.275, "step": 1494 }, { "epoch": 0.30257032989273425, "grad_norm": 0.26599591970443726, "learning_rate": 0.00015830380194532207, "loss": 0.2097, "step": 1495 }, { "epoch": 0.3027727180732645, "grad_norm": 0.3641069829463959, "learning_rate": 0.00015825208084399857, "loss": 0.2428, "step": 1496 }, { "epoch": 0.30297510625379476, "grad_norm": 0.2890602946281433, "learning_rate": 0.00015820033614545346, "loss": 0.2534, "step": 1497 }, { "epoch": 0.303177494434325, "grad_norm": 0.22469080984592438, "learning_rate": 0.00015814856787064782, "loss": 0.2145, "step": 1498 }, { "epoch": 0.30337988261485527, "grad_norm": 0.291421502828598, "learning_rate": 0.0001580967760405524, "loss": 0.2383, "step": 1499 }, { "epoch": 0.3035822707953855, "grad_norm": 0.25392282009124756, "learning_rate": 0.0001580449606761474, "loss": 0.247, "step": 1500 }, { "epoch": 0.3035822707953855, "eval_loss": 0.29185429215431213, "eval_runtime": 1.3228, "eval_samples_per_second": 3.78, "eval_steps_per_second": 0.756, "step": 1500 }, { "epoch": 0.30378465897591583, "grad_norm": 0.4040224254131317, "learning_rate": 0.00015799312179842266, "loss": 0.2862, "step": 1501 }, { "epoch": 0.3039870471564461, "grad_norm": 0.30887001752853394, "learning_rate": 0.00015794125942837745, "loss": 0.2728, "step": 1502 }, { "epoch": 0.30418943533697634, "grad_norm": 0.2798607349395752, "learning_rate": 0.00015788937358702062, "loss": 0.244, "step": 1503 }, { "epoch": 0.3043918235175066, "grad_norm": 0.2739742696285248, "learning_rate": 0.0001578374642953705, "loss": 0.2233, "step": 1504 }, { "epoch": 0.30459421169803685, "grad_norm": 0.30371665954589844, "learning_rate": 0.0001577855315744549, "loss": 0.228, "step": 1505 }, { "epoch": 0.3047965998785671, "grad_norm": 0.3093835711479187, "learning_rate": 0.00015773357544531118, "loss": 0.2652, "step": 1506 }, { "epoch": 0.30499898805909736, "grad_norm": 0.2595911920070648, "learning_rate": 0.00015768159592898613, "loss": 0.2354, "step": 1507 }, { "epoch": 0.3052013762396276, "grad_norm": 0.442634254693985, "learning_rate": 0.00015762959304653604, "loss": 0.2378, "step": 1508 }, { "epoch": 0.3054037644201579, "grad_norm": 0.2586023211479187, "learning_rate": 0.00015757756681902664, "loss": 0.2298, "step": 1509 }, { "epoch": 0.30560615260068813, "grad_norm": 0.24030888080596924, "learning_rate": 0.00015752551726753314, "loss": 0.2206, "step": 1510 }, { "epoch": 0.3058085407812184, "grad_norm": 0.3048587143421173, "learning_rate": 0.00015747344441314017, "loss": 0.2464, "step": 1511 }, { "epoch": 0.30601092896174864, "grad_norm": 0.31115928292274475, "learning_rate": 0.00015742134827694188, "loss": 0.2436, "step": 1512 }, { "epoch": 0.3062133171422789, "grad_norm": 0.27170801162719727, "learning_rate": 0.00015736922888004174, "loss": 0.2515, "step": 1513 }, { "epoch": 0.30641570532280915, "grad_norm": 0.3254013955593109, "learning_rate": 0.0001573170862435527, "loss": 0.2556, "step": 1514 }, { "epoch": 0.3066180935033394, "grad_norm": 0.4062742292881012, "learning_rate": 0.00015726492038859715, "loss": 0.2141, "step": 1515 }, { "epoch": 0.30682048168386966, "grad_norm": 0.3382481336593628, "learning_rate": 0.0001572127313363068, "loss": 0.2657, "step": 1516 }, { "epoch": 0.3070228698643999, "grad_norm": 0.34623807668685913, "learning_rate": 0.00015716051910782288, "loss": 0.2634, "step": 1517 }, { "epoch": 0.30722525804493017, "grad_norm": 0.4413280189037323, "learning_rate": 0.00015710828372429586, "loss": 0.2365, "step": 1518 }, { "epoch": 0.3074276462254604, "grad_norm": 0.2971428632736206, "learning_rate": 0.00015705602520688577, "loss": 0.2646, "step": 1519 }, { "epoch": 0.3076300344059907, "grad_norm": 0.371864914894104, "learning_rate": 0.0001570037435767618, "loss": 0.2548, "step": 1520 }, { "epoch": 0.30783242258652094, "grad_norm": 0.2553359568119049, "learning_rate": 0.0001569514388551027, "loss": 0.2255, "step": 1521 }, { "epoch": 0.3080348107670512, "grad_norm": 0.31415635347366333, "learning_rate": 0.00015689911106309644, "loss": 0.2852, "step": 1522 }, { "epoch": 0.30823719894758145, "grad_norm": 0.2687399685382843, "learning_rate": 0.00015684676022194042, "loss": 0.2265, "step": 1523 }, { "epoch": 0.3084395871281117, "grad_norm": 0.3047806918621063, "learning_rate": 0.0001567943863528413, "loss": 0.2631, "step": 1524 }, { "epoch": 0.30864197530864196, "grad_norm": 0.2811877727508545, "learning_rate": 0.00015674198947701512, "loss": 0.2147, "step": 1525 }, { "epoch": 0.3088443634891722, "grad_norm": 0.27975326776504517, "learning_rate": 0.00015668956961568725, "loss": 0.2771, "step": 1526 }, { "epoch": 0.30904675166970247, "grad_norm": 0.22187146544456482, "learning_rate": 0.0001566371267900923, "loss": 0.2366, "step": 1527 }, { "epoch": 0.3092491398502327, "grad_norm": 0.318733274936676, "learning_rate": 0.0001565846610214743, "loss": 0.2633, "step": 1528 }, { "epoch": 0.309451528030763, "grad_norm": 0.2838500142097473, "learning_rate": 0.00015653217233108647, "loss": 0.2046, "step": 1529 }, { "epoch": 0.30965391621129323, "grad_norm": 0.3495554029941559, "learning_rate": 0.00015647966074019135, "loss": 0.2772, "step": 1530 }, { "epoch": 0.30985630439182354, "grad_norm": 0.286012202501297, "learning_rate": 0.00015642712627006073, "loss": 0.2583, "step": 1531 }, { "epoch": 0.3100586925723538, "grad_norm": 0.3395470082759857, "learning_rate": 0.00015637456894197578, "loss": 0.2489, "step": 1532 }, { "epoch": 0.31026108075288406, "grad_norm": 0.321918785572052, "learning_rate": 0.00015632198877722675, "loss": 0.2593, "step": 1533 }, { "epoch": 0.3104634689334143, "grad_norm": 0.2817826569080353, "learning_rate": 0.00015626938579711328, "loss": 0.2542, "step": 1534 }, { "epoch": 0.31066585711394457, "grad_norm": 0.25056058168411255, "learning_rate": 0.00015621676002294422, "loss": 0.2164, "step": 1535 }, { "epoch": 0.3108682452944748, "grad_norm": 0.34727513790130615, "learning_rate": 0.00015616411147603763, "loss": 0.2195, "step": 1536 }, { "epoch": 0.3110706334750051, "grad_norm": 0.2715780735015869, "learning_rate": 0.00015611144017772078, "loss": 0.2502, "step": 1537 }, { "epoch": 0.31127302165553533, "grad_norm": 0.3880321681499481, "learning_rate": 0.00015605874614933022, "loss": 0.2494, "step": 1538 }, { "epoch": 0.3114754098360656, "grad_norm": 0.2526821494102478, "learning_rate": 0.0001560060294122117, "loss": 0.229, "step": 1539 }, { "epoch": 0.31167779801659584, "grad_norm": 0.2712867558002472, "learning_rate": 0.00015595328998772005, "loss": 0.269, "step": 1540 }, { "epoch": 0.3118801861971261, "grad_norm": 0.4174976944923401, "learning_rate": 0.00015590052789721945, "loss": 0.2511, "step": 1541 }, { "epoch": 0.31208257437765635, "grad_norm": 0.25503671169281006, "learning_rate": 0.00015584774316208317, "loss": 0.2126, "step": 1542 }, { "epoch": 0.3122849625581866, "grad_norm": 0.27231547236442566, "learning_rate": 0.00015579493580369368, "loss": 0.2528, "step": 1543 }, { "epoch": 0.31248735073871686, "grad_norm": 0.4011785387992859, "learning_rate": 0.0001557421058434426, "loss": 0.2583, "step": 1544 }, { "epoch": 0.3126897389192471, "grad_norm": 0.30185502767562866, "learning_rate": 0.0001556892533027307, "loss": 0.2335, "step": 1545 }, { "epoch": 0.3128921270997774, "grad_norm": 0.219326913356781, "learning_rate": 0.00015563637820296798, "loss": 0.1997, "step": 1546 }, { "epoch": 0.31309451528030763, "grad_norm": 0.22945311665534973, "learning_rate": 0.00015558348056557346, "loss": 0.2283, "step": 1547 }, { "epoch": 0.3132969034608379, "grad_norm": 0.30252352356910706, "learning_rate": 0.00015553056041197534, "loss": 0.2454, "step": 1548 }, { "epoch": 0.31349929164136814, "grad_norm": 0.277987539768219, "learning_rate": 0.00015547761776361094, "loss": 0.2055, "step": 1549 }, { "epoch": 0.3137016798218984, "grad_norm": 0.5741046667098999, "learning_rate": 0.0001554246526419267, "loss": 0.3159, "step": 1550 }, { "epoch": 0.3137016798218984, "eval_loss": 0.2873765528202057, "eval_runtime": 1.3168, "eval_samples_per_second": 3.797, "eval_steps_per_second": 0.759, "step": 1550 }, { "epoch": 0.31390406800242865, "grad_norm": 0.40516263246536255, "learning_rate": 0.00015537166506837818, "loss": 0.25, "step": 1551 }, { "epoch": 0.3141064561829589, "grad_norm": 0.3757566213607788, "learning_rate": 0.00015531865506442997, "loss": 0.254, "step": 1552 }, { "epoch": 0.31430884436348916, "grad_norm": 0.3254554867744446, "learning_rate": 0.00015526562265155583, "loss": 0.2315, "step": 1553 }, { "epoch": 0.3145112325440194, "grad_norm": 0.3593430817127228, "learning_rate": 0.00015521256785123856, "loss": 0.2774, "step": 1554 }, { "epoch": 0.31471362072454967, "grad_norm": 0.3386416733264923, "learning_rate": 0.00015515949068496996, "loss": 0.2878, "step": 1555 }, { "epoch": 0.3149160089050799, "grad_norm": 0.3190837502479553, "learning_rate": 0.00015510639117425102, "loss": 0.2774, "step": 1556 }, { "epoch": 0.3151183970856102, "grad_norm": 0.2489796131849289, "learning_rate": 0.00015505326934059168, "loss": 0.2201, "step": 1557 }, { "epoch": 0.31532078526614044, "grad_norm": 0.2533458471298218, "learning_rate": 0.00015500012520551097, "loss": 0.2542, "step": 1558 }, { "epoch": 0.3155231734466707, "grad_norm": 0.42690059542655945, "learning_rate": 0.00015494695879053694, "loss": 0.2397, "step": 1559 }, { "epoch": 0.31572556162720095, "grad_norm": 0.2737457752227783, "learning_rate": 0.00015489377011720676, "loss": 0.2523, "step": 1560 }, { "epoch": 0.3159279498077312, "grad_norm": 0.27361875772476196, "learning_rate": 0.00015484055920706637, "loss": 0.2343, "step": 1561 }, { "epoch": 0.3161303379882615, "grad_norm": 0.4532386064529419, "learning_rate": 0.00015478732608167098, "loss": 0.2233, "step": 1562 }, { "epoch": 0.31633272616879177, "grad_norm": 0.2857770323753357, "learning_rate": 0.00015473407076258466, "loss": 0.2347, "step": 1563 }, { "epoch": 0.316535114349322, "grad_norm": 0.3480817675590515, "learning_rate": 0.00015468079327138047, "loss": 0.2462, "step": 1564 }, { "epoch": 0.3167375025298523, "grad_norm": 0.25286075472831726, "learning_rate": 0.00015462749362964056, "loss": 0.2817, "step": 1565 }, { "epoch": 0.31693989071038253, "grad_norm": 0.26392602920532227, "learning_rate": 0.00015457417185895595, "loss": 0.242, "step": 1566 }, { "epoch": 0.3171422788909128, "grad_norm": 0.31012487411499023, "learning_rate": 0.0001545208279809266, "loss": 0.2488, "step": 1567 }, { "epoch": 0.31734466707144304, "grad_norm": 0.38509896397590637, "learning_rate": 0.00015446746201716155, "loss": 0.2674, "step": 1568 }, { "epoch": 0.3175470552519733, "grad_norm": 0.4102894067764282, "learning_rate": 0.00015441407398927864, "loss": 0.276, "step": 1569 }, { "epoch": 0.31774944343250355, "grad_norm": 0.529838502407074, "learning_rate": 0.0001543606639189048, "loss": 0.2485, "step": 1570 }, { "epoch": 0.3179518316130338, "grad_norm": 0.3085538446903229, "learning_rate": 0.00015430723182767577, "loss": 0.2809, "step": 1571 }, { "epoch": 0.31815421979356406, "grad_norm": 0.49552732706069946, "learning_rate": 0.00015425377773723624, "loss": 0.2346, "step": 1572 }, { "epoch": 0.3183566079740943, "grad_norm": 0.2906653881072998, "learning_rate": 0.00015420030166923983, "loss": 0.2404, "step": 1573 }, { "epoch": 0.3185589961546246, "grad_norm": 0.28941839933395386, "learning_rate": 0.0001541468036453491, "loss": 0.2086, "step": 1574 }, { "epoch": 0.31876138433515483, "grad_norm": 0.31704050302505493, "learning_rate": 0.0001540932836872354, "loss": 0.2373, "step": 1575 }, { "epoch": 0.3189637725156851, "grad_norm": 0.27731025218963623, "learning_rate": 0.00015403974181657905, "loss": 0.2577, "step": 1576 }, { "epoch": 0.31916616069621534, "grad_norm": 0.6662519574165344, "learning_rate": 0.00015398617805506922, "loss": 0.2473, "step": 1577 }, { "epoch": 0.3193685488767456, "grad_norm": 0.3100757896900177, "learning_rate": 0.00015393259242440398, "loss": 0.2706, "step": 1578 }, { "epoch": 0.31957093705727585, "grad_norm": 0.27312061190605164, "learning_rate": 0.00015387898494629017, "loss": 0.2304, "step": 1579 }, { "epoch": 0.3197733252378061, "grad_norm": 0.21853311359882355, "learning_rate": 0.0001538253556424436, "loss": 0.2239, "step": 1580 }, { "epoch": 0.31997571341833636, "grad_norm": 0.33781617879867554, "learning_rate": 0.00015377170453458877, "loss": 0.2319, "step": 1581 }, { "epoch": 0.3201781015988666, "grad_norm": 0.3139788806438446, "learning_rate": 0.00015371803164445922, "loss": 0.2885, "step": 1582 }, { "epoch": 0.32038048977939687, "grad_norm": 0.3251952528953552, "learning_rate": 0.00015366433699379712, "loss": 0.2548, "step": 1583 }, { "epoch": 0.3205828779599271, "grad_norm": 0.26830440759658813, "learning_rate": 0.00015361062060435354, "loss": 0.2109, "step": 1584 }, { "epoch": 0.3207852661404574, "grad_norm": 0.7824857831001282, "learning_rate": 0.00015355688249788836, "loss": 0.2144, "step": 1585 }, { "epoch": 0.32098765432098764, "grad_norm": 0.35961613059043884, "learning_rate": 0.00015350312269617024, "loss": 0.2564, "step": 1586 }, { "epoch": 0.3211900425015179, "grad_norm": 0.2986200451850891, "learning_rate": 0.00015344934122097663, "loss": 0.2775, "step": 1587 }, { "epoch": 0.32139243068204815, "grad_norm": 0.31535106897354126, "learning_rate": 0.00015339553809409377, "loss": 0.2466, "step": 1588 }, { "epoch": 0.3215948188625784, "grad_norm": 0.28162866830825806, "learning_rate": 0.00015334171333731663, "loss": 0.2393, "step": 1589 }, { "epoch": 0.32179720704310866, "grad_norm": 0.29046347737312317, "learning_rate": 0.00015328786697244907, "loss": 0.2202, "step": 1590 }, { "epoch": 0.3219995952236389, "grad_norm": 0.3580332398414612, "learning_rate": 0.0001532339990213035, "loss": 0.2347, "step": 1591 }, { "epoch": 0.3222019834041692, "grad_norm": 0.279365599155426, "learning_rate": 0.00015318010950570124, "loss": 0.2345, "step": 1592 }, { "epoch": 0.3224043715846995, "grad_norm": 0.3240690529346466, "learning_rate": 0.00015312619844747228, "loss": 0.2406, "step": 1593 }, { "epoch": 0.32260675976522973, "grad_norm": 0.5165361166000366, "learning_rate": 0.00015307226586845532, "loss": 0.2481, "step": 1594 }, { "epoch": 0.32280914794576, "grad_norm": 0.340131938457489, "learning_rate": 0.00015301831179049784, "loss": 0.2166, "step": 1595 }, { "epoch": 0.32301153612629024, "grad_norm": 1.3502074480056763, "learning_rate": 0.000152964336235456, "loss": 0.2624, "step": 1596 }, { "epoch": 0.3232139243068205, "grad_norm": 0.22510403394699097, "learning_rate": 0.00015291033922519456, "loss": 0.2095, "step": 1597 }, { "epoch": 0.32341631248735075, "grad_norm": 0.32942527532577515, "learning_rate": 0.00015285632078158718, "loss": 0.2357, "step": 1598 }, { "epoch": 0.323618700667881, "grad_norm": 0.2801137864589691, "learning_rate": 0.00015280228092651605, "loss": 0.2253, "step": 1599 }, { "epoch": 0.32382108884841126, "grad_norm": 0.3037624657154083, "learning_rate": 0.00015274821968187203, "loss": 0.2617, "step": 1600 }, { "epoch": 0.32382108884841126, "eval_loss": 0.29059848189353943, "eval_runtime": 1.32, "eval_samples_per_second": 3.788, "eval_steps_per_second": 0.758, "step": 1600 }, { "epoch": 0.3240234770289415, "grad_norm": 0.312289834022522, "learning_rate": 0.00015269413706955472, "loss": 0.2108, "step": 1601 }, { "epoch": 0.3242258652094718, "grad_norm": 0.40233370661735535, "learning_rate": 0.00015264003311147233, "loss": 0.2457, "step": 1602 }, { "epoch": 0.32442825339000203, "grad_norm": 0.3112492263317108, "learning_rate": 0.00015258590782954174, "loss": 0.2426, "step": 1603 }, { "epoch": 0.3246306415705323, "grad_norm": 0.2818208932876587, "learning_rate": 0.00015253176124568841, "loss": 0.234, "step": 1604 }, { "epoch": 0.32483302975106254, "grad_norm": 0.3278055787086487, "learning_rate": 0.00015247759338184652, "loss": 0.2547, "step": 1605 }, { "epoch": 0.3250354179315928, "grad_norm": 0.3111598491668701, "learning_rate": 0.00015242340425995884, "loss": 0.2681, "step": 1606 }, { "epoch": 0.32523780611212305, "grad_norm": 0.3168530762195587, "learning_rate": 0.0001523691939019767, "loss": 0.2145, "step": 1607 }, { "epoch": 0.3254401942926533, "grad_norm": 0.40210580825805664, "learning_rate": 0.00015231496232986005, "loss": 0.2162, "step": 1608 }, { "epoch": 0.32564258247318356, "grad_norm": 0.40183520317077637, "learning_rate": 0.00015226070956557747, "loss": 0.2417, "step": 1609 }, { "epoch": 0.3258449706537138, "grad_norm": 0.2815135419368744, "learning_rate": 0.00015220643563110614, "loss": 0.2415, "step": 1610 }, { "epoch": 0.32604735883424407, "grad_norm": 0.29655221104621887, "learning_rate": 0.00015215214054843174, "loss": 0.2455, "step": 1611 }, { "epoch": 0.3262497470147743, "grad_norm": 0.27248337864875793, "learning_rate": 0.00015209782433954857, "loss": 0.2289, "step": 1612 }, { "epoch": 0.3264521351953046, "grad_norm": 0.349803626537323, "learning_rate": 0.0001520434870264595, "loss": 0.2931, "step": 1613 }, { "epoch": 0.32665452337583484, "grad_norm": 0.29234540462493896, "learning_rate": 0.00015198912863117589, "loss": 0.263, "step": 1614 }, { "epoch": 0.3268569115563651, "grad_norm": 0.33855631947517395, "learning_rate": 0.00015193474917571772, "loss": 0.2547, "step": 1615 }, { "epoch": 0.32705929973689535, "grad_norm": 0.41110801696777344, "learning_rate": 0.0001518803486821134, "loss": 0.2418, "step": 1616 }, { "epoch": 0.3272616879174256, "grad_norm": 0.431369811296463, "learning_rate": 0.00015182592717239994, "loss": 0.266, "step": 1617 }, { "epoch": 0.32746407609795586, "grad_norm": 0.33998966217041016, "learning_rate": 0.00015177148466862284, "loss": 0.2039, "step": 1618 }, { "epoch": 0.3276664642784861, "grad_norm": 0.41198650002479553, "learning_rate": 0.00015171702119283617, "loss": 0.2304, "step": 1619 }, { "epoch": 0.32786885245901637, "grad_norm": 0.4760742485523224, "learning_rate": 0.00015166253676710234, "loss": 0.2894, "step": 1620 }, { "epoch": 0.3280712406395466, "grad_norm": 0.3359163999557495, "learning_rate": 0.00015160803141349243, "loss": 0.2429, "step": 1621 }, { "epoch": 0.32827362882007693, "grad_norm": 0.2576633393764496, "learning_rate": 0.00015155350515408582, "loss": 0.2244, "step": 1622 }, { "epoch": 0.3284760170006072, "grad_norm": 0.32722562551498413, "learning_rate": 0.0001514989580109705, "loss": 0.2481, "step": 1623 }, { "epoch": 0.32867840518113745, "grad_norm": 0.3234020173549652, "learning_rate": 0.00015144439000624283, "loss": 0.2627, "step": 1624 }, { "epoch": 0.3288807933616677, "grad_norm": 0.25320965051651, "learning_rate": 0.0001513898011620077, "loss": 0.231, "step": 1625 }, { "epoch": 0.32908318154219796, "grad_norm": 0.2784765362739563, "learning_rate": 0.00015133519150037838, "loss": 0.2502, "step": 1626 }, { "epoch": 0.3292855697227282, "grad_norm": 0.29173600673675537, "learning_rate": 0.0001512805610434766, "loss": 0.2243, "step": 1627 }, { "epoch": 0.32948795790325847, "grad_norm": 0.2985801696777344, "learning_rate": 0.00015122590981343247, "loss": 0.306, "step": 1628 }, { "epoch": 0.3296903460837887, "grad_norm": 0.3234592378139496, "learning_rate": 0.00015117123783238458, "loss": 0.2449, "step": 1629 }, { "epoch": 0.329892734264319, "grad_norm": 0.2717262804508209, "learning_rate": 0.0001511165451224799, "loss": 0.2318, "step": 1630 }, { "epoch": 0.33009512244484923, "grad_norm": 0.33839645981788635, "learning_rate": 0.00015106183170587376, "loss": 0.2765, "step": 1631 }, { "epoch": 0.3302975106253795, "grad_norm": 0.2860352396965027, "learning_rate": 0.00015100709760472992, "loss": 0.2412, "step": 1632 }, { "epoch": 0.33049989880590974, "grad_norm": 0.29939860105514526, "learning_rate": 0.00015095234284122057, "loss": 0.248, "step": 1633 }, { "epoch": 0.33070228698644, "grad_norm": 0.3044755160808563, "learning_rate": 0.00015089756743752615, "loss": 0.2608, "step": 1634 }, { "epoch": 0.33090467516697025, "grad_norm": 0.265420526266098, "learning_rate": 0.00015084277141583555, "loss": 0.2675, "step": 1635 }, { "epoch": 0.3311070633475005, "grad_norm": 0.349555641412735, "learning_rate": 0.0001507879547983459, "loss": 0.2911, "step": 1636 }, { "epoch": 0.33130945152803076, "grad_norm": 0.29959675669670105, "learning_rate": 0.00015073311760726289, "loss": 0.2554, "step": 1637 }, { "epoch": 0.331511839708561, "grad_norm": 0.3105815649032593, "learning_rate": 0.00015067825986480032, "loss": 0.2474, "step": 1638 }, { "epoch": 0.3317142278890913, "grad_norm": 0.3096911907196045, "learning_rate": 0.00015062338159318043, "loss": 0.2562, "step": 1639 }, { "epoch": 0.33191661606962153, "grad_norm": 0.2852685749530792, "learning_rate": 0.00015056848281463377, "loss": 0.2549, "step": 1640 }, { "epoch": 0.3321190042501518, "grad_norm": 0.4431990385055542, "learning_rate": 0.00015051356355139914, "loss": 0.2391, "step": 1641 }, { "epoch": 0.33232139243068204, "grad_norm": 0.36613190174102783, "learning_rate": 0.00015045862382572368, "loss": 0.2175, "step": 1642 }, { "epoch": 0.3325237806112123, "grad_norm": 0.3104919195175171, "learning_rate": 0.0001504036636598629, "loss": 0.2587, "step": 1643 }, { "epoch": 0.33272616879174255, "grad_norm": 0.2758963406085968, "learning_rate": 0.0001503486830760804, "loss": 0.2301, "step": 1644 }, { "epoch": 0.3329285569722728, "grad_norm": 0.3700515925884247, "learning_rate": 0.0001502936820966482, "loss": 0.2442, "step": 1645 }, { "epoch": 0.33313094515280306, "grad_norm": 0.2704278528690338, "learning_rate": 0.00015023866074384654, "loss": 0.2508, "step": 1646 }, { "epoch": 0.3333333333333333, "grad_norm": 0.23002755641937256, "learning_rate": 0.00015018361903996392, "loss": 0.2196, "step": 1647 }, { "epoch": 0.33353572151386357, "grad_norm": 0.3507612347602844, "learning_rate": 0.00015012855700729705, "loss": 0.2397, "step": 1648 }, { "epoch": 0.3337381096943938, "grad_norm": 0.43957728147506714, "learning_rate": 0.00015007347466815092, "loss": 0.2644, "step": 1649 }, { "epoch": 0.3339404978749241, "grad_norm": 0.5310310125350952, "learning_rate": 0.00015001837204483876, "loss": 0.2729, "step": 1650 }, { "epoch": 0.3339404978749241, "eval_loss": 0.2773481607437134, "eval_runtime": 1.316, "eval_samples_per_second": 3.799, "eval_steps_per_second": 0.76, "step": 1650 }, { "epoch": 0.33414288605545434, "grad_norm": 0.3015230596065521, "learning_rate": 0.00014996324915968193, "loss": 0.2285, "step": 1651 }, { "epoch": 0.3343452742359846, "grad_norm": 0.3256778419017792, "learning_rate": 0.00014990810603501004, "loss": 0.2529, "step": 1652 }, { "epoch": 0.3345476624165149, "grad_norm": 0.38299068808555603, "learning_rate": 0.00014985294269316096, "loss": 0.2547, "step": 1653 }, { "epoch": 0.33475005059704516, "grad_norm": 0.3158566355705261, "learning_rate": 0.00014979775915648072, "loss": 0.2498, "step": 1654 }, { "epoch": 0.3349524387775754, "grad_norm": 0.30741438269615173, "learning_rate": 0.00014974255544732343, "loss": 0.2274, "step": 1655 }, { "epoch": 0.33515482695810567, "grad_norm": 0.21585911512374878, "learning_rate": 0.0001496873315880515, "loss": 0.1887, "step": 1656 }, { "epoch": 0.3353572151386359, "grad_norm": 0.30536672472953796, "learning_rate": 0.00014963208760103544, "loss": 0.2577, "step": 1657 }, { "epoch": 0.3355596033191662, "grad_norm": 0.3961798846721649, "learning_rate": 0.000149576823508654, "loss": 0.2864, "step": 1658 }, { "epoch": 0.33576199149969643, "grad_norm": 0.2839348018169403, "learning_rate": 0.00014952153933329388, "loss": 0.2329, "step": 1659 }, { "epoch": 0.3359643796802267, "grad_norm": 0.29363957047462463, "learning_rate": 0.00014946623509735012, "loss": 0.2824, "step": 1660 }, { "epoch": 0.33616676786075694, "grad_norm": 0.2618269920349121, "learning_rate": 0.00014941091082322578, "loss": 0.2466, "step": 1661 }, { "epoch": 0.3363691560412872, "grad_norm": 0.2615951597690582, "learning_rate": 0.00014935556653333212, "loss": 0.2225, "step": 1662 }, { "epoch": 0.33657154422181745, "grad_norm": 0.2788453698158264, "learning_rate": 0.00014930020225008834, "loss": 0.2578, "step": 1663 }, { "epoch": 0.3367739324023477, "grad_norm": 0.2874954044818878, "learning_rate": 0.00014924481799592193, "loss": 0.2645, "step": 1664 }, { "epoch": 0.33697632058287796, "grad_norm": 0.32306531071662903, "learning_rate": 0.00014918941379326838, "loss": 0.2537, "step": 1665 }, { "epoch": 0.3371787087634082, "grad_norm": 0.29411888122558594, "learning_rate": 0.00014913398966457124, "loss": 0.2131, "step": 1666 }, { "epoch": 0.3373810969439385, "grad_norm": 0.3489333391189575, "learning_rate": 0.00014907854563228222, "loss": 0.2544, "step": 1667 }, { "epoch": 0.33758348512446873, "grad_norm": 0.3467283546924591, "learning_rate": 0.00014902308171886093, "loss": 0.2428, "step": 1668 }, { "epoch": 0.337785873304999, "grad_norm": 0.2710304856300354, "learning_rate": 0.00014896759794677525, "loss": 0.2427, "step": 1669 }, { "epoch": 0.33798826148552924, "grad_norm": 0.320618599653244, "learning_rate": 0.00014891209433850092, "loss": 0.2484, "step": 1670 }, { "epoch": 0.3381906496660595, "grad_norm": 0.34603026509284973, "learning_rate": 0.00014885657091652185, "loss": 0.2703, "step": 1671 }, { "epoch": 0.33839303784658975, "grad_norm": 0.3719444274902344, "learning_rate": 0.00014880102770332984, "loss": 0.2209, "step": 1672 }, { "epoch": 0.33859542602712, "grad_norm": 0.28150755167007446, "learning_rate": 0.00014874546472142485, "loss": 0.2318, "step": 1673 }, { "epoch": 0.33879781420765026, "grad_norm": 0.33083391189575195, "learning_rate": 0.00014868988199331473, "loss": 0.23, "step": 1674 }, { "epoch": 0.3390002023881805, "grad_norm": 0.2953188419342041, "learning_rate": 0.00014863427954151542, "loss": 0.2099, "step": 1675 }, { "epoch": 0.33920259056871077, "grad_norm": 0.3971119225025177, "learning_rate": 0.00014857865738855078, "loss": 0.2344, "step": 1676 }, { "epoch": 0.339404978749241, "grad_norm": 0.3236183226108551, "learning_rate": 0.00014852301555695266, "loss": 0.2492, "step": 1677 }, { "epoch": 0.3396073669297713, "grad_norm": 0.32918277382850647, "learning_rate": 0.000148467354069261, "loss": 0.2321, "step": 1678 }, { "epoch": 0.33980975511030154, "grad_norm": 0.3020811676979065, "learning_rate": 0.00014841167294802346, "loss": 0.2051, "step": 1679 }, { "epoch": 0.3400121432908318, "grad_norm": 0.22467920184135437, "learning_rate": 0.0001483559722157959, "loss": 0.2, "step": 1680 }, { "epoch": 0.34021453147136205, "grad_norm": 0.486819863319397, "learning_rate": 0.000148300251895142, "loss": 0.2376, "step": 1681 }, { "epoch": 0.3404169196518923, "grad_norm": 0.3118259608745575, "learning_rate": 0.0001482445120086334, "loss": 0.2688, "step": 1682 }, { "epoch": 0.3406193078324226, "grad_norm": 0.36272183060646057, "learning_rate": 0.00014818875257884958, "loss": 0.2429, "step": 1683 }, { "epoch": 0.34082169601295287, "grad_norm": 0.304829478263855, "learning_rate": 0.00014813297362837815, "loss": 0.2292, "step": 1684 }, { "epoch": 0.3410240841934831, "grad_norm": 0.30455195903778076, "learning_rate": 0.00014807717517981438, "loss": 0.2331, "step": 1685 }, { "epoch": 0.3412264723740134, "grad_norm": 0.24466173350811005, "learning_rate": 0.00014802135725576165, "loss": 0.239, "step": 1686 }, { "epoch": 0.34142886055454363, "grad_norm": 0.3234693109989166, "learning_rate": 0.0001479655198788311, "loss": 0.2449, "step": 1687 }, { "epoch": 0.3416312487350739, "grad_norm": 0.23751352727413177, "learning_rate": 0.00014790966307164172, "loss": 0.2202, "step": 1688 }, { "epoch": 0.34183363691560414, "grad_norm": 0.27076348662376404, "learning_rate": 0.00014785378685682054, "loss": 0.2434, "step": 1689 }, { "epoch": 0.3420360250961344, "grad_norm": 0.23964683711528778, "learning_rate": 0.00014779789125700225, "loss": 0.196, "step": 1690 }, { "epoch": 0.34223841327666465, "grad_norm": 0.2706647217273712, "learning_rate": 0.00014774197629482957, "loss": 0.2552, "step": 1691 }, { "epoch": 0.3424408014571949, "grad_norm": 0.39059948921203613, "learning_rate": 0.00014768604199295293, "loss": 0.2658, "step": 1692 }, { "epoch": 0.34264318963772517, "grad_norm": 0.24331116676330566, "learning_rate": 0.0001476300883740307, "loss": 0.1933, "step": 1693 }, { "epoch": 0.3428455778182554, "grad_norm": 0.23481175303459167, "learning_rate": 0.00014757411546072896, "loss": 0.2289, "step": 1694 }, { "epoch": 0.3430479659987857, "grad_norm": 0.33342844247817993, "learning_rate": 0.00014751812327572168, "loss": 0.2494, "step": 1695 }, { "epoch": 0.34325035417931593, "grad_norm": 0.32238465547561646, "learning_rate": 0.00014746211184169065, "loss": 0.221, "step": 1696 }, { "epoch": 0.3434527423598462, "grad_norm": 0.25494661927223206, "learning_rate": 0.00014740608118132543, "loss": 0.2315, "step": 1697 }, { "epoch": 0.34365513054037644, "grad_norm": 0.43517234921455383, "learning_rate": 0.00014735003131732336, "loss": 0.2445, "step": 1698 }, { "epoch": 0.3438575187209067, "grad_norm": 0.28439968824386597, "learning_rate": 0.0001472939622723896, "loss": 0.2347, "step": 1699 }, { "epoch": 0.34405990690143695, "grad_norm": 0.24379862844944, "learning_rate": 0.00014723787406923698, "loss": 0.1973, "step": 1700 }, { "epoch": 0.34405990690143695, "eval_loss": 0.285430371761322, "eval_runtime": 1.3224, "eval_samples_per_second": 3.781, "eval_steps_per_second": 0.756, "step": 1700 }, { "epoch": 0.3442622950819672, "grad_norm": 0.2594735324382782, "learning_rate": 0.00014718176673058625, "loss": 0.2583, "step": 1701 }, { "epoch": 0.34446468326249746, "grad_norm": 0.2564697861671448, "learning_rate": 0.00014712564027916574, "loss": 0.2219, "step": 1702 }, { "epoch": 0.3446670714430277, "grad_norm": 0.2669238746166229, "learning_rate": 0.00014706949473771165, "loss": 0.2322, "step": 1703 }, { "epoch": 0.344869459623558, "grad_norm": 0.30889827013015747, "learning_rate": 0.00014701333012896787, "loss": 0.24, "step": 1704 }, { "epoch": 0.3450718478040882, "grad_norm": 0.35292789340019226, "learning_rate": 0.000146957146475686, "loss": 0.2317, "step": 1705 }, { "epoch": 0.3452742359846185, "grad_norm": 0.37351149320602417, "learning_rate": 0.00014690094380062535, "loss": 0.248, "step": 1706 }, { "epoch": 0.34547662416514874, "grad_norm": 0.25748443603515625, "learning_rate": 0.00014684472212655298, "loss": 0.2968, "step": 1707 }, { "epoch": 0.345679012345679, "grad_norm": 0.2323027402162552, "learning_rate": 0.0001467884814762436, "loss": 0.2227, "step": 1708 }, { "epoch": 0.34588140052620925, "grad_norm": 0.33540675044059753, "learning_rate": 0.00014673222187247965, "loss": 0.2679, "step": 1709 }, { "epoch": 0.3460837887067395, "grad_norm": 0.3130699098110199, "learning_rate": 0.0001466759433380512, "loss": 0.2097, "step": 1710 }, { "epoch": 0.34628617688726976, "grad_norm": 0.31104639172554016, "learning_rate": 0.00014661964589575597, "loss": 0.2583, "step": 1711 }, { "epoch": 0.3464885650678, "grad_norm": 0.25682950019836426, "learning_rate": 0.00014656332956839948, "loss": 0.2409, "step": 1712 }, { "epoch": 0.3466909532483303, "grad_norm": 0.3116452097892761, "learning_rate": 0.00014650699437879477, "loss": 0.2497, "step": 1713 }, { "epoch": 0.3468933414288606, "grad_norm": 0.3444649875164032, "learning_rate": 0.00014645064034976245, "loss": 0.2431, "step": 1714 }, { "epoch": 0.34709572960939084, "grad_norm": 0.2818754017353058, "learning_rate": 0.000146394267504131, "loss": 0.2112, "step": 1715 }, { "epoch": 0.3472981177899211, "grad_norm": 0.39119580388069153, "learning_rate": 0.00014633787586473633, "loss": 0.26, "step": 1716 }, { "epoch": 0.34750050597045135, "grad_norm": 0.23993800580501556, "learning_rate": 0.000146281465454422, "loss": 0.2255, "step": 1717 }, { "epoch": 0.3477028941509816, "grad_norm": 0.43539923429489136, "learning_rate": 0.00014622503629603924, "loss": 0.2566, "step": 1718 }, { "epoch": 0.34790528233151186, "grad_norm": 0.40598592162132263, "learning_rate": 0.0001461685884124468, "loss": 0.2612, "step": 1719 }, { "epoch": 0.3481076705120421, "grad_norm": 0.2584461569786072, "learning_rate": 0.00014611212182651098, "loss": 0.2326, "step": 1720 }, { "epoch": 0.34831005869257237, "grad_norm": 0.2962055802345276, "learning_rate": 0.00014605563656110583, "loss": 0.2455, "step": 1721 }, { "epoch": 0.3485124468731026, "grad_norm": 0.42686739563941956, "learning_rate": 0.0001459991326391128, "loss": 0.2771, "step": 1722 }, { "epoch": 0.3487148350536329, "grad_norm": 0.4981527626514435, "learning_rate": 0.00014594261008342102, "loss": 0.2218, "step": 1723 }, { "epoch": 0.34891722323416313, "grad_norm": 0.2726820111274719, "learning_rate": 0.00014588606891692695, "loss": 0.2308, "step": 1724 }, { "epoch": 0.3491196114146934, "grad_norm": 0.2908082604408264, "learning_rate": 0.00014582950916253488, "loss": 0.244, "step": 1725 }, { "epoch": 0.34932199959522364, "grad_norm": 0.3046623170375824, "learning_rate": 0.00014577293084315645, "loss": 0.2051, "step": 1726 }, { "epoch": 0.3495243877757539, "grad_norm": 0.35965415835380554, "learning_rate": 0.00014571633398171083, "loss": 0.2529, "step": 1727 }, { "epoch": 0.34972677595628415, "grad_norm": 0.2797194719314575, "learning_rate": 0.00014565971860112478, "loss": 0.2569, "step": 1728 }, { "epoch": 0.3499291641368144, "grad_norm": 0.33128151297569275, "learning_rate": 0.00014560308472433246, "loss": 0.2353, "step": 1729 }, { "epoch": 0.35013155231734466, "grad_norm": 0.29469165205955505, "learning_rate": 0.00014554643237427563, "loss": 0.2274, "step": 1730 }, { "epoch": 0.3503339404978749, "grad_norm": 0.3316969573497772, "learning_rate": 0.00014548976157390347, "loss": 0.2475, "step": 1731 }, { "epoch": 0.3505363286784052, "grad_norm": 0.3903247117996216, "learning_rate": 0.0001454330723461726, "loss": 0.2226, "step": 1732 }, { "epoch": 0.35073871685893543, "grad_norm": 0.3155892491340637, "learning_rate": 0.0001453763647140472, "loss": 0.2437, "step": 1733 }, { "epoch": 0.3509411050394657, "grad_norm": 0.27760666608810425, "learning_rate": 0.00014531963870049883, "loss": 0.233, "step": 1734 }, { "epoch": 0.35114349321999594, "grad_norm": 0.3064099848270416, "learning_rate": 0.0001452628943285065, "loss": 0.2652, "step": 1735 }, { "epoch": 0.3513458814005262, "grad_norm": 0.28650030493736267, "learning_rate": 0.00014520613162105674, "loss": 0.2515, "step": 1736 }, { "epoch": 0.35154826958105645, "grad_norm": 0.27437010407447815, "learning_rate": 0.0001451493506011434, "loss": 0.2486, "step": 1737 }, { "epoch": 0.3517506577615867, "grad_norm": 0.23382247984409332, "learning_rate": 0.00014509255129176776, "loss": 0.2313, "step": 1738 }, { "epoch": 0.35195304594211696, "grad_norm": 0.2675359547138214, "learning_rate": 0.00014503573371593863, "loss": 0.2213, "step": 1739 }, { "epoch": 0.3521554341226472, "grad_norm": 0.30067166686058044, "learning_rate": 0.00014497889789667203, "loss": 0.2589, "step": 1740 }, { "epoch": 0.35235782230317747, "grad_norm": 0.33091431856155396, "learning_rate": 0.00014492204385699154, "loss": 0.2624, "step": 1741 }, { "epoch": 0.3525602104837077, "grad_norm": 0.25968363881111145, "learning_rate": 0.00014486517161992803, "loss": 0.2438, "step": 1742 }, { "epoch": 0.35276259866423804, "grad_norm": 0.28150394558906555, "learning_rate": 0.00014480828120851978, "loss": 0.235, "step": 1743 }, { "epoch": 0.3529649868447683, "grad_norm": 0.3035571873188019, "learning_rate": 0.0001447513726458124, "loss": 0.2506, "step": 1744 }, { "epoch": 0.35316737502529855, "grad_norm": 0.5170474052429199, "learning_rate": 0.00014469444595485885, "loss": 0.2546, "step": 1745 }, { "epoch": 0.3533697632058288, "grad_norm": 0.28294914960861206, "learning_rate": 0.00014463750115871948, "loss": 0.2257, "step": 1746 }, { "epoch": 0.35357215138635906, "grad_norm": 0.22720618546009064, "learning_rate": 0.00014458053828046195, "loss": 0.2223, "step": 1747 }, { "epoch": 0.3537745395668893, "grad_norm": 0.2560993432998657, "learning_rate": 0.00014452355734316123, "loss": 0.2545, "step": 1748 }, { "epoch": 0.35397692774741957, "grad_norm": 0.28795984387397766, "learning_rate": 0.0001444665583698996, "loss": 0.2427, "step": 1749 }, { "epoch": 0.3541793159279498, "grad_norm": 0.30775079131126404, "learning_rate": 0.0001444095413837667, "loss": 0.257, "step": 1750 }, { "epoch": 0.3541793159279498, "eval_loss": 0.2792920768260956, "eval_runtime": 1.3185, "eval_samples_per_second": 3.792, "eval_steps_per_second": 0.758, "step": 1750 }, { "epoch": 0.3543817041084801, "grad_norm": 0.25298476219177246, "learning_rate": 0.0001443525064078594, "loss": 0.2359, "step": 1751 }, { "epoch": 0.35458409228901033, "grad_norm": 0.29665857553482056, "learning_rate": 0.00014429545346528189, "loss": 0.2682, "step": 1752 }, { "epoch": 0.3547864804695406, "grad_norm": 0.24088400602340698, "learning_rate": 0.00014423838257914565, "loss": 0.2455, "step": 1753 }, { "epoch": 0.35498886865007084, "grad_norm": 0.30943191051483154, "learning_rate": 0.0001441812937725694, "loss": 0.2562, "step": 1754 }, { "epoch": 0.3551912568306011, "grad_norm": 0.2571665644645691, "learning_rate": 0.00014412418706867913, "loss": 0.216, "step": 1755 }, { "epoch": 0.35539364501113135, "grad_norm": 0.28411421179771423, "learning_rate": 0.0001440670624906081, "loss": 0.2473, "step": 1756 }, { "epoch": 0.3555960331916616, "grad_norm": 0.2374887317419052, "learning_rate": 0.00014400992006149673, "loss": 0.2299, "step": 1757 }, { "epoch": 0.35579842137219186, "grad_norm": 0.31073081493377686, "learning_rate": 0.00014395275980449288, "loss": 0.2651, "step": 1758 }, { "epoch": 0.3560008095527221, "grad_norm": 0.3543723225593567, "learning_rate": 0.00014389558174275133, "loss": 0.2508, "step": 1759 }, { "epoch": 0.3562031977332524, "grad_norm": 0.264026015996933, "learning_rate": 0.00014383838589943432, "loss": 0.2325, "step": 1760 }, { "epoch": 0.35640558591378263, "grad_norm": 0.3232871890068054, "learning_rate": 0.0001437811722977112, "loss": 0.2602, "step": 1761 }, { "epoch": 0.3566079740943129, "grad_norm": 0.25500282645225525, "learning_rate": 0.00014372394096075845, "loss": 0.1986, "step": 1762 }, { "epoch": 0.35681036227484314, "grad_norm": 0.2526322603225708, "learning_rate": 0.00014366669191175988, "loss": 0.2191, "step": 1763 }, { "epoch": 0.3570127504553734, "grad_norm": 0.3108225166797638, "learning_rate": 0.00014360942517390634, "loss": 0.2666, "step": 1764 }, { "epoch": 0.35721513863590365, "grad_norm": 0.40216559171676636, "learning_rate": 0.0001435521407703959, "loss": 0.2502, "step": 1765 }, { "epoch": 0.3574175268164339, "grad_norm": 0.2603747248649597, "learning_rate": 0.00014349483872443385, "loss": 0.2444, "step": 1766 }, { "epoch": 0.35761991499696416, "grad_norm": 0.46096399426460266, "learning_rate": 0.00014343751905923253, "loss": 0.2419, "step": 1767 }, { "epoch": 0.3578223031774944, "grad_norm": 0.29754045605659485, "learning_rate": 0.00014338018179801143, "loss": 0.2268, "step": 1768 }, { "epoch": 0.35802469135802467, "grad_norm": 0.3793618977069855, "learning_rate": 0.00014332282696399718, "loss": 0.2466, "step": 1769 }, { "epoch": 0.3582270795385549, "grad_norm": 0.2544354498386383, "learning_rate": 0.00014326545458042357, "loss": 0.2594, "step": 1770 }, { "epoch": 0.3584294677190852, "grad_norm": 0.28658121824264526, "learning_rate": 0.00014320806467053146, "loss": 0.2721, "step": 1771 }, { "epoch": 0.35863185589961544, "grad_norm": 0.2477906346321106, "learning_rate": 0.00014315065725756886, "loss": 0.2083, "step": 1772 }, { "epoch": 0.3588342440801457, "grad_norm": 0.3015955686569214, "learning_rate": 0.00014309323236479072, "loss": 0.2825, "step": 1773 }, { "epoch": 0.359036632260676, "grad_norm": 0.393539696931839, "learning_rate": 0.00014303579001545926, "loss": 0.2451, "step": 1774 }, { "epoch": 0.35923902044120626, "grad_norm": 0.2812289595603943, "learning_rate": 0.00014297833023284366, "loss": 0.2323, "step": 1775 }, { "epoch": 0.3594414086217365, "grad_norm": 0.24229012429714203, "learning_rate": 0.0001429208530402202, "loss": 0.2395, "step": 1776 }, { "epoch": 0.35964379680226677, "grad_norm": 0.3017047345638275, "learning_rate": 0.0001428633584608722, "loss": 0.2525, "step": 1777 }, { "epoch": 0.359846184982797, "grad_norm": 0.2679601013660431, "learning_rate": 0.00014280584651809003, "loss": 0.2207, "step": 1778 }, { "epoch": 0.3600485731633273, "grad_norm": 0.2501794695854187, "learning_rate": 0.0001427483172351711, "loss": 0.2264, "step": 1779 }, { "epoch": 0.36025096134385753, "grad_norm": 0.2744043469429016, "learning_rate": 0.00014269077063541982, "loss": 0.2337, "step": 1780 }, { "epoch": 0.3604533495243878, "grad_norm": 0.2627145051956177, "learning_rate": 0.00014263320674214763, "loss": 0.2584, "step": 1781 }, { "epoch": 0.36065573770491804, "grad_norm": 0.23826229572296143, "learning_rate": 0.000142575625578673, "loss": 0.219, "step": 1782 }, { "epoch": 0.3608581258854483, "grad_norm": 0.2856881618499756, "learning_rate": 0.0001425180271683213, "loss": 0.2478, "step": 1783 }, { "epoch": 0.36106051406597856, "grad_norm": 0.2626444101333618, "learning_rate": 0.00014246041153442504, "loss": 0.2556, "step": 1784 }, { "epoch": 0.3612629022465088, "grad_norm": 0.3405790627002716, "learning_rate": 0.00014240277870032362, "loss": 0.2288, "step": 1785 }, { "epoch": 0.36146529042703907, "grad_norm": 0.3541911840438843, "learning_rate": 0.0001423451286893633, "loss": 0.2364, "step": 1786 }, { "epoch": 0.3616676786075693, "grad_norm": 0.2727629840373993, "learning_rate": 0.0001422874615248976, "loss": 0.2541, "step": 1787 }, { "epoch": 0.3618700667880996, "grad_norm": 0.304404616355896, "learning_rate": 0.00014222977723028662, "loss": 0.249, "step": 1788 }, { "epoch": 0.36207245496862983, "grad_norm": 0.2847534716129303, "learning_rate": 0.0001421720758288977, "loss": 0.2196, "step": 1789 }, { "epoch": 0.3622748431491601, "grad_norm": 0.23090103268623352, "learning_rate": 0.0001421143573441049, "loss": 0.2247, "step": 1790 }, { "epoch": 0.36247723132969034, "grad_norm": 0.4167442321777344, "learning_rate": 0.00014205662179928935, "loss": 0.2411, "step": 1791 }, { "epoch": 0.3626796195102206, "grad_norm": 0.331055611371994, "learning_rate": 0.00014199886921783901, "loss": 0.2583, "step": 1792 }, { "epoch": 0.36288200769075085, "grad_norm": 0.3085455894470215, "learning_rate": 0.00014194109962314874, "loss": 0.2492, "step": 1793 }, { "epoch": 0.3630843958712811, "grad_norm": 0.24449102580547333, "learning_rate": 0.00014188331303862034, "loss": 0.2508, "step": 1794 }, { "epoch": 0.36328678405181136, "grad_norm": 0.3192913830280304, "learning_rate": 0.0001418255094876625, "loss": 0.2305, "step": 1795 }, { "epoch": 0.3634891722323416, "grad_norm": 0.282790869474411, "learning_rate": 0.00014176768899369063, "loss": 0.2542, "step": 1796 }, { "epoch": 0.3636915604128719, "grad_norm": 0.26583266258239746, "learning_rate": 0.00014170985158012725, "loss": 0.2523, "step": 1797 }, { "epoch": 0.36389394859340213, "grad_norm": 0.2712319791316986, "learning_rate": 0.00014165199727040153, "loss": 0.2678, "step": 1798 }, { "epoch": 0.3640963367739324, "grad_norm": 0.2928764820098877, "learning_rate": 0.00014159412608794956, "loss": 0.2584, "step": 1799 }, { "epoch": 0.36429872495446264, "grad_norm": 0.3224334120750427, "learning_rate": 0.0001415362380562143, "loss": 0.2106, "step": 1800 }, { "epoch": 0.36429872495446264, "eval_loss": 0.28134655952453613, "eval_runtime": 1.3178, "eval_samples_per_second": 3.794, "eval_steps_per_second": 0.759, "step": 1800 }, { "epoch": 0.3645011131349929, "grad_norm": 0.2981015145778656, "learning_rate": 0.00014147833319864546, "loss": 0.2363, "step": 1801 }, { "epoch": 0.36470350131552315, "grad_norm": 0.23247891664505005, "learning_rate": 0.00014142041153869967, "loss": 0.2105, "step": 1802 }, { "epoch": 0.3649058894960534, "grad_norm": 0.2946484386920929, "learning_rate": 0.00014136247309984022, "loss": 0.2562, "step": 1803 }, { "epoch": 0.3651082776765837, "grad_norm": 0.27451181411743164, "learning_rate": 0.00014130451790553733, "loss": 0.2383, "step": 1804 }, { "epoch": 0.36531066585711397, "grad_norm": 0.29219532012939453, "learning_rate": 0.00014124654597926794, "loss": 0.2459, "step": 1805 }, { "epoch": 0.3655130540376442, "grad_norm": 0.2635502219200134, "learning_rate": 0.00014118855734451583, "loss": 0.2416, "step": 1806 }, { "epoch": 0.3657154422181745, "grad_norm": 0.2339702993631363, "learning_rate": 0.00014113055202477138, "loss": 0.1961, "step": 1807 }, { "epoch": 0.36591783039870474, "grad_norm": 0.31288042664527893, "learning_rate": 0.00014107253004353202, "loss": 0.2404, "step": 1808 }, { "epoch": 0.366120218579235, "grad_norm": 0.32711726427078247, "learning_rate": 0.00014101449142430166, "loss": 0.2353, "step": 1809 }, { "epoch": 0.36632260675976525, "grad_norm": 0.3344823718070984, "learning_rate": 0.00014095643619059102, "loss": 0.2538, "step": 1810 }, { "epoch": 0.3665249949402955, "grad_norm": 0.2926856577396393, "learning_rate": 0.00014089836436591768, "loss": 0.2552, "step": 1811 }, { "epoch": 0.36672738312082576, "grad_norm": 0.19439418613910675, "learning_rate": 0.00014084027597380575, "loss": 0.1863, "step": 1812 }, { "epoch": 0.366929771301356, "grad_norm": 0.23226851224899292, "learning_rate": 0.0001407821710377862, "loss": 0.2409, "step": 1813 }, { "epoch": 0.36713215948188627, "grad_norm": 0.3362860679626465, "learning_rate": 0.0001407240495813966, "loss": 0.254, "step": 1814 }, { "epoch": 0.3673345476624165, "grad_norm": 0.2831605076789856, "learning_rate": 0.00014066591162818133, "loss": 0.2119, "step": 1815 }, { "epoch": 0.3675369358429468, "grad_norm": 0.31574276089668274, "learning_rate": 0.00014060775720169133, "loss": 0.2105, "step": 1816 }, { "epoch": 0.36773932402347703, "grad_norm": 0.2928045690059662, "learning_rate": 0.00014054958632548424, "loss": 0.1894, "step": 1817 }, { "epoch": 0.3679417122040073, "grad_norm": 0.32635268568992615, "learning_rate": 0.00014049139902312443, "loss": 0.2455, "step": 1818 }, { "epoch": 0.36814410038453754, "grad_norm": 0.29648733139038086, "learning_rate": 0.00014043319531818285, "loss": 0.2515, "step": 1819 }, { "epoch": 0.3683464885650678, "grad_norm": 0.36445343494415283, "learning_rate": 0.00014037497523423716, "loss": 0.2626, "step": 1820 }, { "epoch": 0.36854887674559805, "grad_norm": 0.33038073778152466, "learning_rate": 0.0001403167387948716, "loss": 0.2682, "step": 1821 }, { "epoch": 0.3687512649261283, "grad_norm": 0.30535727739334106, "learning_rate": 0.00014025848602367707, "loss": 0.2271, "step": 1822 }, { "epoch": 0.36895365310665856, "grad_norm": 0.2592299282550812, "learning_rate": 0.00014020021694425102, "loss": 0.2331, "step": 1823 }, { "epoch": 0.3691560412871888, "grad_norm": 0.33052313327789307, "learning_rate": 0.00014014193158019763, "loss": 0.2491, "step": 1824 }, { "epoch": 0.3693584294677191, "grad_norm": 0.2506415843963623, "learning_rate": 0.00014008362995512753, "loss": 0.2404, "step": 1825 }, { "epoch": 0.36956081764824933, "grad_norm": 0.28076010942459106, "learning_rate": 0.0001400253120926581, "loss": 0.2163, "step": 1826 }, { "epoch": 0.3697632058287796, "grad_norm": 0.32988500595092773, "learning_rate": 0.00013996697801641313, "loss": 0.2499, "step": 1827 }, { "epoch": 0.36996559400930984, "grad_norm": 0.27228912711143494, "learning_rate": 0.00013990862775002308, "loss": 0.2468, "step": 1828 }, { "epoch": 0.3701679821898401, "grad_norm": 0.2936830222606659, "learning_rate": 0.000139850261317125, "loss": 0.2715, "step": 1829 }, { "epoch": 0.37037037037037035, "grad_norm": 0.3024163842201233, "learning_rate": 0.00013979187874136234, "loss": 0.2567, "step": 1830 }, { "epoch": 0.3705727585509006, "grad_norm": 0.28606948256492615, "learning_rate": 0.0001397334800463852, "loss": 0.2303, "step": 1831 }, { "epoch": 0.37077514673143086, "grad_norm": 0.2841251790523529, "learning_rate": 0.0001396750652558503, "loss": 0.2214, "step": 1832 }, { "epoch": 0.3709775349119611, "grad_norm": 0.4481494426727295, "learning_rate": 0.00013961663439342068, "loss": 0.2702, "step": 1833 }, { "epoch": 0.3711799230924914, "grad_norm": 0.2626919448375702, "learning_rate": 0.00013955818748276593, "loss": 0.2302, "step": 1834 }, { "epoch": 0.3713823112730217, "grad_norm": 0.277553528547287, "learning_rate": 0.00013949972454756234, "loss": 0.254, "step": 1835 }, { "epoch": 0.37158469945355194, "grad_norm": 0.2988859713077545, "learning_rate": 0.00013944124561149246, "loss": 0.2548, "step": 1836 }, { "epoch": 0.3717870876340822, "grad_norm": 0.34181922674179077, "learning_rate": 0.0001393827506982454, "loss": 0.2609, "step": 1837 }, { "epoch": 0.37198947581461245, "grad_norm": 0.260486364364624, "learning_rate": 0.00013932423983151678, "loss": 0.2269, "step": 1838 }, { "epoch": 0.3721918639951427, "grad_norm": 0.31525251269340515, "learning_rate": 0.00013926571303500866, "loss": 0.2814, "step": 1839 }, { "epoch": 0.37239425217567296, "grad_norm": 0.27101269364356995, "learning_rate": 0.00013920717033242953, "loss": 0.2135, "step": 1840 }, { "epoch": 0.3725966403562032, "grad_norm": 0.4067181348800659, "learning_rate": 0.00013914861174749429, "loss": 0.2677, "step": 1841 }, { "epoch": 0.37279902853673347, "grad_norm": 0.3725970983505249, "learning_rate": 0.0001390900373039244, "loss": 0.2507, "step": 1842 }, { "epoch": 0.3730014167172637, "grad_norm": 0.30077874660491943, "learning_rate": 0.00013903144702544766, "loss": 0.2085, "step": 1843 }, { "epoch": 0.373203804897794, "grad_norm": 0.3246167302131653, "learning_rate": 0.00013897284093579825, "loss": 0.2696, "step": 1844 }, { "epoch": 0.37340619307832423, "grad_norm": 0.2538074553012848, "learning_rate": 0.0001389142190587168, "loss": 0.263, "step": 1845 }, { "epoch": 0.3736085812588545, "grad_norm": 0.2918241620063782, "learning_rate": 0.0001388555814179504, "loss": 0.2453, "step": 1846 }, { "epoch": 0.37381096943938474, "grad_norm": 0.3601178526878357, "learning_rate": 0.00013879692803725236, "loss": 0.2876, "step": 1847 }, { "epoch": 0.374013357619915, "grad_norm": 0.26595860719680786, "learning_rate": 0.0001387382589403825, "loss": 0.2389, "step": 1848 }, { "epoch": 0.37421574580044525, "grad_norm": 0.26432713866233826, "learning_rate": 0.00013867957415110698, "loss": 0.2302, "step": 1849 }, { "epoch": 0.3744181339809755, "grad_norm": 0.26036038994789124, "learning_rate": 0.00013862087369319833, "loss": 0.2432, "step": 1850 }, { "epoch": 0.3744181339809755, "eval_loss": 0.27562105655670166, "eval_runtime": 1.323, "eval_samples_per_second": 3.779, "eval_steps_per_second": 0.756, "step": 1850 }, { "epoch": 0.37462052216150576, "grad_norm": 0.24740183353424072, "learning_rate": 0.00013856215759043536, "loss": 0.2093, "step": 1851 }, { "epoch": 0.374822910342036, "grad_norm": 0.27319031953811646, "learning_rate": 0.00013850342586660327, "loss": 0.2671, "step": 1852 }, { "epoch": 0.3750252985225663, "grad_norm": 0.3029322326183319, "learning_rate": 0.0001384446785454936, "loss": 0.2274, "step": 1853 }, { "epoch": 0.37522768670309653, "grad_norm": 0.32145076990127563, "learning_rate": 0.00013838591565090423, "loss": 0.2396, "step": 1854 }, { "epoch": 0.3754300748836268, "grad_norm": 0.31832215189933777, "learning_rate": 0.00013832713720663916, "loss": 0.2439, "step": 1855 }, { "epoch": 0.37563246306415704, "grad_norm": 0.7246808409690857, "learning_rate": 0.000138268343236509, "loss": 0.213, "step": 1856 }, { "epoch": 0.3758348512446873, "grad_norm": 0.2564253807067871, "learning_rate": 0.0001382095337643304, "loss": 0.2608, "step": 1857 }, { "epoch": 0.37603723942521755, "grad_norm": 0.2714241147041321, "learning_rate": 0.00013815070881392633, "loss": 0.2067, "step": 1858 }, { "epoch": 0.3762396276057478, "grad_norm": 0.30548515915870667, "learning_rate": 0.0001380918684091262, "loss": 0.2287, "step": 1859 }, { "epoch": 0.37644201578627806, "grad_norm": 0.3033727705478668, "learning_rate": 0.00013803301257376543, "loss": 0.2472, "step": 1860 }, { "epoch": 0.3766444039668083, "grad_norm": 0.3225138187408447, "learning_rate": 0.0001379741413316859, "loss": 0.2434, "step": 1861 }, { "epoch": 0.37684679214733857, "grad_norm": 0.4423466920852661, "learning_rate": 0.00013791525470673564, "loss": 0.2388, "step": 1862 }, { "epoch": 0.3770491803278688, "grad_norm": 0.31593412160873413, "learning_rate": 0.00013785635272276886, "loss": 0.2451, "step": 1863 }, { "epoch": 0.3772515685083991, "grad_norm": 0.25586557388305664, "learning_rate": 0.0001377974354036461, "loss": 0.2368, "step": 1864 }, { "epoch": 0.3774539566889294, "grad_norm": 0.3190286457538605, "learning_rate": 0.00013773850277323406, "loss": 0.2579, "step": 1865 }, { "epoch": 0.37765634486945965, "grad_norm": 0.2932840585708618, "learning_rate": 0.0001376795548554056, "loss": 0.2412, "step": 1866 }, { "epoch": 0.3778587330499899, "grad_norm": 0.25149938464164734, "learning_rate": 0.0001376205916740399, "loss": 0.2302, "step": 1867 }, { "epoch": 0.37806112123052016, "grad_norm": 0.483714759349823, "learning_rate": 0.00013756161325302211, "loss": 0.2386, "step": 1868 }, { "epoch": 0.3782635094110504, "grad_norm": 0.3608554005622864, "learning_rate": 0.00013750261961624382, "loss": 0.2837, "step": 1869 }, { "epoch": 0.37846589759158067, "grad_norm": 0.30802130699157715, "learning_rate": 0.0001374436107876026, "loss": 0.247, "step": 1870 }, { "epoch": 0.3786682857721109, "grad_norm": 0.31632187962532043, "learning_rate": 0.00013738458679100218, "loss": 0.2589, "step": 1871 }, { "epoch": 0.3788706739526412, "grad_norm": 0.3298904299736023, "learning_rate": 0.0001373255476503525, "loss": 0.233, "step": 1872 }, { "epoch": 0.37907306213317143, "grad_norm": 0.29948553442955017, "learning_rate": 0.0001372664933895696, "loss": 0.2155, "step": 1873 }, { "epoch": 0.3792754503137017, "grad_norm": 0.33355945348739624, "learning_rate": 0.0001372074240325757, "loss": 0.2302, "step": 1874 }, { "epoch": 0.37947783849423194, "grad_norm": 0.398086279630661, "learning_rate": 0.00013714833960329906, "loss": 0.2305, "step": 1875 }, { "epoch": 0.3796802266747622, "grad_norm": 0.3670455515384674, "learning_rate": 0.00013708924012567408, "loss": 0.2633, "step": 1876 }, { "epoch": 0.37988261485529246, "grad_norm": 0.33737245202064514, "learning_rate": 0.00013703012562364124, "loss": 0.2513, "step": 1877 }, { "epoch": 0.3800850030358227, "grad_norm": 0.3446251451969147, "learning_rate": 0.00013697099612114714, "loss": 0.2862, "step": 1878 }, { "epoch": 0.38028739121635297, "grad_norm": 0.3114364743232727, "learning_rate": 0.0001369118516421444, "loss": 0.2717, "step": 1879 }, { "epoch": 0.3804897793968832, "grad_norm": 0.28015270829200745, "learning_rate": 0.0001368526922105918, "loss": 0.2663, "step": 1880 }, { "epoch": 0.3806921675774135, "grad_norm": 0.29030805826187134, "learning_rate": 0.00013679351785045408, "loss": 0.2587, "step": 1881 }, { "epoch": 0.38089455575794373, "grad_norm": 0.3173602819442749, "learning_rate": 0.00013673432858570198, "loss": 0.2683, "step": 1882 }, { "epoch": 0.381096943938474, "grad_norm": 0.26000502705574036, "learning_rate": 0.00013667512444031258, "loss": 0.2372, "step": 1883 }, { "epoch": 0.38129933211900424, "grad_norm": 0.2685737907886505, "learning_rate": 0.00013661590543826856, "loss": 0.2263, "step": 1884 }, { "epoch": 0.3815017202995345, "grad_norm": 0.2596903145313263, "learning_rate": 0.00013655667160355892, "loss": 0.233, "step": 1885 }, { "epoch": 0.38170410848006475, "grad_norm": 0.26874783635139465, "learning_rate": 0.0001364974229601786, "loss": 0.2837, "step": 1886 }, { "epoch": 0.381906496660595, "grad_norm": 0.27106040716171265, "learning_rate": 0.00013643815953212846, "loss": 0.2427, "step": 1887 }, { "epoch": 0.38210888484112526, "grad_norm": 0.28159913420677185, "learning_rate": 0.00013637888134341545, "loss": 0.2096, "step": 1888 }, { "epoch": 0.3823112730216555, "grad_norm": 0.2819752097129822, "learning_rate": 0.00013631958841805244, "loss": 0.2388, "step": 1889 }, { "epoch": 0.3825136612021858, "grad_norm": 0.2405043989419937, "learning_rate": 0.00013626028078005827, "loss": 0.2164, "step": 1890 }, { "epoch": 0.38271604938271603, "grad_norm": 0.3075813055038452, "learning_rate": 0.00013620095845345783, "loss": 0.2472, "step": 1891 }, { "epoch": 0.3829184375632463, "grad_norm": 0.38243961334228516, "learning_rate": 0.00013614162146228178, "loss": 0.2696, "step": 1892 }, { "epoch": 0.38312082574377654, "grad_norm": 0.3434121906757355, "learning_rate": 0.00013608226983056687, "loss": 0.2399, "step": 1893 }, { "epoch": 0.3833232139243068, "grad_norm": 0.27825695276260376, "learning_rate": 0.00013602290358235577, "loss": 0.2503, "step": 1894 }, { "epoch": 0.3835256021048371, "grad_norm": 0.2657754123210907, "learning_rate": 0.00013596352274169697, "loss": 0.2294, "step": 1895 }, { "epoch": 0.38372799028536736, "grad_norm": 0.27765652537345886, "learning_rate": 0.000135904127332645, "loss": 0.2378, "step": 1896 }, { "epoch": 0.3839303784658976, "grad_norm": 0.3239387571811676, "learning_rate": 0.00013584471737926017, "loss": 0.2595, "step": 1897 }, { "epoch": 0.38413276664642787, "grad_norm": 0.2365967184305191, "learning_rate": 0.00013578529290560884, "loss": 0.2047, "step": 1898 }, { "epoch": 0.3843351548269581, "grad_norm": 0.29406309127807617, "learning_rate": 0.00013572585393576304, "loss": 0.238, "step": 1899 }, { "epoch": 0.3845375430074884, "grad_norm": 0.32984691858291626, "learning_rate": 0.00013566640049380085, "loss": 0.2257, "step": 1900 }, { "epoch": 0.3845375430074884, "eval_loss": 0.27822864055633545, "eval_runtime": 1.3126, "eval_samples_per_second": 3.809, "eval_steps_per_second": 0.762, "step": 1900 }, { "epoch": 0.38473993118801864, "grad_norm": 0.3842868506908417, "learning_rate": 0.00013560693260380614, "loss": 0.2832, "step": 1901 }, { "epoch": 0.3849423193685489, "grad_norm": 0.2690775692462921, "learning_rate": 0.00013554745028986867, "loss": 0.2519, "step": 1902 }, { "epoch": 0.38514470754907915, "grad_norm": 0.36327075958251953, "learning_rate": 0.00013548795357608393, "loss": 0.2587, "step": 1903 }, { "epoch": 0.3853470957296094, "grad_norm": 0.42643454670906067, "learning_rate": 0.0001354284424865534, "loss": 0.2709, "step": 1904 }, { "epoch": 0.38554948391013966, "grad_norm": 0.376147985458374, "learning_rate": 0.00013536891704538431, "loss": 0.2334, "step": 1905 }, { "epoch": 0.3857518720906699, "grad_norm": 0.2361619621515274, "learning_rate": 0.00013530937727668967, "loss": 0.2517, "step": 1906 }, { "epoch": 0.38595426027120017, "grad_norm": 0.32191452383995056, "learning_rate": 0.00013524982320458836, "loss": 0.2449, "step": 1907 }, { "epoch": 0.3861566484517304, "grad_norm": 0.28632256388664246, "learning_rate": 0.00013519025485320498, "loss": 0.2286, "step": 1908 }, { "epoch": 0.3863590366322607, "grad_norm": 0.2994059920310974, "learning_rate": 0.00013513067224667, "loss": 0.2299, "step": 1909 }, { "epoch": 0.38656142481279093, "grad_norm": 0.32087379693984985, "learning_rate": 0.0001350710754091196, "loss": 0.2422, "step": 1910 }, { "epoch": 0.3867638129933212, "grad_norm": 0.2878233790397644, "learning_rate": 0.0001350114643646958, "loss": 0.2548, "step": 1911 }, { "epoch": 0.38696620117385144, "grad_norm": 0.35627928376197815, "learning_rate": 0.00013495183913754622, "loss": 0.2227, "step": 1912 }, { "epoch": 0.3871685893543817, "grad_norm": 0.3006799817085266, "learning_rate": 0.00013489219975182438, "loss": 0.2622, "step": 1913 }, { "epoch": 0.38737097753491195, "grad_norm": 0.2989097535610199, "learning_rate": 0.00013483254623168948, "loss": 0.2259, "step": 1914 }, { "epoch": 0.3875733657154422, "grad_norm": 0.34204626083374023, "learning_rate": 0.0001347728786013065, "loss": 0.2572, "step": 1915 }, { "epoch": 0.38777575389597246, "grad_norm": 0.31308454275131226, "learning_rate": 0.00013471319688484597, "loss": 0.2489, "step": 1916 }, { "epoch": 0.3879781420765027, "grad_norm": 0.32415515184402466, "learning_rate": 0.00013465350110648436, "loss": 0.2384, "step": 1917 }, { "epoch": 0.388180530257033, "grad_norm": 0.3070184290409088, "learning_rate": 0.00013459379129040366, "loss": 0.2638, "step": 1918 }, { "epoch": 0.38838291843756323, "grad_norm": 0.3244938552379608, "learning_rate": 0.00013453406746079157, "loss": 0.2256, "step": 1919 }, { "epoch": 0.3885853066180935, "grad_norm": 0.4286407232284546, "learning_rate": 0.00013447432964184153, "loss": 0.2394, "step": 1920 }, { "epoch": 0.38878769479862374, "grad_norm": 0.3179756999015808, "learning_rate": 0.00013441457785775266, "loss": 0.2153, "step": 1921 }, { "epoch": 0.388990082979154, "grad_norm": 0.2888988256454468, "learning_rate": 0.00013435481213272966, "loss": 0.2479, "step": 1922 }, { "epoch": 0.38919247115968425, "grad_norm": 0.2546892762184143, "learning_rate": 0.00013429503249098287, "loss": 0.2153, "step": 1923 }, { "epoch": 0.3893948593402145, "grad_norm": 0.368834912776947, "learning_rate": 0.00013423523895672834, "loss": 0.2482, "step": 1924 }, { "epoch": 0.3895972475207448, "grad_norm": 0.26037633419036865, "learning_rate": 0.00013417543155418775, "loss": 0.2197, "step": 1925 }, { "epoch": 0.38979963570127507, "grad_norm": 0.331725150346756, "learning_rate": 0.00013411561030758832, "loss": 0.2349, "step": 1926 }, { "epoch": 0.3900020238818053, "grad_norm": 0.25632625818252563, "learning_rate": 0.0001340557752411629, "loss": 0.1919, "step": 1927 }, { "epoch": 0.3902044120623356, "grad_norm": 0.25705817341804504, "learning_rate": 0.00013399592637915005, "loss": 0.2441, "step": 1928 }, { "epoch": 0.39040680024286584, "grad_norm": 0.2834140956401825, "learning_rate": 0.00013393606374579377, "loss": 0.2231, "step": 1929 }, { "epoch": 0.3906091884233961, "grad_norm": 0.302865207195282, "learning_rate": 0.00013387618736534365, "loss": 0.2496, "step": 1930 }, { "epoch": 0.39081157660392635, "grad_norm": 0.3081040680408478, "learning_rate": 0.000133816297262055, "loss": 0.2492, "step": 1931 }, { "epoch": 0.3910139647844566, "grad_norm": 0.29357609152793884, "learning_rate": 0.0001337563934601885, "loss": 0.2792, "step": 1932 }, { "epoch": 0.39121635296498686, "grad_norm": 0.3425729274749756, "learning_rate": 0.0001336964759840105, "loss": 0.239, "step": 1933 }, { "epoch": 0.3914187411455171, "grad_norm": 0.28074294328689575, "learning_rate": 0.00013363654485779285, "loss": 0.2647, "step": 1934 }, { "epoch": 0.39162112932604737, "grad_norm": 0.38483911752700806, "learning_rate": 0.00013357660010581294, "loss": 0.2584, "step": 1935 }, { "epoch": 0.3918235175065776, "grad_norm": 0.3414176404476166, "learning_rate": 0.00013351664175235368, "loss": 0.2729, "step": 1936 }, { "epoch": 0.3920259056871079, "grad_norm": 0.273685097694397, "learning_rate": 0.00013345666982170345, "loss": 0.227, "step": 1937 }, { "epoch": 0.39222829386763813, "grad_norm": 0.3120160400867462, "learning_rate": 0.00013339668433815617, "loss": 0.2191, "step": 1938 }, { "epoch": 0.3924306820481684, "grad_norm": 0.22675852477550507, "learning_rate": 0.00013333668532601126, "loss": 0.2166, "step": 1939 }, { "epoch": 0.39263307022869864, "grad_norm": 0.29665467143058777, "learning_rate": 0.00013327667280957361, "loss": 0.2668, "step": 1940 }, { "epoch": 0.3928354584092289, "grad_norm": 0.26096877455711365, "learning_rate": 0.00013321664681315354, "loss": 0.2275, "step": 1941 }, { "epoch": 0.39303784658975915, "grad_norm": 0.24969267845153809, "learning_rate": 0.00013315660736106687, "loss": 0.2055, "step": 1942 }, { "epoch": 0.3932402347702894, "grad_norm": 0.35524192452430725, "learning_rate": 0.0001330965544776349, "loss": 0.2491, "step": 1943 }, { "epoch": 0.39344262295081966, "grad_norm": 0.26154646277427673, "learning_rate": 0.0001330364881871843, "loss": 0.2524, "step": 1944 }, { "epoch": 0.3936450111313499, "grad_norm": 0.3056662678718567, "learning_rate": 0.00013297640851404722, "loss": 0.2971, "step": 1945 }, { "epoch": 0.3938473993118802, "grad_norm": 0.40250954031944275, "learning_rate": 0.00013291631548256123, "loss": 0.2819, "step": 1946 }, { "epoch": 0.39404978749241043, "grad_norm": 0.5901599526405334, "learning_rate": 0.00013285620911706927, "loss": 0.2549, "step": 1947 }, { "epoch": 0.3942521756729407, "grad_norm": 0.22675690054893494, "learning_rate": 0.0001327960894419197, "loss": 0.2076, "step": 1948 }, { "epoch": 0.39445456385347094, "grad_norm": 0.2752223610877991, "learning_rate": 0.00013273595648146633, "loss": 0.2317, "step": 1949 }, { "epoch": 0.3946569520340012, "grad_norm": 0.38042622804641724, "learning_rate": 0.0001326758102600683, "loss": 0.2233, "step": 1950 }, { "epoch": 0.3946569520340012, "eval_loss": 0.2925703823566437, "eval_runtime": 1.3223, "eval_samples_per_second": 3.781, "eval_steps_per_second": 0.756, "step": 1950 }, { "epoch": 0.39485934021453145, "grad_norm": 0.29109886288642883, "learning_rate": 0.00013261565080209005, "loss": 0.2395, "step": 1951 }, { "epoch": 0.3950617283950617, "grad_norm": 0.6579899191856384, "learning_rate": 0.00013255547813190158, "loss": 0.2343, "step": 1952 }, { "epoch": 0.39526411657559196, "grad_norm": 0.31066983938217163, "learning_rate": 0.000132495292273878, "loss": 0.2405, "step": 1953 }, { "epoch": 0.3954665047561222, "grad_norm": 0.292392373085022, "learning_rate": 0.00013243509325239994, "loss": 0.2495, "step": 1954 }, { "epoch": 0.39566889293665247, "grad_norm": 0.2772408127784729, "learning_rate": 0.0001323748810918533, "loss": 0.2516, "step": 1955 }, { "epoch": 0.3958712811171828, "grad_norm": 0.32036811113357544, "learning_rate": 0.00013231465581662932, "loss": 0.1978, "step": 1956 }, { "epoch": 0.39607366929771304, "grad_norm": 0.3376501798629761, "learning_rate": 0.0001322544174511245, "loss": 0.1822, "step": 1957 }, { "epoch": 0.3962760574782433, "grad_norm": 0.23488853871822357, "learning_rate": 0.0001321941660197407, "loss": 0.2029, "step": 1958 }, { "epoch": 0.39647844565877355, "grad_norm": 0.3550211489200592, "learning_rate": 0.0001321339015468851, "loss": 0.2706, "step": 1959 }, { "epoch": 0.3966808338393038, "grad_norm": 0.28546565771102905, "learning_rate": 0.00013207362405697005, "loss": 0.2457, "step": 1960 }, { "epoch": 0.39688322201983406, "grad_norm": 0.26281923055648804, "learning_rate": 0.00013201333357441326, "loss": 0.1823, "step": 1961 }, { "epoch": 0.3970856102003643, "grad_norm": 0.4920741021633148, "learning_rate": 0.0001319530301236377, "loss": 0.2718, "step": 1962 }, { "epoch": 0.39728799838089457, "grad_norm": 0.2771494686603546, "learning_rate": 0.00013189271372907158, "loss": 0.2516, "step": 1963 }, { "epoch": 0.3974903865614248, "grad_norm": 0.2805003523826599, "learning_rate": 0.0001318323844151483, "loss": 0.2616, "step": 1964 }, { "epoch": 0.3976927747419551, "grad_norm": 0.3793322741985321, "learning_rate": 0.00013177204220630662, "loss": 0.2561, "step": 1965 }, { "epoch": 0.39789516292248533, "grad_norm": 0.40119558572769165, "learning_rate": 0.0001317116871269904, "loss": 0.2393, "step": 1966 }, { "epoch": 0.3980975511030156, "grad_norm": 0.333647221326828, "learning_rate": 0.00013165131920164877, "loss": 0.2422, "step": 1967 }, { "epoch": 0.39829993928354585, "grad_norm": 0.25151124596595764, "learning_rate": 0.00013159093845473607, "loss": 0.2167, "step": 1968 }, { "epoch": 0.3985023274640761, "grad_norm": 0.5257695317268372, "learning_rate": 0.00013153054491071178, "loss": 0.2276, "step": 1969 }, { "epoch": 0.39870471564460636, "grad_norm": 0.27265262603759766, "learning_rate": 0.00013147013859404064, "loss": 0.2195, "step": 1970 }, { "epoch": 0.3989071038251366, "grad_norm": 0.31594887375831604, "learning_rate": 0.00013140971952919252, "loss": 0.2575, "step": 1971 }, { "epoch": 0.39910949200566687, "grad_norm": 0.3140024244785309, "learning_rate": 0.00013134928774064246, "loss": 0.25, "step": 1972 }, { "epoch": 0.3993118801861971, "grad_norm": 0.2463502287864685, "learning_rate": 0.00013128884325287063, "loss": 0.2332, "step": 1973 }, { "epoch": 0.3995142683667274, "grad_norm": 0.5335040092468262, "learning_rate": 0.00013122838609036242, "loss": 0.2739, "step": 1974 }, { "epoch": 0.39971665654725763, "grad_norm": 0.47993403673171997, "learning_rate": 0.00013116791627760822, "loss": 0.2517, "step": 1975 }, { "epoch": 0.3999190447277879, "grad_norm": 0.3202473223209381, "learning_rate": 0.00013110743383910376, "loss": 0.2411, "step": 1976 }, { "epoch": 0.40012143290831814, "grad_norm": 0.2764071822166443, "learning_rate": 0.00013104693879934965, "loss": 0.2361, "step": 1977 }, { "epoch": 0.4003238210888484, "grad_norm": 0.2712560296058655, "learning_rate": 0.0001309864311828517, "loss": 0.2464, "step": 1978 }, { "epoch": 0.40052620926937865, "grad_norm": 0.378466933965683, "learning_rate": 0.00013092591101412088, "loss": 0.2786, "step": 1979 }, { "epoch": 0.4007285974499089, "grad_norm": 0.30152297019958496, "learning_rate": 0.00013086537831767317, "loss": 0.2965, "step": 1980 }, { "epoch": 0.40093098563043916, "grad_norm": 0.527055025100708, "learning_rate": 0.0001308048331180296, "loss": 0.2516, "step": 1981 }, { "epoch": 0.4011333738109694, "grad_norm": 0.28280869126319885, "learning_rate": 0.00013074427543971636, "loss": 0.2309, "step": 1982 }, { "epoch": 0.4013357619914997, "grad_norm": 0.2994258999824524, "learning_rate": 0.00013068370530726467, "loss": 0.2397, "step": 1983 }, { "epoch": 0.40153815017202993, "grad_norm": 0.30291879177093506, "learning_rate": 0.00013062312274521066, "loss": 0.2405, "step": 1984 }, { "epoch": 0.4017405383525602, "grad_norm": 0.273774653673172, "learning_rate": 0.00013056252777809567, "loss": 0.216, "step": 1985 }, { "epoch": 0.4019429265330905, "grad_norm": 0.43318888545036316, "learning_rate": 0.000130501920430466, "loss": 0.2359, "step": 1986 }, { "epoch": 0.40214531471362075, "grad_norm": 0.2917752265930176, "learning_rate": 0.00013044130072687295, "loss": 0.2448, "step": 1987 }, { "epoch": 0.402347702894151, "grad_norm": 0.25260987877845764, "learning_rate": 0.00013038066869187285, "loss": 0.1924, "step": 1988 }, { "epoch": 0.40255009107468126, "grad_norm": 0.36240655183792114, "learning_rate": 0.00013032002435002697, "loss": 0.198, "step": 1989 }, { "epoch": 0.4027524792552115, "grad_norm": 0.38859719038009644, "learning_rate": 0.00013025936772590165, "loss": 0.2889, "step": 1990 }, { "epoch": 0.40295486743574177, "grad_norm": 0.3366321325302124, "learning_rate": 0.00013019869884406816, "loss": 0.2462, "step": 1991 }, { "epoch": 0.403157255616272, "grad_norm": 0.2428562492132187, "learning_rate": 0.0001301380177291027, "loss": 0.2124, "step": 1992 }, { "epoch": 0.4033596437968023, "grad_norm": 0.25549569725990295, "learning_rate": 0.00013007732440558652, "loss": 0.2234, "step": 1993 }, { "epoch": 0.40356203197733254, "grad_norm": 0.3338925242424011, "learning_rate": 0.00013001661889810578, "loss": 0.2674, "step": 1994 }, { "epoch": 0.4037644201578628, "grad_norm": 0.284200519323349, "learning_rate": 0.00012995590123125145, "loss": 0.2382, "step": 1995 }, { "epoch": 0.40396680833839305, "grad_norm": 0.2844996154308319, "learning_rate": 0.0001298951714296196, "loss": 0.2725, "step": 1996 }, { "epoch": 0.4041691965189233, "grad_norm": 0.3009307086467743, "learning_rate": 0.00012983442951781113, "loss": 0.2377, "step": 1997 }, { "epoch": 0.40437158469945356, "grad_norm": 0.35116681456565857, "learning_rate": 0.0001297736755204319, "loss": 0.2267, "step": 1998 }, { "epoch": 0.4045739728799838, "grad_norm": 0.27372056245803833, "learning_rate": 0.00012971290946209256, "loss": 0.2657, "step": 1999 }, { "epoch": 0.40477636106051407, "grad_norm": 0.3842753767967224, "learning_rate": 0.0001296521313674088, "loss": 0.2544, "step": 2000 }, { "epoch": 0.40477636106051407, "eval_loss": 0.28242871165275574, "eval_runtime": 1.3256, "eval_samples_per_second": 3.772, "eval_steps_per_second": 0.754, "step": 2000 }, { "epoch": 0.4049787492410443, "grad_norm": 0.34683626890182495, "learning_rate": 0.000129591341261001, "loss": 0.277, "step": 2001 }, { "epoch": 0.4051811374215746, "grad_norm": 0.2792201042175293, "learning_rate": 0.00012953053916749457, "loss": 0.2438, "step": 2002 }, { "epoch": 0.40538352560210483, "grad_norm": 0.2898143231868744, "learning_rate": 0.00012946972511151974, "loss": 0.2513, "step": 2003 }, { "epoch": 0.4055859137826351, "grad_norm": 0.40168705582618713, "learning_rate": 0.0001294088991177115, "loss": 0.2865, "step": 2004 }, { "epoch": 0.40578830196316534, "grad_norm": 0.36038899421691895, "learning_rate": 0.00012934806121070973, "loss": 0.2649, "step": 2005 }, { "epoch": 0.4059906901436956, "grad_norm": 0.2945319414138794, "learning_rate": 0.00012928721141515915, "loss": 0.2548, "step": 2006 }, { "epoch": 0.40619307832422585, "grad_norm": 0.3021438717842102, "learning_rate": 0.00012922634975570934, "loss": 0.2509, "step": 2007 }, { "epoch": 0.4063954665047561, "grad_norm": 0.24368038773536682, "learning_rate": 0.00012916547625701455, "loss": 0.234, "step": 2008 }, { "epoch": 0.40659785468528636, "grad_norm": 0.2840649485588074, "learning_rate": 0.00012910459094373392, "loss": 0.2414, "step": 2009 }, { "epoch": 0.4068002428658166, "grad_norm": 0.3940325379371643, "learning_rate": 0.0001290436938405314, "loss": 0.2812, "step": 2010 }, { "epoch": 0.4070026310463469, "grad_norm": 0.2627634108066559, "learning_rate": 0.0001289827849720757, "loss": 0.2556, "step": 2011 }, { "epoch": 0.40720501922687713, "grad_norm": 0.23669631779193878, "learning_rate": 0.0001289218643630402, "loss": 0.2496, "step": 2012 }, { "epoch": 0.4074074074074074, "grad_norm": 0.31009694933891296, "learning_rate": 0.00012886093203810315, "loss": 0.2628, "step": 2013 }, { "epoch": 0.40760979558793764, "grad_norm": 0.3262062668800354, "learning_rate": 0.0001287999880219475, "loss": 0.2221, "step": 2014 }, { "epoch": 0.4078121837684679, "grad_norm": 0.2925626337528229, "learning_rate": 0.00012873903233926094, "loss": 0.2384, "step": 2015 }, { "epoch": 0.4080145719489982, "grad_norm": 0.30689486861228943, "learning_rate": 0.0001286780650147359, "loss": 0.2274, "step": 2016 }, { "epoch": 0.40821696012952846, "grad_norm": 0.23987144231796265, "learning_rate": 0.00012861708607306952, "loss": 0.2151, "step": 2017 }, { "epoch": 0.4084193483100587, "grad_norm": 0.2843565344810486, "learning_rate": 0.00012855609553896364, "loss": 0.2199, "step": 2018 }, { "epoch": 0.40862173649058897, "grad_norm": 0.26927450299263, "learning_rate": 0.00012849509343712475, "loss": 0.2302, "step": 2019 }, { "epoch": 0.4088241246711192, "grad_norm": 0.3334886431694031, "learning_rate": 0.00012843407979226413, "loss": 0.2262, "step": 2020 }, { "epoch": 0.4090265128516495, "grad_norm": 0.2969011664390564, "learning_rate": 0.00012837305462909764, "loss": 0.2495, "step": 2021 }, { "epoch": 0.40922890103217974, "grad_norm": 0.2826099693775177, "learning_rate": 0.0001283120179723459, "loss": 0.2849, "step": 2022 }, { "epoch": 0.40943128921271, "grad_norm": 0.24149687588214874, "learning_rate": 0.00012825096984673404, "loss": 0.2127, "step": 2023 }, { "epoch": 0.40963367739324025, "grad_norm": 0.30478453636169434, "learning_rate": 0.000128189910276992, "loss": 0.2214, "step": 2024 }, { "epoch": 0.4098360655737705, "grad_norm": 0.267497181892395, "learning_rate": 0.00012812883928785425, "loss": 0.2115, "step": 2025 }, { "epoch": 0.41003845375430076, "grad_norm": 0.3685814440250397, "learning_rate": 0.00012806775690405996, "loss": 0.2514, "step": 2026 }, { "epoch": 0.410240841934831, "grad_norm": 0.38597095012664795, "learning_rate": 0.00012800666315035278, "loss": 0.2661, "step": 2027 }, { "epoch": 0.41044323011536127, "grad_norm": 0.2850973904132843, "learning_rate": 0.00012794555805148116, "loss": 0.2187, "step": 2028 }, { "epoch": 0.4106456182958915, "grad_norm": 0.5434443354606628, "learning_rate": 0.000127884441632198, "loss": 0.2546, "step": 2029 }, { "epoch": 0.4108480064764218, "grad_norm": 0.35577887296676636, "learning_rate": 0.0001278233139172608, "loss": 0.2508, "step": 2030 }, { "epoch": 0.41105039465695203, "grad_norm": 0.32496634125709534, "learning_rate": 0.00012776217493143177, "loss": 0.2248, "step": 2031 }, { "epoch": 0.4112527828374823, "grad_norm": 0.34825873374938965, "learning_rate": 0.00012770102469947746, "loss": 0.2392, "step": 2032 }, { "epoch": 0.41145517101801254, "grad_norm": 0.2490948885679245, "learning_rate": 0.00012763986324616916, "loss": 0.2206, "step": 2033 }, { "epoch": 0.4116575591985428, "grad_norm": 0.5652355551719666, "learning_rate": 0.00012757869059628262, "loss": 0.2149, "step": 2034 }, { "epoch": 0.41185994737907305, "grad_norm": 0.31565678119659424, "learning_rate": 0.00012751750677459823, "loss": 0.262, "step": 2035 }, { "epoch": 0.4120623355596033, "grad_norm": 0.31562215089797974, "learning_rate": 0.0001274563118059007, "loss": 0.2608, "step": 2036 }, { "epoch": 0.41226472374013357, "grad_norm": 0.33272048830986023, "learning_rate": 0.00012739510571497945, "loss": 0.2228, "step": 2037 }, { "epoch": 0.4124671119206638, "grad_norm": 0.6399551630020142, "learning_rate": 0.00012733388852662834, "loss": 0.236, "step": 2038 }, { "epoch": 0.4126695001011941, "grad_norm": 0.2761405408382416, "learning_rate": 0.0001272726602656457, "loss": 0.2449, "step": 2039 }, { "epoch": 0.41287188828172433, "grad_norm": 0.4142592251300812, "learning_rate": 0.00012721142095683437, "loss": 0.2549, "step": 2040 }, { "epoch": 0.4130742764622546, "grad_norm": 0.26064079999923706, "learning_rate": 0.00012715017062500165, "loss": 0.2316, "step": 2041 }, { "epoch": 0.41327666464278484, "grad_norm": 0.30711066722869873, "learning_rate": 0.00012708890929495937, "loss": 0.2344, "step": 2042 }, { "epoch": 0.4134790528233151, "grad_norm": 0.2903990149497986, "learning_rate": 0.0001270276369915237, "loss": 0.2502, "step": 2043 }, { "epoch": 0.41368144100384535, "grad_norm": 0.29076454043388367, "learning_rate": 0.00012696635373951536, "loss": 0.2554, "step": 2044 }, { "epoch": 0.4138838291843756, "grad_norm": 0.24669866263866425, "learning_rate": 0.00012690505956375943, "loss": 0.2154, "step": 2045 }, { "epoch": 0.41408621736490586, "grad_norm": 0.2728884220123291, "learning_rate": 0.00012684375448908548, "loss": 0.2062, "step": 2046 }, { "epoch": 0.4142886055454362, "grad_norm": 0.283176064491272, "learning_rate": 0.00012678243854032743, "loss": 0.2577, "step": 2047 }, { "epoch": 0.41449099372596643, "grad_norm": 0.33021536469459534, "learning_rate": 0.00012672111174232367, "loss": 0.2362, "step": 2048 }, { "epoch": 0.4146933819064967, "grad_norm": 0.3008541166782379, "learning_rate": 0.00012665977411991693, "loss": 0.2504, "step": 2049 }, { "epoch": 0.41489577008702694, "grad_norm": 0.3151417076587677, "learning_rate": 0.00012659842569795435, "loss": 0.2615, "step": 2050 }, { "epoch": 0.41489577008702694, "eval_loss": 0.265240877866745, "eval_runtime": 1.3153, "eval_samples_per_second": 3.801, "eval_steps_per_second": 0.76, "step": 2050 }, { "epoch": 0.4150981582675572, "grad_norm": 0.2754959762096405, "learning_rate": 0.00012653706650128747, "loss": 0.2589, "step": 2051 }, { "epoch": 0.41530054644808745, "grad_norm": 0.27886438369750977, "learning_rate": 0.00012647569655477214, "loss": 0.2698, "step": 2052 }, { "epoch": 0.4155029346286177, "grad_norm": 0.37979236245155334, "learning_rate": 0.00012641431588326858, "loss": 0.264, "step": 2053 }, { "epoch": 0.41570532280914796, "grad_norm": 0.2676214873790741, "learning_rate": 0.00012635292451164138, "loss": 0.2389, "step": 2054 }, { "epoch": 0.4159077109896782, "grad_norm": 0.2991965115070343, "learning_rate": 0.00012629152246475947, "loss": 0.2451, "step": 2055 }, { "epoch": 0.41611009917020847, "grad_norm": 0.327769935131073, "learning_rate": 0.00012623010976749608, "loss": 0.2941, "step": 2056 }, { "epoch": 0.4163124873507387, "grad_norm": 0.3254401683807373, "learning_rate": 0.0001261686864447287, "loss": 0.2293, "step": 2057 }, { "epoch": 0.416514875531269, "grad_norm": 0.3303678333759308, "learning_rate": 0.0001261072525213393, "loss": 0.2248, "step": 2058 }, { "epoch": 0.41671726371179924, "grad_norm": 0.3936763107776642, "learning_rate": 0.00012604580802221392, "loss": 0.2779, "step": 2059 }, { "epoch": 0.4169196518923295, "grad_norm": 0.2462732046842575, "learning_rate": 0.00012598435297224306, "loss": 0.2017, "step": 2060 }, { "epoch": 0.41712204007285975, "grad_norm": 0.2577330470085144, "learning_rate": 0.00012592288739632137, "loss": 0.255, "step": 2061 }, { "epoch": 0.41732442825339, "grad_norm": 0.3639339506626129, "learning_rate": 0.00012586141131934786, "loss": 0.2814, "step": 2062 }, { "epoch": 0.41752681643392026, "grad_norm": 0.2637769281864166, "learning_rate": 0.00012579992476622576, "loss": 0.2612, "step": 2063 }, { "epoch": 0.4177292046144505, "grad_norm": 0.24390006065368652, "learning_rate": 0.00012573842776186252, "loss": 0.2116, "step": 2064 }, { "epoch": 0.41793159279498077, "grad_norm": 0.3020148277282715, "learning_rate": 0.0001256769203311698, "loss": 0.2522, "step": 2065 }, { "epoch": 0.418133980975511, "grad_norm": 0.3385802209377289, "learning_rate": 0.0001256154024990636, "loss": 0.2413, "step": 2066 }, { "epoch": 0.4183363691560413, "grad_norm": 0.21690528094768524, "learning_rate": 0.000125553874290464, "loss": 0.2301, "step": 2067 }, { "epoch": 0.41853875733657153, "grad_norm": 0.2444566935300827, "learning_rate": 0.00012549233573029542, "loss": 0.227, "step": 2068 }, { "epoch": 0.4187411455171018, "grad_norm": 0.28474608063697815, "learning_rate": 0.0001254307868434863, "loss": 0.2074, "step": 2069 }, { "epoch": 0.41894353369763204, "grad_norm": 0.3127065598964691, "learning_rate": 0.00012536922765496943, "loss": 0.2662, "step": 2070 }, { "epoch": 0.4191459218781623, "grad_norm": 0.2597690522670746, "learning_rate": 0.00012530765818968162, "loss": 0.238, "step": 2071 }, { "epoch": 0.41934831005869255, "grad_norm": 0.26486197113990784, "learning_rate": 0.00012524607847256403, "loss": 0.2009, "step": 2072 }, { "epoch": 0.4195506982392228, "grad_norm": 0.2640715539455414, "learning_rate": 0.00012518448852856181, "loss": 0.278, "step": 2073 }, { "epoch": 0.41975308641975306, "grad_norm": 0.26320189237594604, "learning_rate": 0.0001251228883826243, "loss": 0.2064, "step": 2074 }, { "epoch": 0.4199554746002833, "grad_norm": 0.32839226722717285, "learning_rate": 0.00012506127805970502, "loss": 0.2541, "step": 2075 }, { "epoch": 0.4201578627808136, "grad_norm": 0.2406427413225174, "learning_rate": 0.00012499965758476153, "loss": 0.2333, "step": 2076 }, { "epoch": 0.4203602509613439, "grad_norm": 0.2833259403705597, "learning_rate": 0.00012493802698275555, "loss": 0.236, "step": 2077 }, { "epoch": 0.42056263914187414, "grad_norm": 0.2869158387184143, "learning_rate": 0.00012487638627865297, "loss": 0.2527, "step": 2078 }, { "epoch": 0.4207650273224044, "grad_norm": 0.3298097550868988, "learning_rate": 0.00012481473549742363, "loss": 0.2631, "step": 2079 }, { "epoch": 0.42096741550293465, "grad_norm": 0.31624171137809753, "learning_rate": 0.00012475307466404157, "loss": 0.2022, "step": 2080 }, { "epoch": 0.4211698036834649, "grad_norm": 0.2529713809490204, "learning_rate": 0.00012469140380348478, "loss": 0.2175, "step": 2081 }, { "epoch": 0.42137219186399516, "grad_norm": 0.3227352797985077, "learning_rate": 0.00012462972294073548, "loss": 0.1956, "step": 2082 }, { "epoch": 0.4215745800445254, "grad_norm": 0.35104241967201233, "learning_rate": 0.00012456803210077983, "loss": 0.2421, "step": 2083 }, { "epoch": 0.42177696822505567, "grad_norm": 0.2637273073196411, "learning_rate": 0.000124506331308608, "loss": 0.2196, "step": 2084 }, { "epoch": 0.4219793564055859, "grad_norm": 0.3178747892379761, "learning_rate": 0.0001244446205892143, "loss": 0.2471, "step": 2085 }, { "epoch": 0.4221817445861162, "grad_norm": 0.23902137577533722, "learning_rate": 0.00012438289996759698, "loss": 0.2295, "step": 2086 }, { "epoch": 0.42238413276664644, "grad_norm": 0.28139352798461914, "learning_rate": 0.00012432116946875833, "loss": 0.251, "step": 2087 }, { "epoch": 0.4225865209471767, "grad_norm": 0.32908013463020325, "learning_rate": 0.00012425942911770463, "loss": 0.2441, "step": 2088 }, { "epoch": 0.42278890912770695, "grad_norm": 0.32288312911987305, "learning_rate": 0.00012419767893944616, "loss": 0.2534, "step": 2089 }, { "epoch": 0.4229912973082372, "grad_norm": 1.0200849771499634, "learning_rate": 0.0001241359189589972, "loss": 0.2238, "step": 2090 }, { "epoch": 0.42319368548876746, "grad_norm": 0.4350891411304474, "learning_rate": 0.00012407414920137595, "loss": 0.2588, "step": 2091 }, { "epoch": 0.4233960736692977, "grad_norm": 0.23360294103622437, "learning_rate": 0.00012401236969160464, "loss": 0.2329, "step": 2092 }, { "epoch": 0.42359846184982797, "grad_norm": 0.27669280767440796, "learning_rate": 0.00012395058045470934, "loss": 0.2313, "step": 2093 }, { "epoch": 0.4238008500303582, "grad_norm": 0.35294219851493835, "learning_rate": 0.00012388878151572022, "loss": 0.243, "step": 2094 }, { "epoch": 0.4240032382108885, "grad_norm": 0.24622862040996552, "learning_rate": 0.00012382697289967123, "loss": 0.2314, "step": 2095 }, { "epoch": 0.42420562639141873, "grad_norm": 0.43022507429122925, "learning_rate": 0.00012376515463160032, "loss": 0.2574, "step": 2096 }, { "epoch": 0.424408014571949, "grad_norm": 0.30967745184898376, "learning_rate": 0.00012370332673654935, "loss": 0.2625, "step": 2097 }, { "epoch": 0.42461040275247924, "grad_norm": 0.35883182287216187, "learning_rate": 0.000123641489239564, "loss": 0.2311, "step": 2098 }, { "epoch": 0.4248127909330095, "grad_norm": 0.3922053575515747, "learning_rate": 0.000123579642165694, "loss": 0.2427, "step": 2099 }, { "epoch": 0.42501517911353975, "grad_norm": 0.27302175760269165, "learning_rate": 0.00012351778553999277, "loss": 0.2249, "step": 2100 }, { "epoch": 0.42501517911353975, "eval_loss": 0.26186245679855347, "eval_runtime": 1.3135, "eval_samples_per_second": 3.807, "eval_steps_per_second": 0.761, "step": 2100 }, { "epoch": 0.42521756729407, "grad_norm": 0.3689914345741272, "learning_rate": 0.0001234559193875177, "loss": 0.2847, "step": 2101 }, { "epoch": 0.42541995547460026, "grad_norm": 0.3237003684043884, "learning_rate": 0.00012339404373333009, "loss": 0.2354, "step": 2102 }, { "epoch": 0.4256223436551305, "grad_norm": 0.2731477916240692, "learning_rate": 0.000123332158602495, "loss": 0.2189, "step": 2103 }, { "epoch": 0.4258247318356608, "grad_norm": 0.4225029945373535, "learning_rate": 0.0001232702640200813, "loss": 0.2637, "step": 2104 }, { "epoch": 0.42602712001619103, "grad_norm": 0.26892513036727905, "learning_rate": 0.00012320836001116184, "loss": 0.208, "step": 2105 }, { "epoch": 0.4262295081967213, "grad_norm": 0.35070931911468506, "learning_rate": 0.0001231464466008131, "loss": 0.2533, "step": 2106 }, { "epoch": 0.4264318963772516, "grad_norm": 0.2935669422149658, "learning_rate": 0.00012308452381411557, "loss": 0.2335, "step": 2107 }, { "epoch": 0.42663428455778185, "grad_norm": 0.2581442594528198, "learning_rate": 0.00012302259167615333, "loss": 0.229, "step": 2108 }, { "epoch": 0.4268366727383121, "grad_norm": 0.5905941128730774, "learning_rate": 0.00012296065021201436, "loss": 0.2571, "step": 2109 }, { "epoch": 0.42703906091884236, "grad_norm": 0.29472848773002625, "learning_rate": 0.0001228986994467905, "loss": 0.2114, "step": 2110 }, { "epoch": 0.4272414490993726, "grad_norm": 0.3735221028327942, "learning_rate": 0.00012283673940557716, "loss": 0.2433, "step": 2111 }, { "epoch": 0.42744383727990287, "grad_norm": 0.29375994205474854, "learning_rate": 0.00012277477011347367, "loss": 0.238, "step": 2112 }, { "epoch": 0.4276462254604331, "grad_norm": 0.26810362935066223, "learning_rate": 0.00012271279159558303, "loss": 0.2496, "step": 2113 }, { "epoch": 0.4278486136409634, "grad_norm": 0.33673784136772156, "learning_rate": 0.000122650803877012, "loss": 0.2744, "step": 2114 }, { "epoch": 0.42805100182149364, "grad_norm": 0.2995823621749878, "learning_rate": 0.00012258880698287104, "loss": 0.2515, "step": 2115 }, { "epoch": 0.4282533900020239, "grad_norm": 0.30129626393318176, "learning_rate": 0.0001225268009382744, "loss": 0.2318, "step": 2116 }, { "epoch": 0.42845577818255415, "grad_norm": 0.3417244255542755, "learning_rate": 0.00012246478576833993, "loss": 0.2547, "step": 2117 }, { "epoch": 0.4286581663630844, "grad_norm": 0.3532252013683319, "learning_rate": 0.0001224027614981893, "loss": 0.2698, "step": 2118 }, { "epoch": 0.42886055454361466, "grad_norm": 0.3561701774597168, "learning_rate": 0.00012234072815294774, "loss": 0.2583, "step": 2119 }, { "epoch": 0.4290629427241449, "grad_norm": 0.25192761421203613, "learning_rate": 0.00012227868575774423, "loss": 0.2284, "step": 2120 }, { "epoch": 0.42926533090467517, "grad_norm": 0.2786290645599365, "learning_rate": 0.00012221663433771145, "loss": 0.2072, "step": 2121 }, { "epoch": 0.4294677190852054, "grad_norm": 0.27352750301361084, "learning_rate": 0.00012215457391798564, "loss": 0.2224, "step": 2122 }, { "epoch": 0.4296701072657357, "grad_norm": 0.2675737738609314, "learning_rate": 0.00012209250452370674, "loss": 0.2276, "step": 2123 }, { "epoch": 0.42987249544626593, "grad_norm": 0.29661306738853455, "learning_rate": 0.00012203042618001834, "loss": 0.2564, "step": 2124 }, { "epoch": 0.4300748836267962, "grad_norm": 0.2976089417934418, "learning_rate": 0.00012196833891206761, "loss": 0.2557, "step": 2125 }, { "epoch": 0.43027727180732644, "grad_norm": 0.27346041798591614, "learning_rate": 0.00012190624274500537, "loss": 0.2563, "step": 2126 }, { "epoch": 0.4304796599878567, "grad_norm": 0.4857114553451538, "learning_rate": 0.00012184413770398607, "loss": 0.2428, "step": 2127 }, { "epoch": 0.43068204816838696, "grad_norm": 0.31661370396614075, "learning_rate": 0.00012178202381416763, "loss": 0.2404, "step": 2128 }, { "epoch": 0.4308844363489172, "grad_norm": 0.3062070906162262, "learning_rate": 0.00012171990110071174, "loss": 0.2187, "step": 2129 }, { "epoch": 0.43108682452944747, "grad_norm": 0.2728961706161499, "learning_rate": 0.00012165776958878349, "loss": 0.2218, "step": 2130 }, { "epoch": 0.4312892127099777, "grad_norm": 0.3652915060520172, "learning_rate": 0.00012159562930355171, "loss": 0.2629, "step": 2131 }, { "epoch": 0.431491600890508, "grad_norm": 0.33820024132728577, "learning_rate": 0.0001215334802701886, "loss": 0.2601, "step": 2132 }, { "epoch": 0.43169398907103823, "grad_norm": 0.36203113198280334, "learning_rate": 0.00012147132251387004, "loss": 0.2473, "step": 2133 }, { "epoch": 0.4318963772515685, "grad_norm": 0.28117600083351135, "learning_rate": 0.00012140915605977535, "loss": 0.2278, "step": 2134 }, { "epoch": 0.43209876543209874, "grad_norm": 0.2607441544532776, "learning_rate": 0.00012134698093308746, "loss": 0.2326, "step": 2135 }, { "epoch": 0.432301153612629, "grad_norm": 0.2999681532382965, "learning_rate": 0.00012128479715899272, "loss": 0.2404, "step": 2136 }, { "epoch": 0.43250354179315925, "grad_norm": 0.28632259368896484, "learning_rate": 0.00012122260476268111, "loss": 0.232, "step": 2137 }, { "epoch": 0.43270592997368956, "grad_norm": 0.26560378074645996, "learning_rate": 0.00012116040376934596, "loss": 0.2575, "step": 2138 }, { "epoch": 0.4329083181542198, "grad_norm": 0.2713767886161804, "learning_rate": 0.00012109819420418413, "loss": 0.2165, "step": 2139 }, { "epoch": 0.4331107063347501, "grad_norm": 0.32122987508773804, "learning_rate": 0.00012103597609239607, "loss": 0.2717, "step": 2140 }, { "epoch": 0.43331309451528033, "grad_norm": 0.29684484004974365, "learning_rate": 0.00012097374945918554, "loss": 0.253, "step": 2141 }, { "epoch": 0.4335154826958106, "grad_norm": 0.25783488154411316, "learning_rate": 0.00012091151432975981, "loss": 0.2259, "step": 2142 }, { "epoch": 0.43371787087634084, "grad_norm": 0.32175707817077637, "learning_rate": 0.0001208492707293295, "loss": 0.288, "step": 2143 }, { "epoch": 0.4339202590568711, "grad_norm": 0.28667160868644714, "learning_rate": 0.00012078701868310889, "loss": 0.2385, "step": 2144 }, { "epoch": 0.43412264723740135, "grad_norm": 0.3361613154411316, "learning_rate": 0.00012072475821631546, "loss": 0.2522, "step": 2145 }, { "epoch": 0.4343250354179316, "grad_norm": 0.3594609797000885, "learning_rate": 0.0001206624893541702, "loss": 0.2565, "step": 2146 }, { "epoch": 0.43452742359846186, "grad_norm": 0.3329756557941437, "learning_rate": 0.0001206002121218975, "loss": 0.2539, "step": 2147 }, { "epoch": 0.4347298117789921, "grad_norm": 0.30344513058662415, "learning_rate": 0.00012053792654472506, "loss": 0.2577, "step": 2148 }, { "epoch": 0.43493219995952237, "grad_norm": 0.3196765184402466, "learning_rate": 0.0001204756326478841, "loss": 0.2719, "step": 2149 }, { "epoch": 0.4351345881400526, "grad_norm": 0.262235552072525, "learning_rate": 0.00012041333045660907, "loss": 0.2353, "step": 2150 }, { "epoch": 0.4351345881400526, "eval_loss": 0.2703745365142822, "eval_runtime": 1.3224, "eval_samples_per_second": 3.781, "eval_steps_per_second": 0.756, "step": 2150 }, { "epoch": 0.4353369763205829, "grad_norm": 0.24717698991298676, "learning_rate": 0.00012035101999613791, "loss": 0.2297, "step": 2151 }, { "epoch": 0.43553936450111314, "grad_norm": 0.23519881069660187, "learning_rate": 0.00012028870129171177, "loss": 0.199, "step": 2152 }, { "epoch": 0.4357417526816434, "grad_norm": 0.25399383902549744, "learning_rate": 0.00012022637436857524, "loss": 0.2182, "step": 2153 }, { "epoch": 0.43594414086217365, "grad_norm": 0.30349814891815186, "learning_rate": 0.00012016403925197623, "loss": 0.241, "step": 2154 }, { "epoch": 0.4361465290427039, "grad_norm": 0.2486637979745865, "learning_rate": 0.00012010169596716596, "loss": 0.2293, "step": 2155 }, { "epoch": 0.43634891722323416, "grad_norm": 0.3713061809539795, "learning_rate": 0.00012003934453939889, "loss": 0.2228, "step": 2156 }, { "epoch": 0.4365513054037644, "grad_norm": 0.3627392649650574, "learning_rate": 0.00011997698499393291, "loss": 0.2263, "step": 2157 }, { "epoch": 0.43675369358429467, "grad_norm": 0.2737453281879425, "learning_rate": 0.00011991461735602904, "loss": 0.2056, "step": 2158 }, { "epoch": 0.4369560817648249, "grad_norm": 0.2561127543449402, "learning_rate": 0.00011985224165095178, "loss": 0.2603, "step": 2159 }, { "epoch": 0.4371584699453552, "grad_norm": 0.29941946268081665, "learning_rate": 0.00011978985790396868, "loss": 0.2665, "step": 2160 }, { "epoch": 0.43736085812588543, "grad_norm": 0.27465149760246277, "learning_rate": 0.00011972746614035068, "loss": 0.2456, "step": 2161 }, { "epoch": 0.4375632463064157, "grad_norm": 0.30218666791915894, "learning_rate": 0.000119665066385372, "loss": 0.2566, "step": 2162 }, { "epoch": 0.43776563448694594, "grad_norm": 0.31681689620018005, "learning_rate": 0.00011960265866430993, "loss": 0.2404, "step": 2163 }, { "epoch": 0.4379680226674762, "grad_norm": 0.3065343201160431, "learning_rate": 0.00011954024300244518, "loss": 0.2547, "step": 2164 }, { "epoch": 0.43817041084800645, "grad_norm": 0.2827613949775696, "learning_rate": 0.0001194778194250615, "loss": 0.2395, "step": 2165 }, { "epoch": 0.4383727990285367, "grad_norm": 0.27637484669685364, "learning_rate": 0.00011941538795744604, "loss": 0.2202, "step": 2166 }, { "epoch": 0.43857518720906696, "grad_norm": 0.2045586109161377, "learning_rate": 0.00011935294862488895, "loss": 0.2107, "step": 2167 }, { "epoch": 0.4387775753895973, "grad_norm": 0.271394282579422, "learning_rate": 0.00011929050145268374, "loss": 0.2336, "step": 2168 }, { "epoch": 0.43897996357012753, "grad_norm": 0.2775926887989044, "learning_rate": 0.00011922804646612693, "loss": 0.2334, "step": 2169 }, { "epoch": 0.4391823517506578, "grad_norm": 0.266975462436676, "learning_rate": 0.00011916558369051835, "loss": 0.2359, "step": 2170 }, { "epoch": 0.43938473993118804, "grad_norm": 0.4295983612537384, "learning_rate": 0.00011910311315116091, "loss": 0.2335, "step": 2171 }, { "epoch": 0.4395871281117183, "grad_norm": 0.29055255651474, "learning_rate": 0.00011904063487336063, "loss": 0.2655, "step": 2172 }, { "epoch": 0.43978951629224855, "grad_norm": 0.2840280532836914, "learning_rate": 0.00011897814888242677, "loss": 0.2182, "step": 2173 }, { "epoch": 0.4399919044727788, "grad_norm": 0.26752063632011414, "learning_rate": 0.00011891565520367167, "loss": 0.2158, "step": 2174 }, { "epoch": 0.44019429265330906, "grad_norm": 0.33547624945640564, "learning_rate": 0.00011885315386241074, "loss": 0.2543, "step": 2175 }, { "epoch": 0.4403966808338393, "grad_norm": 0.30130869150161743, "learning_rate": 0.0001187906448839625, "loss": 0.2201, "step": 2176 }, { "epoch": 0.44059906901436957, "grad_norm": 0.2676388919353485, "learning_rate": 0.00011872812829364863, "loss": 0.2282, "step": 2177 }, { "epoch": 0.4408014571948998, "grad_norm": 0.26473209261894226, "learning_rate": 0.00011866560411679384, "loss": 0.2311, "step": 2178 }, { "epoch": 0.4410038453754301, "grad_norm": 0.3182562589645386, "learning_rate": 0.00011860307237872597, "loss": 0.2474, "step": 2179 }, { "epoch": 0.44120623355596034, "grad_norm": 0.29943689703941345, "learning_rate": 0.00011854053310477579, "loss": 0.249, "step": 2180 }, { "epoch": 0.4414086217364906, "grad_norm": 0.3413843512535095, "learning_rate": 0.00011847798632027727, "loss": 0.2356, "step": 2181 }, { "epoch": 0.44161100991702085, "grad_norm": 0.3279730975627899, "learning_rate": 0.00011841543205056736, "loss": 0.2528, "step": 2182 }, { "epoch": 0.4418133980975511, "grad_norm": 0.3044031262397766, "learning_rate": 0.00011835287032098608, "loss": 0.2246, "step": 2183 }, { "epoch": 0.44201578627808136, "grad_norm": 0.29211196303367615, "learning_rate": 0.00011829030115687635, "loss": 0.2149, "step": 2184 }, { "epoch": 0.4422181744586116, "grad_norm": 0.2847227156162262, "learning_rate": 0.00011822772458358429, "loss": 0.2815, "step": 2185 }, { "epoch": 0.44242056263914187, "grad_norm": 0.36023515462875366, "learning_rate": 0.00011816514062645887, "loss": 0.2629, "step": 2186 }, { "epoch": 0.4426229508196721, "grad_norm": 0.48622918128967285, "learning_rate": 0.00011810254931085206, "loss": 0.2489, "step": 2187 }, { "epoch": 0.4428253390002024, "grad_norm": 0.33632776141166687, "learning_rate": 0.00011803995066211894, "loss": 0.2658, "step": 2188 }, { "epoch": 0.44302772718073263, "grad_norm": 0.27311983704566956, "learning_rate": 0.00011797734470561743, "loss": 0.2585, "step": 2189 }, { "epoch": 0.4432301153612629, "grad_norm": 0.49943453073501587, "learning_rate": 0.00011791473146670849, "loss": 0.2391, "step": 2190 }, { "epoch": 0.44343250354179314, "grad_norm": 0.28346407413482666, "learning_rate": 0.0001178521109707559, "loss": 0.2694, "step": 2191 }, { "epoch": 0.4436348917223234, "grad_norm": 0.3144627809524536, "learning_rate": 0.00011778948324312659, "loss": 0.282, "step": 2192 }, { "epoch": 0.44383727990285365, "grad_norm": 0.302470862865448, "learning_rate": 0.00011772684830919026, "loss": 0.2402, "step": 2193 }, { "epoch": 0.4440396680833839, "grad_norm": 0.25894466042518616, "learning_rate": 0.00011766420619431953, "loss": 0.2382, "step": 2194 }, { "epoch": 0.44424205626391416, "grad_norm": 0.2533305585384369, "learning_rate": 0.00011760155692389005, "loss": 0.2713, "step": 2195 }, { "epoch": 0.4444444444444444, "grad_norm": 0.26686760783195496, "learning_rate": 0.00011753890052328022, "loss": 0.244, "step": 2196 }, { "epoch": 0.4446468326249747, "grad_norm": 0.33043012022972107, "learning_rate": 0.00011747623701787143, "loss": 0.2444, "step": 2197 }, { "epoch": 0.444849220805505, "grad_norm": 0.35391542315483093, "learning_rate": 0.00011741356643304792, "loss": 0.252, "step": 2198 }, { "epoch": 0.44505160898603524, "grad_norm": 0.3531978726387024, "learning_rate": 0.0001173508887941968, "loss": 0.2368, "step": 2199 }, { "epoch": 0.4452539971665655, "grad_norm": 0.3716188371181488, "learning_rate": 0.00011728820412670803, "loss": 0.2779, "step": 2200 }, { "epoch": 0.4452539971665655, "eval_loss": 0.27036261558532715, "eval_runtime": 1.3184, "eval_samples_per_second": 3.793, "eval_steps_per_second": 0.759, "step": 2200 }, { "epoch": 0.44545638534709575, "grad_norm": 0.29994723200798035, "learning_rate": 0.00011722551245597437, "loss": 0.2446, "step": 2201 }, { "epoch": 0.445658773527626, "grad_norm": 0.23883792757987976, "learning_rate": 0.00011716281380739155, "loss": 0.2301, "step": 2202 }, { "epoch": 0.44586116170815626, "grad_norm": 0.3914525806903839, "learning_rate": 0.00011710010820635799, "loss": 0.2326, "step": 2203 }, { "epoch": 0.4460635498886865, "grad_norm": 0.26396989822387695, "learning_rate": 0.00011703739567827501, "loss": 0.2336, "step": 2204 }, { "epoch": 0.44626593806921677, "grad_norm": 0.27222198247909546, "learning_rate": 0.00011697467624854666, "loss": 0.2433, "step": 2205 }, { "epoch": 0.446468326249747, "grad_norm": 0.20552025735378265, "learning_rate": 0.00011691194994257986, "loss": 0.2301, "step": 2206 }, { "epoch": 0.4466707144302773, "grad_norm": 0.35180479288101196, "learning_rate": 0.00011684921678578432, "loss": 0.2384, "step": 2207 }, { "epoch": 0.44687310261080754, "grad_norm": 0.300133615732193, "learning_rate": 0.00011678647680357242, "loss": 0.2477, "step": 2208 }, { "epoch": 0.4470754907913378, "grad_norm": 0.30273956060409546, "learning_rate": 0.00011672373002135942, "loss": 0.2187, "step": 2209 }, { "epoch": 0.44727787897186805, "grad_norm": 0.2722319960594177, "learning_rate": 0.00011666097646456328, "loss": 0.2553, "step": 2210 }, { "epoch": 0.4474802671523983, "grad_norm": 0.29300355911254883, "learning_rate": 0.00011659821615860474, "loss": 0.244, "step": 2211 }, { "epoch": 0.44768265533292856, "grad_norm": 0.4241216480731964, "learning_rate": 0.00011653544912890721, "loss": 0.2831, "step": 2212 }, { "epoch": 0.4478850435134588, "grad_norm": 0.28359654545783997, "learning_rate": 0.0001164726754008969, "loss": 0.2765, "step": 2213 }, { "epoch": 0.44808743169398907, "grad_norm": 0.2784329056739807, "learning_rate": 0.0001164098950000027, "loss": 0.2661, "step": 2214 }, { "epoch": 0.4482898198745193, "grad_norm": 0.3675428628921509, "learning_rate": 0.00011634710795165613, "loss": 0.2339, "step": 2215 }, { "epoch": 0.4484922080550496, "grad_norm": 0.2794439494609833, "learning_rate": 0.00011628431428129156, "loss": 0.2152, "step": 2216 }, { "epoch": 0.44869459623557983, "grad_norm": 0.2754041254520416, "learning_rate": 0.00011622151401434591, "loss": 0.2465, "step": 2217 }, { "epoch": 0.4488969844161101, "grad_norm": 0.32526063919067383, "learning_rate": 0.00011615870717625883, "loss": 0.2538, "step": 2218 }, { "epoch": 0.44909937259664034, "grad_norm": 0.38518714904785156, "learning_rate": 0.00011609589379247263, "loss": 0.2674, "step": 2219 }, { "epoch": 0.4493017607771706, "grad_norm": 0.24310287833213806, "learning_rate": 0.00011603307388843223, "loss": 0.2127, "step": 2220 }, { "epoch": 0.44950414895770086, "grad_norm": 0.2812139391899109, "learning_rate": 0.00011597024748958525, "loss": 0.2357, "step": 2221 }, { "epoch": 0.4497065371382311, "grad_norm": 0.2847861647605896, "learning_rate": 0.00011590741462138188, "loss": 0.2864, "step": 2222 }, { "epoch": 0.44990892531876137, "grad_norm": 0.2787908911705017, "learning_rate": 0.00011584457530927502, "loss": 0.2457, "step": 2223 }, { "epoch": 0.4501113134992916, "grad_norm": 0.24270068109035492, "learning_rate": 0.00011578172957872006, "loss": 0.2386, "step": 2224 }, { "epoch": 0.4503137016798219, "grad_norm": 0.24730296432971954, "learning_rate": 0.0001157188774551751, "loss": 0.2331, "step": 2225 }, { "epoch": 0.45051608986035213, "grad_norm": 0.3093625605106354, "learning_rate": 0.00011565601896410076, "loss": 0.2601, "step": 2226 }, { "epoch": 0.4507184780408824, "grad_norm": 0.3179914653301239, "learning_rate": 0.0001155931541309603, "loss": 0.2849, "step": 2227 }, { "epoch": 0.4509208662214127, "grad_norm": 0.42889705300331116, "learning_rate": 0.00011553028298121946, "loss": 0.2552, "step": 2228 }, { "epoch": 0.45112325440194295, "grad_norm": 0.36468520760536194, "learning_rate": 0.00011546740554034661, "loss": 0.2599, "step": 2229 }, { "epoch": 0.4513256425824732, "grad_norm": 0.2871343791484833, "learning_rate": 0.00011540452183381267, "loss": 0.239, "step": 2230 }, { "epoch": 0.45152803076300346, "grad_norm": 0.2675608992576599, "learning_rate": 0.00011534163188709108, "loss": 0.2783, "step": 2231 }, { "epoch": 0.4517304189435337, "grad_norm": 0.3286617398262024, "learning_rate": 0.00011527873572565777, "loss": 0.2491, "step": 2232 }, { "epoch": 0.451932807124064, "grad_norm": 0.2460174560546875, "learning_rate": 0.00011521583337499122, "loss": 0.2241, "step": 2233 }, { "epoch": 0.45213519530459423, "grad_norm": 0.24440915882587433, "learning_rate": 0.0001151529248605725, "loss": 0.2623, "step": 2234 }, { "epoch": 0.4523375834851245, "grad_norm": 0.3280555009841919, "learning_rate": 0.00011509001020788496, "loss": 0.2461, "step": 2235 }, { "epoch": 0.45253997166565474, "grad_norm": 0.3201722800731659, "learning_rate": 0.00011502708944241473, "loss": 0.2605, "step": 2236 }, { "epoch": 0.452742359846185, "grad_norm": 0.3319763243198395, "learning_rate": 0.00011496416258965015, "loss": 0.2438, "step": 2237 }, { "epoch": 0.45294474802671525, "grad_norm": 0.2652421295642853, "learning_rate": 0.00011490122967508219, "loss": 0.2521, "step": 2238 }, { "epoch": 0.4531471362072455, "grad_norm": 0.2711728811264038, "learning_rate": 0.00011483829072420415, "loss": 0.2422, "step": 2239 }, { "epoch": 0.45334952438777576, "grad_norm": 0.30874577164649963, "learning_rate": 0.00011477534576251196, "loss": 0.2459, "step": 2240 }, { "epoch": 0.453551912568306, "grad_norm": 0.2896255850791931, "learning_rate": 0.00011471239481550377, "loss": 0.2558, "step": 2241 }, { "epoch": 0.45375430074883627, "grad_norm": 0.341516375541687, "learning_rate": 0.00011464943790868033, "loss": 0.2424, "step": 2242 }, { "epoch": 0.4539566889293665, "grad_norm": 0.3559191823005676, "learning_rate": 0.00011458647506754467, "loss": 0.2793, "step": 2243 }, { "epoch": 0.4541590771098968, "grad_norm": 0.5574918985366821, "learning_rate": 0.00011452350631760233, "loss": 0.2446, "step": 2244 }, { "epoch": 0.45436146529042704, "grad_norm": 0.2864552140235901, "learning_rate": 0.00011446053168436117, "loss": 0.2379, "step": 2245 }, { "epoch": 0.4545638534709573, "grad_norm": 0.4059247076511383, "learning_rate": 0.00011439755119333147, "loss": 0.2486, "step": 2246 }, { "epoch": 0.45476624165148755, "grad_norm": 0.24590668082237244, "learning_rate": 0.00011433456487002587, "loss": 0.2239, "step": 2247 }, { "epoch": 0.4549686298320178, "grad_norm": 0.2575906217098236, "learning_rate": 0.00011427157273995939, "loss": 0.2181, "step": 2248 }, { "epoch": 0.45517101801254806, "grad_norm": 0.2416461855173111, "learning_rate": 0.00011420857482864935, "loss": 0.2208, "step": 2249 }, { "epoch": 0.4553734061930783, "grad_norm": 0.2669858932495117, "learning_rate": 0.00011414557116161551, "loss": 0.2617, "step": 2250 }, { "epoch": 0.4553734061930783, "eval_loss": 0.26571062207221985, "eval_runtime": 1.3218, "eval_samples_per_second": 3.783, "eval_steps_per_second": 0.757, "step": 2250 }, { "epoch": 0.45557579437360857, "grad_norm": 0.25440794229507446, "learning_rate": 0.00011408256176437988, "loss": 0.2271, "step": 2251 }, { "epoch": 0.4557781825541388, "grad_norm": 0.2872158885002136, "learning_rate": 0.00011401954666246679, "loss": 0.2556, "step": 2252 }, { "epoch": 0.4559805707346691, "grad_norm": 0.2983999252319336, "learning_rate": 0.00011395652588140291, "loss": 0.2461, "step": 2253 }, { "epoch": 0.45618295891519933, "grad_norm": 0.2660429775714874, "learning_rate": 0.00011389349944671723, "loss": 0.2337, "step": 2254 }, { "epoch": 0.4563853470957296, "grad_norm": 0.24313394725322723, "learning_rate": 0.00011383046738394101, "loss": 0.2195, "step": 2255 }, { "epoch": 0.45658773527625984, "grad_norm": 0.2963145673274994, "learning_rate": 0.00011376742971860774, "loss": 0.2476, "step": 2256 }, { "epoch": 0.4567901234567901, "grad_norm": 0.2715277373790741, "learning_rate": 0.00011370438647625326, "loss": 0.2624, "step": 2257 }, { "epoch": 0.45699251163732035, "grad_norm": 0.3594062626361847, "learning_rate": 0.00011364133768241564, "loss": 0.2666, "step": 2258 }, { "epoch": 0.45719489981785066, "grad_norm": 0.26948118209838867, "learning_rate": 0.00011357828336263514, "loss": 0.2115, "step": 2259 }, { "epoch": 0.4573972879983809, "grad_norm": 0.2981526553630829, "learning_rate": 0.0001135152235424544, "loss": 0.2375, "step": 2260 }, { "epoch": 0.4575996761789112, "grad_norm": 0.3454959988594055, "learning_rate": 0.00011345215824741812, "loss": 0.2439, "step": 2261 }, { "epoch": 0.45780206435944143, "grad_norm": 0.3687196671962738, "learning_rate": 0.00011338908750307333, "loss": 0.2711, "step": 2262 }, { "epoch": 0.4580044525399717, "grad_norm": 0.3129332661628723, "learning_rate": 0.00011332601133496922, "loss": 0.2437, "step": 2263 }, { "epoch": 0.45820684072050194, "grad_norm": 0.3113081455230713, "learning_rate": 0.00011326292976865722, "loss": 0.242, "step": 2264 }, { "epoch": 0.4584092289010322, "grad_norm": 0.24337448179721832, "learning_rate": 0.0001131998428296909, "loss": 0.2004, "step": 2265 }, { "epoch": 0.45861161708156245, "grad_norm": 0.3220856785774231, "learning_rate": 0.00011313675054362602, "loss": 0.2721, "step": 2266 }, { "epoch": 0.4588140052620927, "grad_norm": 0.3414178490638733, "learning_rate": 0.0001130736529360205, "loss": 0.2171, "step": 2267 }, { "epoch": 0.45901639344262296, "grad_norm": 0.24530169367790222, "learning_rate": 0.00011301055003243448, "loss": 0.247, "step": 2268 }, { "epoch": 0.4592187816231532, "grad_norm": 0.3847990036010742, "learning_rate": 0.00011294744185843014, "loss": 0.2887, "step": 2269 }, { "epoch": 0.45942116980368347, "grad_norm": 0.3693843483924866, "learning_rate": 0.00011288432843957186, "loss": 0.2287, "step": 2270 }, { "epoch": 0.4596235579842137, "grad_norm": 0.3055208921432495, "learning_rate": 0.00011282120980142615, "loss": 0.2635, "step": 2271 }, { "epoch": 0.459825946164744, "grad_norm": 0.31545400619506836, "learning_rate": 0.00011275808596956157, "loss": 0.2812, "step": 2272 }, { "epoch": 0.46002833434527424, "grad_norm": 0.29858043789863586, "learning_rate": 0.00011269495696954888, "loss": 0.2314, "step": 2273 }, { "epoch": 0.4602307225258045, "grad_norm": 0.27404287457466125, "learning_rate": 0.00011263182282696085, "loss": 0.2472, "step": 2274 }, { "epoch": 0.46043311070633475, "grad_norm": 0.25980135798454285, "learning_rate": 0.00011256868356737241, "loss": 0.2636, "step": 2275 }, { "epoch": 0.460635498886865, "grad_norm": 0.24390193819999695, "learning_rate": 0.00011250553921636047, "loss": 0.2511, "step": 2276 }, { "epoch": 0.46083788706739526, "grad_norm": 0.2550562024116516, "learning_rate": 0.00011244238979950405, "loss": 0.261, "step": 2277 }, { "epoch": 0.4610402752479255, "grad_norm": 0.33646664023399353, "learning_rate": 0.00011237923534238426, "loss": 0.2259, "step": 2278 }, { "epoch": 0.46124266342845577, "grad_norm": 0.25508198142051697, "learning_rate": 0.00011231607587058422, "loss": 0.2294, "step": 2279 }, { "epoch": 0.461445051608986, "grad_norm": 0.30878525972366333, "learning_rate": 0.00011225291140968903, "loss": 0.2416, "step": 2280 }, { "epoch": 0.4616474397895163, "grad_norm": 0.30404001474380493, "learning_rate": 0.0001121897419852859, "loss": 0.2409, "step": 2281 }, { "epoch": 0.46184982797004653, "grad_norm": 0.25172436237335205, "learning_rate": 0.000112126567622964, "loss": 0.248, "step": 2282 }, { "epoch": 0.4620522161505768, "grad_norm": 0.3161254823207855, "learning_rate": 0.00011206338834831447, "loss": 0.2536, "step": 2283 }, { "epoch": 0.46225460433110704, "grad_norm": 0.2523140609264374, "learning_rate": 0.00011200020418693056, "loss": 0.2296, "step": 2284 }, { "epoch": 0.4624569925116373, "grad_norm": 0.24689489603042603, "learning_rate": 0.00011193701516440734, "loss": 0.2302, "step": 2285 }, { "epoch": 0.46265938069216755, "grad_norm": 0.3138035237789154, "learning_rate": 0.00011187382130634199, "loss": 0.2816, "step": 2286 }, { "epoch": 0.4628617688726978, "grad_norm": 0.28723183274269104, "learning_rate": 0.00011181062263833351, "loss": 0.2331, "step": 2287 }, { "epoch": 0.46306415705322806, "grad_norm": 0.27657610177993774, "learning_rate": 0.00011174741918598299, "loss": 0.239, "step": 2288 }, { "epoch": 0.4632665452337584, "grad_norm": 0.4158143401145935, "learning_rate": 0.0001116842109748934, "loss": 0.2204, "step": 2289 }, { "epoch": 0.46346893341428863, "grad_norm": 0.2710595428943634, "learning_rate": 0.00011162099803066956, "loss": 0.2239, "step": 2290 }, { "epoch": 0.4636713215948189, "grad_norm": 0.2022787630558014, "learning_rate": 0.00011155778037891834, "loss": 0.1963, "step": 2291 }, { "epoch": 0.46387370977534914, "grad_norm": 0.22191934287548065, "learning_rate": 0.00011149455804524847, "loss": 0.2221, "step": 2292 }, { "epoch": 0.4640760979558794, "grad_norm": 0.4245707392692566, "learning_rate": 0.00011143133105527049, "loss": 0.2351, "step": 2293 }, { "epoch": 0.46427848613640965, "grad_norm": 0.30391061305999756, "learning_rate": 0.00011136809943459694, "loss": 0.2546, "step": 2294 }, { "epoch": 0.4644808743169399, "grad_norm": 0.3162023723125458, "learning_rate": 0.00011130486320884222, "loss": 0.2228, "step": 2295 }, { "epoch": 0.46468326249747016, "grad_norm": 0.2637988030910492, "learning_rate": 0.0001112416224036225, "loss": 0.2513, "step": 2296 }, { "epoch": 0.4648856506780004, "grad_norm": 0.4326387047767639, "learning_rate": 0.00011117837704455594, "loss": 0.2489, "step": 2297 }, { "epoch": 0.4650880388585307, "grad_norm": 0.3171291649341583, "learning_rate": 0.00011111512715726244, "loss": 0.2348, "step": 2298 }, { "epoch": 0.4652904270390609, "grad_norm": 0.3266473412513733, "learning_rate": 0.00011105187276736382, "loss": 0.2606, "step": 2299 }, { "epoch": 0.4654928152195912, "grad_norm": 0.24607928097248077, "learning_rate": 0.0001109886139004836, "loss": 0.2065, "step": 2300 }, { "epoch": 0.4654928152195912, "eval_loss": 0.2660582363605499, "eval_runtime": 1.3197, "eval_samples_per_second": 3.789, "eval_steps_per_second": 0.758, "step": 2300 }, { "epoch": 0.46569520340012144, "grad_norm": 0.2537308931350708, "learning_rate": 0.00011092535058224725, "loss": 0.2028, "step": 2301 }, { "epoch": 0.4658975915806517, "grad_norm": 0.2610647678375244, "learning_rate": 0.00011086208283828197, "loss": 0.2472, "step": 2302 }, { "epoch": 0.46609997976118195, "grad_norm": 0.2608964145183563, "learning_rate": 0.00011079881069421679, "loss": 0.2117, "step": 2303 }, { "epoch": 0.4663023679417122, "grad_norm": 0.2982208728790283, "learning_rate": 0.00011073553417568244, "loss": 0.233, "step": 2304 }, { "epoch": 0.46650475612224246, "grad_norm": 0.26081639528274536, "learning_rate": 0.0001106722533083115, "loss": 0.2242, "step": 2305 }, { "epoch": 0.4667071443027727, "grad_norm": 0.34071508049964905, "learning_rate": 0.00011060896811773838, "loss": 0.2585, "step": 2306 }, { "epoch": 0.46690953248330297, "grad_norm": 0.24649746716022491, "learning_rate": 0.00011054567862959899, "loss": 0.2354, "step": 2307 }, { "epoch": 0.4671119206638332, "grad_norm": 0.3163459002971649, "learning_rate": 0.00011048238486953131, "loss": 0.2613, "step": 2308 }, { "epoch": 0.4673143088443635, "grad_norm": 0.2794078290462494, "learning_rate": 0.00011041908686317479, "loss": 0.234, "step": 2309 }, { "epoch": 0.46751669702489373, "grad_norm": 0.26932984590530396, "learning_rate": 0.00011035578463617073, "loss": 0.2373, "step": 2310 }, { "epoch": 0.467719085205424, "grad_norm": 0.2641288936138153, "learning_rate": 0.00011029247821416207, "loss": 0.18, "step": 2311 }, { "epoch": 0.46792147338595425, "grad_norm": 0.3272799551486969, "learning_rate": 0.00011022916762279354, "loss": 0.2449, "step": 2312 }, { "epoch": 0.4681238615664845, "grad_norm": 0.2729485034942627, "learning_rate": 0.00011016585288771146, "loss": 0.2138, "step": 2313 }, { "epoch": 0.46832624974701476, "grad_norm": 0.19608184695243835, "learning_rate": 0.0001101025340345639, "loss": 0.1732, "step": 2314 }, { "epoch": 0.468528637927545, "grad_norm": 0.30385902523994446, "learning_rate": 0.00011003921108900052, "loss": 0.2801, "step": 2315 }, { "epoch": 0.46873102610807527, "grad_norm": 0.33474457263946533, "learning_rate": 0.00010997588407667278, "loss": 0.2877, "step": 2316 }, { "epoch": 0.4689334142886055, "grad_norm": 0.25006991624832153, "learning_rate": 0.0001099125530232336, "loss": 0.2329, "step": 2317 }, { "epoch": 0.4691358024691358, "grad_norm": 0.24891866743564606, "learning_rate": 0.0001098492179543377, "loss": 0.214, "step": 2318 }, { "epoch": 0.4693381906496661, "grad_norm": 0.27370598912239075, "learning_rate": 0.00010978587889564133, "loss": 0.2349, "step": 2319 }, { "epoch": 0.46954057883019634, "grad_norm": 0.25471097230911255, "learning_rate": 0.00010972253587280237, "loss": 0.2694, "step": 2320 }, { "epoch": 0.4697429670107266, "grad_norm": 0.3551804721355438, "learning_rate": 0.00010965918891148035, "loss": 0.2195, "step": 2321 }, { "epoch": 0.46994535519125685, "grad_norm": 0.3524259626865387, "learning_rate": 0.00010959583803733635, "loss": 0.2536, "step": 2322 }, { "epoch": 0.4701477433717871, "grad_norm": 0.24572840332984924, "learning_rate": 0.00010953248327603309, "loss": 0.2423, "step": 2323 }, { "epoch": 0.47035013155231736, "grad_norm": 0.26906469464302063, "learning_rate": 0.00010946912465323477, "loss": 0.2455, "step": 2324 }, { "epoch": 0.4705525197328476, "grad_norm": 0.2666884958744049, "learning_rate": 0.00010940576219460723, "loss": 0.2499, "step": 2325 }, { "epoch": 0.4707549079133779, "grad_norm": 0.28890320658683777, "learning_rate": 0.00010934239592581786, "loss": 0.2541, "step": 2326 }, { "epoch": 0.47095729609390813, "grad_norm": 0.264983594417572, "learning_rate": 0.00010927902587253558, "loss": 0.2309, "step": 2327 }, { "epoch": 0.4711596842744384, "grad_norm": 0.31703877449035645, "learning_rate": 0.00010921565206043083, "loss": 0.2636, "step": 2328 }, { "epoch": 0.47136207245496864, "grad_norm": 0.24766463041305542, "learning_rate": 0.00010915227451517555, "loss": 0.2197, "step": 2329 }, { "epoch": 0.4715644606354989, "grad_norm": 0.2647581100463867, "learning_rate": 0.00010908889326244331, "loss": 0.2279, "step": 2330 }, { "epoch": 0.47176684881602915, "grad_norm": 0.2700980603694916, "learning_rate": 0.00010902550832790899, "loss": 0.2245, "step": 2331 }, { "epoch": 0.4719692369965594, "grad_norm": 0.23651915788650513, "learning_rate": 0.00010896211973724919, "loss": 0.2207, "step": 2332 }, { "epoch": 0.47217162517708966, "grad_norm": 0.35579103231430054, "learning_rate": 0.00010889872751614176, "loss": 0.2429, "step": 2333 }, { "epoch": 0.4723740133576199, "grad_norm": 0.2522455155849457, "learning_rate": 0.0001088353316902662, "loss": 0.2072, "step": 2334 }, { "epoch": 0.47257640153815017, "grad_norm": 0.45010361075401306, "learning_rate": 0.00010877193228530335, "loss": 0.2411, "step": 2335 }, { "epoch": 0.4727787897186804, "grad_norm": 0.24297772347927094, "learning_rate": 0.00010870852932693565, "loss": 0.2151, "step": 2336 }, { "epoch": 0.4729811778992107, "grad_norm": 0.25942492485046387, "learning_rate": 0.00010864512284084676, "loss": 0.2249, "step": 2337 }, { "epoch": 0.47318356607974094, "grad_norm": 0.3297518789768219, "learning_rate": 0.00010858171285272194, "loss": 0.2566, "step": 2338 }, { "epoch": 0.4733859542602712, "grad_norm": 0.24841830134391785, "learning_rate": 0.00010851829938824783, "loss": 0.2499, "step": 2339 }, { "epoch": 0.47358834244080145, "grad_norm": 0.2740827202796936, "learning_rate": 0.00010845488247311249, "loss": 0.2265, "step": 2340 }, { "epoch": 0.4737907306213317, "grad_norm": 0.27901116013526917, "learning_rate": 0.00010839146213300526, "loss": 0.2379, "step": 2341 }, { "epoch": 0.47399311880186196, "grad_norm": 0.21914882957935333, "learning_rate": 0.00010832803839361704, "loss": 0.2279, "step": 2342 }, { "epoch": 0.4741955069823922, "grad_norm": 0.2769947052001953, "learning_rate": 0.00010826461128064004, "loss": 0.2171, "step": 2343 }, { "epoch": 0.47439789516292247, "grad_norm": 0.36490288376808167, "learning_rate": 0.00010820118081976777, "loss": 0.2754, "step": 2344 }, { "epoch": 0.4746002833434527, "grad_norm": 0.3090001344680786, "learning_rate": 0.0001081377470366952, "loss": 0.1875, "step": 2345 }, { "epoch": 0.474802671523983, "grad_norm": 0.2961217164993286, "learning_rate": 0.00010807430995711856, "loss": 0.2325, "step": 2346 }, { "epoch": 0.47500505970451323, "grad_norm": 0.2425258755683899, "learning_rate": 0.00010801086960673547, "loss": 0.2085, "step": 2347 }, { "epoch": 0.4752074478850435, "grad_norm": 0.3083477318286896, "learning_rate": 0.00010794742601124485, "loss": 0.2415, "step": 2348 }, { "epoch": 0.47540983606557374, "grad_norm": 0.3357618451118469, "learning_rate": 0.00010788397919634694, "loss": 0.2437, "step": 2349 }, { "epoch": 0.47561222424610405, "grad_norm": 0.32089847326278687, "learning_rate": 0.0001078205291877433, "loss": 0.1935, "step": 2350 }, { "epoch": 0.47561222424610405, "eval_loss": 0.2708786129951477, "eval_runtime": 1.3219, "eval_samples_per_second": 3.782, "eval_steps_per_second": 0.756, "step": 2350 }, { "epoch": 0.4758146124266343, "grad_norm": 0.25444650650024414, "learning_rate": 0.0001077570760111368, "loss": 0.2596, "step": 2351 }, { "epoch": 0.47601700060716456, "grad_norm": 0.2863706052303314, "learning_rate": 0.00010769361969223146, "loss": 0.2348, "step": 2352 }, { "epoch": 0.4762193887876948, "grad_norm": 0.24973492324352264, "learning_rate": 0.00010763016025673281, "loss": 0.2077, "step": 2353 }, { "epoch": 0.4764217769682251, "grad_norm": 0.2832881510257721, "learning_rate": 0.00010756669773034742, "loss": 0.263, "step": 2354 }, { "epoch": 0.47662416514875533, "grad_norm": 0.2901322841644287, "learning_rate": 0.00010750323213878319, "loss": 0.2186, "step": 2355 }, { "epoch": 0.4768265533292856, "grad_norm": 0.24156495928764343, "learning_rate": 0.00010743976350774936, "loss": 0.2334, "step": 2356 }, { "epoch": 0.47702894150981584, "grad_norm": 0.24589768052101135, "learning_rate": 0.00010737629186295622, "loss": 0.2657, "step": 2357 }, { "epoch": 0.4772313296903461, "grad_norm": 0.4988222122192383, "learning_rate": 0.00010731281723011541, "loss": 0.2644, "step": 2358 }, { "epoch": 0.47743371787087635, "grad_norm": 0.3501219153404236, "learning_rate": 0.00010724933963493978, "loss": 0.2703, "step": 2359 }, { "epoch": 0.4776361060514066, "grad_norm": 0.3000340163707733, "learning_rate": 0.00010718585910314332, "loss": 0.2651, "step": 2360 }, { "epoch": 0.47783849423193686, "grad_norm": 0.2568757236003876, "learning_rate": 0.00010712237566044119, "loss": 0.2095, "step": 2361 }, { "epoch": 0.4780408824124671, "grad_norm": 0.3020975589752197, "learning_rate": 0.00010705888933254984, "loss": 0.2128, "step": 2362 }, { "epoch": 0.47824327059299737, "grad_norm": 0.2676979601383209, "learning_rate": 0.00010699540014518679, "loss": 0.2301, "step": 2363 }, { "epoch": 0.4784456587735276, "grad_norm": 0.3454724848270416, "learning_rate": 0.00010693190812407077, "loss": 0.2603, "step": 2364 }, { "epoch": 0.4786480469540579, "grad_norm": 0.25040000677108765, "learning_rate": 0.00010686841329492158, "loss": 0.2464, "step": 2365 }, { "epoch": 0.47885043513458814, "grad_norm": 0.22993789613246918, "learning_rate": 0.00010680491568346029, "loss": 0.2011, "step": 2366 }, { "epoch": 0.4790528233151184, "grad_norm": 0.309632807970047, "learning_rate": 0.00010674141531540902, "loss": 0.2741, "step": 2367 }, { "epoch": 0.47925521149564865, "grad_norm": 0.2624116837978363, "learning_rate": 0.00010667791221649097, "loss": 0.2542, "step": 2368 }, { "epoch": 0.4794575996761789, "grad_norm": 0.274515837430954, "learning_rate": 0.00010661440641243048, "loss": 0.2663, "step": 2369 }, { "epoch": 0.47965998785670916, "grad_norm": 0.2694079875946045, "learning_rate": 0.00010655089792895304, "loss": 0.228, "step": 2370 }, { "epoch": 0.4798623760372394, "grad_norm": 0.2871139347553253, "learning_rate": 0.00010648738679178514, "loss": 0.2377, "step": 2371 }, { "epoch": 0.48006476421776967, "grad_norm": 0.2794390916824341, "learning_rate": 0.0001064238730266544, "loss": 0.267, "step": 2372 }, { "epoch": 0.4802671523982999, "grad_norm": 0.2174002230167389, "learning_rate": 0.00010636035665928946, "loss": 0.2168, "step": 2373 }, { "epoch": 0.4804695405788302, "grad_norm": 0.24901361763477325, "learning_rate": 0.00010629683771542008, "loss": 0.2263, "step": 2374 }, { "epoch": 0.48067192875936043, "grad_norm": 0.2867283523082733, "learning_rate": 0.00010623331622077702, "loss": 0.24, "step": 2375 }, { "epoch": 0.4808743169398907, "grad_norm": 0.5113086700439453, "learning_rate": 0.000106169792201092, "loss": 0.2276, "step": 2376 }, { "epoch": 0.48107670512042094, "grad_norm": 0.3070889115333557, "learning_rate": 0.00010610626568209799, "loss": 0.2327, "step": 2377 }, { "epoch": 0.4812790933009512, "grad_norm": 0.2681262195110321, "learning_rate": 0.00010604273668952871, "loss": 0.2106, "step": 2378 }, { "epoch": 0.48148148148148145, "grad_norm": 0.4238930642604828, "learning_rate": 0.000105979205249119, "loss": 0.22, "step": 2379 }, { "epoch": 0.48168386966201177, "grad_norm": 0.2735174894332886, "learning_rate": 0.00010591567138660474, "loss": 0.2425, "step": 2380 }, { "epoch": 0.481886257842542, "grad_norm": 0.2675718069076538, "learning_rate": 0.0001058521351277227, "loss": 0.2184, "step": 2381 }, { "epoch": 0.4820886460230723, "grad_norm": 0.2296140044927597, "learning_rate": 0.00010578859649821067, "loss": 0.2337, "step": 2382 }, { "epoch": 0.48229103420360253, "grad_norm": 0.23198723793029785, "learning_rate": 0.00010572505552380738, "loss": 0.2217, "step": 2383 }, { "epoch": 0.4824934223841328, "grad_norm": 0.2475014328956604, "learning_rate": 0.00010566151223025256, "loss": 0.2596, "step": 2384 }, { "epoch": 0.48269581056466304, "grad_norm": 0.2581985592842102, "learning_rate": 0.00010559796664328679, "loss": 0.2597, "step": 2385 }, { "epoch": 0.4828981987451933, "grad_norm": 0.22181877493858337, "learning_rate": 0.00010553441878865162, "loss": 0.2348, "step": 2386 }, { "epoch": 0.48310058692572355, "grad_norm": 0.29320740699768066, "learning_rate": 0.00010547086869208958, "loss": 0.2452, "step": 2387 }, { "epoch": 0.4833029751062538, "grad_norm": 0.3080987334251404, "learning_rate": 0.00010540731637934402, "loss": 0.2187, "step": 2388 }, { "epoch": 0.48350536328678406, "grad_norm": 0.3621383011341095, "learning_rate": 0.00010534376187615925, "loss": 0.2771, "step": 2389 }, { "epoch": 0.4837077514673143, "grad_norm": 0.274488627910614, "learning_rate": 0.00010528020520828036, "loss": 0.2449, "step": 2390 }, { "epoch": 0.4839101396478446, "grad_norm": 0.3163178861141205, "learning_rate": 0.00010521664640145353, "loss": 0.2643, "step": 2391 }, { "epoch": 0.48411252782837483, "grad_norm": 0.29728469252586365, "learning_rate": 0.00010515308548142553, "loss": 0.251, "step": 2392 }, { "epoch": 0.4843149160089051, "grad_norm": 0.28153446316719055, "learning_rate": 0.00010508952247394422, "loss": 0.2557, "step": 2393 }, { "epoch": 0.48451730418943534, "grad_norm": 0.2933938205242157, "learning_rate": 0.00010502595740475818, "loss": 0.287, "step": 2394 }, { "epoch": 0.4847196923699656, "grad_norm": 0.2985136806964874, "learning_rate": 0.0001049623902996169, "loss": 0.2555, "step": 2395 }, { "epoch": 0.48492208055049585, "grad_norm": 0.2639777958393097, "learning_rate": 0.00010489882118427062, "loss": 0.2353, "step": 2396 }, { "epoch": 0.4851244687310261, "grad_norm": 0.22662323713302612, "learning_rate": 0.0001048352500844704, "loss": 0.2271, "step": 2397 }, { "epoch": 0.48532685691155636, "grad_norm": 0.2585567831993103, "learning_rate": 0.00010477167702596817, "loss": 0.2281, "step": 2398 }, { "epoch": 0.4855292450920866, "grad_norm": 0.33381155133247375, "learning_rate": 0.00010470810203451665, "loss": 0.2328, "step": 2399 }, { "epoch": 0.48573163327261687, "grad_norm": 0.25043901801109314, "learning_rate": 0.00010464452513586922, "loss": 0.2459, "step": 2400 }, { "epoch": 0.48573163327261687, "eval_loss": 0.26652273535728455, "eval_runtime": 1.3211, "eval_samples_per_second": 3.785, "eval_steps_per_second": 0.757, "step": 2400 }, { "epoch": 0.4859340214531471, "grad_norm": 0.2609994411468506, "learning_rate": 0.00010458094635578022, "loss": 0.243, "step": 2401 }, { "epoch": 0.4861364096336774, "grad_norm": 0.24959850311279297, "learning_rate": 0.00010451736572000462, "loss": 0.2222, "step": 2402 }, { "epoch": 0.48633879781420764, "grad_norm": 0.24390028417110443, "learning_rate": 0.00010445378325429809, "loss": 0.2619, "step": 2403 }, { "epoch": 0.4865411859947379, "grad_norm": 0.3464578092098236, "learning_rate": 0.00010439019898441724, "loss": 0.2238, "step": 2404 }, { "epoch": 0.48674357417526815, "grad_norm": 0.2835236191749573, "learning_rate": 0.00010432661293611928, "loss": 0.238, "step": 2405 }, { "epoch": 0.4869459623557984, "grad_norm": 0.23934447765350342, "learning_rate": 0.00010426302513516209, "loss": 0.2145, "step": 2406 }, { "epoch": 0.48714835053632866, "grad_norm": 0.29843488335609436, "learning_rate": 0.00010419943560730439, "loss": 0.2163, "step": 2407 }, { "epoch": 0.4873507387168589, "grad_norm": 0.2832874059677124, "learning_rate": 0.00010413584437830555, "loss": 0.2276, "step": 2408 }, { "epoch": 0.48755312689738917, "grad_norm": 0.25486651062965393, "learning_rate": 0.00010407225147392555, "loss": 0.2264, "step": 2409 }, { "epoch": 0.4877555150779195, "grad_norm": 0.27106475830078125, "learning_rate": 0.00010400865691992517, "loss": 0.2359, "step": 2410 }, { "epoch": 0.48795790325844973, "grad_norm": 0.23776985704898834, "learning_rate": 0.00010394506074206578, "loss": 0.2384, "step": 2411 }, { "epoch": 0.48816029143898, "grad_norm": 0.26946938037872314, "learning_rate": 0.00010388146296610945, "loss": 0.2194, "step": 2412 }, { "epoch": 0.48836267961951024, "grad_norm": 0.26413363218307495, "learning_rate": 0.00010381786361781885, "loss": 0.257, "step": 2413 }, { "epoch": 0.4885650678000405, "grad_norm": 0.28128504753112793, "learning_rate": 0.00010375426272295735, "loss": 0.2632, "step": 2414 }, { "epoch": 0.48876745598057075, "grad_norm": 0.260922372341156, "learning_rate": 0.00010369066030728889, "loss": 0.2263, "step": 2415 }, { "epoch": 0.488969844161101, "grad_norm": 0.3633415400981903, "learning_rate": 0.00010362705639657806, "loss": 0.3208, "step": 2416 }, { "epoch": 0.48917223234163126, "grad_norm": 0.3404455780982971, "learning_rate": 0.00010356345101659001, "loss": 0.2559, "step": 2417 }, { "epoch": 0.4893746205221615, "grad_norm": 0.24845273792743683, "learning_rate": 0.00010349984419309056, "loss": 0.225, "step": 2418 }, { "epoch": 0.4895770087026918, "grad_norm": 0.226867213845253, "learning_rate": 0.00010343623595184608, "loss": 0.2399, "step": 2419 }, { "epoch": 0.48977939688322203, "grad_norm": 0.24961335957050323, "learning_rate": 0.00010337262631862348, "loss": 0.2365, "step": 2420 }, { "epoch": 0.4899817850637523, "grad_norm": 0.34672167897224426, "learning_rate": 0.00010330901531919026, "loss": 0.2489, "step": 2421 }, { "epoch": 0.49018417324428254, "grad_norm": 0.277829647064209, "learning_rate": 0.00010324540297931449, "loss": 0.2157, "step": 2422 }, { "epoch": 0.4903865614248128, "grad_norm": 0.21599017083644867, "learning_rate": 0.00010318178932476478, "loss": 0.198, "step": 2423 }, { "epoch": 0.49058894960534305, "grad_norm": 0.3362172842025757, "learning_rate": 0.00010311817438131021, "loss": 0.1774, "step": 2424 }, { "epoch": 0.4907913377858733, "grad_norm": 0.2567797601222992, "learning_rate": 0.00010305455817472052, "loss": 0.24, "step": 2425 }, { "epoch": 0.49099372596640356, "grad_norm": 0.33192771673202515, "learning_rate": 0.0001029909407307658, "loss": 0.2304, "step": 2426 }, { "epoch": 0.4911961141469338, "grad_norm": 0.29464447498321533, "learning_rate": 0.00010292732207521672, "loss": 0.2707, "step": 2427 }, { "epoch": 0.49139850232746407, "grad_norm": 0.35476332902908325, "learning_rate": 0.0001028637022338445, "loss": 0.2473, "step": 2428 }, { "epoch": 0.4916008905079943, "grad_norm": 0.2740033268928528, "learning_rate": 0.00010280008123242068, "loss": 0.2229, "step": 2429 }, { "epoch": 0.4918032786885246, "grad_norm": 0.2470201849937439, "learning_rate": 0.00010273645909671746, "loss": 0.2795, "step": 2430 }, { "epoch": 0.49200566686905484, "grad_norm": 0.2428119033575058, "learning_rate": 0.00010267283585250735, "loss": 0.2335, "step": 2431 }, { "epoch": 0.4922080550495851, "grad_norm": 0.35632118582725525, "learning_rate": 0.00010260921152556342, "loss": 0.2608, "step": 2432 }, { "epoch": 0.49241044323011535, "grad_norm": 0.2691579759120941, "learning_rate": 0.00010254558614165906, "loss": 0.2427, "step": 2433 }, { "epoch": 0.4926128314106456, "grad_norm": 0.2643824815750122, "learning_rate": 0.00010248195972656818, "loss": 0.2194, "step": 2434 }, { "epoch": 0.49281521959117586, "grad_norm": 0.2884403467178345, "learning_rate": 0.00010241833230606508, "loss": 0.2462, "step": 2435 }, { "epoch": 0.4930176077717061, "grad_norm": 0.23490701615810394, "learning_rate": 0.00010235470390592448, "loss": 0.2329, "step": 2436 }, { "epoch": 0.49321999595223637, "grad_norm": 0.2643316686153412, "learning_rate": 0.00010229107455192147, "loss": 0.26, "step": 2437 }, { "epoch": 0.4934223841327666, "grad_norm": 0.3046799898147583, "learning_rate": 0.00010222744426983153, "loss": 0.2519, "step": 2438 }, { "epoch": 0.4936247723132969, "grad_norm": 0.2748842239379883, "learning_rate": 0.00010216381308543057, "loss": 0.2389, "step": 2439 }, { "epoch": 0.49382716049382713, "grad_norm": 0.26580435037612915, "learning_rate": 0.00010210018102449477, "loss": 0.2539, "step": 2440 }, { "epoch": 0.49402954867435744, "grad_norm": 0.2602427303791046, "learning_rate": 0.00010203654811280075, "loss": 0.2279, "step": 2441 }, { "epoch": 0.4942319368548877, "grad_norm": 0.32396399974823, "learning_rate": 0.00010197291437612541, "loss": 0.2454, "step": 2442 }, { "epoch": 0.49443432503541795, "grad_norm": 0.23170936107635498, "learning_rate": 0.00010190927984024607, "loss": 0.2529, "step": 2443 }, { "epoch": 0.4946367132159482, "grad_norm": 0.373409628868103, "learning_rate": 0.00010184564453094028, "loss": 0.2788, "step": 2444 }, { "epoch": 0.49483910139647846, "grad_norm": 0.2425958663225174, "learning_rate": 0.00010178200847398594, "loss": 0.2385, "step": 2445 }, { "epoch": 0.4950414895770087, "grad_norm": 0.26942890882492065, "learning_rate": 0.00010171837169516128, "loss": 0.2154, "step": 2446 }, { "epoch": 0.495243877757539, "grad_norm": 0.3090429902076721, "learning_rate": 0.00010165473422024478, "loss": 0.2128, "step": 2447 }, { "epoch": 0.49544626593806923, "grad_norm": 0.29562005400657654, "learning_rate": 0.0001015910960750152, "loss": 0.2759, "step": 2448 }, { "epoch": 0.4956486541185995, "grad_norm": 0.3348391354084015, "learning_rate": 0.00010152745728525166, "loss": 0.2685, "step": 2449 }, { "epoch": 0.49585104229912974, "grad_norm": 0.2313123643398285, "learning_rate": 0.00010146381787673342, "loss": 0.2234, "step": 2450 }, { "epoch": 0.49585104229912974, "eval_loss": 0.266786128282547, "eval_runtime": 1.3185, "eval_samples_per_second": 3.792, "eval_steps_per_second": 0.758, "step": 2450 }, { "epoch": 0.49605343047966, "grad_norm": 0.25644999742507935, "learning_rate": 0.00010140017787524003, "loss": 0.2505, "step": 2451 }, { "epoch": 0.49625581866019025, "grad_norm": 0.2702362537384033, "learning_rate": 0.00010133653730655138, "loss": 0.2787, "step": 2452 }, { "epoch": 0.4964582068407205, "grad_norm": 0.29998981952667236, "learning_rate": 0.00010127289619644738, "loss": 0.2634, "step": 2453 }, { "epoch": 0.49666059502125076, "grad_norm": 0.2531276047229767, "learning_rate": 0.00010120925457070836, "loss": 0.218, "step": 2454 }, { "epoch": 0.496862983201781, "grad_norm": 0.2609771490097046, "learning_rate": 0.00010114561245511478, "loss": 0.2579, "step": 2455 }, { "epoch": 0.49706537138231127, "grad_norm": 0.2732056677341461, "learning_rate": 0.00010108196987544729, "loss": 0.2362, "step": 2456 }, { "epoch": 0.4972677595628415, "grad_norm": 0.5929343700408936, "learning_rate": 0.00010101832685748672, "loss": 0.2834, "step": 2457 }, { "epoch": 0.4974701477433718, "grad_norm": 0.2501356899738312, "learning_rate": 0.0001009546834270141, "loss": 0.2128, "step": 2458 }, { "epoch": 0.49767253592390204, "grad_norm": 0.27269136905670166, "learning_rate": 0.00010089103960981068, "loss": 0.2357, "step": 2459 }, { "epoch": 0.4978749241044323, "grad_norm": 0.24102306365966797, "learning_rate": 0.00010082739543165773, "loss": 0.2496, "step": 2460 }, { "epoch": 0.49807731228496255, "grad_norm": 0.31241747736930847, "learning_rate": 0.00010076375091833681, "loss": 0.2244, "step": 2461 }, { "epoch": 0.4982797004654928, "grad_norm": 0.20862232148647308, "learning_rate": 0.00010070010609562948, "loss": 0.1843, "step": 2462 }, { "epoch": 0.49848208864602306, "grad_norm": 0.2571997046470642, "learning_rate": 0.00010063646098931758, "loss": 0.2581, "step": 2463 }, { "epoch": 0.4986844768265533, "grad_norm": 0.33524224162101746, "learning_rate": 0.00010057281562518292, "loss": 0.25, "step": 2464 }, { "epoch": 0.49888686500708357, "grad_norm": 0.2212115079164505, "learning_rate": 0.0001005091700290075, "loss": 0.2103, "step": 2465 }, { "epoch": 0.4990892531876138, "grad_norm": 0.2426644265651703, "learning_rate": 0.00010044552422657338, "loss": 0.2242, "step": 2466 }, { "epoch": 0.4992916413681441, "grad_norm": 0.29241469502449036, "learning_rate": 0.00010038187824366274, "loss": 0.2127, "step": 2467 }, { "epoch": 0.49949402954867433, "grad_norm": 0.268191933631897, "learning_rate": 0.00010031823210605777, "loss": 0.2324, "step": 2468 }, { "epoch": 0.4996964177292046, "grad_norm": 0.2736126184463501, "learning_rate": 0.00010025458583954077, "loss": 0.2307, "step": 2469 }, { "epoch": 0.49989880590973484, "grad_norm": 0.2617895305156708, "learning_rate": 0.0001001909394698941, "loss": 0.2135, "step": 2470 }, { "epoch": 0.5001011940902651, "grad_norm": 0.3124042749404907, "learning_rate": 0.00010012729302290016, "loss": 0.2851, "step": 2471 }, { "epoch": 0.5003035822707954, "grad_norm": 0.2281905859708786, "learning_rate": 0.00010006364652434129, "loss": 0.2174, "step": 2472 }, { "epoch": 0.5005059704513256, "grad_norm": 0.29106268286705017, "learning_rate": 0.0001, "loss": 0.2651, "step": 2473 }, { "epoch": 0.5007083586318559, "grad_norm": 0.2818595767021179, "learning_rate": 9.993635347565872e-05, "loss": 0.2319, "step": 2474 }, { "epoch": 0.5009107468123861, "grad_norm": 0.30528661608695984, "learning_rate": 9.987270697709988e-05, "loss": 0.2653, "step": 2475 }, { "epoch": 0.5011131349929164, "grad_norm": 0.26907017827033997, "learning_rate": 9.980906053010592e-05, "loss": 0.2212, "step": 2476 }, { "epoch": 0.5013155231734466, "grad_norm": 0.2499941736459732, "learning_rate": 9.974541416045924e-05, "loss": 0.2105, "step": 2477 }, { "epoch": 0.5015179113539769, "grad_norm": 0.40199223160743713, "learning_rate": 9.968176789394225e-05, "loss": 0.2088, "step": 2478 }, { "epoch": 0.5017202995345071, "grad_norm": 0.23395894467830658, "learning_rate": 9.96181217563373e-05, "loss": 0.2416, "step": 2479 }, { "epoch": 0.5019226877150375, "grad_norm": 0.26265814900398254, "learning_rate": 9.955447577342665e-05, "loss": 0.2349, "step": 2480 }, { "epoch": 0.5021250758955677, "grad_norm": 0.24885410070419312, "learning_rate": 9.949082997099252e-05, "loss": 0.2341, "step": 2481 }, { "epoch": 0.502327464076098, "grad_norm": 0.3000493347644806, "learning_rate": 9.94271843748171e-05, "loss": 0.2141, "step": 2482 }, { "epoch": 0.5025298522566282, "grad_norm": 0.22744765877723694, "learning_rate": 9.936353901068246e-05, "loss": 0.2055, "step": 2483 }, { "epoch": 0.5027322404371585, "grad_norm": 0.5010953545570374, "learning_rate": 9.929989390437053e-05, "loss": 0.2309, "step": 2484 }, { "epoch": 0.5029346286176887, "grad_norm": 0.3712059259414673, "learning_rate": 9.923624908166322e-05, "loss": 0.2728, "step": 2485 }, { "epoch": 0.503137016798219, "grad_norm": 0.24506396055221558, "learning_rate": 9.917260456834229e-05, "loss": 0.2058, "step": 2486 }, { "epoch": 0.5033394049787493, "grad_norm": 0.31447944045066833, "learning_rate": 9.910896039018936e-05, "loss": 0.2426, "step": 2487 }, { "epoch": 0.5035417931592795, "grad_norm": 0.2355445921421051, "learning_rate": 9.904531657298591e-05, "loss": 0.224, "step": 2488 }, { "epoch": 0.5037441813398098, "grad_norm": 0.2530558109283447, "learning_rate": 9.898167314251328e-05, "loss": 0.2453, "step": 2489 }, { "epoch": 0.50394656952034, "grad_norm": 0.24408289790153503, "learning_rate": 9.891803012455276e-05, "loss": 0.2192, "step": 2490 }, { "epoch": 0.5041489577008703, "grad_norm": 0.2832314372062683, "learning_rate": 9.885438754488525e-05, "loss": 0.2505, "step": 2491 }, { "epoch": 0.5043513458814005, "grad_norm": 0.22832061350345612, "learning_rate": 9.879074542929167e-05, "loss": 0.2088, "step": 2492 }, { "epoch": 0.5045537340619308, "grad_norm": 0.24451425671577454, "learning_rate": 9.872710380355262e-05, "loss": 0.2396, "step": 2493 }, { "epoch": 0.504756122242461, "grad_norm": 0.2629949152469635, "learning_rate": 9.866346269344869e-05, "loss": 0.2276, "step": 2494 }, { "epoch": 0.5049585104229913, "grad_norm": 0.3229413628578186, "learning_rate": 9.859982212475999e-05, "loss": 0.2257, "step": 2495 }, { "epoch": 0.5051608986035215, "grad_norm": 0.26007333397865295, "learning_rate": 9.853618212326659e-05, "loss": 0.2365, "step": 2496 }, { "epoch": 0.5053632867840518, "grad_norm": 0.35556134581565857, "learning_rate": 9.847254271474833e-05, "loss": 0.2427, "step": 2497 }, { "epoch": 0.505565674964582, "grad_norm": 0.35611677169799805, "learning_rate": 9.840890392498481e-05, "loss": 0.216, "step": 2498 }, { "epoch": 0.5057680631451124, "grad_norm": 0.23794762790203094, "learning_rate": 9.834526577975524e-05, "loss": 0.2191, "step": 2499 }, { "epoch": 0.5059704513256426, "grad_norm": 0.30526167154312134, "learning_rate": 9.828162830483873e-05, "loss": 0.2678, "step": 2500 }, { "epoch": 0.5059704513256426, "eval_loss": 0.2628551721572876, "eval_runtime": 1.3152, "eval_samples_per_second": 3.802, "eval_steps_per_second": 0.76, "step": 2500 }, { "epoch": 0.5061728395061729, "grad_norm": 0.3397901654243469, "learning_rate": 9.821799152601411e-05, "loss": 0.2451, "step": 2501 }, { "epoch": 0.5063752276867031, "grad_norm": 0.27359530329704285, "learning_rate": 9.815435546905974e-05, "loss": 0.2596, "step": 2502 }, { "epoch": 0.5065776158672334, "grad_norm": 0.28169986605644226, "learning_rate": 9.809072015975395e-05, "loss": 0.2121, "step": 2503 }, { "epoch": 0.5067800040477636, "grad_norm": 0.33974123001098633, "learning_rate": 9.802708562387458e-05, "loss": 0.2509, "step": 2504 }, { "epoch": 0.5069823922282939, "grad_norm": 0.2948351204395294, "learning_rate": 9.79634518871993e-05, "loss": 0.255, "step": 2505 }, { "epoch": 0.5071847804088241, "grad_norm": 0.2774645686149597, "learning_rate": 9.789981897550526e-05, "loss": 0.2535, "step": 2506 }, { "epoch": 0.5073871685893544, "grad_norm": 0.26226383447647095, "learning_rate": 9.783618691456945e-05, "loss": 0.24, "step": 2507 }, { "epoch": 0.5075895567698846, "grad_norm": 0.2508104145526886, "learning_rate": 9.777255573016847e-05, "loss": 0.2065, "step": 2508 }, { "epoch": 0.5077919449504149, "grad_norm": 0.26128527522087097, "learning_rate": 9.770892544807856e-05, "loss": 0.2447, "step": 2509 }, { "epoch": 0.5079943331309451, "grad_norm": 0.27602192759513855, "learning_rate": 9.764529609407553e-05, "loss": 0.2599, "step": 2510 }, { "epoch": 0.5081967213114754, "grad_norm": 0.2233620584011078, "learning_rate": 9.758166769393493e-05, "loss": 0.2149, "step": 2511 }, { "epoch": 0.5083991094920056, "grad_norm": 0.22100715339183807, "learning_rate": 9.751804027343186e-05, "loss": 0.2025, "step": 2512 }, { "epoch": 0.5086014976725359, "grad_norm": 0.2950727343559265, "learning_rate": 9.745441385834097e-05, "loss": 0.2391, "step": 2513 }, { "epoch": 0.5088038858530661, "grad_norm": 0.22906029224395752, "learning_rate": 9.739078847443661e-05, "loss": 0.2281, "step": 2514 }, { "epoch": 0.5090062740335964, "grad_norm": 0.22737666964530945, "learning_rate": 9.732716414749265e-05, "loss": 0.2202, "step": 2515 }, { "epoch": 0.5092086622141268, "grad_norm": 0.35784223675727844, "learning_rate": 9.726354090328257e-05, "loss": 0.2472, "step": 2516 }, { "epoch": 0.509411050394657, "grad_norm": 0.20221096277236938, "learning_rate": 9.719991876757934e-05, "loss": 0.1826, "step": 2517 }, { "epoch": 0.5096134385751873, "grad_norm": 0.2307714968919754, "learning_rate": 9.713629776615554e-05, "loss": 0.2347, "step": 2518 }, { "epoch": 0.5098158267557175, "grad_norm": 0.25789138674736023, "learning_rate": 9.707267792478328e-05, "loss": 0.2542, "step": 2519 }, { "epoch": 0.5100182149362478, "grad_norm": 0.2408142387866974, "learning_rate": 9.700905926923423e-05, "loss": 0.2352, "step": 2520 }, { "epoch": 0.510220603116778, "grad_norm": 0.2210666835308075, "learning_rate": 9.694544182527952e-05, "loss": 0.1688, "step": 2521 }, { "epoch": 0.5104229912973083, "grad_norm": 0.3379744589328766, "learning_rate": 9.68818256186898e-05, "loss": 0.2047, "step": 2522 }, { "epoch": 0.5106253794778385, "grad_norm": 0.29697558283805847, "learning_rate": 9.681821067523527e-05, "loss": 0.2324, "step": 2523 }, { "epoch": 0.5108277676583688, "grad_norm": 0.26220622658729553, "learning_rate": 9.675459702068554e-05, "loss": 0.2361, "step": 2524 }, { "epoch": 0.511030155838899, "grad_norm": 0.2157612144947052, "learning_rate": 9.669098468080976e-05, "loss": 0.1854, "step": 2525 }, { "epoch": 0.5112325440194293, "grad_norm": 0.22162605822086334, "learning_rate": 9.662737368137654e-05, "loss": 0.2025, "step": 2526 }, { "epoch": 0.5114349321999595, "grad_norm": 0.22280775010585785, "learning_rate": 9.656376404815395e-05, "loss": 0.2163, "step": 2527 }, { "epoch": 0.5116373203804898, "grad_norm": 0.27487877011299133, "learning_rate": 9.650015580690945e-05, "loss": 0.2777, "step": 2528 }, { "epoch": 0.51183970856102, "grad_norm": 0.26948386430740356, "learning_rate": 9.643654898341e-05, "loss": 0.2123, "step": 2529 }, { "epoch": 0.5120420967415503, "grad_norm": 0.27343207597732544, "learning_rate": 9.637294360342197e-05, "loss": 0.2386, "step": 2530 }, { "epoch": 0.5122444849220805, "grad_norm": 0.2726834714412689, "learning_rate": 9.630933969271114e-05, "loss": 0.233, "step": 2531 }, { "epoch": 0.5124468731026108, "grad_norm": 0.2700757086277008, "learning_rate": 9.624573727704268e-05, "loss": 0.2451, "step": 2532 }, { "epoch": 0.512649261283141, "grad_norm": 0.3605319857597351, "learning_rate": 9.618213638218116e-05, "loss": 0.2789, "step": 2533 }, { "epoch": 0.5128516494636713, "grad_norm": 0.2853947579860687, "learning_rate": 9.61185370338906e-05, "loss": 0.2738, "step": 2534 }, { "epoch": 0.5130540376442015, "grad_norm": 0.30760255455970764, "learning_rate": 9.605493925793425e-05, "loss": 0.2489, "step": 2535 }, { "epoch": 0.5132564258247319, "grad_norm": 0.3698234260082245, "learning_rate": 9.599134308007486e-05, "loss": 0.2707, "step": 2536 }, { "epoch": 0.513458814005262, "grad_norm": 0.23096823692321777, "learning_rate": 9.592774852607445e-05, "loss": 0.2151, "step": 2537 }, { "epoch": 0.5136612021857924, "grad_norm": 0.26622629165649414, "learning_rate": 9.58641556216945e-05, "loss": 0.256, "step": 2538 }, { "epoch": 0.5138635903663226, "grad_norm": 0.2816718518733978, "learning_rate": 9.580056439269564e-05, "loss": 0.2308, "step": 2539 }, { "epoch": 0.5140659785468529, "grad_norm": 0.22424639761447906, "learning_rate": 9.573697486483794e-05, "loss": 0.2153, "step": 2540 }, { "epoch": 0.5142683667273831, "grad_norm": 0.26229122281074524, "learning_rate": 9.567338706388073e-05, "loss": 0.2493, "step": 2541 }, { "epoch": 0.5144707549079134, "grad_norm": 0.2750069499015808, "learning_rate": 9.560980101558279e-05, "loss": 0.2464, "step": 2542 }, { "epoch": 0.5146731430884436, "grad_norm": 0.3223314881324768, "learning_rate": 9.554621674570194e-05, "loss": 0.2259, "step": 2543 }, { "epoch": 0.5148755312689739, "grad_norm": 0.23399107158184052, "learning_rate": 9.548263427999542e-05, "loss": 0.2135, "step": 2544 }, { "epoch": 0.5150779194495041, "grad_norm": 0.30518293380737305, "learning_rate": 9.541905364421978e-05, "loss": 0.2909, "step": 2545 }, { "epoch": 0.5152803076300344, "grad_norm": 0.2462664246559143, "learning_rate": 9.53554748641308e-05, "loss": 0.2226, "step": 2546 }, { "epoch": 0.5154826958105647, "grad_norm": 0.26325687766075134, "learning_rate": 9.529189796548338e-05, "loss": 0.2577, "step": 2547 }, { "epoch": 0.5156850839910949, "grad_norm": 0.2973167896270752, "learning_rate": 9.522832297403183e-05, "loss": 0.2815, "step": 2548 }, { "epoch": 0.5158874721716252, "grad_norm": 0.3428213894367218, "learning_rate": 9.516474991552964e-05, "loss": 0.2806, "step": 2549 }, { "epoch": 0.5160898603521554, "grad_norm": 0.23133951425552368, "learning_rate": 9.510117881572942e-05, "loss": 0.176, "step": 2550 }, { "epoch": 0.5160898603521554, "eval_loss": 0.26222530007362366, "eval_runtime": 1.3171, "eval_samples_per_second": 3.796, "eval_steps_per_second": 0.759, "step": 2550 }, { "epoch": 0.5162922485326857, "grad_norm": 0.27165916562080383, "learning_rate": 9.503760970038312e-05, "loss": 0.247, "step": 2551 }, { "epoch": 0.5164946367132159, "grad_norm": 0.27054649591445923, "learning_rate": 9.497404259524182e-05, "loss": 0.249, "step": 2552 }, { "epoch": 0.5166970248937462, "grad_norm": 0.3385717272758484, "learning_rate": 9.491047752605583e-05, "loss": 0.2855, "step": 2553 }, { "epoch": 0.5168994130742764, "grad_norm": 0.25887152552604675, "learning_rate": 9.48469145185745e-05, "loss": 0.2477, "step": 2554 }, { "epoch": 0.5171018012548068, "grad_norm": 0.26196324825286865, "learning_rate": 9.478335359854651e-05, "loss": 0.2303, "step": 2555 }, { "epoch": 0.517304189435337, "grad_norm": 0.2598777413368225, "learning_rate": 9.471979479171963e-05, "loss": 0.2287, "step": 2556 }, { "epoch": 0.5175065776158673, "grad_norm": 0.29140013456344604, "learning_rate": 9.465623812384079e-05, "loss": 0.2059, "step": 2557 }, { "epoch": 0.5177089657963975, "grad_norm": 0.3534790575504303, "learning_rate": 9.459268362065599e-05, "loss": 0.2079, "step": 2558 }, { "epoch": 0.5179113539769278, "grad_norm": 0.3243381083011627, "learning_rate": 9.452913130791043e-05, "loss": 0.206, "step": 2559 }, { "epoch": 0.518113742157458, "grad_norm": 0.2610268294811249, "learning_rate": 9.44655812113484e-05, "loss": 0.2284, "step": 2560 }, { "epoch": 0.5183161303379883, "grad_norm": 0.2984575927257538, "learning_rate": 9.440203335671323e-05, "loss": 0.2374, "step": 2561 }, { "epoch": 0.5185185185185185, "grad_norm": 0.2842339277267456, "learning_rate": 9.433848776974746e-05, "loss": 0.2556, "step": 2562 }, { "epoch": 0.5187209066990488, "grad_norm": 0.27156350016593933, "learning_rate": 9.427494447619262e-05, "loss": 0.2838, "step": 2563 }, { "epoch": 0.518923294879579, "grad_norm": 0.27013957500457764, "learning_rate": 9.421140350178935e-05, "loss": 0.2396, "step": 2564 }, { "epoch": 0.5191256830601093, "grad_norm": 0.2426198571920395, "learning_rate": 9.414786487227733e-05, "loss": 0.1955, "step": 2565 }, { "epoch": 0.5193280712406395, "grad_norm": 0.3867042362689972, "learning_rate": 9.408432861339527e-05, "loss": 0.2379, "step": 2566 }, { "epoch": 0.5195304594211698, "grad_norm": 0.2537308931350708, "learning_rate": 9.4020794750881e-05, "loss": 0.2451, "step": 2567 }, { "epoch": 0.5197328476017, "grad_norm": 0.28788721561431885, "learning_rate": 9.395726331047134e-05, "loss": 0.2406, "step": 2568 }, { "epoch": 0.5199352357822303, "grad_norm": 0.29327020049095154, "learning_rate": 9.389373431790203e-05, "loss": 0.2334, "step": 2569 }, { "epoch": 0.5201376239627605, "grad_norm": 0.2516320049762726, "learning_rate": 9.383020779890799e-05, "loss": 0.2351, "step": 2570 }, { "epoch": 0.5203400121432908, "grad_norm": 0.2964017391204834, "learning_rate": 9.376668377922303e-05, "loss": 0.2665, "step": 2571 }, { "epoch": 0.520542400323821, "grad_norm": 0.31455114483833313, "learning_rate": 9.370316228457995e-05, "loss": 0.2584, "step": 2572 }, { "epoch": 0.5207447885043514, "grad_norm": 0.23788169026374817, "learning_rate": 9.363964334071055e-05, "loss": 0.2326, "step": 2573 }, { "epoch": 0.5209471766848816, "grad_norm": 0.2252766191959381, "learning_rate": 9.357612697334563e-05, "loss": 0.1959, "step": 2574 }, { "epoch": 0.5211495648654119, "grad_norm": 0.2241242229938507, "learning_rate": 9.35126132082149e-05, "loss": 0.1749, "step": 2575 }, { "epoch": 0.5213519530459421, "grad_norm": 0.2873989939689636, "learning_rate": 9.3449102071047e-05, "loss": 0.2489, "step": 2576 }, { "epoch": 0.5215543412264724, "grad_norm": 0.28411445021629333, "learning_rate": 9.338559358756955e-05, "loss": 0.2576, "step": 2577 }, { "epoch": 0.5217567294070027, "grad_norm": 0.1939312070608139, "learning_rate": 9.332208778350906e-05, "loss": 0.2225, "step": 2578 }, { "epoch": 0.5219591175875329, "grad_norm": 0.2737818658351898, "learning_rate": 9.325858468459102e-05, "loss": 0.2358, "step": 2579 }, { "epoch": 0.5221615057680632, "grad_norm": 0.24766233563423157, "learning_rate": 9.319508431653972e-05, "loss": 0.2356, "step": 2580 }, { "epoch": 0.5223638939485934, "grad_norm": 0.23636049032211304, "learning_rate": 9.313158670507843e-05, "loss": 0.2391, "step": 2581 }, { "epoch": 0.5225662821291237, "grad_norm": 0.3216640055179596, "learning_rate": 9.306809187592929e-05, "loss": 0.2634, "step": 2582 }, { "epoch": 0.5227686703096539, "grad_norm": 0.2212718278169632, "learning_rate": 9.300459985481325e-05, "loss": 0.1996, "step": 2583 }, { "epoch": 0.5229710584901842, "grad_norm": 0.25759443640708923, "learning_rate": 9.29411106674502e-05, "loss": 0.2182, "step": 2584 }, { "epoch": 0.5231734466707144, "grad_norm": 0.2374274730682373, "learning_rate": 9.28776243395588e-05, "loss": 0.2294, "step": 2585 }, { "epoch": 0.5233758348512447, "grad_norm": 0.277119517326355, "learning_rate": 9.281414089685673e-05, "loss": 0.2209, "step": 2586 }, { "epoch": 0.5235782230317749, "grad_norm": 0.23173052072525024, "learning_rate": 9.275066036506025e-05, "loss": 0.2327, "step": 2587 }, { "epoch": 0.5237806112123052, "grad_norm": 0.3074660301208496, "learning_rate": 9.26871827698846e-05, "loss": 0.2791, "step": 2588 }, { "epoch": 0.5239829993928354, "grad_norm": 0.25061342120170593, "learning_rate": 9.262370813704378e-05, "loss": 0.2573, "step": 2589 }, { "epoch": 0.5241853875733657, "grad_norm": 0.25158825516700745, "learning_rate": 9.256023649225069e-05, "loss": 0.2212, "step": 2590 }, { "epoch": 0.524387775753896, "grad_norm": 0.2625894248485565, "learning_rate": 9.249676786121682e-05, "loss": 0.2184, "step": 2591 }, { "epoch": 0.5245901639344263, "grad_norm": 0.3058525323867798, "learning_rate": 9.24333022696526e-05, "loss": 0.264, "step": 2592 }, { "epoch": 0.5247925521149565, "grad_norm": 0.27265241742134094, "learning_rate": 9.23698397432672e-05, "loss": 0.2466, "step": 2593 }, { "epoch": 0.5249949402954868, "grad_norm": 0.2680499851703644, "learning_rate": 9.230638030776856e-05, "loss": 0.2336, "step": 2594 }, { "epoch": 0.525197328476017, "grad_norm": 0.2301790565252304, "learning_rate": 9.224292398886323e-05, "loss": 0.222, "step": 2595 }, { "epoch": 0.5253997166565473, "grad_norm": 0.23603658378124237, "learning_rate": 9.217947081225668e-05, "loss": 0.2064, "step": 2596 }, { "epoch": 0.5256021048370775, "grad_norm": 0.2795204222202301, "learning_rate": 9.21160208036531e-05, "loss": 0.2073, "step": 2597 }, { "epoch": 0.5258044930176078, "grad_norm": 0.25146758556365967, "learning_rate": 9.205257398875516e-05, "loss": 0.2198, "step": 2598 }, { "epoch": 0.526006881198138, "grad_norm": 0.27271968126296997, "learning_rate": 9.198913039326455e-05, "loss": 0.2366, "step": 2599 }, { "epoch": 0.5262092693786683, "grad_norm": 0.2846743166446686, "learning_rate": 9.192569004288145e-05, "loss": 0.2581, "step": 2600 }, { "epoch": 0.5262092693786683, "eval_loss": 0.26670295000076294, "eval_runtime": 1.3247, "eval_samples_per_second": 3.774, "eval_steps_per_second": 0.755, "step": 2600 }, { "epoch": 0.5264116575591985, "grad_norm": 0.2918645441532135, "learning_rate": 9.186225296330485e-05, "loss": 0.221, "step": 2601 }, { "epoch": 0.5266140457397288, "grad_norm": 0.26365551352500916, "learning_rate": 9.179881918023224e-05, "loss": 0.2465, "step": 2602 }, { "epoch": 0.526816433920259, "grad_norm": 0.32395076751708984, "learning_rate": 9.173538871935997e-05, "loss": 0.2537, "step": 2603 }, { "epoch": 0.5270188221007893, "grad_norm": 0.2926045060157776, "learning_rate": 9.167196160638295e-05, "loss": 0.2416, "step": 2604 }, { "epoch": 0.5272212102813195, "grad_norm": 0.24190780520439148, "learning_rate": 9.160853786699475e-05, "loss": 0.2376, "step": 2605 }, { "epoch": 0.5274235984618498, "grad_norm": 0.2892686426639557, "learning_rate": 9.154511752688755e-05, "loss": 0.2506, "step": 2606 }, { "epoch": 0.5276259866423801, "grad_norm": 0.2567702829837799, "learning_rate": 9.148170061175217e-05, "loss": 0.2191, "step": 2607 }, { "epoch": 0.5278283748229103, "grad_norm": 0.2689024806022644, "learning_rate": 9.141828714727808e-05, "loss": 0.2196, "step": 2608 }, { "epoch": 0.5280307630034407, "grad_norm": 0.2794611155986786, "learning_rate": 9.135487715915326e-05, "loss": 0.2264, "step": 2609 }, { "epoch": 0.5282331511839709, "grad_norm": 0.2361474633216858, "learning_rate": 9.129147067306438e-05, "loss": 0.2368, "step": 2610 }, { "epoch": 0.5284355393645012, "grad_norm": 0.36901190876960754, "learning_rate": 9.122806771469663e-05, "loss": 0.2528, "step": 2611 }, { "epoch": 0.5286379275450314, "grad_norm": 0.29343241453170776, "learning_rate": 9.116466830973383e-05, "loss": 0.2411, "step": 2612 }, { "epoch": 0.5288403157255617, "grad_norm": 0.29200562834739685, "learning_rate": 9.110127248385825e-05, "loss": 0.2377, "step": 2613 }, { "epoch": 0.5290427039060919, "grad_norm": 0.265852689743042, "learning_rate": 9.103788026275084e-05, "loss": 0.1874, "step": 2614 }, { "epoch": 0.5292450920866222, "grad_norm": 0.22898375988006592, "learning_rate": 9.0974491672091e-05, "loss": 0.2204, "step": 2615 }, { "epoch": 0.5294474802671524, "grad_norm": 0.33142927289009094, "learning_rate": 9.091110673755672e-05, "loss": 0.2404, "step": 2616 }, { "epoch": 0.5296498684476827, "grad_norm": 0.25226548314094543, "learning_rate": 9.084772548482447e-05, "loss": 0.2157, "step": 2617 }, { "epoch": 0.5298522566282129, "grad_norm": 0.23224274814128876, "learning_rate": 9.078434793956921e-05, "loss": 0.2125, "step": 2618 }, { "epoch": 0.5300546448087432, "grad_norm": 0.2816801071166992, "learning_rate": 9.072097412746446e-05, "loss": 0.2249, "step": 2619 }, { "epoch": 0.5302570329892734, "grad_norm": 0.2431764304637909, "learning_rate": 9.065760407418216e-05, "loss": 0.2265, "step": 2620 }, { "epoch": 0.5304594211698037, "grad_norm": 0.22613844275474548, "learning_rate": 9.059423780539279e-05, "loss": 0.2434, "step": 2621 }, { "epoch": 0.5306618093503339, "grad_norm": 0.27943679690361023, "learning_rate": 9.053087534676525e-05, "loss": 0.2546, "step": 2622 }, { "epoch": 0.5308641975308642, "grad_norm": 0.2138296365737915, "learning_rate": 9.046751672396695e-05, "loss": 0.1956, "step": 2623 }, { "epoch": 0.5310665857113944, "grad_norm": 0.2680979073047638, "learning_rate": 9.040416196266367e-05, "loss": 0.2433, "step": 2624 }, { "epoch": 0.5312689738919247, "grad_norm": 0.2775616943836212, "learning_rate": 9.034081108851968e-05, "loss": 0.1971, "step": 2625 }, { "epoch": 0.5314713620724549, "grad_norm": 0.3357136845588684, "learning_rate": 9.027746412719764e-05, "loss": 0.2155, "step": 2626 }, { "epoch": 0.5316737502529852, "grad_norm": 0.2998809516429901, "learning_rate": 9.021412110435872e-05, "loss": 0.2666, "step": 2627 }, { "epoch": 0.5318761384335154, "grad_norm": 0.22077873349189758, "learning_rate": 9.015078204566233e-05, "loss": 0.2365, "step": 2628 }, { "epoch": 0.5320785266140458, "grad_norm": 0.25876784324645996, "learning_rate": 9.008744697676641e-05, "loss": 0.225, "step": 2629 }, { "epoch": 0.532280914794576, "grad_norm": 0.30391210317611694, "learning_rate": 9.002411592332723e-05, "loss": 0.2718, "step": 2630 }, { "epoch": 0.5324833029751063, "grad_norm": 0.24762603640556335, "learning_rate": 8.996078891099949e-05, "loss": 0.2257, "step": 2631 }, { "epoch": 0.5326856911556365, "grad_norm": 0.28124773502349854, "learning_rate": 8.989746596543613e-05, "loss": 0.2833, "step": 2632 }, { "epoch": 0.5328880793361668, "grad_norm": 0.31629490852355957, "learning_rate": 8.983414711228853e-05, "loss": 0.2209, "step": 2633 }, { "epoch": 0.533090467516697, "grad_norm": 0.24589718878269196, "learning_rate": 8.97708323772065e-05, "loss": 0.2359, "step": 2634 }, { "epoch": 0.5332928556972273, "grad_norm": 0.21564151346683502, "learning_rate": 8.970752178583794e-05, "loss": 0.227, "step": 2635 }, { "epoch": 0.5334952438777575, "grad_norm": 0.24084565043449402, "learning_rate": 8.964421536382928e-05, "loss": 0.2057, "step": 2636 }, { "epoch": 0.5336976320582878, "grad_norm": 0.29167628288269043, "learning_rate": 8.95809131368252e-05, "loss": 0.2374, "step": 2637 }, { "epoch": 0.5339000202388181, "grad_norm": 0.251741498708725, "learning_rate": 8.951761513046874e-05, "loss": 0.2391, "step": 2638 }, { "epoch": 0.5341024084193483, "grad_norm": 0.2519427239894867, "learning_rate": 8.945432137040102e-05, "loss": 0.2191, "step": 2639 }, { "epoch": 0.5343047965998786, "grad_norm": 0.2869488596916199, "learning_rate": 8.939103188226165e-05, "loss": 0.2454, "step": 2640 }, { "epoch": 0.5345071847804088, "grad_norm": 0.3657805919647217, "learning_rate": 8.932774669168848e-05, "loss": 0.243, "step": 2641 }, { "epoch": 0.5347095729609391, "grad_norm": 0.22538717091083527, "learning_rate": 8.926446582431761e-05, "loss": 0.2048, "step": 2642 }, { "epoch": 0.5349119611414693, "grad_norm": 0.3124086558818817, "learning_rate": 8.920118930578325e-05, "loss": 0.2225, "step": 2643 }, { "epoch": 0.5351143493219996, "grad_norm": 0.36256417632102966, "learning_rate": 8.913791716171804e-05, "loss": 0.235, "step": 2644 }, { "epoch": 0.5353167375025298, "grad_norm": 0.31242308020591736, "learning_rate": 8.90746494177528e-05, "loss": 0.2642, "step": 2645 }, { "epoch": 0.5355191256830601, "grad_norm": 0.3332633972167969, "learning_rate": 8.901138609951642e-05, "loss": 0.2465, "step": 2646 }, { "epoch": 0.5357215138635903, "grad_norm": 0.28270483016967773, "learning_rate": 8.894812723263621e-05, "loss": 0.2449, "step": 2647 }, { "epoch": 0.5359239020441207, "grad_norm": 0.309767484664917, "learning_rate": 8.888487284273757e-05, "loss": 0.252, "step": 2648 }, { "epoch": 0.5361262902246509, "grad_norm": 0.2607302665710449, "learning_rate": 8.882162295544409e-05, "loss": 0.2252, "step": 2649 }, { "epoch": 0.5363286784051812, "grad_norm": 0.32570475339889526, "learning_rate": 8.875837759637751e-05, "loss": 0.2265, "step": 2650 }, { "epoch": 0.5363286784051812, "eval_loss": 0.26630160212516785, "eval_runtime": 1.3181, "eval_samples_per_second": 3.793, "eval_steps_per_second": 0.759, "step": 2650 }, { "epoch": 0.5365310665857114, "grad_norm": 0.30641040205955505, "learning_rate": 8.86951367911578e-05, "loss": 0.2511, "step": 2651 }, { "epoch": 0.5367334547662417, "grad_norm": 0.2911195456981659, "learning_rate": 8.863190056540307e-05, "loss": 0.2319, "step": 2652 }, { "epoch": 0.5369358429467719, "grad_norm": 0.29902341961860657, "learning_rate": 8.856866894472953e-05, "loss": 0.2349, "step": 2653 }, { "epoch": 0.5371382311273022, "grad_norm": 0.3100128471851349, "learning_rate": 8.850544195475157e-05, "loss": 0.2389, "step": 2654 }, { "epoch": 0.5373406193078324, "grad_norm": 0.25454381108283997, "learning_rate": 8.844221962108166e-05, "loss": 0.2173, "step": 2655 }, { "epoch": 0.5375430074883627, "grad_norm": 0.2565534710884094, "learning_rate": 8.837900196933045e-05, "loss": 0.2098, "step": 2656 }, { "epoch": 0.5377453956688929, "grad_norm": 0.23675772547721863, "learning_rate": 8.831578902510663e-05, "loss": 0.2201, "step": 2657 }, { "epoch": 0.5379477838494232, "grad_norm": 0.21477952599525452, "learning_rate": 8.825258081401702e-05, "loss": 0.2422, "step": 2658 }, { "epoch": 0.5381501720299534, "grad_norm": 0.28204089403152466, "learning_rate": 8.81893773616665e-05, "loss": 0.2367, "step": 2659 }, { "epoch": 0.5383525602104837, "grad_norm": 0.26159507036209106, "learning_rate": 8.812617869365806e-05, "loss": 0.2614, "step": 2660 }, { "epoch": 0.5385549483910139, "grad_norm": 0.47747549414634705, "learning_rate": 8.806298483559267e-05, "loss": 0.2598, "step": 2661 }, { "epoch": 0.5387573365715442, "grad_norm": 0.2562398612499237, "learning_rate": 8.799979581306946e-05, "loss": 0.2407, "step": 2662 }, { "epoch": 0.5389597247520744, "grad_norm": 0.24922461807727814, "learning_rate": 8.793661165168552e-05, "loss": 0.2476, "step": 2663 }, { "epoch": 0.5391621129326047, "grad_norm": 0.4374734163284302, "learning_rate": 8.787343237703603e-05, "loss": 0.2568, "step": 2664 }, { "epoch": 0.5393645011131349, "grad_norm": 0.260881632566452, "learning_rate": 8.781025801471411e-05, "loss": 0.2074, "step": 2665 }, { "epoch": 0.5395668892936653, "grad_norm": 0.276424378156662, "learning_rate": 8.774708859031099e-05, "loss": 0.2364, "step": 2666 }, { "epoch": 0.5397692774741955, "grad_norm": 0.24571847915649414, "learning_rate": 8.768392412941583e-05, "loss": 0.2732, "step": 2667 }, { "epoch": 0.5399716656547258, "grad_norm": 0.24681046605110168, "learning_rate": 8.762076465761577e-05, "loss": 0.2258, "step": 2668 }, { "epoch": 0.5401740538352561, "grad_norm": 0.44437775015830994, "learning_rate": 8.755761020049597e-05, "loss": 0.2415, "step": 2669 }, { "epoch": 0.5403764420157863, "grad_norm": 0.3477860391139984, "learning_rate": 8.749446078363957e-05, "loss": 0.2672, "step": 2670 }, { "epoch": 0.5405788301963166, "grad_norm": 0.2183695286512375, "learning_rate": 8.743131643262763e-05, "loss": 0.2135, "step": 2671 }, { "epoch": 0.5407812183768468, "grad_norm": 0.24185119569301605, "learning_rate": 8.736817717303917e-05, "loss": 0.2418, "step": 2672 }, { "epoch": 0.5409836065573771, "grad_norm": 0.2490321695804596, "learning_rate": 8.730504303045114e-05, "loss": 0.2672, "step": 2673 }, { "epoch": 0.5411859947379073, "grad_norm": 0.25529637932777405, "learning_rate": 8.724191403043844e-05, "loss": 0.2163, "step": 2674 }, { "epoch": 0.5413883829184376, "grad_norm": 0.2605232298374176, "learning_rate": 8.717879019857389e-05, "loss": 0.2029, "step": 2675 }, { "epoch": 0.5415907710989678, "grad_norm": 0.2548829913139343, "learning_rate": 8.711567156042817e-05, "loss": 0.1943, "step": 2676 }, { "epoch": 0.5417931592794981, "grad_norm": 0.27267006039619446, "learning_rate": 8.705255814156987e-05, "loss": 0.2379, "step": 2677 }, { "epoch": 0.5419955474600283, "grad_norm": 0.29003629088401794, "learning_rate": 8.69894499675655e-05, "loss": 0.2732, "step": 2678 }, { "epoch": 0.5421979356405586, "grad_norm": 0.27857184410095215, "learning_rate": 8.692634706397951e-05, "loss": 0.2331, "step": 2679 }, { "epoch": 0.5424003238210888, "grad_norm": 0.2483905553817749, "learning_rate": 8.686324945637401e-05, "loss": 0.1841, "step": 2680 }, { "epoch": 0.5426027120016191, "grad_norm": 0.2645397186279297, "learning_rate": 8.68001571703091e-05, "loss": 0.2477, "step": 2681 }, { "epoch": 0.5428051001821493, "grad_norm": 0.25008052587509155, "learning_rate": 8.673707023134282e-05, "loss": 0.215, "step": 2682 }, { "epoch": 0.5430074883626796, "grad_norm": 0.27936068177223206, "learning_rate": 8.66739886650308e-05, "loss": 0.2296, "step": 2683 }, { "epoch": 0.5432098765432098, "grad_norm": 0.2522280812263489, "learning_rate": 8.661091249692668e-05, "loss": 0.2528, "step": 2684 }, { "epoch": 0.5434122647237402, "grad_norm": 0.25331631302833557, "learning_rate": 8.654784175258189e-05, "loss": 0.24, "step": 2685 }, { "epoch": 0.5436146529042704, "grad_norm": 0.2979610562324524, "learning_rate": 8.648477645754565e-05, "loss": 0.2709, "step": 2686 }, { "epoch": 0.5438170410848007, "grad_norm": 0.38583609461784363, "learning_rate": 8.642171663736487e-05, "loss": 0.239, "step": 2687 }, { "epoch": 0.5440194292653309, "grad_norm": 0.23189081251621246, "learning_rate": 8.635866231758436e-05, "loss": 0.2284, "step": 2688 }, { "epoch": 0.5442218174458612, "grad_norm": 0.45621731877326965, "learning_rate": 8.629561352374673e-05, "loss": 0.2652, "step": 2689 }, { "epoch": 0.5444242056263914, "grad_norm": 0.23809683322906494, "learning_rate": 8.623257028139228e-05, "loss": 0.246, "step": 2690 }, { "epoch": 0.5446265938069217, "grad_norm": 0.2549673020839691, "learning_rate": 8.6169532616059e-05, "loss": 0.2278, "step": 2691 }, { "epoch": 0.5448289819874519, "grad_norm": 0.25626522302627563, "learning_rate": 8.610650055328276e-05, "loss": 0.23, "step": 2692 }, { "epoch": 0.5450313701679822, "grad_norm": 0.288626492023468, "learning_rate": 8.604347411859713e-05, "loss": 0.2432, "step": 2693 }, { "epoch": 0.5452337583485124, "grad_norm": 0.26422813534736633, "learning_rate": 8.598045333753323e-05, "loss": 0.2567, "step": 2694 }, { "epoch": 0.5454361465290427, "grad_norm": 0.23058663308620453, "learning_rate": 8.591743823562014e-05, "loss": 0.2167, "step": 2695 }, { "epoch": 0.5456385347095729, "grad_norm": 0.236797034740448, "learning_rate": 8.585442883838449e-05, "loss": 0.2205, "step": 2696 }, { "epoch": 0.5458409228901032, "grad_norm": 0.3145347237586975, "learning_rate": 8.579142517135066e-05, "loss": 0.2274, "step": 2697 }, { "epoch": 0.5460433110706335, "grad_norm": 0.2510102391242981, "learning_rate": 8.572842726004064e-05, "loss": 0.2447, "step": 2698 }, { "epoch": 0.5462456992511637, "grad_norm": 0.23874910175800323, "learning_rate": 8.566543512997415e-05, "loss": 0.2502, "step": 2699 }, { "epoch": 0.546448087431694, "grad_norm": 0.2503799796104431, "learning_rate": 8.560244880666854e-05, "loss": 0.2373, "step": 2700 }, { "epoch": 0.546448087431694, "eval_loss": 0.2638868987560272, "eval_runtime": 1.3146, "eval_samples_per_second": 3.803, "eval_steps_per_second": 0.761, "step": 2700 }, { "epoch": 0.5466504756122242, "grad_norm": 0.235634908080101, "learning_rate": 8.553946831563885e-05, "loss": 0.2487, "step": 2701 }, { "epoch": 0.5468528637927546, "grad_norm": 0.2349555343389511, "learning_rate": 8.547649368239769e-05, "loss": 0.2212, "step": 2702 }, { "epoch": 0.5470552519732848, "grad_norm": 0.2593168318271637, "learning_rate": 8.541352493245534e-05, "loss": 0.2059, "step": 2703 }, { "epoch": 0.5472576401538151, "grad_norm": 0.27148836851119995, "learning_rate": 8.53505620913197e-05, "loss": 0.2676, "step": 2704 }, { "epoch": 0.5474600283343453, "grad_norm": 0.4631858170032501, "learning_rate": 8.528760518449624e-05, "loss": 0.2422, "step": 2705 }, { "epoch": 0.5476624165148756, "grad_norm": 0.30392640829086304, "learning_rate": 8.522465423748806e-05, "loss": 0.2239, "step": 2706 }, { "epoch": 0.5478648046954058, "grad_norm": 0.26981279253959656, "learning_rate": 8.516170927579583e-05, "loss": 0.2261, "step": 2707 }, { "epoch": 0.5480671928759361, "grad_norm": 0.2792915105819702, "learning_rate": 8.509877032491784e-05, "loss": 0.1947, "step": 2708 }, { "epoch": 0.5482695810564663, "grad_norm": 0.2961187958717346, "learning_rate": 8.503583741034987e-05, "loss": 0.2196, "step": 2709 }, { "epoch": 0.5484719692369966, "grad_norm": 0.2685526907444, "learning_rate": 8.497291055758529e-05, "loss": 0.1991, "step": 2710 }, { "epoch": 0.5486743574175268, "grad_norm": 0.29037392139434814, "learning_rate": 8.490998979211503e-05, "loss": 0.2419, "step": 2711 }, { "epoch": 0.5488767455980571, "grad_norm": 0.2965281009674072, "learning_rate": 8.484707513942755e-05, "loss": 0.2504, "step": 2712 }, { "epoch": 0.5490791337785873, "grad_norm": 0.3565129339694977, "learning_rate": 8.478416662500879e-05, "loss": 0.2346, "step": 2713 }, { "epoch": 0.5492815219591176, "grad_norm": 0.31673893332481384, "learning_rate": 8.472126427434226e-05, "loss": 0.2748, "step": 2714 }, { "epoch": 0.5494839101396478, "grad_norm": 0.3028927445411682, "learning_rate": 8.465836811290897e-05, "loss": 0.2353, "step": 2715 }, { "epoch": 0.5496862983201781, "grad_norm": 0.22023645043373108, "learning_rate": 8.459547816618736e-05, "loss": 0.1894, "step": 2716 }, { "epoch": 0.5498886865007083, "grad_norm": 0.28227436542510986, "learning_rate": 8.45325944596534e-05, "loss": 0.2443, "step": 2717 }, { "epoch": 0.5500910746812386, "grad_norm": 0.21290171146392822, "learning_rate": 8.446971701878055e-05, "loss": 0.2267, "step": 2718 }, { "epoch": 0.5502934628617688, "grad_norm": 0.3019583225250244, "learning_rate": 8.440684586903975e-05, "loss": 0.2149, "step": 2719 }, { "epoch": 0.5504958510422991, "grad_norm": 0.23116125166416168, "learning_rate": 8.434398103589926e-05, "loss": 0.195, "step": 2720 }, { "epoch": 0.5506982392228293, "grad_norm": 0.2696925699710846, "learning_rate": 8.428112254482492e-05, "loss": 0.2382, "step": 2721 }, { "epoch": 0.5509006274033597, "grad_norm": 0.24421848356723785, "learning_rate": 8.421827042127996e-05, "loss": 0.1874, "step": 2722 }, { "epoch": 0.5511030155838899, "grad_norm": 0.2750245928764343, "learning_rate": 8.415542469072502e-05, "loss": 0.227, "step": 2723 }, { "epoch": 0.5513054037644202, "grad_norm": 0.28206735849380493, "learning_rate": 8.409258537861814e-05, "loss": 0.2674, "step": 2724 }, { "epoch": 0.5515077919449504, "grad_norm": 0.32721057534217834, "learning_rate": 8.402975251041478e-05, "loss": 0.2283, "step": 2725 }, { "epoch": 0.5517101801254807, "grad_norm": 0.2898944914340973, "learning_rate": 8.396692611156777e-05, "loss": 0.1838, "step": 2726 }, { "epoch": 0.5519125683060109, "grad_norm": 0.3372180759906769, "learning_rate": 8.390410620752741e-05, "loss": 0.2476, "step": 2727 }, { "epoch": 0.5521149564865412, "grad_norm": 0.28951266407966614, "learning_rate": 8.384129282374119e-05, "loss": 0.2585, "step": 2728 }, { "epoch": 0.5523173446670715, "grad_norm": 0.2725861370563507, "learning_rate": 8.377848598565409e-05, "loss": 0.2077, "step": 2729 }, { "epoch": 0.5525197328476017, "grad_norm": 0.41843804717063904, "learning_rate": 8.371568571870847e-05, "loss": 0.2102, "step": 2730 }, { "epoch": 0.552722121028132, "grad_norm": 0.26017579436302185, "learning_rate": 8.365289204834389e-05, "loss": 0.2258, "step": 2731 }, { "epoch": 0.5529245092086622, "grad_norm": 0.22561419010162354, "learning_rate": 8.359010499999732e-05, "loss": 0.2228, "step": 2732 }, { "epoch": 0.5531268973891925, "grad_norm": 0.2669832706451416, "learning_rate": 8.352732459910309e-05, "loss": 0.2344, "step": 2733 }, { "epoch": 0.5533292855697227, "grad_norm": 0.2689177095890045, "learning_rate": 8.346455087109281e-05, "loss": 0.2454, "step": 2734 }, { "epoch": 0.553531673750253, "grad_norm": 0.3145497143268585, "learning_rate": 8.34017838413953e-05, "loss": 0.2113, "step": 2735 }, { "epoch": 0.5537340619307832, "grad_norm": 0.27574464678764343, "learning_rate": 8.333902353543672e-05, "loss": 0.2189, "step": 2736 }, { "epoch": 0.5539364501113135, "grad_norm": 0.234979048371315, "learning_rate": 8.327626997864059e-05, "loss": 0.2589, "step": 2737 }, { "epoch": 0.5541388382918437, "grad_norm": 0.29583293199539185, "learning_rate": 8.321352319642763e-05, "loss": 0.2548, "step": 2738 }, { "epoch": 0.554341226472374, "grad_norm": 0.3137109875679016, "learning_rate": 8.31507832142157e-05, "loss": 0.2618, "step": 2739 }, { "epoch": 0.5545436146529042, "grad_norm": 0.33810919523239136, "learning_rate": 8.308805005742014e-05, "loss": 0.2508, "step": 2740 }, { "epoch": 0.5547460028334346, "grad_norm": 0.2436242252588272, "learning_rate": 8.302532375145339e-05, "loss": 0.2465, "step": 2741 }, { "epoch": 0.5549483910139648, "grad_norm": 0.29385441541671753, "learning_rate": 8.296260432172502e-05, "loss": 0.2189, "step": 2742 }, { "epoch": 0.5551507791944951, "grad_norm": 0.2712760269641876, "learning_rate": 8.289989179364202e-05, "loss": 0.2448, "step": 2743 }, { "epoch": 0.5553531673750253, "grad_norm": 0.28467386960983276, "learning_rate": 8.283718619260846e-05, "loss": 0.2461, "step": 2744 }, { "epoch": 0.5555555555555556, "grad_norm": 0.33771151304244995, "learning_rate": 8.277448754402564e-05, "loss": 0.2453, "step": 2745 }, { "epoch": 0.5557579437360858, "grad_norm": 0.22715666890144348, "learning_rate": 8.2711795873292e-05, "loss": 0.2131, "step": 2746 }, { "epoch": 0.5559603319166161, "grad_norm": 0.2997545599937439, "learning_rate": 8.264911120580321e-05, "loss": 0.2486, "step": 2747 }, { "epoch": 0.5561627200971463, "grad_norm": 0.39109814167022705, "learning_rate": 8.258643356695209e-05, "loss": 0.238, "step": 2748 }, { "epoch": 0.5563651082776766, "grad_norm": 0.255256712436676, "learning_rate": 8.252376298212859e-05, "loss": 0.2318, "step": 2749 }, { "epoch": 0.5565674964582068, "grad_norm": 0.29748469591140747, "learning_rate": 8.246109947671981e-05, "loss": 0.2453, "step": 2750 }, { "epoch": 0.5565674964582068, "eval_loss": 0.2642709016799927, "eval_runtime": 1.3198, "eval_samples_per_second": 3.789, "eval_steps_per_second": 0.758, "step": 2750 }, { "epoch": 0.5567698846387371, "grad_norm": 0.30376553535461426, "learning_rate": 8.239844307610997e-05, "loss": 0.2596, "step": 2751 }, { "epoch": 0.5569722728192673, "grad_norm": 0.3042518198490143, "learning_rate": 8.23357938056805e-05, "loss": 0.2114, "step": 2752 }, { "epoch": 0.5571746609997976, "grad_norm": 0.2506563365459442, "learning_rate": 8.227315169080978e-05, "loss": 0.2442, "step": 2753 }, { "epoch": 0.5573770491803278, "grad_norm": 0.26603296399116516, "learning_rate": 8.221051675687342e-05, "loss": 0.228, "step": 2754 }, { "epoch": 0.5575794373608581, "grad_norm": 0.22108809649944305, "learning_rate": 8.21478890292441e-05, "loss": 0.2124, "step": 2755 }, { "epoch": 0.5577818255413883, "grad_norm": 0.2584373950958252, "learning_rate": 8.208526853329158e-05, "loss": 0.2557, "step": 2756 }, { "epoch": 0.5579842137219186, "grad_norm": 0.25329360365867615, "learning_rate": 8.20226552943826e-05, "loss": 0.2559, "step": 2757 }, { "epoch": 0.5581866019024488, "grad_norm": 0.2851312458515167, "learning_rate": 8.196004933788108e-05, "loss": 0.2318, "step": 2758 }, { "epoch": 0.5583889900829792, "grad_norm": 0.28638315200805664, "learning_rate": 8.189745068914795e-05, "loss": 0.2613, "step": 2759 }, { "epoch": 0.5585913782635095, "grad_norm": 0.28092876076698303, "learning_rate": 8.183485937354119e-05, "loss": 0.2354, "step": 2760 }, { "epoch": 0.5587937664440397, "grad_norm": 0.2601775527000427, "learning_rate": 8.177227541641575e-05, "loss": 0.2359, "step": 2761 }, { "epoch": 0.55899615462457, "grad_norm": 0.6413168907165527, "learning_rate": 8.170969884312366e-05, "loss": 0.2518, "step": 2762 }, { "epoch": 0.5591985428051002, "grad_norm": 0.5135788917541504, "learning_rate": 8.164712967901395e-05, "loss": 0.2484, "step": 2763 }, { "epoch": 0.5594009309856305, "grad_norm": 0.24095632135868073, "learning_rate": 8.158456794943265e-05, "loss": 0.2383, "step": 2764 }, { "epoch": 0.5596033191661607, "grad_norm": 0.2194024622440338, "learning_rate": 8.152201367972274e-05, "loss": 0.2177, "step": 2765 }, { "epoch": 0.559805707346691, "grad_norm": 0.24512092769145966, "learning_rate": 8.145946689522423e-05, "loss": 0.2284, "step": 2766 }, { "epoch": 0.5600080955272212, "grad_norm": 0.28830891847610474, "learning_rate": 8.139692762127408e-05, "loss": 0.2277, "step": 2767 }, { "epoch": 0.5602104837077515, "grad_norm": 0.26126909255981445, "learning_rate": 8.133439588320619e-05, "loss": 0.254, "step": 2768 }, { "epoch": 0.5604128718882817, "grad_norm": 0.3415836691856384, "learning_rate": 8.12718717063514e-05, "loss": 0.2348, "step": 2769 }, { "epoch": 0.560615260068812, "grad_norm": 0.28783220052719116, "learning_rate": 8.120935511603752e-05, "loss": 0.2317, "step": 2770 }, { "epoch": 0.5608176482493422, "grad_norm": 0.37124061584472656, "learning_rate": 8.114684613758931e-05, "loss": 0.2249, "step": 2771 }, { "epoch": 0.5610200364298725, "grad_norm": 0.2686764597892761, "learning_rate": 8.108434479632835e-05, "loss": 0.2374, "step": 2772 }, { "epoch": 0.5612224246104027, "grad_norm": 0.2780396342277527, "learning_rate": 8.102185111757324e-05, "loss": 0.2596, "step": 2773 }, { "epoch": 0.561424812790933, "grad_norm": 0.2450963854789734, "learning_rate": 8.095936512663935e-05, "loss": 0.2457, "step": 2774 }, { "epoch": 0.5616272009714632, "grad_norm": 0.24080894887447357, "learning_rate": 8.089688684883914e-05, "loss": 0.2578, "step": 2775 }, { "epoch": 0.5618295891519935, "grad_norm": 0.3862358629703522, "learning_rate": 8.083441630948167e-05, "loss": 0.2725, "step": 2776 }, { "epoch": 0.5620319773325237, "grad_norm": 0.24941718578338623, "learning_rate": 8.077195353387305e-05, "loss": 0.2014, "step": 2777 }, { "epoch": 0.5622343655130541, "grad_norm": 0.26436755061149597, "learning_rate": 8.070949854731631e-05, "loss": 0.221, "step": 2778 }, { "epoch": 0.5624367536935843, "grad_norm": 0.22715380787849426, "learning_rate": 8.064705137511106e-05, "loss": 0.202, "step": 2779 }, { "epoch": 0.5626391418741146, "grad_norm": 0.29069390892982483, "learning_rate": 8.058461204255397e-05, "loss": 0.2388, "step": 2780 }, { "epoch": 0.5628415300546448, "grad_norm": 0.226497083902359, "learning_rate": 8.052218057493848e-05, "loss": 0.2397, "step": 2781 }, { "epoch": 0.5630439182351751, "grad_norm": 0.24638384580612183, "learning_rate": 8.045975699755487e-05, "loss": 0.2233, "step": 2782 }, { "epoch": 0.5632463064157053, "grad_norm": 0.25720757246017456, "learning_rate": 8.03973413356901e-05, "loss": 0.2324, "step": 2783 }, { "epoch": 0.5634486945962356, "grad_norm": 0.27917003631591797, "learning_rate": 8.033493361462802e-05, "loss": 0.2375, "step": 2784 }, { "epoch": 0.5636510827767658, "grad_norm": 0.2921479046344757, "learning_rate": 8.02725338596493e-05, "loss": 0.2623, "step": 2785 }, { "epoch": 0.5638534709572961, "grad_norm": 0.26956483721733093, "learning_rate": 8.021014209603136e-05, "loss": 0.1967, "step": 2786 }, { "epoch": 0.5640558591378263, "grad_norm": 0.3391810655593872, "learning_rate": 8.014775834904823e-05, "loss": 0.2583, "step": 2787 }, { "epoch": 0.5642582473183566, "grad_norm": 0.3308781683444977, "learning_rate": 8.008538264397094e-05, "loss": 0.2897, "step": 2788 }, { "epoch": 0.5644606354988869, "grad_norm": 0.35334113240242004, "learning_rate": 8.002301500606715e-05, "loss": 0.2436, "step": 2789 }, { "epoch": 0.5646630236794171, "grad_norm": 0.2873499095439911, "learning_rate": 7.996065546060112e-05, "loss": 0.241, "step": 2790 }, { "epoch": 0.5648654118599474, "grad_norm": 0.24136365950107574, "learning_rate": 7.989830403283406e-05, "loss": 0.2249, "step": 2791 }, { "epoch": 0.5650678000404776, "grad_norm": 0.25536829233169556, "learning_rate": 7.983596074802376e-05, "loss": 0.2327, "step": 2792 }, { "epoch": 0.5652701882210079, "grad_norm": 0.30287352204322815, "learning_rate": 7.977362563142477e-05, "loss": 0.2564, "step": 2793 }, { "epoch": 0.5654725764015381, "grad_norm": 0.3800259530544281, "learning_rate": 7.971129870828826e-05, "loss": 0.2312, "step": 2794 }, { "epoch": 0.5656749645820685, "grad_norm": 1.0940887928009033, "learning_rate": 7.964898000386211e-05, "loss": 0.2516, "step": 2795 }, { "epoch": 0.5658773527625987, "grad_norm": 0.23001326620578766, "learning_rate": 7.958666954339092e-05, "loss": 0.231, "step": 2796 }, { "epoch": 0.566079740943129, "grad_norm": 0.2496563196182251, "learning_rate": 7.952436735211592e-05, "loss": 0.2151, "step": 2797 }, { "epoch": 0.5662821291236592, "grad_norm": 0.22902119159698486, "learning_rate": 7.946207345527495e-05, "loss": 0.2221, "step": 2798 }, { "epoch": 0.5664845173041895, "grad_norm": 0.2110871970653534, "learning_rate": 7.939978787810253e-05, "loss": 0.2072, "step": 2799 }, { "epoch": 0.5666869054847197, "grad_norm": 0.2872609496116638, "learning_rate": 7.933751064582982e-05, "loss": 0.2675, "step": 2800 }, { "epoch": 0.5666869054847197, "eval_loss": 0.2686084806919098, "eval_runtime": 1.3201, "eval_samples_per_second": 3.788, "eval_steps_per_second": 0.758, "step": 2800 }, { "epoch": 0.56688929366525, "grad_norm": 0.288274347782135, "learning_rate": 7.927524178368456e-05, "loss": 0.1997, "step": 2801 }, { "epoch": 0.5670916818457802, "grad_norm": 0.3002770245075226, "learning_rate": 7.921298131689112e-05, "loss": 0.2499, "step": 2802 }, { "epoch": 0.5672940700263105, "grad_norm": 0.2752934992313385, "learning_rate": 7.91507292706705e-05, "loss": 0.2114, "step": 2803 }, { "epoch": 0.5674964582068407, "grad_norm": 0.25393956899642944, "learning_rate": 7.908848567024026e-05, "loss": 0.2189, "step": 2804 }, { "epoch": 0.567698846387371, "grad_norm": 0.24213911592960358, "learning_rate": 7.902625054081449e-05, "loss": 0.2316, "step": 2805 }, { "epoch": 0.5679012345679012, "grad_norm": 0.29866114258766174, "learning_rate": 7.896402390760394e-05, "loss": 0.2392, "step": 2806 }, { "epoch": 0.5681036227484315, "grad_norm": 0.4242440164089203, "learning_rate": 7.890180579581585e-05, "loss": 0.2488, "step": 2807 }, { "epoch": 0.5683060109289617, "grad_norm": 0.3225146234035492, "learning_rate": 7.883959623065409e-05, "loss": 0.2941, "step": 2808 }, { "epoch": 0.568508399109492, "grad_norm": 0.2641158998012543, "learning_rate": 7.877739523731893e-05, "loss": 0.184, "step": 2809 }, { "epoch": 0.5687107872900222, "grad_norm": 0.19207541644573212, "learning_rate": 7.871520284100729e-05, "loss": 0.2026, "step": 2810 }, { "epoch": 0.5689131754705525, "grad_norm": 0.2771735191345215, "learning_rate": 7.865301906691257e-05, "loss": 0.2384, "step": 2811 }, { "epoch": 0.5691155636510827, "grad_norm": 0.24658817052841187, "learning_rate": 7.859084394022469e-05, "loss": 0.2342, "step": 2812 }, { "epoch": 0.569317951831613, "grad_norm": 0.2510536313056946, "learning_rate": 7.852867748613e-05, "loss": 0.2357, "step": 2813 }, { "epoch": 0.5695203400121432, "grad_norm": 0.2517163157463074, "learning_rate": 7.846651972981141e-05, "loss": 0.2425, "step": 2814 }, { "epoch": 0.5697227281926736, "grad_norm": 0.2826145887374878, "learning_rate": 7.840437069644833e-05, "loss": 0.2392, "step": 2815 }, { "epoch": 0.5699251163732038, "grad_norm": 1.2022955417633057, "learning_rate": 7.834223041121652e-05, "loss": 0.2202, "step": 2816 }, { "epoch": 0.5701275045537341, "grad_norm": 0.22330597043037415, "learning_rate": 7.828009889928828e-05, "loss": 0.2167, "step": 2817 }, { "epoch": 0.5703298927342643, "grad_norm": 0.2820896506309509, "learning_rate": 7.821797618583239e-05, "loss": 0.241, "step": 2818 }, { "epoch": 0.5705322809147946, "grad_norm": 0.2903810143470764, "learning_rate": 7.8155862296014e-05, "loss": 0.271, "step": 2819 }, { "epoch": 0.5707346690953249, "grad_norm": 0.30095216631889343, "learning_rate": 7.809375725499466e-05, "loss": 0.2176, "step": 2820 }, { "epoch": 0.5709370572758551, "grad_norm": 0.25605201721191406, "learning_rate": 7.803166108793242e-05, "loss": 0.2315, "step": 2821 }, { "epoch": 0.5711394454563854, "grad_norm": 0.21955852210521698, "learning_rate": 7.796957381998167e-05, "loss": 0.2284, "step": 2822 }, { "epoch": 0.5713418336369156, "grad_norm": 0.3275134563446045, "learning_rate": 7.79074954762933e-05, "loss": 0.2116, "step": 2823 }, { "epoch": 0.5715442218174459, "grad_norm": 0.2793656885623932, "learning_rate": 7.78454260820144e-05, "loss": 0.231, "step": 2824 }, { "epoch": 0.5717466099979761, "grad_norm": 0.23401936888694763, "learning_rate": 7.778336566228856e-05, "loss": 0.2194, "step": 2825 }, { "epoch": 0.5719489981785064, "grad_norm": 0.32186609506607056, "learning_rate": 7.772131424225579e-05, "loss": 0.2544, "step": 2826 }, { "epoch": 0.5721513863590366, "grad_norm": 0.3124012351036072, "learning_rate": 7.765927184705231e-05, "loss": 0.2607, "step": 2827 }, { "epoch": 0.5723537745395669, "grad_norm": 0.32611599564552307, "learning_rate": 7.759723850181072e-05, "loss": 0.2531, "step": 2828 }, { "epoch": 0.5725561627200971, "grad_norm": 0.299396276473999, "learning_rate": 7.753521423166006e-05, "loss": 0.2601, "step": 2829 }, { "epoch": 0.5727585509006274, "grad_norm": 0.30610567331314087, "learning_rate": 7.747319906172565e-05, "loss": 0.2682, "step": 2830 }, { "epoch": 0.5729609390811576, "grad_norm": 0.2609761655330658, "learning_rate": 7.741119301712901e-05, "loss": 0.2464, "step": 2831 }, { "epoch": 0.573163327261688, "grad_norm": 0.38842231035232544, "learning_rate": 7.734919612298803e-05, "loss": 0.2145, "step": 2832 }, { "epoch": 0.5733657154422181, "grad_norm": 0.2516114413738251, "learning_rate": 7.7287208404417e-05, "loss": 0.2156, "step": 2833 }, { "epoch": 0.5735681036227485, "grad_norm": 0.2874990701675415, "learning_rate": 7.722522988652638e-05, "loss": 0.2445, "step": 2834 }, { "epoch": 0.5737704918032787, "grad_norm": 0.2792865037918091, "learning_rate": 7.716326059442285e-05, "loss": 0.2474, "step": 2835 }, { "epoch": 0.573972879983809, "grad_norm": 0.33962929248809814, "learning_rate": 7.710130055320952e-05, "loss": 0.2089, "step": 2836 }, { "epoch": 0.5741752681643392, "grad_norm": 0.26019397377967834, "learning_rate": 7.703934978798566e-05, "loss": 0.2531, "step": 2837 }, { "epoch": 0.5743776563448695, "grad_norm": 0.29663413763046265, "learning_rate": 7.69774083238467e-05, "loss": 0.2066, "step": 2838 }, { "epoch": 0.5745800445253997, "grad_norm": 0.33600112795829773, "learning_rate": 7.691547618588446e-05, "loss": 0.2427, "step": 2839 }, { "epoch": 0.57478243270593, "grad_norm": 0.3229539096355438, "learning_rate": 7.68535533991869e-05, "loss": 0.1901, "step": 2840 }, { "epoch": 0.5749848208864602, "grad_norm": 0.38617613911628723, "learning_rate": 7.679163998883819e-05, "loss": 0.2486, "step": 2841 }, { "epoch": 0.5751872090669905, "grad_norm": 0.2812846899032593, "learning_rate": 7.672973597991871e-05, "loss": 0.2798, "step": 2842 }, { "epoch": 0.5753895972475207, "grad_norm": 0.254553884267807, "learning_rate": 7.666784139750503e-05, "loss": 0.236, "step": 2843 }, { "epoch": 0.575591985428051, "grad_norm": 0.3402416408061981, "learning_rate": 7.660595626666991e-05, "loss": 0.2551, "step": 2844 }, { "epoch": 0.5757943736085812, "grad_norm": 0.3104320466518402, "learning_rate": 7.654408061248231e-05, "loss": 0.2426, "step": 2845 }, { "epoch": 0.5759967617891115, "grad_norm": 0.2532760202884674, "learning_rate": 7.648221446000727e-05, "loss": 0.202, "step": 2846 }, { "epoch": 0.5761991499696417, "grad_norm": 0.3318571448326111, "learning_rate": 7.642035783430604e-05, "loss": 0.2395, "step": 2847 }, { "epoch": 0.576401538150172, "grad_norm": 0.5158884525299072, "learning_rate": 7.6358510760436e-05, "loss": 0.2329, "step": 2848 }, { "epoch": 0.5766039263307022, "grad_norm": 0.3966839909553528, "learning_rate": 7.629667326345069e-05, "loss": 0.2418, "step": 2849 }, { "epoch": 0.5768063145112325, "grad_norm": 0.2506566345691681, "learning_rate": 7.623484536839969e-05, "loss": 0.2334, "step": 2850 }, { "epoch": 0.5768063145112325, "eval_loss": 0.26579922437667847, "eval_runtime": 1.3213, "eval_samples_per_second": 3.784, "eval_steps_per_second": 0.757, "step": 2850 }, { "epoch": 0.5770087026917629, "grad_norm": 0.25292715430259705, "learning_rate": 7.617302710032878e-05, "loss": 0.2059, "step": 2851 }, { "epoch": 0.577211090872293, "grad_norm": 0.22005055844783783, "learning_rate": 7.611121848427981e-05, "loss": 0.2134, "step": 2852 }, { "epoch": 0.5774134790528234, "grad_norm": 0.23579035699367523, "learning_rate": 7.604941954529067e-05, "loss": 0.2297, "step": 2853 }, { "epoch": 0.5776158672333536, "grad_norm": 0.2468157857656479, "learning_rate": 7.598763030839539e-05, "loss": 0.2201, "step": 2854 }, { "epoch": 0.5778182554138839, "grad_norm": 0.273392915725708, "learning_rate": 7.592585079862406e-05, "loss": 0.2261, "step": 2855 }, { "epoch": 0.5780206435944141, "grad_norm": 0.29593828320503235, "learning_rate": 7.586408104100284e-05, "loss": 0.2297, "step": 2856 }, { "epoch": 0.5782230317749444, "grad_norm": 0.29689455032348633, "learning_rate": 7.580232106055387e-05, "loss": 0.229, "step": 2857 }, { "epoch": 0.5784254199554746, "grad_norm": 0.23380589485168457, "learning_rate": 7.574057088229539e-05, "loss": 0.2154, "step": 2858 }, { "epoch": 0.5786278081360049, "grad_norm": 0.2619762122631073, "learning_rate": 7.56788305312417e-05, "loss": 0.2315, "step": 2859 }, { "epoch": 0.5788301963165351, "grad_norm": 0.3029535114765167, "learning_rate": 7.561710003240306e-05, "loss": 0.2478, "step": 2860 }, { "epoch": 0.5790325844970654, "grad_norm": 0.27001458406448364, "learning_rate": 7.555537941078573e-05, "loss": 0.2423, "step": 2861 }, { "epoch": 0.5792349726775956, "grad_norm": 0.30913251638412476, "learning_rate": 7.549366869139202e-05, "loss": 0.2657, "step": 2862 }, { "epoch": 0.5794373608581259, "grad_norm": 0.29774031043052673, "learning_rate": 7.543196789922021e-05, "loss": 0.2167, "step": 2863 }, { "epoch": 0.5796397490386561, "grad_norm": 0.23448917269706726, "learning_rate": 7.537027705926453e-05, "loss": 0.2152, "step": 2864 }, { "epoch": 0.5798421372191864, "grad_norm": 0.2675460875034332, "learning_rate": 7.530859619651523e-05, "loss": 0.2542, "step": 2865 }, { "epoch": 0.5800445253997166, "grad_norm": 0.7183313965797424, "learning_rate": 7.524692533595847e-05, "loss": 0.2567, "step": 2866 }, { "epoch": 0.5802469135802469, "grad_norm": 0.2437203824520111, "learning_rate": 7.51852645025764e-05, "loss": 0.2128, "step": 2867 }, { "epoch": 0.5804493017607771, "grad_norm": 0.2911168932914734, "learning_rate": 7.512361372134706e-05, "loss": 0.2297, "step": 2868 }, { "epoch": 0.5806516899413074, "grad_norm": 0.2514744997024536, "learning_rate": 7.506197301724446e-05, "loss": 0.2436, "step": 2869 }, { "epoch": 0.5808540781218376, "grad_norm": 0.3065710663795471, "learning_rate": 7.500034241523848e-05, "loss": 0.25, "step": 2870 }, { "epoch": 0.581056466302368, "grad_norm": 0.32851630449295044, "learning_rate": 7.493872194029503e-05, "loss": 0.2388, "step": 2871 }, { "epoch": 0.5812588544828982, "grad_norm": 0.24758435785770416, "learning_rate": 7.487711161737572e-05, "loss": 0.2723, "step": 2872 }, { "epoch": 0.5814612426634285, "grad_norm": 0.2451726496219635, "learning_rate": 7.48155114714382e-05, "loss": 0.2267, "step": 2873 }, { "epoch": 0.5816636308439587, "grad_norm": 0.250808984041214, "learning_rate": 7.4753921527436e-05, "loss": 0.2148, "step": 2874 }, { "epoch": 0.581866019024489, "grad_norm": 0.25824838876724243, "learning_rate": 7.46923418103184e-05, "loss": 0.2578, "step": 2875 }, { "epoch": 0.5820684072050192, "grad_norm": 0.269781231880188, "learning_rate": 7.463077234503059e-05, "loss": 0.2655, "step": 2876 }, { "epoch": 0.5822707953855495, "grad_norm": 0.2898010313510895, "learning_rate": 7.45692131565137e-05, "loss": 0.2468, "step": 2877 }, { "epoch": 0.5824731835660797, "grad_norm": 0.20565253496170044, "learning_rate": 7.450766426970464e-05, "loss": 0.2167, "step": 2878 }, { "epoch": 0.58267557174661, "grad_norm": 0.2185257375240326, "learning_rate": 7.444612570953601e-05, "loss": 0.2286, "step": 2879 }, { "epoch": 0.5828779599271403, "grad_norm": 0.2570449709892273, "learning_rate": 7.438459750093641e-05, "loss": 0.2073, "step": 2880 }, { "epoch": 0.5830803481076705, "grad_norm": 0.3027397394180298, "learning_rate": 7.432307966883021e-05, "loss": 0.207, "step": 2881 }, { "epoch": 0.5832827362882008, "grad_norm": 0.24796275794506073, "learning_rate": 7.426157223813754e-05, "loss": 0.2357, "step": 2882 }, { "epoch": 0.583485124468731, "grad_norm": 0.24212820827960968, "learning_rate": 7.420007523377426e-05, "loss": 0.2013, "step": 2883 }, { "epoch": 0.5836875126492613, "grad_norm": 0.30438432097435, "learning_rate": 7.413858868065214e-05, "loss": 0.2354, "step": 2884 }, { "epoch": 0.5838899008297915, "grad_norm": 0.27021902799606323, "learning_rate": 7.407711260367866e-05, "loss": 0.2512, "step": 2885 }, { "epoch": 0.5840922890103218, "grad_norm": 0.22693021595478058, "learning_rate": 7.401564702775696e-05, "loss": 0.2001, "step": 2886 }, { "epoch": 0.584294677190852, "grad_norm": 0.24415190517902374, "learning_rate": 7.395419197778608e-05, "loss": 0.2433, "step": 2887 }, { "epoch": 0.5844970653713824, "grad_norm": 0.30443453788757324, "learning_rate": 7.38927474786607e-05, "loss": 0.2108, "step": 2888 }, { "epoch": 0.5846994535519126, "grad_norm": 0.2773347795009613, "learning_rate": 7.38313135552713e-05, "loss": 0.2119, "step": 2889 }, { "epoch": 0.5849018417324429, "grad_norm": 0.28007155656814575, "learning_rate": 7.376989023250394e-05, "loss": 0.2297, "step": 2890 }, { "epoch": 0.5851042299129731, "grad_norm": 0.2470293492078781, "learning_rate": 7.370847753524054e-05, "loss": 0.2478, "step": 2891 }, { "epoch": 0.5853066180935034, "grad_norm": 0.25164785981178284, "learning_rate": 7.364707548835861e-05, "loss": 0.2426, "step": 2892 }, { "epoch": 0.5855090062740336, "grad_norm": 0.28916865587234497, "learning_rate": 7.358568411673145e-05, "loss": 0.2486, "step": 2893 }, { "epoch": 0.5857113944545639, "grad_norm": 0.2670806348323822, "learning_rate": 7.35243034452279e-05, "loss": 0.2291, "step": 2894 }, { "epoch": 0.5859137826350941, "grad_norm": 0.25232431292533875, "learning_rate": 7.346293349871254e-05, "loss": 0.2159, "step": 2895 }, { "epoch": 0.5861161708156244, "grad_norm": 0.25800731778144836, "learning_rate": 7.340157430204564e-05, "loss": 0.209, "step": 2896 }, { "epoch": 0.5863185589961546, "grad_norm": 0.24318952858448029, "learning_rate": 7.33402258800831e-05, "loss": 0.2619, "step": 2897 }, { "epoch": 0.5865209471766849, "grad_norm": 0.2963591516017914, "learning_rate": 7.327888825767634e-05, "loss": 0.2202, "step": 2898 }, { "epoch": 0.5867233353572151, "grad_norm": 0.25565284490585327, "learning_rate": 7.321756145967258e-05, "loss": 0.2189, "step": 2899 }, { "epoch": 0.5869257235377454, "grad_norm": 0.2918548882007599, "learning_rate": 7.315624551091455e-05, "loss": 0.2391, "step": 2900 }, { "epoch": 0.5869257235377454, "eval_loss": 0.264272540807724, "eval_runtime": 1.3158, "eval_samples_per_second": 3.8, "eval_steps_per_second": 0.76, "step": 2900 }, { "epoch": 0.5871281117182756, "grad_norm": 0.26135432720184326, "learning_rate": 7.309494043624059e-05, "loss": 0.2335, "step": 2901 }, { "epoch": 0.5873304998988059, "grad_norm": 0.23883725702762604, "learning_rate": 7.303364626048465e-05, "loss": 0.2015, "step": 2902 }, { "epoch": 0.5875328880793361, "grad_norm": 0.27074703574180603, "learning_rate": 7.297236300847631e-05, "loss": 0.2304, "step": 2903 }, { "epoch": 0.5877352762598664, "grad_norm": 0.25825077295303345, "learning_rate": 7.291109070504067e-05, "loss": 0.205, "step": 2904 }, { "epoch": 0.5879376644403966, "grad_norm": 0.2450972944498062, "learning_rate": 7.284982937499836e-05, "loss": 0.22, "step": 2905 }, { "epoch": 0.588140052620927, "grad_norm": 0.23739856481552124, "learning_rate": 7.278857904316565e-05, "loss": 0.2429, "step": 2906 }, { "epoch": 0.5883424408014571, "grad_norm": 0.30816450715065, "learning_rate": 7.272733973435433e-05, "loss": 0.2467, "step": 2907 }, { "epoch": 0.5885448289819875, "grad_norm": 0.2724757790565491, "learning_rate": 7.26661114733717e-05, "loss": 0.2238, "step": 2908 }, { "epoch": 0.5887472171625177, "grad_norm": 0.3260548710823059, "learning_rate": 7.260489428502057e-05, "loss": 0.2992, "step": 2909 }, { "epoch": 0.588949605343048, "grad_norm": 0.25951969623565674, "learning_rate": 7.254368819409932e-05, "loss": 0.2818, "step": 2910 }, { "epoch": 0.5891519935235783, "grad_norm": 0.2804354727268219, "learning_rate": 7.248249322540182e-05, "loss": 0.2297, "step": 2911 }, { "epoch": 0.5893543817041085, "grad_norm": 0.4941915273666382, "learning_rate": 7.242130940371739e-05, "loss": 0.2585, "step": 2912 }, { "epoch": 0.5895567698846388, "grad_norm": 0.24917730689048767, "learning_rate": 7.236013675383087e-05, "loss": 0.2459, "step": 2913 }, { "epoch": 0.589759158065169, "grad_norm": 0.2629856765270233, "learning_rate": 7.229897530052256e-05, "loss": 0.2447, "step": 2914 }, { "epoch": 0.5899615462456993, "grad_norm": 0.25125187635421753, "learning_rate": 7.223782506856828e-05, "loss": 0.2361, "step": 2915 }, { "epoch": 0.5901639344262295, "grad_norm": 0.2941356301307678, "learning_rate": 7.217668608273921e-05, "loss": 0.2238, "step": 2916 }, { "epoch": 0.5903663226067598, "grad_norm": 0.25769826769828796, "learning_rate": 7.211555836780202e-05, "loss": 0.222, "step": 2917 }, { "epoch": 0.59056871078729, "grad_norm": 0.23638850450515747, "learning_rate": 7.205444194851884e-05, "loss": 0.2155, "step": 2918 }, { "epoch": 0.5907710989678203, "grad_norm": 0.2737606167793274, "learning_rate": 7.199333684964724e-05, "loss": 0.2153, "step": 2919 }, { "epoch": 0.5909734871483505, "grad_norm": 0.40391501784324646, "learning_rate": 7.19322430959401e-05, "loss": 0.2131, "step": 2920 }, { "epoch": 0.5911758753288808, "grad_norm": 0.2810826003551483, "learning_rate": 7.187116071214574e-05, "loss": 0.2279, "step": 2921 }, { "epoch": 0.591378263509411, "grad_norm": 0.2794511914253235, "learning_rate": 7.181008972300804e-05, "loss": 0.2073, "step": 2922 }, { "epoch": 0.5915806516899413, "grad_norm": 0.2838062345981598, "learning_rate": 7.1749030153266e-05, "loss": 0.267, "step": 2923 }, { "epoch": 0.5917830398704715, "grad_norm": 0.27623632550239563, "learning_rate": 7.168798202765412e-05, "loss": 0.2124, "step": 2924 }, { "epoch": 0.5919854280510018, "grad_norm": 0.2860824763774872, "learning_rate": 7.162694537090235e-05, "loss": 0.2331, "step": 2925 }, { "epoch": 0.592187816231532, "grad_norm": 0.3482542634010315, "learning_rate": 7.156592020773592e-05, "loss": 0.2253, "step": 2926 }, { "epoch": 0.5923902044120624, "grad_norm": 0.24206596612930298, "learning_rate": 7.150490656287529e-05, "loss": 0.196, "step": 2927 }, { "epoch": 0.5925925925925926, "grad_norm": 0.2094927728176117, "learning_rate": 7.144390446103638e-05, "loss": 0.1802, "step": 2928 }, { "epoch": 0.5927949807731229, "grad_norm": 0.5521467328071594, "learning_rate": 7.138291392693049e-05, "loss": 0.2682, "step": 2929 }, { "epoch": 0.5929973689536531, "grad_norm": 0.26555269956588745, "learning_rate": 7.132193498526414e-05, "loss": 0.2215, "step": 2930 }, { "epoch": 0.5931997571341834, "grad_norm": 0.3099657893180847, "learning_rate": 7.126096766073906e-05, "loss": 0.2849, "step": 2931 }, { "epoch": 0.5934021453147136, "grad_norm": 0.24373812973499298, "learning_rate": 7.120001197805251e-05, "loss": 0.2239, "step": 2932 }, { "epoch": 0.5936045334952439, "grad_norm": 0.24464809894561768, "learning_rate": 7.11390679618969e-05, "loss": 0.2572, "step": 2933 }, { "epoch": 0.5938069216757741, "grad_norm": 0.3659166097640991, "learning_rate": 7.107813563695983e-05, "loss": 0.2729, "step": 2934 }, { "epoch": 0.5940093098563044, "grad_norm": 0.23593838512897491, "learning_rate": 7.101721502792433e-05, "loss": 0.2229, "step": 2935 }, { "epoch": 0.5942116980368346, "grad_norm": 0.23439887166023254, "learning_rate": 7.09563061594686e-05, "loss": 0.2126, "step": 2936 }, { "epoch": 0.5944140862173649, "grad_norm": 0.2663463056087494, "learning_rate": 7.089540905626609e-05, "loss": 0.2288, "step": 2937 }, { "epoch": 0.5946164743978951, "grad_norm": 0.24549037218093872, "learning_rate": 7.083452374298548e-05, "loss": 0.2488, "step": 2938 }, { "epoch": 0.5948188625784254, "grad_norm": 0.2625141739845276, "learning_rate": 7.077365024429068e-05, "loss": 0.213, "step": 2939 }, { "epoch": 0.5950212507589556, "grad_norm": 0.2632252275943756, "learning_rate": 7.071278858484086e-05, "loss": 0.2461, "step": 2940 }, { "epoch": 0.5952236389394859, "grad_norm": 0.24017669260501862, "learning_rate": 7.065193878929031e-05, "loss": 0.228, "step": 2941 }, { "epoch": 0.5954260271200162, "grad_norm": 0.3071124255657196, "learning_rate": 7.059110088228853e-05, "loss": 0.2447, "step": 2942 }, { "epoch": 0.5956284153005464, "grad_norm": 0.2553541362285614, "learning_rate": 7.053027488848027e-05, "loss": 0.237, "step": 2943 }, { "epoch": 0.5958308034810768, "grad_norm": 0.2502846121788025, "learning_rate": 7.046946083250541e-05, "loss": 0.213, "step": 2944 }, { "epoch": 0.596033191661607, "grad_norm": 0.2631119191646576, "learning_rate": 7.040865873899901e-05, "loss": 0.229, "step": 2945 }, { "epoch": 0.5962355798421373, "grad_norm": 0.21765324473381042, "learning_rate": 7.034786863259124e-05, "loss": 0.1827, "step": 2946 }, { "epoch": 0.5964379680226675, "grad_norm": 0.22799643874168396, "learning_rate": 7.028709053790745e-05, "loss": 0.2142, "step": 2947 }, { "epoch": 0.5966403562031978, "grad_norm": 0.3424309194087982, "learning_rate": 7.022632447956813e-05, "loss": 0.2555, "step": 2948 }, { "epoch": 0.596842744383728, "grad_norm": 0.23502987623214722, "learning_rate": 7.016557048218889e-05, "loss": 0.2378, "step": 2949 }, { "epoch": 0.5970451325642583, "grad_norm": 0.28284627199172974, "learning_rate": 7.010482857038043e-05, "loss": 0.2436, "step": 2950 }, { "epoch": 0.5970451325642583, "eval_loss": 0.2657679617404938, "eval_runtime": 1.3087, "eval_samples_per_second": 3.821, "eval_steps_per_second": 0.764, "step": 2950 }, { "epoch": 0.5972475207447885, "grad_norm": 0.27368584275245667, "learning_rate": 7.004409876874857e-05, "loss": 0.27, "step": 2951 }, { "epoch": 0.5974499089253188, "grad_norm": 0.24745239317417145, "learning_rate": 6.998338110189429e-05, "loss": 0.2489, "step": 2952 }, { "epoch": 0.597652297105849, "grad_norm": 0.2607947587966919, "learning_rate": 6.992267559441349e-05, "loss": 0.2165, "step": 2953 }, { "epoch": 0.5978546852863793, "grad_norm": 0.2313959002494812, "learning_rate": 6.986198227089731e-05, "loss": 0.2388, "step": 2954 }, { "epoch": 0.5980570734669095, "grad_norm": 0.2623072862625122, "learning_rate": 6.980130115593186e-05, "loss": 0.2341, "step": 2955 }, { "epoch": 0.5982594616474398, "grad_norm": 0.22064810991287231, "learning_rate": 6.974063227409839e-05, "loss": 0.1969, "step": 2956 }, { "epoch": 0.59846184982797, "grad_norm": 0.34666523337364197, "learning_rate": 6.967997564997307e-05, "loss": 0.2679, "step": 2957 }, { "epoch": 0.5986642380085003, "grad_norm": 0.2472347617149353, "learning_rate": 6.961933130812719e-05, "loss": 0.197, "step": 2958 }, { "epoch": 0.5988666261890305, "grad_norm": 0.33698976039886475, "learning_rate": 6.955869927312709e-05, "loss": 0.2847, "step": 2959 }, { "epoch": 0.5990690143695608, "grad_norm": 0.2557450830936432, "learning_rate": 6.949807956953403e-05, "loss": 0.235, "step": 2960 }, { "epoch": 0.599271402550091, "grad_norm": 0.23061934113502502, "learning_rate": 6.943747222190435e-05, "loss": 0.2231, "step": 2961 }, { "epoch": 0.5994737907306213, "grad_norm": 0.2476864457130432, "learning_rate": 6.937687725478934e-05, "loss": 0.2419, "step": 2962 }, { "epoch": 0.5996761789111515, "grad_norm": 0.23239213228225708, "learning_rate": 6.93162946927354e-05, "loss": 0.2043, "step": 2963 }, { "epoch": 0.5998785670916819, "grad_norm": 0.34371045231819153, "learning_rate": 6.925572456028365e-05, "loss": 0.2641, "step": 2964 }, { "epoch": 0.6000809552722121, "grad_norm": 0.45099425315856934, "learning_rate": 6.91951668819704e-05, "loss": 0.2194, "step": 2965 }, { "epoch": 0.6002833434527424, "grad_norm": 0.2459549903869629, "learning_rate": 6.913462168232684e-05, "loss": 0.2375, "step": 2966 }, { "epoch": 0.6004857316332726, "grad_norm": 0.2394380122423172, "learning_rate": 6.907408898587915e-05, "loss": 0.2854, "step": 2967 }, { "epoch": 0.6006881198138029, "grad_norm": 0.21677638590335846, "learning_rate": 6.901356881714833e-05, "loss": 0.2317, "step": 2968 }, { "epoch": 0.6008905079943331, "grad_norm": 0.2051166146993637, "learning_rate": 6.895306120065037e-05, "loss": 0.2091, "step": 2969 }, { "epoch": 0.6010928961748634, "grad_norm": 0.24513091146945953, "learning_rate": 6.889256616089629e-05, "loss": 0.2246, "step": 2970 }, { "epoch": 0.6012952843553937, "grad_norm": 0.2696916460990906, "learning_rate": 6.883208372239178e-05, "loss": 0.2572, "step": 2971 }, { "epoch": 0.6014976725359239, "grad_norm": 0.2938191294670105, "learning_rate": 6.87716139096376e-05, "loss": 0.2275, "step": 2972 }, { "epoch": 0.6017000607164542, "grad_norm": 0.27625104784965515, "learning_rate": 6.871115674712937e-05, "loss": 0.2644, "step": 2973 }, { "epoch": 0.6019024488969844, "grad_norm": 0.2650105953216553, "learning_rate": 6.86507122593576e-05, "loss": 0.239, "step": 2974 }, { "epoch": 0.6021048370775147, "grad_norm": 0.25188297033309937, "learning_rate": 6.859028047080749e-05, "loss": 0.2278, "step": 2975 }, { "epoch": 0.6023072252580449, "grad_norm": 0.20642536878585815, "learning_rate": 6.852986140595936e-05, "loss": 0.1851, "step": 2976 }, { "epoch": 0.6025096134385752, "grad_norm": 0.28147801756858826, "learning_rate": 6.846945508928823e-05, "loss": 0.2159, "step": 2977 }, { "epoch": 0.6027120016191054, "grad_norm": 0.25589632987976074, "learning_rate": 6.840906154526399e-05, "loss": 0.2274, "step": 2978 }, { "epoch": 0.6029143897996357, "grad_norm": 0.27580776810646057, "learning_rate": 6.834868079835124e-05, "loss": 0.2408, "step": 2979 }, { "epoch": 0.6031167779801659, "grad_norm": 0.28267231583595276, "learning_rate": 6.828831287300961e-05, "loss": 0.2168, "step": 2980 }, { "epoch": 0.6033191661606963, "grad_norm": 0.26342248916625977, "learning_rate": 6.822795779369339e-05, "loss": 0.202, "step": 2981 }, { "epoch": 0.6035215543412265, "grad_norm": 0.24241618812084198, "learning_rate": 6.816761558485172e-05, "loss": 0.2092, "step": 2982 }, { "epoch": 0.6037239425217568, "grad_norm": 0.3960649371147156, "learning_rate": 6.810728627092846e-05, "loss": 0.2568, "step": 2983 }, { "epoch": 0.603926330702287, "grad_norm": 0.25792795419692993, "learning_rate": 6.804696987636232e-05, "loss": 0.2062, "step": 2984 }, { "epoch": 0.6041287188828173, "grad_norm": 0.3172403573989868, "learning_rate": 6.798666642558678e-05, "loss": 0.2667, "step": 2985 }, { "epoch": 0.6043311070633475, "grad_norm": 0.2607148289680481, "learning_rate": 6.792637594302999e-05, "loss": 0.2483, "step": 2986 }, { "epoch": 0.6045334952438778, "grad_norm": 0.2787432074546814, "learning_rate": 6.786609845311494e-05, "loss": 0.244, "step": 2987 }, { "epoch": 0.604735883424408, "grad_norm": 0.25241243839263916, "learning_rate": 6.78058339802593e-05, "loss": 0.219, "step": 2988 }, { "epoch": 0.6049382716049383, "grad_norm": 0.26967641711235046, "learning_rate": 6.774558254887554e-05, "loss": 0.2358, "step": 2989 }, { "epoch": 0.6051406597854685, "grad_norm": 0.24577388167381287, "learning_rate": 6.768534418337071e-05, "loss": 0.2658, "step": 2990 }, { "epoch": 0.6053430479659988, "grad_norm": 0.21785461902618408, "learning_rate": 6.76251189081467e-05, "loss": 0.2382, "step": 2991 }, { "epoch": 0.605545436146529, "grad_norm": 0.2414468228816986, "learning_rate": 6.756490674760008e-05, "loss": 0.232, "step": 2992 }, { "epoch": 0.6057478243270593, "grad_norm": 0.2201997935771942, "learning_rate": 6.750470772612203e-05, "loss": 0.222, "step": 2993 }, { "epoch": 0.6059502125075895, "grad_norm": 0.23584184050559998, "learning_rate": 6.744452186809846e-05, "loss": 0.2294, "step": 2994 }, { "epoch": 0.6061526006881198, "grad_norm": 0.21848222613334656, "learning_rate": 6.738434919790994e-05, "loss": 0.226, "step": 2995 }, { "epoch": 0.60635498886865, "grad_norm": 0.22887808084487915, "learning_rate": 6.732418973993175e-05, "loss": 0.2309, "step": 2996 }, { "epoch": 0.6065573770491803, "grad_norm": 0.2343272864818573, "learning_rate": 6.726404351853369e-05, "loss": 0.2367, "step": 2997 }, { "epoch": 0.6067597652297105, "grad_norm": 0.22424767911434174, "learning_rate": 6.720391055808031e-05, "loss": 0.2231, "step": 2998 }, { "epoch": 0.6069621534102408, "grad_norm": 0.2591332793235779, "learning_rate": 6.714379088293075e-05, "loss": 0.2301, "step": 2999 }, { "epoch": 0.607164541590771, "grad_norm": 0.3080432415008545, "learning_rate": 6.708368451743882e-05, "loss": 0.2545, "step": 3000 }, { "epoch": 0.607164541590771, "eval_loss": 0.26817837357521057, "eval_runtime": 1.3211, "eval_samples_per_second": 3.785, "eval_steps_per_second": 0.757, "step": 3000 }, { "epoch": 0.6073669297713014, "grad_norm": 0.2541639804840088, "learning_rate": 6.702359148595281e-05, "loss": 0.2325, "step": 3001 }, { "epoch": 0.6075693179518317, "grad_norm": 0.2314985990524292, "learning_rate": 6.696351181281571e-05, "loss": 0.2164, "step": 3002 }, { "epoch": 0.6077717061323619, "grad_norm": 0.27611708641052246, "learning_rate": 6.690344552236511e-05, "loss": 0.2457, "step": 3003 }, { "epoch": 0.6079740943128922, "grad_norm": 0.26329365372657776, "learning_rate": 6.684339263893315e-05, "loss": 0.2761, "step": 3004 }, { "epoch": 0.6081764824934224, "grad_norm": 0.23385192453861237, "learning_rate": 6.67833531868465e-05, "loss": 0.2287, "step": 3005 }, { "epoch": 0.6083788706739527, "grad_norm": 0.24689139425754547, "learning_rate": 6.672332719042642e-05, "loss": 0.2591, "step": 3006 }, { "epoch": 0.6085812588544829, "grad_norm": 0.24551792442798615, "learning_rate": 6.666331467398878e-05, "loss": 0.2501, "step": 3007 }, { "epoch": 0.6087836470350132, "grad_norm": 0.33833298087120056, "learning_rate": 6.660331566184386e-05, "loss": 0.2781, "step": 3008 }, { "epoch": 0.6089860352155434, "grad_norm": 0.23403316736221313, "learning_rate": 6.65433301782966e-05, "loss": 0.2336, "step": 3009 }, { "epoch": 0.6091884233960737, "grad_norm": 0.2035469114780426, "learning_rate": 6.648335824764633e-05, "loss": 0.2092, "step": 3010 }, { "epoch": 0.6093908115766039, "grad_norm": 0.2268756479024887, "learning_rate": 6.642339989418708e-05, "loss": 0.2183, "step": 3011 }, { "epoch": 0.6095931997571342, "grad_norm": 0.2795691192150116, "learning_rate": 6.636345514220718e-05, "loss": 0.1889, "step": 3012 }, { "epoch": 0.6097955879376644, "grad_norm": 0.3369617462158203, "learning_rate": 6.630352401598953e-05, "loss": 0.2394, "step": 3013 }, { "epoch": 0.6099979761181947, "grad_norm": 0.27903997898101807, "learning_rate": 6.62436065398115e-05, "loss": 0.2346, "step": 3014 }, { "epoch": 0.6102003642987249, "grad_norm": 0.27034711837768555, "learning_rate": 6.618370273794505e-05, "loss": 0.2552, "step": 3015 }, { "epoch": 0.6104027524792552, "grad_norm": 0.34340980648994446, "learning_rate": 6.612381263465637e-05, "loss": 0.2444, "step": 3016 }, { "epoch": 0.6106051406597854, "grad_norm": 0.25121599435806274, "learning_rate": 6.606393625420626e-05, "loss": 0.2333, "step": 3017 }, { "epoch": 0.6108075288403157, "grad_norm": 0.2349703460931778, "learning_rate": 6.600407362084999e-05, "loss": 0.243, "step": 3018 }, { "epoch": 0.611009917020846, "grad_norm": 0.23920835554599762, "learning_rate": 6.594422475883712e-05, "loss": 0.2747, "step": 3019 }, { "epoch": 0.6112123052013763, "grad_norm": 0.24157609045505524, "learning_rate": 6.58843896924117e-05, "loss": 0.2579, "step": 3020 }, { "epoch": 0.6114146933819065, "grad_norm": 0.2714554965496063, "learning_rate": 6.582456844581226e-05, "loss": 0.2195, "step": 3021 }, { "epoch": 0.6116170815624368, "grad_norm": 0.24838095903396606, "learning_rate": 6.57647610432717e-05, "loss": 0.2566, "step": 3022 }, { "epoch": 0.611819469742967, "grad_norm": 0.21487177908420563, "learning_rate": 6.570496750901716e-05, "loss": 0.2267, "step": 3023 }, { "epoch": 0.6120218579234973, "grad_norm": 0.2720756232738495, "learning_rate": 6.564518786727036e-05, "loss": 0.2742, "step": 3024 }, { "epoch": 0.6122242461040275, "grad_norm": 0.2329442799091339, "learning_rate": 6.558542214224734e-05, "loss": 0.2342, "step": 3025 }, { "epoch": 0.6124266342845578, "grad_norm": 0.28757500648498535, "learning_rate": 6.552567035815849e-05, "loss": 0.2883, "step": 3026 }, { "epoch": 0.612629022465088, "grad_norm": 0.27794039249420166, "learning_rate": 6.546593253920844e-05, "loss": 0.257, "step": 3027 }, { "epoch": 0.6128314106456183, "grad_norm": 0.20296815037727356, "learning_rate": 6.540620870959636e-05, "loss": 0.1961, "step": 3028 }, { "epoch": 0.6130337988261485, "grad_norm": 0.21851663291454315, "learning_rate": 6.534649889351565e-05, "loss": 0.2122, "step": 3029 }, { "epoch": 0.6132361870066788, "grad_norm": 0.25666186213493347, "learning_rate": 6.528680311515402e-05, "loss": 0.237, "step": 3030 }, { "epoch": 0.613438575187209, "grad_norm": 0.2396874874830246, "learning_rate": 6.522712139869352e-05, "loss": 0.2267, "step": 3031 }, { "epoch": 0.6136409633677393, "grad_norm": 0.2645941972732544, "learning_rate": 6.51674537683105e-05, "loss": 0.2166, "step": 3032 }, { "epoch": 0.6138433515482696, "grad_norm": 0.22799986600875854, "learning_rate": 6.510780024817564e-05, "loss": 0.2096, "step": 3033 }, { "epoch": 0.6140457397287998, "grad_norm": 0.46618616580963135, "learning_rate": 6.504816086245383e-05, "loss": 0.2188, "step": 3034 }, { "epoch": 0.6142481279093301, "grad_norm": 0.22280822694301605, "learning_rate": 6.498853563530425e-05, "loss": 0.1983, "step": 3035 }, { "epoch": 0.6144505160898603, "grad_norm": 0.2462967038154602, "learning_rate": 6.49289245908804e-05, "loss": 0.1896, "step": 3036 }, { "epoch": 0.6146529042703907, "grad_norm": 0.25686705112457275, "learning_rate": 6.486932775333003e-05, "loss": 0.252, "step": 3037 }, { "epoch": 0.6148552924509209, "grad_norm": 0.29693490266799927, "learning_rate": 6.480974514679502e-05, "loss": 0.2464, "step": 3038 }, { "epoch": 0.6150576806314512, "grad_norm": 0.24047529697418213, "learning_rate": 6.475017679541166e-05, "loss": 0.2252, "step": 3039 }, { "epoch": 0.6152600688119814, "grad_norm": 0.2417096346616745, "learning_rate": 6.469062272331034e-05, "loss": 0.1794, "step": 3040 }, { "epoch": 0.6154624569925117, "grad_norm": 0.2582316994667053, "learning_rate": 6.463108295461572e-05, "loss": 0.243, "step": 3041 }, { "epoch": 0.6156648451730419, "grad_norm": 0.22317497432231903, "learning_rate": 6.457155751344661e-05, "loss": 0.2365, "step": 3042 }, { "epoch": 0.6158672333535722, "grad_norm": 0.27534019947052, "learning_rate": 6.451204642391608e-05, "loss": 0.2145, "step": 3043 }, { "epoch": 0.6160696215341024, "grad_norm": 0.25475773215293884, "learning_rate": 6.445254971013138e-05, "loss": 0.2032, "step": 3044 }, { "epoch": 0.6162720097146327, "grad_norm": 0.24377329647541046, "learning_rate": 6.439306739619387e-05, "loss": 0.2257, "step": 3045 }, { "epoch": 0.6164743978951629, "grad_norm": 0.3480880856513977, "learning_rate": 6.433359950619917e-05, "loss": 0.2644, "step": 3046 }, { "epoch": 0.6166767860756932, "grad_norm": 0.2968449592590332, "learning_rate": 6.427414606423697e-05, "loss": 0.2493, "step": 3047 }, { "epoch": 0.6168791742562234, "grad_norm": 0.2328902781009674, "learning_rate": 6.421470709439121e-05, "loss": 0.1966, "step": 3048 }, { "epoch": 0.6170815624367537, "grad_norm": 0.3110244572162628, "learning_rate": 6.415528262073984e-05, "loss": 0.2603, "step": 3049 }, { "epoch": 0.6172839506172839, "grad_norm": 0.26721519231796265, "learning_rate": 6.409587266735503e-05, "loss": 0.2264, "step": 3050 }, { "epoch": 0.6172839506172839, "eval_loss": 0.26093789935112, "eval_runtime": 1.3188, "eval_samples_per_second": 3.791, "eval_steps_per_second": 0.758, "step": 3050 }, { "epoch": 0.6174863387978142, "grad_norm": 0.41167759895324707, "learning_rate": 6.403647725830305e-05, "loss": 0.2134, "step": 3051 }, { "epoch": 0.6176887269783444, "grad_norm": 0.24009092152118683, "learning_rate": 6.397709641764428e-05, "loss": 0.2308, "step": 3052 }, { "epoch": 0.6178911151588747, "grad_norm": 0.2674366533756256, "learning_rate": 6.391773016943315e-05, "loss": 0.2436, "step": 3053 }, { "epoch": 0.6180935033394049, "grad_norm": 0.2482387274503708, "learning_rate": 6.385837853771826e-05, "loss": 0.2591, "step": 3054 }, { "epoch": 0.6182958915199352, "grad_norm": 0.3292860686779022, "learning_rate": 6.379904154654221e-05, "loss": 0.2599, "step": 3055 }, { "epoch": 0.6184982797004654, "grad_norm": 0.2653280794620514, "learning_rate": 6.373971921994173e-05, "loss": 0.2159, "step": 3056 }, { "epoch": 0.6187006678809958, "grad_norm": 0.23486590385437012, "learning_rate": 6.368041158194757e-05, "loss": 0.231, "step": 3057 }, { "epoch": 0.618903056061526, "grad_norm": 0.23998433351516724, "learning_rate": 6.362111865658455e-05, "loss": 0.2543, "step": 3058 }, { "epoch": 0.6191054442420563, "grad_norm": 0.26096028089523315, "learning_rate": 6.356184046787157e-05, "loss": 0.2652, "step": 3059 }, { "epoch": 0.6193078324225865, "grad_norm": 0.25213226675987244, "learning_rate": 6.350257703982145e-05, "loss": 0.2233, "step": 3060 }, { "epoch": 0.6195102206031168, "grad_norm": 0.2854272723197937, "learning_rate": 6.34433283964411e-05, "loss": 0.249, "step": 3061 }, { "epoch": 0.6197126087836471, "grad_norm": 0.2546935975551605, "learning_rate": 6.338409456173145e-05, "loss": 0.2287, "step": 3062 }, { "epoch": 0.6199149969641773, "grad_norm": 0.2571673095226288, "learning_rate": 6.332487555968749e-05, "loss": 0.2521, "step": 3063 }, { "epoch": 0.6201173851447076, "grad_norm": 0.27219587564468384, "learning_rate": 6.326567141429802e-05, "loss": 0.2429, "step": 3064 }, { "epoch": 0.6203197733252378, "grad_norm": 0.30288025736808777, "learning_rate": 6.320648214954596e-05, "loss": 0.213, "step": 3065 }, { "epoch": 0.6205221615057681, "grad_norm": 0.25682565569877625, "learning_rate": 6.314730778940826e-05, "loss": 0.2569, "step": 3066 }, { "epoch": 0.6207245496862983, "grad_norm": 0.26676157116889954, "learning_rate": 6.308814835785564e-05, "loss": 0.2444, "step": 3067 }, { "epoch": 0.6209269378668286, "grad_norm": 0.27512454986572266, "learning_rate": 6.302900387885288e-05, "loss": 0.2081, "step": 3068 }, { "epoch": 0.6211293260473588, "grad_norm": 0.23868297040462494, "learning_rate": 6.296987437635877e-05, "loss": 0.2295, "step": 3069 }, { "epoch": 0.6213317142278891, "grad_norm": 0.3301694095134735, "learning_rate": 6.291075987432596e-05, "loss": 0.2533, "step": 3070 }, { "epoch": 0.6215341024084193, "grad_norm": 0.2617209851741791, "learning_rate": 6.285166039670095e-05, "loss": 0.2413, "step": 3071 }, { "epoch": 0.6217364905889496, "grad_norm": 0.2665987014770508, "learning_rate": 6.27925759674243e-05, "loss": 0.2568, "step": 3072 }, { "epoch": 0.6219388787694798, "grad_norm": 0.23757365345954895, "learning_rate": 6.273350661043037e-05, "loss": 0.2113, "step": 3073 }, { "epoch": 0.6221412669500102, "grad_norm": 0.2571878135204315, "learning_rate": 6.267445234964756e-05, "loss": 0.2609, "step": 3074 }, { "epoch": 0.6223436551305404, "grad_norm": 0.4653869569301605, "learning_rate": 6.261541320899786e-05, "loss": 0.2126, "step": 3075 }, { "epoch": 0.6225460433110707, "grad_norm": 0.3867615759372711, "learning_rate": 6.255638921239743e-05, "loss": 0.2181, "step": 3076 }, { "epoch": 0.6227484314916009, "grad_norm": 0.28550732135772705, "learning_rate": 6.249738038375617e-05, "loss": 0.2176, "step": 3077 }, { "epoch": 0.6229508196721312, "grad_norm": 0.2793484032154083, "learning_rate": 6.243838674697788e-05, "loss": 0.2289, "step": 3078 }, { "epoch": 0.6231532078526614, "grad_norm": 0.24702169001102448, "learning_rate": 6.237940832596014e-05, "loss": 0.2438, "step": 3079 }, { "epoch": 0.6233555960331917, "grad_norm": 0.2584024667739868, "learning_rate": 6.23204451445944e-05, "loss": 0.2446, "step": 3080 }, { "epoch": 0.6235579842137219, "grad_norm": 0.26924261450767517, "learning_rate": 6.226149722676598e-05, "loss": 0.1966, "step": 3081 }, { "epoch": 0.6237603723942522, "grad_norm": 0.27409616112709045, "learning_rate": 6.220256459635392e-05, "loss": 0.2078, "step": 3082 }, { "epoch": 0.6239627605747824, "grad_norm": 0.27125808596611023, "learning_rate": 6.214364727723114e-05, "loss": 0.2173, "step": 3083 }, { "epoch": 0.6241651487553127, "grad_norm": 0.27642229199409485, "learning_rate": 6.208474529326438e-05, "loss": 0.2499, "step": 3084 }, { "epoch": 0.6243675369358429, "grad_norm": 0.20267561078071594, "learning_rate": 6.202585866831411e-05, "loss": 0.2091, "step": 3085 }, { "epoch": 0.6245699251163732, "grad_norm": 0.24506455659866333, "learning_rate": 6.196698742623456e-05, "loss": 0.2025, "step": 3086 }, { "epoch": 0.6247723132969034, "grad_norm": 0.24663329124450684, "learning_rate": 6.190813159087384e-05, "loss": 0.1871, "step": 3087 }, { "epoch": 0.6249747014774337, "grad_norm": 0.21795138716697693, "learning_rate": 6.184929118607366e-05, "loss": 0.2075, "step": 3088 }, { "epoch": 0.6251770896579639, "grad_norm": 0.25957101583480835, "learning_rate": 6.179046623566965e-05, "loss": 0.2112, "step": 3089 }, { "epoch": 0.6253794778384942, "grad_norm": 0.24462351202964783, "learning_rate": 6.173165676349103e-05, "loss": 0.2008, "step": 3090 }, { "epoch": 0.6255818660190244, "grad_norm": 0.2708177864551544, "learning_rate": 6.167286279336085e-05, "loss": 0.2064, "step": 3091 }, { "epoch": 0.6257842541995547, "grad_norm": 0.3163127303123474, "learning_rate": 6.161408434909583e-05, "loss": 0.2263, "step": 3092 }, { "epoch": 0.6259866423800851, "grad_norm": 0.21040435135364532, "learning_rate": 6.15553214545064e-05, "loss": 0.2235, "step": 3093 }, { "epoch": 0.6261890305606153, "grad_norm": 0.2406248301267624, "learning_rate": 6.149657413339673e-05, "loss": 0.2238, "step": 3094 }, { "epoch": 0.6263914187411456, "grad_norm": 0.28941118717193604, "learning_rate": 6.143784240956465e-05, "loss": 0.2439, "step": 3095 }, { "epoch": 0.6265938069216758, "grad_norm": 0.2564215064048767, "learning_rate": 6.13791263068017e-05, "loss": 0.2279, "step": 3096 }, { "epoch": 0.6267961951022061, "grad_norm": 0.268678218126297, "learning_rate": 6.132042584889304e-05, "loss": 0.1975, "step": 3097 }, { "epoch": 0.6269985832827363, "grad_norm": 0.2823198139667511, "learning_rate": 6.126174105961753e-05, "loss": 0.2704, "step": 3098 }, { "epoch": 0.6272009714632666, "grad_norm": 0.26300010085105896, "learning_rate": 6.120307196274768e-05, "loss": 0.2353, "step": 3099 }, { "epoch": 0.6274033596437968, "grad_norm": 0.2869541049003601, "learning_rate": 6.114441858204966e-05, "loss": 0.2247, "step": 3100 }, { "epoch": 0.6274033596437968, "eval_loss": 0.26295047998428345, "eval_runtime": 1.3178, "eval_samples_per_second": 3.794, "eval_steps_per_second": 0.759, "step": 3100 }, { "epoch": 0.6276057478243271, "grad_norm": 0.2194211781024933, "learning_rate": 6.108578094128321e-05, "loss": 0.218, "step": 3101 }, { "epoch": 0.6278081360048573, "grad_norm": 0.2732012867927551, "learning_rate": 6.1027159064201776e-05, "loss": 0.2104, "step": 3102 }, { "epoch": 0.6280105241853876, "grad_norm": 0.2702508866786957, "learning_rate": 6.0968552974552375e-05, "loss": 0.2354, "step": 3103 }, { "epoch": 0.6282129123659178, "grad_norm": 0.25754570960998535, "learning_rate": 6.0909962696075603e-05, "loss": 0.2298, "step": 3104 }, { "epoch": 0.6284153005464481, "grad_norm": 0.26745882630348206, "learning_rate": 6.085138825250572e-05, "loss": 0.2523, "step": 3105 }, { "epoch": 0.6286176887269783, "grad_norm": 0.31613436341285706, "learning_rate": 6.079282966757048e-05, "loss": 0.2449, "step": 3106 }, { "epoch": 0.6288200769075086, "grad_norm": 0.22608822584152222, "learning_rate": 6.073428696499137e-05, "loss": 0.2537, "step": 3107 }, { "epoch": 0.6290224650880388, "grad_norm": 0.2263844609260559, "learning_rate": 6.0675760168483246e-05, "loss": 0.249, "step": 3108 }, { "epoch": 0.6292248532685691, "grad_norm": 0.2011934071779251, "learning_rate": 6.061724930175461e-05, "loss": 0.184, "step": 3109 }, { "epoch": 0.6294272414490993, "grad_norm": 0.3044241666793823, "learning_rate": 6.055875438850753e-05, "loss": 0.2172, "step": 3110 }, { "epoch": 0.6296296296296297, "grad_norm": 0.35803037881851196, "learning_rate": 6.050027545243768e-05, "loss": 0.2344, "step": 3111 }, { "epoch": 0.6298320178101598, "grad_norm": 0.23281417787075043, "learning_rate": 6.044181251723406e-05, "loss": 0.2292, "step": 3112 }, { "epoch": 0.6300344059906902, "grad_norm": 0.3003116548061371, "learning_rate": 6.0383365606579355e-05, "loss": 0.2124, "step": 3113 }, { "epoch": 0.6302367941712204, "grad_norm": 0.2256740927696228, "learning_rate": 6.0324934744149705e-05, "loss": 0.2453, "step": 3114 }, { "epoch": 0.6304391823517507, "grad_norm": 0.2587612569332123, "learning_rate": 6.0266519953614807e-05, "loss": 0.2333, "step": 3115 }, { "epoch": 0.6306415705322809, "grad_norm": 0.22457852959632874, "learning_rate": 6.020812125863768e-05, "loss": 0.2481, "step": 3116 }, { "epoch": 0.6308439587128112, "grad_norm": 0.20790676772594452, "learning_rate": 6.0149738682875036e-05, "loss": 0.2306, "step": 3117 }, { "epoch": 0.6310463468933414, "grad_norm": 0.24789734184741974, "learning_rate": 6.009137224997695e-05, "loss": 0.1857, "step": 3118 }, { "epoch": 0.6312487350738717, "grad_norm": 0.2102285921573639, "learning_rate": 6.003302198358689e-05, "loss": 0.2089, "step": 3119 }, { "epoch": 0.6314511232544019, "grad_norm": 0.2344849705696106, "learning_rate": 5.997468790734191e-05, "loss": 0.2438, "step": 3120 }, { "epoch": 0.6316535114349322, "grad_norm": 0.24293087422847748, "learning_rate": 5.991637004487245e-05, "loss": 0.2373, "step": 3121 }, { "epoch": 0.6318558996154624, "grad_norm": 0.3266341984272003, "learning_rate": 5.985806841980242e-05, "loss": 0.2783, "step": 3122 }, { "epoch": 0.6320582877959927, "grad_norm": 0.22124820947647095, "learning_rate": 5.979978305574898e-05, "loss": 0.2022, "step": 3123 }, { "epoch": 0.632260675976523, "grad_norm": 0.2270675152540207, "learning_rate": 5.974151397632295e-05, "loss": 0.2227, "step": 3124 }, { "epoch": 0.6324630641570532, "grad_norm": 0.2712065875530243, "learning_rate": 5.968326120512839e-05, "loss": 0.21, "step": 3125 }, { "epoch": 0.6326654523375835, "grad_norm": 0.21533921360969543, "learning_rate": 5.962502476576285e-05, "loss": 0.2212, "step": 3126 }, { "epoch": 0.6328678405181137, "grad_norm": 0.25962111353874207, "learning_rate": 5.956680468181714e-05, "loss": 0.2331, "step": 3127 }, { "epoch": 0.633070228698644, "grad_norm": 0.2168312668800354, "learning_rate": 5.9508600976875585e-05, "loss": 0.2286, "step": 3128 }, { "epoch": 0.6332726168791742, "grad_norm": 0.3093516230583191, "learning_rate": 5.945041367451578e-05, "loss": 0.2167, "step": 3129 }, { "epoch": 0.6334750050597046, "grad_norm": 0.23129330575466156, "learning_rate": 5.9392242798308704e-05, "loss": 0.1825, "step": 3130 }, { "epoch": 0.6336773932402348, "grad_norm": 0.27783188223838806, "learning_rate": 5.9334088371818684e-05, "loss": 0.2109, "step": 3131 }, { "epoch": 0.6338797814207651, "grad_norm": 0.2971867620944977, "learning_rate": 5.927595041860339e-05, "loss": 0.2311, "step": 3132 }, { "epoch": 0.6340821696012953, "grad_norm": 0.4069770872592926, "learning_rate": 5.9217828962213826e-05, "loss": 0.2356, "step": 3133 }, { "epoch": 0.6342845577818256, "grad_norm": 0.2737041711807251, "learning_rate": 5.915972402619427e-05, "loss": 0.2355, "step": 3134 }, { "epoch": 0.6344869459623558, "grad_norm": 0.2689873278141022, "learning_rate": 5.910163563408235e-05, "loss": 0.2432, "step": 3135 }, { "epoch": 0.6346893341428861, "grad_norm": 0.28194501996040344, "learning_rate": 5.904356380940899e-05, "loss": 0.221, "step": 3136 }, { "epoch": 0.6348917223234163, "grad_norm": 0.22797439992427826, "learning_rate": 5.898550857569838e-05, "loss": 0.2241, "step": 3137 }, { "epoch": 0.6350941105039466, "grad_norm": 0.403171569108963, "learning_rate": 5.892746995646801e-05, "loss": 0.2291, "step": 3138 }, { "epoch": 0.6352964986844768, "grad_norm": 0.29772353172302246, "learning_rate": 5.8869447975228596e-05, "loss": 0.2423, "step": 3139 }, { "epoch": 0.6354988868650071, "grad_norm": 0.28205201029777527, "learning_rate": 5.881144265548422e-05, "loss": 0.2572, "step": 3140 }, { "epoch": 0.6357012750455373, "grad_norm": 0.22829242050647736, "learning_rate": 5.8753454020732066e-05, "loss": 0.2, "step": 3141 }, { "epoch": 0.6359036632260676, "grad_norm": 0.2117273062467575, "learning_rate": 5.8695482094462684e-05, "loss": 0.2137, "step": 3142 }, { "epoch": 0.6361060514065978, "grad_norm": 0.27252891659736633, "learning_rate": 5.86375269001598e-05, "loss": 0.2291, "step": 3143 }, { "epoch": 0.6363084395871281, "grad_norm": 0.41147103905677795, "learning_rate": 5.857958846130038e-05, "loss": 0.2294, "step": 3144 }, { "epoch": 0.6365108277676583, "grad_norm": 0.23764705657958984, "learning_rate": 5.852166680135456e-05, "loss": 0.2097, "step": 3145 }, { "epoch": 0.6367132159481886, "grad_norm": 0.2606548070907593, "learning_rate": 5.8463761943785734e-05, "loss": 0.2151, "step": 3146 }, { "epoch": 0.6369156041287188, "grad_norm": 0.2353961020708084, "learning_rate": 5.840587391205046e-05, "loss": 0.207, "step": 3147 }, { "epoch": 0.6371179923092491, "grad_norm": 0.30080151557922363, "learning_rate": 5.8348002729598493e-05, "loss": 0.2199, "step": 3148 }, { "epoch": 0.6373203804897793, "grad_norm": 0.2464168816804886, "learning_rate": 5.829014841987277e-05, "loss": 0.2626, "step": 3149 }, { "epoch": 0.6375227686703097, "grad_norm": 0.32218682765960693, "learning_rate": 5.823231100630936e-05, "loss": 0.2511, "step": 3150 }, { "epoch": 0.6375227686703097, "eval_loss": 0.26261723041534424, "eval_runtime": 1.3208, "eval_samples_per_second": 3.785, "eval_steps_per_second": 0.757, "step": 3150 }, { "epoch": 0.6377251568508399, "grad_norm": 0.2318582981824875, "learning_rate": 5.817449051233755e-05, "loss": 0.2191, "step": 3151 }, { "epoch": 0.6379275450313702, "grad_norm": 0.24146978557109833, "learning_rate": 5.8116686961379665e-05, "loss": 0.2682, "step": 3152 }, { "epoch": 0.6381299332119005, "grad_norm": 0.2270742654800415, "learning_rate": 5.805890037685127e-05, "loss": 0.1942, "step": 3153 }, { "epoch": 0.6383323213924307, "grad_norm": 0.22645340859889984, "learning_rate": 5.800113078216101e-05, "loss": 0.1966, "step": 3154 }, { "epoch": 0.638534709572961, "grad_norm": 0.2359117567539215, "learning_rate": 5.7943378200710655e-05, "loss": 0.2319, "step": 3155 }, { "epoch": 0.6387370977534912, "grad_norm": 0.2844065725803375, "learning_rate": 5.78856426558951e-05, "loss": 0.2611, "step": 3156 }, { "epoch": 0.6389394859340215, "grad_norm": 0.2356262058019638, "learning_rate": 5.7827924171102324e-05, "loss": 0.1646, "step": 3157 }, { "epoch": 0.6391418741145517, "grad_norm": 0.5139538049697876, "learning_rate": 5.7770222769713366e-05, "loss": 0.2313, "step": 3158 }, { "epoch": 0.639344262295082, "grad_norm": 0.2635382413864136, "learning_rate": 5.771253847510245e-05, "loss": 0.2089, "step": 3159 }, { "epoch": 0.6395466504756122, "grad_norm": 0.3721647560596466, "learning_rate": 5.765487131063669e-05, "loss": 0.218, "step": 3160 }, { "epoch": 0.6397490386561425, "grad_norm": 0.38158339262008667, "learning_rate": 5.759722129967643e-05, "loss": 0.2463, "step": 3161 }, { "epoch": 0.6399514268366727, "grad_norm": 0.22761520743370056, "learning_rate": 5.753958846557498e-05, "loss": 0.2367, "step": 3162 }, { "epoch": 0.640153815017203, "grad_norm": 0.23620876669883728, "learning_rate": 5.748197283167871e-05, "loss": 0.2202, "step": 3163 }, { "epoch": 0.6403562031977332, "grad_norm": 0.28815045952796936, "learning_rate": 5.742437442132704e-05, "loss": 0.2388, "step": 3164 }, { "epoch": 0.6405585913782635, "grad_norm": 0.21483361721038818, "learning_rate": 5.736679325785239e-05, "loss": 0.2333, "step": 3165 }, { "epoch": 0.6407609795587937, "grad_norm": 0.2337856888771057, "learning_rate": 5.730922936458023e-05, "loss": 0.1957, "step": 3166 }, { "epoch": 0.640963367739324, "grad_norm": 0.22695200145244598, "learning_rate": 5.725168276482894e-05, "loss": 0.2139, "step": 3167 }, { "epoch": 0.6411657559198543, "grad_norm": 0.251996248960495, "learning_rate": 5.719415348191e-05, "loss": 0.2222, "step": 3168 }, { "epoch": 0.6413681441003846, "grad_norm": 0.24668018519878387, "learning_rate": 5.7136641539127835e-05, "loss": 0.2362, "step": 3169 }, { "epoch": 0.6415705322809148, "grad_norm": 0.24238133430480957, "learning_rate": 5.707914695977983e-05, "loss": 0.2298, "step": 3170 }, { "epoch": 0.6417729204614451, "grad_norm": 0.26997238397598267, "learning_rate": 5.702166976715637e-05, "loss": 0.2461, "step": 3171 }, { "epoch": 0.6419753086419753, "grad_norm": 0.3248326778411865, "learning_rate": 5.696420998454076e-05, "loss": 0.2494, "step": 3172 }, { "epoch": 0.6421776968225056, "grad_norm": 0.25379082560539246, "learning_rate": 5.6906767635209304e-05, "loss": 0.2277, "step": 3173 }, { "epoch": 0.6423800850030358, "grad_norm": 0.24339628219604492, "learning_rate": 5.684934274243121e-05, "loss": 0.232, "step": 3174 }, { "epoch": 0.6425824731835661, "grad_norm": 0.23307976126670837, "learning_rate": 5.679193532946856e-05, "loss": 0.2258, "step": 3175 }, { "epoch": 0.6427848613640963, "grad_norm": 0.23107542097568512, "learning_rate": 5.673454541957645e-05, "loss": 0.2018, "step": 3176 }, { "epoch": 0.6429872495446266, "grad_norm": 0.2971723973751068, "learning_rate": 5.667717303600284e-05, "loss": 0.2493, "step": 3177 }, { "epoch": 0.6431896377251568, "grad_norm": 0.20815639197826385, "learning_rate": 5.6619818201988605e-05, "loss": 0.213, "step": 3178 }, { "epoch": 0.6433920259056871, "grad_norm": 0.2121375948190689, "learning_rate": 5.656248094076748e-05, "loss": 0.2043, "step": 3179 }, { "epoch": 0.6435944140862173, "grad_norm": 0.27654093503952026, "learning_rate": 5.6505161275566146e-05, "loss": 0.231, "step": 3180 }, { "epoch": 0.6437968022667476, "grad_norm": 0.2911146283149719, "learning_rate": 5.6447859229604116e-05, "loss": 0.2092, "step": 3181 }, { "epoch": 0.6439991904472778, "grad_norm": 0.2760297358036041, "learning_rate": 5.639057482609369e-05, "loss": 0.2439, "step": 3182 }, { "epoch": 0.6442015786278081, "grad_norm": 0.22860626876354218, "learning_rate": 5.6333308088240125e-05, "loss": 0.2242, "step": 3183 }, { "epoch": 0.6444039668083384, "grad_norm": 0.22273755073547363, "learning_rate": 5.6276059039241535e-05, "loss": 0.2045, "step": 3184 }, { "epoch": 0.6446063549888686, "grad_norm": 0.21743179857730865, "learning_rate": 5.621882770228883e-05, "loss": 0.2209, "step": 3185 }, { "epoch": 0.644808743169399, "grad_norm": 0.23531091213226318, "learning_rate": 5.616161410056569e-05, "loss": 0.234, "step": 3186 }, { "epoch": 0.6450111313499292, "grad_norm": 0.29375889897346497, "learning_rate": 5.6104418257248684e-05, "loss": 0.2376, "step": 3187 }, { "epoch": 0.6452135195304595, "grad_norm": 0.34802162647247314, "learning_rate": 5.6047240195507175e-05, "loss": 0.2557, "step": 3188 }, { "epoch": 0.6454159077109897, "grad_norm": 0.2270517647266388, "learning_rate": 5.599007993850329e-05, "loss": 0.188, "step": 3189 }, { "epoch": 0.64561829589152, "grad_norm": 0.2054586559534073, "learning_rate": 5.593293750939192e-05, "loss": 0.2157, "step": 3190 }, { "epoch": 0.6458206840720502, "grad_norm": 0.3220299482345581, "learning_rate": 5.587581293132088e-05, "loss": 0.3185, "step": 3191 }, { "epoch": 0.6460230722525805, "grad_norm": 0.23124554753303528, "learning_rate": 5.581870622743065e-05, "loss": 0.2252, "step": 3192 }, { "epoch": 0.6462254604331107, "grad_norm": 0.31478726863861084, "learning_rate": 5.5761617420854396e-05, "loss": 0.2426, "step": 3193 }, { "epoch": 0.646427848613641, "grad_norm": 0.2315119504928589, "learning_rate": 5.570454653471814e-05, "loss": 0.1919, "step": 3194 }, { "epoch": 0.6466302367941712, "grad_norm": 0.2783889174461365, "learning_rate": 5.5647493592140635e-05, "loss": 0.2027, "step": 3195 }, { "epoch": 0.6468326249747015, "grad_norm": 0.21910448372364044, "learning_rate": 5.559045861623331e-05, "loss": 0.2186, "step": 3196 }, { "epoch": 0.6470350131552317, "grad_norm": 0.23766078054904938, "learning_rate": 5.5533441630100394e-05, "loss": 0.2084, "step": 3197 }, { "epoch": 0.647237401335762, "grad_norm": 0.35816648602485657, "learning_rate": 5.547644265683878e-05, "loss": 0.2246, "step": 3198 }, { "epoch": 0.6474397895162922, "grad_norm": 0.2093358188867569, "learning_rate": 5.541946171953808e-05, "loss": 0.2423, "step": 3199 }, { "epoch": 0.6476421776968225, "grad_norm": 0.23430635035037994, "learning_rate": 5.536249884128053e-05, "loss": 0.2041, "step": 3200 }, { "epoch": 0.6476421776968225, "eval_loss": 0.2606339156627655, "eval_runtime": 1.3199, "eval_samples_per_second": 3.788, "eval_steps_per_second": 0.758, "step": 3200 }, { "epoch": 0.6478445658773527, "grad_norm": 0.23133951425552368, "learning_rate": 5.530555404514117e-05, "loss": 0.2166, "step": 3201 }, { "epoch": 0.648046954057883, "grad_norm": 0.29459553956985474, "learning_rate": 5.524862735418762e-05, "loss": 0.2177, "step": 3202 }, { "epoch": 0.6482493422384132, "grad_norm": 0.25066307187080383, "learning_rate": 5.519171879148023e-05, "loss": 0.2264, "step": 3203 }, { "epoch": 0.6484517304189436, "grad_norm": 0.19875210523605347, "learning_rate": 5.513482838007197e-05, "loss": 0.2229, "step": 3204 }, { "epoch": 0.6486541185994738, "grad_norm": 0.26769936084747314, "learning_rate": 5.507795614300846e-05, "loss": 0.244, "step": 3205 }, { "epoch": 0.6488565067800041, "grad_norm": 0.2770192325115204, "learning_rate": 5.5021102103327956e-05, "loss": 0.2256, "step": 3206 }, { "epoch": 0.6490588949605343, "grad_norm": 0.292390912771225, "learning_rate": 5.4964266284061415e-05, "loss": 0.2212, "step": 3207 }, { "epoch": 0.6492612831410646, "grad_norm": 0.2941245138645172, "learning_rate": 5.4907448708232254e-05, "loss": 0.2338, "step": 3208 }, { "epoch": 0.6494636713215948, "grad_norm": 0.25069254636764526, "learning_rate": 5.485064939885665e-05, "loss": 0.2489, "step": 3209 }, { "epoch": 0.6496660595021251, "grad_norm": 0.26239052414894104, "learning_rate": 5.479386837894328e-05, "loss": 0.2519, "step": 3210 }, { "epoch": 0.6498684476826553, "grad_norm": 0.22076480090618134, "learning_rate": 5.47371056714935e-05, "loss": 0.2094, "step": 3211 }, { "epoch": 0.6500708358631856, "grad_norm": 0.24472157657146454, "learning_rate": 5.468036129950118e-05, "loss": 0.2381, "step": 3212 }, { "epoch": 0.6502732240437158, "grad_norm": 0.2893001139163971, "learning_rate": 5.462363528595281e-05, "loss": 0.2429, "step": 3213 }, { "epoch": 0.6504756122242461, "grad_norm": 0.25055640935897827, "learning_rate": 5.456692765382744e-05, "loss": 0.2159, "step": 3214 }, { "epoch": 0.6506780004047764, "grad_norm": 0.25454723834991455, "learning_rate": 5.451023842609657e-05, "loss": 0.2195, "step": 3215 }, { "epoch": 0.6508803885853066, "grad_norm": 0.27267035841941833, "learning_rate": 5.44535676257244e-05, "loss": 0.2339, "step": 3216 }, { "epoch": 0.6510827767658369, "grad_norm": 0.261115700006485, "learning_rate": 5.439691527566756e-05, "loss": 0.2351, "step": 3217 }, { "epoch": 0.6512851649463671, "grad_norm": 0.33638855814933777, "learning_rate": 5.434028139887526e-05, "loss": 0.2053, "step": 3218 }, { "epoch": 0.6514875531268974, "grad_norm": 0.26877719163894653, "learning_rate": 5.428366601828918e-05, "loss": 0.234, "step": 3219 }, { "epoch": 0.6516899413074276, "grad_norm": 0.24609103798866272, "learning_rate": 5.422706915684357e-05, "loss": 0.2418, "step": 3220 }, { "epoch": 0.6518923294879579, "grad_norm": 0.27210500836372375, "learning_rate": 5.4170490837465126e-05, "loss": 0.2211, "step": 3221 }, { "epoch": 0.6520947176684881, "grad_norm": 0.2737904489040375, "learning_rate": 5.411393108307308e-05, "loss": 0.2539, "step": 3222 }, { "epoch": 0.6522971058490185, "grad_norm": 0.2679438292980194, "learning_rate": 5.4057389916579047e-05, "loss": 0.2865, "step": 3223 }, { "epoch": 0.6524994940295487, "grad_norm": 0.21572163701057434, "learning_rate": 5.4000867360887206e-05, "loss": 0.2166, "step": 3224 }, { "epoch": 0.652701882210079, "grad_norm": 0.22046849131584167, "learning_rate": 5.394436343889418e-05, "loss": 0.2302, "step": 3225 }, { "epoch": 0.6529042703906092, "grad_norm": 0.2549203038215637, "learning_rate": 5.388787817348901e-05, "loss": 0.2428, "step": 3226 }, { "epoch": 0.6531066585711395, "grad_norm": 0.23139064013957977, "learning_rate": 5.383141158755324e-05, "loss": 0.2247, "step": 3227 }, { "epoch": 0.6533090467516697, "grad_norm": 0.22760528326034546, "learning_rate": 5.377496370396078e-05, "loss": 0.2398, "step": 3228 }, { "epoch": 0.6535114349322, "grad_norm": 0.22975961863994598, "learning_rate": 5.371853454557804e-05, "loss": 0.217, "step": 3229 }, { "epoch": 0.6537138231127302, "grad_norm": 0.2388610690832138, "learning_rate": 5.36621241352637e-05, "loss": 0.2228, "step": 3230 }, { "epoch": 0.6539162112932605, "grad_norm": 0.23507040739059448, "learning_rate": 5.3605732495868986e-05, "loss": 0.2548, "step": 3231 }, { "epoch": 0.6541185994737907, "grad_norm": 0.2857777178287506, "learning_rate": 5.354935965023753e-05, "loss": 0.2758, "step": 3232 }, { "epoch": 0.654320987654321, "grad_norm": 0.24908968806266785, "learning_rate": 5.349300562120528e-05, "loss": 0.2359, "step": 3233 }, { "epoch": 0.6545233758348512, "grad_norm": 0.235075443983078, "learning_rate": 5.343667043160054e-05, "loss": 0.2394, "step": 3234 }, { "epoch": 0.6547257640153815, "grad_norm": 0.24633139371871948, "learning_rate": 5.338035410424402e-05, "loss": 0.1881, "step": 3235 }, { "epoch": 0.6549281521959117, "grad_norm": 0.3352043330669403, "learning_rate": 5.3324056661948875e-05, "loss": 0.2317, "step": 3236 }, { "epoch": 0.655130540376442, "grad_norm": 0.36961349844932556, "learning_rate": 5.326777812752041e-05, "loss": 0.2572, "step": 3237 }, { "epoch": 0.6553329285569722, "grad_norm": 0.3469850420951843, "learning_rate": 5.321151852375641e-05, "loss": 0.2485, "step": 3238 }, { "epoch": 0.6555353167375025, "grad_norm": 0.25540149211883545, "learning_rate": 5.315527787344703e-05, "loss": 0.2352, "step": 3239 }, { "epoch": 0.6557377049180327, "grad_norm": 0.23150105774402618, "learning_rate": 5.3099056199374684e-05, "loss": 0.2465, "step": 3240 }, { "epoch": 0.655940093098563, "grad_norm": 0.2713060975074768, "learning_rate": 5.304285352431404e-05, "loss": 0.2468, "step": 3241 }, { "epoch": 0.6561424812790932, "grad_norm": 0.26449888944625854, "learning_rate": 5.2986669871032156e-05, "loss": 0.2268, "step": 3242 }, { "epoch": 0.6563448694596236, "grad_norm": 0.32318148016929626, "learning_rate": 5.2930505262288365e-05, "loss": 0.2614, "step": 3243 }, { "epoch": 0.6565472576401539, "grad_norm": 0.24303804337978363, "learning_rate": 5.287435972083428e-05, "loss": 0.244, "step": 3244 }, { "epoch": 0.6567496458206841, "grad_norm": 0.2077636867761612, "learning_rate": 5.2818233269413776e-05, "loss": 0.1903, "step": 3245 }, { "epoch": 0.6569520340012144, "grad_norm": 0.2627265751361847, "learning_rate": 5.276212593076302e-05, "loss": 0.2064, "step": 3246 }, { "epoch": 0.6571544221817446, "grad_norm": 0.2441711574792862, "learning_rate": 5.27060377276104e-05, "loss": 0.2236, "step": 3247 }, { "epoch": 0.6573568103622749, "grad_norm": 0.24978910386562347, "learning_rate": 5.2649968682676665e-05, "loss": 0.195, "step": 3248 }, { "epoch": 0.6575591985428051, "grad_norm": 0.2538270652294159, "learning_rate": 5.259391881867459e-05, "loss": 0.2577, "step": 3249 }, { "epoch": 0.6577615867233354, "grad_norm": 0.27217862010002136, "learning_rate": 5.253788815830936e-05, "loss": 0.2539, "step": 3250 }, { "epoch": 0.6577615867233354, "eval_loss": 0.26086652278900146, "eval_runtime": 1.3193, "eval_samples_per_second": 3.79, "eval_steps_per_second": 0.758, "step": 3250 } ], "logging_steps": 1, "max_steps": 4941, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.240886903735255e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }