diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15554 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998871968415116, + "eval_steps": 500, + "global_step": 2216, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004512126339537507, + "grad_norm": 11.364033095495042, + "learning_rate": 2.9850746268656714e-08, + "loss": 1.1259, + "step": 1 + }, + { + "epoch": 0.0009024252679075014, + "grad_norm": 13.931051354363555, + "learning_rate": 5.970149253731343e-08, + "loss": 1.0695, + "step": 2 + }, + { + "epoch": 0.0013536379018612521, + "grad_norm": 12.591108456147174, + "learning_rate": 8.955223880597014e-08, + "loss": 1.1413, + "step": 3 + }, + { + "epoch": 0.0018048505358150028, + "grad_norm": 13.097679442569106, + "learning_rate": 1.1940298507462686e-07, + "loss": 1.1058, + "step": 4 + }, + { + "epoch": 0.0022560631697687537, + "grad_norm": 10.498761541717155, + "learning_rate": 1.4925373134328355e-07, + "loss": 1.1735, + "step": 5 + }, + { + "epoch": 0.0027072758037225042, + "grad_norm": 10.84063689789702, + "learning_rate": 1.7910447761194027e-07, + "loss": 1.1406, + "step": 6 + }, + { + "epoch": 0.003158488437676255, + "grad_norm": 11.189978725198145, + "learning_rate": 2.08955223880597e-07, + "loss": 1.1035, + "step": 7 + }, + { + "epoch": 0.0036097010716300056, + "grad_norm": 11.74878304446126, + "learning_rate": 2.388059701492537e-07, + "loss": 1.1278, + "step": 8 + }, + { + "epoch": 0.0040609137055837565, + "grad_norm": 11.451667410942289, + "learning_rate": 2.686567164179104e-07, + "loss": 1.0887, + "step": 9 + }, + { + "epoch": 0.0045121263395375075, + "grad_norm": 13.400719338755062, + "learning_rate": 2.985074626865671e-07, + "loss": 0.9145, + "step": 10 + }, + { + "epoch": 0.0049633389734912575, + "grad_norm": 12.493648403772585, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.9701, + "step": 11 + }, + { + "epoch": 0.0054145516074450084, + "grad_norm": 11.383629551476218, + "learning_rate": 3.5820895522388055e-07, + "loss": 1.0357, + "step": 12 + }, + { + "epoch": 0.005865764241398759, + "grad_norm": 10.68706994675752, + "learning_rate": 3.880597014925373e-07, + "loss": 1.0357, + "step": 13 + }, + { + "epoch": 0.00631697687535251, + "grad_norm": 12.110795882375692, + "learning_rate": 4.17910447761194e-07, + "loss": 1.1691, + "step": 14 + }, + { + "epoch": 0.00676818950930626, + "grad_norm": 11.01201459317177, + "learning_rate": 4.4776119402985074e-07, + "loss": 1.0552, + "step": 15 + }, + { + "epoch": 0.007219402143260011, + "grad_norm": 11.621902494318055, + "learning_rate": 4.776119402985074e-07, + "loss": 1.1426, + "step": 16 + }, + { + "epoch": 0.007670614777213762, + "grad_norm": 12.473147801613736, + "learning_rate": 5.074626865671642e-07, + "loss": 1.0951, + "step": 17 + }, + { + "epoch": 0.008121827411167513, + "grad_norm": 9.182896427775415, + "learning_rate": 5.373134328358208e-07, + "loss": 1.101, + "step": 18 + }, + { + "epoch": 0.008573040045121263, + "grad_norm": 9.270909685736212, + "learning_rate": 5.671641791044775e-07, + "loss": 1.1811, + "step": 19 + }, + { + "epoch": 0.009024252679075015, + "grad_norm": 10.991256459745955, + "learning_rate": 5.970149253731342e-07, + "loss": 1.1503, + "step": 20 + }, + { + "epoch": 0.009475465313028765, + "grad_norm": 9.255537906212872, + "learning_rate": 6.26865671641791e-07, + "loss": 0.9254, + "step": 21 + }, + { + "epoch": 0.009926677946982515, + "grad_norm": 8.790099892645406, + "learning_rate": 6.567164179104477e-07, + "loss": 1.196, + "step": 22 + }, + { + "epoch": 0.010377890580936267, + "grad_norm": 8.636133146821528, + "learning_rate": 6.865671641791044e-07, + "loss": 1.0438, + "step": 23 + }, + { + "epoch": 0.010829103214890017, + "grad_norm": 7.949499233764216, + "learning_rate": 7.164179104477611e-07, + "loss": 1.0595, + "step": 24 + }, + { + "epoch": 0.011280315848843767, + "grad_norm": 8.40491249369533, + "learning_rate": 7.462686567164179e-07, + "loss": 0.9882, + "step": 25 + }, + { + "epoch": 0.011731528482797519, + "grad_norm": 7.041925202309207, + "learning_rate": 7.761194029850746e-07, + "loss": 0.8373, + "step": 26 + }, + { + "epoch": 0.012182741116751269, + "grad_norm": 5.936172454200004, + "learning_rate": 8.059701492537313e-07, + "loss": 1.0756, + "step": 27 + }, + { + "epoch": 0.01263395375070502, + "grad_norm": 5.908249319041824, + "learning_rate": 8.35820895522388e-07, + "loss": 1.0573, + "step": 28 + }, + { + "epoch": 0.01308516638465877, + "grad_norm": 4.780207987320098, + "learning_rate": 8.656716417910447e-07, + "loss": 0.9898, + "step": 29 + }, + { + "epoch": 0.01353637901861252, + "grad_norm": 4.772574247160276, + "learning_rate": 8.955223880597015e-07, + "loss": 0.7025, + "step": 30 + }, + { + "epoch": 0.013987591652566272, + "grad_norm": 4.950826482077291, + "learning_rate": 9.253731343283582e-07, + "loss": 0.935, + "step": 31 + }, + { + "epoch": 0.014438804286520023, + "grad_norm": 5.647161557749149, + "learning_rate": 9.552238805970149e-07, + "loss": 0.9128, + "step": 32 + }, + { + "epoch": 0.014890016920473773, + "grad_norm": 4.248897174075301, + "learning_rate": 9.850746268656714e-07, + "loss": 0.92, + "step": 33 + }, + { + "epoch": 0.015341229554427524, + "grad_norm": 4.260585778705935, + "learning_rate": 1.0149253731343285e-06, + "loss": 0.997, + "step": 34 + }, + { + "epoch": 0.015792442188381276, + "grad_norm": 4.712117773863939, + "learning_rate": 1.0447761194029848e-06, + "loss": 0.9365, + "step": 35 + }, + { + "epoch": 0.016243654822335026, + "grad_norm": 4.558718307740576, + "learning_rate": 1.0746268656716416e-06, + "loss": 0.9952, + "step": 36 + }, + { + "epoch": 0.016694867456288776, + "grad_norm": 4.536125586817622, + "learning_rate": 1.1044776119402984e-06, + "loss": 0.9118, + "step": 37 + }, + { + "epoch": 0.017146080090242526, + "grad_norm": 4.333053203593244, + "learning_rate": 1.134328358208955e-06, + "loss": 0.9406, + "step": 38 + }, + { + "epoch": 0.017597292724196276, + "grad_norm": 3.913517979851077, + "learning_rate": 1.1641791044776118e-06, + "loss": 0.68, + "step": 39 + }, + { + "epoch": 0.01804850535815003, + "grad_norm": 3.8887693821373746, + "learning_rate": 1.1940298507462684e-06, + "loss": 0.8987, + "step": 40 + }, + { + "epoch": 0.01849971799210378, + "grad_norm": 3.7966150058507626, + "learning_rate": 1.2238805970149252e-06, + "loss": 0.8783, + "step": 41 + }, + { + "epoch": 0.01895093062605753, + "grad_norm": 3.8413216293595465, + "learning_rate": 1.253731343283582e-06, + "loss": 1.0114, + "step": 42 + }, + { + "epoch": 0.01940214326001128, + "grad_norm": 4.031064280953991, + "learning_rate": 1.2835820895522386e-06, + "loss": 0.9885, + "step": 43 + }, + { + "epoch": 0.01985335589396503, + "grad_norm": 4.3421298053507025, + "learning_rate": 1.3134328358208954e-06, + "loss": 0.7812, + "step": 44 + }, + { + "epoch": 0.02030456852791878, + "grad_norm": 3.6498986771766377, + "learning_rate": 1.3432835820895522e-06, + "loss": 0.864, + "step": 45 + }, + { + "epoch": 0.020755781161872534, + "grad_norm": 3.959983938073483, + "learning_rate": 1.3731343283582088e-06, + "loss": 1.0879, + "step": 46 + }, + { + "epoch": 0.021206993795826284, + "grad_norm": 3.897851022301326, + "learning_rate": 1.4029850746268656e-06, + "loss": 0.8353, + "step": 47 + }, + { + "epoch": 0.021658206429780034, + "grad_norm": 3.9557985042022734, + "learning_rate": 1.4328358208955222e-06, + "loss": 1.0151, + "step": 48 + }, + { + "epoch": 0.022109419063733784, + "grad_norm": 3.954727075094809, + "learning_rate": 1.462686567164179e-06, + "loss": 0.8335, + "step": 49 + }, + { + "epoch": 0.022560631697687534, + "grad_norm": 4.085425811341359, + "learning_rate": 1.4925373134328358e-06, + "loss": 0.8118, + "step": 50 + }, + { + "epoch": 0.023011844331641287, + "grad_norm": 3.8699428653568715, + "learning_rate": 1.5223880597014924e-06, + "loss": 0.8204, + "step": 51 + }, + { + "epoch": 0.023463056965595037, + "grad_norm": 3.9398844157805035, + "learning_rate": 1.5522388059701492e-06, + "loss": 0.8355, + "step": 52 + }, + { + "epoch": 0.023914269599548788, + "grad_norm": 3.9157701162909917, + "learning_rate": 1.5820895522388058e-06, + "loss": 0.7559, + "step": 53 + }, + { + "epoch": 0.024365482233502538, + "grad_norm": 3.2664411221295166, + "learning_rate": 1.6119402985074626e-06, + "loss": 0.6912, + "step": 54 + }, + { + "epoch": 0.024816694867456288, + "grad_norm": 4.102566095975955, + "learning_rate": 1.6417910447761194e-06, + "loss": 0.8001, + "step": 55 + }, + { + "epoch": 0.02526790750141004, + "grad_norm": 3.984516243751416, + "learning_rate": 1.671641791044776e-06, + "loss": 0.6703, + "step": 56 + }, + { + "epoch": 0.02571912013536379, + "grad_norm": 3.282165881291406, + "learning_rate": 1.7014925373134328e-06, + "loss": 0.6232, + "step": 57 + }, + { + "epoch": 0.02617033276931754, + "grad_norm": 3.5552685115913896, + "learning_rate": 1.7313432835820893e-06, + "loss": 0.8499, + "step": 58 + }, + { + "epoch": 0.02662154540327129, + "grad_norm": 3.963753035943293, + "learning_rate": 1.7611940298507461e-06, + "loss": 0.7578, + "step": 59 + }, + { + "epoch": 0.02707275803722504, + "grad_norm": 3.9201161090299235, + "learning_rate": 1.791044776119403e-06, + "loss": 0.7639, + "step": 60 + }, + { + "epoch": 0.02752397067117879, + "grad_norm": 4.525487683658723, + "learning_rate": 1.8208955223880595e-06, + "loss": 0.7993, + "step": 61 + }, + { + "epoch": 0.027975183305132545, + "grad_norm": 3.577108451322024, + "learning_rate": 1.8507462686567163e-06, + "loss": 0.712, + "step": 62 + }, + { + "epoch": 0.028426395939086295, + "grad_norm": 3.750651576099599, + "learning_rate": 1.8805970149253731e-06, + "loss": 0.8058, + "step": 63 + }, + { + "epoch": 0.028877608573040045, + "grad_norm": 4.016735732488834, + "learning_rate": 1.9104477611940297e-06, + "loss": 0.8695, + "step": 64 + }, + { + "epoch": 0.029328821206993795, + "grad_norm": 3.274144203992301, + "learning_rate": 1.9402985074626867e-06, + "loss": 0.7518, + "step": 65 + }, + { + "epoch": 0.029780033840947545, + "grad_norm": 3.8331394793119284, + "learning_rate": 1.970149253731343e-06, + "loss": 0.7467, + "step": 66 + }, + { + "epoch": 0.0302312464749013, + "grad_norm": 3.7944270550638914, + "learning_rate": 2e-06, + "loss": 0.6613, + "step": 67 + }, + { + "epoch": 0.03068245910885505, + "grad_norm": 3.9078761683052377, + "learning_rate": 1.9999989314450967e-06, + "loss": 0.6253, + "step": 68 + }, + { + "epoch": 0.0311336717428088, + "grad_norm": 3.7685631166339966, + "learning_rate": 1.9999957257826715e-06, + "loss": 0.6976, + "step": 69 + }, + { + "epoch": 0.03158488437676255, + "grad_norm": 3.5388877908849423, + "learning_rate": 1.9999903830195744e-06, + "loss": 0.6982, + "step": 70 + }, + { + "epoch": 0.0320360970107163, + "grad_norm": 3.9557829292992435, + "learning_rate": 1.9999829031672236e-06, + "loss": 0.9304, + "step": 71 + }, + { + "epoch": 0.03248730964467005, + "grad_norm": 3.4688030496000697, + "learning_rate": 1.9999732862416053e-06, + "loss": 0.7131, + "step": 72 + }, + { + "epoch": 0.0329385222786238, + "grad_norm": 3.4382929899009707, + "learning_rate": 1.9999615322632707e-06, + "loss": 0.67, + "step": 73 + }, + { + "epoch": 0.03338973491257755, + "grad_norm": 3.68765625410286, + "learning_rate": 1.9999476412573397e-06, + "loss": 0.6871, + "step": 74 + }, + { + "epoch": 0.0338409475465313, + "grad_norm": 3.9034136569075444, + "learning_rate": 1.999931613253499e-06, + "loss": 0.7368, + "step": 75 + }, + { + "epoch": 0.03429216018048505, + "grad_norm": 3.4434580645266983, + "learning_rate": 1.9999134482860026e-06, + "loss": 0.7582, + "step": 76 + }, + { + "epoch": 0.0347433728144388, + "grad_norm": 3.4427590958689827, + "learning_rate": 1.9998931463936704e-06, + "loss": 0.6917, + "step": 77 + }, + { + "epoch": 0.03519458544839255, + "grad_norm": 3.5129243585420045, + "learning_rate": 1.9998707076198903e-06, + "loss": 0.7202, + "step": 78 + }, + { + "epoch": 0.0356457980823463, + "grad_norm": 3.3097035106926334, + "learning_rate": 1.999846132012616e-06, + "loss": 0.6518, + "step": 79 + }, + { + "epoch": 0.03609701071630006, + "grad_norm": 4.01365302973293, + "learning_rate": 1.9998194196243685e-06, + "loss": 0.8061, + "step": 80 + }, + { + "epoch": 0.03654822335025381, + "grad_norm": 3.936175515147698, + "learning_rate": 1.999790570512235e-06, + "loss": 0.6833, + "step": 81 + }, + { + "epoch": 0.03699943598420756, + "grad_norm": 4.047146080445624, + "learning_rate": 1.9997595847378693e-06, + "loss": 0.5754, + "step": 82 + }, + { + "epoch": 0.03745064861816131, + "grad_norm": 4.520964419884895, + "learning_rate": 1.9997264623674913e-06, + "loss": 0.7149, + "step": 83 + }, + { + "epoch": 0.03790186125211506, + "grad_norm": 4.192870395330733, + "learning_rate": 1.999691203471887e-06, + "loss": 0.8313, + "step": 84 + }, + { + "epoch": 0.03835307388606881, + "grad_norm": 3.9624103451987347, + "learning_rate": 1.9996538081264093e-06, + "loss": 0.6514, + "step": 85 + }, + { + "epoch": 0.03880428652002256, + "grad_norm": 3.520519118359727, + "learning_rate": 1.9996142764109753e-06, + "loss": 0.7045, + "step": 86 + }, + { + "epoch": 0.03925549915397631, + "grad_norm": 3.5083592386843914, + "learning_rate": 1.999572608410069e-06, + "loss": 0.5047, + "step": 87 + }, + { + "epoch": 0.03970671178793006, + "grad_norm": 3.620127744810978, + "learning_rate": 1.999528804212739e-06, + "loss": 0.6376, + "step": 88 + }, + { + "epoch": 0.04015792442188381, + "grad_norm": 3.0483993598270906, + "learning_rate": 1.9994828639126008e-06, + "loss": 0.6099, + "step": 89 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 3.7495513331273984, + "learning_rate": 1.999434787607833e-06, + "loss": 0.7896, + "step": 90 + }, + { + "epoch": 0.04106034968979132, + "grad_norm": 3.4700395599360796, + "learning_rate": 1.9993845754011797e-06, + "loss": 0.6285, + "step": 91 + }, + { + "epoch": 0.04151156232374507, + "grad_norm": 3.131175268232334, + "learning_rate": 1.9993322273999505e-06, + "loss": 0.6492, + "step": 92 + }, + { + "epoch": 0.04196277495769882, + "grad_norm": 3.5622755072245127, + "learning_rate": 1.9992777437160185e-06, + "loss": 0.8218, + "step": 93 + }, + { + "epoch": 0.04241398759165257, + "grad_norm": 3.3564760065634744, + "learning_rate": 1.9992211244658214e-06, + "loss": 0.6667, + "step": 94 + }, + { + "epoch": 0.04286520022560632, + "grad_norm": 3.687670006527879, + "learning_rate": 1.999162369770361e-06, + "loss": 0.6048, + "step": 95 + }, + { + "epoch": 0.04331641285956007, + "grad_norm": 3.7369728445839776, + "learning_rate": 1.9991014797552027e-06, + "loss": 0.8528, + "step": 96 + }, + { + "epoch": 0.04376762549351382, + "grad_norm": 4.326301281314035, + "learning_rate": 1.999038454550474e-06, + "loss": 0.6348, + "step": 97 + }, + { + "epoch": 0.04421883812746757, + "grad_norm": 3.330654698188114, + "learning_rate": 1.998973294290868e-06, + "loss": 0.6672, + "step": 98 + }, + { + "epoch": 0.04467005076142132, + "grad_norm": 3.4395950335828465, + "learning_rate": 1.998905999115639e-06, + "loss": 0.6424, + "step": 99 + }, + { + "epoch": 0.04512126339537507, + "grad_norm": 3.4714288997142835, + "learning_rate": 1.998836569168603e-06, + "loss": 0.6887, + "step": 100 + }, + { + "epoch": 0.04557247602932882, + "grad_norm": 3.739675822036197, + "learning_rate": 1.998765004598141e-06, + "loss": 0.744, + "step": 101 + }, + { + "epoch": 0.046023688663282575, + "grad_norm": 3.5376293890493167, + "learning_rate": 1.998691305557194e-06, + "loss": 0.75, + "step": 102 + }, + { + "epoch": 0.046474901297236325, + "grad_norm": 3.9480104427554745, + "learning_rate": 1.9986154722032646e-06, + "loss": 0.594, + "step": 103 + }, + { + "epoch": 0.046926113931190075, + "grad_norm": 3.9967536358361135, + "learning_rate": 1.9985375046984165e-06, + "loss": 0.7111, + "step": 104 + }, + { + "epoch": 0.047377326565143825, + "grad_norm": 4.404083699085148, + "learning_rate": 1.9984574032092758e-06, + "loss": 0.6737, + "step": 105 + }, + { + "epoch": 0.047828539199097575, + "grad_norm": 3.9446789384138943, + "learning_rate": 1.998375167907028e-06, + "loss": 0.5581, + "step": 106 + }, + { + "epoch": 0.048279751833051325, + "grad_norm": 3.2749627513940354, + "learning_rate": 1.9982907989674186e-06, + "loss": 0.6848, + "step": 107 + }, + { + "epoch": 0.048730964467005075, + "grad_norm": 3.1745752031824503, + "learning_rate": 1.9982042965707534e-06, + "loss": 0.6993, + "step": 108 + }, + { + "epoch": 0.049182177100958825, + "grad_norm": 2.918514031480819, + "learning_rate": 1.9981156609018974e-06, + "loss": 0.5622, + "step": 109 + }, + { + "epoch": 0.049633389734912575, + "grad_norm": 3.2967345302885773, + "learning_rate": 1.998024892150275e-06, + "loss": 0.7764, + "step": 110 + }, + { + "epoch": 0.050084602368866325, + "grad_norm": 4.096076864953819, + "learning_rate": 1.997931990509869e-06, + "loss": 0.8217, + "step": 111 + }, + { + "epoch": 0.05053581500282008, + "grad_norm": 3.5086960564534073, + "learning_rate": 1.9978369561792203e-06, + "loss": 0.7334, + "step": 112 + }, + { + "epoch": 0.05098702763677383, + "grad_norm": 4.957647662303255, + "learning_rate": 1.997739789361428e-06, + "loss": 0.7043, + "step": 113 + }, + { + "epoch": 0.05143824027072758, + "grad_norm": 3.108513749586831, + "learning_rate": 1.9976404902641475e-06, + "loss": 0.5838, + "step": 114 + }, + { + "epoch": 0.05188945290468133, + "grad_norm": 4.374801020766516, + "learning_rate": 1.9975390590995923e-06, + "loss": 0.6207, + "step": 115 + }, + { + "epoch": 0.05234066553863508, + "grad_norm": 3.5847626690371746, + "learning_rate": 1.9974354960845323e-06, + "loss": 0.7108, + "step": 116 + }, + { + "epoch": 0.05279187817258883, + "grad_norm": 3.3972794164820566, + "learning_rate": 1.9973298014402927e-06, + "loss": 0.6389, + "step": 117 + }, + { + "epoch": 0.05324309080654258, + "grad_norm": 3.4765618382257686, + "learning_rate": 1.9972219753927547e-06, + "loss": 0.7736, + "step": 118 + }, + { + "epoch": 0.05369430344049633, + "grad_norm": 3.983880838580834, + "learning_rate": 1.997112018172354e-06, + "loss": 0.6782, + "step": 119 + }, + { + "epoch": 0.05414551607445008, + "grad_norm": 3.475938176625616, + "learning_rate": 1.9969999300140816e-06, + "loss": 0.6435, + "step": 120 + }, + { + "epoch": 0.05459672870840383, + "grad_norm": 3.2723738223571908, + "learning_rate": 1.9968857111574823e-06, + "loss": 0.725, + "step": 121 + }, + { + "epoch": 0.05504794134235758, + "grad_norm": 2.7837662665336684, + "learning_rate": 1.9967693618466537e-06, + "loss": 0.5725, + "step": 122 + }, + { + "epoch": 0.05549915397631134, + "grad_norm": 3.6895192194143287, + "learning_rate": 1.996650882330248e-06, + "loss": 0.6869, + "step": 123 + }, + { + "epoch": 0.05595036661026509, + "grad_norm": 3.4109605481942475, + "learning_rate": 1.9965302728614685e-06, + "loss": 0.7059, + "step": 124 + }, + { + "epoch": 0.05640157924421884, + "grad_norm": 3.1175030442363023, + "learning_rate": 1.9964075336980705e-06, + "loss": 0.5166, + "step": 125 + }, + { + "epoch": 0.05685279187817259, + "grad_norm": 3.7149554868454207, + "learning_rate": 1.9962826651023618e-06, + "loss": 0.6477, + "step": 126 + }, + { + "epoch": 0.05730400451212634, + "grad_norm": 2.7779966288953006, + "learning_rate": 1.9961556673412e-06, + "loss": 0.5851, + "step": 127 + }, + { + "epoch": 0.05775521714608009, + "grad_norm": 3.6464661606212627, + "learning_rate": 1.9960265406859927e-06, + "loss": 0.7105, + "step": 128 + }, + { + "epoch": 0.05820642978003384, + "grad_norm": 4.222181937491618, + "learning_rate": 1.9958952854126986e-06, + "loss": 0.8397, + "step": 129 + }, + { + "epoch": 0.05865764241398759, + "grad_norm": 3.463205955817649, + "learning_rate": 1.995761901801824e-06, + "loss": 0.6813, + "step": 130 + }, + { + "epoch": 0.05910885504794134, + "grad_norm": 3.5951321527152214, + "learning_rate": 1.995626390138425e-06, + "loss": 0.6965, + "step": 131 + }, + { + "epoch": 0.05956006768189509, + "grad_norm": 3.321905615889881, + "learning_rate": 1.995488750712104e-06, + "loss": 0.7583, + "step": 132 + }, + { + "epoch": 0.06001128031584884, + "grad_norm": 3.5601557448366763, + "learning_rate": 1.995348983817012e-06, + "loss": 0.595, + "step": 133 + }, + { + "epoch": 0.0604624929498026, + "grad_norm": 3.734905545964483, + "learning_rate": 1.9952070897518465e-06, + "loss": 0.6582, + "step": 134 + }, + { + "epoch": 0.06091370558375635, + "grad_norm": 3.3634272520299824, + "learning_rate": 1.99506306881985e-06, + "loss": 0.635, + "step": 135 + }, + { + "epoch": 0.0613649182177101, + "grad_norm": 3.2571407250429774, + "learning_rate": 1.9949169213288123e-06, + "loss": 0.6879, + "step": 136 + }, + { + "epoch": 0.06181613085166385, + "grad_norm": 3.402056992142215, + "learning_rate": 1.9947686475910653e-06, + "loss": 0.66, + "step": 137 + }, + { + "epoch": 0.0622673434856176, + "grad_norm": 3.230669775565966, + "learning_rate": 1.9946182479234867e-06, + "loss": 0.6797, + "step": 138 + }, + { + "epoch": 0.06271855611957135, + "grad_norm": 3.2932122097098606, + "learning_rate": 1.9944657226474975e-06, + "loss": 0.6038, + "step": 139 + }, + { + "epoch": 0.0631697687535251, + "grad_norm": 3.453841593898078, + "learning_rate": 1.9943110720890605e-06, + "loss": 0.7284, + "step": 140 + }, + { + "epoch": 0.06362098138747885, + "grad_norm": 3.4659777218616945, + "learning_rate": 1.994154296578681e-06, + "loss": 0.7008, + "step": 141 + }, + { + "epoch": 0.0640721940214326, + "grad_norm": 3.2914543179139737, + "learning_rate": 1.993995396451406e-06, + "loss": 0.679, + "step": 142 + }, + { + "epoch": 0.06452340665538635, + "grad_norm": 3.5718717851262873, + "learning_rate": 1.9938343720468215e-06, + "loss": 0.7916, + "step": 143 + }, + { + "epoch": 0.0649746192893401, + "grad_norm": 3.822827292185195, + "learning_rate": 1.993671223709055e-06, + "loss": 0.7997, + "step": 144 + }, + { + "epoch": 0.06542583192329385, + "grad_norm": 3.636935399595887, + "learning_rate": 1.9935059517867726e-06, + "loss": 0.8537, + "step": 145 + }, + { + "epoch": 0.0658770445572476, + "grad_norm": 3.6710814044757787, + "learning_rate": 1.993338556633178e-06, + "loss": 0.7573, + "step": 146 + }, + { + "epoch": 0.06632825719120135, + "grad_norm": 2.7858354684349282, + "learning_rate": 1.993169038606014e-06, + "loss": 0.5819, + "step": 147 + }, + { + "epoch": 0.0667794698251551, + "grad_norm": 3.276036115967757, + "learning_rate": 1.992997398067558e-06, + "loss": 0.574, + "step": 148 + }, + { + "epoch": 0.06723068245910886, + "grad_norm": 3.0839012680951052, + "learning_rate": 1.992823635384625e-06, + "loss": 0.524, + "step": 149 + }, + { + "epoch": 0.0676818950930626, + "grad_norm": 3.506642162753119, + "learning_rate": 1.9926477509285654e-06, + "loss": 0.8239, + "step": 150 + }, + { + "epoch": 0.06813310772701636, + "grad_norm": 3.148995922283508, + "learning_rate": 1.9924697450752634e-06, + "loss": 0.6518, + "step": 151 + }, + { + "epoch": 0.0685843203609701, + "grad_norm": 3.42401846246941, + "learning_rate": 1.9922896182051368e-06, + "loss": 0.7514, + "step": 152 + }, + { + "epoch": 0.06903553299492386, + "grad_norm": 3.418930096776447, + "learning_rate": 1.9921073707031367e-06, + "loss": 0.7616, + "step": 153 + }, + { + "epoch": 0.0694867456288776, + "grad_norm": 3.4667255988772614, + "learning_rate": 1.9919230029587457e-06, + "loss": 0.7787, + "step": 154 + }, + { + "epoch": 0.06993795826283136, + "grad_norm": 3.4038766672764527, + "learning_rate": 1.991736515365979e-06, + "loss": 0.5716, + "step": 155 + }, + { + "epoch": 0.0703891708967851, + "grad_norm": 2.9809305533819743, + "learning_rate": 1.99154790832338e-06, + "loss": 0.6286, + "step": 156 + }, + { + "epoch": 0.07084038353073886, + "grad_norm": 3.4845882856327264, + "learning_rate": 1.9913571822340225e-06, + "loss": 0.6862, + "step": 157 + }, + { + "epoch": 0.0712915961646926, + "grad_norm": 3.5834795110879507, + "learning_rate": 1.9911643375055103e-06, + "loss": 0.8095, + "step": 158 + }, + { + "epoch": 0.07174280879864636, + "grad_norm": 3.4230242670945867, + "learning_rate": 1.9909693745499727e-06, + "loss": 0.7338, + "step": 159 + }, + { + "epoch": 0.07219402143260012, + "grad_norm": 3.423862511414622, + "learning_rate": 1.9907722937840673e-06, + "loss": 0.6397, + "step": 160 + }, + { + "epoch": 0.07264523406655386, + "grad_norm": 3.2464417100716476, + "learning_rate": 1.990573095628977e-06, + "loss": 0.6437, + "step": 161 + }, + { + "epoch": 0.07309644670050762, + "grad_norm": 3.2452080634188603, + "learning_rate": 1.990371780510411e-06, + "loss": 0.7933, + "step": 162 + }, + { + "epoch": 0.07354765933446136, + "grad_norm": 3.8701961071397193, + "learning_rate": 1.990168348858601e-06, + "loss": 0.6959, + "step": 163 + }, + { + "epoch": 0.07399887196841512, + "grad_norm": 3.346386820700462, + "learning_rate": 1.9899628011083025e-06, + "loss": 0.7028, + "step": 164 + }, + { + "epoch": 0.07445008460236886, + "grad_norm": 3.3695620544061917, + "learning_rate": 1.9897551376987948e-06, + "loss": 0.7207, + "step": 165 + }, + { + "epoch": 0.07490129723632262, + "grad_norm": 3.3459806929523928, + "learning_rate": 1.9895453590738766e-06, + "loss": 0.565, + "step": 166 + }, + { + "epoch": 0.07535250987027636, + "grad_norm": 3.5190095763107907, + "learning_rate": 1.9893334656818678e-06, + "loss": 0.5682, + "step": 167 + }, + { + "epoch": 0.07580372250423012, + "grad_norm": 3.4154315855452277, + "learning_rate": 1.989119457975608e-06, + "loss": 0.7248, + "step": 168 + }, + { + "epoch": 0.07625493513818386, + "grad_norm": 3.619554148268442, + "learning_rate": 1.988903336412455e-06, + "loss": 0.7166, + "step": 169 + }, + { + "epoch": 0.07670614777213762, + "grad_norm": 3.628782657060156, + "learning_rate": 1.988685101454285e-06, + "loss": 0.6744, + "step": 170 + }, + { + "epoch": 0.07715736040609138, + "grad_norm": 3.222286302019397, + "learning_rate": 1.9884647535674897e-06, + "loss": 0.6345, + "step": 171 + }, + { + "epoch": 0.07760857304004512, + "grad_norm": 3.3817029503603746, + "learning_rate": 1.988242293222976e-06, + "loss": 0.6987, + "step": 172 + }, + { + "epoch": 0.07805978567399888, + "grad_norm": 3.192434440346879, + "learning_rate": 1.9880177208961674e-06, + "loss": 0.6651, + "step": 173 + }, + { + "epoch": 0.07851099830795262, + "grad_norm": 3.459120242900092, + "learning_rate": 1.9877910370669984e-06, + "loss": 0.6704, + "step": 174 + }, + { + "epoch": 0.07896221094190638, + "grad_norm": 3.5176326381584473, + "learning_rate": 1.9875622422199184e-06, + "loss": 0.7315, + "step": 175 + }, + { + "epoch": 0.07941342357586012, + "grad_norm": 3.42358670836462, + "learning_rate": 1.9873313368438856e-06, + "loss": 0.709, + "step": 176 + }, + { + "epoch": 0.07986463620981388, + "grad_norm": 3.514397087656355, + "learning_rate": 1.987098321432372e-06, + "loss": 0.6897, + "step": 177 + }, + { + "epoch": 0.08031584884376762, + "grad_norm": 3.9791349562876364, + "learning_rate": 1.9868631964833554e-06, + "loss": 0.6663, + "step": 178 + }, + { + "epoch": 0.08076706147772138, + "grad_norm": 3.511691191882589, + "learning_rate": 1.9866259624993243e-06, + "loss": 0.7678, + "step": 179 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 3.6035462780242082, + "learning_rate": 1.9863866199872745e-06, + "loss": 0.6732, + "step": 180 + }, + { + "epoch": 0.08166948674562888, + "grad_norm": 3.3265262825383384, + "learning_rate": 1.986145169458706e-06, + "loss": 0.6459, + "step": 181 + }, + { + "epoch": 0.08212069937958263, + "grad_norm": 3.070737136256683, + "learning_rate": 1.9859016114296256e-06, + "loss": 0.6672, + "step": 182 + }, + { + "epoch": 0.08257191201353638, + "grad_norm": 3.1293576967982006, + "learning_rate": 1.985655946420544e-06, + "loss": 0.6482, + "step": 183 + }, + { + "epoch": 0.08302312464749013, + "grad_norm": 3.2300733698316, + "learning_rate": 1.9854081749564737e-06, + "loss": 0.5774, + "step": 184 + }, + { + "epoch": 0.08347433728144388, + "grad_norm": 3.1911556869196525, + "learning_rate": 1.98515829756693e-06, + "loss": 0.5046, + "step": 185 + }, + { + "epoch": 0.08392554991539763, + "grad_norm": 3.463324413729419, + "learning_rate": 1.984906314785928e-06, + "loss": 0.6925, + "step": 186 + }, + { + "epoch": 0.08437676254935138, + "grad_norm": 3.4082698478159053, + "learning_rate": 1.984652227151982e-06, + "loss": 0.6869, + "step": 187 + }, + { + "epoch": 0.08482797518330513, + "grad_norm": 3.0232384513032526, + "learning_rate": 1.984396035208107e-06, + "loss": 0.6854, + "step": 188 + }, + { + "epoch": 0.08527918781725888, + "grad_norm": 3.405093474595666, + "learning_rate": 1.984137739501811e-06, + "loss": 0.8003, + "step": 189 + }, + { + "epoch": 0.08573040045121263, + "grad_norm": 2.8650239504287827, + "learning_rate": 1.983877340585102e-06, + "loss": 0.6247, + "step": 190 + }, + { + "epoch": 0.08618161308516638, + "grad_norm": 3.1843877890785053, + "learning_rate": 1.98361483901448e-06, + "loss": 0.6268, + "step": 191 + }, + { + "epoch": 0.08663282571912014, + "grad_norm": 3.2905100987787574, + "learning_rate": 1.983350235350941e-06, + "loss": 0.6307, + "step": 192 + }, + { + "epoch": 0.08708403835307389, + "grad_norm": 3.6238689522296896, + "learning_rate": 1.9830835301599705e-06, + "loss": 0.6972, + "step": 193 + }, + { + "epoch": 0.08753525098702764, + "grad_norm": 3.6407089349858994, + "learning_rate": 1.982814724011548e-06, + "loss": 0.5048, + "step": 194 + }, + { + "epoch": 0.08798646362098139, + "grad_norm": 3.414185881113126, + "learning_rate": 1.982543817480141e-06, + "loss": 0.6903, + "step": 195 + }, + { + "epoch": 0.08843767625493514, + "grad_norm": 3.5617662518759423, + "learning_rate": 1.9822708111447073e-06, + "loss": 0.7267, + "step": 196 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 3.260537631153434, + "learning_rate": 1.9819957055886903e-06, + "loss": 0.6019, + "step": 197 + }, + { + "epoch": 0.08934010152284264, + "grad_norm": 3.9131056691786648, + "learning_rate": 1.9817185014000216e-06, + "loss": 0.7248, + "step": 198 + }, + { + "epoch": 0.08979131415679639, + "grad_norm": 3.2158505559140087, + "learning_rate": 1.981439199171117e-06, + "loss": 0.4917, + "step": 199 + }, + { + "epoch": 0.09024252679075014, + "grad_norm": 3.3742251974916306, + "learning_rate": 1.9811577994988754e-06, + "loss": 0.6419, + "step": 200 + }, + { + "epoch": 0.09069373942470389, + "grad_norm": 3.960767715416477, + "learning_rate": 1.9808743029846793e-06, + "loss": 0.6485, + "step": 201 + }, + { + "epoch": 0.09114495205865764, + "grad_norm": 3.2106160146208556, + "learning_rate": 1.980588710234392e-06, + "loss": 0.6513, + "step": 202 + }, + { + "epoch": 0.09159616469261139, + "grad_norm": 3.3168520724652435, + "learning_rate": 1.980301021858356e-06, + "loss": 0.6639, + "step": 203 + }, + { + "epoch": 0.09204737732656515, + "grad_norm": 3.3182631173626, + "learning_rate": 1.9800112384713937e-06, + "loss": 0.6178, + "step": 204 + }, + { + "epoch": 0.09249858996051889, + "grad_norm": 3.162501767023835, + "learning_rate": 1.9797193606928037e-06, + "loss": 0.6681, + "step": 205 + }, + { + "epoch": 0.09294980259447265, + "grad_norm": 3.2385347157247955, + "learning_rate": 1.9794253891463602e-06, + "loss": 0.5039, + "step": 206 + }, + { + "epoch": 0.09340101522842639, + "grad_norm": 3.573031152116351, + "learning_rate": 1.979129324460314e-06, + "loss": 0.6903, + "step": 207 + }, + { + "epoch": 0.09385222786238015, + "grad_norm": 3.601364277364461, + "learning_rate": 1.978831167267387e-06, + "loss": 0.5897, + "step": 208 + }, + { + "epoch": 0.09430344049633389, + "grad_norm": 3.2791769809356537, + "learning_rate": 1.9785309182047735e-06, + "loss": 0.5525, + "step": 209 + }, + { + "epoch": 0.09475465313028765, + "grad_norm": 4.31342038746554, + "learning_rate": 1.9782285779141393e-06, + "loss": 0.5973, + "step": 210 + }, + { + "epoch": 0.09520586576424139, + "grad_norm": 3.196166578673568, + "learning_rate": 1.977924147041619e-06, + "loss": 0.662, + "step": 211 + }, + { + "epoch": 0.09565707839819515, + "grad_norm": 3.5134072643179586, + "learning_rate": 1.9776176262378144e-06, + "loss": 0.6167, + "step": 212 + }, + { + "epoch": 0.09610829103214891, + "grad_norm": 3.4425320121595977, + "learning_rate": 1.977309016157794e-06, + "loss": 0.595, + "step": 213 + }, + { + "epoch": 0.09655950366610265, + "grad_norm": 3.374754780954467, + "learning_rate": 1.9769983174610917e-06, + "loss": 0.5101, + "step": 214 + }, + { + "epoch": 0.09701071630005641, + "grad_norm": 3.107648208296178, + "learning_rate": 1.9766855308117048e-06, + "loss": 0.705, + "step": 215 + }, + { + "epoch": 0.09746192893401015, + "grad_norm": 3.507381249990775, + "learning_rate": 1.9763706568780925e-06, + "loss": 0.6518, + "step": 216 + }, + { + "epoch": 0.09791314156796391, + "grad_norm": 3.178375092239186, + "learning_rate": 1.9760536963331747e-06, + "loss": 0.8446, + "step": 217 + }, + { + "epoch": 0.09836435420191765, + "grad_norm": 3.361779236537557, + "learning_rate": 1.9757346498543316e-06, + "loss": 0.5807, + "step": 218 + }, + { + "epoch": 0.09881556683587141, + "grad_norm": 3.859132201884954, + "learning_rate": 1.9754135181234003e-06, + "loss": 0.6968, + "step": 219 + }, + { + "epoch": 0.09926677946982515, + "grad_norm": 3.300316326276715, + "learning_rate": 1.9750903018266743e-06, + "loss": 0.7015, + "step": 220 + }, + { + "epoch": 0.09971799210377891, + "grad_norm": 3.149000744047261, + "learning_rate": 1.9747650016549027e-06, + "loss": 0.6008, + "step": 221 + }, + { + "epoch": 0.10016920473773265, + "grad_norm": 3.4892072583658695, + "learning_rate": 1.9744376183032873e-06, + "loss": 0.6077, + "step": 222 + }, + { + "epoch": 0.10062041737168641, + "grad_norm": 3.6046339473790323, + "learning_rate": 1.9741081524714825e-06, + "loss": 0.6846, + "step": 223 + }, + { + "epoch": 0.10107163000564016, + "grad_norm": 3.4520154967728085, + "learning_rate": 1.9737766048635928e-06, + "loss": 0.6671, + "step": 224 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 3.2952487774101797, + "learning_rate": 1.973442976188172e-06, + "loss": 0.6594, + "step": 225 + }, + { + "epoch": 0.10197405527354766, + "grad_norm": 3.2514684073236584, + "learning_rate": 1.973107267158221e-06, + "loss": 0.6518, + "step": 226 + }, + { + "epoch": 0.10242526790750141, + "grad_norm": 3.266600653278959, + "learning_rate": 1.9727694784911876e-06, + "loss": 0.6731, + "step": 227 + }, + { + "epoch": 0.10287648054145516, + "grad_norm": 3.1924596316986533, + "learning_rate": 1.972429610908962e-06, + "loss": 0.6636, + "step": 228 + }, + { + "epoch": 0.10332769317540891, + "grad_norm": 3.2104025132138623, + "learning_rate": 1.9720876651378794e-06, + "loss": 0.6779, + "step": 229 + }, + { + "epoch": 0.10377890580936266, + "grad_norm": 3.2963718496191294, + "learning_rate": 1.9717436419087155e-06, + "loss": 0.634, + "step": 230 + }, + { + "epoch": 0.10423011844331641, + "grad_norm": 3.2073822291013254, + "learning_rate": 1.9713975419566858e-06, + "loss": 0.5812, + "step": 231 + }, + { + "epoch": 0.10468133107727016, + "grad_norm": 3.4189730159980884, + "learning_rate": 1.971049366021443e-06, + "loss": 0.7316, + "step": 232 + }, + { + "epoch": 0.10513254371122391, + "grad_norm": 3.23394924091578, + "learning_rate": 1.9706991148470783e-06, + "loss": 0.6624, + "step": 233 + }, + { + "epoch": 0.10558375634517767, + "grad_norm": 3.1317986475999375, + "learning_rate": 1.970346789182116e-06, + "loss": 0.5124, + "step": 234 + }, + { + "epoch": 0.10603496897913142, + "grad_norm": 3.385294259471175, + "learning_rate": 1.969992389779516e-06, + "loss": 0.5569, + "step": 235 + }, + { + "epoch": 0.10648618161308517, + "grad_norm": 4.118491129332631, + "learning_rate": 1.9696359173966676e-06, + "loss": 0.6456, + "step": 236 + }, + { + "epoch": 0.10693739424703892, + "grad_norm": 3.385938350325588, + "learning_rate": 1.9692773727953923e-06, + "loss": 0.6794, + "step": 237 + }, + { + "epoch": 0.10738860688099267, + "grad_norm": 3.401423714020342, + "learning_rate": 1.9689167567419383e-06, + "loss": 0.6555, + "step": 238 + }, + { + "epoch": 0.10783981951494642, + "grad_norm": 3.401519447295867, + "learning_rate": 1.9685540700069827e-06, + "loss": 0.578, + "step": 239 + }, + { + "epoch": 0.10829103214890017, + "grad_norm": 4.10708011535945, + "learning_rate": 1.9681893133656257e-06, + "loss": 0.7618, + "step": 240 + }, + { + "epoch": 0.10874224478285392, + "grad_norm": 3.5017274039945163, + "learning_rate": 1.9678224875973932e-06, + "loss": 0.592, + "step": 241 + }, + { + "epoch": 0.10919345741680767, + "grad_norm": 3.1688095237487017, + "learning_rate": 1.9674535934862324e-06, + "loss": 0.6643, + "step": 242 + }, + { + "epoch": 0.10964467005076142, + "grad_norm": 3.1685960960934336, + "learning_rate": 1.9670826318205098e-06, + "loss": 0.6199, + "step": 243 + }, + { + "epoch": 0.11009588268471517, + "grad_norm": 3.4775317850948104, + "learning_rate": 1.9667096033930114e-06, + "loss": 0.6675, + "step": 244 + }, + { + "epoch": 0.11054709531866892, + "grad_norm": 3.643632342788612, + "learning_rate": 1.96633450900094e-06, + "loss": 0.711, + "step": 245 + }, + { + "epoch": 0.11099830795262268, + "grad_norm": 2.9819737147125003, + "learning_rate": 1.965957349445914e-06, + "loss": 0.6962, + "step": 246 + }, + { + "epoch": 0.11144952058657642, + "grad_norm": 2.9768216888435477, + "learning_rate": 1.9655781255339632e-06, + "loss": 0.65, + "step": 247 + }, + { + "epoch": 0.11190073322053018, + "grad_norm": 3.174224366971097, + "learning_rate": 1.965196838075533e-06, + "loss": 0.5309, + "step": 248 + }, + { + "epoch": 0.11235194585448392, + "grad_norm": 3.0582374952193985, + "learning_rate": 1.9648134878854744e-06, + "loss": 0.5635, + "step": 249 + }, + { + "epoch": 0.11280315848843768, + "grad_norm": 3.8277212668333114, + "learning_rate": 1.9644280757830507e-06, + "loss": 0.7526, + "step": 250 + }, + { + "epoch": 0.11325437112239142, + "grad_norm": 3.8709397850662945, + "learning_rate": 1.9640406025919285e-06, + "loss": 0.6035, + "step": 251 + }, + { + "epoch": 0.11370558375634518, + "grad_norm": 3.3137132906235203, + "learning_rate": 1.963651069140181e-06, + "loss": 0.6656, + "step": 252 + }, + { + "epoch": 0.11415679639029892, + "grad_norm": 3.574058474566791, + "learning_rate": 1.963259476260284e-06, + "loss": 0.7095, + "step": 253 + }, + { + "epoch": 0.11460800902425268, + "grad_norm": 3.6146347899303115, + "learning_rate": 1.962865824789115e-06, + "loss": 0.6267, + "step": 254 + }, + { + "epoch": 0.11505922165820642, + "grad_norm": 4.082630583384243, + "learning_rate": 1.96247011556795e-06, + "loss": 0.7022, + "step": 255 + }, + { + "epoch": 0.11551043429216018, + "grad_norm": 3.291509482076301, + "learning_rate": 1.9620723494424623e-06, + "loss": 0.5721, + "step": 256 + }, + { + "epoch": 0.11596164692611394, + "grad_norm": 3.58368271064381, + "learning_rate": 1.961672527262723e-06, + "loss": 0.6236, + "step": 257 + }, + { + "epoch": 0.11641285956006768, + "grad_norm": 3.1224675347563275, + "learning_rate": 1.9612706498831957e-06, + "loss": 0.5134, + "step": 258 + }, + { + "epoch": 0.11686407219402144, + "grad_norm": 3.5167852362721654, + "learning_rate": 1.9608667181627357e-06, + "loss": 0.6986, + "step": 259 + }, + { + "epoch": 0.11731528482797518, + "grad_norm": 3.5809347550436277, + "learning_rate": 1.96046073296459e-06, + "loss": 0.6251, + "step": 260 + }, + { + "epoch": 0.11776649746192894, + "grad_norm": 2.8641754562099244, + "learning_rate": 1.9600526951563937e-06, + "loss": 0.6369, + "step": 261 + }, + { + "epoch": 0.11821771009588268, + "grad_norm": 3.613739867174843, + "learning_rate": 1.9596426056101684e-06, + "loss": 0.786, + "step": 262 + }, + { + "epoch": 0.11866892272983644, + "grad_norm": 3.580296089646273, + "learning_rate": 1.9592304652023203e-06, + "loss": 0.6038, + "step": 263 + }, + { + "epoch": 0.11912013536379018, + "grad_norm": 3.15842598534144, + "learning_rate": 1.958816274813639e-06, + "loss": 0.5811, + "step": 264 + }, + { + "epoch": 0.11957134799774394, + "grad_norm": 3.0081075803485247, + "learning_rate": 1.958400035329294e-06, + "loss": 0.528, + "step": 265 + }, + { + "epoch": 0.12002256063169768, + "grad_norm": 2.8554740174867606, + "learning_rate": 1.9579817476388357e-06, + "loss": 0.625, + "step": 266 + }, + { + "epoch": 0.12047377326565144, + "grad_norm": 3.3849924046760447, + "learning_rate": 1.9575614126361907e-06, + "loss": 0.5894, + "step": 267 + }, + { + "epoch": 0.1209249858996052, + "grad_norm": 2.8658227520314994, + "learning_rate": 1.9571390312196607e-06, + "loss": 0.6311, + "step": 268 + }, + { + "epoch": 0.12137619853355894, + "grad_norm": 3.22918688434795, + "learning_rate": 1.9567146042919213e-06, + "loss": 0.6711, + "step": 269 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 3.102667861242874, + "learning_rate": 1.9562881327600197e-06, + "loss": 0.5661, + "step": 270 + }, + { + "epoch": 0.12227862380146644, + "grad_norm": 2.9951718210029843, + "learning_rate": 1.955859617535372e-06, + "loss": 0.7687, + "step": 271 + }, + { + "epoch": 0.1227298364354202, + "grad_norm": 3.1684228477861884, + "learning_rate": 1.9554290595337625e-06, + "loss": 0.6228, + "step": 272 + }, + { + "epoch": 0.12318104906937394, + "grad_norm": 3.393968292035563, + "learning_rate": 1.954996459675341e-06, + "loss": 0.5848, + "step": 273 + }, + { + "epoch": 0.1236322617033277, + "grad_norm": 2.852079323030113, + "learning_rate": 1.9545618188846205e-06, + "loss": 0.5769, + "step": 274 + }, + { + "epoch": 0.12408347433728144, + "grad_norm": 2.9641384401198074, + "learning_rate": 1.9541251380904762e-06, + "loss": 0.5864, + "step": 275 + }, + { + "epoch": 0.1245346869712352, + "grad_norm": 3.3554476588870568, + "learning_rate": 1.9536864182261435e-06, + "loss": 0.657, + "step": 276 + }, + { + "epoch": 0.12498589960518895, + "grad_norm": 3.281618043698293, + "learning_rate": 1.9532456602292146e-06, + "loss": 0.6213, + "step": 277 + }, + { + "epoch": 0.1254371122391427, + "grad_norm": 3.3143546652497644, + "learning_rate": 1.9528028650416375e-06, + "loss": 0.7062, + "step": 278 + }, + { + "epoch": 0.12588832487309645, + "grad_norm": 3.3561372317938134, + "learning_rate": 1.9523580336097145e-06, + "loss": 0.4876, + "step": 279 + }, + { + "epoch": 0.1263395375070502, + "grad_norm": 3.0109523300134264, + "learning_rate": 1.9519111668840982e-06, + "loss": 0.5256, + "step": 280 + }, + { + "epoch": 0.12679075014100394, + "grad_norm": 3.578806428790207, + "learning_rate": 1.9514622658197933e-06, + "loss": 0.5089, + "step": 281 + }, + { + "epoch": 0.1272419627749577, + "grad_norm": 3.074575948566442, + "learning_rate": 1.95101133137615e-06, + "loss": 0.695, + "step": 282 + }, + { + "epoch": 0.12769317540891145, + "grad_norm": 3.1027160235638984, + "learning_rate": 1.950558364516865e-06, + "loss": 0.5853, + "step": 283 + }, + { + "epoch": 0.1281443880428652, + "grad_norm": 3.3099951191328563, + "learning_rate": 1.9501033662099777e-06, + "loss": 0.5303, + "step": 284 + }, + { + "epoch": 0.12859560067681894, + "grad_norm": 3.446711214585093, + "learning_rate": 1.9496463374278696e-06, + "loss": 0.5828, + "step": 285 + }, + { + "epoch": 0.1290468133107727, + "grad_norm": 3.263902821177112, + "learning_rate": 1.949187279147262e-06, + "loss": 0.6326, + "step": 286 + }, + { + "epoch": 0.12949802594472645, + "grad_norm": 3.571231287506934, + "learning_rate": 1.9487261923492116e-06, + "loss": 0.6723, + "step": 287 + }, + { + "epoch": 0.1299492385786802, + "grad_norm": 3.001117794130769, + "learning_rate": 1.9482630780191126e-06, + "loss": 0.5732, + "step": 288 + }, + { + "epoch": 0.13040045121263397, + "grad_norm": 3.474015517628523, + "learning_rate": 1.947797937146691e-06, + "loss": 0.6214, + "step": 289 + }, + { + "epoch": 0.1308516638465877, + "grad_norm": 3.2034938171083955, + "learning_rate": 1.947330770726004e-06, + "loss": 0.7137, + "step": 290 + }, + { + "epoch": 0.13130287648054145, + "grad_norm": 4.0987214667589935, + "learning_rate": 1.946861579755437e-06, + "loss": 0.6807, + "step": 291 + }, + { + "epoch": 0.1317540891144952, + "grad_norm": 2.9958320731805355, + "learning_rate": 1.9463903652377026e-06, + "loss": 0.648, + "step": 292 + }, + { + "epoch": 0.13220530174844897, + "grad_norm": 3.5326747287868274, + "learning_rate": 1.945917128179839e-06, + "loss": 0.5305, + "step": 293 + }, + { + "epoch": 0.1326565143824027, + "grad_norm": 3.087656746813269, + "learning_rate": 1.9454418695932045e-06, + "loss": 0.6877, + "step": 294 + }, + { + "epoch": 0.13310772701635645, + "grad_norm": 3.161562230483419, + "learning_rate": 1.94496459049348e-06, + "loss": 0.7082, + "step": 295 + }, + { + "epoch": 0.1335589396503102, + "grad_norm": 3.1405430418827236, + "learning_rate": 1.9444852919006623e-06, + "loss": 0.6361, + "step": 296 + }, + { + "epoch": 0.13401015228426397, + "grad_norm": 3.2609308388363165, + "learning_rate": 1.944003974839066e-06, + "loss": 0.4959, + "step": 297 + }, + { + "epoch": 0.13446136491821772, + "grad_norm": 3.136461389328533, + "learning_rate": 1.9435206403373178e-06, + "loss": 0.7346, + "step": 298 + }, + { + "epoch": 0.13491257755217145, + "grad_norm": 3.5274417741499198, + "learning_rate": 1.9430352894283567e-06, + "loss": 0.5936, + "step": 299 + }, + { + "epoch": 0.1353637901861252, + "grad_norm": 3.7645608637306647, + "learning_rate": 1.9425479231494318e-06, + "loss": 0.6398, + "step": 300 + }, + { + "epoch": 0.13581500282007897, + "grad_norm": 3.1396546903591718, + "learning_rate": 1.942058542542097e-06, + "loss": 0.5478, + "step": 301 + }, + { + "epoch": 0.13626621545403272, + "grad_norm": 3.594716193105851, + "learning_rate": 1.9415671486522137e-06, + "loss": 0.7303, + "step": 302 + }, + { + "epoch": 0.13671742808798645, + "grad_norm": 3.507738272824508, + "learning_rate": 1.9410737425299434e-06, + "loss": 0.6789, + "step": 303 + }, + { + "epoch": 0.1371686407219402, + "grad_norm": 3.118749146822311, + "learning_rate": 1.94057832522975e-06, + "loss": 0.6171, + "step": 304 + }, + { + "epoch": 0.13761985335589397, + "grad_norm": 3.210316055229788, + "learning_rate": 1.9400808978103944e-06, + "loss": 0.5672, + "step": 305 + }, + { + "epoch": 0.13807106598984772, + "grad_norm": 3.0272573921584374, + "learning_rate": 1.9395814613349338e-06, + "loss": 0.6, + "step": 306 + }, + { + "epoch": 0.13852227862380145, + "grad_norm": 2.8900194965740487, + "learning_rate": 1.9390800168707182e-06, + "loss": 0.6387, + "step": 307 + }, + { + "epoch": 0.1389734912577552, + "grad_norm": 3.0767885306822738, + "learning_rate": 1.93857656548939e-06, + "loss": 0.6114, + "step": 308 + }, + { + "epoch": 0.13942470389170897, + "grad_norm": 3.1091627287910777, + "learning_rate": 1.93807110826688e-06, + "loss": 0.5469, + "step": 309 + }, + { + "epoch": 0.13987591652566272, + "grad_norm": 3.5058309066962976, + "learning_rate": 1.937563646283406e-06, + "loss": 0.5231, + "step": 310 + }, + { + "epoch": 0.14032712915961648, + "grad_norm": 3.381960145855753, + "learning_rate": 1.93705418062347e-06, + "loss": 0.6071, + "step": 311 + }, + { + "epoch": 0.1407783417935702, + "grad_norm": 3.3900203190115707, + "learning_rate": 1.9365427123758547e-06, + "loss": 0.6137, + "step": 312 + }, + { + "epoch": 0.14122955442752397, + "grad_norm": 3.1063379022563007, + "learning_rate": 1.936029242633626e-06, + "loss": 0.5899, + "step": 313 + }, + { + "epoch": 0.14168076706147772, + "grad_norm": 3.6211320331970343, + "learning_rate": 1.9355137724941234e-06, + "loss": 0.6263, + "step": 314 + }, + { + "epoch": 0.14213197969543148, + "grad_norm": 3.1680939515915902, + "learning_rate": 1.9349963030589644e-06, + "loss": 0.6648, + "step": 315 + }, + { + "epoch": 0.1425831923293852, + "grad_norm": 3.5195879110562562, + "learning_rate": 1.9344768354340377e-06, + "loss": 0.6575, + "step": 316 + }, + { + "epoch": 0.14303440496333897, + "grad_norm": 3.1488331602409176, + "learning_rate": 1.9339553707295018e-06, + "loss": 0.5313, + "step": 317 + }, + { + "epoch": 0.14348561759729273, + "grad_norm": 3.2151330395191557, + "learning_rate": 1.933431910059785e-06, + "loss": 0.528, + "step": 318 + }, + { + "epoch": 0.14393683023124648, + "grad_norm": 3.132260724021693, + "learning_rate": 1.93290645454358e-06, + "loss": 0.5815, + "step": 319 + }, + { + "epoch": 0.14438804286520024, + "grad_norm": 3.622222391906822, + "learning_rate": 1.9323790053038433e-06, + "loss": 0.7457, + "step": 320 + }, + { + "epoch": 0.14483925549915397, + "grad_norm": 4.223750697040322, + "learning_rate": 1.9318495634677907e-06, + "loss": 0.5711, + "step": 321 + }, + { + "epoch": 0.14529046813310773, + "grad_norm": 3.3650649933245376, + "learning_rate": 1.9313181301668985e-06, + "loss": 0.65, + "step": 322 + }, + { + "epoch": 0.14574168076706148, + "grad_norm": 3.7805123756041934, + "learning_rate": 1.9307847065368978e-06, + "loss": 0.4404, + "step": 323 + }, + { + "epoch": 0.14619289340101524, + "grad_norm": 3.6691078891308466, + "learning_rate": 1.9302492937177733e-06, + "loss": 0.5287, + "step": 324 + }, + { + "epoch": 0.14664410603496897, + "grad_norm": 2.9203910805909925, + "learning_rate": 1.9297118928537616e-06, + "loss": 0.5946, + "step": 325 + }, + { + "epoch": 0.14709531866892273, + "grad_norm": 3.4322784585242094, + "learning_rate": 1.9291725050933466e-06, + "loss": 0.5945, + "step": 326 + }, + { + "epoch": 0.14754653130287648, + "grad_norm": 3.4949175306707487, + "learning_rate": 1.9286311315892592e-06, + "loss": 0.6565, + "step": 327 + }, + { + "epoch": 0.14799774393683024, + "grad_norm": 3.3116625435966855, + "learning_rate": 1.9280877734984745e-06, + "loss": 0.5587, + "step": 328 + }, + { + "epoch": 0.14844895657078397, + "grad_norm": 3.9017874847497596, + "learning_rate": 1.9275424319822084e-06, + "loss": 0.6092, + "step": 329 + }, + { + "epoch": 0.14890016920473773, + "grad_norm": 3.483226347899875, + "learning_rate": 1.926995108205915e-06, + "loss": 0.661, + "step": 330 + }, + { + "epoch": 0.14935138183869148, + "grad_norm": 3.358115279938035, + "learning_rate": 1.926445803339286e-06, + "loss": 0.588, + "step": 331 + }, + { + "epoch": 0.14980259447264524, + "grad_norm": 3.6543876138753446, + "learning_rate": 1.925894518556246e-06, + "loss": 0.6363, + "step": 332 + }, + { + "epoch": 0.150253807106599, + "grad_norm": 3.5803877393350514, + "learning_rate": 1.9253412550349505e-06, + "loss": 0.6382, + "step": 333 + }, + { + "epoch": 0.15070501974055273, + "grad_norm": 3.514105036732028, + "learning_rate": 1.9247860139577852e-06, + "loss": 0.531, + "step": 334 + }, + { + "epoch": 0.15115623237450648, + "grad_norm": 3.0800678193550617, + "learning_rate": 1.924228796511361e-06, + "loss": 0.5065, + "step": 335 + }, + { + "epoch": 0.15160744500846024, + "grad_norm": 3.431976731385007, + "learning_rate": 1.923669603886513e-06, + "loss": 0.6681, + "step": 336 + }, + { + "epoch": 0.152058657642414, + "grad_norm": 3.1591532382990493, + "learning_rate": 1.9231084372782968e-06, + "loss": 0.5994, + "step": 337 + }, + { + "epoch": 0.15250987027636773, + "grad_norm": 2.8404865058044613, + "learning_rate": 1.9225452978859873e-06, + "loss": 0.5612, + "step": 338 + }, + { + "epoch": 0.15296108291032148, + "grad_norm": 3.326070609318879, + "learning_rate": 1.921980186913075e-06, + "loss": 0.5316, + "step": 339 + }, + { + "epoch": 0.15341229554427524, + "grad_norm": 3.5187885378329886, + "learning_rate": 1.9214131055672642e-06, + "loss": 0.7119, + "step": 340 + }, + { + "epoch": 0.153863508178229, + "grad_norm": 3.544854067555435, + "learning_rate": 1.9208440550604702e-06, + "loss": 0.7238, + "step": 341 + }, + { + "epoch": 0.15431472081218275, + "grad_norm": 3.2845637387346267, + "learning_rate": 1.9202730366088164e-06, + "loss": 0.6078, + "step": 342 + }, + { + "epoch": 0.15476593344613648, + "grad_norm": 3.3136242560590294, + "learning_rate": 1.9197000514326317e-06, + "loss": 0.5787, + "step": 343 + }, + { + "epoch": 0.15521714608009024, + "grad_norm": 3.284411489524786, + "learning_rate": 1.9191251007564487e-06, + "loss": 0.5574, + "step": 344 + }, + { + "epoch": 0.155668358714044, + "grad_norm": 3.6355673133340747, + "learning_rate": 1.9185481858089996e-06, + "loss": 0.7101, + "step": 345 + }, + { + "epoch": 0.15611957134799775, + "grad_norm": 4.128016030092157, + "learning_rate": 1.9179693078232155e-06, + "loss": 0.5079, + "step": 346 + }, + { + "epoch": 0.15657078398195148, + "grad_norm": 3.0530977440321405, + "learning_rate": 1.917388468036222e-06, + "loss": 0.5866, + "step": 347 + }, + { + "epoch": 0.15702199661590524, + "grad_norm": 3.165514172841862, + "learning_rate": 1.9168056676893374e-06, + "loss": 0.6284, + "step": 348 + }, + { + "epoch": 0.157473209249859, + "grad_norm": 2.995277805594194, + "learning_rate": 1.91622090802807e-06, + "loss": 0.6062, + "step": 349 + }, + { + "epoch": 0.15792442188381275, + "grad_norm": 3.005909595695992, + "learning_rate": 1.9156341903021155e-06, + "loss": 0.6028, + "step": 350 + }, + { + "epoch": 0.15837563451776648, + "grad_norm": 3.721941326683315, + "learning_rate": 1.9150455157653543e-06, + "loss": 0.5475, + "step": 351 + }, + { + "epoch": 0.15882684715172024, + "grad_norm": 3.5649796357457313, + "learning_rate": 1.9144548856758486e-06, + "loss": 0.6411, + "step": 352 + }, + { + "epoch": 0.159278059785674, + "grad_norm": 3.2716596578005066, + "learning_rate": 1.9138623012958393e-06, + "loss": 0.5344, + "step": 353 + }, + { + "epoch": 0.15972927241962775, + "grad_norm": 3.218727183746927, + "learning_rate": 1.913267763891745e-06, + "loss": 0.6509, + "step": 354 + }, + { + "epoch": 0.1601804850535815, + "grad_norm": 3.7971566293616035, + "learning_rate": 1.912671274734156e-06, + "loss": 0.5323, + "step": 355 + }, + { + "epoch": 0.16063169768753524, + "grad_norm": 3.201203458372683, + "learning_rate": 1.9120728350978367e-06, + "loss": 0.6743, + "step": 356 + }, + { + "epoch": 0.161082910321489, + "grad_norm": 3.3662683266894255, + "learning_rate": 1.9114724462617175e-06, + "loss": 0.6395, + "step": 357 + }, + { + "epoch": 0.16153412295544275, + "grad_norm": 3.4368545644579274, + "learning_rate": 1.910870109508896e-06, + "loss": 0.5264, + "step": 358 + }, + { + "epoch": 0.1619853355893965, + "grad_norm": 3.0271768583204435, + "learning_rate": 1.9102658261266306e-06, + "loss": 0.6154, + "step": 359 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 3.4935218886533757, + "learning_rate": 1.9096595974063424e-06, + "loss": 0.5963, + "step": 360 + }, + { + "epoch": 0.162887760857304, + "grad_norm": 3.64027486586054, + "learning_rate": 1.9090514246436083e-06, + "loss": 0.6211, + "step": 361 + }, + { + "epoch": 0.16333897349125776, + "grad_norm": 3.396850249691266, + "learning_rate": 1.908441309138161e-06, + "loss": 0.5142, + "step": 362 + }, + { + "epoch": 0.1637901861252115, + "grad_norm": 3.3870576147192053, + "learning_rate": 1.907829252193883e-06, + "loss": 0.6988, + "step": 363 + }, + { + "epoch": 0.16424139875916527, + "grad_norm": 3.788954000277857, + "learning_rate": 1.9072152551188081e-06, + "loss": 0.5813, + "step": 364 + }, + { + "epoch": 0.164692611393119, + "grad_norm": 3.1429732682111933, + "learning_rate": 1.9065993192251156e-06, + "loss": 0.6034, + "step": 365 + }, + { + "epoch": 0.16514382402707276, + "grad_norm": 3.678596714953962, + "learning_rate": 1.9059814458291275e-06, + "loss": 0.6919, + "step": 366 + }, + { + "epoch": 0.1655950366610265, + "grad_norm": 3.062164866400341, + "learning_rate": 1.9053616362513076e-06, + "loss": 0.4805, + "step": 367 + }, + { + "epoch": 0.16604624929498027, + "grad_norm": 3.3886077949868985, + "learning_rate": 1.904739891816257e-06, + "loss": 0.6668, + "step": 368 + }, + { + "epoch": 0.166497461928934, + "grad_norm": 3.521006425014518, + "learning_rate": 1.9041162138527112e-06, + "loss": 0.7878, + "step": 369 + }, + { + "epoch": 0.16694867456288776, + "grad_norm": 3.666265951900786, + "learning_rate": 1.9034906036935391e-06, + "loss": 0.6963, + "step": 370 + }, + { + "epoch": 0.1673998871968415, + "grad_norm": 3.645045850919915, + "learning_rate": 1.9028630626757386e-06, + "loss": 0.5457, + "step": 371 + }, + { + "epoch": 0.16785109983079527, + "grad_norm": 3.4414096877643097, + "learning_rate": 1.902233592140433e-06, + "loss": 0.6719, + "step": 372 + }, + { + "epoch": 0.16830231246474903, + "grad_norm": 3.265461782165135, + "learning_rate": 1.9016021934328706e-06, + "loss": 0.7438, + "step": 373 + }, + { + "epoch": 0.16875352509870276, + "grad_norm": 3.5380564324518415, + "learning_rate": 1.9009688679024189e-06, + "loss": 0.5506, + "step": 374 + }, + { + "epoch": 0.1692047377326565, + "grad_norm": 3.8268327357230345, + "learning_rate": 1.9003336169025653e-06, + "loss": 0.6524, + "step": 375 + }, + { + "epoch": 0.16965595036661027, + "grad_norm": 3.017258959670839, + "learning_rate": 1.89969644179091e-06, + "loss": 0.5096, + "step": 376 + }, + { + "epoch": 0.17010716300056403, + "grad_norm": 3.624267899480704, + "learning_rate": 1.8990573439291665e-06, + "loss": 0.6393, + "step": 377 + }, + { + "epoch": 0.17055837563451776, + "grad_norm": 3.3240315516106405, + "learning_rate": 1.8984163246831569e-06, + "loss": 0.5305, + "step": 378 + }, + { + "epoch": 0.1710095882684715, + "grad_norm": 3.096119100843986, + "learning_rate": 1.89777338542281e-06, + "loss": 0.5599, + "step": 379 + }, + { + "epoch": 0.17146080090242527, + "grad_norm": 3.4243899877650787, + "learning_rate": 1.8971285275221577e-06, + "loss": 0.5102, + "step": 380 + }, + { + "epoch": 0.17191201353637903, + "grad_norm": 3.500631529144404, + "learning_rate": 1.8964817523593318e-06, + "loss": 0.5937, + "step": 381 + }, + { + "epoch": 0.17236322617033276, + "grad_norm": 3.214480819264598, + "learning_rate": 1.8958330613165621e-06, + "loss": 0.7037, + "step": 382 + }, + { + "epoch": 0.1728144388042865, + "grad_norm": 3.2032488316353307, + "learning_rate": 1.8951824557801723e-06, + "loss": 0.5986, + "step": 383 + }, + { + "epoch": 0.17326565143824027, + "grad_norm": 2.773335792708175, + "learning_rate": 1.8945299371405783e-06, + "loss": 0.4929, + "step": 384 + }, + { + "epoch": 0.17371686407219403, + "grad_norm": 3.5616413240552283, + "learning_rate": 1.8938755067922836e-06, + "loss": 0.6427, + "step": 385 + }, + { + "epoch": 0.17416807670614778, + "grad_norm": 3.3452575187465423, + "learning_rate": 1.893219166133878e-06, + "loss": 0.6748, + "step": 386 + }, + { + "epoch": 0.1746192893401015, + "grad_norm": 3.329395667620974, + "learning_rate": 1.8925609165680336e-06, + "loss": 0.5985, + "step": 387 + }, + { + "epoch": 0.17507050197405527, + "grad_norm": 3.1397011497408323, + "learning_rate": 1.8919007595015017e-06, + "loss": 0.5812, + "step": 388 + }, + { + "epoch": 0.17552171460800903, + "grad_norm": 3.1476385473200343, + "learning_rate": 1.891238696345111e-06, + "loss": 0.5604, + "step": 389 + }, + { + "epoch": 0.17597292724196278, + "grad_norm": 3.1908486340362754, + "learning_rate": 1.8905747285137625e-06, + "loss": 0.6034, + "step": 390 + }, + { + "epoch": 0.1764241398759165, + "grad_norm": 3.2455622054904376, + "learning_rate": 1.889908857426429e-06, + "loss": 0.6501, + "step": 391 + }, + { + "epoch": 0.17687535250987027, + "grad_norm": 3.1460953645857925, + "learning_rate": 1.8892410845061496e-06, + "loss": 0.6749, + "step": 392 + }, + { + "epoch": 0.17732656514382403, + "grad_norm": 3.5804988998358818, + "learning_rate": 1.8885714111800286e-06, + "loss": 0.6729, + "step": 393 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 2.8850245833802806, + "learning_rate": 1.8878998388792312e-06, + "loss": 0.5124, + "step": 394 + }, + { + "epoch": 0.17822899041173154, + "grad_norm": 3.3696439645865772, + "learning_rate": 1.8872263690389817e-06, + "loss": 0.6009, + "step": 395 + }, + { + "epoch": 0.17868020304568527, + "grad_norm": 4.50248696358533, + "learning_rate": 1.8865510030985585e-06, + "loss": 0.8121, + "step": 396 + }, + { + "epoch": 0.17913141567963903, + "grad_norm": 3.2758219521283856, + "learning_rate": 1.8858737425012932e-06, + "loss": 0.6326, + "step": 397 + }, + { + "epoch": 0.17958262831359278, + "grad_norm": 3.0876038288374974, + "learning_rate": 1.8851945886945658e-06, + "loss": 0.5091, + "step": 398 + }, + { + "epoch": 0.18003384094754654, + "grad_norm": 3.161733247624105, + "learning_rate": 1.8845135431298025e-06, + "loss": 0.5075, + "step": 399 + }, + { + "epoch": 0.18048505358150027, + "grad_norm": 3.3744189725502096, + "learning_rate": 1.8838306072624729e-06, + "loss": 0.6991, + "step": 400 + }, + { + "epoch": 0.18093626621545403, + "grad_norm": 3.297593259153372, + "learning_rate": 1.8831457825520851e-06, + "loss": 0.6062, + "step": 401 + }, + { + "epoch": 0.18138747884940778, + "grad_norm": 3.366058459902841, + "learning_rate": 1.8824590704621856e-06, + "loss": 0.6666, + "step": 402 + }, + { + "epoch": 0.18183869148336154, + "grad_norm": 3.7126042071152976, + "learning_rate": 1.8817704724603533e-06, + "loss": 0.6618, + "step": 403 + }, + { + "epoch": 0.18228990411731527, + "grad_norm": 2.9848703959766825, + "learning_rate": 1.8810799900181976e-06, + "loss": 0.609, + "step": 404 + }, + { + "epoch": 0.18274111675126903, + "grad_norm": 3.649965565236517, + "learning_rate": 1.8803876246113552e-06, + "loss": 0.5993, + "step": 405 + }, + { + "epoch": 0.18319232938522279, + "grad_norm": 2.89156057373746, + "learning_rate": 1.8796933777194871e-06, + "loss": 0.6068, + "step": 406 + }, + { + "epoch": 0.18364354201917654, + "grad_norm": 3.0603328302400197, + "learning_rate": 1.8789972508262752e-06, + "loss": 0.5925, + "step": 407 + }, + { + "epoch": 0.1840947546531303, + "grad_norm": 3.2452938065021484, + "learning_rate": 1.8782992454194192e-06, + "loss": 0.4789, + "step": 408 + }, + { + "epoch": 0.18454596728708403, + "grad_norm": 2.8228292796413963, + "learning_rate": 1.877599362990633e-06, + "loss": 0.5509, + "step": 409 + }, + { + "epoch": 0.18499717992103779, + "grad_norm": 3.556457875267904, + "learning_rate": 1.8768976050356424e-06, + "loss": 0.5953, + "step": 410 + }, + { + "epoch": 0.18544839255499154, + "grad_norm": 3.792739279541135, + "learning_rate": 1.876193973054181e-06, + "loss": 0.7014, + "step": 411 + }, + { + "epoch": 0.1858996051889453, + "grad_norm": 3.2493298778303354, + "learning_rate": 1.8754884685499884e-06, + "loss": 0.4082, + "step": 412 + }, + { + "epoch": 0.18635081782289903, + "grad_norm": 3.6609624714373457, + "learning_rate": 1.874781093030804e-06, + "loss": 0.6845, + "step": 413 + }, + { + "epoch": 0.18680203045685279, + "grad_norm": 3.2137855218137594, + "learning_rate": 1.8740718480083678e-06, + "loss": 0.6125, + "step": 414 + }, + { + "epoch": 0.18725324309080654, + "grad_norm": 3.499410379932756, + "learning_rate": 1.8733607349984138e-06, + "loss": 0.6784, + "step": 415 + }, + { + "epoch": 0.1877044557247603, + "grad_norm": 3.2539894662199673, + "learning_rate": 1.8726477555206688e-06, + "loss": 0.6767, + "step": 416 + }, + { + "epoch": 0.18815566835871406, + "grad_norm": 3.7584814555532913, + "learning_rate": 1.8719329110988484e-06, + "loss": 0.5895, + "step": 417 + }, + { + "epoch": 0.18860688099266779, + "grad_norm": 2.7529038721428045, + "learning_rate": 1.8712162032606536e-06, + "loss": 0.5273, + "step": 418 + }, + { + "epoch": 0.18905809362662154, + "grad_norm": 3.033230260922539, + "learning_rate": 1.8704976335377676e-06, + "loss": 0.4975, + "step": 419 + }, + { + "epoch": 0.1895093062605753, + "grad_norm": 3.383380728150317, + "learning_rate": 1.8697772034658525e-06, + "loss": 0.7518, + "step": 420 + }, + { + "epoch": 0.18996051889452906, + "grad_norm": 3.4073642749167408, + "learning_rate": 1.8690549145845473e-06, + "loss": 0.6152, + "step": 421 + }, + { + "epoch": 0.19041173152848279, + "grad_norm": 2.9297215005432586, + "learning_rate": 1.8683307684374618e-06, + "loss": 0.5265, + "step": 422 + }, + { + "epoch": 0.19086294416243654, + "grad_norm": 3.167963569614919, + "learning_rate": 1.8676047665721763e-06, + "loss": 0.5796, + "step": 423 + }, + { + "epoch": 0.1913141567963903, + "grad_norm": 3.1693108021010654, + "learning_rate": 1.8668769105402365e-06, + "loss": 0.5744, + "step": 424 + }, + { + "epoch": 0.19176536943034406, + "grad_norm": 3.0256228579439046, + "learning_rate": 1.8661472018971502e-06, + "loss": 0.6616, + "step": 425 + }, + { + "epoch": 0.19221658206429781, + "grad_norm": 3.2078343663005486, + "learning_rate": 1.8654156422023858e-06, + "loss": 0.5892, + "step": 426 + }, + { + "epoch": 0.19266779469825154, + "grad_norm": 3.227710950393321, + "learning_rate": 1.8646822330193657e-06, + "loss": 0.6133, + "step": 427 + }, + { + "epoch": 0.1931190073322053, + "grad_norm": 3.941318300397022, + "learning_rate": 1.8639469759154665e-06, + "loss": 0.6455, + "step": 428 + }, + { + "epoch": 0.19357021996615906, + "grad_norm": 3.698927481165211, + "learning_rate": 1.863209872462013e-06, + "loss": 0.7186, + "step": 429 + }, + { + "epoch": 0.19402143260011281, + "grad_norm": 3.3302251096427833, + "learning_rate": 1.8624709242342763e-06, + "loss": 0.6038, + "step": 430 + }, + { + "epoch": 0.19447264523406654, + "grad_norm": 3.290865194019538, + "learning_rate": 1.8617301328114702e-06, + "loss": 0.5836, + "step": 431 + }, + { + "epoch": 0.1949238578680203, + "grad_norm": 3.310231926134767, + "learning_rate": 1.8609874997767471e-06, + "loss": 0.5143, + "step": 432 + }, + { + "epoch": 0.19537507050197406, + "grad_norm": 3.458415221251539, + "learning_rate": 1.8602430267171953e-06, + "loss": 0.5223, + "step": 433 + }, + { + "epoch": 0.19582628313592781, + "grad_norm": 3.5702337410450005, + "learning_rate": 1.8594967152238356e-06, + "loss": 0.6378, + "step": 434 + }, + { + "epoch": 0.19627749576988154, + "grad_norm": 3.3767941834158477, + "learning_rate": 1.8587485668916175e-06, + "loss": 0.7065, + "step": 435 + }, + { + "epoch": 0.1967287084038353, + "grad_norm": 3.376132837313131, + "learning_rate": 1.857998583319416e-06, + "loss": 0.6704, + "step": 436 + }, + { + "epoch": 0.19717992103778906, + "grad_norm": 3.7269212236357614, + "learning_rate": 1.8572467661100285e-06, + "loss": 0.6655, + "step": 437 + }, + { + "epoch": 0.19763113367174281, + "grad_norm": 2.885465129936691, + "learning_rate": 1.856493116870171e-06, + "loss": 0.5092, + "step": 438 + }, + { + "epoch": 0.19808234630569657, + "grad_norm": 3.2219073896145374, + "learning_rate": 1.855737637210475e-06, + "loss": 0.6651, + "step": 439 + }, + { + "epoch": 0.1985335589396503, + "grad_norm": 3.020821738843593, + "learning_rate": 1.8549803287454828e-06, + "loss": 0.5339, + "step": 440 + }, + { + "epoch": 0.19898477157360406, + "grad_norm": 3.3074095664255814, + "learning_rate": 1.8542211930936461e-06, + "loss": 0.6784, + "step": 441 + }, + { + "epoch": 0.19943598420755781, + "grad_norm": 3.0185027225876344, + "learning_rate": 1.8534602318773211e-06, + "loss": 0.4799, + "step": 442 + }, + { + "epoch": 0.19988719684151157, + "grad_norm": 3.3658699683573587, + "learning_rate": 1.8526974467227657e-06, + "loss": 0.5748, + "step": 443 + }, + { + "epoch": 0.2003384094754653, + "grad_norm": 3.3754909363924925, + "learning_rate": 1.8519328392601348e-06, + "loss": 0.6109, + "step": 444 + }, + { + "epoch": 0.20078962210941906, + "grad_norm": 3.3689745519480665, + "learning_rate": 1.8511664111234796e-06, + "loss": 0.6079, + "step": 445 + }, + { + "epoch": 0.20124083474337282, + "grad_norm": 3.843717566439344, + "learning_rate": 1.8503981639507402e-06, + "loss": 0.6312, + "step": 446 + }, + { + "epoch": 0.20169204737732657, + "grad_norm": 3.2274835544812848, + "learning_rate": 1.8496280993837457e-06, + "loss": 0.5936, + "step": 447 + }, + { + "epoch": 0.20214326001128033, + "grad_norm": 3.311187513569298, + "learning_rate": 1.8488562190682087e-06, + "loss": 0.5793, + "step": 448 + }, + { + "epoch": 0.20259447264523406, + "grad_norm": 2.9215114080546214, + "learning_rate": 1.8480825246537217e-06, + "loss": 0.5959, + "step": 449 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 3.404182604070715, + "learning_rate": 1.847307017793755e-06, + "loss": 0.5429, + "step": 450 + }, + { + "epoch": 0.20349689791314157, + "grad_norm": 3.2363535948970856, + "learning_rate": 1.846529700145652e-06, + "loss": 0.6569, + "step": 451 + }, + { + "epoch": 0.20394811054709533, + "grad_norm": 3.511507526063595, + "learning_rate": 1.8457505733706258e-06, + "loss": 0.4839, + "step": 452 + }, + { + "epoch": 0.20439932318104906, + "grad_norm": 3.37043417245356, + "learning_rate": 1.8449696391337554e-06, + "loss": 0.669, + "step": 453 + }, + { + "epoch": 0.20485053581500282, + "grad_norm": 3.9931457464636155, + "learning_rate": 1.8441868991039837e-06, + "loss": 0.5805, + "step": 454 + }, + { + "epoch": 0.20530174844895657, + "grad_norm": 2.9942829712689605, + "learning_rate": 1.8434023549541115e-06, + "loss": 0.5925, + "step": 455 + }, + { + "epoch": 0.20575296108291033, + "grad_norm": 3.1956405252290594, + "learning_rate": 1.8426160083607964e-06, + "loss": 0.5746, + "step": 456 + }, + { + "epoch": 0.20620417371686406, + "grad_norm": 2.916883450705641, + "learning_rate": 1.841827861004547e-06, + "loss": 0.5662, + "step": 457 + }, + { + "epoch": 0.20665538635081782, + "grad_norm": 3.431394584303988, + "learning_rate": 1.8410379145697208e-06, + "loss": 0.6516, + "step": 458 + }, + { + "epoch": 0.20710659898477157, + "grad_norm": 3.165177547700952, + "learning_rate": 1.8402461707445203e-06, + "loss": 0.5172, + "step": 459 + }, + { + "epoch": 0.20755781161872533, + "grad_norm": 3.7977061154161302, + "learning_rate": 1.8394526312209885e-06, + "loss": 0.6817, + "step": 460 + }, + { + "epoch": 0.2080090242526791, + "grad_norm": 2.924039062504755, + "learning_rate": 1.838657297695007e-06, + "loss": 0.7493, + "step": 461 + }, + { + "epoch": 0.20846023688663282, + "grad_norm": 3.120504427001902, + "learning_rate": 1.8378601718662905e-06, + "loss": 0.5429, + "step": 462 + }, + { + "epoch": 0.20891144952058657, + "grad_norm": 3.45109119066591, + "learning_rate": 1.8370612554383848e-06, + "loss": 0.6886, + "step": 463 + }, + { + "epoch": 0.20936266215454033, + "grad_norm": 3.9533453151193005, + "learning_rate": 1.8362605501186618e-06, + "loss": 0.6993, + "step": 464 + }, + { + "epoch": 0.2098138747884941, + "grad_norm": 3.5868823288557463, + "learning_rate": 1.8354580576183167e-06, + "loss": 0.7202, + "step": 465 + }, + { + "epoch": 0.21026508742244782, + "grad_norm": 3.360808407157414, + "learning_rate": 1.8346537796523642e-06, + "loss": 0.7506, + "step": 466 + }, + { + "epoch": 0.21071630005640157, + "grad_norm": 3.5999786014669453, + "learning_rate": 1.8338477179396347e-06, + "loss": 0.6462, + "step": 467 + }, + { + "epoch": 0.21116751269035533, + "grad_norm": 3.089511771354358, + "learning_rate": 1.8330398742027702e-06, + "loss": 0.5945, + "step": 468 + }, + { + "epoch": 0.2116187253243091, + "grad_norm": 3.1839344612610376, + "learning_rate": 1.8322302501682216e-06, + "loss": 0.5521, + "step": 469 + }, + { + "epoch": 0.21206993795826284, + "grad_norm": 3.2784855658730883, + "learning_rate": 1.831418847566245e-06, + "loss": 0.5371, + "step": 470 + }, + { + "epoch": 0.21252115059221657, + "grad_norm": 3.346783346886957, + "learning_rate": 1.8306056681308957e-06, + "loss": 0.6287, + "step": 471 + }, + { + "epoch": 0.21297236322617033, + "grad_norm": 3.2663627359220646, + "learning_rate": 1.8297907136000283e-06, + "loss": 0.6486, + "step": 472 + }, + { + "epoch": 0.2134235758601241, + "grad_norm": 3.035708873461981, + "learning_rate": 1.8289739857152903e-06, + "loss": 0.5249, + "step": 473 + }, + { + "epoch": 0.21387478849407784, + "grad_norm": 3.14810881896666, + "learning_rate": 1.8281554862221179e-06, + "loss": 0.6863, + "step": 474 + }, + { + "epoch": 0.21432600112803157, + "grad_norm": 3.606635360142977, + "learning_rate": 1.827335216869735e-06, + "loss": 0.5768, + "step": 475 + }, + { + "epoch": 0.21477721376198533, + "grad_norm": 3.420453255878494, + "learning_rate": 1.8265131794111477e-06, + "loss": 0.5868, + "step": 476 + }, + { + "epoch": 0.2152284263959391, + "grad_norm": 3.5479674298971213, + "learning_rate": 1.8256893756031396e-06, + "loss": 0.5529, + "step": 477 + }, + { + "epoch": 0.21567963902989284, + "grad_norm": 3.5672449049409143, + "learning_rate": 1.82486380720627e-06, + "loss": 0.5414, + "step": 478 + }, + { + "epoch": 0.2161308516638466, + "grad_norm": 3.4755060623971157, + "learning_rate": 1.8240364759848697e-06, + "loss": 0.6827, + "step": 479 + }, + { + "epoch": 0.21658206429780033, + "grad_norm": 3.5075155658583532, + "learning_rate": 1.823207383707036e-06, + "loss": 0.6522, + "step": 480 + }, + { + "epoch": 0.2170332769317541, + "grad_norm": 3.615334770032639, + "learning_rate": 1.82237653214463e-06, + "loss": 0.6112, + "step": 481 + }, + { + "epoch": 0.21748448956570784, + "grad_norm": 2.930096209443241, + "learning_rate": 1.8215439230732728e-06, + "loss": 0.4904, + "step": 482 + }, + { + "epoch": 0.2179357021996616, + "grad_norm": 3.045321232724037, + "learning_rate": 1.8207095582723416e-06, + "loss": 0.4742, + "step": 483 + }, + { + "epoch": 0.21838691483361533, + "grad_norm": 3.318218916061034, + "learning_rate": 1.8198734395249654e-06, + "loss": 0.5816, + "step": 484 + }, + { + "epoch": 0.2188381274675691, + "grad_norm": 3.238638276761325, + "learning_rate": 1.8190355686180218e-06, + "loss": 0.7086, + "step": 485 + }, + { + "epoch": 0.21928934010152284, + "grad_norm": 3.088879882296647, + "learning_rate": 1.8181959473421334e-06, + "loss": 0.5967, + "step": 486 + }, + { + "epoch": 0.2197405527354766, + "grad_norm": 3.3716940597716585, + "learning_rate": 1.8173545774916626e-06, + "loss": 0.7092, + "step": 487 + }, + { + "epoch": 0.22019176536943033, + "grad_norm": 3.7448981916189825, + "learning_rate": 1.816511460864709e-06, + "loss": 0.6427, + "step": 488 + }, + { + "epoch": 0.2206429780033841, + "grad_norm": 2.9887789178064548, + "learning_rate": 1.8156665992631057e-06, + "loss": 0.5822, + "step": 489 + }, + { + "epoch": 0.22109419063733785, + "grad_norm": 3.3414881969228714, + "learning_rate": 1.8148199944924146e-06, + "loss": 0.5946, + "step": 490 + }, + { + "epoch": 0.2215454032712916, + "grad_norm": 3.4422877305450763, + "learning_rate": 1.8139716483619232e-06, + "loss": 0.6066, + "step": 491 + }, + { + "epoch": 0.22199661590524536, + "grad_norm": 3.4139774261820093, + "learning_rate": 1.8131215626846403e-06, + "loss": 0.6513, + "step": 492 + }, + { + "epoch": 0.2224478285391991, + "grad_norm": 2.9492703190535465, + "learning_rate": 1.8122697392772923e-06, + "loss": 0.512, + "step": 493 + }, + { + "epoch": 0.22289904117315285, + "grad_norm": 3.392493989272701, + "learning_rate": 1.8114161799603192e-06, + "loss": 0.6111, + "step": 494 + }, + { + "epoch": 0.2233502538071066, + "grad_norm": 3.4162235015689095, + "learning_rate": 1.8105608865578712e-06, + "loss": 0.7261, + "step": 495 + }, + { + "epoch": 0.22380146644106036, + "grad_norm": 3.3485739796813645, + "learning_rate": 1.809703860897804e-06, + "loss": 0.5977, + "step": 496 + }, + { + "epoch": 0.2242526790750141, + "grad_norm": 3.3359232989076917, + "learning_rate": 1.808845104811676e-06, + "loss": 0.4957, + "step": 497 + }, + { + "epoch": 0.22470389170896785, + "grad_norm": 3.1119819516264706, + "learning_rate": 1.8079846201347426e-06, + "loss": 0.566, + "step": 498 + }, + { + "epoch": 0.2251551043429216, + "grad_norm": 3.33013160940601, + "learning_rate": 1.8071224087059545e-06, + "loss": 0.6161, + "step": 499 + }, + { + "epoch": 0.22560631697687536, + "grad_norm": 3.726562728905523, + "learning_rate": 1.806258472367952e-06, + "loss": 0.6273, + "step": 500 + }, + { + "epoch": 0.22605752961082912, + "grad_norm": 3.112643238486848, + "learning_rate": 1.805392812967062e-06, + "loss": 0.524, + "step": 501 + }, + { + "epoch": 0.22650874224478285, + "grad_norm": 3.078076912846401, + "learning_rate": 1.8045254323532938e-06, + "loss": 0.5933, + "step": 502 + }, + { + "epoch": 0.2269599548787366, + "grad_norm": 3.3007163587543293, + "learning_rate": 1.803656332380335e-06, + "loss": 0.6058, + "step": 503 + }, + { + "epoch": 0.22741116751269036, + "grad_norm": 3.350696137526816, + "learning_rate": 1.8027855149055476e-06, + "loss": 0.6761, + "step": 504 + }, + { + "epoch": 0.22786238014664412, + "grad_norm": 3.056386670780574, + "learning_rate": 1.8019129817899641e-06, + "loss": 0.5403, + "step": 505 + }, + { + "epoch": 0.22831359278059785, + "grad_norm": 3.3200840574141703, + "learning_rate": 1.8010387348982834e-06, + "loss": 0.4723, + "step": 506 + }, + { + "epoch": 0.2287648054145516, + "grad_norm": 2.9031378599313964, + "learning_rate": 1.8001627760988676e-06, + "loss": 0.5124, + "step": 507 + }, + { + "epoch": 0.22921601804850536, + "grad_norm": 3.0170169190296425, + "learning_rate": 1.7992851072637364e-06, + "loss": 0.5241, + "step": 508 + }, + { + "epoch": 0.22966723068245912, + "grad_norm": 3.260274778241787, + "learning_rate": 1.7984057302685645e-06, + "loss": 0.612, + "step": 509 + }, + { + "epoch": 0.23011844331641285, + "grad_norm": 2.957767077162712, + "learning_rate": 1.7975246469926773e-06, + "loss": 0.5856, + "step": 510 + }, + { + "epoch": 0.2305696559503666, + "grad_norm": 3.077346118529033, + "learning_rate": 1.7966418593190466e-06, + "loss": 0.6476, + "step": 511 + }, + { + "epoch": 0.23102086858432036, + "grad_norm": 3.40658654324856, + "learning_rate": 1.7957573691342863e-06, + "loss": 0.6496, + "step": 512 + }, + { + "epoch": 0.23147208121827412, + "grad_norm": 3.1537347899768187, + "learning_rate": 1.7948711783286494e-06, + "loss": 0.4991, + "step": 513 + }, + { + "epoch": 0.23192329385222787, + "grad_norm": 3.3406426536701064, + "learning_rate": 1.7939832887960228e-06, + "loss": 0.6539, + "step": 514 + }, + { + "epoch": 0.2323745064861816, + "grad_norm": 3.6637558500642617, + "learning_rate": 1.7930937024339236e-06, + "loss": 0.5154, + "step": 515 + }, + { + "epoch": 0.23282571912013536, + "grad_norm": 3.528943118739639, + "learning_rate": 1.7922024211434958e-06, + "loss": 0.4997, + "step": 516 + }, + { + "epoch": 0.23327693175408912, + "grad_norm": 2.8648086426744386, + "learning_rate": 1.7913094468295056e-06, + "loss": 0.4715, + "step": 517 + }, + { + "epoch": 0.23372814438804287, + "grad_norm": 3.39893956335253, + "learning_rate": 1.790414781400337e-06, + "loss": 0.6078, + "step": 518 + }, + { + "epoch": 0.2341793570219966, + "grad_norm": 3.3165376220099976, + "learning_rate": 1.7895184267679882e-06, + "loss": 0.5808, + "step": 519 + }, + { + "epoch": 0.23463056965595036, + "grad_norm": 3.220094259664805, + "learning_rate": 1.7886203848480671e-06, + "loss": 0.6231, + "step": 520 + }, + { + "epoch": 0.23508178228990412, + "grad_norm": 3.1138985690928886, + "learning_rate": 1.7877206575597887e-06, + "loss": 0.6523, + "step": 521 + }, + { + "epoch": 0.23553299492385787, + "grad_norm": 3.1979428387224935, + "learning_rate": 1.7868192468259684e-06, + "loss": 0.598, + "step": 522 + }, + { + "epoch": 0.23598420755781163, + "grad_norm": 3.135032930567459, + "learning_rate": 1.7859161545730204e-06, + "loss": 0.7253, + "step": 523 + }, + { + "epoch": 0.23643542019176536, + "grad_norm": 2.85623015158301, + "learning_rate": 1.7850113827309516e-06, + "loss": 0.485, + "step": 524 + }, + { + "epoch": 0.23688663282571912, + "grad_norm": 3.126524923453851, + "learning_rate": 1.7841049332333588e-06, + "loss": 0.6822, + "step": 525 + }, + { + "epoch": 0.23733784545967287, + "grad_norm": 3.2260751063220416, + "learning_rate": 1.7831968080174245e-06, + "loss": 0.5968, + "step": 526 + }, + { + "epoch": 0.23778905809362663, + "grad_norm": 3.2163409363779656, + "learning_rate": 1.7822870090239116e-06, + "loss": 0.6282, + "step": 527 + }, + { + "epoch": 0.23824027072758036, + "grad_norm": 3.223131291822765, + "learning_rate": 1.7813755381971603e-06, + "loss": 0.6361, + "step": 528 + }, + { + "epoch": 0.23869148336153412, + "grad_norm": 3.5266582311851367, + "learning_rate": 1.7804623974850843e-06, + "loss": 0.5917, + "step": 529 + }, + { + "epoch": 0.23914269599548788, + "grad_norm": 3.756212649445671, + "learning_rate": 1.7795475888391654e-06, + "loss": 0.5342, + "step": 530 + }, + { + "epoch": 0.23959390862944163, + "grad_norm": 3.0598171380651658, + "learning_rate": 1.7786311142144501e-06, + "loss": 0.7084, + "step": 531 + }, + { + "epoch": 0.24004512126339536, + "grad_norm": 3.3421256359633476, + "learning_rate": 1.7777129755695453e-06, + "loss": 0.5933, + "step": 532 + }, + { + "epoch": 0.24049633389734912, + "grad_norm": 3.5281023446435564, + "learning_rate": 1.7767931748666143e-06, + "loss": 0.6578, + "step": 533 + }, + { + "epoch": 0.24094754653130288, + "grad_norm": 3.0273532574157613, + "learning_rate": 1.7758717140713717e-06, + "loss": 0.5256, + "step": 534 + }, + { + "epoch": 0.24139875916525663, + "grad_norm": 3.197844985849922, + "learning_rate": 1.7749485951530812e-06, + "loss": 0.6199, + "step": 535 + }, + { + "epoch": 0.2418499717992104, + "grad_norm": 3.380058493493815, + "learning_rate": 1.7740238200845484e-06, + "loss": 0.5581, + "step": 536 + }, + { + "epoch": 0.24230118443316412, + "grad_norm": 2.8794589169982405, + "learning_rate": 1.7730973908421196e-06, + "loss": 0.5515, + "step": 537 + }, + { + "epoch": 0.24275239706711788, + "grad_norm": 3.303088479063469, + "learning_rate": 1.772169309405676e-06, + "loss": 0.6068, + "step": 538 + }, + { + "epoch": 0.24320360970107163, + "grad_norm": 3.5894154844174775, + "learning_rate": 1.7712395777586294e-06, + "loss": 0.569, + "step": 539 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 3.412578792730928, + "learning_rate": 1.770308197887918e-06, + "loss": 0.5934, + "step": 540 + }, + { + "epoch": 0.24410603496897912, + "grad_norm": 3.4253075329490814, + "learning_rate": 1.7693751717840035e-06, + "loss": 0.6087, + "step": 541 + }, + { + "epoch": 0.24455724760293288, + "grad_norm": 3.499633565706812, + "learning_rate": 1.7684405014408647e-06, + "loss": 0.588, + "step": 542 + }, + { + "epoch": 0.24500846023688663, + "grad_norm": 2.814806236683677, + "learning_rate": 1.767504188855995e-06, + "loss": 0.482, + "step": 543 + }, + { + "epoch": 0.2454596728708404, + "grad_norm": 3.098881310366283, + "learning_rate": 1.766566236030397e-06, + "loss": 0.5435, + "step": 544 + }, + { + "epoch": 0.24591088550479415, + "grad_norm": 3.288381281791134, + "learning_rate": 1.7656266449685792e-06, + "loss": 0.5992, + "step": 545 + }, + { + "epoch": 0.24636209813874788, + "grad_norm": 3.4041539654568425, + "learning_rate": 1.7646854176785506e-06, + "loss": 0.5966, + "step": 546 + }, + { + "epoch": 0.24681331077270163, + "grad_norm": 3.4008922515143345, + "learning_rate": 1.7637425561718174e-06, + "loss": 0.7074, + "step": 547 + }, + { + "epoch": 0.2472645234066554, + "grad_norm": 3.552760007080402, + "learning_rate": 1.762798062463378e-06, + "loss": 0.7019, + "step": 548 + }, + { + "epoch": 0.24771573604060915, + "grad_norm": 3.6537377564708255, + "learning_rate": 1.7618519385717192e-06, + "loss": 0.6482, + "step": 549 + }, + { + "epoch": 0.24816694867456288, + "grad_norm": 3.11958006185935, + "learning_rate": 1.7609041865188118e-06, + "loss": 0.5418, + "step": 550 + }, + { + "epoch": 0.24861816130851663, + "grad_norm": 3.336252618186167, + "learning_rate": 1.759954808330106e-06, + "loss": 0.6389, + "step": 551 + }, + { + "epoch": 0.2490693739424704, + "grad_norm": 2.921189003705698, + "learning_rate": 1.7590038060345274e-06, + "loss": 0.4537, + "step": 552 + }, + { + "epoch": 0.24952058657642415, + "grad_norm": 3.33855785234176, + "learning_rate": 1.7580511816644715e-06, + "loss": 0.5591, + "step": 553 + }, + { + "epoch": 0.2499717992103779, + "grad_norm": 4.125492421237276, + "learning_rate": 1.7570969372558021e-06, + "loss": 0.5379, + "step": 554 + }, + { + "epoch": 0.25042301184433163, + "grad_norm": 3.0575497100367683, + "learning_rate": 1.7561410748478441e-06, + "loss": 0.5602, + "step": 555 + }, + { + "epoch": 0.2508742244782854, + "grad_norm": 3.2758898433144075, + "learning_rate": 1.75518359648338e-06, + "loss": 0.4833, + "step": 556 + }, + { + "epoch": 0.25132543711223915, + "grad_norm": 3.2778885737843777, + "learning_rate": 1.7542245042086467e-06, + "loss": 0.5926, + "step": 557 + }, + { + "epoch": 0.2517766497461929, + "grad_norm": 3.2596395219328085, + "learning_rate": 1.7532638000733293e-06, + "loss": 0.555, + "step": 558 + }, + { + "epoch": 0.25222786238014666, + "grad_norm": 2.9901086229008227, + "learning_rate": 1.7523014861305585e-06, + "loss": 0.4896, + "step": 559 + }, + { + "epoch": 0.2526790750141004, + "grad_norm": 3.244315223694814, + "learning_rate": 1.7513375644369046e-06, + "loss": 0.585, + "step": 560 + }, + { + "epoch": 0.2531302876480541, + "grad_norm": 3.198365136580784, + "learning_rate": 1.750372037052374e-06, + "loss": 0.528, + "step": 561 + }, + { + "epoch": 0.2535815002820079, + "grad_norm": 3.2771916408813406, + "learning_rate": 1.7494049060404047e-06, + "loss": 0.7574, + "step": 562 + }, + { + "epoch": 0.25403271291596163, + "grad_norm": 2.8807336033407487, + "learning_rate": 1.7484361734678621e-06, + "loss": 0.5565, + "step": 563 + }, + { + "epoch": 0.2544839255499154, + "grad_norm": 3.6211865966847494, + "learning_rate": 1.747465841405034e-06, + "loss": 0.5855, + "step": 564 + }, + { + "epoch": 0.25493513818386915, + "grad_norm": 3.1436849609005204, + "learning_rate": 1.7464939119256266e-06, + "loss": 0.6694, + "step": 565 + }, + { + "epoch": 0.2553863508178229, + "grad_norm": 2.7285316400886113, + "learning_rate": 1.7455203871067596e-06, + "loss": 0.5036, + "step": 566 + }, + { + "epoch": 0.25583756345177666, + "grad_norm": 3.186433555908316, + "learning_rate": 1.744545269028963e-06, + "loss": 0.553, + "step": 567 + }, + { + "epoch": 0.2562887760857304, + "grad_norm": 2.8827236037421033, + "learning_rate": 1.7435685597761707e-06, + "loss": 0.5289, + "step": 568 + }, + { + "epoch": 0.2567399887196842, + "grad_norm": 3.181781322960324, + "learning_rate": 1.742590261435718e-06, + "loss": 0.6055, + "step": 569 + }, + { + "epoch": 0.2571912013536379, + "grad_norm": 3.4460098001320443, + "learning_rate": 1.7416103760983356e-06, + "loss": 0.5721, + "step": 570 + }, + { + "epoch": 0.25764241398759163, + "grad_norm": 3.2888809340579033, + "learning_rate": 1.7406289058581463e-06, + "loss": 0.5706, + "step": 571 + }, + { + "epoch": 0.2580936266215454, + "grad_norm": 3.039055552956028, + "learning_rate": 1.7396458528126594e-06, + "loss": 0.6311, + "step": 572 + }, + { + "epoch": 0.25854483925549915, + "grad_norm": 3.1817781660973012, + "learning_rate": 1.738661219062768e-06, + "loss": 0.5731, + "step": 573 + }, + { + "epoch": 0.2589960518894529, + "grad_norm": 3.2166470180702276, + "learning_rate": 1.7376750067127412e-06, + "loss": 0.6233, + "step": 574 + }, + { + "epoch": 0.25944726452340666, + "grad_norm": 2.933567700518428, + "learning_rate": 1.7366872178702246e-06, + "loss": 0.4681, + "step": 575 + }, + { + "epoch": 0.2598984771573604, + "grad_norm": 3.339854630057023, + "learning_rate": 1.7356978546462305e-06, + "loss": 0.5115, + "step": 576 + }, + { + "epoch": 0.2603496897913142, + "grad_norm": 3.042537582951306, + "learning_rate": 1.7347069191551367e-06, + "loss": 0.4721, + "step": 577 + }, + { + "epoch": 0.26080090242526793, + "grad_norm": 3.4742498601316583, + "learning_rate": 1.7337144135146815e-06, + "loss": 0.6191, + "step": 578 + }, + { + "epoch": 0.26125211505922163, + "grad_norm": 3.338002903883107, + "learning_rate": 1.7327203398459584e-06, + "loss": 0.5757, + "step": 579 + }, + { + "epoch": 0.2617033276931754, + "grad_norm": 3.582565654268005, + "learning_rate": 1.731724700273412e-06, + "loss": 0.5778, + "step": 580 + }, + { + "epoch": 0.26215454032712915, + "grad_norm": 3.3503897003901773, + "learning_rate": 1.7307274969248334e-06, + "loss": 0.6079, + "step": 581 + }, + { + "epoch": 0.2626057529610829, + "grad_norm": 3.6953227055441644, + "learning_rate": 1.7297287319313552e-06, + "loss": 0.6023, + "step": 582 + }, + { + "epoch": 0.26305696559503666, + "grad_norm": 3.2964361664081143, + "learning_rate": 1.7287284074274485e-06, + "loss": 0.6274, + "step": 583 + }, + { + "epoch": 0.2635081782289904, + "grad_norm": 3.347719926763811, + "learning_rate": 1.7277265255509163e-06, + "loss": 0.5585, + "step": 584 + }, + { + "epoch": 0.2639593908629442, + "grad_norm": 2.9228072338236237, + "learning_rate": 1.7267230884428903e-06, + "loss": 0.5438, + "step": 585 + }, + { + "epoch": 0.26441060349689793, + "grad_norm": 2.9981318621426545, + "learning_rate": 1.7257180982478254e-06, + "loss": 0.4731, + "step": 586 + }, + { + "epoch": 0.2648618161308517, + "grad_norm": 2.7051775991961073, + "learning_rate": 1.7247115571134968e-06, + "loss": 0.5209, + "step": 587 + }, + { + "epoch": 0.2653130287648054, + "grad_norm": 3.201881004333221, + "learning_rate": 1.7237034671909927e-06, + "loss": 0.5185, + "step": 588 + }, + { + "epoch": 0.26576424139875915, + "grad_norm": 3.0674421520184803, + "learning_rate": 1.7226938306347122e-06, + "loss": 0.4585, + "step": 589 + }, + { + "epoch": 0.2662154540327129, + "grad_norm": 2.9375079500042136, + "learning_rate": 1.7216826496023592e-06, + "loss": 0.5657, + "step": 590 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 3.526546662395944, + "learning_rate": 1.7206699262549394e-06, + "loss": 0.6383, + "step": 591 + }, + { + "epoch": 0.2671178793006204, + "grad_norm": 3.1723564070744406, + "learning_rate": 1.719655662756753e-06, + "loss": 0.4663, + "step": 592 + }, + { + "epoch": 0.2675690919345742, + "grad_norm": 3.2019446199311017, + "learning_rate": 1.7186398612753927e-06, + "loss": 0.5142, + "step": 593 + }, + { + "epoch": 0.26802030456852793, + "grad_norm": 3.041038824521175, + "learning_rate": 1.7176225239817378e-06, + "loss": 0.5848, + "step": 594 + }, + { + "epoch": 0.2684715172024817, + "grad_norm": 2.9674872867935385, + "learning_rate": 1.7166036530499502e-06, + "loss": 0.5316, + "step": 595 + }, + { + "epoch": 0.26892272983643545, + "grad_norm": 3.127580522901588, + "learning_rate": 1.7155832506574686e-06, + "loss": 0.6055, + "step": 596 + }, + { + "epoch": 0.26937394247038915, + "grad_norm": 3.135126305745904, + "learning_rate": 1.7145613189850048e-06, + "loss": 0.6033, + "step": 597 + }, + { + "epoch": 0.2698251551043429, + "grad_norm": 3.3477656942247425, + "learning_rate": 1.713537860216539e-06, + "loss": 0.5876, + "step": 598 + }, + { + "epoch": 0.27027636773829666, + "grad_norm": 3.2848248111575455, + "learning_rate": 1.7125128765393152e-06, + "loss": 0.5931, + "step": 599 + }, + { + "epoch": 0.2707275803722504, + "grad_norm": 3.4561387987225367, + "learning_rate": 1.7114863701438363e-06, + "loss": 0.6233, + "step": 600 + }, + { + "epoch": 0.2711787930062042, + "grad_norm": 3.4262466490817234, + "learning_rate": 1.7104583432238587e-06, + "loss": 0.6797, + "step": 601 + }, + { + "epoch": 0.27163000564015793, + "grad_norm": 3.333713435824011, + "learning_rate": 1.7094287979763891e-06, + "loss": 0.6204, + "step": 602 + }, + { + "epoch": 0.2720812182741117, + "grad_norm": 3.351871668499492, + "learning_rate": 1.7083977366016785e-06, + "loss": 0.6415, + "step": 603 + }, + { + "epoch": 0.27253243090806545, + "grad_norm": 3.2603491068445587, + "learning_rate": 1.7073651613032184e-06, + "loss": 0.5484, + "step": 604 + }, + { + "epoch": 0.27298364354201915, + "grad_norm": 3.024159212795425, + "learning_rate": 1.706331074287736e-06, + "loss": 0.6172, + "step": 605 + }, + { + "epoch": 0.2734348561759729, + "grad_norm": 2.8744869206183368, + "learning_rate": 1.705295477765188e-06, + "loss": 0.5581, + "step": 606 + }, + { + "epoch": 0.27388606880992666, + "grad_norm": 3.3867448368720483, + "learning_rate": 1.7042583739487584e-06, + "loss": 0.7357, + "step": 607 + }, + { + "epoch": 0.2743372814438804, + "grad_norm": 2.935426978961505, + "learning_rate": 1.703219765054852e-06, + "loss": 0.5314, + "step": 608 + }, + { + "epoch": 0.2747884940778342, + "grad_norm": 3.0973120617238004, + "learning_rate": 1.70217965330309e-06, + "loss": 0.598, + "step": 609 + }, + { + "epoch": 0.27523970671178793, + "grad_norm": 3.028317877474072, + "learning_rate": 1.701138040916305e-06, + "loss": 0.5377, + "step": 610 + }, + { + "epoch": 0.2756909193457417, + "grad_norm": 3.1165117895424723, + "learning_rate": 1.7000949301205373e-06, + "loss": 0.6877, + "step": 611 + }, + { + "epoch": 0.27614213197969545, + "grad_norm": 3.7422765877651516, + "learning_rate": 1.6990503231450297e-06, + "loss": 0.5404, + "step": 612 + }, + { + "epoch": 0.2765933446136492, + "grad_norm": 3.229450690131071, + "learning_rate": 1.6980042222222216e-06, + "loss": 0.6637, + "step": 613 + }, + { + "epoch": 0.2770445572476029, + "grad_norm": 2.9411077856787355, + "learning_rate": 1.696956629587745e-06, + "loss": 0.6576, + "step": 614 + }, + { + "epoch": 0.27749576988155666, + "grad_norm": 3.1585991420412554, + "learning_rate": 1.6959075474804215e-06, + "loss": 0.5854, + "step": 615 + }, + { + "epoch": 0.2779469825155104, + "grad_norm": 3.5738811250095197, + "learning_rate": 1.6948569781422538e-06, + "loss": 0.6342, + "step": 616 + }, + { + "epoch": 0.2783981951494642, + "grad_norm": 3.2043997534775635, + "learning_rate": 1.6938049238184244e-06, + "loss": 0.6077, + "step": 617 + }, + { + "epoch": 0.27884940778341794, + "grad_norm": 3.009802670893003, + "learning_rate": 1.6927513867572887e-06, + "loss": 0.5461, + "step": 618 + }, + { + "epoch": 0.2793006204173717, + "grad_norm": 3.735900498450287, + "learning_rate": 1.6916963692103713e-06, + "loss": 0.61, + "step": 619 + }, + { + "epoch": 0.27975183305132545, + "grad_norm": 3.3631424989752525, + "learning_rate": 1.6906398734323606e-06, + "loss": 0.7085, + "step": 620 + }, + { + "epoch": 0.2802030456852792, + "grad_norm": 3.217240383492006, + "learning_rate": 1.6895819016811038e-06, + "loss": 0.6147, + "step": 621 + }, + { + "epoch": 0.28065425831923296, + "grad_norm": 3.1433256959207885, + "learning_rate": 1.6885224562176031e-06, + "loss": 0.6808, + "step": 622 + }, + { + "epoch": 0.28110547095318666, + "grad_norm": 3.6645785813982856, + "learning_rate": 1.6874615393060091e-06, + "loss": 0.493, + "step": 623 + }, + { + "epoch": 0.2815566835871404, + "grad_norm": 2.9879222448827014, + "learning_rate": 1.6863991532136184e-06, + "loss": 0.4975, + "step": 624 + }, + { + "epoch": 0.2820078962210942, + "grad_norm": 3.373734037289056, + "learning_rate": 1.6853353002108667e-06, + "loss": 0.5831, + "step": 625 + }, + { + "epoch": 0.28245910885504794, + "grad_norm": 3.3186458821895872, + "learning_rate": 1.6842699825713242e-06, + "loss": 0.5788, + "step": 626 + }, + { + "epoch": 0.2829103214890017, + "grad_norm": 3.240295183925887, + "learning_rate": 1.683203202571692e-06, + "loss": 0.5798, + "step": 627 + }, + { + "epoch": 0.28336153412295545, + "grad_norm": 2.8461310264884188, + "learning_rate": 1.682134962491796e-06, + "loss": 0.624, + "step": 628 + }, + { + "epoch": 0.2838127467569092, + "grad_norm": 3.182637425551016, + "learning_rate": 1.6810652646145828e-06, + "loss": 0.5762, + "step": 629 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 3.21165606325256, + "learning_rate": 1.679994111226114e-06, + "loss": 0.5685, + "step": 630 + }, + { + "epoch": 0.2847151720248167, + "grad_norm": 3.065992459300649, + "learning_rate": 1.678921504615562e-06, + "loss": 0.5511, + "step": 631 + }, + { + "epoch": 0.2851663846587704, + "grad_norm": 4.0157081970954405, + "learning_rate": 1.677847447075205e-06, + "loss": 0.6992, + "step": 632 + }, + { + "epoch": 0.2856175972927242, + "grad_norm": 3.656189168497442, + "learning_rate": 1.676771940900422e-06, + "loss": 0.6023, + "step": 633 + }, + { + "epoch": 0.28606880992667794, + "grad_norm": 3.641835731718804, + "learning_rate": 1.6756949883896874e-06, + "loss": 0.657, + "step": 634 + }, + { + "epoch": 0.2865200225606317, + "grad_norm": 3.631195546639338, + "learning_rate": 1.6746165918445672e-06, + "loss": 0.5844, + "step": 635 + }, + { + "epoch": 0.28697123519458545, + "grad_norm": 3.305073818481503, + "learning_rate": 1.6735367535697133e-06, + "loss": 0.691, + "step": 636 + }, + { + "epoch": 0.2874224478285392, + "grad_norm": 3.6875236525441935, + "learning_rate": 1.6724554758728586e-06, + "loss": 0.4427, + "step": 637 + }, + { + "epoch": 0.28787366046249296, + "grad_norm": 3.270744869677528, + "learning_rate": 1.6713727610648122e-06, + "loss": 0.6261, + "step": 638 + }, + { + "epoch": 0.2883248730964467, + "grad_norm": 3.294150755730515, + "learning_rate": 1.670288611459455e-06, + "loss": 0.6095, + "step": 639 + }, + { + "epoch": 0.2887760857304005, + "grad_norm": 3.418142081910065, + "learning_rate": 1.669203029373733e-06, + "loss": 0.7189, + "step": 640 + }, + { + "epoch": 0.2892272983643542, + "grad_norm": 3.114231217877707, + "learning_rate": 1.6681160171276548e-06, + "loss": 0.5874, + "step": 641 + }, + { + "epoch": 0.28967851099830794, + "grad_norm": 3.3583157977609233, + "learning_rate": 1.6670275770442849e-06, + "loss": 0.6205, + "step": 642 + }, + { + "epoch": 0.2901297236322617, + "grad_norm": 2.946320692741977, + "learning_rate": 1.665937711449739e-06, + "loss": 0.532, + "step": 643 + }, + { + "epoch": 0.29058093626621545, + "grad_norm": 3.4190265435248155, + "learning_rate": 1.66484642267318e-06, + "loss": 0.5944, + "step": 644 + }, + { + "epoch": 0.2910321489001692, + "grad_norm": 2.8472323370866612, + "learning_rate": 1.6637537130468113e-06, + "loss": 0.6206, + "step": 645 + }, + { + "epoch": 0.29148336153412296, + "grad_norm": 3.2713722111872245, + "learning_rate": 1.662659584905874e-06, + "loss": 0.5323, + "step": 646 + }, + { + "epoch": 0.2919345741680767, + "grad_norm": 2.922995795945147, + "learning_rate": 1.6615640405886395e-06, + "loss": 0.5084, + "step": 647 + }, + { + "epoch": 0.2923857868020305, + "grad_norm": 3.5339931194135077, + "learning_rate": 1.6604670824364067e-06, + "loss": 0.6357, + "step": 648 + }, + { + "epoch": 0.2928369994359842, + "grad_norm": 3.317497856029728, + "learning_rate": 1.659368712793495e-06, + "loss": 0.6874, + "step": 649 + }, + { + "epoch": 0.29328821206993794, + "grad_norm": 3.7424311686840577, + "learning_rate": 1.6582689340072417e-06, + "loss": 0.7115, + "step": 650 + }, + { + "epoch": 0.2937394247038917, + "grad_norm": 3.1345128131384383, + "learning_rate": 1.6571677484279948e-06, + "loss": 0.578, + "step": 651 + }, + { + "epoch": 0.29419063733784545, + "grad_norm": 3.911109938851912, + "learning_rate": 1.6560651584091082e-06, + "loss": 0.7279, + "step": 652 + }, + { + "epoch": 0.2946418499717992, + "grad_norm": 3.3288650410464253, + "learning_rate": 1.6549611663069383e-06, + "loss": 0.5636, + "step": 653 + }, + { + "epoch": 0.29509306260575296, + "grad_norm": 2.863150590413283, + "learning_rate": 1.6538557744808371e-06, + "loss": 0.6478, + "step": 654 + }, + { + "epoch": 0.2955442752397067, + "grad_norm": 3.006213042305221, + "learning_rate": 1.6527489852931486e-06, + "loss": 0.562, + "step": 655 + }, + { + "epoch": 0.2959954878736605, + "grad_norm": 3.638111266983312, + "learning_rate": 1.6516408011092027e-06, + "loss": 0.5496, + "step": 656 + }, + { + "epoch": 0.29644670050761424, + "grad_norm": 3.316824374679788, + "learning_rate": 1.6505312242973108e-06, + "loss": 0.665, + "step": 657 + }, + { + "epoch": 0.29689791314156794, + "grad_norm": 3.0712540463962914, + "learning_rate": 1.6494202572287605e-06, + "loss": 0.5783, + "step": 658 + }, + { + "epoch": 0.2973491257755217, + "grad_norm": 3.792785889031542, + "learning_rate": 1.64830790227781e-06, + "loss": 0.6901, + "step": 659 + }, + { + "epoch": 0.29780033840947545, + "grad_norm": 3.420839278819223, + "learning_rate": 1.6471941618216842e-06, + "loss": 0.617, + "step": 660 + }, + { + "epoch": 0.2982515510434292, + "grad_norm": 3.310228590338887, + "learning_rate": 1.6460790382405688e-06, + "loss": 0.693, + "step": 661 + }, + { + "epoch": 0.29870276367738297, + "grad_norm": 3.4270595757910964, + "learning_rate": 1.6449625339176053e-06, + "loss": 0.5201, + "step": 662 + }, + { + "epoch": 0.2991539763113367, + "grad_norm": 3.1683236124814274, + "learning_rate": 1.643844651238886e-06, + "loss": 0.5696, + "step": 663 + }, + { + "epoch": 0.2996051889452905, + "grad_norm": 3.7401623649500473, + "learning_rate": 1.6427253925934492e-06, + "loss": 0.7044, + "step": 664 + }, + { + "epoch": 0.30005640157924424, + "grad_norm": 3.4634585244712124, + "learning_rate": 1.641604760373273e-06, + "loss": 0.5573, + "step": 665 + }, + { + "epoch": 0.300507614213198, + "grad_norm": 3.3656674389925443, + "learning_rate": 1.640482756973272e-06, + "loss": 0.5742, + "step": 666 + }, + { + "epoch": 0.3009588268471517, + "grad_norm": 3.5089959297906654, + "learning_rate": 1.6393593847912903e-06, + "loss": 0.625, + "step": 667 + }, + { + "epoch": 0.30141003948110545, + "grad_norm": 2.8305729096605194, + "learning_rate": 1.6382346462280979e-06, + "loss": 0.4971, + "step": 668 + }, + { + "epoch": 0.3018612521150592, + "grad_norm": 3.2836377236231926, + "learning_rate": 1.6371085436873843e-06, + "loss": 0.573, + "step": 669 + }, + { + "epoch": 0.30231246474901297, + "grad_norm": 2.839883087141276, + "learning_rate": 1.635981079575755e-06, + "loss": 0.4281, + "step": 670 + }, + { + "epoch": 0.3027636773829667, + "grad_norm": 3.0551538272584913, + "learning_rate": 1.6348522563027235e-06, + "loss": 0.5763, + "step": 671 + }, + { + "epoch": 0.3032148900169205, + "grad_norm": 3.1227628457864633, + "learning_rate": 1.6337220762807098e-06, + "loss": 0.5347, + "step": 672 + }, + { + "epoch": 0.30366610265087424, + "grad_norm": 3.0644130436580297, + "learning_rate": 1.6325905419250325e-06, + "loss": 0.5723, + "step": 673 + }, + { + "epoch": 0.304117315284828, + "grad_norm": 3.369765645299152, + "learning_rate": 1.631457655653905e-06, + "loss": 0.5128, + "step": 674 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 3.130938520045816, + "learning_rate": 1.6303234198884294e-06, + "loss": 0.5637, + "step": 675 + }, + { + "epoch": 0.30501974055273545, + "grad_norm": 2.8738261799739755, + "learning_rate": 1.6291878370525925e-06, + "loss": 0.5167, + "step": 676 + }, + { + "epoch": 0.3054709531866892, + "grad_norm": 3.566044327017721, + "learning_rate": 1.6280509095732588e-06, + "loss": 0.5657, + "step": 677 + }, + { + "epoch": 0.30592216582064297, + "grad_norm": 3.4988087210997554, + "learning_rate": 1.6269126398801679e-06, + "loss": 0.5211, + "step": 678 + }, + { + "epoch": 0.3063733784545967, + "grad_norm": 3.159865034013114, + "learning_rate": 1.6257730304059263e-06, + "loss": 0.5702, + "step": 679 + }, + { + "epoch": 0.3068245910885505, + "grad_norm": 2.765987657389912, + "learning_rate": 1.6246320835860052e-06, + "loss": 0.5558, + "step": 680 + }, + { + "epoch": 0.30727580372250424, + "grad_norm": 3.2675556171262277, + "learning_rate": 1.6234898018587336e-06, + "loss": 0.5354, + "step": 681 + }, + { + "epoch": 0.307727016356458, + "grad_norm": 3.0700923951988, + "learning_rate": 1.622346187665292e-06, + "loss": 0.4686, + "step": 682 + }, + { + "epoch": 0.30817822899041175, + "grad_norm": 2.7395711139916425, + "learning_rate": 1.6212012434497101e-06, + "loss": 0.5564, + "step": 683 + }, + { + "epoch": 0.3086294416243655, + "grad_norm": 2.9605613921627616, + "learning_rate": 1.6200549716588595e-06, + "loss": 0.4968, + "step": 684 + }, + { + "epoch": 0.3090806542583192, + "grad_norm": 3.055443187363974, + "learning_rate": 1.6189073747424482e-06, + "loss": 0.5546, + "step": 685 + }, + { + "epoch": 0.30953186689227297, + "grad_norm": 3.304091566042142, + "learning_rate": 1.6177584551530177e-06, + "loss": 0.5676, + "step": 686 + }, + { + "epoch": 0.3099830795262267, + "grad_norm": 3.127344665872412, + "learning_rate": 1.6166082153459346e-06, + "loss": 0.5305, + "step": 687 + }, + { + "epoch": 0.3104342921601805, + "grad_norm": 3.313157696795387, + "learning_rate": 1.6154566577793885e-06, + "loss": 0.5159, + "step": 688 + }, + { + "epoch": 0.31088550479413424, + "grad_norm": 3.4602680074652548, + "learning_rate": 1.6143037849143832e-06, + "loss": 0.5878, + "step": 689 + }, + { + "epoch": 0.311336717428088, + "grad_norm": 3.47196454330974, + "learning_rate": 1.6131495992147359e-06, + "loss": 0.6752, + "step": 690 + }, + { + "epoch": 0.31178793006204175, + "grad_norm": 2.926369050074452, + "learning_rate": 1.6119941031470675e-06, + "loss": 0.4326, + "step": 691 + }, + { + "epoch": 0.3122391426959955, + "grad_norm": 2.8753404832004272, + "learning_rate": 1.6108372991807996e-06, + "loss": 0.6389, + "step": 692 + }, + { + "epoch": 0.31269035532994927, + "grad_norm": 3.4818672558945845, + "learning_rate": 1.6096791897881498e-06, + "loss": 0.5769, + "step": 693 + }, + { + "epoch": 0.31314156796390297, + "grad_norm": 3.045331557479258, + "learning_rate": 1.608519777444125e-06, + "loss": 0.5813, + "step": 694 + }, + { + "epoch": 0.3135927805978567, + "grad_norm": 3.1173014954702665, + "learning_rate": 1.607359064626517e-06, + "loss": 0.609, + "step": 695 + }, + { + "epoch": 0.3140439932318105, + "grad_norm": 3.1211308380487486, + "learning_rate": 1.6061970538158958e-06, + "loss": 0.5857, + "step": 696 + }, + { + "epoch": 0.31449520586576424, + "grad_norm": 3.6380292500889255, + "learning_rate": 1.6050337474956066e-06, + "loss": 0.7401, + "step": 697 + }, + { + "epoch": 0.314946418499718, + "grad_norm": 3.7195496711700793, + "learning_rate": 1.6038691481517628e-06, + "loss": 0.6073, + "step": 698 + }, + { + "epoch": 0.31539763113367175, + "grad_norm": 3.461343986471695, + "learning_rate": 1.6027032582732406e-06, + "loss": 0.6371, + "step": 699 + }, + { + "epoch": 0.3158488437676255, + "grad_norm": 3.0912912625302384, + "learning_rate": 1.6015360803516752e-06, + "loss": 0.5852, + "step": 700 + }, + { + "epoch": 0.31630005640157927, + "grad_norm": 2.735792616102687, + "learning_rate": 1.6003676168814536e-06, + "loss": 0.4726, + "step": 701 + }, + { + "epoch": 0.31675126903553297, + "grad_norm": 3.2460293296612894, + "learning_rate": 1.5991978703597112e-06, + "loss": 0.6615, + "step": 702 + }, + { + "epoch": 0.3172024816694867, + "grad_norm": 3.2723530181109934, + "learning_rate": 1.5980268432863239e-06, + "loss": 0.5633, + "step": 703 + }, + { + "epoch": 0.3176536943034405, + "grad_norm": 3.1604928857576415, + "learning_rate": 1.596854538163906e-06, + "loss": 0.5592, + "step": 704 + }, + { + "epoch": 0.31810490693739424, + "grad_norm": 3.275606024012947, + "learning_rate": 1.5956809574978011e-06, + "loss": 0.5976, + "step": 705 + }, + { + "epoch": 0.318556119571348, + "grad_norm": 3.66434228008565, + "learning_rate": 1.594506103796081e-06, + "loss": 0.6421, + "step": 706 + }, + { + "epoch": 0.31900733220530175, + "grad_norm": 3.1389856099207183, + "learning_rate": 1.5933299795695368e-06, + "loss": 0.5529, + "step": 707 + }, + { + "epoch": 0.3194585448392555, + "grad_norm": 3.040579790837814, + "learning_rate": 1.5921525873316753e-06, + "loss": 0.4726, + "step": 708 + }, + { + "epoch": 0.31990975747320927, + "grad_norm": 3.3660094819872124, + "learning_rate": 1.5909739295987122e-06, + "loss": 0.5541, + "step": 709 + }, + { + "epoch": 0.320360970107163, + "grad_norm": 3.387903668787852, + "learning_rate": 1.5897940088895691e-06, + "loss": 0.5683, + "step": 710 + }, + { + "epoch": 0.3208121827411167, + "grad_norm": 3.2996751014380554, + "learning_rate": 1.5886128277258661e-06, + "loss": 0.6318, + "step": 711 + }, + { + "epoch": 0.3212633953750705, + "grad_norm": 3.4811079536606004, + "learning_rate": 1.5874303886319174e-06, + "loss": 0.5905, + "step": 712 + }, + { + "epoch": 0.32171460800902424, + "grad_norm": 2.974942368471126, + "learning_rate": 1.5862466941347243e-06, + "loss": 0.5155, + "step": 713 + }, + { + "epoch": 0.322165820642978, + "grad_norm": 3.0452805867406942, + "learning_rate": 1.5850617467639727e-06, + "loss": 0.5682, + "step": 714 + }, + { + "epoch": 0.32261703327693175, + "grad_norm": 3.2224139519675403, + "learning_rate": 1.5838755490520249e-06, + "loss": 0.6019, + "step": 715 + }, + { + "epoch": 0.3230682459108855, + "grad_norm": 2.8944069319837817, + "learning_rate": 1.5826881035339154e-06, + "loss": 0.515, + "step": 716 + }, + { + "epoch": 0.32351945854483927, + "grad_norm": 2.441725821700579, + "learning_rate": 1.5814994127473465e-06, + "loss": 0.3773, + "step": 717 + }, + { + "epoch": 0.323970671178793, + "grad_norm": 3.470342462769914, + "learning_rate": 1.5803094792326799e-06, + "loss": 0.5145, + "step": 718 + }, + { + "epoch": 0.3244218838127468, + "grad_norm": 3.56701743592647, + "learning_rate": 1.5791183055329352e-06, + "loss": 0.5895, + "step": 719 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 3.123117232719042, + "learning_rate": 1.5779258941937803e-06, + "loss": 0.6079, + "step": 720 + }, + { + "epoch": 0.32532430908065424, + "grad_norm": 3.50214397424285, + "learning_rate": 1.5767322477635304e-06, + "loss": 0.7037, + "step": 721 + }, + { + "epoch": 0.325775521714608, + "grad_norm": 3.7189664252368364, + "learning_rate": 1.575537368793138e-06, + "loss": 0.554, + "step": 722 + }, + { + "epoch": 0.32622673434856175, + "grad_norm": 2.9787465398404858, + "learning_rate": 1.5743412598361909e-06, + "loss": 0.589, + "step": 723 + }, + { + "epoch": 0.3266779469825155, + "grad_norm": 3.322004142520188, + "learning_rate": 1.5731439234489052e-06, + "loss": 0.5615, + "step": 724 + }, + { + "epoch": 0.32712915961646927, + "grad_norm": 3.26274181293833, + "learning_rate": 1.571945362190121e-06, + "loss": 0.5014, + "step": 725 + }, + { + "epoch": 0.327580372250423, + "grad_norm": 3.7220892721550234, + "learning_rate": 1.5707455786212944e-06, + "loss": 0.6807, + "step": 726 + }, + { + "epoch": 0.3280315848843768, + "grad_norm": 2.9903955440183148, + "learning_rate": 1.569544575306495e-06, + "loss": 0.4905, + "step": 727 + }, + { + "epoch": 0.32848279751833054, + "grad_norm": 3.4657777627392936, + "learning_rate": 1.5683423548123988e-06, + "loss": 0.6067, + "step": 728 + }, + { + "epoch": 0.32893401015228424, + "grad_norm": 3.4819746921424004, + "learning_rate": 1.5671389197082828e-06, + "loss": 0.7603, + "step": 729 + }, + { + "epoch": 0.329385222786238, + "grad_norm": 3.3128509975650777, + "learning_rate": 1.5659342725660204e-06, + "loss": 0.5962, + "step": 730 + }, + { + "epoch": 0.32983643542019175, + "grad_norm": 3.111013336637488, + "learning_rate": 1.5647284159600742e-06, + "loss": 0.5375, + "step": 731 + }, + { + "epoch": 0.3302876480541455, + "grad_norm": 3.043014983711972, + "learning_rate": 1.5635213524674926e-06, + "loss": 0.4457, + "step": 732 + }, + { + "epoch": 0.33073886068809927, + "grad_norm": 3.176508138409611, + "learning_rate": 1.562313084667903e-06, + "loss": 0.6087, + "step": 733 + }, + { + "epoch": 0.331190073322053, + "grad_norm": 3.020201486936763, + "learning_rate": 1.5611036151435057e-06, + "loss": 0.4561, + "step": 734 + }, + { + "epoch": 0.3316412859560068, + "grad_norm": 3.0231053359967497, + "learning_rate": 1.5598929464790703e-06, + "loss": 0.5651, + "step": 735 + }, + { + "epoch": 0.33209249858996054, + "grad_norm": 3.275662082022039, + "learning_rate": 1.5586810812619291e-06, + "loss": 0.5691, + "step": 736 + }, + { + "epoch": 0.3325437112239143, + "grad_norm": 3.3993781041664337, + "learning_rate": 1.55746802208197e-06, + "loss": 0.6204, + "step": 737 + }, + { + "epoch": 0.332994923857868, + "grad_norm": 2.995463194362283, + "learning_rate": 1.5562537715316349e-06, + "loss": 0.564, + "step": 738 + }, + { + "epoch": 0.33344613649182175, + "grad_norm": 3.134146072367485, + "learning_rate": 1.55503833220591e-06, + "loss": 0.5233, + "step": 739 + }, + { + "epoch": 0.3338973491257755, + "grad_norm": 3.201637769439807, + "learning_rate": 1.553821706702322e-06, + "loss": 0.6123, + "step": 740 + }, + { + "epoch": 0.33434856175972927, + "grad_norm": 3.232574838103331, + "learning_rate": 1.5526038976209343e-06, + "loss": 0.5694, + "step": 741 + }, + { + "epoch": 0.334799774393683, + "grad_norm": 3.3442531523646544, + "learning_rate": 1.5513849075643381e-06, + "loss": 0.5079, + "step": 742 + }, + { + "epoch": 0.3352509870276368, + "grad_norm": 3.0542429928779216, + "learning_rate": 1.550164739137649e-06, + "loss": 0.5389, + "step": 743 + }, + { + "epoch": 0.33570219966159054, + "grad_norm": 3.1568454912469903, + "learning_rate": 1.548943394948501e-06, + "loss": 0.5308, + "step": 744 + }, + { + "epoch": 0.3361534122955443, + "grad_norm": 3.542836071959044, + "learning_rate": 1.5477208776070408e-06, + "loss": 0.483, + "step": 745 + }, + { + "epoch": 0.33660462492949805, + "grad_norm": 3.1222777445652894, + "learning_rate": 1.5464971897259219e-06, + "loss": 0.5749, + "step": 746 + }, + { + "epoch": 0.33705583756345175, + "grad_norm": 3.628624782183597, + "learning_rate": 1.5452723339202998e-06, + "loss": 0.5553, + "step": 747 + }, + { + "epoch": 0.3375070501974055, + "grad_norm": 3.2488842241245988, + "learning_rate": 1.5440463128078261e-06, + "loss": 0.6232, + "step": 748 + }, + { + "epoch": 0.33795826283135927, + "grad_norm": 3.343015867459291, + "learning_rate": 1.5428191290086422e-06, + "loss": 0.6445, + "step": 749 + }, + { + "epoch": 0.338409475465313, + "grad_norm": 3.1055865259237234, + "learning_rate": 1.5415907851453747e-06, + "loss": 0.5384, + "step": 750 + }, + { + "epoch": 0.3388606880992668, + "grad_norm": 3.2377347656439817, + "learning_rate": 1.5403612838431298e-06, + "loss": 0.5763, + "step": 751 + }, + { + "epoch": 0.33931190073322054, + "grad_norm": 3.445908663047367, + "learning_rate": 1.539130627729486e-06, + "loss": 0.5514, + "step": 752 + }, + { + "epoch": 0.3397631133671743, + "grad_norm": 2.911061775987079, + "learning_rate": 1.537898819434491e-06, + "loss": 0.7235, + "step": 753 + }, + { + "epoch": 0.34021432600112805, + "grad_norm": 2.7941569220035625, + "learning_rate": 1.5366658615906545e-06, + "loss": 0.6081, + "step": 754 + }, + { + "epoch": 0.34066553863508175, + "grad_norm": 3.5600515194912847, + "learning_rate": 1.5354317568329425e-06, + "loss": 0.5073, + "step": 755 + }, + { + "epoch": 0.3411167512690355, + "grad_norm": 3.076725754060217, + "learning_rate": 1.5341965077987724e-06, + "loss": 0.683, + "step": 756 + }, + { + "epoch": 0.34156796390298927, + "grad_norm": 3.3055138999806424, + "learning_rate": 1.5329601171280073e-06, + "loss": 0.4901, + "step": 757 + }, + { + "epoch": 0.342019176536943, + "grad_norm": 3.688970910602998, + "learning_rate": 1.5317225874629496e-06, + "loss": 0.6957, + "step": 758 + }, + { + "epoch": 0.3424703891708968, + "grad_norm": 3.01278353896168, + "learning_rate": 1.530483921448336e-06, + "loss": 0.5273, + "step": 759 + }, + { + "epoch": 0.34292160180485054, + "grad_norm": 2.7371700116966697, + "learning_rate": 1.5292441217313322e-06, + "loss": 0.5151, + "step": 760 + }, + { + "epoch": 0.3433728144388043, + "grad_norm": 2.6873243424543682, + "learning_rate": 1.5280031909615261e-06, + "loss": 0.5559, + "step": 761 + }, + { + "epoch": 0.34382402707275805, + "grad_norm": 3.247322078009431, + "learning_rate": 1.5267611317909228e-06, + "loss": 0.5374, + "step": 762 + }, + { + "epoch": 0.3442752397067118, + "grad_norm": 3.2026837973300974, + "learning_rate": 1.5255179468739393e-06, + "loss": 0.5152, + "step": 763 + }, + { + "epoch": 0.3447264523406655, + "grad_norm": 3.0865094325229734, + "learning_rate": 1.5242736388673982e-06, + "loss": 0.588, + "step": 764 + }, + { + "epoch": 0.34517766497461927, + "grad_norm": 3.0866090252108282, + "learning_rate": 1.5230282104305226e-06, + "loss": 0.5795, + "step": 765 + }, + { + "epoch": 0.345628877608573, + "grad_norm": 3.2541619817207814, + "learning_rate": 1.5217816642249296e-06, + "loss": 0.6493, + "step": 766 + }, + { + "epoch": 0.3460800902425268, + "grad_norm": 3.6862448489806967, + "learning_rate": 1.5205340029146253e-06, + "loss": 0.586, + "step": 767 + }, + { + "epoch": 0.34653130287648054, + "grad_norm": 2.6861569549572737, + "learning_rate": 1.519285229165999e-06, + "loss": 0.4518, + "step": 768 + }, + { + "epoch": 0.3469825155104343, + "grad_norm": 3.039215785994512, + "learning_rate": 1.5180353456478173e-06, + "loss": 0.4648, + "step": 769 + }, + { + "epoch": 0.34743372814438805, + "grad_norm": 3.6176540281690914, + "learning_rate": 1.5167843550312188e-06, + "loss": 0.6346, + "step": 770 + }, + { + "epoch": 0.3478849407783418, + "grad_norm": 3.070746867448483, + "learning_rate": 1.5155322599897073e-06, + "loss": 0.5604, + "step": 771 + }, + { + "epoch": 0.34833615341229557, + "grad_norm": 2.73731135380931, + "learning_rate": 1.5142790631991478e-06, + "loss": 0.5152, + "step": 772 + }, + { + "epoch": 0.34878736604624927, + "grad_norm": 3.529456577457423, + "learning_rate": 1.5130247673377587e-06, + "loss": 0.5609, + "step": 773 + }, + { + "epoch": 0.349238578680203, + "grad_norm": 3.4439646240760826, + "learning_rate": 1.5117693750861094e-06, + "loss": 0.6688, + "step": 774 + }, + { + "epoch": 0.3496897913141568, + "grad_norm": 3.3042570111312335, + "learning_rate": 1.51051288912711e-06, + "loss": 0.4303, + "step": 775 + }, + { + "epoch": 0.35014100394811054, + "grad_norm": 2.9401197668624013, + "learning_rate": 1.5092553121460088e-06, + "loss": 0.4583, + "step": 776 + }, + { + "epoch": 0.3505922165820643, + "grad_norm": 3.1073494998401503, + "learning_rate": 1.5079966468303863e-06, + "loss": 0.5139, + "step": 777 + }, + { + "epoch": 0.35104342921601805, + "grad_norm": 3.118708334430417, + "learning_rate": 1.5067368958701485e-06, + "loss": 0.4641, + "step": 778 + }, + { + "epoch": 0.3514946418499718, + "grad_norm": 3.143071377710741, + "learning_rate": 1.5054760619575215e-06, + "loss": 0.5753, + "step": 779 + }, + { + "epoch": 0.35194585448392557, + "grad_norm": 3.1861569232470766, + "learning_rate": 1.5042141477870458e-06, + "loss": 0.535, + "step": 780 + }, + { + "epoch": 0.3523970671178793, + "grad_norm": 2.8866857302075117, + "learning_rate": 1.5029511560555706e-06, + "loss": 0.46, + "step": 781 + }, + { + "epoch": 0.352848279751833, + "grad_norm": 3.3778859647717443, + "learning_rate": 1.5016870894622473e-06, + "loss": 0.5898, + "step": 782 + }, + { + "epoch": 0.3532994923857868, + "grad_norm": 4.172842058459368, + "learning_rate": 1.5004219507085262e-06, + "loss": 0.5366, + "step": 783 + }, + { + "epoch": 0.35375070501974054, + "grad_norm": 3.59549440239923, + "learning_rate": 1.499155742498147e-06, + "loss": 0.4775, + "step": 784 + }, + { + "epoch": 0.3542019176536943, + "grad_norm": 3.0027968674611825, + "learning_rate": 1.4978884675371352e-06, + "loss": 0.5956, + "step": 785 + }, + { + "epoch": 0.35465313028764806, + "grad_norm": 3.385447063632404, + "learning_rate": 1.4966201285337976e-06, + "loss": 0.6469, + "step": 786 + }, + { + "epoch": 0.3551043429216018, + "grad_norm": 2.8706740349542597, + "learning_rate": 1.4953507281987134e-06, + "loss": 0.4904, + "step": 787 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 3.424910056173964, + "learning_rate": 1.4940802692447306e-06, + "loss": 0.5708, + "step": 788 + }, + { + "epoch": 0.3560067681895093, + "grad_norm": 3.0668549250942174, + "learning_rate": 1.4928087543869593e-06, + "loss": 0.5102, + "step": 789 + }, + { + "epoch": 0.3564579808234631, + "grad_norm": 2.9435846287096594, + "learning_rate": 1.491536186342766e-06, + "loss": 0.4531, + "step": 790 + }, + { + "epoch": 0.3569091934574168, + "grad_norm": 3.3647412629748, + "learning_rate": 1.4902625678317694e-06, + "loss": 0.5754, + "step": 791 + }, + { + "epoch": 0.35736040609137054, + "grad_norm": 3.4720397428714147, + "learning_rate": 1.4889879015758317e-06, + "loss": 0.5361, + "step": 792 + }, + { + "epoch": 0.3578116187253243, + "grad_norm": 3.169387365054564, + "learning_rate": 1.4877121902990542e-06, + "loss": 0.6178, + "step": 793 + }, + { + "epoch": 0.35826283135927806, + "grad_norm": 3.1834769003859416, + "learning_rate": 1.4864354367277723e-06, + "loss": 0.5135, + "step": 794 + }, + { + "epoch": 0.3587140439932318, + "grad_norm": 2.72897080565907, + "learning_rate": 1.4851576435905486e-06, + "loss": 0.5075, + "step": 795 + }, + { + "epoch": 0.35916525662718557, + "grad_norm": 3.472851661152734, + "learning_rate": 1.4838788136181674e-06, + "loss": 0.481, + "step": 796 + }, + { + "epoch": 0.3596164692611393, + "grad_norm": 3.3098456095281343, + "learning_rate": 1.4825989495436284e-06, + "loss": 0.6186, + "step": 797 + }, + { + "epoch": 0.3600676818950931, + "grad_norm": 3.5939015234225318, + "learning_rate": 1.4813180541021424e-06, + "loss": 0.566, + "step": 798 + }, + { + "epoch": 0.36051889452904684, + "grad_norm": 3.19607736166632, + "learning_rate": 1.4800361300311227e-06, + "loss": 0.6465, + "step": 799 + }, + { + "epoch": 0.36097010716300054, + "grad_norm": 3.236820425024368, + "learning_rate": 1.4787531800701825e-06, + "loss": 0.5826, + "step": 800 + }, + { + "epoch": 0.3614213197969543, + "grad_norm": 3.1520319660757927, + "learning_rate": 1.4774692069611265e-06, + "loss": 0.6347, + "step": 801 + }, + { + "epoch": 0.36187253243090806, + "grad_norm": 3.4007171711887203, + "learning_rate": 1.4761842134479461e-06, + "loss": 0.5826, + "step": 802 + }, + { + "epoch": 0.3623237450648618, + "grad_norm": 3.322956321362452, + "learning_rate": 1.4748982022768136e-06, + "loss": 0.5269, + "step": 803 + }, + { + "epoch": 0.36277495769881557, + "grad_norm": 3.5976572006549015, + "learning_rate": 1.4736111761960763e-06, + "loss": 0.6353, + "step": 804 + }, + { + "epoch": 0.3632261703327693, + "grad_norm": 3.358228871970233, + "learning_rate": 1.4723231379562503e-06, + "loss": 0.639, + "step": 805 + }, + { + "epoch": 0.3636773829667231, + "grad_norm": 3.066740811078913, + "learning_rate": 1.4710340903100142e-06, + "loss": 0.6528, + "step": 806 + }, + { + "epoch": 0.36412859560067684, + "grad_norm": 3.143447875416717, + "learning_rate": 1.4697440360122046e-06, + "loss": 0.5159, + "step": 807 + }, + { + "epoch": 0.36457980823463054, + "grad_norm": 3.4609033817493087, + "learning_rate": 1.4684529778198095e-06, + "loss": 0.6577, + "step": 808 + }, + { + "epoch": 0.3650310208685843, + "grad_norm": 3.3073777275985243, + "learning_rate": 1.467160918491962e-06, + "loss": 0.5486, + "step": 809 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 2.955049404596728, + "learning_rate": 1.4658678607899346e-06, + "loss": 0.4085, + "step": 810 + }, + { + "epoch": 0.3659334461364918, + "grad_norm": 3.1794400164176015, + "learning_rate": 1.4645738074771331e-06, + "loss": 0.5911, + "step": 811 + }, + { + "epoch": 0.36638465877044557, + "grad_norm": 3.282347844957748, + "learning_rate": 1.4632787613190927e-06, + "loss": 0.6132, + "step": 812 + }, + { + "epoch": 0.3668358714043993, + "grad_norm": 3.054992196038826, + "learning_rate": 1.461982725083468e-06, + "loss": 0.5046, + "step": 813 + }, + { + "epoch": 0.3672870840383531, + "grad_norm": 2.7569599731911083, + "learning_rate": 1.4606857015400315e-06, + "loss": 0.5099, + "step": 814 + }, + { + "epoch": 0.36773829667230684, + "grad_norm": 3.2540309485705454, + "learning_rate": 1.4593876934606649e-06, + "loss": 0.5801, + "step": 815 + }, + { + "epoch": 0.3681895093062606, + "grad_norm": 3.2943527058039073, + "learning_rate": 1.4580887036193537e-06, + "loss": 0.6682, + "step": 816 + }, + { + "epoch": 0.3686407219402143, + "grad_norm": 3.133778967882415, + "learning_rate": 1.4567887347921816e-06, + "loss": 0.6227, + "step": 817 + }, + { + "epoch": 0.36909193457416806, + "grad_norm": 3.256206658405506, + "learning_rate": 1.4554877897573257e-06, + "loss": 0.493, + "step": 818 + }, + { + "epoch": 0.3695431472081218, + "grad_norm": 3.182329974272021, + "learning_rate": 1.4541858712950475e-06, + "loss": 0.5302, + "step": 819 + }, + { + "epoch": 0.36999435984207557, + "grad_norm": 3.4363468847418077, + "learning_rate": 1.4528829821876898e-06, + "loss": 0.6303, + "step": 820 + }, + { + "epoch": 0.37044557247602933, + "grad_norm": 2.851895955662126, + "learning_rate": 1.45157912521967e-06, + "loss": 0.5902, + "step": 821 + }, + { + "epoch": 0.3708967851099831, + "grad_norm": 2.9192604758917735, + "learning_rate": 1.4502743031774735e-06, + "loss": 0.4448, + "step": 822 + }, + { + "epoch": 0.37134799774393684, + "grad_norm": 3.1234419282519297, + "learning_rate": 1.4489685188496485e-06, + "loss": 0.6215, + "step": 823 + }, + { + "epoch": 0.3717992103778906, + "grad_norm": 3.2175572399766983, + "learning_rate": 1.447661775026799e-06, + "loss": 0.5736, + "step": 824 + }, + { + "epoch": 0.37225042301184436, + "grad_norm": 3.0464327587263593, + "learning_rate": 1.4463540745015804e-06, + "loss": 0.4518, + "step": 825 + }, + { + "epoch": 0.37270163564579806, + "grad_norm": 2.7548027619798114, + "learning_rate": 1.4450454200686922e-06, + "loss": 0.5115, + "step": 826 + }, + { + "epoch": 0.3731528482797518, + "grad_norm": 3.508978871190512, + "learning_rate": 1.4437358145248726e-06, + "loss": 0.5587, + "step": 827 + }, + { + "epoch": 0.37360406091370557, + "grad_norm": 2.8776633436137082, + "learning_rate": 1.4424252606688923e-06, + "loss": 0.3882, + "step": 828 + }, + { + "epoch": 0.37405527354765933, + "grad_norm": 3.0186645553939444, + "learning_rate": 1.4411137613015493e-06, + "loss": 0.7998, + "step": 829 + }, + { + "epoch": 0.3745064861816131, + "grad_norm": 3.1591798420581148, + "learning_rate": 1.4398013192256612e-06, + "loss": 0.5591, + "step": 830 + }, + { + "epoch": 0.37495769881556684, + "grad_norm": 3.465443632908638, + "learning_rate": 1.4384879372460614e-06, + "loss": 0.5952, + "step": 831 + }, + { + "epoch": 0.3754089114495206, + "grad_norm": 3.983463883157992, + "learning_rate": 1.4371736181695906e-06, + "loss": 0.6167, + "step": 832 + }, + { + "epoch": 0.37586012408347436, + "grad_norm": 3.202176645358091, + "learning_rate": 1.4358583648050938e-06, + "loss": 0.6324, + "step": 833 + }, + { + "epoch": 0.3763113367174281, + "grad_norm": 3.729738183408635, + "learning_rate": 1.4345421799634117e-06, + "loss": 0.5976, + "step": 834 + }, + { + "epoch": 0.3767625493513818, + "grad_norm": 3.2963718228707286, + "learning_rate": 1.4332250664573753e-06, + "loss": 0.575, + "step": 835 + }, + { + "epoch": 0.37721376198533557, + "grad_norm": 3.3781633349933236, + "learning_rate": 1.4319070271018015e-06, + "loss": 0.6036, + "step": 836 + }, + { + "epoch": 0.37766497461928933, + "grad_norm": 3.4896071051826, + "learning_rate": 1.4305880647134845e-06, + "loss": 0.6199, + "step": 837 + }, + { + "epoch": 0.3781161872532431, + "grad_norm": 3.4302159345440826, + "learning_rate": 1.4292681821111917e-06, + "loss": 0.6029, + "step": 838 + }, + { + "epoch": 0.37856739988719684, + "grad_norm": 3.2071682961494195, + "learning_rate": 1.4279473821156577e-06, + "loss": 0.6575, + "step": 839 + }, + { + "epoch": 0.3790186125211506, + "grad_norm": 3.417176487620392, + "learning_rate": 1.4266256675495775e-06, + "loss": 0.5037, + "step": 840 + }, + { + "epoch": 0.37946982515510436, + "grad_norm": 3.1535347582785755, + "learning_rate": 1.4253030412375992e-06, + "loss": 0.6263, + "step": 841 + }, + { + "epoch": 0.3799210377890581, + "grad_norm": 3.2890481570320693, + "learning_rate": 1.4239795060063208e-06, + "loss": 0.6117, + "step": 842 + }, + { + "epoch": 0.38037225042301187, + "grad_norm": 3.48926192769284, + "learning_rate": 1.422655064684283e-06, + "loss": 0.6099, + "step": 843 + }, + { + "epoch": 0.38082346305696557, + "grad_norm": 3.9372420818037006, + "learning_rate": 1.4213297201019617e-06, + "loss": 0.5924, + "step": 844 + }, + { + "epoch": 0.38127467569091933, + "grad_norm": 2.7349996691946203, + "learning_rate": 1.4200034750917637e-06, + "loss": 0.4804, + "step": 845 + }, + { + "epoch": 0.3817258883248731, + "grad_norm": 3.3642312266416075, + "learning_rate": 1.4186763324880206e-06, + "loss": 0.4911, + "step": 846 + }, + { + "epoch": 0.38217710095882684, + "grad_norm": 3.2368382839246657, + "learning_rate": 1.4173482951269822e-06, + "loss": 0.5709, + "step": 847 + }, + { + "epoch": 0.3826283135927806, + "grad_norm": 3.5193539688661972, + "learning_rate": 1.4160193658468092e-06, + "loss": 0.6303, + "step": 848 + }, + { + "epoch": 0.38307952622673436, + "grad_norm": 2.9360735003460574, + "learning_rate": 1.4146895474875705e-06, + "loss": 0.5032, + "step": 849 + }, + { + "epoch": 0.3835307388606881, + "grad_norm": 3.4786066803717497, + "learning_rate": 1.4133588428912331e-06, + "loss": 0.7037, + "step": 850 + }, + { + "epoch": 0.38398195149464187, + "grad_norm": 3.1528126266207415, + "learning_rate": 1.412027254901659e-06, + "loss": 0.5899, + "step": 851 + }, + { + "epoch": 0.38443316412859563, + "grad_norm": 3.174247820940736, + "learning_rate": 1.4106947863645982e-06, + "loss": 0.4902, + "step": 852 + }, + { + "epoch": 0.38488437676254933, + "grad_norm": 3.957992235254075, + "learning_rate": 1.4093614401276823e-06, + "loss": 0.5707, + "step": 853 + }, + { + "epoch": 0.3853355893965031, + "grad_norm": 3.218519092249444, + "learning_rate": 1.4080272190404184e-06, + "loss": 0.6134, + "step": 854 + }, + { + "epoch": 0.38578680203045684, + "grad_norm": 3.1459639474006447, + "learning_rate": 1.4066921259541836e-06, + "loss": 0.6095, + "step": 855 + }, + { + "epoch": 0.3862380146644106, + "grad_norm": 3.5759941707522565, + "learning_rate": 1.405356163722218e-06, + "loss": 0.6358, + "step": 856 + }, + { + "epoch": 0.38668922729836436, + "grad_norm": 3.185073838148989, + "learning_rate": 1.4040193351996204e-06, + "loss": 0.4632, + "step": 857 + }, + { + "epoch": 0.3871404399323181, + "grad_norm": 3.109686641994834, + "learning_rate": 1.4026816432433398e-06, + "loss": 0.5566, + "step": 858 + }, + { + "epoch": 0.38759165256627187, + "grad_norm": 3.2959191557613665, + "learning_rate": 1.4013430907121703e-06, + "loss": 0.538, + "step": 859 + }, + { + "epoch": 0.38804286520022563, + "grad_norm": 3.3692958569410263, + "learning_rate": 1.4000036804667462e-06, + "loss": 0.5392, + "step": 860 + }, + { + "epoch": 0.38849407783417933, + "grad_norm": 2.6745649742861133, + "learning_rate": 1.3986634153695342e-06, + "loss": 0.3742, + "step": 861 + }, + { + "epoch": 0.3889452904681331, + "grad_norm": 3.3623036565879945, + "learning_rate": 1.3973222982848281e-06, + "loss": 0.7448, + "step": 862 + }, + { + "epoch": 0.38939650310208684, + "grad_norm": 3.277299965781224, + "learning_rate": 1.3959803320787417e-06, + "loss": 0.6002, + "step": 863 + }, + { + "epoch": 0.3898477157360406, + "grad_norm": 3.0879829136475814, + "learning_rate": 1.394637519619205e-06, + "loss": 0.4925, + "step": 864 + }, + { + "epoch": 0.39029892836999436, + "grad_norm": 3.365807183133468, + "learning_rate": 1.3932938637759552e-06, + "loss": 0.5599, + "step": 865 + }, + { + "epoch": 0.3907501410039481, + "grad_norm": 3.107049993469059, + "learning_rate": 1.3919493674205326e-06, + "loss": 0.6655, + "step": 866 + }, + { + "epoch": 0.39120135363790187, + "grad_norm": 3.053760677478005, + "learning_rate": 1.3906040334262731e-06, + "loss": 0.5843, + "step": 867 + }, + { + "epoch": 0.39165256627185563, + "grad_norm": 3.093046175711346, + "learning_rate": 1.3892578646683037e-06, + "loss": 0.6043, + "step": 868 + }, + { + "epoch": 0.3921037789058094, + "grad_norm": 2.86893503666343, + "learning_rate": 1.3879108640235345e-06, + "loss": 0.4397, + "step": 869 + }, + { + "epoch": 0.3925549915397631, + "grad_norm": 3.118837171623341, + "learning_rate": 1.386563034370654e-06, + "loss": 0.5386, + "step": 870 + }, + { + "epoch": 0.39300620417371684, + "grad_norm": 3.363263594306241, + "learning_rate": 1.3852143785901223e-06, + "loss": 0.5624, + "step": 871 + }, + { + "epoch": 0.3934574168076706, + "grad_norm": 3.186495282439576, + "learning_rate": 1.3838648995641644e-06, + "loss": 0.4957, + "step": 872 + }, + { + "epoch": 0.39390862944162436, + "grad_norm": 3.3353513396229335, + "learning_rate": 1.3825146001767653e-06, + "loss": 0.5313, + "step": 873 + }, + { + "epoch": 0.3943598420755781, + "grad_norm": 3.0521731392139424, + "learning_rate": 1.3811634833136637e-06, + "loss": 0.4819, + "step": 874 + }, + { + "epoch": 0.3948110547095319, + "grad_norm": 3.1624882499122724, + "learning_rate": 1.379811551862344e-06, + "loss": 0.5611, + "step": 875 + }, + { + "epoch": 0.39526226734348563, + "grad_norm": 2.885520973170412, + "learning_rate": 1.378458808712032e-06, + "loss": 0.5237, + "step": 876 + }, + { + "epoch": 0.3957134799774394, + "grad_norm": 3.1923720267502995, + "learning_rate": 1.377105256753689e-06, + "loss": 0.6018, + "step": 877 + }, + { + "epoch": 0.39616469261139314, + "grad_norm": 3.3170122186250968, + "learning_rate": 1.375750898880004e-06, + "loss": 0.5146, + "step": 878 + }, + { + "epoch": 0.39661590524534684, + "grad_norm": 2.9602277801561336, + "learning_rate": 1.3743957379853884e-06, + "loss": 0.5429, + "step": 879 + }, + { + "epoch": 0.3970671178793006, + "grad_norm": 3.1080024064190432, + "learning_rate": 1.3730397769659694e-06, + "loss": 0.5487, + "step": 880 + }, + { + "epoch": 0.39751833051325436, + "grad_norm": 3.439572751157547, + "learning_rate": 1.3716830187195854e-06, + "loss": 0.5924, + "step": 881 + }, + { + "epoch": 0.3979695431472081, + "grad_norm": 3.005894061607398, + "learning_rate": 1.3703254661457772e-06, + "loss": 0.5435, + "step": 882 + }, + { + "epoch": 0.3984207557811619, + "grad_norm": 2.9201664205070172, + "learning_rate": 1.3689671221457838e-06, + "loss": 0.6007, + "step": 883 + }, + { + "epoch": 0.39887196841511563, + "grad_norm": 3.076358828456211, + "learning_rate": 1.3676079896225357e-06, + "loss": 0.5791, + "step": 884 + }, + { + "epoch": 0.3993231810490694, + "grad_norm": 3.0203537087915278, + "learning_rate": 1.3662480714806481e-06, + "loss": 0.5749, + "step": 885 + }, + { + "epoch": 0.39977439368302314, + "grad_norm": 3.0613670662226538, + "learning_rate": 1.3648873706264158e-06, + "loss": 0.4396, + "step": 886 + }, + { + "epoch": 0.4002256063169769, + "grad_norm": 2.6309532474498574, + "learning_rate": 1.363525889967805e-06, + "loss": 0.4493, + "step": 887 + }, + { + "epoch": 0.4006768189509306, + "grad_norm": 3.208084607808158, + "learning_rate": 1.3621636324144507e-06, + "loss": 0.4828, + "step": 888 + }, + { + "epoch": 0.40112803158488436, + "grad_norm": 3.332508518536951, + "learning_rate": 1.3608006008776458e-06, + "loss": 0.7003, + "step": 889 + }, + { + "epoch": 0.4015792442188381, + "grad_norm": 3.3403690099503907, + "learning_rate": 1.3594367982703388e-06, + "loss": 0.6317, + "step": 890 + }, + { + "epoch": 0.4020304568527919, + "grad_norm": 3.4197414818392273, + "learning_rate": 1.3580722275071253e-06, + "loss": 0.5383, + "step": 891 + }, + { + "epoch": 0.40248166948674563, + "grad_norm": 2.913591421357927, + "learning_rate": 1.3567068915042433e-06, + "loss": 0.4599, + "step": 892 + }, + { + "epoch": 0.4029328821206994, + "grad_norm": 3.2044119627020495, + "learning_rate": 1.355340793179566e-06, + "loss": 0.5566, + "step": 893 + }, + { + "epoch": 0.40338409475465314, + "grad_norm": 3.56074826789299, + "learning_rate": 1.3539739354525946e-06, + "loss": 0.6223, + "step": 894 + }, + { + "epoch": 0.4038353073886069, + "grad_norm": 3.0506608223157787, + "learning_rate": 1.352606321244455e-06, + "loss": 0.7291, + "step": 895 + }, + { + "epoch": 0.40428652002256066, + "grad_norm": 3.06038972236859, + "learning_rate": 1.3512379534778882e-06, + "loss": 0.667, + "step": 896 + }, + { + "epoch": 0.40473773265651436, + "grad_norm": 3.094425625713067, + "learning_rate": 1.3498688350772472e-06, + "loss": 0.5412, + "step": 897 + }, + { + "epoch": 0.4051889452904681, + "grad_norm": 3.6102713876984054, + "learning_rate": 1.3484989689684879e-06, + "loss": 0.6519, + "step": 898 + }, + { + "epoch": 0.4056401579244219, + "grad_norm": 3.054024840623251, + "learning_rate": 1.347128358079164e-06, + "loss": 0.4936, + "step": 899 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 3.4039347396157145, + "learning_rate": 1.3457570053384224e-06, + "loss": 0.592, + "step": 900 + }, + { + "epoch": 0.4065425831923294, + "grad_norm": 3.3228410861857367, + "learning_rate": 1.3443849136769945e-06, + "loss": 0.4142, + "step": 901 + }, + { + "epoch": 0.40699379582628314, + "grad_norm": 3.5304381752960654, + "learning_rate": 1.3430120860271905e-06, + "loss": 0.4718, + "step": 902 + }, + { + "epoch": 0.4074450084602369, + "grad_norm": 3.303441875394312, + "learning_rate": 1.3416385253228937e-06, + "loss": 0.5454, + "step": 903 + }, + { + "epoch": 0.40789622109419066, + "grad_norm": 3.1629313821139706, + "learning_rate": 1.3402642344995542e-06, + "loss": 0.5869, + "step": 904 + }, + { + "epoch": 0.4083474337281444, + "grad_norm": 3.422560274484405, + "learning_rate": 1.3388892164941826e-06, + "loss": 0.5956, + "step": 905 + }, + { + "epoch": 0.4087986463620981, + "grad_norm": 3.09873469521746, + "learning_rate": 1.3375134742453434e-06, + "loss": 0.545, + "step": 906 + }, + { + "epoch": 0.4092498589960519, + "grad_norm": 3.0994234736000164, + "learning_rate": 1.3361370106931485e-06, + "loss": 0.6644, + "step": 907 + }, + { + "epoch": 0.40970107163000563, + "grad_norm": 3.3133585021955976, + "learning_rate": 1.3347598287792518e-06, + "loss": 0.5737, + "step": 908 + }, + { + "epoch": 0.4101522842639594, + "grad_norm": 3.126477001362301, + "learning_rate": 1.3333819314468427e-06, + "loss": 0.5358, + "step": 909 + }, + { + "epoch": 0.41060349689791314, + "grad_norm": 3.843129597851106, + "learning_rate": 1.3320033216406385e-06, + "loss": 0.5734, + "step": 910 + }, + { + "epoch": 0.4110547095318669, + "grad_norm": 3.0621269496803762, + "learning_rate": 1.33062400230688e-06, + "loss": 0.4941, + "step": 911 + }, + { + "epoch": 0.41150592216582066, + "grad_norm": 3.456611325927705, + "learning_rate": 1.3292439763933244e-06, + "loss": 0.4588, + "step": 912 + }, + { + "epoch": 0.4119571347997744, + "grad_norm": 2.847828708168504, + "learning_rate": 1.3278632468492377e-06, + "loss": 0.5647, + "step": 913 + }, + { + "epoch": 0.4124083474337281, + "grad_norm": 3.108813010015726, + "learning_rate": 1.3264818166253916e-06, + "loss": 0.4924, + "step": 914 + }, + { + "epoch": 0.4128595600676819, + "grad_norm": 3.8277286638201558, + "learning_rate": 1.325099688674053e-06, + "loss": 0.5561, + "step": 915 + }, + { + "epoch": 0.41331077270163563, + "grad_norm": 3.147511650054291, + "learning_rate": 1.3237168659489825e-06, + "loss": 0.5601, + "step": 916 + }, + { + "epoch": 0.4137619853355894, + "grad_norm": 3.0966116731178657, + "learning_rate": 1.3223333514054232e-06, + "loss": 0.5234, + "step": 917 + }, + { + "epoch": 0.41421319796954315, + "grad_norm": 2.5545639358135874, + "learning_rate": 1.3209491480000977e-06, + "loss": 0.4909, + "step": 918 + }, + { + "epoch": 0.4146644106034969, + "grad_norm": 3.556454372294347, + "learning_rate": 1.319564258691201e-06, + "loss": 0.5788, + "step": 919 + }, + { + "epoch": 0.41511562323745066, + "grad_norm": 3.1471280862116564, + "learning_rate": 1.3181786864383932e-06, + "loss": 0.582, + "step": 920 + }, + { + "epoch": 0.4155668358714044, + "grad_norm": 3.343836136967912, + "learning_rate": 1.3167924342027944e-06, + "loss": 0.7171, + "step": 921 + }, + { + "epoch": 0.4160180485053582, + "grad_norm": 2.917522532695626, + "learning_rate": 1.3154055049469782e-06, + "loss": 0.5267, + "step": 922 + }, + { + "epoch": 0.4164692611393119, + "grad_norm": 2.758113115417477, + "learning_rate": 1.3140179016349646e-06, + "loss": 0.5173, + "step": 923 + }, + { + "epoch": 0.41692047377326563, + "grad_norm": 3.3089395229486223, + "learning_rate": 1.312629627232214e-06, + "loss": 0.5977, + "step": 924 + }, + { + "epoch": 0.4173716864072194, + "grad_norm": 3.4613943944882237, + "learning_rate": 1.3112406847056212e-06, + "loss": 0.5058, + "step": 925 + }, + { + "epoch": 0.41782289904117315, + "grad_norm": 3.185867032326023, + "learning_rate": 1.3098510770235092e-06, + "loss": 0.5305, + "step": 926 + }, + { + "epoch": 0.4182741116751269, + "grad_norm": 2.9584569775451963, + "learning_rate": 1.308460807155622e-06, + "loss": 0.4347, + "step": 927 + }, + { + "epoch": 0.41872532430908066, + "grad_norm": 3.0943591120917735, + "learning_rate": 1.3070698780731192e-06, + "loss": 0.6806, + "step": 928 + }, + { + "epoch": 0.4191765369430344, + "grad_norm": 3.257203595590827, + "learning_rate": 1.3056782927485688e-06, + "loss": 0.5482, + "step": 929 + }, + { + "epoch": 0.4196277495769882, + "grad_norm": 3.378797664140071, + "learning_rate": 1.3042860541559415e-06, + "loss": 0.6118, + "step": 930 + }, + { + "epoch": 0.42007896221094193, + "grad_norm": 3.782330800744356, + "learning_rate": 1.3028931652706039e-06, + "loss": 0.5512, + "step": 931 + }, + { + "epoch": 0.42053017484489563, + "grad_norm": 3.2959465895996813, + "learning_rate": 1.3014996290693127e-06, + "loss": 0.557, + "step": 932 + }, + { + "epoch": 0.4209813874788494, + "grad_norm": 3.36676862860587, + "learning_rate": 1.3001054485302078e-06, + "loss": 0.5585, + "step": 933 + }, + { + "epoch": 0.42143260011280315, + "grad_norm": 2.9162822229329355, + "learning_rate": 1.2987106266328058e-06, + "loss": 0.4515, + "step": 934 + }, + { + "epoch": 0.4218838127467569, + "grad_norm": 3.489259111346056, + "learning_rate": 1.2973151663579947e-06, + "loss": 0.5667, + "step": 935 + }, + { + "epoch": 0.42233502538071066, + "grad_norm": 3.146522111454242, + "learning_rate": 1.295919070688026e-06, + "loss": 0.6089, + "step": 936 + }, + { + "epoch": 0.4227862380146644, + "grad_norm": 3.061791688694132, + "learning_rate": 1.2945223426065095e-06, + "loss": 0.5346, + "step": 937 + }, + { + "epoch": 0.4232374506486182, + "grad_norm": 3.0568178693062533, + "learning_rate": 1.2931249850984064e-06, + "loss": 0.5716, + "step": 938 + }, + { + "epoch": 0.42368866328257193, + "grad_norm": 3.375298437835067, + "learning_rate": 1.2917270011500232e-06, + "loss": 0.5784, + "step": 939 + }, + { + "epoch": 0.4241398759165257, + "grad_norm": 3.031854170941789, + "learning_rate": 1.2903283937490055e-06, + "loss": 0.4996, + "step": 940 + }, + { + "epoch": 0.4245910885504794, + "grad_norm": 3.3874162086068726, + "learning_rate": 1.2889291658843304e-06, + "loss": 0.6241, + "step": 941 + }, + { + "epoch": 0.42504230118443315, + "grad_norm": 3.1176739539444913, + "learning_rate": 1.2875293205463015e-06, + "loss": 0.5855, + "step": 942 + }, + { + "epoch": 0.4254935138183869, + "grad_norm": 3.302403976138221, + "learning_rate": 1.2861288607265424e-06, + "loss": 0.5478, + "step": 943 + }, + { + "epoch": 0.42594472645234066, + "grad_norm": 3.3731503961513747, + "learning_rate": 1.2847277894179888e-06, + "loss": 0.6005, + "step": 944 + }, + { + "epoch": 0.4263959390862944, + "grad_norm": 2.9075875109978737, + "learning_rate": 1.283326109614885e-06, + "loss": 0.5674, + "step": 945 + }, + { + "epoch": 0.4268471517202482, + "grad_norm": 3.4745646922579554, + "learning_rate": 1.2819238243127735e-06, + "loss": 0.5554, + "step": 946 + }, + { + "epoch": 0.42729836435420193, + "grad_norm": 3.4374933327923203, + "learning_rate": 1.2805209365084925e-06, + "loss": 0.5387, + "step": 947 + }, + { + "epoch": 0.4277495769881557, + "grad_norm": 3.5485274017082404, + "learning_rate": 1.2791174492001675e-06, + "loss": 0.5332, + "step": 948 + }, + { + "epoch": 0.42820078962210945, + "grad_norm": 2.9948929783352463, + "learning_rate": 1.2777133653872048e-06, + "loss": 0.4852, + "step": 949 + }, + { + "epoch": 0.42865200225606315, + "grad_norm": 3.183845074289996, + "learning_rate": 1.2763086880702859e-06, + "loss": 0.5381, + "step": 950 + }, + { + "epoch": 0.4291032148900169, + "grad_norm": 3.3312664393519524, + "learning_rate": 1.2749034202513598e-06, + "loss": 0.5461, + "step": 951 + }, + { + "epoch": 0.42955442752397066, + "grad_norm": 3.248294129712524, + "learning_rate": 1.2734975649336383e-06, + "loss": 0.6579, + "step": 952 + }, + { + "epoch": 0.4300056401579244, + "grad_norm": 3.650970462515364, + "learning_rate": 1.2720911251215896e-06, + "loss": 0.5921, + "step": 953 + }, + { + "epoch": 0.4304568527918782, + "grad_norm": 3.1484500818228627, + "learning_rate": 1.270684103820929e-06, + "loss": 0.5272, + "step": 954 + }, + { + "epoch": 0.43090806542583193, + "grad_norm": 3.0790541876092665, + "learning_rate": 1.2692765040386156e-06, + "loss": 0.503, + "step": 955 + }, + { + "epoch": 0.4313592780597857, + "grad_norm": 3.2768871039849254, + "learning_rate": 1.2678683287828449e-06, + "loss": 0.5212, + "step": 956 + }, + { + "epoch": 0.43181049069373945, + "grad_norm": 3.4878570977168097, + "learning_rate": 1.2664595810630422e-06, + "loss": 0.6475, + "step": 957 + }, + { + "epoch": 0.4322617033276932, + "grad_norm": 3.259311956538079, + "learning_rate": 1.2650502638898558e-06, + "loss": 0.5158, + "step": 958 + }, + { + "epoch": 0.4327129159616469, + "grad_norm": 2.489346150252523, + "learning_rate": 1.2636403802751515e-06, + "loss": 0.527, + "step": 959 + }, + { + "epoch": 0.43316412859560066, + "grad_norm": 3.227790541090488, + "learning_rate": 1.2622299332320047e-06, + "loss": 0.5149, + "step": 960 + }, + { + "epoch": 0.4336153412295544, + "grad_norm": 3.360641383529355, + "learning_rate": 1.2608189257746968e-06, + "loss": 0.5772, + "step": 961 + }, + { + "epoch": 0.4340665538635082, + "grad_norm": 3.3579225913579283, + "learning_rate": 1.2594073609187046e-06, + "loss": 0.5674, + "step": 962 + }, + { + "epoch": 0.43451776649746193, + "grad_norm": 3.6833581489779696, + "learning_rate": 1.2579952416806978e-06, + "loss": 0.6361, + "step": 963 + }, + { + "epoch": 0.4349689791314157, + "grad_norm": 3.68096763034361, + "learning_rate": 1.2565825710785303e-06, + "loss": 0.5787, + "step": 964 + }, + { + "epoch": 0.43542019176536945, + "grad_norm": 3.200182035241666, + "learning_rate": 1.2551693521312338e-06, + "loss": 0.4692, + "step": 965 + }, + { + "epoch": 0.4358714043993232, + "grad_norm": 3.161219776591041, + "learning_rate": 1.2537555878590124e-06, + "loss": 0.5817, + "step": 966 + }, + { + "epoch": 0.4363226170332769, + "grad_norm": 3.3483981216994434, + "learning_rate": 1.2523412812832366e-06, + "loss": 0.6038, + "step": 967 + }, + { + "epoch": 0.43677382966723066, + "grad_norm": 3.664757783306792, + "learning_rate": 1.2509264354264337e-06, + "loss": 0.5481, + "step": 968 + }, + { + "epoch": 0.4372250423011844, + "grad_norm": 3.0682250506770923, + "learning_rate": 1.2495110533122848e-06, + "loss": 0.5729, + "step": 969 + }, + { + "epoch": 0.4376762549351382, + "grad_norm": 3.276987387331576, + "learning_rate": 1.2480951379656173e-06, + "loss": 0.5415, + "step": 970 + }, + { + "epoch": 0.43812746756909193, + "grad_norm": 2.7598150218875084, + "learning_rate": 1.2466786924123977e-06, + "loss": 0.546, + "step": 971 + }, + { + "epoch": 0.4385786802030457, + "grad_norm": 3.232659097993065, + "learning_rate": 1.2452617196797258e-06, + "loss": 0.4954, + "step": 972 + }, + { + "epoch": 0.43902989283699945, + "grad_norm": 3.389259817026544, + "learning_rate": 1.2438442227958274e-06, + "loss": 0.5887, + "step": 973 + }, + { + "epoch": 0.4394811054709532, + "grad_norm": 3.287684474069178, + "learning_rate": 1.2424262047900498e-06, + "loss": 0.5423, + "step": 974 + }, + { + "epoch": 0.43993231810490696, + "grad_norm": 2.9570593801357603, + "learning_rate": 1.2410076686928521e-06, + "loss": 0.5879, + "step": 975 + }, + { + "epoch": 0.44038353073886066, + "grad_norm": 3.0562220060970104, + "learning_rate": 1.2395886175358026e-06, + "loss": 0.6012, + "step": 976 + }, + { + "epoch": 0.4408347433728144, + "grad_norm": 2.742466631605964, + "learning_rate": 1.2381690543515691e-06, + "loss": 0.5138, + "step": 977 + }, + { + "epoch": 0.4412859560067682, + "grad_norm": 3.508015423427429, + "learning_rate": 1.236748982173914e-06, + "loss": 0.5654, + "step": 978 + }, + { + "epoch": 0.44173716864072193, + "grad_norm": 3.1519521341701604, + "learning_rate": 1.2353284040376876e-06, + "loss": 0.696, + "step": 979 + }, + { + "epoch": 0.4421883812746757, + "grad_norm": 3.199203044906545, + "learning_rate": 1.2339073229788214e-06, + "loss": 0.5913, + "step": 980 + }, + { + "epoch": 0.44263959390862945, + "grad_norm": 2.914392046354155, + "learning_rate": 1.2324857420343216e-06, + "loss": 0.5423, + "step": 981 + }, + { + "epoch": 0.4430908065425832, + "grad_norm": 3.208161429147259, + "learning_rate": 1.2310636642422623e-06, + "loss": 0.6325, + "step": 982 + }, + { + "epoch": 0.44354201917653696, + "grad_norm": 3.222015781659111, + "learning_rate": 1.2296410926417804e-06, + "loss": 0.4924, + "step": 983 + }, + { + "epoch": 0.4439932318104907, + "grad_norm": 3.613081662865641, + "learning_rate": 1.228218030273068e-06, + "loss": 0.7603, + "step": 984 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.8395745169932316, + "learning_rate": 1.2267944801773648e-06, + "loss": 0.4732, + "step": 985 + }, + { + "epoch": 0.4448956570783982, + "grad_norm": 3.1488834617622157, + "learning_rate": 1.2253704453969541e-06, + "loss": 0.5935, + "step": 986 + }, + { + "epoch": 0.44534686971235193, + "grad_norm": 2.890352804759586, + "learning_rate": 1.2239459289751545e-06, + "loss": 0.5563, + "step": 987 + }, + { + "epoch": 0.4457980823463057, + "grad_norm": 3.344479914937335, + "learning_rate": 1.2225209339563143e-06, + "loss": 0.6045, + "step": 988 + }, + { + "epoch": 0.44624929498025945, + "grad_norm": 2.9945956060125245, + "learning_rate": 1.2210954633858042e-06, + "loss": 0.6325, + "step": 989 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 3.0243098524774985, + "learning_rate": 1.219669520310011e-06, + "loss": 0.5677, + "step": 990 + }, + { + "epoch": 0.44715172024816696, + "grad_norm": 2.892447973941234, + "learning_rate": 1.2182431077763316e-06, + "loss": 0.7021, + "step": 991 + }, + { + "epoch": 0.4476029328821207, + "grad_norm": 3.700082520722297, + "learning_rate": 1.216816228833167e-06, + "loss": 0.6754, + "step": 992 + }, + { + "epoch": 0.4480541455160745, + "grad_norm": 3.0843377850883766, + "learning_rate": 1.2153888865299133e-06, + "loss": 0.5611, + "step": 993 + }, + { + "epoch": 0.4485053581500282, + "grad_norm": 3.0386431018180806, + "learning_rate": 1.213961083916958e-06, + "loss": 0.5822, + "step": 994 + }, + { + "epoch": 0.44895657078398193, + "grad_norm": 2.8399741907687392, + "learning_rate": 1.2125328240456725e-06, + "loss": 0.4906, + "step": 995 + }, + { + "epoch": 0.4494077834179357, + "grad_norm": 3.027087230934386, + "learning_rate": 1.2111041099684042e-06, + "loss": 0.6892, + "step": 996 + }, + { + "epoch": 0.44985899605188945, + "grad_norm": 2.9568284939288616, + "learning_rate": 1.209674944738473e-06, + "loss": 0.6299, + "step": 997 + }, + { + "epoch": 0.4503102086858432, + "grad_norm": 3.259765282572653, + "learning_rate": 1.2082453314101606e-06, + "loss": 0.4755, + "step": 998 + }, + { + "epoch": 0.45076142131979696, + "grad_norm": 3.447951905086282, + "learning_rate": 1.2068152730387088e-06, + "loss": 0.6369, + "step": 999 + }, + { + "epoch": 0.4512126339537507, + "grad_norm": 3.234955570056143, + "learning_rate": 1.2053847726803089e-06, + "loss": 0.5156, + "step": 1000 + }, + { + "epoch": 0.4516638465877045, + "grad_norm": 3.5463128389089333, + "learning_rate": 1.203953833392097e-06, + "loss": 0.475, + "step": 1001 + }, + { + "epoch": 0.45211505922165823, + "grad_norm": 3.222070894972768, + "learning_rate": 1.2025224582321485e-06, + "loss": 0.609, + "step": 1002 + }, + { + "epoch": 0.45256627185561193, + "grad_norm": 3.0758689339986, + "learning_rate": 1.201090650259468e-06, + "loss": 0.5973, + "step": 1003 + }, + { + "epoch": 0.4530174844895657, + "grad_norm": 3.159127568590662, + "learning_rate": 1.1996584125339868e-06, + "loss": 0.5238, + "step": 1004 + }, + { + "epoch": 0.45346869712351945, + "grad_norm": 3.6135720048902105, + "learning_rate": 1.1982257481165545e-06, + "loss": 0.675, + "step": 1005 + }, + { + "epoch": 0.4539199097574732, + "grad_norm": 3.336465519438658, + "learning_rate": 1.1967926600689321e-06, + "loss": 0.542, + "step": 1006 + }, + { + "epoch": 0.45437112239142696, + "grad_norm": 2.7490928983502854, + "learning_rate": 1.1953591514537864e-06, + "loss": 0.4707, + "step": 1007 + }, + { + "epoch": 0.4548223350253807, + "grad_norm": 3.3908649442402714, + "learning_rate": 1.193925225334682e-06, + "loss": 0.602, + "step": 1008 + }, + { + "epoch": 0.4552735476593345, + "grad_norm": 2.894962512757854, + "learning_rate": 1.1924908847760772e-06, + "loss": 0.6097, + "step": 1009 + }, + { + "epoch": 0.45572476029328823, + "grad_norm": 3.4842483249570155, + "learning_rate": 1.191056132843315e-06, + "loss": 0.5953, + "step": 1010 + }, + { + "epoch": 0.45617597292724194, + "grad_norm": 2.950398511688371, + "learning_rate": 1.1896209726026176e-06, + "loss": 0.4035, + "step": 1011 + }, + { + "epoch": 0.4566271855611957, + "grad_norm": 2.9513437535271185, + "learning_rate": 1.1881854071210804e-06, + "loss": 0.5398, + "step": 1012 + }, + { + "epoch": 0.45707839819514945, + "grad_norm": 2.8613177638799945, + "learning_rate": 1.186749439466664e-06, + "loss": 0.5525, + "step": 1013 + }, + { + "epoch": 0.4575296108291032, + "grad_norm": 3.188791154585419, + "learning_rate": 1.1853130727081893e-06, + "loss": 0.5192, + "step": 1014 + }, + { + "epoch": 0.45798082346305696, + "grad_norm": 3.3146247782552756, + "learning_rate": 1.18387630991533e-06, + "loss": 0.5131, + "step": 1015 + }, + { + "epoch": 0.4584320360970107, + "grad_norm": 3.9250211014882517, + "learning_rate": 1.1824391541586055e-06, + "loss": 0.5914, + "step": 1016 + }, + { + "epoch": 0.4588832487309645, + "grad_norm": 2.9161098414160733, + "learning_rate": 1.1810016085093754e-06, + "loss": 0.5391, + "step": 1017 + }, + { + "epoch": 0.45933446136491823, + "grad_norm": 2.9799917015998987, + "learning_rate": 1.179563676039833e-06, + "loss": 0.5123, + "step": 1018 + }, + { + "epoch": 0.459785673998872, + "grad_norm": 3.530754717880453, + "learning_rate": 1.1781253598229982e-06, + "loss": 0.5318, + "step": 1019 + }, + { + "epoch": 0.4602368866328257, + "grad_norm": 3.375071447655234, + "learning_rate": 1.1766866629327097e-06, + "loss": 0.5517, + "step": 1020 + }, + { + "epoch": 0.46068809926677945, + "grad_norm": 3.388282264796258, + "learning_rate": 1.1752475884436213e-06, + "loss": 0.5301, + "step": 1021 + }, + { + "epoch": 0.4611393119007332, + "grad_norm": 3.4203055686756456, + "learning_rate": 1.1738081394311932e-06, + "loss": 0.6649, + "step": 1022 + }, + { + "epoch": 0.46159052453468696, + "grad_norm": 3.3102903920545232, + "learning_rate": 1.172368318971686e-06, + "loss": 0.5006, + "step": 1023 + }, + { + "epoch": 0.4620417371686407, + "grad_norm": 3.083071988078867, + "learning_rate": 1.170928130142154e-06, + "loss": 0.4744, + "step": 1024 + }, + { + "epoch": 0.4624929498025945, + "grad_norm": 3.348114186262164, + "learning_rate": 1.169487576020439e-06, + "loss": 0.5852, + "step": 1025 + }, + { + "epoch": 0.46294416243654823, + "grad_norm": 3.5395494913322696, + "learning_rate": 1.1680466596851635e-06, + "loss": 0.5319, + "step": 1026 + }, + { + "epoch": 0.463395375070502, + "grad_norm": 3.5359128839904366, + "learning_rate": 1.1666053842157232e-06, + "loss": 0.5107, + "step": 1027 + }, + { + "epoch": 0.46384658770445575, + "grad_norm": 2.6657070438945185, + "learning_rate": 1.165163752692283e-06, + "loss": 0.5478, + "step": 1028 + }, + { + "epoch": 0.46429780033840945, + "grad_norm": 3.0149910558712243, + "learning_rate": 1.1637217681957673e-06, + "loss": 0.5872, + "step": 1029 + }, + { + "epoch": 0.4647490129723632, + "grad_norm": 3.159458756633235, + "learning_rate": 1.1622794338078552e-06, + "loss": 0.706, + "step": 1030 + }, + { + "epoch": 0.46520022560631696, + "grad_norm": 3.0885082043510654, + "learning_rate": 1.1608367526109736e-06, + "loss": 0.593, + "step": 1031 + }, + { + "epoch": 0.4656514382402707, + "grad_norm": 2.9895356676399625, + "learning_rate": 1.159393727688291e-06, + "loss": 0.5294, + "step": 1032 + }, + { + "epoch": 0.4661026508742245, + "grad_norm": 3.2775855911044594, + "learning_rate": 1.1579503621237101e-06, + "loss": 0.4458, + "step": 1033 + }, + { + "epoch": 0.46655386350817823, + "grad_norm": 3.1259902784446894, + "learning_rate": 1.1565066590018613e-06, + "loss": 0.4544, + "step": 1034 + }, + { + "epoch": 0.467005076142132, + "grad_norm": 3.099407676118007, + "learning_rate": 1.1550626214080965e-06, + "loss": 0.5772, + "step": 1035 + }, + { + "epoch": 0.46745628877608575, + "grad_norm": 2.9661310625580333, + "learning_rate": 1.1536182524284833e-06, + "loss": 0.4604, + "step": 1036 + }, + { + "epoch": 0.4679075014100395, + "grad_norm": 3.3426717260171976, + "learning_rate": 1.1521735551497966e-06, + "loss": 0.6421, + "step": 1037 + }, + { + "epoch": 0.4683587140439932, + "grad_norm": 2.976762713490125, + "learning_rate": 1.1507285326595126e-06, + "loss": 0.597, + "step": 1038 + }, + { + "epoch": 0.46880992667794696, + "grad_norm": 3.30725401073655, + "learning_rate": 1.1492831880458037e-06, + "loss": 0.6313, + "step": 1039 + }, + { + "epoch": 0.4692611393119007, + "grad_norm": 2.9639877946890327, + "learning_rate": 1.1478375243975295e-06, + "loss": 0.4938, + "step": 1040 + }, + { + "epoch": 0.4697123519458545, + "grad_norm": 2.786610484320687, + "learning_rate": 1.1463915448042326e-06, + "loss": 0.4515, + "step": 1041 + }, + { + "epoch": 0.47016356457980824, + "grad_norm": 3.135745692122038, + "learning_rate": 1.1449452523561294e-06, + "loss": 0.5554, + "step": 1042 + }, + { + "epoch": 0.470614777213762, + "grad_norm": 2.788435273509232, + "learning_rate": 1.143498650144106e-06, + "loss": 0.4914, + "step": 1043 + }, + { + "epoch": 0.47106598984771575, + "grad_norm": 2.876792135265133, + "learning_rate": 1.1420517412597105e-06, + "loss": 0.4697, + "step": 1044 + }, + { + "epoch": 0.4715172024816695, + "grad_norm": 3.5477556615090093, + "learning_rate": 1.1406045287951457e-06, + "loss": 0.5816, + "step": 1045 + }, + { + "epoch": 0.47196841511562326, + "grad_norm": 2.92573936243044, + "learning_rate": 1.1391570158432635e-06, + "loss": 0.4241, + "step": 1046 + }, + { + "epoch": 0.47241962774957696, + "grad_norm": 3.2379205245375347, + "learning_rate": 1.1377092054975583e-06, + "loss": 0.5848, + "step": 1047 + }, + { + "epoch": 0.4728708403835307, + "grad_norm": 3.2602568198273736, + "learning_rate": 1.1362611008521596e-06, + "loss": 0.5032, + "step": 1048 + }, + { + "epoch": 0.4733220530174845, + "grad_norm": 3.563180746954425, + "learning_rate": 1.134812705001826e-06, + "loss": 0.6199, + "step": 1049 + }, + { + "epoch": 0.47377326565143824, + "grad_norm": 3.258258686215829, + "learning_rate": 1.1333640210419386e-06, + "loss": 0.6113, + "step": 1050 + }, + { + "epoch": 0.474224478285392, + "grad_norm": 3.2628377254999297, + "learning_rate": 1.1319150520684944e-06, + "loss": 0.5243, + "step": 1051 + }, + { + "epoch": 0.47467569091934575, + "grad_norm": 3.027101046392854, + "learning_rate": 1.1304658011780984e-06, + "loss": 0.5171, + "step": 1052 + }, + { + "epoch": 0.4751269035532995, + "grad_norm": 3.649012114499818, + "learning_rate": 1.1290162714679594e-06, + "loss": 0.6611, + "step": 1053 + }, + { + "epoch": 0.47557811618725326, + "grad_norm": 3.1689567292584266, + "learning_rate": 1.1275664660358817e-06, + "loss": 0.3962, + "step": 1054 + }, + { + "epoch": 0.476029328821207, + "grad_norm": 2.92166316898993, + "learning_rate": 1.1261163879802587e-06, + "loss": 0.4777, + "step": 1055 + }, + { + "epoch": 0.4764805414551607, + "grad_norm": 3.137056987986139, + "learning_rate": 1.1246660404000658e-06, + "loss": 0.5466, + "step": 1056 + }, + { + "epoch": 0.4769317540891145, + "grad_norm": 3.382936110474769, + "learning_rate": 1.1232154263948556e-06, + "loss": 0.5077, + "step": 1057 + }, + { + "epoch": 0.47738296672306824, + "grad_norm": 3.517217173416557, + "learning_rate": 1.1217645490647494e-06, + "loss": 0.5492, + "step": 1058 + }, + { + "epoch": 0.477834179357022, + "grad_norm": 3.26075070325087, + "learning_rate": 1.1203134115104315e-06, + "loss": 0.5016, + "step": 1059 + }, + { + "epoch": 0.47828539199097575, + "grad_norm": 3.2324066506117, + "learning_rate": 1.1188620168331419e-06, + "loss": 0.5466, + "step": 1060 + }, + { + "epoch": 0.4787366046249295, + "grad_norm": 2.9767488512886975, + "learning_rate": 1.1174103681346708e-06, + "loss": 0.5206, + "step": 1061 + }, + { + "epoch": 0.47918781725888326, + "grad_norm": 3.1883816288556166, + "learning_rate": 1.1159584685173505e-06, + "loss": 0.5209, + "step": 1062 + }, + { + "epoch": 0.479639029892837, + "grad_norm": 3.2725897850315304, + "learning_rate": 1.11450632108405e-06, + "loss": 0.6575, + "step": 1063 + }, + { + "epoch": 0.4800902425267907, + "grad_norm": 2.90226955653477, + "learning_rate": 1.113053928938168e-06, + "loss": 0.526, + "step": 1064 + }, + { + "epoch": 0.4805414551607445, + "grad_norm": 3.2118769622065724, + "learning_rate": 1.1116012951836255e-06, + "loss": 0.5494, + "step": 1065 + }, + { + "epoch": 0.48099266779469824, + "grad_norm": 2.9794708120012996, + "learning_rate": 1.110148422924861e-06, + "loss": 0.6493, + "step": 1066 + }, + { + "epoch": 0.481443880428652, + "grad_norm": 3.36225003487691, + "learning_rate": 1.1086953152668216e-06, + "loss": 0.5022, + "step": 1067 + }, + { + "epoch": 0.48189509306260575, + "grad_norm": 3.2803450847662754, + "learning_rate": 1.1072419753149585e-06, + "loss": 0.6551, + "step": 1068 + }, + { + "epoch": 0.4823463056965595, + "grad_norm": 3.210329635697201, + "learning_rate": 1.1057884061752176e-06, + "loss": 0.5731, + "step": 1069 + }, + { + "epoch": 0.48279751833051326, + "grad_norm": 3.5448326400292514, + "learning_rate": 1.1043346109540369e-06, + "loss": 0.5602, + "step": 1070 + }, + { + "epoch": 0.483248730964467, + "grad_norm": 3.2599148247025354, + "learning_rate": 1.102880592758336e-06, + "loss": 0.6967, + "step": 1071 + }, + { + "epoch": 0.4836999435984208, + "grad_norm": 3.921908291356212, + "learning_rate": 1.1014263546955115e-06, + "loss": 0.6903, + "step": 1072 + }, + { + "epoch": 0.4841511562323745, + "grad_norm": 3.6723472952642098, + "learning_rate": 1.0999718998734298e-06, + "loss": 0.7113, + "step": 1073 + }, + { + "epoch": 0.48460236886632824, + "grad_norm": 3.1199912636971767, + "learning_rate": 1.0985172314004203e-06, + "loss": 0.5092, + "step": 1074 + }, + { + "epoch": 0.485053581500282, + "grad_norm": 3.084748288084798, + "learning_rate": 1.0970623523852698e-06, + "loss": 0.5553, + "step": 1075 + }, + { + "epoch": 0.48550479413423575, + "grad_norm": 2.8776564795067556, + "learning_rate": 1.0956072659372141e-06, + "loss": 0.5115, + "step": 1076 + }, + { + "epoch": 0.4859560067681895, + "grad_norm": 2.953441401470534, + "learning_rate": 1.094151975165933e-06, + "loss": 0.6507, + "step": 1077 + }, + { + "epoch": 0.48640721940214326, + "grad_norm": 3.690371591944593, + "learning_rate": 1.0926964831815424e-06, + "loss": 0.6129, + "step": 1078 + }, + { + "epoch": 0.486858432036097, + "grad_norm": 2.9774725768337387, + "learning_rate": 1.0912407930945887e-06, + "loss": 0.6598, + "step": 1079 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 3.304695556663853, + "learning_rate": 1.0897849080160411e-06, + "loss": 0.596, + "step": 1080 + }, + { + "epoch": 0.48776085730400454, + "grad_norm": 3.5963573435316367, + "learning_rate": 1.0883288310572862e-06, + "loss": 0.6246, + "step": 1081 + }, + { + "epoch": 0.48821206993795824, + "grad_norm": 3.155896819738853, + "learning_rate": 1.0868725653301205e-06, + "loss": 0.5807, + "step": 1082 + }, + { + "epoch": 0.488663282571912, + "grad_norm": 3.081080659400599, + "learning_rate": 1.0854161139467435e-06, + "loss": 0.434, + "step": 1083 + }, + { + "epoch": 0.48911449520586575, + "grad_norm": 3.106854377639036, + "learning_rate": 1.0839594800197516e-06, + "loss": 0.48, + "step": 1084 + }, + { + "epoch": 0.4895657078398195, + "grad_norm": 3.7755265948586825, + "learning_rate": 1.082502666662132e-06, + "loss": 0.6046, + "step": 1085 + }, + { + "epoch": 0.49001692047377327, + "grad_norm": 3.1538567782545153, + "learning_rate": 1.0810456769872542e-06, + "loss": 0.4631, + "step": 1086 + }, + { + "epoch": 0.490468133107727, + "grad_norm": 3.493792209571887, + "learning_rate": 1.0795885141088652e-06, + "loss": 0.4979, + "step": 1087 + }, + { + "epoch": 0.4909193457416808, + "grad_norm": 3.2677544727924386, + "learning_rate": 1.0781311811410825e-06, + "loss": 0.4784, + "step": 1088 + }, + { + "epoch": 0.49137055837563454, + "grad_norm": 3.2385254672960335, + "learning_rate": 1.0766736811983863e-06, + "loss": 0.6579, + "step": 1089 + }, + { + "epoch": 0.4918217710095883, + "grad_norm": 3.0410160191967424, + "learning_rate": 1.0752160173956144e-06, + "loss": 0.4921, + "step": 1090 + }, + { + "epoch": 0.492272983643542, + "grad_norm": 3.0814337139592958, + "learning_rate": 1.0737581928479538e-06, + "loss": 0.4993, + "step": 1091 + }, + { + "epoch": 0.49272419627749575, + "grad_norm": 3.1302147016084336, + "learning_rate": 1.0723002106709363e-06, + "loss": 0.4858, + "step": 1092 + }, + { + "epoch": 0.4931754089114495, + "grad_norm": 3.2032365657032846, + "learning_rate": 1.0708420739804294e-06, + "loss": 0.4662, + "step": 1093 + }, + { + "epoch": 0.49362662154540327, + "grad_norm": 2.963453928384545, + "learning_rate": 1.0693837858926315e-06, + "loss": 0.4786, + "step": 1094 + }, + { + "epoch": 0.494077834179357, + "grad_norm": 2.8479265985444835, + "learning_rate": 1.0679253495240645e-06, + "loss": 0.6417, + "step": 1095 + }, + { + "epoch": 0.4945290468133108, + "grad_norm": 3.8316009838064153, + "learning_rate": 1.066466767991567e-06, + "loss": 0.6002, + "step": 1096 + }, + { + "epoch": 0.49498025944726454, + "grad_norm": 3.13289524958788, + "learning_rate": 1.0650080444122875e-06, + "loss": 0.6372, + "step": 1097 + }, + { + "epoch": 0.4954314720812183, + "grad_norm": 3.2639667702613253, + "learning_rate": 1.0635491819036792e-06, + "loss": 0.6167, + "step": 1098 + }, + { + "epoch": 0.49588268471517205, + "grad_norm": 3.049433076188395, + "learning_rate": 1.0620901835834912e-06, + "loss": 0.5683, + "step": 1099 + }, + { + "epoch": 0.49633389734912575, + "grad_norm": 3.387030164756313, + "learning_rate": 1.0606310525697627e-06, + "loss": 0.6261, + "step": 1100 + }, + { + "epoch": 0.4967851099830795, + "grad_norm": 3.402149165888981, + "learning_rate": 1.059171791980817e-06, + "loss": 0.4562, + "step": 1101 + }, + { + "epoch": 0.49723632261703327, + "grad_norm": 3.2602762733367996, + "learning_rate": 1.0577124049352548e-06, + "loss": 0.626, + "step": 1102 + }, + { + "epoch": 0.497687535250987, + "grad_norm": 3.2610142828869777, + "learning_rate": 1.0562528945519461e-06, + "loss": 0.5353, + "step": 1103 + }, + { + "epoch": 0.4981387478849408, + "grad_norm": 2.97682798873228, + "learning_rate": 1.0547932639500246e-06, + "loss": 0.4905, + "step": 1104 + }, + { + "epoch": 0.49858996051889454, + "grad_norm": 2.938963109880475, + "learning_rate": 1.0533335162488815e-06, + "loss": 0.4562, + "step": 1105 + }, + { + "epoch": 0.4990411731528483, + "grad_norm": 3.5017181297984012, + "learning_rate": 1.051873654568158e-06, + "loss": 0.5793, + "step": 1106 + }, + { + "epoch": 0.49949238578680205, + "grad_norm": 2.7407269861856736, + "learning_rate": 1.0504136820277384e-06, + "loss": 0.5171, + "step": 1107 + }, + { + "epoch": 0.4999435984207558, + "grad_norm": 3.2128618819706114, + "learning_rate": 1.0489536017477448e-06, + "loss": 0.4742, + "step": 1108 + }, + { + "epoch": 0.5003948110547095, + "grad_norm": 3.2860218544233737, + "learning_rate": 1.0474934168485288e-06, + "loss": 0.5809, + "step": 1109 + }, + { + "epoch": 0.5008460236886633, + "grad_norm": 3.1537779519919162, + "learning_rate": 1.0460331304506655e-06, + "loss": 0.5523, + "step": 1110 + }, + { + "epoch": 0.501297236322617, + "grad_norm": 2.9629163520820225, + "learning_rate": 1.0445727456749483e-06, + "loss": 0.5112, + "step": 1111 + }, + { + "epoch": 0.5017484489565708, + "grad_norm": 3.197935737383708, + "learning_rate": 1.043112265642379e-06, + "loss": 0.4732, + "step": 1112 + }, + { + "epoch": 0.5021996615905245, + "grad_norm": 2.9033918431675922, + "learning_rate": 1.041651693474164e-06, + "loss": 0.4971, + "step": 1113 + }, + { + "epoch": 0.5026508742244783, + "grad_norm": 3.1684252831494213, + "learning_rate": 1.0401910322917064e-06, + "loss": 0.5758, + "step": 1114 + }, + { + "epoch": 0.503102086858432, + "grad_norm": 2.980345483564053, + "learning_rate": 1.0387302852165998e-06, + "loss": 0.4851, + "step": 1115 + }, + { + "epoch": 0.5035532994923858, + "grad_norm": 3.36987317517813, + "learning_rate": 1.037269455370621e-06, + "loss": 0.5245, + "step": 1116 + }, + { + "epoch": 0.5040045121263396, + "grad_norm": 3.2674734863325945, + "learning_rate": 1.035808545875723e-06, + "loss": 0.587, + "step": 1117 + }, + { + "epoch": 0.5044557247602933, + "grad_norm": 3.089508629837757, + "learning_rate": 1.0343475598540307e-06, + "loss": 0.4585, + "step": 1118 + }, + { + "epoch": 0.5049069373942471, + "grad_norm": 3.058502135032177, + "learning_rate": 1.0328865004278315e-06, + "loss": 0.62, + "step": 1119 + }, + { + "epoch": 0.5053581500282008, + "grad_norm": 3.2132492413810305, + "learning_rate": 1.0314253707195703e-06, + "loss": 0.5984, + "step": 1120 + }, + { + "epoch": 0.5058093626621546, + "grad_norm": 2.9972219397833637, + "learning_rate": 1.0299641738518405e-06, + "loss": 0.6045, + "step": 1121 + }, + { + "epoch": 0.5062605752961082, + "grad_norm": 2.809038897639686, + "learning_rate": 1.0285029129473813e-06, + "loss": 0.5555, + "step": 1122 + }, + { + "epoch": 0.506711787930062, + "grad_norm": 2.659291698186336, + "learning_rate": 1.0270415911290671e-06, + "loss": 0.5977, + "step": 1123 + }, + { + "epoch": 0.5071630005640158, + "grad_norm": 3.0309037627722706, + "learning_rate": 1.0255802115199032e-06, + "loss": 0.5541, + "step": 1124 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 3.0510921385148646, + "learning_rate": 1.024118777243019e-06, + "loss": 0.6471, + "step": 1125 + }, + { + "epoch": 0.5080654258319233, + "grad_norm": 3.3253019574272176, + "learning_rate": 1.022657291421659e-06, + "loss": 0.5083, + "step": 1126 + }, + { + "epoch": 0.508516638465877, + "grad_norm": 3.1624347051422115, + "learning_rate": 1.0211957571791795e-06, + "loss": 0.5804, + "step": 1127 + }, + { + "epoch": 0.5089678510998308, + "grad_norm": 3.082831702328242, + "learning_rate": 1.019734177639039e-06, + "loss": 0.5742, + "step": 1128 + }, + { + "epoch": 0.5094190637337845, + "grad_norm": 2.855577550248334, + "learning_rate": 1.0182725559247945e-06, + "loss": 0.5888, + "step": 1129 + }, + { + "epoch": 0.5098702763677383, + "grad_norm": 3.5077212889613967, + "learning_rate": 1.0168108951600915e-06, + "loss": 0.5265, + "step": 1130 + }, + { + "epoch": 0.510321489001692, + "grad_norm": 3.196038174031813, + "learning_rate": 1.0153491984686593e-06, + "loss": 0.5447, + "step": 1131 + }, + { + "epoch": 0.5107727016356458, + "grad_norm": 3.0228322304609083, + "learning_rate": 1.0138874689743047e-06, + "loss": 0.5442, + "step": 1132 + }, + { + "epoch": 0.5112239142695996, + "grad_norm": 3.2815180852833095, + "learning_rate": 1.0124257098009042e-06, + "loss": 0.5194, + "step": 1133 + }, + { + "epoch": 0.5116751269035533, + "grad_norm": 2.9460400536573315, + "learning_rate": 1.0109639240723973e-06, + "loss": 0.5793, + "step": 1134 + }, + { + "epoch": 0.5121263395375071, + "grad_norm": 3.2088130319211015, + "learning_rate": 1.0095021149127806e-06, + "loss": 0.554, + "step": 1135 + }, + { + "epoch": 0.5125775521714608, + "grad_norm": 3.3804661649102465, + "learning_rate": 1.0080402854461007e-06, + "loss": 0.5165, + "step": 1136 + }, + { + "epoch": 0.5130287648054146, + "grad_norm": 3.321371296451192, + "learning_rate": 1.0065784387964485e-06, + "loss": 0.4321, + "step": 1137 + }, + { + "epoch": 0.5134799774393684, + "grad_norm": 3.037664620181153, + "learning_rate": 1.0051165780879503e-06, + "loss": 0.5399, + "step": 1138 + }, + { + "epoch": 0.5139311900733221, + "grad_norm": 3.556502110355346, + "learning_rate": 1.0036547064447622e-06, + "loss": 0.5263, + "step": 1139 + }, + { + "epoch": 0.5143824027072758, + "grad_norm": 3.281961668919357, + "learning_rate": 1.0021928269910657e-06, + "loss": 0.5017, + "step": 1140 + }, + { + "epoch": 0.5148336153412295, + "grad_norm": 4.422821086167055, + "learning_rate": 1.0007309428510568e-06, + "loss": 0.6125, + "step": 1141 + }, + { + "epoch": 0.5152848279751833, + "grad_norm": 3.042185816714922, + "learning_rate": 9.992690571489431e-07, + "loss": 0.4608, + "step": 1142 + }, + { + "epoch": 0.515736040609137, + "grad_norm": 3.139974486773792, + "learning_rate": 9.978071730089344e-07, + "loss": 0.4999, + "step": 1143 + }, + { + "epoch": 0.5161872532430908, + "grad_norm": 3.050392508261623, + "learning_rate": 9.963452935552377e-07, + "loss": 0.5472, + "step": 1144 + }, + { + "epoch": 0.5166384658770445, + "grad_norm": 3.1524033877265336, + "learning_rate": 9.948834219120498e-07, + "loss": 0.5617, + "step": 1145 + }, + { + "epoch": 0.5170896785109983, + "grad_norm": 3.045308188934204, + "learning_rate": 9.934215612035514e-07, + "loss": 0.4082, + "step": 1146 + }, + { + "epoch": 0.517540891144952, + "grad_norm": 3.3228413802611225, + "learning_rate": 9.91959714553899e-07, + "loss": 0.5028, + "step": 1147 + }, + { + "epoch": 0.5179921037789058, + "grad_norm": 2.9738895566428285, + "learning_rate": 9.904978850872191e-07, + "loss": 0.6102, + "step": 1148 + }, + { + "epoch": 0.5184433164128596, + "grad_norm": 3.164313244153204, + "learning_rate": 9.89036075927603e-07, + "loss": 0.541, + "step": 1149 + }, + { + "epoch": 0.5188945290468133, + "grad_norm": 3.0993846216386998, + "learning_rate": 9.87574290199096e-07, + "loss": 0.5887, + "step": 1150 + }, + { + "epoch": 0.5193457416807671, + "grad_norm": 3.0457956258147503, + "learning_rate": 9.861125310256954e-07, + "loss": 0.518, + "step": 1151 + }, + { + "epoch": 0.5197969543147208, + "grad_norm": 3.487761052187324, + "learning_rate": 9.846508015313406e-07, + "loss": 0.5634, + "step": 1152 + }, + { + "epoch": 0.5202481669486746, + "grad_norm": 3.0958153646337654, + "learning_rate": 9.831891048399084e-07, + "loss": 0.486, + "step": 1153 + }, + { + "epoch": 0.5206993795826284, + "grad_norm": 2.849378458901456, + "learning_rate": 9.817274440752052e-07, + "loss": 0.4714, + "step": 1154 + }, + { + "epoch": 0.5211505922165821, + "grad_norm": 3.0204334797341867, + "learning_rate": 9.802658223609608e-07, + "loss": 0.5409, + "step": 1155 + }, + { + "epoch": 0.5216018048505359, + "grad_norm": 3.4389615956259414, + "learning_rate": 9.78804242820821e-07, + "loss": 0.5677, + "step": 1156 + }, + { + "epoch": 0.5220530174844895, + "grad_norm": 3.079047759993495, + "learning_rate": 9.773427085783413e-07, + "loss": 0.5837, + "step": 1157 + }, + { + "epoch": 0.5225042301184433, + "grad_norm": 3.2180585801181407, + "learning_rate": 9.758812227569812e-07, + "loss": 0.6249, + "step": 1158 + }, + { + "epoch": 0.522955442752397, + "grad_norm": 3.106738311935934, + "learning_rate": 9.744197884800967e-07, + "loss": 0.5928, + "step": 1159 + }, + { + "epoch": 0.5234066553863508, + "grad_norm": 3.0296372900818676, + "learning_rate": 9.72958408870933e-07, + "loss": 0.6287, + "step": 1160 + }, + { + "epoch": 0.5238578680203045, + "grad_norm": 3.2657856406977164, + "learning_rate": 9.714970870526186e-07, + "loss": 0.4556, + "step": 1161 + }, + { + "epoch": 0.5243090806542583, + "grad_norm": 2.8714718085250497, + "learning_rate": 9.700358261481592e-07, + "loss": 0.5133, + "step": 1162 + }, + { + "epoch": 0.524760293288212, + "grad_norm": 3.1929538694444353, + "learning_rate": 9.6857462928043e-07, + "loss": 0.5365, + "step": 1163 + }, + { + "epoch": 0.5252115059221658, + "grad_norm": 3.1374361855603725, + "learning_rate": 9.671134995721684e-07, + "loss": 0.5009, + "step": 1164 + }, + { + "epoch": 0.5256627185561196, + "grad_norm": 3.0612765295251614, + "learning_rate": 9.656524401459692e-07, + "loss": 0.4686, + "step": 1165 + }, + { + "epoch": 0.5261139311900733, + "grad_norm": 3.0360262730106653, + "learning_rate": 9.64191454124277e-07, + "loss": 0.5078, + "step": 1166 + }, + { + "epoch": 0.5265651438240271, + "grad_norm": 2.7855335523075895, + "learning_rate": 9.62730544629379e-07, + "loss": 0.4643, + "step": 1167 + }, + { + "epoch": 0.5270163564579808, + "grad_norm": 3.2784285165726326, + "learning_rate": 9.612697147834003e-07, + "loss": 0.5212, + "step": 1168 + }, + { + "epoch": 0.5274675690919346, + "grad_norm": 2.9731466081932125, + "learning_rate": 9.598089677082933e-07, + "loss": 0.5892, + "step": 1169 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 3.496014297140297, + "learning_rate": 9.583483065258363e-07, + "loss": 0.5361, + "step": 1170 + }, + { + "epoch": 0.5283699943598421, + "grad_norm": 3.2554831703742027, + "learning_rate": 9.568877343576212e-07, + "loss": 0.5437, + "step": 1171 + }, + { + "epoch": 0.5288212069937959, + "grad_norm": 3.0774362338181307, + "learning_rate": 9.554272543250516e-07, + "loss": 0.5781, + "step": 1172 + }, + { + "epoch": 0.5292724196277496, + "grad_norm": 3.189104546167665, + "learning_rate": 9.539668695493344e-07, + "loss": 0.4621, + "step": 1173 + }, + { + "epoch": 0.5297236322617034, + "grad_norm": 3.5804034846331607, + "learning_rate": 9.525065831514713e-07, + "loss": 0.5667, + "step": 1174 + }, + { + "epoch": 0.530174844895657, + "grad_norm": 3.0139310674345934, + "learning_rate": 9.510463982522551e-07, + "loss": 0.6764, + "step": 1175 + }, + { + "epoch": 0.5306260575296108, + "grad_norm": 3.6437122981783507, + "learning_rate": 9.495863179722615e-07, + "loss": 0.5191, + "step": 1176 + }, + { + "epoch": 0.5310772701635645, + "grad_norm": 3.780802968968682, + "learning_rate": 9.481263454318421e-07, + "loss": 0.5795, + "step": 1177 + }, + { + "epoch": 0.5315284827975183, + "grad_norm": 3.371236920516789, + "learning_rate": 9.466664837511186e-07, + "loss": 0.4982, + "step": 1178 + }, + { + "epoch": 0.531979695431472, + "grad_norm": 3.174724613560376, + "learning_rate": 9.452067360499753e-07, + "loss": 0.5753, + "step": 1179 + }, + { + "epoch": 0.5324309080654258, + "grad_norm": 2.9536643600933283, + "learning_rate": 9.437471054480539e-07, + "loss": 0.6468, + "step": 1180 + }, + { + "epoch": 0.5328821206993796, + "grad_norm": 3.358069442011158, + "learning_rate": 9.422875950647451e-07, + "loss": 0.5811, + "step": 1181 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 3.1595218350137357, + "learning_rate": 9.408282080191828e-07, + "loss": 0.5539, + "step": 1182 + }, + { + "epoch": 0.5337845459672871, + "grad_norm": 3.1013329069241844, + "learning_rate": 9.393689474302376e-07, + "loss": 0.4659, + "step": 1183 + }, + { + "epoch": 0.5342357586012408, + "grad_norm": 3.743900056060665, + "learning_rate": 9.379098164165092e-07, + "loss": 0.6178, + "step": 1184 + }, + { + "epoch": 0.5346869712351946, + "grad_norm": 3.10516339656473, + "learning_rate": 9.364508180963207e-07, + "loss": 0.5486, + "step": 1185 + }, + { + "epoch": 0.5351381838691484, + "grad_norm": 3.7247912121292965, + "learning_rate": 9.349919555877124e-07, + "loss": 0.6168, + "step": 1186 + }, + { + "epoch": 0.5355893965031021, + "grad_norm": 3.523392877840991, + "learning_rate": 9.335332320084331e-07, + "loss": 0.6578, + "step": 1187 + }, + { + "epoch": 0.5360406091370559, + "grad_norm": 3.029040493416117, + "learning_rate": 9.320746504759354e-07, + "loss": 0.6227, + "step": 1188 + }, + { + "epoch": 0.5364918217710096, + "grad_norm": 3.494834853466634, + "learning_rate": 9.306162141073685e-07, + "loss": 0.6807, + "step": 1189 + }, + { + "epoch": 0.5369430344049634, + "grad_norm": 3.149740148496557, + "learning_rate": 9.291579260195708e-07, + "loss": 0.5903, + "step": 1190 + }, + { + "epoch": 0.5373942470389171, + "grad_norm": 3.267822705572205, + "learning_rate": 9.27699789329064e-07, + "loss": 0.4847, + "step": 1191 + }, + { + "epoch": 0.5378454596728709, + "grad_norm": 3.714908029228556, + "learning_rate": 9.262418071520463e-07, + "loss": 0.6019, + "step": 1192 + }, + { + "epoch": 0.5382966723068245, + "grad_norm": 2.824698681778885, + "learning_rate": 9.247839826043857e-07, + "loss": 0.4482, + "step": 1193 + }, + { + "epoch": 0.5387478849407783, + "grad_norm": 3.151682454847607, + "learning_rate": 9.233263188016138e-07, + "loss": 0.4968, + "step": 1194 + }, + { + "epoch": 0.5391990975747321, + "grad_norm": 2.9790615764183355, + "learning_rate": 9.218688188589174e-07, + "loss": 0.5278, + "step": 1195 + }, + { + "epoch": 0.5396503102086858, + "grad_norm": 2.9627521296692474, + "learning_rate": 9.204114858911346e-07, + "loss": 0.5431, + "step": 1196 + }, + { + "epoch": 0.5401015228426396, + "grad_norm": 2.992708202976521, + "learning_rate": 9.189543230127461e-07, + "loss": 0.5062, + "step": 1197 + }, + { + "epoch": 0.5405527354765933, + "grad_norm": 2.8085674068135016, + "learning_rate": 9.174973333378683e-07, + "loss": 0.5778, + "step": 1198 + }, + { + "epoch": 0.5410039481105471, + "grad_norm": 3.324978073123737, + "learning_rate": 9.160405199802486e-07, + "loss": 0.5476, + "step": 1199 + }, + { + "epoch": 0.5414551607445008, + "grad_norm": 2.6559214068968937, + "learning_rate": 9.145838860532567e-07, + "loss": 0.5372, + "step": 1200 + }, + { + "epoch": 0.5419063733784546, + "grad_norm": 2.9815500519398763, + "learning_rate": 9.131274346698795e-07, + "loss": 0.5114, + "step": 1201 + }, + { + "epoch": 0.5423575860124084, + "grad_norm": 3.4127009993684196, + "learning_rate": 9.116711689427136e-07, + "loss": 0.6166, + "step": 1202 + }, + { + "epoch": 0.5428087986463621, + "grad_norm": 3.226502001346105, + "learning_rate": 9.102150919839589e-07, + "loss": 0.5777, + "step": 1203 + }, + { + "epoch": 0.5432600112803159, + "grad_norm": 3.2761488720411585, + "learning_rate": 9.087592069054118e-07, + "loss": 0.4045, + "step": 1204 + }, + { + "epoch": 0.5437112239142696, + "grad_norm": 3.2522550379255803, + "learning_rate": 9.073035168184579e-07, + "loss": 0.5614, + "step": 1205 + }, + { + "epoch": 0.5441624365482234, + "grad_norm": 3.158338840928273, + "learning_rate": 9.058480248340671e-07, + "loss": 0.5347, + "step": 1206 + }, + { + "epoch": 0.5446136491821771, + "grad_norm": 3.128846961573522, + "learning_rate": 9.043927340627857e-07, + "loss": 0.4551, + "step": 1207 + }, + { + "epoch": 0.5450648618161309, + "grad_norm": 3.2010639237879333, + "learning_rate": 9.029376476147301e-07, + "loss": 0.6002, + "step": 1208 + }, + { + "epoch": 0.5455160744500847, + "grad_norm": 3.0281082746251857, + "learning_rate": 9.014827685995794e-07, + "loss": 0.7246, + "step": 1209 + }, + { + "epoch": 0.5459672870840383, + "grad_norm": 3.011888432511697, + "learning_rate": 9.0002810012657e-07, + "loss": 0.5268, + "step": 1210 + }, + { + "epoch": 0.5464184997179921, + "grad_norm": 3.2243121571383386, + "learning_rate": 8.985736453044886e-07, + "loss": 0.5456, + "step": 1211 + }, + { + "epoch": 0.5468697123519458, + "grad_norm": 3.0825653650623126, + "learning_rate": 8.97119407241664e-07, + "loss": 0.5749, + "step": 1212 + }, + { + "epoch": 0.5473209249858996, + "grad_norm": 2.661130517981536, + "learning_rate": 8.956653890459632e-07, + "loss": 0.5596, + "step": 1213 + }, + { + "epoch": 0.5477721376198533, + "grad_norm": 3.724775101530356, + "learning_rate": 8.942115938247823e-07, + "loss": 0.5477, + "step": 1214 + }, + { + "epoch": 0.5482233502538071, + "grad_norm": 3.6650788579826585, + "learning_rate": 8.927580246850416e-07, + "loss": 0.4938, + "step": 1215 + }, + { + "epoch": 0.5486745628877608, + "grad_norm": 3.139755611427215, + "learning_rate": 8.913046847331784e-07, + "loss": 0.6102, + "step": 1216 + }, + { + "epoch": 0.5491257755217146, + "grad_norm": 2.9266705632636856, + "learning_rate": 8.89851577075139e-07, + "loss": 0.5387, + "step": 1217 + }, + { + "epoch": 0.5495769881556684, + "grad_norm": 3.033908297559131, + "learning_rate": 8.883987048163746e-07, + "loss": 0.4825, + "step": 1218 + }, + { + "epoch": 0.5500282007896221, + "grad_norm": 2.961499235356465, + "learning_rate": 8.869460710618323e-07, + "loss": 0.5537, + "step": 1219 + }, + { + "epoch": 0.5504794134235759, + "grad_norm": 2.58750526796041, + "learning_rate": 8.8549367891595e-07, + "loss": 0.414, + "step": 1220 + }, + { + "epoch": 0.5509306260575296, + "grad_norm": 3.2580895273085684, + "learning_rate": 8.840415314826496e-07, + "loss": 0.5591, + "step": 1221 + }, + { + "epoch": 0.5513818386914834, + "grad_norm": 3.416770692667306, + "learning_rate": 8.825896318653292e-07, + "loss": 0.5914, + "step": 1222 + }, + { + "epoch": 0.5518330513254371, + "grad_norm": 3.3868054346511385, + "learning_rate": 8.811379831668578e-07, + "loss": 0.5391, + "step": 1223 + }, + { + "epoch": 0.5522842639593909, + "grad_norm": 3.513167625744527, + "learning_rate": 8.796865884895685e-07, + "loss": 0.6352, + "step": 1224 + }, + { + "epoch": 0.5527354765933447, + "grad_norm": 3.2759348364567495, + "learning_rate": 8.782354509352505e-07, + "loss": 0.4788, + "step": 1225 + }, + { + "epoch": 0.5531866892272984, + "grad_norm": 3.16036405291083, + "learning_rate": 8.767845736051445e-07, + "loss": 0.5454, + "step": 1226 + }, + { + "epoch": 0.5536379018612522, + "grad_norm": 3.0291347992929563, + "learning_rate": 8.753339595999343e-07, + "loss": 0.4814, + "step": 1227 + }, + { + "epoch": 0.5540891144952058, + "grad_norm": 3.419410089774215, + "learning_rate": 8.738836120197414e-07, + "loss": 0.6572, + "step": 1228 + }, + { + "epoch": 0.5545403271291596, + "grad_norm": 3.4196948382062895, + "learning_rate": 8.724335339641183e-07, + "loss": 0.5833, + "step": 1229 + }, + { + "epoch": 0.5549915397631133, + "grad_norm": 2.95345812029554, + "learning_rate": 8.709837285320405e-07, + "loss": 0.5173, + "step": 1230 + }, + { + "epoch": 0.5554427523970671, + "grad_norm": 3.277178241020724, + "learning_rate": 8.695341988219013e-07, + "loss": 0.4623, + "step": 1231 + }, + { + "epoch": 0.5558939650310208, + "grad_norm": 3.4090915194634466, + "learning_rate": 8.68084947931506e-07, + "loss": 0.54, + "step": 1232 + }, + { + "epoch": 0.5563451776649746, + "grad_norm": 2.9424477792511072, + "learning_rate": 8.666359789580612e-07, + "loss": 0.4951, + "step": 1233 + }, + { + "epoch": 0.5567963902989284, + "grad_norm": 2.8093499614133166, + "learning_rate": 8.651872949981741e-07, + "loss": 0.5294, + "step": 1234 + }, + { + "epoch": 0.5572476029328821, + "grad_norm": 3.403220558325019, + "learning_rate": 8.637388991478404e-07, + "loss": 0.5359, + "step": 1235 + }, + { + "epoch": 0.5576988155668359, + "grad_norm": 2.7915279885504294, + "learning_rate": 8.622907945024417e-07, + "loss": 0.4772, + "step": 1236 + }, + { + "epoch": 0.5581500282007896, + "grad_norm": 3.402216398468153, + "learning_rate": 8.608429841567364e-07, + "loss": 0.6865, + "step": 1237 + }, + { + "epoch": 0.5586012408347434, + "grad_norm": 3.4495430024602993, + "learning_rate": 8.593954712048544e-07, + "loss": 0.5511, + "step": 1238 + }, + { + "epoch": 0.5590524534686971, + "grad_norm": 3.141295268567425, + "learning_rate": 8.579482587402899e-07, + "loss": 0.6404, + "step": 1239 + }, + { + "epoch": 0.5595036661026509, + "grad_norm": 3.5009079206917257, + "learning_rate": 8.565013498558941e-07, + "loss": 0.5031, + "step": 1240 + }, + { + "epoch": 0.5599548787366047, + "grad_norm": 3.4205180711589063, + "learning_rate": 8.550547476438708e-07, + "loss": 0.7429, + "step": 1241 + }, + { + "epoch": 0.5604060913705584, + "grad_norm": 2.773602345160653, + "learning_rate": 8.536084551957676e-07, + "loss": 0.5418, + "step": 1242 + }, + { + "epoch": 0.5608573040045122, + "grad_norm": 3.1366076676001704, + "learning_rate": 8.521624756024704e-07, + "loss": 0.554, + "step": 1243 + }, + { + "epoch": 0.5613085166384659, + "grad_norm": 3.1430067736085676, + "learning_rate": 8.507168119541963e-07, + "loss": 0.5359, + "step": 1244 + }, + { + "epoch": 0.5617597292724196, + "grad_norm": 3.3366605489268717, + "learning_rate": 8.492714673404871e-07, + "loss": 0.6201, + "step": 1245 + }, + { + "epoch": 0.5622109419063733, + "grad_norm": 2.9317529934404396, + "learning_rate": 8.478264448502036e-07, + "loss": 0.5387, + "step": 1246 + }, + { + "epoch": 0.5626621545403271, + "grad_norm": 2.8606734322020233, + "learning_rate": 8.463817475715168e-07, + "loss": 0.5679, + "step": 1247 + }, + { + "epoch": 0.5631133671742808, + "grad_norm": 2.921942814808396, + "learning_rate": 8.449373785919034e-07, + "loss": 0.6017, + "step": 1248 + }, + { + "epoch": 0.5635645798082346, + "grad_norm": 3.108017504971404, + "learning_rate": 8.434933409981389e-07, + "loss": 0.5762, + "step": 1249 + }, + { + "epoch": 0.5640157924421884, + "grad_norm": 3.2256170994042828, + "learning_rate": 8.420496378762899e-07, + "loss": 0.5252, + "step": 1250 + }, + { + "epoch": 0.5644670050761421, + "grad_norm": 2.733940948644607, + "learning_rate": 8.406062723117089e-07, + "loss": 0.4403, + "step": 1251 + }, + { + "epoch": 0.5649182177100959, + "grad_norm": 3.130827807611649, + "learning_rate": 8.391632473890262e-07, + "loss": 0.5186, + "step": 1252 + }, + { + "epoch": 0.5653694303440496, + "grad_norm": 2.8551274180587187, + "learning_rate": 8.377205661921452e-07, + "loss": 0.5046, + "step": 1253 + }, + { + "epoch": 0.5658206429780034, + "grad_norm": 3.0734293806965263, + "learning_rate": 8.36278231804233e-07, + "loss": 0.5113, + "step": 1254 + }, + { + "epoch": 0.5662718556119571, + "grad_norm": 3.0293954304204593, + "learning_rate": 8.348362473077169e-07, + "loss": 0.4135, + "step": 1255 + }, + { + "epoch": 0.5667230682459109, + "grad_norm": 3.2711429622012087, + "learning_rate": 8.333946157842767e-07, + "loss": 0.5911, + "step": 1256 + }, + { + "epoch": 0.5671742808798647, + "grad_norm": 3.21348983820472, + "learning_rate": 8.319533403148366e-07, + "loss": 0.5845, + "step": 1257 + }, + { + "epoch": 0.5676254935138184, + "grad_norm": 2.977221190242596, + "learning_rate": 8.305124239795608e-07, + "loss": 0.4975, + "step": 1258 + }, + { + "epoch": 0.5680767061477722, + "grad_norm": 3.317575862791347, + "learning_rate": 8.29071869857846e-07, + "loss": 0.5145, + "step": 1259 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 3.0078612380810217, + "learning_rate": 8.27631681028314e-07, + "loss": 0.4761, + "step": 1260 + }, + { + "epoch": 0.5689791314156797, + "grad_norm": 2.9469364940545684, + "learning_rate": 8.26191860568807e-07, + "loss": 0.4903, + "step": 1261 + }, + { + "epoch": 0.5694303440496334, + "grad_norm": 2.8641257568656804, + "learning_rate": 8.247524115563788e-07, + "loss": 0.4808, + "step": 1262 + }, + { + "epoch": 0.5698815566835871, + "grad_norm": 3.3655191743332313, + "learning_rate": 8.233133370672903e-07, + "loss": 0.684, + "step": 1263 + }, + { + "epoch": 0.5703327693175408, + "grad_norm": 3.2105020379563602, + "learning_rate": 8.218746401770021e-07, + "loss": 0.5674, + "step": 1264 + }, + { + "epoch": 0.5707839819514946, + "grad_norm": 3.5642628952355975, + "learning_rate": 8.204363239601668e-07, + "loss": 0.501, + "step": 1265 + }, + { + "epoch": 0.5712351945854484, + "grad_norm": 2.838113061969968, + "learning_rate": 8.189983914906248e-07, + "loss": 0.616, + "step": 1266 + }, + { + "epoch": 0.5716864072194021, + "grad_norm": 2.9737366718198626, + "learning_rate": 8.175608458413947e-07, + "loss": 0.576, + "step": 1267 + }, + { + "epoch": 0.5721376198533559, + "grad_norm": 2.7885730115959597, + "learning_rate": 8.161236900846702e-07, + "loss": 0.4278, + "step": 1268 + }, + { + "epoch": 0.5725888324873096, + "grad_norm": 3.4834364046330792, + "learning_rate": 8.146869272918108e-07, + "loss": 0.5024, + "step": 1269 + }, + { + "epoch": 0.5730400451212634, + "grad_norm": 3.0305631394109054, + "learning_rate": 8.132505605333361e-07, + "loss": 0.542, + "step": 1270 + }, + { + "epoch": 0.5734912577552171, + "grad_norm": 2.7776093782076705, + "learning_rate": 8.118145928789197e-07, + "loss": 0.382, + "step": 1271 + }, + { + "epoch": 0.5739424703891709, + "grad_norm": 2.982019078285388, + "learning_rate": 8.103790273973823e-07, + "loss": 0.5661, + "step": 1272 + }, + { + "epoch": 0.5743936830231247, + "grad_norm": 2.9581883979360155, + "learning_rate": 8.089438671566852e-07, + "loss": 0.5193, + "step": 1273 + }, + { + "epoch": 0.5748448956570784, + "grad_norm": 2.973062841681966, + "learning_rate": 8.075091152239229e-07, + "loss": 0.5088, + "step": 1274 + }, + { + "epoch": 0.5752961082910322, + "grad_norm": 3.4183675101644595, + "learning_rate": 8.060747746653179e-07, + "loss": 0.6613, + "step": 1275 + }, + { + "epoch": 0.5757473209249859, + "grad_norm": 3.5528615876910012, + "learning_rate": 8.046408485462136e-07, + "loss": 0.5776, + "step": 1276 + }, + { + "epoch": 0.5761985335589397, + "grad_norm": 3.332215928387168, + "learning_rate": 8.032073399310677e-07, + "loss": 0.4554, + "step": 1277 + }, + { + "epoch": 0.5766497461928934, + "grad_norm": 3.1567473951867178, + "learning_rate": 8.017742518834453e-07, + "loss": 0.5779, + "step": 1278 + }, + { + "epoch": 0.5771009588268472, + "grad_norm": 3.1547565169857723, + "learning_rate": 8.003415874660129e-07, + "loss": 0.6327, + "step": 1279 + }, + { + "epoch": 0.577552171460801, + "grad_norm": 2.9713115658035547, + "learning_rate": 7.989093497405322e-07, + "loss": 0.5887, + "step": 1280 + }, + { + "epoch": 0.5780033840947546, + "grad_norm": 3.2508596246355834, + "learning_rate": 7.974775417678517e-07, + "loss": 0.5491, + "step": 1281 + }, + { + "epoch": 0.5784545967287084, + "grad_norm": 3.2239593887990288, + "learning_rate": 7.960461666079029e-07, + "loss": 0.5808, + "step": 1282 + }, + { + "epoch": 0.5789058093626621, + "grad_norm": 3.3138402872343966, + "learning_rate": 7.946152273196911e-07, + "loss": 0.5569, + "step": 1283 + }, + { + "epoch": 0.5793570219966159, + "grad_norm": 3.434837325852875, + "learning_rate": 7.931847269612911e-07, + "loss": 0.5351, + "step": 1284 + }, + { + "epoch": 0.5798082346305696, + "grad_norm": 3.234624298819192, + "learning_rate": 7.917546685898391e-07, + "loss": 0.5941, + "step": 1285 + }, + { + "epoch": 0.5802594472645234, + "grad_norm": 3.441772930399985, + "learning_rate": 7.903250552615272e-07, + "loss": 0.6435, + "step": 1286 + }, + { + "epoch": 0.5807106598984771, + "grad_norm": 3.2690887674264895, + "learning_rate": 7.888958900315959e-07, + "loss": 0.5054, + "step": 1287 + }, + { + "epoch": 0.5811618725324309, + "grad_norm": 3.9833861316481256, + "learning_rate": 7.874671759543278e-07, + "loss": 0.599, + "step": 1288 + }, + { + "epoch": 0.5816130851663847, + "grad_norm": 2.868845567268956, + "learning_rate": 7.860389160830419e-07, + "loss": 0.4934, + "step": 1289 + }, + { + "epoch": 0.5820642978003384, + "grad_norm": 3.047832823905869, + "learning_rate": 7.846111134700867e-07, + "loss": 0.5081, + "step": 1290 + }, + { + "epoch": 0.5825155104342922, + "grad_norm": 3.275881282842554, + "learning_rate": 7.831837711668332e-07, + "loss": 0.5563, + "step": 1291 + }, + { + "epoch": 0.5829667230682459, + "grad_norm": 3.14320311170746, + "learning_rate": 7.817568922236681e-07, + "loss": 0.5681, + "step": 1292 + }, + { + "epoch": 0.5834179357021997, + "grad_norm": 3.524498053095352, + "learning_rate": 7.80330479689989e-07, + "loss": 0.6049, + "step": 1293 + }, + { + "epoch": 0.5838691483361534, + "grad_norm": 3.118060779938019, + "learning_rate": 7.789045366141961e-07, + "loss": 0.6507, + "step": 1294 + }, + { + "epoch": 0.5843203609701072, + "grad_norm": 3.3177816993811065, + "learning_rate": 7.774790660436857e-07, + "loss": 0.4724, + "step": 1295 + }, + { + "epoch": 0.584771573604061, + "grad_norm": 2.837955287125353, + "learning_rate": 7.760540710248454e-07, + "loss": 0.51, + "step": 1296 + }, + { + "epoch": 0.5852227862380147, + "grad_norm": 3.3668111960232117, + "learning_rate": 7.746295546030458e-07, + "loss": 0.637, + "step": 1297 + }, + { + "epoch": 0.5856739988719684, + "grad_norm": 3.103539045578858, + "learning_rate": 7.732055198226351e-07, + "loss": 0.4814, + "step": 1298 + }, + { + "epoch": 0.5861252115059221, + "grad_norm": 3.0421653752914413, + "learning_rate": 7.717819697269321e-07, + "loss": 0.525, + "step": 1299 + }, + { + "epoch": 0.5865764241398759, + "grad_norm": 3.644103387205874, + "learning_rate": 7.703589073582193e-07, + "loss": 0.6064, + "step": 1300 + }, + { + "epoch": 0.5870276367738296, + "grad_norm": 2.8055962606604092, + "learning_rate": 7.689363357577378e-07, + "loss": 0.5143, + "step": 1301 + }, + { + "epoch": 0.5874788494077834, + "grad_norm": 3.277739442754899, + "learning_rate": 7.675142579656788e-07, + "loss": 0.5523, + "step": 1302 + }, + { + "epoch": 0.5879300620417371, + "grad_norm": 3.460236702752055, + "learning_rate": 7.660926770211787e-07, + "loss": 0.5705, + "step": 1303 + }, + { + "epoch": 0.5883812746756909, + "grad_norm": 3.260180098933021, + "learning_rate": 7.646715959623125e-07, + "loss": 0.6103, + "step": 1304 + }, + { + "epoch": 0.5888324873096447, + "grad_norm": 3.4686183563952824, + "learning_rate": 7.632510178260859e-07, + "loss": 0.6584, + "step": 1305 + }, + { + "epoch": 0.5892836999435984, + "grad_norm": 3.2006762499940353, + "learning_rate": 7.618309456484308e-07, + "loss": 0.4899, + "step": 1306 + }, + { + "epoch": 0.5897349125775522, + "grad_norm": 2.9971855862973267, + "learning_rate": 7.604113824641973e-07, + "loss": 0.6366, + "step": 1307 + }, + { + "epoch": 0.5901861252115059, + "grad_norm": 3.1873994214172807, + "learning_rate": 7.589923313071479e-07, + "loss": 0.6843, + "step": 1308 + }, + { + "epoch": 0.5906373378454597, + "grad_norm": 3.2537367572253824, + "learning_rate": 7.575737952099505e-07, + "loss": 0.5228, + "step": 1309 + }, + { + "epoch": 0.5910885504794134, + "grad_norm": 3.079550130831521, + "learning_rate": 7.561557772041725e-07, + "loss": 0.4874, + "step": 1310 + }, + { + "epoch": 0.5915397631133672, + "grad_norm": 3.2716458004854543, + "learning_rate": 7.547382803202742e-07, + "loss": 0.5857, + "step": 1311 + }, + { + "epoch": 0.591990975747321, + "grad_norm": 3.1743175723654735, + "learning_rate": 7.533213075876022e-07, + "loss": 0.529, + "step": 1312 + }, + { + "epoch": 0.5924421883812747, + "grad_norm": 3.8188908521422458, + "learning_rate": 7.519048620343825e-07, + "loss": 0.6333, + "step": 1313 + }, + { + "epoch": 0.5928934010152285, + "grad_norm": 2.8402686230355156, + "learning_rate": 7.504889466877149e-07, + "loss": 0.5294, + "step": 1314 + }, + { + "epoch": 0.5933446136491822, + "grad_norm": 3.207555735354162, + "learning_rate": 7.490735645735666e-07, + "loss": 0.4968, + "step": 1315 + }, + { + "epoch": 0.5937958262831359, + "grad_norm": 3.0695972720324622, + "learning_rate": 7.476587187167635e-07, + "loss": 0.5465, + "step": 1316 + }, + { + "epoch": 0.5942470389170896, + "grad_norm": 2.9505661618967283, + "learning_rate": 7.462444121409875e-07, + "loss": 0.6166, + "step": 1317 + }, + { + "epoch": 0.5946982515510434, + "grad_norm": 3.2187802075234737, + "learning_rate": 7.448306478687663e-07, + "loss": 0.4585, + "step": 1318 + }, + { + "epoch": 0.5951494641849971, + "grad_norm": 3.1253329687327818, + "learning_rate": 7.434174289214696e-07, + "loss": 0.5872, + "step": 1319 + }, + { + "epoch": 0.5956006768189509, + "grad_norm": 2.945507566127143, + "learning_rate": 7.420047583193018e-07, + "loss": 0.5749, + "step": 1320 + }, + { + "epoch": 0.5960518894529047, + "grad_norm": 2.8046185112075506, + "learning_rate": 7.405926390812952e-07, + "loss": 0.6378, + "step": 1321 + }, + { + "epoch": 0.5965031020868584, + "grad_norm": 3.3367837174035246, + "learning_rate": 7.391810742253035e-07, + "loss": 0.5555, + "step": 1322 + }, + { + "epoch": 0.5969543147208122, + "grad_norm": 3.361270385049288, + "learning_rate": 7.377700667679952e-07, + "loss": 0.4942, + "step": 1323 + }, + { + "epoch": 0.5974055273547659, + "grad_norm": 3.198915242112322, + "learning_rate": 7.363596197248488e-07, + "loss": 0.4949, + "step": 1324 + }, + { + "epoch": 0.5978567399887197, + "grad_norm": 3.329606223960123, + "learning_rate": 7.349497361101442e-07, + "loss": 0.5877, + "step": 1325 + }, + { + "epoch": 0.5983079526226734, + "grad_norm": 3.519739136879892, + "learning_rate": 7.335404189369578e-07, + "loss": 0.645, + "step": 1326 + }, + { + "epoch": 0.5987591652566272, + "grad_norm": 3.0247206849836976, + "learning_rate": 7.321316712171551e-07, + "loss": 0.4796, + "step": 1327 + }, + { + "epoch": 0.599210377890581, + "grad_norm": 3.186177089807284, + "learning_rate": 7.307234959613842e-07, + "loss": 0.5669, + "step": 1328 + }, + { + "epoch": 0.5996615905245347, + "grad_norm": 3.130095831838903, + "learning_rate": 7.293158961790714e-07, + "loss": 0.5828, + "step": 1329 + }, + { + "epoch": 0.6001128031584885, + "grad_norm": 3.384495690869306, + "learning_rate": 7.279088748784105e-07, + "loss": 0.6257, + "step": 1330 + }, + { + "epoch": 0.6005640157924422, + "grad_norm": 3.3442845371887135, + "learning_rate": 7.265024350663615e-07, + "loss": 0.5929, + "step": 1331 + }, + { + "epoch": 0.601015228426396, + "grad_norm": 3.329300257806507, + "learning_rate": 7.250965797486404e-07, + "loss": 0.5622, + "step": 1332 + }, + { + "epoch": 0.6014664410603497, + "grad_norm": 3.137210816893963, + "learning_rate": 7.236913119297144e-07, + "loss": 0.4791, + "step": 1333 + }, + { + "epoch": 0.6019176536943034, + "grad_norm": 3.2475644991308297, + "learning_rate": 7.222866346127952e-07, + "loss": 0.5562, + "step": 1334 + }, + { + "epoch": 0.6023688663282571, + "grad_norm": 2.964087669113336, + "learning_rate": 7.208825507998325e-07, + "loss": 0.5684, + "step": 1335 + }, + { + "epoch": 0.6028200789622109, + "grad_norm": 3.41721759180177, + "learning_rate": 7.194790634915075e-07, + "loss": 0.5463, + "step": 1336 + }, + { + "epoch": 0.6032712915961647, + "grad_norm": 2.8453268851912923, + "learning_rate": 7.180761756872267e-07, + "loss": 0.5151, + "step": 1337 + }, + { + "epoch": 0.6037225042301184, + "grad_norm": 3.2646935195604483, + "learning_rate": 7.166738903851153e-07, + "loss": 0.4982, + "step": 1338 + }, + { + "epoch": 0.6041737168640722, + "grad_norm": 2.970220890446487, + "learning_rate": 7.152722105820112e-07, + "loss": 0.5154, + "step": 1339 + }, + { + "epoch": 0.6046249294980259, + "grad_norm": 3.4187797390981114, + "learning_rate": 7.138711392734578e-07, + "loss": 0.4689, + "step": 1340 + }, + { + "epoch": 0.6050761421319797, + "grad_norm": 2.985186967050454, + "learning_rate": 7.124706794536983e-07, + "loss": 0.5244, + "step": 1341 + }, + { + "epoch": 0.6055273547659334, + "grad_norm": 3.455866470082041, + "learning_rate": 7.110708341156698e-07, + "loss": 0.5728, + "step": 1342 + }, + { + "epoch": 0.6059785673998872, + "grad_norm": 3.6106718461697977, + "learning_rate": 7.096716062509947e-07, + "loss": 0.5279, + "step": 1343 + }, + { + "epoch": 0.606429780033841, + "grad_norm": 4.105211530606187, + "learning_rate": 7.082729988499768e-07, + "loss": 0.6094, + "step": 1344 + }, + { + "epoch": 0.6068809926677947, + "grad_norm": 2.693788412848332, + "learning_rate": 7.068750149015936e-07, + "loss": 0.4555, + "step": 1345 + }, + { + "epoch": 0.6073322053017485, + "grad_norm": 3.0065637977545676, + "learning_rate": 7.054776573934905e-07, + "loss": 0.565, + "step": 1346 + }, + { + "epoch": 0.6077834179357022, + "grad_norm": 3.289187175236741, + "learning_rate": 7.04080929311974e-07, + "loss": 0.5429, + "step": 1347 + }, + { + "epoch": 0.608234630569656, + "grad_norm": 3.7046080997067903, + "learning_rate": 7.026848336420052e-07, + "loss": 0.5946, + "step": 1348 + }, + { + "epoch": 0.6086858432036097, + "grad_norm": 2.9252591310820586, + "learning_rate": 7.012893733671943e-07, + "loss": 0.4993, + "step": 1349 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 2.8882168795713894, + "learning_rate": 6.998945514697923e-07, + "loss": 0.4905, + "step": 1350 + }, + { + "epoch": 0.6095882684715171, + "grad_norm": 3.583128007061669, + "learning_rate": 6.985003709306871e-07, + "loss": 0.4726, + "step": 1351 + }, + { + "epoch": 0.6100394811054709, + "grad_norm": 3.130137883692283, + "learning_rate": 6.97106834729396e-07, + "loss": 0.4106, + "step": 1352 + }, + { + "epoch": 0.6104906937394247, + "grad_norm": 3.0703439274804682, + "learning_rate": 6.957139458440584e-07, + "loss": 0.5424, + "step": 1353 + }, + { + "epoch": 0.6109419063733784, + "grad_norm": 3.0314143588988496, + "learning_rate": 6.943217072514311e-07, + "loss": 0.6235, + "step": 1354 + }, + { + "epoch": 0.6113931190073322, + "grad_norm": 3.495368048533246, + "learning_rate": 6.929301219268805e-07, + "loss": 0.664, + "step": 1355 + }, + { + "epoch": 0.6118443316412859, + "grad_norm": 3.2783156409995318, + "learning_rate": 6.915391928443779e-07, + "loss": 0.5994, + "step": 1356 + }, + { + "epoch": 0.6122955442752397, + "grad_norm": 3.1900987577022075, + "learning_rate": 6.90148922976491e-07, + "loss": 0.473, + "step": 1357 + }, + { + "epoch": 0.6127467569091934, + "grad_norm": 3.542481502196449, + "learning_rate": 6.887593152943789e-07, + "loss": 0.6318, + "step": 1358 + }, + { + "epoch": 0.6131979695431472, + "grad_norm": 3.0512134064626077, + "learning_rate": 6.873703727677862e-07, + "loss": 0.4577, + "step": 1359 + }, + { + "epoch": 0.613649182177101, + "grad_norm": 3.0267553714028814, + "learning_rate": 6.859820983650355e-07, + "loss": 0.5358, + "step": 1360 + }, + { + "epoch": 0.6141003948110547, + "grad_norm": 3.320872889591731, + "learning_rate": 6.845944950530218e-07, + "loss": 0.5442, + "step": 1361 + }, + { + "epoch": 0.6145516074450085, + "grad_norm": 3.2701182397565347, + "learning_rate": 6.832075657972054e-07, + "loss": 0.5837, + "step": 1362 + }, + { + "epoch": 0.6150028200789622, + "grad_norm": 3.3678828828961533, + "learning_rate": 6.818213135616071e-07, + "loss": 0.663, + "step": 1363 + }, + { + "epoch": 0.615454032712916, + "grad_norm": 3.1223684191824854, + "learning_rate": 6.804357413087992e-07, + "loss": 0.529, + "step": 1364 + }, + { + "epoch": 0.6159052453468697, + "grad_norm": 3.686360531224555, + "learning_rate": 6.790508519999023e-07, + "loss": 0.6493, + "step": 1365 + }, + { + "epoch": 0.6163564579808235, + "grad_norm": 2.9271029540244005, + "learning_rate": 6.776666485945769e-07, + "loss": 0.5776, + "step": 1366 + }, + { + "epoch": 0.6168076706147773, + "grad_norm": 3.4729445853273737, + "learning_rate": 6.762831340510174e-07, + "loss": 0.512, + "step": 1367 + }, + { + "epoch": 0.617258883248731, + "grad_norm": 3.14390720715517, + "learning_rate": 6.749003113259466e-07, + "loss": 0.5518, + "step": 1368 + }, + { + "epoch": 0.6177100958826847, + "grad_norm": 3.138843726117474, + "learning_rate": 6.735181833746087e-07, + "loss": 0.6031, + "step": 1369 + }, + { + "epoch": 0.6181613085166384, + "grad_norm": 3.272624979852151, + "learning_rate": 6.721367531507626e-07, + "loss": 0.5369, + "step": 1370 + }, + { + "epoch": 0.6186125211505922, + "grad_norm": 3.562938902127868, + "learning_rate": 6.70756023606676e-07, + "loss": 0.6634, + "step": 1371 + }, + { + "epoch": 0.6190637337845459, + "grad_norm": 2.937710815940084, + "learning_rate": 6.6937599769312e-07, + "loss": 0.6368, + "step": 1372 + }, + { + "epoch": 0.6195149464184997, + "grad_norm": 3.2281736301884596, + "learning_rate": 6.679966783593615e-07, + "loss": 0.5482, + "step": 1373 + }, + { + "epoch": 0.6199661590524534, + "grad_norm": 3.46989691765386, + "learning_rate": 6.666180685531575e-07, + "loss": 0.5121, + "step": 1374 + }, + { + "epoch": 0.6204173716864072, + "grad_norm": 2.9512285549656525, + "learning_rate": 6.65240171220748e-07, + "loss": 0.6428, + "step": 1375 + }, + { + "epoch": 0.620868584320361, + "grad_norm": 3.5246959007009044, + "learning_rate": 6.638629893068515e-07, + "loss": 0.5731, + "step": 1376 + }, + { + "epoch": 0.6213197969543147, + "grad_norm": 3.0891012405500304, + "learning_rate": 6.62486525754657e-07, + "loss": 0.4931, + "step": 1377 + }, + { + "epoch": 0.6217710095882685, + "grad_norm": 2.9732550708765233, + "learning_rate": 6.611107835058174e-07, + "loss": 0.4172, + "step": 1378 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.933874068194838, + "learning_rate": 6.59735765500446e-07, + "loss": 0.5248, + "step": 1379 + }, + { + "epoch": 0.622673434856176, + "grad_norm": 3.1023005159339165, + "learning_rate": 6.583614746771064e-07, + "loss": 0.5489, + "step": 1380 + }, + { + "epoch": 0.6231246474901297, + "grad_norm": 2.9712725239339477, + "learning_rate": 6.569879139728096e-07, + "loss": 0.5301, + "step": 1381 + }, + { + "epoch": 0.6235758601240835, + "grad_norm": 3.1648033323683045, + "learning_rate": 6.556150863230054e-07, + "loss": 0.5682, + "step": 1382 + }, + { + "epoch": 0.6240270727580373, + "grad_norm": 2.7399149382162564, + "learning_rate": 6.542429946615773e-07, + "loss": 0.4796, + "step": 1383 + }, + { + "epoch": 0.624478285391991, + "grad_norm": 2.8260892433984073, + "learning_rate": 6.528716419208361e-07, + "loss": 0.5453, + "step": 1384 + }, + { + "epoch": 0.6249294980259448, + "grad_norm": 3.145384660363434, + "learning_rate": 6.515010310315125e-07, + "loss": 0.638, + "step": 1385 + }, + { + "epoch": 0.6253807106598985, + "grad_norm": 3.0447519086357846, + "learning_rate": 6.50131164922753e-07, + "loss": 0.5322, + "step": 1386 + }, + { + "epoch": 0.6258319232938522, + "grad_norm": 3.4448461436407594, + "learning_rate": 6.487620465221117e-07, + "loss": 0.5624, + "step": 1387 + }, + { + "epoch": 0.6262831359278059, + "grad_norm": 3.0855788626135023, + "learning_rate": 6.473936787555452e-07, + "loss": 0.4002, + "step": 1388 + }, + { + "epoch": 0.6267343485617597, + "grad_norm": 2.943659166164125, + "learning_rate": 6.460260645474052e-07, + "loss": 0.4525, + "step": 1389 + }, + { + "epoch": 0.6271855611957134, + "grad_norm": 2.9945684467434135, + "learning_rate": 6.44659206820434e-07, + "loss": 0.4683, + "step": 1390 + }, + { + "epoch": 0.6276367738296672, + "grad_norm": 2.929306663688666, + "learning_rate": 6.432931084957566e-07, + "loss": 0.5541, + "step": 1391 + }, + { + "epoch": 0.628087986463621, + "grad_norm": 3.0908644677993817, + "learning_rate": 6.419277724928747e-07, + "loss": 0.5163, + "step": 1392 + }, + { + "epoch": 0.6285391990975747, + "grad_norm": 3.001491521797685, + "learning_rate": 6.405632017296614e-07, + "loss": 0.6203, + "step": 1393 + }, + { + "epoch": 0.6289904117315285, + "grad_norm": 3.1474986627885895, + "learning_rate": 6.391993991223543e-07, + "loss": 0.544, + "step": 1394 + }, + { + "epoch": 0.6294416243654822, + "grad_norm": 3.2290177386909966, + "learning_rate": 6.378363675855494e-07, + "loss": 0.5752, + "step": 1395 + }, + { + "epoch": 0.629892836999436, + "grad_norm": 3.222879746589294, + "learning_rate": 6.364741100321947e-07, + "loss": 0.4536, + "step": 1396 + }, + { + "epoch": 0.6303440496333897, + "grad_norm": 3.0349129642474755, + "learning_rate": 6.351126293735842e-07, + "loss": 0.4456, + "step": 1397 + }, + { + "epoch": 0.6307952622673435, + "grad_norm": 3.218838258944328, + "learning_rate": 6.33751928519352e-07, + "loss": 0.5432, + "step": 1398 + }, + { + "epoch": 0.6312464749012973, + "grad_norm": 3.2926190596476284, + "learning_rate": 6.323920103774644e-07, + "loss": 0.5973, + "step": 1399 + }, + { + "epoch": 0.631697687535251, + "grad_norm": 3.3559310121476775, + "learning_rate": 6.310328778542162e-07, + "loss": 0.5248, + "step": 1400 + }, + { + "epoch": 0.6321489001692048, + "grad_norm": 3.6153857414340793, + "learning_rate": 6.296745338542229e-07, + "loss": 0.6443, + "step": 1401 + }, + { + "epoch": 0.6326001128031585, + "grad_norm": 3.1304394163996623, + "learning_rate": 6.283169812804146e-07, + "loss": 0.5722, + "step": 1402 + }, + { + "epoch": 0.6330513254371123, + "grad_norm": 3.0269630181138396, + "learning_rate": 6.269602230340304e-07, + "loss": 0.5002, + "step": 1403 + }, + { + "epoch": 0.6335025380710659, + "grad_norm": 2.901434555273011, + "learning_rate": 6.256042620146118e-07, + "loss": 0.4313, + "step": 1404 + }, + { + "epoch": 0.6339537507050197, + "grad_norm": 2.904472954933817, + "learning_rate": 6.242491011199963e-07, + "loss": 0.5316, + "step": 1405 + }, + { + "epoch": 0.6344049633389734, + "grad_norm": 3.569318854020443, + "learning_rate": 6.228947432463111e-07, + "loss": 0.6989, + "step": 1406 + }, + { + "epoch": 0.6348561759729272, + "grad_norm": 3.70336661181829, + "learning_rate": 6.21541191287968e-07, + "loss": 0.4956, + "step": 1407 + }, + { + "epoch": 0.635307388606881, + "grad_norm": 2.835895432940531, + "learning_rate": 6.201884481376562e-07, + "loss": 0.5343, + "step": 1408 + }, + { + "epoch": 0.6357586012408347, + "grad_norm": 3.154332164144894, + "learning_rate": 6.188365166863365e-07, + "loss": 0.4946, + "step": 1409 + }, + { + "epoch": 0.6362098138747885, + "grad_norm": 2.999966045677623, + "learning_rate": 6.174853998232346e-07, + "loss": 0.4792, + "step": 1410 + }, + { + "epoch": 0.6366610265087422, + "grad_norm": 2.823054257582786, + "learning_rate": 6.161351004358359e-07, + "loss": 0.4774, + "step": 1411 + }, + { + "epoch": 0.637112239142696, + "grad_norm": 3.1087901532322473, + "learning_rate": 6.14785621409878e-07, + "loss": 0.5926, + "step": 1412 + }, + { + "epoch": 0.6375634517766497, + "grad_norm": 3.19373907556398, + "learning_rate": 6.13436965629346e-07, + "loss": 0.4821, + "step": 1413 + }, + { + "epoch": 0.6380146644106035, + "grad_norm": 3.7849347226642136, + "learning_rate": 6.120891359764655e-07, + "loss": 0.5625, + "step": 1414 + }, + { + "epoch": 0.6384658770445573, + "grad_norm": 3.214785840415643, + "learning_rate": 6.107421353316964e-07, + "loss": 0.5056, + "step": 1415 + }, + { + "epoch": 0.638917089678511, + "grad_norm": 3.2716930771356387, + "learning_rate": 6.093959665737267e-07, + "loss": 0.5974, + "step": 1416 + }, + { + "epoch": 0.6393683023124648, + "grad_norm": 2.657202103063792, + "learning_rate": 6.080506325794674e-07, + "loss": 0.458, + "step": 1417 + }, + { + "epoch": 0.6398195149464185, + "grad_norm": 3.4613963613287284, + "learning_rate": 6.067061362240448e-07, + "loss": 0.5536, + "step": 1418 + }, + { + "epoch": 0.6402707275803723, + "grad_norm": 3.1871447082433213, + "learning_rate": 6.053624803807951e-07, + "loss": 0.5803, + "step": 1419 + }, + { + "epoch": 0.640721940214326, + "grad_norm": 3.1180639042969274, + "learning_rate": 6.040196679212582e-07, + "loss": 0.4619, + "step": 1420 + }, + { + "epoch": 0.6411731528482798, + "grad_norm": 3.179821687875411, + "learning_rate": 6.026777017151718e-07, + "loss": 0.584, + "step": 1421 + }, + { + "epoch": 0.6416243654822334, + "grad_norm": 3.917598055933022, + "learning_rate": 6.013365846304657e-07, + "loss": 0.4427, + "step": 1422 + }, + { + "epoch": 0.6420755781161872, + "grad_norm": 3.259154452067742, + "learning_rate": 5.999963195332536e-07, + "loss": 0.5559, + "step": 1423 + }, + { + "epoch": 0.642526790750141, + "grad_norm": 3.4742010090894806, + "learning_rate": 5.986569092878296e-07, + "loss": 0.5994, + "step": 1424 + }, + { + "epoch": 0.6429780033840947, + "grad_norm": 3.753798551895116, + "learning_rate": 5.973183567566604e-07, + "loss": 0.6971, + "step": 1425 + }, + { + "epoch": 0.6434292160180485, + "grad_norm": 2.941155690931197, + "learning_rate": 5.959806648003796e-07, + "loss": 0.516, + "step": 1426 + }, + { + "epoch": 0.6438804286520022, + "grad_norm": 2.9533012729371326, + "learning_rate": 5.946438362777819e-07, + "loss": 0.6212, + "step": 1427 + }, + { + "epoch": 0.644331641285956, + "grad_norm": 3.3329411483235964, + "learning_rate": 5.933078740458166e-07, + "loss": 0.6503, + "step": 1428 + }, + { + "epoch": 0.6447828539199097, + "grad_norm": 3.498152521685046, + "learning_rate": 5.919727809595815e-07, + "loss": 0.5699, + "step": 1429 + }, + { + "epoch": 0.6452340665538635, + "grad_norm": 3.638822037194711, + "learning_rate": 5.906385598723178e-07, + "loss": 0.5854, + "step": 1430 + }, + { + "epoch": 0.6456852791878173, + "grad_norm": 3.381793813968072, + "learning_rate": 5.893052136354017e-07, + "loss": 0.4964, + "step": 1431 + }, + { + "epoch": 0.646136491821771, + "grad_norm": 3.3009296858703268, + "learning_rate": 5.879727450983412e-07, + "loss": 0.593, + "step": 1432 + }, + { + "epoch": 0.6465877044557248, + "grad_norm": 3.550794325305599, + "learning_rate": 5.866411571087671e-07, + "loss": 0.5447, + "step": 1433 + }, + { + "epoch": 0.6470389170896785, + "grad_norm": 3.3261654579811992, + "learning_rate": 5.853104525124297e-07, + "loss": 0.5494, + "step": 1434 + }, + { + "epoch": 0.6474901297236323, + "grad_norm": 3.428245584517914, + "learning_rate": 5.839806341531908e-07, + "loss": 0.6307, + "step": 1435 + }, + { + "epoch": 0.647941342357586, + "grad_norm": 3.0629393848960365, + "learning_rate": 5.82651704873018e-07, + "loss": 0.5884, + "step": 1436 + }, + { + "epoch": 0.6483925549915398, + "grad_norm": 2.948994135334962, + "learning_rate": 5.813236675119793e-07, + "loss": 0.4318, + "step": 1437 + }, + { + "epoch": 0.6488437676254936, + "grad_norm": 3.760438297720544, + "learning_rate": 5.79996524908236e-07, + "loss": 0.5324, + "step": 1438 + }, + { + "epoch": 0.6492949802594473, + "grad_norm": 3.49087174910647, + "learning_rate": 5.786702798980388e-07, + "loss": 0.5048, + "step": 1439 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 2.9974874928675614, + "learning_rate": 5.773449353157171e-07, + "loss": 0.5413, + "step": 1440 + }, + { + "epoch": 0.6501974055273547, + "grad_norm": 3.777448022885416, + "learning_rate": 5.76020493993679e-07, + "loss": 0.5136, + "step": 1441 + }, + { + "epoch": 0.6506486181613085, + "grad_norm": 3.210445403802698, + "learning_rate": 5.74696958762401e-07, + "loss": 0.5167, + "step": 1442 + }, + { + "epoch": 0.6510998307952622, + "grad_norm": 3.336796933709652, + "learning_rate": 5.733743324504224e-07, + "loss": 0.5119, + "step": 1443 + }, + { + "epoch": 0.651551043429216, + "grad_norm": 3.3282911026676176, + "learning_rate": 5.720526178843418e-07, + "loss": 0.5884, + "step": 1444 + }, + { + "epoch": 0.6520022560631697, + "grad_norm": 3.21378303870658, + "learning_rate": 5.707318178888082e-07, + "loss": 0.538, + "step": 1445 + }, + { + "epoch": 0.6524534686971235, + "grad_norm": 2.969064914210708, + "learning_rate": 5.694119352865159e-07, + "loss": 0.5845, + "step": 1446 + }, + { + "epoch": 0.6529046813310773, + "grad_norm": 3.30695390232686, + "learning_rate": 5.680929728981991e-07, + "loss": 0.5142, + "step": 1447 + }, + { + "epoch": 0.653355893965031, + "grad_norm": 3.0732966058972604, + "learning_rate": 5.667749335426246e-07, + "loss": 0.4201, + "step": 1448 + }, + { + "epoch": 0.6538071065989848, + "grad_norm": 3.254645914821211, + "learning_rate": 5.654578200365885e-07, + "loss": 0.6627, + "step": 1449 + }, + { + "epoch": 0.6542583192329385, + "grad_norm": 2.9760909272929683, + "learning_rate": 5.641416351949062e-07, + "loss": 0.4217, + "step": 1450 + }, + { + "epoch": 0.6547095318668923, + "grad_norm": 2.997343851542973, + "learning_rate": 5.628263818304091e-07, + "loss": 0.5031, + "step": 1451 + }, + { + "epoch": 0.655160744500846, + "grad_norm": 3.0077707371721862, + "learning_rate": 5.615120627539387e-07, + "loss": 0.4886, + "step": 1452 + }, + { + "epoch": 0.6556119571347998, + "grad_norm": 3.5304208905303915, + "learning_rate": 5.601986807743387e-07, + "loss": 0.5286, + "step": 1453 + }, + { + "epoch": 0.6560631697687536, + "grad_norm": 3.6248543822772463, + "learning_rate": 5.588862386984509e-07, + "loss": 0.6958, + "step": 1454 + }, + { + "epoch": 0.6565143824027073, + "grad_norm": 3.153359939961065, + "learning_rate": 5.575747393311078e-07, + "loss": 0.5553, + "step": 1455 + }, + { + "epoch": 0.6569655950366611, + "grad_norm": 3.0726315593606093, + "learning_rate": 5.562641854751274e-07, + "loss": 0.4999, + "step": 1456 + }, + { + "epoch": 0.6574168076706147, + "grad_norm": 2.6907674515493873, + "learning_rate": 5.54954579931308e-07, + "loss": 0.4409, + "step": 1457 + }, + { + "epoch": 0.6578680203045685, + "grad_norm": 2.976942150158183, + "learning_rate": 5.536459254984194e-07, + "loss": 0.5027, + "step": 1458 + }, + { + "epoch": 0.6583192329385222, + "grad_norm": 4.099210213112422, + "learning_rate": 5.523382249732009e-07, + "loss": 0.5853, + "step": 1459 + }, + { + "epoch": 0.658770445572476, + "grad_norm": 3.5743013453464427, + "learning_rate": 5.510314811503519e-07, + "loss": 0.6072, + "step": 1460 + }, + { + "epoch": 0.6592216582064298, + "grad_norm": 3.0791361228178205, + "learning_rate": 5.497256968225263e-07, + "loss": 0.4773, + "step": 1461 + }, + { + "epoch": 0.6596728708403835, + "grad_norm": 3.127066793240839, + "learning_rate": 5.484208747803301e-07, + "loss": 0.6281, + "step": 1462 + }, + { + "epoch": 0.6601240834743373, + "grad_norm": 3.1864970517695275, + "learning_rate": 5.4711701781231e-07, + "loss": 0.5335, + "step": 1463 + }, + { + "epoch": 0.660575296108291, + "grad_norm": 2.7910338972873676, + "learning_rate": 5.458141287049525e-07, + "loss": 0.4367, + "step": 1464 + }, + { + "epoch": 0.6610265087422448, + "grad_norm": 3.112978170764596, + "learning_rate": 5.445122102426745e-07, + "loss": 0.4787, + "step": 1465 + }, + { + "epoch": 0.6614777213761985, + "grad_norm": 3.2428968725230436, + "learning_rate": 5.432112652078179e-07, + "loss": 0.4979, + "step": 1466 + }, + { + "epoch": 0.6619289340101523, + "grad_norm": 3.64761923117337, + "learning_rate": 5.419112963806467e-07, + "loss": 0.6766, + "step": 1467 + }, + { + "epoch": 0.662380146644106, + "grad_norm": 3.1374440807407966, + "learning_rate": 5.406123065393351e-07, + "loss": 0.585, + "step": 1468 + }, + { + "epoch": 0.6628313592780598, + "grad_norm": 3.0743306845398872, + "learning_rate": 5.393142984599684e-07, + "loss": 0.5432, + "step": 1469 + }, + { + "epoch": 0.6632825719120136, + "grad_norm": 3.0220167684458, + "learning_rate": 5.380172749165321e-07, + "loss": 0.6321, + "step": 1470 + }, + { + "epoch": 0.6637337845459673, + "grad_norm": 3.2506207588342115, + "learning_rate": 5.367212386809073e-07, + "loss": 0.4709, + "step": 1471 + }, + { + "epoch": 0.6641849971799211, + "grad_norm": 2.7879147797631463, + "learning_rate": 5.354261925228666e-07, + "loss": 0.4185, + "step": 1472 + }, + { + "epoch": 0.6646362098138748, + "grad_norm": 3.372496137176609, + "learning_rate": 5.341321392100655e-07, + "loss": 0.5043, + "step": 1473 + }, + { + "epoch": 0.6650874224478286, + "grad_norm": 3.3089075981861913, + "learning_rate": 5.32839081508038e-07, + "loss": 0.5101, + "step": 1474 + }, + { + "epoch": 0.6655386350817822, + "grad_norm": 3.141227268377863, + "learning_rate": 5.315470221801905e-07, + "loss": 0.5693, + "step": 1475 + }, + { + "epoch": 0.665989847715736, + "grad_norm": 3.200943952541811, + "learning_rate": 5.302559639877952e-07, + "loss": 0.5679, + "step": 1476 + }, + { + "epoch": 0.6664410603496898, + "grad_norm": 3.264422897311328, + "learning_rate": 5.289659096899859e-07, + "loss": 0.5888, + "step": 1477 + }, + { + "epoch": 0.6668922729836435, + "grad_norm": 3.2617925134864456, + "learning_rate": 5.2767686204375e-07, + "loss": 0.618, + "step": 1478 + }, + { + "epoch": 0.6673434856175973, + "grad_norm": 3.103687607569448, + "learning_rate": 5.263888238039234e-07, + "loss": 0.4523, + "step": 1479 + }, + { + "epoch": 0.667794698251551, + "grad_norm": 3.288349444384094, + "learning_rate": 5.251017977231862e-07, + "loss": 0.5756, + "step": 1480 + }, + { + "epoch": 0.6682459108855048, + "grad_norm": 3.3422954628890325, + "learning_rate": 5.238157865520538e-07, + "loss": 0.5607, + "step": 1481 + }, + { + "epoch": 0.6686971235194585, + "grad_norm": 2.9514424726562414, + "learning_rate": 5.225307930388736e-07, + "loss": 0.5046, + "step": 1482 + }, + { + "epoch": 0.6691483361534123, + "grad_norm": 3.381792256243614, + "learning_rate": 5.212468199298177e-07, + "loss": 0.5893, + "step": 1483 + }, + { + "epoch": 0.669599548787366, + "grad_norm": 2.974690993406971, + "learning_rate": 5.199638699688771e-07, + "loss": 0.6419, + "step": 1484 + }, + { + "epoch": 0.6700507614213198, + "grad_norm": 3.323658905812843, + "learning_rate": 5.186819458978577e-07, + "loss": 0.575, + "step": 1485 + }, + { + "epoch": 0.6705019740552736, + "grad_norm": 3.1513857698424426, + "learning_rate": 5.174010504563715e-07, + "loss": 0.4734, + "step": 1486 + }, + { + "epoch": 0.6709531866892273, + "grad_norm": 2.8289877159247427, + "learning_rate": 5.161211863818328e-07, + "loss": 0.5453, + "step": 1487 + }, + { + "epoch": 0.6714043993231811, + "grad_norm": 3.288966427725167, + "learning_rate": 5.148423564094516e-07, + "loss": 0.5392, + "step": 1488 + }, + { + "epoch": 0.6718556119571348, + "grad_norm": 3.224946055142921, + "learning_rate": 5.135645632722276e-07, + "loss": 0.5281, + "step": 1489 + }, + { + "epoch": 0.6723068245910886, + "grad_norm": 3.5118397978024625, + "learning_rate": 5.122878097009459e-07, + "loss": 0.4893, + "step": 1490 + }, + { + "epoch": 0.6727580372250423, + "grad_norm": 3.288392437565718, + "learning_rate": 5.110120984241686e-07, + "loss": 0.528, + "step": 1491 + }, + { + "epoch": 0.6732092498589961, + "grad_norm": 3.3121054039299254, + "learning_rate": 5.097374321682303e-07, + "loss": 0.5505, + "step": 1492 + }, + { + "epoch": 0.6736604624929498, + "grad_norm": 3.1922673429233317, + "learning_rate": 5.084638136572337e-07, + "loss": 0.5267, + "step": 1493 + }, + { + "epoch": 0.6741116751269035, + "grad_norm": 3.1998010986229546, + "learning_rate": 5.071912456130409e-07, + "loss": 0.6137, + "step": 1494 + }, + { + "epoch": 0.6745628877608573, + "grad_norm": 2.9001454970418448, + "learning_rate": 5.059197307552697e-07, + "loss": 0.5806, + "step": 1495 + }, + { + "epoch": 0.675014100394811, + "grad_norm": 3.381015585223304, + "learning_rate": 5.046492718012869e-07, + "loss": 0.5509, + "step": 1496 + }, + { + "epoch": 0.6754653130287648, + "grad_norm": 3.518395239900884, + "learning_rate": 5.033798714662023e-07, + "loss": 0.5119, + "step": 1497 + }, + { + "epoch": 0.6759165256627185, + "grad_norm": 2.9919505064871705, + "learning_rate": 5.021115324628647e-07, + "loss": 0.5477, + "step": 1498 + }, + { + "epoch": 0.6763677382966723, + "grad_norm": 3.1660202084920095, + "learning_rate": 5.008442575018533e-07, + "loss": 0.4414, + "step": 1499 + }, + { + "epoch": 0.676818950930626, + "grad_norm": 3.3320708896626527, + "learning_rate": 4.995780492914736e-07, + "loss": 0.6299, + "step": 1500 + }, + { + "epoch": 0.6772701635645798, + "grad_norm": 2.872291861282731, + "learning_rate": 4.983129105377527e-07, + "loss": 0.5124, + "step": 1501 + }, + { + "epoch": 0.6777213761985336, + "grad_norm": 3.1930837083504997, + "learning_rate": 4.970488439444296e-07, + "loss": 0.576, + "step": 1502 + }, + { + "epoch": 0.6781725888324873, + "grad_norm": 3.172575535409837, + "learning_rate": 4.957858522129544e-07, + "loss": 0.5872, + "step": 1503 + }, + { + "epoch": 0.6786238014664411, + "grad_norm": 3.0896523132819573, + "learning_rate": 4.945239380424786e-07, + "loss": 0.5427, + "step": 1504 + }, + { + "epoch": 0.6790750141003948, + "grad_norm": 2.8752434836894243, + "learning_rate": 4.932631041298513e-07, + "loss": 0.505, + "step": 1505 + }, + { + "epoch": 0.6795262267343486, + "grad_norm": 2.973393299716313, + "learning_rate": 4.920033531696136e-07, + "loss": 0.5431, + "step": 1506 + }, + { + "epoch": 0.6799774393683023, + "grad_norm": 2.9302261733817225, + "learning_rate": 4.907446878539912e-07, + "loss": 0.5481, + "step": 1507 + }, + { + "epoch": 0.6804286520022561, + "grad_norm": 2.6511702770482266, + "learning_rate": 4.894871108728903e-07, + "loss": 0.4476, + "step": 1508 + }, + { + "epoch": 0.6808798646362099, + "grad_norm": 3.741059117355058, + "learning_rate": 4.882306249138909e-07, + "loss": 0.5295, + "step": 1509 + }, + { + "epoch": 0.6813310772701635, + "grad_norm": 3.1716709800903184, + "learning_rate": 4.86975232662241e-07, + "loss": 0.6054, + "step": 1510 + }, + { + "epoch": 0.6817822899041173, + "grad_norm": 2.6341030249717026, + "learning_rate": 4.857209368008524e-07, + "loss": 0.5144, + "step": 1511 + }, + { + "epoch": 0.682233502538071, + "grad_norm": 3.422325855185031, + "learning_rate": 4.844677400102929e-07, + "loss": 0.538, + "step": 1512 + }, + { + "epoch": 0.6826847151720248, + "grad_norm": 3.1391602045967444, + "learning_rate": 4.832156449687811e-07, + "loss": 0.5469, + "step": 1513 + }, + { + "epoch": 0.6831359278059785, + "grad_norm": 3.7792088207002283, + "learning_rate": 4.819646543521824e-07, + "loss": 0.579, + "step": 1514 + }, + { + "epoch": 0.6835871404399323, + "grad_norm": 3.639662253926238, + "learning_rate": 4.807147708340009e-07, + "loss": 0.5355, + "step": 1515 + }, + { + "epoch": 0.684038353073886, + "grad_norm": 3.0964113537901916, + "learning_rate": 4.794659970853748e-07, + "loss": 0.5282, + "step": 1516 + }, + { + "epoch": 0.6844895657078398, + "grad_norm": 3.3232471100038894, + "learning_rate": 4.782183357750707e-07, + "loss": 0.6801, + "step": 1517 + }, + { + "epoch": 0.6849407783417936, + "grad_norm": 3.3328995149094305, + "learning_rate": 4.769717895694774e-07, + "loss": 0.5651, + "step": 1518 + }, + { + "epoch": 0.6853919909757473, + "grad_norm": 2.9076854833073575, + "learning_rate": 4.7572636113260176e-07, + "loss": 0.6316, + "step": 1519 + }, + { + "epoch": 0.6858432036097011, + "grad_norm": 3.5121904313302874, + "learning_rate": 4.744820531260608e-07, + "loss": 0.4922, + "step": 1520 + }, + { + "epoch": 0.6862944162436548, + "grad_norm": 2.946462024735822, + "learning_rate": 4.732388682090771e-07, + "loss": 0.4305, + "step": 1521 + }, + { + "epoch": 0.6867456288776086, + "grad_norm": 3.3183567408446493, + "learning_rate": 4.7199680903847426e-07, + "loss": 0.6194, + "step": 1522 + }, + { + "epoch": 0.6871968415115624, + "grad_norm": 2.970061784609735, + "learning_rate": 4.7075587826866767e-07, + "loss": 0.5048, + "step": 1523 + }, + { + "epoch": 0.6876480541455161, + "grad_norm": 3.3231922435824757, + "learning_rate": 4.695160785516639e-07, + "loss": 0.6005, + "step": 1524 + }, + { + "epoch": 0.6880992667794699, + "grad_norm": 3.248780126563559, + "learning_rate": 4.6827741253705054e-07, + "loss": 0.5268, + "step": 1525 + }, + { + "epoch": 0.6885504794134236, + "grad_norm": 2.9816778422414076, + "learning_rate": 4.670398828719926e-07, + "loss": 0.5717, + "step": 1526 + }, + { + "epoch": 0.6890016920473774, + "grad_norm": 3.2874556360576883, + "learning_rate": 4.658034922012275e-07, + "loss": 0.5634, + "step": 1527 + }, + { + "epoch": 0.689452904681331, + "grad_norm": 3.2116212084250373, + "learning_rate": 4.6456824316705725e-07, + "loss": 0.5602, + "step": 1528 + }, + { + "epoch": 0.6899041173152848, + "grad_norm": 3.4186841183082364, + "learning_rate": 4.6333413840934575e-07, + "loss": 0.6244, + "step": 1529 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 3.1280202580241614, + "learning_rate": 4.621011805655093e-07, + "loss": 0.4692, + "step": 1530 + }, + { + "epoch": 0.6908065425831923, + "grad_norm": 2.973990284802939, + "learning_rate": 4.60869372270514e-07, + "loss": 0.5565, + "step": 1531 + }, + { + "epoch": 0.691257755217146, + "grad_norm": 3.7131828396505706, + "learning_rate": 4.5963871615687045e-07, + "loss": 0.6157, + "step": 1532 + }, + { + "epoch": 0.6917089678510998, + "grad_norm": 2.8869266707436823, + "learning_rate": 4.584092148546254e-07, + "loss": 0.6354, + "step": 1533 + }, + { + "epoch": 0.6921601804850536, + "grad_norm": 3.3918689462338265, + "learning_rate": 4.5718087099135773e-07, + "loss": 0.5702, + "step": 1534 + }, + { + "epoch": 0.6926113931190073, + "grad_norm": 3.2001428288161344, + "learning_rate": 4.5595368719217397e-07, + "loss": 0.5083, + "step": 1535 + }, + { + "epoch": 0.6930626057529611, + "grad_norm": 3.084523211589884, + "learning_rate": 4.5472766607970024e-07, + "loss": 0.5727, + "step": 1536 + }, + { + "epoch": 0.6935138183869148, + "grad_norm": 3.157807512575682, + "learning_rate": 4.5350281027407843e-07, + "loss": 0.5678, + "step": 1537 + }, + { + "epoch": 0.6939650310208686, + "grad_norm": 3.2071372806652905, + "learning_rate": 4.522791223929596e-07, + "loss": 0.5597, + "step": 1538 + }, + { + "epoch": 0.6944162436548224, + "grad_norm": 3.225339649209973, + "learning_rate": 4.51056605051499e-07, + "loss": 0.5693, + "step": 1539 + }, + { + "epoch": 0.6948674562887761, + "grad_norm": 3.0766839452947856, + "learning_rate": 4.4983526086235103e-07, + "loss": 0.5093, + "step": 1540 + }, + { + "epoch": 0.6953186689227299, + "grad_norm": 3.3661435682809535, + "learning_rate": 4.4861509243566164e-07, + "loss": 0.6246, + "step": 1541 + }, + { + "epoch": 0.6957698815566836, + "grad_norm": 2.782237105741097, + "learning_rate": 4.4739610237906545e-07, + "loss": 0.5195, + "step": 1542 + }, + { + "epoch": 0.6962210941906374, + "grad_norm": 3.1390391034941563, + "learning_rate": 4.461782932976782e-07, + "loss": 0.6254, + "step": 1543 + }, + { + "epoch": 0.6966723068245911, + "grad_norm": 2.837001264646572, + "learning_rate": 4.4496166779409026e-07, + "loss": 0.5682, + "step": 1544 + }, + { + "epoch": 0.6971235194585449, + "grad_norm": 3.151679334520682, + "learning_rate": 4.437462284683653e-07, + "loss": 0.5967, + "step": 1545 + }, + { + "epoch": 0.6975747320924985, + "grad_norm": 3.2614811697248096, + "learning_rate": 4.4253197791802967e-07, + "loss": 0.4696, + "step": 1546 + }, + { + "epoch": 0.6980259447264523, + "grad_norm": 3.1064249116523706, + "learning_rate": 4.4131891873807103e-07, + "loss": 0.5832, + "step": 1547 + }, + { + "epoch": 0.698477157360406, + "grad_norm": 3.7129814910314884, + "learning_rate": 4.401070535209296e-07, + "loss": 0.6575, + "step": 1548 + }, + { + "epoch": 0.6989283699943598, + "grad_norm": 3.5598656251240293, + "learning_rate": 4.3889638485649405e-07, + "loss": 0.5255, + "step": 1549 + }, + { + "epoch": 0.6993795826283136, + "grad_norm": 3.0419102505278355, + "learning_rate": 4.3768691533209735e-07, + "loss": 0.5589, + "step": 1550 + }, + { + "epoch": 0.6998307952622673, + "grad_norm": 3.250677316394861, + "learning_rate": 4.364786475325072e-07, + "loss": 0.4775, + "step": 1551 + }, + { + "epoch": 0.7002820078962211, + "grad_norm": 3.012263276867671, + "learning_rate": 4.3527158403992567e-07, + "loss": 0.503, + "step": 1552 + }, + { + "epoch": 0.7007332205301748, + "grad_norm": 3.120176925995655, + "learning_rate": 4.3406572743397975e-07, + "loss": 0.6046, + "step": 1553 + }, + { + "epoch": 0.7011844331641286, + "grad_norm": 3.134496372403675, + "learning_rate": 4.3286108029171685e-07, + "loss": 0.5061, + "step": 1554 + }, + { + "epoch": 0.7016356457980824, + "grad_norm": 3.5153313421474373, + "learning_rate": 4.3165764518760104e-07, + "loss": 0.556, + "step": 1555 + }, + { + "epoch": 0.7020868584320361, + "grad_norm": 3.361928587880027, + "learning_rate": 4.304554246935049e-07, + "loss": 0.5555, + "step": 1556 + }, + { + "epoch": 0.7025380710659899, + "grad_norm": 3.113389444703251, + "learning_rate": 4.292544213787056e-07, + "loss": 0.508, + "step": 1557 + }, + { + "epoch": 0.7029892836999436, + "grad_norm": 2.951119900702608, + "learning_rate": 4.280546378098792e-07, + "loss": 0.4627, + "step": 1558 + }, + { + "epoch": 0.7034404963338974, + "grad_norm": 3.3068113437135747, + "learning_rate": 4.2685607655109446e-07, + "loss": 0.4641, + "step": 1559 + }, + { + "epoch": 0.7038917089678511, + "grad_norm": 3.443473545652759, + "learning_rate": 4.256587401638091e-07, + "loss": 0.5881, + "step": 1560 + }, + { + "epoch": 0.7043429216018049, + "grad_norm": 3.672614498272048, + "learning_rate": 4.244626312068622e-07, + "loss": 0.5562, + "step": 1561 + }, + { + "epoch": 0.7047941342357587, + "grad_norm": 3.1807964103718316, + "learning_rate": 4.232677522364696e-07, + "loss": 0.514, + "step": 1562 + }, + { + "epoch": 0.7052453468697123, + "grad_norm": 2.8379876257690118, + "learning_rate": 4.220741058062194e-07, + "loss": 0.6209, + "step": 1563 + }, + { + "epoch": 0.705696559503666, + "grad_norm": 3.115298174903329, + "learning_rate": 4.2088169446706487e-07, + "loss": 0.5598, + "step": 1564 + }, + { + "epoch": 0.7061477721376198, + "grad_norm": 3.616655218693416, + "learning_rate": 4.1969052076732005e-07, + "loss": 0.5609, + "step": 1565 + }, + { + "epoch": 0.7065989847715736, + "grad_norm": 3.209766337154335, + "learning_rate": 4.185005872526538e-07, + "loss": 0.5969, + "step": 1566 + }, + { + "epoch": 0.7070501974055273, + "grad_norm": 3.469400141091681, + "learning_rate": 4.1731189646608434e-07, + "loss": 0.6814, + "step": 1567 + }, + { + "epoch": 0.7075014100394811, + "grad_norm": 3.1397943853367374, + "learning_rate": 4.161244509479751e-07, + "loss": 0.5017, + "step": 1568 + }, + { + "epoch": 0.7079526226734348, + "grad_norm": 3.5877431815311436, + "learning_rate": 4.1493825323602737e-07, + "loss": 0.6132, + "step": 1569 + }, + { + "epoch": 0.7084038353073886, + "grad_norm": 2.681923696233627, + "learning_rate": 4.137533058652758e-07, + "loss": 0.516, + "step": 1570 + }, + { + "epoch": 0.7088550479413424, + "grad_norm": 3.389585300189951, + "learning_rate": 4.12569611368083e-07, + "loss": 0.5234, + "step": 1571 + }, + { + "epoch": 0.7093062605752961, + "grad_norm": 3.180876954986152, + "learning_rate": 4.113871722741337e-07, + "loss": 0.4276, + "step": 1572 + }, + { + "epoch": 0.7097574732092499, + "grad_norm": 3.932081857493054, + "learning_rate": 4.1020599111043084e-07, + "loss": 0.5718, + "step": 1573 + }, + { + "epoch": 0.7102086858432036, + "grad_norm": 3.0660045119592416, + "learning_rate": 4.09026070401288e-07, + "loss": 0.4444, + "step": 1574 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 2.9150397257470377, + "learning_rate": 4.078474126683249e-07, + "loss": 0.5647, + "step": 1575 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 3.260434948896054, + "learning_rate": 4.0667002043046304e-07, + "loss": 0.5476, + "step": 1576 + }, + { + "epoch": 0.7115623237450649, + "grad_norm": 3.2788448633519867, + "learning_rate": 4.0549389620391896e-07, + "loss": 0.5775, + "step": 1577 + }, + { + "epoch": 0.7120135363790187, + "grad_norm": 3.0034190041080064, + "learning_rate": 4.0431904250219893e-07, + "loss": 0.5515, + "step": 1578 + }, + { + "epoch": 0.7124647490129724, + "grad_norm": 3.2382651735959223, + "learning_rate": 4.031454618360944e-07, + "loss": 0.4369, + "step": 1579 + }, + { + "epoch": 0.7129159616469262, + "grad_norm": 3.0531108303993135, + "learning_rate": 4.0197315671367593e-07, + "loss": 0.5064, + "step": 1580 + }, + { + "epoch": 0.7133671742808798, + "grad_norm": 2.9453398873817025, + "learning_rate": 4.008021296402888e-07, + "loss": 0.427, + "step": 1581 + }, + { + "epoch": 0.7138183869148336, + "grad_norm": 3.265675895615181, + "learning_rate": 3.996323831185462e-07, + "loss": 0.6767, + "step": 1582 + }, + { + "epoch": 0.7142695995487873, + "grad_norm": 2.9358827987112535, + "learning_rate": 3.984639196483245e-07, + "loss": 0.5785, + "step": 1583 + }, + { + "epoch": 0.7147208121827411, + "grad_norm": 3.3867250177331782, + "learning_rate": 3.9729674172675954e-07, + "loss": 0.5236, + "step": 1584 + }, + { + "epoch": 0.7151720248166948, + "grad_norm": 3.31448568517522, + "learning_rate": 3.961308518482372e-07, + "loss": 0.609, + "step": 1585 + }, + { + "epoch": 0.7156232374506486, + "grad_norm": 3.71248092590625, + "learning_rate": 3.949662525043934e-07, + "loss": 0.6091, + "step": 1586 + }, + { + "epoch": 0.7160744500846024, + "grad_norm": 3.8185840047248125, + "learning_rate": 3.9380294618410434e-07, + "loss": 0.6641, + "step": 1587 + }, + { + "epoch": 0.7165256627185561, + "grad_norm": 3.3840710545866632, + "learning_rate": 3.92640935373483e-07, + "loss": 0.5043, + "step": 1588 + }, + { + "epoch": 0.7169768753525099, + "grad_norm": 3.109852062826725, + "learning_rate": 3.9148022255587487e-07, + "loss": 0.5068, + "step": 1589 + }, + { + "epoch": 0.7174280879864636, + "grad_norm": 3.0466909947937184, + "learning_rate": 3.9032081021185024e-07, + "loss": 0.8023, + "step": 1590 + }, + { + "epoch": 0.7178793006204174, + "grad_norm": 3.494445337066531, + "learning_rate": 3.891627008192007e-07, + "loss": 0.6285, + "step": 1591 + }, + { + "epoch": 0.7183305132543711, + "grad_norm": 3.0639765960841387, + "learning_rate": 3.8800589685293294e-07, + "loss": 0.4803, + "step": 1592 + }, + { + "epoch": 0.7187817258883249, + "grad_norm": 3.370717647642269, + "learning_rate": 3.868504007852641e-07, + "loss": 0.5468, + "step": 1593 + }, + { + "epoch": 0.7192329385222787, + "grad_norm": 3.252865929125753, + "learning_rate": 3.8569621508561666e-07, + "loss": 0.5753, + "step": 1594 + }, + { + "epoch": 0.7196841511562324, + "grad_norm": 3.1394814071549484, + "learning_rate": 3.845433422206118e-07, + "loss": 0.4729, + "step": 1595 + }, + { + "epoch": 0.7201353637901862, + "grad_norm": 3.1631245452667973, + "learning_rate": 3.833917846540651e-07, + "loss": 0.6474, + "step": 1596 + }, + { + "epoch": 0.7205865764241399, + "grad_norm": 3.162870418400208, + "learning_rate": 3.8224154484698234e-07, + "loss": 0.4195, + "step": 1597 + }, + { + "epoch": 0.7210377890580937, + "grad_norm": 3.161335436131402, + "learning_rate": 3.8109262525755183e-07, + "loss": 0.5058, + "step": 1598 + }, + { + "epoch": 0.7214890016920473, + "grad_norm": 3.3185670575255433, + "learning_rate": 3.7994502834114085e-07, + "loss": 0.4979, + "step": 1599 + }, + { + "epoch": 0.7219402143260011, + "grad_norm": 3.176929378675548, + "learning_rate": 3.7879875655029017e-07, + "loss": 0.5728, + "step": 1600 + }, + { + "epoch": 0.7223914269599548, + "grad_norm": 3.1684081635191066, + "learning_rate": 3.7765381233470794e-07, + "loss": 0.4601, + "step": 1601 + }, + { + "epoch": 0.7228426395939086, + "grad_norm": 3.127695957202654, + "learning_rate": 3.765101981412665e-07, + "loss": 0.5002, + "step": 1602 + }, + { + "epoch": 0.7232938522278624, + "grad_norm": 3.219550164344783, + "learning_rate": 3.753679164139947e-07, + "loss": 0.5385, + "step": 1603 + }, + { + "epoch": 0.7237450648618161, + "grad_norm": 3.601540512399754, + "learning_rate": 3.742269695940734e-07, + "loss": 0.664, + "step": 1604 + }, + { + "epoch": 0.7241962774957699, + "grad_norm": 3.375042559176746, + "learning_rate": 3.730873601198326e-07, + "loss": 0.6187, + "step": 1605 + }, + { + "epoch": 0.7246474901297236, + "grad_norm": 3.0597581559151794, + "learning_rate": 3.7194909042674115e-07, + "loss": 0.4807, + "step": 1606 + }, + { + "epoch": 0.7250987027636774, + "grad_norm": 2.9753785627396105, + "learning_rate": 3.708121629474077e-07, + "loss": 0.5375, + "step": 1607 + }, + { + "epoch": 0.7255499153976311, + "grad_norm": 2.9762038394272765, + "learning_rate": 3.6967658011157054e-07, + "loss": 0.4504, + "step": 1608 + }, + { + "epoch": 0.7260011280315849, + "grad_norm": 3.1965956621894063, + "learning_rate": 3.6854234434609477e-07, + "loss": 0.5014, + "step": 1609 + }, + { + "epoch": 0.7264523406655387, + "grad_norm": 3.499485873703523, + "learning_rate": 3.6740945807496736e-07, + "loss": 0.4979, + "step": 1610 + }, + { + "epoch": 0.7269035532994924, + "grad_norm": 3.2588338513217274, + "learning_rate": 3.662779237192899e-07, + "loss": 0.5223, + "step": 1611 + }, + { + "epoch": 0.7273547659334462, + "grad_norm": 3.0490054858602584, + "learning_rate": 3.6514774369727676e-07, + "loss": 0.5, + "step": 1612 + }, + { + "epoch": 0.7278059785673999, + "grad_norm": 3.12273878068711, + "learning_rate": 3.6401892042424556e-07, + "loss": 0.5755, + "step": 1613 + }, + { + "epoch": 0.7282571912013537, + "grad_norm": 3.073986060170658, + "learning_rate": 3.6289145631261554e-07, + "loss": 0.5491, + "step": 1614 + }, + { + "epoch": 0.7287084038353074, + "grad_norm": 3.290781958913942, + "learning_rate": 3.617653537719022e-07, + "loss": 0.5943, + "step": 1615 + }, + { + "epoch": 0.7291596164692611, + "grad_norm": 3.400579286261728, + "learning_rate": 3.606406152087095e-07, + "loss": 0.5581, + "step": 1616 + }, + { + "epoch": 0.7296108291032148, + "grad_norm": 3.450947773384265, + "learning_rate": 3.595172430267279e-07, + "loss": 0.5873, + "step": 1617 + }, + { + "epoch": 0.7300620417371686, + "grad_norm": 2.8791493307158014, + "learning_rate": 3.583952396267269e-07, + "loss": 0.5159, + "step": 1618 + }, + { + "epoch": 0.7305132543711224, + "grad_norm": 3.3597310964990994, + "learning_rate": 3.572746074065509e-07, + "loss": 0.5292, + "step": 1619 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 3.2911307641618457, + "learning_rate": 3.56155348761114e-07, + "loss": 0.6023, + "step": 1620 + }, + { + "epoch": 0.7314156796390299, + "grad_norm": 3.208895269155749, + "learning_rate": 3.550374660823948e-07, + "loss": 0.6399, + "step": 1621 + }, + { + "epoch": 0.7318668922729836, + "grad_norm": 3.411723131804797, + "learning_rate": 3.539209617594311e-07, + "loss": 0.5386, + "step": 1622 + }, + { + "epoch": 0.7323181049069374, + "grad_norm": 3.017726904850449, + "learning_rate": 3.5280583817831577e-07, + "loss": 0.5199, + "step": 1623 + }, + { + "epoch": 0.7327693175408911, + "grad_norm": 3.448141993818582, + "learning_rate": 3.516920977221898e-07, + "loss": 0.6594, + "step": 1624 + }, + { + "epoch": 0.7332205301748449, + "grad_norm": 3.1018446805384268, + "learning_rate": 3.5057974277123935e-07, + "loss": 0.432, + "step": 1625 + }, + { + "epoch": 0.7336717428087987, + "grad_norm": 3.292832340665522, + "learning_rate": 3.494687757026894e-07, + "loss": 0.6043, + "step": 1626 + }, + { + "epoch": 0.7341229554427524, + "grad_norm": 3.37654871788207, + "learning_rate": 3.483591988907972e-07, + "loss": 0.6076, + "step": 1627 + }, + { + "epoch": 0.7345741680767062, + "grad_norm": 2.9139739574958794, + "learning_rate": 3.472510147068515e-07, + "loss": 0.5498, + "step": 1628 + }, + { + "epoch": 0.7350253807106599, + "grad_norm": 3.0177269585869415, + "learning_rate": 3.461442255191628e-07, + "loss": 0.4855, + "step": 1629 + }, + { + "epoch": 0.7354765933446137, + "grad_norm": 3.5169507921427083, + "learning_rate": 3.4503883369306176e-07, + "loss": 0.5087, + "step": 1630 + }, + { + "epoch": 0.7359278059785674, + "grad_norm": 3.028359718330194, + "learning_rate": 3.439348415908918e-07, + "loss": 0.5116, + "step": 1631 + }, + { + "epoch": 0.7363790186125212, + "grad_norm": 3.3434023450295416, + "learning_rate": 3.42832251572005e-07, + "loss": 0.623, + "step": 1632 + }, + { + "epoch": 0.736830231246475, + "grad_norm": 3.0861385669753822, + "learning_rate": 3.4173106599275827e-07, + "loss": 0.5863, + "step": 1633 + }, + { + "epoch": 0.7372814438804286, + "grad_norm": 3.4871736340602344, + "learning_rate": 3.406312872065047e-07, + "loss": 0.6979, + "step": 1634 + }, + { + "epoch": 0.7377326565143824, + "grad_norm": 3.3007203094285322, + "learning_rate": 3.395329175635935e-07, + "loss": 0.5642, + "step": 1635 + }, + { + "epoch": 0.7381838691483361, + "grad_norm": 3.042983266092788, + "learning_rate": 3.384359594113606e-07, + "loss": 0.5794, + "step": 1636 + }, + { + "epoch": 0.7386350817822899, + "grad_norm": 3.1322148623384174, + "learning_rate": 3.373404150941258e-07, + "loss": 0.5112, + "step": 1637 + }, + { + "epoch": 0.7390862944162436, + "grad_norm": 3.3451076966064037, + "learning_rate": 3.3624628695318845e-07, + "loss": 0.5563, + "step": 1638 + }, + { + "epoch": 0.7395375070501974, + "grad_norm": 2.898388910225105, + "learning_rate": 3.3515357732682e-07, + "loss": 0.5745, + "step": 1639 + }, + { + "epoch": 0.7399887196841511, + "grad_norm": 3.3344993868159536, + "learning_rate": 3.34062288550261e-07, + "loss": 0.5693, + "step": 1640 + }, + { + "epoch": 0.7404399323181049, + "grad_norm": 3.399041172987303, + "learning_rate": 3.3297242295571526e-07, + "loss": 0.4952, + "step": 1641 + }, + { + "epoch": 0.7408911449520587, + "grad_norm": 3.552119321092643, + "learning_rate": 3.3188398287234496e-07, + "loss": 0.5461, + "step": 1642 + }, + { + "epoch": 0.7413423575860124, + "grad_norm": 3.1787290156115406, + "learning_rate": 3.307969706262669e-07, + "loss": 0.6037, + "step": 1643 + }, + { + "epoch": 0.7417935702199662, + "grad_norm": 3.5000027663014492, + "learning_rate": 3.2971138854054505e-07, + "loss": 0.5315, + "step": 1644 + }, + { + "epoch": 0.7422447828539199, + "grad_norm": 3.1720666447080808, + "learning_rate": 3.286272389351874e-07, + "loss": 0.5329, + "step": 1645 + }, + { + "epoch": 0.7426959954878737, + "grad_norm": 3.09152659928442, + "learning_rate": 3.2754452412714153e-07, + "loss": 0.4382, + "step": 1646 + }, + { + "epoch": 0.7431472081218274, + "grad_norm": 2.798294812436822, + "learning_rate": 3.2646324643028664e-07, + "loss": 0.4975, + "step": 1647 + }, + { + "epoch": 0.7435984207557812, + "grad_norm": 3.1293398353599393, + "learning_rate": 3.2538340815543287e-07, + "loss": 0.463, + "step": 1648 + }, + { + "epoch": 0.744049633389735, + "grad_norm": 3.291575473368299, + "learning_rate": 3.243050116103128e-07, + "loss": 0.4967, + "step": 1649 + }, + { + "epoch": 0.7445008460236887, + "grad_norm": 3.1465605176299825, + "learning_rate": 3.2322805909957795e-07, + "loss": 0.5417, + "step": 1650 + }, + { + "epoch": 0.7449520586576425, + "grad_norm": 2.9534255260466624, + "learning_rate": 3.221525529247949e-07, + "loss": 0.484, + "step": 1651 + }, + { + "epoch": 0.7454032712915961, + "grad_norm": 3.2799558091998215, + "learning_rate": 3.21078495384438e-07, + "loss": 0.5042, + "step": 1652 + }, + { + "epoch": 0.7458544839255499, + "grad_norm": 3.3413912958342102, + "learning_rate": 3.20005888773886e-07, + "loss": 0.5555, + "step": 1653 + }, + { + "epoch": 0.7463056965595036, + "grad_norm": 3.5148486290813548, + "learning_rate": 3.189347353854173e-07, + "loss": 0.5452, + "step": 1654 + }, + { + "epoch": 0.7467569091934574, + "grad_norm": 3.235645631726398, + "learning_rate": 3.178650375082038e-07, + "loss": 0.5124, + "step": 1655 + }, + { + "epoch": 0.7472081218274111, + "grad_norm": 3.335777914025934, + "learning_rate": 3.16796797428308e-07, + "loss": 0.551, + "step": 1656 + }, + { + "epoch": 0.7476593344613649, + "grad_norm": 3.3336251599770117, + "learning_rate": 3.157300174286759e-07, + "loss": 0.4606, + "step": 1657 + }, + { + "epoch": 0.7481105470953187, + "grad_norm": 3.3954076174897683, + "learning_rate": 3.1466469978913325e-07, + "loss": 0.5403, + "step": 1658 + }, + { + "epoch": 0.7485617597292724, + "grad_norm": 3.1227703274844614, + "learning_rate": 3.136008467863814e-07, + "loss": 0.4239, + "step": 1659 + }, + { + "epoch": 0.7490129723632262, + "grad_norm": 3.144214684776084, + "learning_rate": 3.125384606939908e-07, + "loss": 0.4472, + "step": 1660 + }, + { + "epoch": 0.7494641849971799, + "grad_norm": 3.075384566057599, + "learning_rate": 3.114775437823971e-07, + "loss": 0.4441, + "step": 1661 + }, + { + "epoch": 0.7499153976311337, + "grad_norm": 3.407644282112315, + "learning_rate": 3.104180983188963e-07, + "loss": 0.6542, + "step": 1662 + }, + { + "epoch": 0.7503666102650874, + "grad_norm": 3.0829123476918703, + "learning_rate": 3.0936012656763933e-07, + "loss": 0.5066, + "step": 1663 + }, + { + "epoch": 0.7508178228990412, + "grad_norm": 3.197121809224084, + "learning_rate": 3.0830363078962853e-07, + "loss": 0.5604, + "step": 1664 + }, + { + "epoch": 0.751269035532995, + "grad_norm": 3.4708630812887162, + "learning_rate": 3.0724861324271136e-07, + "loss": 0.4919, + "step": 1665 + }, + { + "epoch": 0.7517202481669487, + "grad_norm": 3.2521891509707173, + "learning_rate": 3.061950761815755e-07, + "loss": 0.6188, + "step": 1666 + }, + { + "epoch": 0.7521714608009025, + "grad_norm": 3.047231852620941, + "learning_rate": 3.0514302185774653e-07, + "loss": 0.5921, + "step": 1667 + }, + { + "epoch": 0.7526226734348562, + "grad_norm": 3.2765361212999204, + "learning_rate": 3.040924525195786e-07, + "loss": 0.5818, + "step": 1668 + }, + { + "epoch": 0.7530738860688099, + "grad_norm": 3.299009096476747, + "learning_rate": 3.030433704122549e-07, + "loss": 0.5344, + "step": 1669 + }, + { + "epoch": 0.7535250987027636, + "grad_norm": 2.905729407389529, + "learning_rate": 3.0199577777777875e-07, + "loss": 0.5905, + "step": 1670 + }, + { + "epoch": 0.7539763113367174, + "grad_norm": 2.886347624748166, + "learning_rate": 3.0094967685497017e-07, + "loss": 0.5058, + "step": 1671 + }, + { + "epoch": 0.7544275239706711, + "grad_norm": 3.3966367759928184, + "learning_rate": 2.999050698794624e-07, + "loss": 0.6179, + "step": 1672 + }, + { + "epoch": 0.7548787366046249, + "grad_norm": 3.6135542251703874, + "learning_rate": 2.9886195908369504e-07, + "loss": 0.5298, + "step": 1673 + }, + { + "epoch": 0.7553299492385787, + "grad_norm": 3.3688596527662864, + "learning_rate": 2.9782034669691027e-07, + "loss": 0.5498, + "step": 1674 + }, + { + "epoch": 0.7557811618725324, + "grad_norm": 3.2566793348820537, + "learning_rate": 2.9678023494514815e-07, + "loss": 0.5749, + "step": 1675 + }, + { + "epoch": 0.7562323745064862, + "grad_norm": 3.0116943004039576, + "learning_rate": 2.957416260512414e-07, + "loss": 0.4711, + "step": 1676 + }, + { + "epoch": 0.7566835871404399, + "grad_norm": 3.52833333229487, + "learning_rate": 2.94704522234812e-07, + "loss": 0.5615, + "step": 1677 + }, + { + "epoch": 0.7571347997743937, + "grad_norm": 3.075366987645075, + "learning_rate": 2.936689257122642e-07, + "loss": 0.485, + "step": 1678 + }, + { + "epoch": 0.7575860124083474, + "grad_norm": 3.3966268101373216, + "learning_rate": 2.926348386967813e-07, + "loss": 0.6323, + "step": 1679 + }, + { + "epoch": 0.7580372250423012, + "grad_norm": 2.8446858198235483, + "learning_rate": 2.9160226339832137e-07, + "loss": 0.4601, + "step": 1680 + }, + { + "epoch": 0.758488437676255, + "grad_norm": 3.3602462761752934, + "learning_rate": 2.9057120202361094e-07, + "loss": 0.4662, + "step": 1681 + }, + { + "epoch": 0.7589396503102087, + "grad_norm": 3.3791987672981794, + "learning_rate": 2.895416567761414e-07, + "loss": 0.6081, + "step": 1682 + }, + { + "epoch": 0.7593908629441625, + "grad_norm": 3.4250371237897217, + "learning_rate": 2.8851362985616387e-07, + "loss": 0.5844, + "step": 1683 + }, + { + "epoch": 0.7598420755781162, + "grad_norm": 3.46770568979179, + "learning_rate": 2.874871234606846e-07, + "loss": 0.5083, + "step": 1684 + }, + { + "epoch": 0.76029328821207, + "grad_norm": 3.4131489026616357, + "learning_rate": 2.86462139783461e-07, + "loss": 0.5537, + "step": 1685 + }, + { + "epoch": 0.7607445008460237, + "grad_norm": 3.270548918149311, + "learning_rate": 2.854386810149955e-07, + "loss": 0.4998, + "step": 1686 + }, + { + "epoch": 0.7611957134799774, + "grad_norm": 3.530660029135758, + "learning_rate": 2.8441674934253135e-07, + "loss": 0.5188, + "step": 1687 + }, + { + "epoch": 0.7616469261139311, + "grad_norm": 3.386488218146081, + "learning_rate": 2.8339634695005e-07, + "loss": 0.529, + "step": 1688 + }, + { + "epoch": 0.7620981387478849, + "grad_norm": 3.1388677063133716, + "learning_rate": 2.823774760182619e-07, + "loss": 0.6524, + "step": 1689 + }, + { + "epoch": 0.7625493513818387, + "grad_norm": 3.4194129454744906, + "learning_rate": 2.813601387246073e-07, + "loss": 0.6083, + "step": 1690 + }, + { + "epoch": 0.7630005640157924, + "grad_norm": 3.253968578183408, + "learning_rate": 2.8034433724324715e-07, + "loss": 0.5288, + "step": 1691 + }, + { + "epoch": 0.7634517766497462, + "grad_norm": 2.8786430697629752, + "learning_rate": 2.7933007374506045e-07, + "loss": 0.4924, + "step": 1692 + }, + { + "epoch": 0.7639029892836999, + "grad_norm": 3.7085567077881403, + "learning_rate": 2.783173503976405e-07, + "loss": 0.6246, + "step": 1693 + }, + { + "epoch": 0.7643542019176537, + "grad_norm": 2.977804758799455, + "learning_rate": 2.7730616936528763e-07, + "loss": 0.5176, + "step": 1694 + }, + { + "epoch": 0.7648054145516074, + "grad_norm": 3.327404515557504, + "learning_rate": 2.7629653280900744e-07, + "loss": 0.5808, + "step": 1695 + }, + { + "epoch": 0.7652566271855612, + "grad_norm": 3.069935757487436, + "learning_rate": 2.7528844288650345e-07, + "loss": 0.4892, + "step": 1696 + }, + { + "epoch": 0.765707839819515, + "grad_norm": 3.2131875481565553, + "learning_rate": 2.7428190175217437e-07, + "loss": 0.5039, + "step": 1697 + }, + { + "epoch": 0.7661590524534687, + "grad_norm": 2.9817468613273737, + "learning_rate": 2.7327691155710976e-07, + "loss": 0.5042, + "step": 1698 + }, + { + "epoch": 0.7666102650874225, + "grad_norm": 3.3064977092399097, + "learning_rate": 2.7227347444908347e-07, + "loss": 0.4653, + "step": 1699 + }, + { + "epoch": 0.7670614777213762, + "grad_norm": 3.0238137206901183, + "learning_rate": 2.7127159257255136e-07, + "loss": 0.464, + "step": 1700 + }, + { + "epoch": 0.76751269035533, + "grad_norm": 3.169051965683476, + "learning_rate": 2.7027126806864465e-07, + "loss": 0.708, + "step": 1701 + }, + { + "epoch": 0.7679639029892837, + "grad_norm": 3.133987039383184, + "learning_rate": 2.692725030751668e-07, + "loss": 0.5191, + "step": 1702 + }, + { + "epoch": 0.7684151156232375, + "grad_norm": 3.1774428494885063, + "learning_rate": 2.6827529972658816e-07, + "loss": 0.5386, + "step": 1703 + }, + { + "epoch": 0.7688663282571913, + "grad_norm": 3.1659341840204474, + "learning_rate": 2.6727966015404144e-07, + "loss": 0.5514, + "step": 1704 + }, + { + "epoch": 0.7693175408911449, + "grad_norm": 3.567018735324809, + "learning_rate": 2.662855864853184e-07, + "loss": 0.5817, + "step": 1705 + }, + { + "epoch": 0.7697687535250987, + "grad_norm": 3.1177868647304514, + "learning_rate": 2.6529308084486334e-07, + "loss": 0.5047, + "step": 1706 + }, + { + "epoch": 0.7702199661590524, + "grad_norm": 3.6947986746375534, + "learning_rate": 2.643021453537695e-07, + "loss": 0.5996, + "step": 1707 + }, + { + "epoch": 0.7706711787930062, + "grad_norm": 3.026216223845999, + "learning_rate": 2.633127821297754e-07, + "loss": 0.5287, + "step": 1708 + }, + { + "epoch": 0.7711223914269599, + "grad_norm": 3.4446775186050633, + "learning_rate": 2.623249932872589e-07, + "loss": 0.5906, + "step": 1709 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 3.5855936274951534, + "learning_rate": 2.6133878093723227e-07, + "loss": 0.5337, + "step": 1710 + }, + { + "epoch": 0.7720248166948674, + "grad_norm": 3.28611792990953, + "learning_rate": 2.603541471873405e-07, + "loss": 0.552, + "step": 1711 + }, + { + "epoch": 0.7724760293288212, + "grad_norm": 3.4992707402338823, + "learning_rate": 2.5937109414185364e-07, + "loss": 0.4644, + "step": 1712 + }, + { + "epoch": 0.772927241962775, + "grad_norm": 2.9170590421526104, + "learning_rate": 2.583896239016643e-07, + "loss": 0.4364, + "step": 1713 + }, + { + "epoch": 0.7733784545967287, + "grad_norm": 2.7757056581823782, + "learning_rate": 2.5740973856428205e-07, + "loss": 0.55, + "step": 1714 + }, + { + "epoch": 0.7738296672306825, + "grad_norm": 3.4368687127239195, + "learning_rate": 2.56431440223829e-07, + "loss": 0.6541, + "step": 1715 + }, + { + "epoch": 0.7742808798646362, + "grad_norm": 3.655602500119933, + "learning_rate": 2.5545473097103723e-07, + "loss": 0.5804, + "step": 1716 + }, + { + "epoch": 0.77473209249859, + "grad_norm": 2.9071464976070995, + "learning_rate": 2.5447961289324024e-07, + "loss": 0.4734, + "step": 1717 + }, + { + "epoch": 0.7751833051325437, + "grad_norm": 3.0403001990908383, + "learning_rate": 2.5350608807437356e-07, + "loss": 0.4796, + "step": 1718 + }, + { + "epoch": 0.7756345177664975, + "grad_norm": 3.401670150665487, + "learning_rate": 2.525341585949662e-07, + "loss": 0.6389, + "step": 1719 + }, + { + "epoch": 0.7760857304004513, + "grad_norm": 3.2058804560472125, + "learning_rate": 2.5156382653213783e-07, + "loss": 0.5616, + "step": 1720 + }, + { + "epoch": 0.776536943034405, + "grad_norm": 2.9830405398098097, + "learning_rate": 2.5059509395959523e-07, + "loss": 0.4926, + "step": 1721 + }, + { + "epoch": 0.7769881556683587, + "grad_norm": 3.120574964296058, + "learning_rate": 2.496279629476261e-07, + "loss": 0.5861, + "step": 1722 + }, + { + "epoch": 0.7774393683023124, + "grad_norm": 3.390755640754869, + "learning_rate": 2.4866243556309554e-07, + "loss": 0.5654, + "step": 1723 + }, + { + "epoch": 0.7778905809362662, + "grad_norm": 2.892098028372284, + "learning_rate": 2.476985138694415e-07, + "loss": 0.4677, + "step": 1724 + }, + { + "epoch": 0.7783417935702199, + "grad_norm": 3.250949839615332, + "learning_rate": 2.467361999266704e-07, + "loss": 0.4674, + "step": 1725 + }, + { + "epoch": 0.7787930062041737, + "grad_norm": 3.149351653501838, + "learning_rate": 2.4577549579135317e-07, + "loss": 0.5966, + "step": 1726 + }, + { + "epoch": 0.7792442188381274, + "grad_norm": 2.983139524124556, + "learning_rate": 2.448164035166199e-07, + "loss": 0.4793, + "step": 1727 + }, + { + "epoch": 0.7796954314720812, + "grad_norm": 3.5014980256358172, + "learning_rate": 2.438589251521558e-07, + "loss": 0.5634, + "step": 1728 + }, + { + "epoch": 0.780146644106035, + "grad_norm": 2.9881299554568757, + "learning_rate": 2.4290306274419793e-07, + "loss": 0.5164, + "step": 1729 + }, + { + "epoch": 0.7805978567399887, + "grad_norm": 3.116274993285628, + "learning_rate": 2.419488183355284e-07, + "loss": 0.5957, + "step": 1730 + }, + { + "epoch": 0.7810490693739425, + "grad_norm": 3.148408583672423, + "learning_rate": 2.409961939654729e-07, + "loss": 0.4581, + "step": 1731 + }, + { + "epoch": 0.7815002820078962, + "grad_norm": 2.9310980700141784, + "learning_rate": 2.40045191669894e-07, + "loss": 0.5326, + "step": 1732 + }, + { + "epoch": 0.78195149464185, + "grad_norm": 2.841995600846533, + "learning_rate": 2.39095813481188e-07, + "loss": 0.3975, + "step": 1733 + }, + { + "epoch": 0.7824027072758037, + "grad_norm": 3.360507399857603, + "learning_rate": 2.381480614282807e-07, + "loss": 0.5217, + "step": 1734 + }, + { + "epoch": 0.7828539199097575, + "grad_norm": 3.3345269121171786, + "learning_rate": 2.37201937536622e-07, + "loss": 0.6034, + "step": 1735 + }, + { + "epoch": 0.7833051325437113, + "grad_norm": 3.2457173184176327, + "learning_rate": 2.362574438281827e-07, + "loss": 0.5572, + "step": 1736 + }, + { + "epoch": 0.783756345177665, + "grad_norm": 2.5998859002425654, + "learning_rate": 2.353145823214495e-07, + "loss": 0.4872, + "step": 1737 + }, + { + "epoch": 0.7842075578116188, + "grad_norm": 3.3464133340861433, + "learning_rate": 2.3437335503142063e-07, + "loss": 0.5293, + "step": 1738 + }, + { + "epoch": 0.7846587704455725, + "grad_norm": 3.4826017860389027, + "learning_rate": 2.3343376396960278e-07, + "loss": 0.487, + "step": 1739 + }, + { + "epoch": 0.7851099830795262, + "grad_norm": 3.359966006460181, + "learning_rate": 2.3249581114400507e-07, + "loss": 0.6411, + "step": 1740 + }, + { + "epoch": 0.7855611957134799, + "grad_norm": 3.108189759258103, + "learning_rate": 2.3155949855913515e-07, + "loss": 0.5771, + "step": 1741 + }, + { + "epoch": 0.7860124083474337, + "grad_norm": 3.206925034569629, + "learning_rate": 2.306248282159965e-07, + "loss": 0.5679, + "step": 1742 + }, + { + "epoch": 0.7864636209813874, + "grad_norm": 3.174330922672863, + "learning_rate": 2.2969180211208195e-07, + "loss": 0.5643, + "step": 1743 + }, + { + "epoch": 0.7869148336153412, + "grad_norm": 3.1224925255844, + "learning_rate": 2.2876042224137081e-07, + "loss": 0.5932, + "step": 1744 + }, + { + "epoch": 0.787366046249295, + "grad_norm": 3.1025659930391107, + "learning_rate": 2.2783069059432415e-07, + "loss": 0.5617, + "step": 1745 + }, + { + "epoch": 0.7878172588832487, + "grad_norm": 3.4166793932429416, + "learning_rate": 2.2690260915788029e-07, + "loss": 0.6249, + "step": 1746 + }, + { + "epoch": 0.7882684715172025, + "grad_norm": 3.62085438351349, + "learning_rate": 2.2597617991545158e-07, + "loss": 0.5649, + "step": 1747 + }, + { + "epoch": 0.7887196841511562, + "grad_norm": 3.5339972671901885, + "learning_rate": 2.2505140484691897e-07, + "loss": 0.6335, + "step": 1748 + }, + { + "epoch": 0.78917089678511, + "grad_norm": 3.4985614392697415, + "learning_rate": 2.2412828592862798e-07, + "loss": 0.6082, + "step": 1749 + }, + { + "epoch": 0.7896221094190637, + "grad_norm": 3.267096741544989, + "learning_rate": 2.2320682513338595e-07, + "loss": 0.5948, + "step": 1750 + }, + { + "epoch": 0.7900733220530175, + "grad_norm": 3.0173344659741486, + "learning_rate": 2.2228702443045454e-07, + "loss": 0.5234, + "step": 1751 + }, + { + "epoch": 0.7905245346869713, + "grad_norm": 2.7526133536838358, + "learning_rate": 2.213688857855499e-07, + "loss": 0.4142, + "step": 1752 + }, + { + "epoch": 0.790975747320925, + "grad_norm": 3.509571340813883, + "learning_rate": 2.2045241116083467e-07, + "loss": 0.602, + "step": 1753 + }, + { + "epoch": 0.7914269599548788, + "grad_norm": 3.3780395439717323, + "learning_rate": 2.195376025149156e-07, + "loss": 0.5572, + "step": 1754 + }, + { + "epoch": 0.7918781725888325, + "grad_norm": 3.3496281451911325, + "learning_rate": 2.1862446180283966e-07, + "loss": 0.5613, + "step": 1755 + }, + { + "epoch": 0.7923293852227863, + "grad_norm": 3.0508967113065713, + "learning_rate": 2.1771299097608864e-07, + "loss": 0.47, + "step": 1756 + }, + { + "epoch": 0.79278059785674, + "grad_norm": 3.1585837200192697, + "learning_rate": 2.1680319198257568e-07, + "loss": 0.4856, + "step": 1757 + }, + { + "epoch": 0.7932318104906937, + "grad_norm": 2.8592335827975366, + "learning_rate": 2.1589506676664125e-07, + "loss": 0.5273, + "step": 1758 + }, + { + "epoch": 0.7936830231246474, + "grad_norm": 3.5664002528716128, + "learning_rate": 2.1498861726904838e-07, + "loss": 0.4738, + "step": 1759 + }, + { + "epoch": 0.7941342357586012, + "grad_norm": 3.3257110040445705, + "learning_rate": 2.1408384542697953e-07, + "loss": 0.5896, + "step": 1760 + }, + { + "epoch": 0.794585448392555, + "grad_norm": 3.2893705188785627, + "learning_rate": 2.131807531740315e-07, + "loss": 0.5908, + "step": 1761 + }, + { + "epoch": 0.7950366610265087, + "grad_norm": 3.2439198495340724, + "learning_rate": 2.1227934244021106e-07, + "loss": 0.5418, + "step": 1762 + }, + { + "epoch": 0.7954878736604625, + "grad_norm": 2.890014525204039, + "learning_rate": 2.113796151519327e-07, + "loss": 0.5701, + "step": 1763 + }, + { + "epoch": 0.7959390862944162, + "grad_norm": 2.9555433016653243, + "learning_rate": 2.10481573232012e-07, + "loss": 0.5724, + "step": 1764 + }, + { + "epoch": 0.79639029892837, + "grad_norm": 3.1595499191577985, + "learning_rate": 2.0958521859966317e-07, + "loss": 0.58, + "step": 1765 + }, + { + "epoch": 0.7968415115623237, + "grad_norm": 3.2406481995005683, + "learning_rate": 2.0869055317049454e-07, + "loss": 0.5815, + "step": 1766 + }, + { + "epoch": 0.7972927241962775, + "grad_norm": 3.013567529689706, + "learning_rate": 2.0779757885650407e-07, + "loss": 0.4969, + "step": 1767 + }, + { + "epoch": 0.7977439368302313, + "grad_norm": 3.285613814118948, + "learning_rate": 2.0690629756607647e-07, + "loss": 0.5407, + "step": 1768 + }, + { + "epoch": 0.798195149464185, + "grad_norm": 3.05321531009751, + "learning_rate": 2.0601671120397747e-07, + "loss": 0.5402, + "step": 1769 + }, + { + "epoch": 0.7986463620981388, + "grad_norm": 3.3714424558205, + "learning_rate": 2.0512882167135047e-07, + "loss": 0.5022, + "step": 1770 + }, + { + "epoch": 0.7990975747320925, + "grad_norm": 3.3008668639596284, + "learning_rate": 2.042426308657138e-07, + "loss": 0.5349, + "step": 1771 + }, + { + "epoch": 0.7995487873660463, + "grad_norm": 3.5138662487604444, + "learning_rate": 2.0335814068095336e-07, + "loss": 0.5402, + "step": 1772 + }, + { + "epoch": 0.8, + "grad_norm": 2.9913939825555014, + "learning_rate": 2.0247535300732265e-07, + "loss": 0.5572, + "step": 1773 + }, + { + "epoch": 0.8004512126339538, + "grad_norm": 3.6435559524273406, + "learning_rate": 2.0159426973143568e-07, + "loss": 0.4992, + "step": 1774 + }, + { + "epoch": 0.8009024252679074, + "grad_norm": 3.2412230999711817, + "learning_rate": 2.0071489273626374e-07, + "loss": 0.6071, + "step": 1775 + }, + { + "epoch": 0.8013536379018612, + "grad_norm": 2.969248009278257, + "learning_rate": 1.9983722390113255e-07, + "loss": 0.5, + "step": 1776 + }, + { + "epoch": 0.801804850535815, + "grad_norm": 2.768580600024177, + "learning_rate": 1.9896126510171641e-07, + "loss": 0.4454, + "step": 1777 + }, + { + "epoch": 0.8022560631697687, + "grad_norm": 3.2476101536079787, + "learning_rate": 1.9808701821003614e-07, + "loss": 0.5911, + "step": 1778 + }, + { + "epoch": 0.8027072758037225, + "grad_norm": 2.9616657048741244, + "learning_rate": 1.972144850944526e-07, + "loss": 0.5681, + "step": 1779 + }, + { + "epoch": 0.8031584884376762, + "grad_norm": 3.105204828849402, + "learning_rate": 1.963436676196649e-07, + "loss": 0.5379, + "step": 1780 + }, + { + "epoch": 0.80360970107163, + "grad_norm": 3.292817844044641, + "learning_rate": 1.95474567646706e-07, + "loss": 0.5259, + "step": 1781 + }, + { + "epoch": 0.8040609137055837, + "grad_norm": 3.264350635402717, + "learning_rate": 1.9460718703293765e-07, + "loss": 0.4252, + "step": 1782 + }, + { + "epoch": 0.8045121263395375, + "grad_norm": 2.829404764906008, + "learning_rate": 1.9374152763204777e-07, + "loss": 0.5153, + "step": 1783 + }, + { + "epoch": 0.8049633389734913, + "grad_norm": 3.2480177263767063, + "learning_rate": 1.9287759129404536e-07, + "loss": 0.4709, + "step": 1784 + }, + { + "epoch": 0.805414551607445, + "grad_norm": 2.899713242999487, + "learning_rate": 1.920153798652574e-07, + "loss": 0.4611, + "step": 1785 + }, + { + "epoch": 0.8058657642413988, + "grad_norm": 3.1635502073692345, + "learning_rate": 1.9115489518832418e-07, + "loss": 0.4709, + "step": 1786 + }, + { + "epoch": 0.8063169768753525, + "grad_norm": 3.1508557769862438, + "learning_rate": 1.9029613910219577e-07, + "loss": 0.5433, + "step": 1787 + }, + { + "epoch": 0.8067681895093063, + "grad_norm": 3.5210820961966425, + "learning_rate": 1.8943911344212872e-07, + "loss": 0.6146, + "step": 1788 + }, + { + "epoch": 0.80721940214326, + "grad_norm": 3.3366881648206403, + "learning_rate": 1.8858382003968077e-07, + "loss": 0.5368, + "step": 1789 + }, + { + "epoch": 0.8076706147772138, + "grad_norm": 3.3247727610257005, + "learning_rate": 1.8773026072270759e-07, + "loss": 0.8037, + "step": 1790 + }, + { + "epoch": 0.8081218274111676, + "grad_norm": 3.4817252725861763, + "learning_rate": 1.8687843731535956e-07, + "loss": 0.5695, + "step": 1791 + }, + { + "epoch": 0.8085730400451213, + "grad_norm": 3.336918219285363, + "learning_rate": 1.8602835163807662e-07, + "loss": 0.6866, + "step": 1792 + }, + { + "epoch": 0.809024252679075, + "grad_norm": 3.614272310931608, + "learning_rate": 1.8518000550758527e-07, + "loss": 0.4701, + "step": 1793 + }, + { + "epoch": 0.8094754653130287, + "grad_norm": 3.219275535851942, + "learning_rate": 1.843334007368943e-07, + "loss": 0.6077, + "step": 1794 + }, + { + "epoch": 0.8099266779469825, + "grad_norm": 2.89680741186188, + "learning_rate": 1.8348853913529083e-07, + "loss": 0.6083, + "step": 1795 + }, + { + "epoch": 0.8103778905809362, + "grad_norm": 3.6574243799798385, + "learning_rate": 1.8264542250833748e-07, + "loss": 0.6222, + "step": 1796 + }, + { + "epoch": 0.81082910321489, + "grad_norm": 3.0840696058845793, + "learning_rate": 1.8180405265786657e-07, + "loss": 0.5185, + "step": 1797 + }, + { + "epoch": 0.8112803158488437, + "grad_norm": 3.027696852321835, + "learning_rate": 1.8096443138197804e-07, + "loss": 0.5319, + "step": 1798 + }, + { + "epoch": 0.8117315284827975, + "grad_norm": 3.159795651179558, + "learning_rate": 1.801265604750347e-07, + "loss": 0.5356, + "step": 1799 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 3.3263427018468628, + "learning_rate": 1.792904417276584e-07, + "loss": 0.6881, + "step": 1800 + }, + { + "epoch": 0.812633953750705, + "grad_norm": 3.491430348543849, + "learning_rate": 1.7845607692672726e-07, + "loss": 0.6757, + "step": 1801 + }, + { + "epoch": 0.8130851663846588, + "grad_norm": 3.452496098068734, + "learning_rate": 1.776234678553702e-07, + "loss": 0.634, + "step": 1802 + }, + { + "epoch": 0.8135363790186125, + "grad_norm": 3.046194568097652, + "learning_rate": 1.7679261629296405e-07, + "loss": 0.5394, + "step": 1803 + }, + { + "epoch": 0.8139875916525663, + "grad_norm": 3.41189405392855, + "learning_rate": 1.7596352401513025e-07, + "loss": 0.6579, + "step": 1804 + }, + { + "epoch": 0.81443880428652, + "grad_norm": 3.3370355365005433, + "learning_rate": 1.7513619279372982e-07, + "loss": 0.584, + "step": 1805 + }, + { + "epoch": 0.8148900169204738, + "grad_norm": 3.1564896201684296, + "learning_rate": 1.743106243968605e-07, + "loss": 0.454, + "step": 1806 + }, + { + "epoch": 0.8153412295544276, + "grad_norm": 3.4096665409080305, + "learning_rate": 1.7348682058885244e-07, + "loss": 0.5079, + "step": 1807 + }, + { + "epoch": 0.8157924421883813, + "grad_norm": 3.422237532293, + "learning_rate": 1.7266478313026467e-07, + "loss": 0.49, + "step": 1808 + }, + { + "epoch": 0.8162436548223351, + "grad_norm": 2.9182124049243043, + "learning_rate": 1.71844513777882e-07, + "loss": 0.5393, + "step": 1809 + }, + { + "epoch": 0.8166948674562888, + "grad_norm": 3.298575760582701, + "learning_rate": 1.7102601428470986e-07, + "loss": 0.5132, + "step": 1810 + }, + { + "epoch": 0.8171460800902425, + "grad_norm": 3.1076000971626323, + "learning_rate": 1.7020928639997133e-07, + "loss": 0.5096, + "step": 1811 + }, + { + "epoch": 0.8175972927241962, + "grad_norm": 2.9576537154131914, + "learning_rate": 1.6939433186910435e-07, + "loss": 0.4751, + "step": 1812 + }, + { + "epoch": 0.81804850535815, + "grad_norm": 3.1438542828461884, + "learning_rate": 1.6858115243375516e-07, + "loss": 0.635, + "step": 1813 + }, + { + "epoch": 0.8184997179921037, + "grad_norm": 3.3925384992387566, + "learning_rate": 1.6776974983177827e-07, + "loss": 0.5006, + "step": 1814 + }, + { + "epoch": 0.8189509306260575, + "grad_norm": 3.155615848697049, + "learning_rate": 1.6696012579722986e-07, + "loss": 0.5519, + "step": 1815 + }, + { + "epoch": 0.8194021432600113, + "grad_norm": 3.2055385260502516, + "learning_rate": 1.6615228206036524e-07, + "loss": 0.5571, + "step": 1816 + }, + { + "epoch": 0.819853355893965, + "grad_norm": 3.3036737895825485, + "learning_rate": 1.6534622034763556e-07, + "loss": 0.5955, + "step": 1817 + }, + { + "epoch": 0.8203045685279188, + "grad_norm": 3.246191316789231, + "learning_rate": 1.6454194238168318e-07, + "loss": 0.4955, + "step": 1818 + }, + { + "epoch": 0.8207557811618725, + "grad_norm": 3.7297330377785847, + "learning_rate": 1.6373944988133815e-07, + "loss": 0.6329, + "step": 1819 + }, + { + "epoch": 0.8212069937958263, + "grad_norm": 3.3710927359486176, + "learning_rate": 1.6293874456161516e-07, + "loss": 0.6353, + "step": 1820 + }, + { + "epoch": 0.82165820642978, + "grad_norm": 3.075825293080748, + "learning_rate": 1.621398281337093e-07, + "loss": 0.5383, + "step": 1821 + }, + { + "epoch": 0.8221094190637338, + "grad_norm": 3.0510999526952673, + "learning_rate": 1.6134270230499292e-07, + "loss": 0.4723, + "step": 1822 + }, + { + "epoch": 0.8225606316976876, + "grad_norm": 2.990800015781615, + "learning_rate": 1.6054736877901154e-07, + "loss": 0.5556, + "step": 1823 + }, + { + "epoch": 0.8230118443316413, + "grad_norm": 3.2691319322666086, + "learning_rate": 1.5975382925547965e-07, + "loss": 0.6269, + "step": 1824 + }, + { + "epoch": 0.8234630569655951, + "grad_norm": 3.36974210150047, + "learning_rate": 1.5896208543027911e-07, + "loss": 0.6239, + "step": 1825 + }, + { + "epoch": 0.8239142695995488, + "grad_norm": 3.2846656450026863, + "learning_rate": 1.5817213899545289e-07, + "loss": 0.5369, + "step": 1826 + }, + { + "epoch": 0.8243654822335026, + "grad_norm": 3.6310256851443357, + "learning_rate": 1.5738399163920356e-07, + "loss": 0.5704, + "step": 1827 + }, + { + "epoch": 0.8248166948674562, + "grad_norm": 2.8015522594684414, + "learning_rate": 1.5659764504588845e-07, + "loss": 0.5314, + "step": 1828 + }, + { + "epoch": 0.82526790750141, + "grad_norm": 2.9898520678955145, + "learning_rate": 1.558131008960163e-07, + "loss": 0.4779, + "step": 1829 + }, + { + "epoch": 0.8257191201353637, + "grad_norm": 3.1623967382624345, + "learning_rate": 1.5503036086624454e-07, + "loss": 0.5098, + "step": 1830 + }, + { + "epoch": 0.8261703327693175, + "grad_norm": 3.4072485811873543, + "learning_rate": 1.5424942662937434e-07, + "loss": 0.563, + "step": 1831 + }, + { + "epoch": 0.8266215454032713, + "grad_norm": 2.855993326323611, + "learning_rate": 1.5347029985434777e-07, + "loss": 0.4537, + "step": 1832 + }, + { + "epoch": 0.827072758037225, + "grad_norm": 3.4851239971474195, + "learning_rate": 1.5269298220624505e-07, + "loss": 0.6242, + "step": 1833 + }, + { + "epoch": 0.8275239706711788, + "grad_norm": 3.388860727365143, + "learning_rate": 1.5191747534627819e-07, + "loss": 0.5445, + "step": 1834 + }, + { + "epoch": 0.8279751833051325, + "grad_norm": 3.554115255182286, + "learning_rate": 1.5114378093179147e-07, + "loss": 0.5375, + "step": 1835 + }, + { + "epoch": 0.8284263959390863, + "grad_norm": 2.655545323566283, + "learning_rate": 1.5037190061625427e-07, + "loss": 0.3131, + "step": 1836 + }, + { + "epoch": 0.82887760857304, + "grad_norm": 3.3338895146881002, + "learning_rate": 1.4960183604925968e-07, + "loss": 0.5707, + "step": 1837 + }, + { + "epoch": 0.8293288212069938, + "grad_norm": 3.218344870415116, + "learning_rate": 1.4883358887652042e-07, + "loss": 0.5006, + "step": 1838 + }, + { + "epoch": 0.8297800338409476, + "grad_norm": 3.1468119802066203, + "learning_rate": 1.4806716073986504e-07, + "loss": 0.6662, + "step": 1839 + }, + { + "epoch": 0.8302312464749013, + "grad_norm": 3.5160680404028493, + "learning_rate": 1.4730255327723452e-07, + "loss": 0.5002, + "step": 1840 + }, + { + "epoch": 0.8306824591088551, + "grad_norm": 3.260575279604774, + "learning_rate": 1.4653976812267898e-07, + "loss": 0.5138, + "step": 1841 + }, + { + "epoch": 0.8311336717428088, + "grad_norm": 2.9649200815505456, + "learning_rate": 1.457788069063538e-07, + "loss": 0.5762, + "step": 1842 + }, + { + "epoch": 0.8315848843767626, + "grad_norm": 2.942782774050298, + "learning_rate": 1.4501967125451718e-07, + "loss": 0.5477, + "step": 1843 + }, + { + "epoch": 0.8320360970107163, + "grad_norm": 3.0094337078574127, + "learning_rate": 1.442623627895251e-07, + "loss": 0.4498, + "step": 1844 + }, + { + "epoch": 0.8324873096446701, + "grad_norm": 3.225704846247271, + "learning_rate": 1.4350688312982862e-07, + "loss": 0.6199, + "step": 1845 + }, + { + "epoch": 0.8329385222786237, + "grad_norm": 3.1240951847264076, + "learning_rate": 1.4275323388997117e-07, + "loss": 0.4944, + "step": 1846 + }, + { + "epoch": 0.8333897349125775, + "grad_norm": 3.5826594571402963, + "learning_rate": 1.4200141668058396e-07, + "loss": 0.6276, + "step": 1847 + }, + { + "epoch": 0.8338409475465313, + "grad_norm": 3.3094708533679102, + "learning_rate": 1.412514331083826e-07, + "loss": 0.6394, + "step": 1848 + }, + { + "epoch": 0.834292160180485, + "grad_norm": 3.5135736501243446, + "learning_rate": 1.4050328477616458e-07, + "loss": 0.5399, + "step": 1849 + }, + { + "epoch": 0.8347433728144388, + "grad_norm": 3.482173704832845, + "learning_rate": 1.3975697328280456e-07, + "loss": 0.4879, + "step": 1850 + }, + { + "epoch": 0.8351945854483925, + "grad_norm": 3.0662306539692543, + "learning_rate": 1.3901250022325283e-07, + "loss": 0.4534, + "step": 1851 + }, + { + "epoch": 0.8356457980823463, + "grad_norm": 3.0598921330176214, + "learning_rate": 1.382698671885295e-07, + "loss": 0.4628, + "step": 1852 + }, + { + "epoch": 0.8360970107163, + "grad_norm": 3.1520019214759456, + "learning_rate": 1.3752907576572347e-07, + "loss": 0.6246, + "step": 1853 + }, + { + "epoch": 0.8365482233502538, + "grad_norm": 2.6571093677090363, + "learning_rate": 1.3679012753798724e-07, + "loss": 0.4899, + "step": 1854 + }, + { + "epoch": 0.8369994359842076, + "grad_norm": 2.9836867986668523, + "learning_rate": 1.3605302408453356e-07, + "loss": 0.4217, + "step": 1855 + }, + { + "epoch": 0.8374506486181613, + "grad_norm": 2.898222008405317, + "learning_rate": 1.3531776698063436e-07, + "loss": 0.5292, + "step": 1856 + }, + { + "epoch": 0.8379018612521151, + "grad_norm": 2.950876249789325, + "learning_rate": 1.3458435779761425e-07, + "loss": 0.4231, + "step": 1857 + }, + { + "epoch": 0.8383530738860688, + "grad_norm": 3.174963854424885, + "learning_rate": 1.3385279810284956e-07, + "loss": 0.4779, + "step": 1858 + }, + { + "epoch": 0.8388042865200226, + "grad_norm": 3.135298443318726, + "learning_rate": 1.3312308945976347e-07, + "loss": 0.5428, + "step": 1859 + }, + { + "epoch": 0.8392554991539763, + "grad_norm": 3.5971103192719505, + "learning_rate": 1.3239523342782344e-07, + "loss": 0.5815, + "step": 1860 + }, + { + "epoch": 0.8397067117879301, + "grad_norm": 3.413779126654346, + "learning_rate": 1.3166923156253817e-07, + "loss": 0.5965, + "step": 1861 + }, + { + "epoch": 0.8401579244218839, + "grad_norm": 3.397541524879773, + "learning_rate": 1.309450854154528e-07, + "loss": 0.6834, + "step": 1862 + }, + { + "epoch": 0.8406091370558376, + "grad_norm": 3.328000762597347, + "learning_rate": 1.3022279653414725e-07, + "loss": 0.64, + "step": 1863 + }, + { + "epoch": 0.8410603496897913, + "grad_norm": 3.0084605090395353, + "learning_rate": 1.2950236646223244e-07, + "loss": 0.5704, + "step": 1864 + }, + { + "epoch": 0.841511562323745, + "grad_norm": 3.2459425568437053, + "learning_rate": 1.2878379673934615e-07, + "loss": 0.5634, + "step": 1865 + }, + { + "epoch": 0.8419627749576988, + "grad_norm": 2.974415189859473, + "learning_rate": 1.2806708890115137e-07, + "loss": 0.4767, + "step": 1866 + }, + { + "epoch": 0.8424139875916525, + "grad_norm": 3.3391852048885813, + "learning_rate": 1.2735224447933102e-07, + "loss": 0.528, + "step": 1867 + }, + { + "epoch": 0.8428652002256063, + "grad_norm": 3.263226862766161, + "learning_rate": 1.2663926500158618e-07, + "loss": 0.6521, + "step": 1868 + }, + { + "epoch": 0.84331641285956, + "grad_norm": 3.208837226211113, + "learning_rate": 1.2592815199163244e-07, + "loss": 0.4281, + "step": 1869 + }, + { + "epoch": 0.8437676254935138, + "grad_norm": 3.704953009121333, + "learning_rate": 1.2521890696919602e-07, + "loss": 0.6876, + "step": 1870 + }, + { + "epoch": 0.8442188381274676, + "grad_norm": 3.1149574368441075, + "learning_rate": 1.245115314500118e-07, + "loss": 0.5546, + "step": 1871 + }, + { + "epoch": 0.8446700507614213, + "grad_norm": 3.2673584564454083, + "learning_rate": 1.2380602694581888e-07, + "loss": 0.5626, + "step": 1872 + }, + { + "epoch": 0.8451212633953751, + "grad_norm": 3.6047315203131025, + "learning_rate": 1.2310239496435748e-07, + "loss": 0.6006, + "step": 1873 + }, + { + "epoch": 0.8455724760293288, + "grad_norm": 3.3754199818311004, + "learning_rate": 1.224006370093672e-07, + "loss": 0.5484, + "step": 1874 + }, + { + "epoch": 0.8460236886632826, + "grad_norm": 3.299566444490645, + "learning_rate": 1.2170075458058083e-07, + "loss": 0.5683, + "step": 1875 + }, + { + "epoch": 0.8464749012972363, + "grad_norm": 3.1617484748413154, + "learning_rate": 1.2100274917372477e-07, + "loss": 0.5741, + "step": 1876 + }, + { + "epoch": 0.8469261139311901, + "grad_norm": 3.2176898145338533, + "learning_rate": 1.203066222805129e-07, + "loss": 0.5392, + "step": 1877 + }, + { + "epoch": 0.8473773265651439, + "grad_norm": 2.6695851978983662, + "learning_rate": 1.1961237538864467e-07, + "loss": 0.586, + "step": 1878 + }, + { + "epoch": 0.8478285391990976, + "grad_norm": 3.2205550742995057, + "learning_rate": 1.189200099818024e-07, + "loss": 0.5225, + "step": 1879 + }, + { + "epoch": 0.8482797518330514, + "grad_norm": 3.1361790423595144, + "learning_rate": 1.1822952753964666e-07, + "loss": 0.4771, + "step": 1880 + }, + { + "epoch": 0.848730964467005, + "grad_norm": 2.9693045075564726, + "learning_rate": 1.1754092953781425e-07, + "loss": 0.582, + "step": 1881 + }, + { + "epoch": 0.8491821771009588, + "grad_norm": 3.010501184967778, + "learning_rate": 1.168542174479148e-07, + "loss": 0.4967, + "step": 1882 + }, + { + "epoch": 0.8496333897349125, + "grad_norm": 2.9444396444036416, + "learning_rate": 1.1616939273752713e-07, + "loss": 0.597, + "step": 1883 + }, + { + "epoch": 0.8500846023688663, + "grad_norm": 3.1244893204773887, + "learning_rate": 1.1548645687019742e-07, + "loss": 0.476, + "step": 1884 + }, + { + "epoch": 0.85053581500282, + "grad_norm": 2.895489653963696, + "learning_rate": 1.1480541130543431e-07, + "loss": 0.4374, + "step": 1885 + }, + { + "epoch": 0.8509870276367738, + "grad_norm": 3.6679522394726902, + "learning_rate": 1.1412625749870675e-07, + "loss": 0.6452, + "step": 1886 + }, + { + "epoch": 0.8514382402707276, + "grad_norm": 3.4012095936653255, + "learning_rate": 1.1344899690144138e-07, + "loss": 0.6069, + "step": 1887 + }, + { + "epoch": 0.8518894529046813, + "grad_norm": 3.4427078769517427, + "learning_rate": 1.1277363096101833e-07, + "loss": 0.4773, + "step": 1888 + }, + { + "epoch": 0.8523406655386351, + "grad_norm": 2.8891427428108485, + "learning_rate": 1.1210016112076869e-07, + "loss": 0.5172, + "step": 1889 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 3.1882213169736326, + "learning_rate": 1.1142858881997153e-07, + "loss": 0.6214, + "step": 1890 + }, + { + "epoch": 0.8532430908065426, + "grad_norm": 3.2631974785916578, + "learning_rate": 1.107589154938503e-07, + "loss": 0.3816, + "step": 1891 + }, + { + "epoch": 0.8536943034404963, + "grad_norm": 3.646300547484839, + "learning_rate": 1.10091142573571e-07, + "loss": 0.5078, + "step": 1892 + }, + { + "epoch": 0.8541455160744501, + "grad_norm": 3.0800229323313446, + "learning_rate": 1.0942527148623736e-07, + "loss": 0.5741, + "step": 1893 + }, + { + "epoch": 0.8545967287084039, + "grad_norm": 3.4437111992470695, + "learning_rate": 1.0876130365488878e-07, + "loss": 0.4977, + "step": 1894 + }, + { + "epoch": 0.8550479413423576, + "grad_norm": 3.415687476852208, + "learning_rate": 1.0809924049849816e-07, + "loss": 0.5176, + "step": 1895 + }, + { + "epoch": 0.8554991539763114, + "grad_norm": 3.423266820929565, + "learning_rate": 1.0743908343196629e-07, + "loss": 0.5477, + "step": 1896 + }, + { + "epoch": 0.8559503666102651, + "grad_norm": 3.319978860721829, + "learning_rate": 1.067808338661219e-07, + "loss": 0.6, + "step": 1897 + }, + { + "epoch": 0.8564015792442189, + "grad_norm": 3.2937834521614766, + "learning_rate": 1.0612449320771644e-07, + "loss": 0.6547, + "step": 1898 + }, + { + "epoch": 0.8568527918781725, + "grad_norm": 2.7650015075242136, + "learning_rate": 1.0547006285942162e-07, + "loss": 0.4555, + "step": 1899 + }, + { + "epoch": 0.8573040045121263, + "grad_norm": 3.2864394879320495, + "learning_rate": 1.0481754421982758e-07, + "loss": 0.5315, + "step": 1900 + }, + { + "epoch": 0.85775521714608, + "grad_norm": 3.359413365564091, + "learning_rate": 1.0416693868343795e-07, + "loss": 0.4423, + "step": 1901 + }, + { + "epoch": 0.8582064297800338, + "grad_norm": 2.9927056449757714, + "learning_rate": 1.0351824764066819e-07, + "loss": 0.5731, + "step": 1902 + }, + { + "epoch": 0.8586576424139876, + "grad_norm": 3.61532226087724, + "learning_rate": 1.0287147247784244e-07, + "loss": 0.6, + "step": 1903 + }, + { + "epoch": 0.8591088550479413, + "grad_norm": 2.9636046590677902, + "learning_rate": 1.0222661457718985e-07, + "loss": 0.683, + "step": 1904 + }, + { + "epoch": 0.8595600676818951, + "grad_norm": 3.2418007105981337, + "learning_rate": 1.015836753168431e-07, + "loss": 0.5704, + "step": 1905 + }, + { + "epoch": 0.8600112803158488, + "grad_norm": 3.270787762505259, + "learning_rate": 1.0094265607083374e-07, + "loss": 0.5531, + "step": 1906 + }, + { + "epoch": 0.8604624929498026, + "grad_norm": 3.4241238059817753, + "learning_rate": 1.0030355820908997e-07, + "loss": 0.4724, + "step": 1907 + }, + { + "epoch": 0.8609137055837564, + "grad_norm": 3.120442346322047, + "learning_rate": 9.966638309743481e-08, + "loss": 0.5969, + "step": 1908 + }, + { + "epoch": 0.8613649182177101, + "grad_norm": 2.8700401112362983, + "learning_rate": 9.903113209758096e-08, + "loss": 0.639, + "step": 1909 + }, + { + "epoch": 0.8618161308516639, + "grad_norm": 2.90952553707339, + "learning_rate": 9.839780656712959e-08, + "loss": 0.4335, + "step": 1910 + }, + { + "epoch": 0.8622673434856176, + "grad_norm": 3.15352399159264, + "learning_rate": 9.776640785956702e-08, + "loss": 0.602, + "step": 1911 + }, + { + "epoch": 0.8627185561195714, + "grad_norm": 3.558953242347675, + "learning_rate": 9.713693732426131e-08, + "loss": 0.5247, + "step": 1912 + }, + { + "epoch": 0.8631697687535251, + "grad_norm": 3.058394803478645, + "learning_rate": 9.65093963064606e-08, + "loss": 0.5442, + "step": 1913 + }, + { + "epoch": 0.8636209813874789, + "grad_norm": 3.0343736318962646, + "learning_rate": 9.588378614728864e-08, + "loss": 0.4304, + "step": 1914 + }, + { + "epoch": 0.8640721940214326, + "grad_norm": 3.103162455870594, + "learning_rate": 9.526010818374309e-08, + "loss": 0.6602, + "step": 1915 + }, + { + "epoch": 0.8645234066553864, + "grad_norm": 3.345412268134743, + "learning_rate": 9.46383637486925e-08, + "loss": 0.5215, + "step": 1916 + }, + { + "epoch": 0.86497461928934, + "grad_norm": 2.950588770881482, + "learning_rate": 9.401855417087234e-08, + "loss": 0.432, + "step": 1917 + }, + { + "epoch": 0.8654258319232938, + "grad_norm": 2.9028721420815438, + "learning_rate": 9.34006807748845e-08, + "loss": 0.5536, + "step": 1918 + }, + { + "epoch": 0.8658770445572476, + "grad_norm": 3.422382660689821, + "learning_rate": 9.278474488119182e-08, + "loss": 0.5321, + "step": 1919 + }, + { + "epoch": 0.8663282571912013, + "grad_norm": 3.052528705427497, + "learning_rate": 9.217074780611688e-08, + "loss": 0.5474, + "step": 1920 + }, + { + "epoch": 0.8667794698251551, + "grad_norm": 3.1194719596998484, + "learning_rate": 9.155869086183921e-08, + "loss": 0.5148, + "step": 1921 + }, + { + "epoch": 0.8672306824591088, + "grad_norm": 2.9070890387145494, + "learning_rate": 9.094857535639156e-08, + "loss": 0.4189, + "step": 1922 + }, + { + "epoch": 0.8676818950930626, + "grad_norm": 3.177762649776062, + "learning_rate": 9.03404025936576e-08, + "loss": 0.4111, + "step": 1923 + }, + { + "epoch": 0.8681331077270164, + "grad_norm": 3.4014028718324885, + "learning_rate": 8.973417387336946e-08, + "loss": 0.5615, + "step": 1924 + }, + { + "epoch": 0.8685843203609701, + "grad_norm": 3.314370098733984, + "learning_rate": 8.91298904911043e-08, + "loss": 0.5898, + "step": 1925 + }, + { + "epoch": 0.8690355329949239, + "grad_norm": 3.2243090136724772, + "learning_rate": 8.852755373828235e-08, + "loss": 0.5159, + "step": 1926 + }, + { + "epoch": 0.8694867456288776, + "grad_norm": 3.1990156923608506, + "learning_rate": 8.792716490216335e-08, + "loss": 0.5222, + "step": 1927 + }, + { + "epoch": 0.8699379582628314, + "grad_norm": 3.0312256180294996, + "learning_rate": 8.732872526584379e-08, + "loss": 0.7078, + "step": 1928 + }, + { + "epoch": 0.8703891708967851, + "grad_norm": 3.0987202755918983, + "learning_rate": 8.67322361082553e-08, + "loss": 0.4882, + "step": 1929 + }, + { + "epoch": 0.8708403835307389, + "grad_norm": 2.7196240122489446, + "learning_rate": 8.613769870416066e-08, + "loss": 0.5035, + "step": 1930 + }, + { + "epoch": 0.8712915961646926, + "grad_norm": 2.9325956476707224, + "learning_rate": 8.554511432415145e-08, + "loss": 0.4407, + "step": 1931 + }, + { + "epoch": 0.8717428087986464, + "grad_norm": 3.563913043226757, + "learning_rate": 8.495448423464568e-08, + "loss": 0.6164, + "step": 1932 + }, + { + "epoch": 0.8721940214326002, + "grad_norm": 3.0026341334651323, + "learning_rate": 8.436580969788431e-08, + "loss": 0.4445, + "step": 1933 + }, + { + "epoch": 0.8726452340665538, + "grad_norm": 3.2398204365530656, + "learning_rate": 8.377909197193011e-08, + "loss": 0.5261, + "step": 1934 + }, + { + "epoch": 0.8730964467005076, + "grad_norm": 3.552891767506158, + "learning_rate": 8.319433231066264e-08, + "loss": 0.683, + "step": 1935 + }, + { + "epoch": 0.8735476593344613, + "grad_norm": 2.857576906676008, + "learning_rate": 8.261153196377813e-08, + "loss": 0.4816, + "step": 1936 + }, + { + "epoch": 0.8739988719684151, + "grad_norm": 3.172569622874956, + "learning_rate": 8.20306921767847e-08, + "loss": 0.4672, + "step": 1937 + }, + { + "epoch": 0.8744500846023688, + "grad_norm": 3.515851107453122, + "learning_rate": 8.145181419100034e-08, + "loss": 0.6234, + "step": 1938 + }, + { + "epoch": 0.8749012972363226, + "grad_norm": 3.439159617356599, + "learning_rate": 8.08748992435514e-08, + "loss": 0.5984, + "step": 1939 + }, + { + "epoch": 0.8753525098702764, + "grad_norm": 3.3171348098631857, + "learning_rate": 8.02999485673681e-08, + "loss": 0.5059, + "step": 1940 + }, + { + "epoch": 0.8758037225042301, + "grad_norm": 3.008859919440221, + "learning_rate": 7.972696339118346e-08, + "loss": 0.5124, + "step": 1941 + }, + { + "epoch": 0.8762549351381839, + "grad_norm": 3.3407440096371626, + "learning_rate": 7.91559449395296e-08, + "loss": 0.5261, + "step": 1942 + }, + { + "epoch": 0.8767061477721376, + "grad_norm": 3.2680192388068985, + "learning_rate": 7.858689443273547e-08, + "loss": 0.4824, + "step": 1943 + }, + { + "epoch": 0.8771573604060914, + "grad_norm": 3.2302885233475247, + "learning_rate": 7.801981308692507e-08, + "loss": 0.5301, + "step": 1944 + }, + { + "epoch": 0.8776085730400451, + "grad_norm": 3.3144955548693846, + "learning_rate": 7.745470211401273e-08, + "loss": 0.5292, + "step": 1945 + }, + { + "epoch": 0.8780597856739989, + "grad_norm": 3.6338069876593972, + "learning_rate": 7.689156272170316e-08, + "loss": 0.6645, + "step": 1946 + }, + { + "epoch": 0.8785109983079527, + "grad_norm": 3.320705078381082, + "learning_rate": 7.633039611348701e-08, + "loss": 0.5731, + "step": 1947 + }, + { + "epoch": 0.8789622109419064, + "grad_norm": 3.400427088861012, + "learning_rate": 7.577120348863864e-08, + "loss": 0.5424, + "step": 1948 + }, + { + "epoch": 0.8794134235758602, + "grad_norm": 2.952134565078521, + "learning_rate": 7.521398604221451e-08, + "loss": 0.4211, + "step": 1949 + }, + { + "epoch": 0.8798646362098139, + "grad_norm": 3.1223077524770804, + "learning_rate": 7.465874496504943e-08, + "loss": 0.4525, + "step": 1950 + }, + { + "epoch": 0.8803158488437677, + "grad_norm": 3.3651190746871213, + "learning_rate": 7.410548144375417e-08, + "loss": 0.5718, + "step": 1951 + }, + { + "epoch": 0.8807670614777213, + "grad_norm": 2.8278868757405995, + "learning_rate": 7.355419666071406e-08, + "loss": 0.5105, + "step": 1952 + }, + { + "epoch": 0.8812182741116751, + "grad_norm": 3.3691970534959346, + "learning_rate": 7.300489179408476e-08, + "loss": 0.4373, + "step": 1953 + }, + { + "epoch": 0.8816694867456288, + "grad_norm": 3.1396491136987374, + "learning_rate": 7.245756801779158e-08, + "loss": 0.5241, + "step": 1954 + }, + { + "epoch": 0.8821206993795826, + "grad_norm": 2.9990018272726338, + "learning_rate": 7.191222650152528e-08, + "loss": 0.5162, + "step": 1955 + }, + { + "epoch": 0.8825719120135364, + "grad_norm": 2.872850179093246, + "learning_rate": 7.136886841074052e-08, + "loss": 0.4716, + "step": 1956 + }, + { + "epoch": 0.8830231246474901, + "grad_norm": 3.7713414016675255, + "learning_rate": 7.082749490665351e-08, + "loss": 0.6061, + "step": 1957 + }, + { + "epoch": 0.8834743372814439, + "grad_norm": 3.476662997099093, + "learning_rate": 7.028810714623846e-08, + "loss": 0.4683, + "step": 1958 + }, + { + "epoch": 0.8839255499153976, + "grad_norm": 3.2389720481182764, + "learning_rate": 6.975070628222646e-08, + "loss": 0.5448, + "step": 1959 + }, + { + "epoch": 0.8843767625493514, + "grad_norm": 3.1694385457983127, + "learning_rate": 6.921529346310218e-08, + "loss": 0.5136, + "step": 1960 + }, + { + "epoch": 0.8848279751833051, + "grad_norm": 3.6314284620989787, + "learning_rate": 6.868186983310131e-08, + "loss": 0.4929, + "step": 1961 + }, + { + "epoch": 0.8852791878172589, + "grad_norm": 3.123575388660666, + "learning_rate": 6.81504365322092e-08, + "loss": 0.5635, + "step": 1962 + }, + { + "epoch": 0.8857304004512127, + "grad_norm": 2.6321261997764807, + "learning_rate": 6.76209946961569e-08, + "loss": 0.4772, + "step": 1963 + }, + { + "epoch": 0.8861816130851664, + "grad_norm": 3.210961705882933, + "learning_rate": 6.709354545641987e-08, + "loss": 0.5765, + "step": 1964 + }, + { + "epoch": 0.8866328257191202, + "grad_norm": 3.102505340660919, + "learning_rate": 6.65680899402149e-08, + "loss": 0.6185, + "step": 1965 + }, + { + "epoch": 0.8870840383530739, + "grad_norm": 3.318813163094756, + "learning_rate": 6.604462927049804e-08, + "loss": 0.5496, + "step": 1966 + }, + { + "epoch": 0.8875352509870277, + "grad_norm": 3.693495485179525, + "learning_rate": 6.552316456596252e-08, + "loss": 0.6225, + "step": 1967 + }, + { + "epoch": 0.8879864636209814, + "grad_norm": 2.7909920602545957, + "learning_rate": 6.500369694103558e-08, + "loss": 0.4657, + "step": 1968 + }, + { + "epoch": 0.8884376762549352, + "grad_norm": 3.8268919463461315, + "learning_rate": 6.44862275058763e-08, + "loss": 0.5475, + "step": 1969 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.8955466658153273, + "learning_rate": 6.397075736637403e-08, + "loss": 0.4761, + "step": 1970 + }, + { + "epoch": 0.8893401015228426, + "grad_norm": 3.0393342411424054, + "learning_rate": 6.345728762414503e-08, + "loss": 0.5003, + "step": 1971 + }, + { + "epoch": 0.8897913141567964, + "grad_norm": 3.0480382524304677, + "learning_rate": 6.294581937653042e-08, + "loss": 0.5091, + "step": 1972 + }, + { + "epoch": 0.8902425267907501, + "grad_norm": 3.2472044245989027, + "learning_rate": 6.243635371659395e-08, + "loss": 0.5822, + "step": 1973 + }, + { + "epoch": 0.8906937394247039, + "grad_norm": 2.9373896773076926, + "learning_rate": 6.192889173311966e-08, + "loss": 0.5317, + "step": 1974 + }, + { + "epoch": 0.8911449520586576, + "grad_norm": 3.4088834880295034, + "learning_rate": 6.142343451060972e-08, + "loss": 0.5696, + "step": 1975 + }, + { + "epoch": 0.8915961646926114, + "grad_norm": 3.2061069267520645, + "learning_rate": 6.091998312928171e-08, + "loss": 0.5439, + "step": 1976 + }, + { + "epoch": 0.8920473773265651, + "grad_norm": 2.9538556019221733, + "learning_rate": 6.04185386650662e-08, + "loss": 0.4199, + "step": 1977 + }, + { + "epoch": 0.8924985899605189, + "grad_norm": 3.3347490781585125, + "learning_rate": 5.99191021896055e-08, + "loss": 0.585, + "step": 1978 + }, + { + "epoch": 0.8929498025944727, + "grad_norm": 3.252069083231302, + "learning_rate": 5.9421674770249844e-08, + "loss": 0.5721, + "step": 1979 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 3.373090912932044, + "learning_rate": 5.8926257470056415e-08, + "loss": 0.6677, + "step": 1980 + }, + { + "epoch": 0.8938522278623802, + "grad_norm": 3.470419272509584, + "learning_rate": 5.8432851347786414e-08, + "loss": 0.6336, + "step": 1981 + }, + { + "epoch": 0.8943034404963339, + "grad_norm": 2.954923563225483, + "learning_rate": 5.794145745790269e-08, + "loss": 0.5161, + "step": 1982 + }, + { + "epoch": 0.8947546531302877, + "grad_norm": 2.9240499120629373, + "learning_rate": 5.7452076850568186e-08, + "loss": 0.5764, + "step": 1983 + }, + { + "epoch": 0.8952058657642414, + "grad_norm": 3.163381313365714, + "learning_rate": 5.696471057164298e-08, + "loss": 0.4723, + "step": 1984 + }, + { + "epoch": 0.8956570783981952, + "grad_norm": 3.001631089880879, + "learning_rate": 5.6479359662682246e-08, + "loss": 0.4736, + "step": 1985 + }, + { + "epoch": 0.896108291032149, + "grad_norm": 3.1077853978436414, + "learning_rate": 5.599602516093427e-08, + "loss": 0.4647, + "step": 1986 + }, + { + "epoch": 0.8965595036661026, + "grad_norm": 2.9246040215255658, + "learning_rate": 5.551470809933756e-08, + "loss": 0.4681, + "step": 1987 + }, + { + "epoch": 0.8970107163000564, + "grad_norm": 3.4387470409961223, + "learning_rate": 5.503540950652008e-08, + "loss": 0.507, + "step": 1988 + }, + { + "epoch": 0.8974619289340101, + "grad_norm": 2.6230689040197364, + "learning_rate": 5.4558130406795355e-08, + "loss": 0.4589, + "step": 1989 + }, + { + "epoch": 0.8979131415679639, + "grad_norm": 3.388098608880633, + "learning_rate": 5.408287182016091e-08, + "loss": 0.4525, + "step": 1990 + }, + { + "epoch": 0.8983643542019176, + "grad_norm": 3.119545140896645, + "learning_rate": 5.360963476229707e-08, + "loss": 0.5486, + "step": 1991 + }, + { + "epoch": 0.8988155668358714, + "grad_norm": 3.1519502919215694, + "learning_rate": 5.313842024456305e-08, + "loss": 0.4839, + "step": 1992 + }, + { + "epoch": 0.8992667794698251, + "grad_norm": 3.7651904706666395, + "learning_rate": 5.2669229273996084e-08, + "loss": 0.6181, + "step": 1993 + }, + { + "epoch": 0.8997179921037789, + "grad_norm": 3.058263724338039, + "learning_rate": 5.220206285330886e-08, + "loss": 0.5597, + "step": 1994 + }, + { + "epoch": 0.9001692047377327, + "grad_norm": 3.0934748276186337, + "learning_rate": 5.173692198088708e-08, + "loss": 0.4815, + "step": 1995 + }, + { + "epoch": 0.9006204173716864, + "grad_norm": 3.968781722790635, + "learning_rate": 5.1273807650788146e-08, + "loss": 0.5632, + "step": 1996 + }, + { + "epoch": 0.9010716300056402, + "grad_norm": 3.131751903358897, + "learning_rate": 5.081272085273825e-08, + "loss": 0.4128, + "step": 1997 + }, + { + "epoch": 0.9015228426395939, + "grad_norm": 2.9193006683184275, + "learning_rate": 5.035366257213014e-08, + "loss": 0.4801, + "step": 1998 + }, + { + "epoch": 0.9019740552735477, + "grad_norm": 3.2693871968854875, + "learning_rate": 4.98966337900224e-08, + "loss": 0.5536, + "step": 1999 + }, + { + "epoch": 0.9024252679075014, + "grad_norm": 3.2813024613985227, + "learning_rate": 4.944163548313496e-08, + "loss": 0.6593, + "step": 2000 + }, + { + "epoch": 0.9028764805414552, + "grad_norm": 3.369192254537855, + "learning_rate": 4.898866862384976e-08, + "loss": 0.4618, + "step": 2001 + }, + { + "epoch": 0.903327693175409, + "grad_norm": 3.043012997543813, + "learning_rate": 4.853773418020646e-08, + "loss": 0.4676, + "step": 2002 + }, + { + "epoch": 0.9037789058093627, + "grad_norm": 3.434166016020396, + "learning_rate": 4.8088833115901395e-08, + "loss": 0.3948, + "step": 2003 + }, + { + "epoch": 0.9042301184433165, + "grad_norm": 2.7646887236809854, + "learning_rate": 4.764196639028572e-08, + "loss": 0.4774, + "step": 2004 + }, + { + "epoch": 0.9046813310772701, + "grad_norm": 3.3361977687273927, + "learning_rate": 4.719713495836242e-08, + "loss": 0.5685, + "step": 2005 + }, + { + "epoch": 0.9051325437112239, + "grad_norm": 3.4831449664643257, + "learning_rate": 4.6754339770785465e-08, + "loss": 0.5089, + "step": 2006 + }, + { + "epoch": 0.9055837563451776, + "grad_norm": 3.4671397319854522, + "learning_rate": 4.631358177385647e-08, + "loss": 0.5439, + "step": 2007 + }, + { + "epoch": 0.9060349689791314, + "grad_norm": 3.2933483172414335, + "learning_rate": 4.58748619095235e-08, + "loss": 0.5683, + "step": 2008 + }, + { + "epoch": 0.9064861816130851, + "grad_norm": 2.985093722715634, + "learning_rate": 4.543818111537956e-08, + "loss": 0.5344, + "step": 2009 + }, + { + "epoch": 0.9069373942470389, + "grad_norm": 3.4328919623699674, + "learning_rate": 4.500354032465925e-08, + "loss": 0.4476, + "step": 2010 + }, + { + "epoch": 0.9073886068809927, + "grad_norm": 3.304919883947093, + "learning_rate": 4.457094046623755e-08, + "loss": 0.6034, + "step": 2011 + }, + { + "epoch": 0.9078398195149464, + "grad_norm": 2.8099205651955206, + "learning_rate": 4.414038246462803e-08, + "loss": 0.4899, + "step": 2012 + }, + { + "epoch": 0.9082910321489002, + "grad_norm": 3.3739143855436535, + "learning_rate": 4.3711867239980324e-08, + "loss": 0.6359, + "step": 2013 + }, + { + "epoch": 0.9087422447828539, + "grad_norm": 3.189618086168555, + "learning_rate": 4.3285395708078546e-08, + "loss": 0.5732, + "step": 2014 + }, + { + "epoch": 0.9091934574168077, + "grad_norm": 3.367461634350275, + "learning_rate": 4.286096878033929e-08, + "loss": 0.4531, + "step": 2015 + }, + { + "epoch": 0.9096446700507614, + "grad_norm": 3.6483005516482767, + "learning_rate": 4.243858736380912e-08, + "loss": 0.6179, + "step": 2016 + }, + { + "epoch": 0.9100958826847152, + "grad_norm": 3.415485122808768, + "learning_rate": 4.2018252361164076e-08, + "loss": 0.5615, + "step": 2017 + }, + { + "epoch": 0.910547095318669, + "grad_norm": 2.741772493137757, + "learning_rate": 4.15999646707057e-08, + "loss": 0.4732, + "step": 2018 + }, + { + "epoch": 0.9109983079526227, + "grad_norm": 3.0538104199950307, + "learning_rate": 4.118372518636104e-08, + "loss": 0.5523, + "step": 2019 + }, + { + "epoch": 0.9114495205865765, + "grad_norm": 3.0764370937470704, + "learning_rate": 4.076953479767964e-08, + "loss": 0.5979, + "step": 2020 + }, + { + "epoch": 0.9119007332205302, + "grad_norm": 3.066765697208492, + "learning_rate": 4.035739438983143e-08, + "loss": 0.4439, + "step": 2021 + }, + { + "epoch": 0.9123519458544839, + "grad_norm": 3.668533201777122, + "learning_rate": 3.994730484360609e-08, + "loss": 0.572, + "step": 2022 + }, + { + "epoch": 0.9128031584884376, + "grad_norm": 3.401968524293534, + "learning_rate": 3.953926703540977e-08, + "loss": 0.5584, + "step": 2023 + }, + { + "epoch": 0.9132543711223914, + "grad_norm": 3.4337981912223174, + "learning_rate": 3.9133281837264385e-08, + "loss": 0.513, + "step": 2024 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 4.049276576839161, + "learning_rate": 3.872935011680456e-08, + "loss": 0.5475, + "step": 2025 + }, + { + "epoch": 0.9141567963902989, + "grad_norm": 3.227324543340619, + "learning_rate": 3.832747273727699e-08, + "loss": 0.4974, + "step": 2026 + }, + { + "epoch": 0.9146080090242527, + "grad_norm": 3.700152680818643, + "learning_rate": 3.792765055753755e-08, + "loss": 0.6376, + "step": 2027 + }, + { + "epoch": 0.9150592216582064, + "grad_norm": 3.134825348605703, + "learning_rate": 3.7529884432050074e-08, + "loss": 0.5864, + "step": 2028 + }, + { + "epoch": 0.9155104342921602, + "grad_norm": 3.287365982894785, + "learning_rate": 3.71341752108848e-08, + "loss": 0.5592, + "step": 2029 + }, + { + "epoch": 0.9159616469261139, + "grad_norm": 3.3656815978558408, + "learning_rate": 3.674052373971559e-08, + "loss": 0.5296, + "step": 2030 + }, + { + "epoch": 0.9164128595600677, + "grad_norm": 3.08635532381097, + "learning_rate": 3.634893085981872e-08, + "loss": 0.4835, + "step": 2031 + }, + { + "epoch": 0.9168640721940214, + "grad_norm": 3.120263533445725, + "learning_rate": 3.595939740807141e-08, + "loss": 0.5167, + "step": 2032 + }, + { + "epoch": 0.9173152848279752, + "grad_norm": 2.9603539149063374, + "learning_rate": 3.557192421694932e-08, + "loss": 0.5019, + "step": 2033 + }, + { + "epoch": 0.917766497461929, + "grad_norm": 3.300594188794194, + "learning_rate": 3.518651211452528e-08, + "loss": 0.6458, + "step": 2034 + }, + { + "epoch": 0.9182177100958827, + "grad_norm": 3.122902491830948, + "learning_rate": 3.4803161924467196e-08, + "loss": 0.5104, + "step": 2035 + }, + { + "epoch": 0.9186689227298365, + "grad_norm": 3.4528349816512103, + "learning_rate": 3.4421874466036285e-08, + "loss": 0.5535, + "step": 2036 + }, + { + "epoch": 0.9191201353637902, + "grad_norm": 3.3890580466360913, + "learning_rate": 3.404265055408617e-08, + "loss": 0.4116, + "step": 2037 + }, + { + "epoch": 0.919571347997744, + "grad_norm": 3.2546448072566645, + "learning_rate": 3.36654909990598e-08, + "loss": 0.5527, + "step": 2038 + }, + { + "epoch": 0.9200225606316977, + "grad_norm": 2.6288215794522314, + "learning_rate": 3.3290396606988405e-08, + "loss": 0.4729, + "step": 2039 + }, + { + "epoch": 0.9204737732656514, + "grad_norm": 3.0588000528716344, + "learning_rate": 3.29173681794902e-08, + "loss": 0.5862, + "step": 2040 + }, + { + "epoch": 0.9209249858996051, + "grad_norm": 3.3085745174099244, + "learning_rate": 3.25464065137675e-08, + "loss": 0.4663, + "step": 2041 + }, + { + "epoch": 0.9213761985335589, + "grad_norm": 3.3049910550091286, + "learning_rate": 3.217751240260647e-08, + "loss": 0.5277, + "step": 2042 + }, + { + "epoch": 0.9218274111675127, + "grad_norm": 3.1470391275241405, + "learning_rate": 3.1810686634374253e-08, + "loss": 0.5739, + "step": 2043 + }, + { + "epoch": 0.9222786238014664, + "grad_norm": 2.8926904316169897, + "learning_rate": 3.144592999301754e-08, + "loss": 0.5079, + "step": 2044 + }, + { + "epoch": 0.9227298364354202, + "grad_norm": 3.3359025166747753, + "learning_rate": 3.1083243258061666e-08, + "loss": 0.6565, + "step": 2045 + }, + { + "epoch": 0.9231810490693739, + "grad_norm": 3.2267530813326193, + "learning_rate": 3.072262720460783e-08, + "loss": 0.511, + "step": 2046 + }, + { + "epoch": 0.9236322617033277, + "grad_norm": 3.172137007655229, + "learning_rate": 3.036408260333223e-08, + "loss": 0.5653, + "step": 2047 + }, + { + "epoch": 0.9240834743372814, + "grad_norm": 2.9652257724423183, + "learning_rate": 3.000761022048393e-08, + "loss": 0.5494, + "step": 2048 + }, + { + "epoch": 0.9245346869712352, + "grad_norm": 3.162831602182436, + "learning_rate": 2.9653210817883634e-08, + "loss": 0.5811, + "step": 2049 + }, + { + "epoch": 0.924985899605189, + "grad_norm": 3.3548734459357195, + "learning_rate": 2.930088515292173e-08, + "loss": 0.4703, + "step": 2050 + }, + { + "epoch": 0.9254371122391427, + "grad_norm": 2.971253690703245, + "learning_rate": 2.8950633978556906e-08, + "loss": 0.6375, + "step": 2051 + }, + { + "epoch": 0.9258883248730965, + "grad_norm": 3.282908277488212, + "learning_rate": 2.860245804331429e-08, + "loss": 0.5976, + "step": 2052 + }, + { + "epoch": 0.9263395375070502, + "grad_norm": 3.4398673398293833, + "learning_rate": 2.8256358091284238e-08, + "loss": 0.5767, + "step": 2053 + }, + { + "epoch": 0.926790750141004, + "grad_norm": 3.0750885972506654, + "learning_rate": 2.79123348621203e-08, + "loss": 0.492, + "step": 2054 + }, + { + "epoch": 0.9272419627749577, + "grad_norm": 3.295821068137443, + "learning_rate": 2.7570389091037926e-08, + "loss": 0.5278, + "step": 2055 + }, + { + "epoch": 0.9276931754089115, + "grad_norm": 3.54320248311843, + "learning_rate": 2.7230521508812553e-08, + "loss": 0.5603, + "step": 2056 + }, + { + "epoch": 0.9281443880428653, + "grad_norm": 3.6284081250605063, + "learning_rate": 2.689273284177873e-08, + "loss": 0.6389, + "step": 2057 + }, + { + "epoch": 0.9285956006768189, + "grad_norm": 3.136805769994715, + "learning_rate": 2.6557023811827894e-08, + "loss": 0.5818, + "step": 2058 + }, + { + "epoch": 0.9290468133107727, + "grad_norm": 2.943590397644658, + "learning_rate": 2.6223395136407145e-08, + "loss": 0.5318, + "step": 2059 + }, + { + "epoch": 0.9294980259447264, + "grad_norm": 3.126539259551608, + "learning_rate": 2.5891847528517476e-08, + "loss": 0.4908, + "step": 2060 + }, + { + "epoch": 0.9299492385786802, + "grad_norm": 3.2669493617048198, + "learning_rate": 2.5562381696712654e-08, + "loss": 0.5458, + "step": 2061 + }, + { + "epoch": 0.9304004512126339, + "grad_norm": 3.0754859463953803, + "learning_rate": 2.5234998345097237e-08, + "loss": 0.4965, + "step": 2062 + }, + { + "epoch": 0.9308516638465877, + "grad_norm": 3.6839611131375336, + "learning_rate": 2.4909698173325443e-08, + "loss": 0.5531, + "step": 2063 + }, + { + "epoch": 0.9313028764805414, + "grad_norm": 3.22462453822676, + "learning_rate": 2.458648187659962e-08, + "loss": 0.4566, + "step": 2064 + }, + { + "epoch": 0.9317540891144952, + "grad_norm": 3.4060647980598, + "learning_rate": 2.4265350145668106e-08, + "loss": 0.7031, + "step": 2065 + }, + { + "epoch": 0.932205301748449, + "grad_norm": 3.089929652023956, + "learning_rate": 2.394630366682493e-08, + "loss": 0.4991, + "step": 2066 + }, + { + "epoch": 0.9326565143824027, + "grad_norm": 3.0972124530500427, + "learning_rate": 2.3629343121907562e-08, + "loss": 0.5814, + "step": 2067 + }, + { + "epoch": 0.9331077270163565, + "grad_norm": 2.956122127539399, + "learning_rate": 2.3314469188295272e-08, + "loss": 0.447, + "step": 2068 + }, + { + "epoch": 0.9335589396503102, + "grad_norm": 3.4655230691165775, + "learning_rate": 2.300168253890833e-08, + "loss": 0.5864, + "step": 2069 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 3.2759006855749764, + "learning_rate": 2.2690983842205914e-08, + "loss": 0.5745, + "step": 2070 + }, + { + "epoch": 0.9344613649182177, + "grad_norm": 3.0615174312777014, + "learning_rate": 2.2382373762185658e-08, + "loss": 0.4796, + "step": 2071 + }, + { + "epoch": 0.9349125775521715, + "grad_norm": 3.432283728728365, + "learning_rate": 2.207585295838099e-08, + "loss": 0.6578, + "step": 2072 + }, + { + "epoch": 0.9353637901861253, + "grad_norm": 3.3123138166785275, + "learning_rate": 2.177142208586047e-08, + "loss": 0.5887, + "step": 2073 + }, + { + "epoch": 0.935815002820079, + "grad_norm": 3.3385012625964143, + "learning_rate": 2.146908179522644e-08, + "loss": 0.5244, + "step": 2074 + }, + { + "epoch": 0.9362662154540327, + "grad_norm": 3.412059349668194, + "learning_rate": 2.116883273261316e-08, + "loss": 0.4841, + "step": 2075 + }, + { + "epoch": 0.9367174280879864, + "grad_norm": 3.0584437698376314, + "learning_rate": 2.087067553968602e-08, + "loss": 0.3759, + "step": 2076 + }, + { + "epoch": 0.9371686407219402, + "grad_norm": 3.2188511717745123, + "learning_rate": 2.057461085363954e-08, + "loss": 0.5688, + "step": 2077 + }, + { + "epoch": 0.9376198533558939, + "grad_norm": 2.9976242270120594, + "learning_rate": 2.028063930719637e-08, + "loss": 0.4009, + "step": 2078 + }, + { + "epoch": 0.9380710659898477, + "grad_norm": 2.892808153960682, + "learning_rate": 1.9988761528606178e-08, + "loss": 0.4392, + "step": 2079 + }, + { + "epoch": 0.9385222786238014, + "grad_norm": 3.2358320547434443, + "learning_rate": 1.9698978141643784e-08, + "loss": 0.503, + "step": 2080 + }, + { + "epoch": 0.9389734912577552, + "grad_norm": 3.3962163022150653, + "learning_rate": 1.941128976560791e-08, + "loss": 0.5694, + "step": 2081 + }, + { + "epoch": 0.939424703891709, + "grad_norm": 2.998665449003642, + "learning_rate": 1.912569701532063e-08, + "loss": 0.5902, + "step": 2082 + }, + { + "epoch": 0.9398759165256627, + "grad_norm": 3.3801760197505835, + "learning_rate": 1.8842200501124615e-08, + "loss": 0.5383, + "step": 2083 + }, + { + "epoch": 0.9403271291596165, + "grad_norm": 2.9348713239408593, + "learning_rate": 1.8560800828883227e-08, + "loss": 0.4877, + "step": 2084 + }, + { + "epoch": 0.9407783417935702, + "grad_norm": 3.097739950065729, + "learning_rate": 1.8281498599978407e-08, + "loss": 0.6081, + "step": 2085 + }, + { + "epoch": 0.941229554427524, + "grad_norm": 3.2982172408128814, + "learning_rate": 1.8004294411309685e-08, + "loss": 0.5686, + "step": 2086 + }, + { + "epoch": 0.9416807670614777, + "grad_norm": 2.9622880055723084, + "learning_rate": 1.7729188855292954e-08, + "loss": 0.578, + "step": 2087 + }, + { + "epoch": 0.9421319796954315, + "grad_norm": 2.947330771863818, + "learning_rate": 1.7456182519858808e-08, + "loss": 0.5588, + "step": 2088 + }, + { + "epoch": 0.9425831923293853, + "grad_norm": 2.937817283751574, + "learning_rate": 1.7185275988451986e-08, + "loss": 0.4514, + "step": 2089 + }, + { + "epoch": 0.943034404963339, + "grad_norm": 3.3308032462704644, + "learning_rate": 1.691646984002937e-08, + "loss": 0.5546, + "step": 2090 + }, + { + "epoch": 0.9434856175972928, + "grad_norm": 2.8609200678838875, + "learning_rate": 1.6649764649059094e-08, + "loss": 0.4772, + "step": 2091 + }, + { + "epoch": 0.9439368302312465, + "grad_norm": 3.3257921371498256, + "learning_rate": 1.6385160985519564e-08, + "loss": 0.4277, + "step": 2092 + }, + { + "epoch": 0.9443880428652002, + "grad_norm": 3.280225796167774, + "learning_rate": 1.6122659414897876e-08, + "loss": 0.6392, + "step": 2093 + }, + { + "epoch": 0.9448392554991539, + "grad_norm": 3.8085560156358054, + "learning_rate": 1.5862260498188728e-08, + "loss": 0.7179, + "step": 2094 + }, + { + "epoch": 0.9452904681331077, + "grad_norm": 3.3335448343938268, + "learning_rate": 1.56039647918933e-08, + "loss": 0.6197, + "step": 2095 + }, + { + "epoch": 0.9457416807670614, + "grad_norm": 3.031288983603637, + "learning_rate": 1.5347772848017583e-08, + "loss": 0.4715, + "step": 2096 + }, + { + "epoch": 0.9461928934010152, + "grad_norm": 3.164918639328314, + "learning_rate": 1.509368521407217e-08, + "loss": 0.5569, + "step": 2097 + }, + { + "epoch": 0.946644106034969, + "grad_norm": 3.210287170541672, + "learning_rate": 1.4841702433070037e-08, + "loss": 0.6519, + "step": 2098 + }, + { + "epoch": 0.9470953186689227, + "grad_norm": 2.6369570547287977, + "learning_rate": 1.4591825043526073e-08, + "loss": 0.4529, + "step": 2099 + }, + { + "epoch": 0.9475465313028765, + "grad_norm": 3.391418189703669, + "learning_rate": 1.4344053579455894e-08, + "loss": 0.4644, + "step": 2100 + }, + { + "epoch": 0.9479977439368302, + "grad_norm": 2.9940931472415793, + "learning_rate": 1.4098388570374154e-08, + "loss": 0.4532, + "step": 2101 + }, + { + "epoch": 0.948448956570784, + "grad_norm": 3.1943317280291077, + "learning_rate": 1.3854830541294105e-08, + "loss": 0.5737, + "step": 2102 + }, + { + "epoch": 0.9489001692047377, + "grad_norm": 3.463904492669196, + "learning_rate": 1.3613380012725717e-08, + "loss": 0.5669, + "step": 2103 + }, + { + "epoch": 0.9493513818386915, + "grad_norm": 2.950631481291773, + "learning_rate": 1.337403750067545e-08, + "loss": 0.5732, + "step": 2104 + }, + { + "epoch": 0.9498025944726453, + "grad_norm": 3.2538975233288303, + "learning_rate": 1.3136803516644701e-08, + "loss": 0.5587, + "step": 2105 + }, + { + "epoch": 0.950253807106599, + "grad_norm": 3.3928844420275692, + "learning_rate": 1.2901678567628249e-08, + "loss": 0.5758, + "step": 2106 + }, + { + "epoch": 0.9507050197405528, + "grad_norm": 3.402553144487364, + "learning_rate": 1.2668663156114035e-08, + "loss": 0.5521, + "step": 2107 + }, + { + "epoch": 0.9511562323745065, + "grad_norm": 3.1090460601388688, + "learning_rate": 1.2437757780081715e-08, + "loss": 0.4712, + "step": 2108 + }, + { + "epoch": 0.9516074450084603, + "grad_norm": 3.157111834878996, + "learning_rate": 1.2208962933001332e-08, + "loss": 0.7341, + "step": 2109 + }, + { + "epoch": 0.952058657642414, + "grad_norm": 3.6264603472952692, + "learning_rate": 1.1982279103832539e-08, + "loss": 0.6544, + "step": 2110 + }, + { + "epoch": 0.9525098702763677, + "grad_norm": 2.997104592016241, + "learning_rate": 1.175770677702359e-08, + "loss": 0.6325, + "step": 2111 + }, + { + "epoch": 0.9529610829103214, + "grad_norm": 3.428867574227346, + "learning_rate": 1.1535246432510249e-08, + "loss": 0.6784, + "step": 2112 + }, + { + "epoch": 0.9534122955442752, + "grad_norm": 3.215528072634851, + "learning_rate": 1.1314898545714768e-08, + "loss": 0.5196, + "step": 2113 + }, + { + "epoch": 0.953863508178229, + "grad_norm": 3.0135546799565507, + "learning_rate": 1.1096663587544574e-08, + "loss": 0.4638, + "step": 2114 + }, + { + "epoch": 0.9543147208121827, + "grad_norm": 3.4816204927235277, + "learning_rate": 1.0880542024391926e-08, + "loss": 0.5687, + "step": 2115 + }, + { + "epoch": 0.9547659334461365, + "grad_norm": 3.2568560809861333, + "learning_rate": 1.0666534318132248e-08, + "loss": 0.521, + "step": 2116 + }, + { + "epoch": 0.9552171460800902, + "grad_norm": 3.1510176627650037, + "learning_rate": 1.0454640926123581e-08, + "loss": 0.6516, + "step": 2117 + }, + { + "epoch": 0.955668358714044, + "grad_norm": 3.3030358895502543, + "learning_rate": 1.0244862301205248e-08, + "loss": 0.4968, + "step": 2118 + }, + { + "epoch": 0.9561195713479977, + "grad_norm": 3.204100014366423, + "learning_rate": 1.0037198891697297e-08, + "loss": 0.5478, + "step": 2119 + }, + { + "epoch": 0.9565707839819515, + "grad_norm": 3.180566457304421, + "learning_rate": 9.831651141399167e-09, + "loss": 0.6166, + "step": 2120 + }, + { + "epoch": 0.9570219966159053, + "grad_norm": 2.9835868653318114, + "learning_rate": 9.62821948958914e-09, + "loss": 0.5578, + "step": 2121 + }, + { + "epoch": 0.957473209249859, + "grad_norm": 3.335223111473605, + "learning_rate": 9.42690437102267e-09, + "loss": 0.6616, + "step": 2122 + }, + { + "epoch": 0.9579244218838128, + "grad_norm": 3.0269577095198965, + "learning_rate": 9.227706215932718e-09, + "loss": 0.4504, + "step": 2123 + }, + { + "epoch": 0.9583756345177665, + "grad_norm": 3.034533006063733, + "learning_rate": 9.030625450027197e-09, + "loss": 0.535, + "step": 2124 + }, + { + "epoch": 0.9588268471517203, + "grad_norm": 3.4956858458681195, + "learning_rate": 8.835662494489638e-09, + "loss": 0.5865, + "step": 2125 + }, + { + "epoch": 0.959278059785674, + "grad_norm": 3.2430278438524804, + "learning_rate": 8.642817765977084e-09, + "loss": 0.568, + "step": 2126 + }, + { + "epoch": 0.9597292724196278, + "grad_norm": 3.259168460054936, + "learning_rate": 8.452091676619976e-09, + "loss": 0.4907, + "step": 2127 + }, + { + "epoch": 0.9601804850535814, + "grad_norm": 2.9506525517388447, + "learning_rate": 8.263484634020934e-09, + "loss": 0.5421, + "step": 2128 + }, + { + "epoch": 0.9606316976875352, + "grad_norm": 3.3734708681832846, + "learning_rate": 8.076997041253864e-09, + "loss": 0.6559, + "step": 2129 + }, + { + "epoch": 0.961082910321489, + "grad_norm": 3.239791595659205, + "learning_rate": 7.892629296863296e-09, + "loss": 0.6207, + "step": 2130 + }, + { + "epoch": 0.9615341229554427, + "grad_norm": 2.9754850080940325, + "learning_rate": 7.710381794863275e-09, + "loss": 0.4749, + "step": 2131 + }, + { + "epoch": 0.9619853355893965, + "grad_norm": 3.1775519947710387, + "learning_rate": 7.53025492473669e-09, + "loss": 0.6391, + "step": 2132 + }, + { + "epoch": 0.9624365482233502, + "grad_norm": 3.066624979505507, + "learning_rate": 7.352249071434613e-09, + "loss": 0.6327, + "step": 2133 + }, + { + "epoch": 0.962887760857304, + "grad_norm": 3.1723120674863563, + "learning_rate": 7.176364615374964e-09, + "loss": 0.4504, + "step": 2134 + }, + { + "epoch": 0.9633389734912577, + "grad_norm": 3.026276568695946, + "learning_rate": 7.002601932442176e-09, + "loss": 0.5376, + "step": 2135 + }, + { + "epoch": 0.9637901861252115, + "grad_norm": 3.0281976135848914, + "learning_rate": 6.830961393986201e-09, + "loss": 0.5515, + "step": 2136 + }, + { + "epoch": 0.9642413987591653, + "grad_norm": 3.5847532083995337, + "learning_rate": 6.661443366821618e-09, + "loss": 0.5691, + "step": 2137 + }, + { + "epoch": 0.964692611393119, + "grad_norm": 3.1294010668010026, + "learning_rate": 6.4940482132272985e-09, + "loss": 0.5833, + "step": 2138 + }, + { + "epoch": 0.9651438240270728, + "grad_norm": 3.387530884932056, + "learning_rate": 6.3287762909447486e-09, + "loss": 0.5154, + "step": 2139 + }, + { + "epoch": 0.9655950366610265, + "grad_norm": 3.646146989054173, + "learning_rate": 6.165627953178432e-09, + "loss": 0.6854, + "step": 2140 + }, + { + "epoch": 0.9660462492949803, + "grad_norm": 3.083896198719111, + "learning_rate": 6.0046035485941114e-09, + "loss": 0.64, + "step": 2141 + }, + { + "epoch": 0.966497461928934, + "grad_norm": 3.1945218965338165, + "learning_rate": 5.845703421318849e-09, + "loss": 0.5138, + "step": 2142 + }, + { + "epoch": 0.9669486745628878, + "grad_norm": 2.9725665961761156, + "learning_rate": 5.688927910939445e-09, + "loss": 0.5811, + "step": 2143 + }, + { + "epoch": 0.9673998871968416, + "grad_norm": 3.065001768071281, + "learning_rate": 5.534277352502448e-09, + "loss": 0.4917, + "step": 2144 + }, + { + "epoch": 0.9678510998307953, + "grad_norm": 2.9575837796796147, + "learning_rate": 5.381752076513146e-09, + "loss": 0.4729, + "step": 2145 + }, + { + "epoch": 0.968302312464749, + "grad_norm": 3.490044797232381, + "learning_rate": 5.231352408934686e-09, + "loss": 0.5855, + "step": 2146 + }, + { + "epoch": 0.9687535250987027, + "grad_norm": 3.2834883714280823, + "learning_rate": 5.083078671187846e-09, + "loss": 0.603, + "step": 2147 + }, + { + "epoch": 0.9692047377326565, + "grad_norm": 3.6227714783334855, + "learning_rate": 4.936931180149706e-09, + "loss": 0.5146, + "step": 2148 + }, + { + "epoch": 0.9696559503666102, + "grad_norm": 3.360342852449879, + "learning_rate": 4.792910248153537e-09, + "loss": 0.6926, + "step": 2149 + }, + { + "epoch": 0.970107163000564, + "grad_norm": 2.9745190180859997, + "learning_rate": 4.6510161829880215e-09, + "loss": 0.4959, + "step": 2150 + }, + { + "epoch": 0.9705583756345177, + "grad_norm": 2.8409802136356115, + "learning_rate": 4.511249287896257e-09, + "loss": 0.4792, + "step": 2151 + }, + { + "epoch": 0.9710095882684715, + "grad_norm": 2.9527499447768024, + "learning_rate": 4.373609861575422e-09, + "loss": 0.5262, + "step": 2152 + }, + { + "epoch": 0.9714608009024253, + "grad_norm": 2.8234329667683995, + "learning_rate": 4.238098198175999e-09, + "loss": 0.453, + "step": 2153 + }, + { + "epoch": 0.971912013536379, + "grad_norm": 3.1955934495623177, + "learning_rate": 4.1047145873015494e-09, + "loss": 0.5705, + "step": 2154 + }, + { + "epoch": 0.9723632261703328, + "grad_norm": 3.2186315351530177, + "learning_rate": 3.9734593140072766e-09, + "loss": 0.5774, + "step": 2155 + }, + { + "epoch": 0.9728144388042865, + "grad_norm": 3.2781816568797115, + "learning_rate": 3.844332658800131e-09, + "loss": 0.534, + "step": 2156 + }, + { + "epoch": 0.9732656514382403, + "grad_norm": 3.1271563804768516, + "learning_rate": 3.717334897638147e-09, + "loss": 0.4134, + "step": 2157 + }, + { + "epoch": 0.973716864072194, + "grad_norm": 3.1072908879945045, + "learning_rate": 3.59246630192922e-09, + "loss": 0.6476, + "step": 2158 + }, + { + "epoch": 0.9741680767061478, + "grad_norm": 3.0501654737467048, + "learning_rate": 3.469727138531442e-09, + "loss": 0.517, + "step": 2159 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 3.1296507770053585, + "learning_rate": 3.3491176697517663e-09, + "loss": 0.5104, + "step": 2160 + }, + { + "epoch": 0.9750705019740553, + "grad_norm": 3.452718374388686, + "learning_rate": 3.2306381533460103e-09, + "loss": 0.572, + "step": 2161 + }, + { + "epoch": 0.9755217146080091, + "grad_norm": 3.002928921865537, + "learning_rate": 3.1142888425177428e-09, + "loss": 0.5037, + "step": 2162 + }, + { + "epoch": 0.9759729272419628, + "grad_norm": 3.1485029119178862, + "learning_rate": 3.0000699859183965e-09, + "loss": 0.5625, + "step": 2163 + }, + { + "epoch": 0.9764241398759165, + "grad_norm": 3.0075732113850138, + "learning_rate": 2.8879818276459357e-09, + "loss": 0.6007, + "step": 2164 + }, + { + "epoch": 0.9768753525098702, + "grad_norm": 3.5339981344018234, + "learning_rate": 2.7780246072454103e-09, + "loss": 0.641, + "step": 2165 + }, + { + "epoch": 0.977326565143824, + "grad_norm": 2.9938714355039138, + "learning_rate": 2.6701985597071817e-09, + "loss": 0.6232, + "step": 2166 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 3.5748734649397735, + "learning_rate": 2.5645039154675863e-09, + "loss": 0.5054, + "step": 2167 + }, + { + "epoch": 0.9782289904117315, + "grad_norm": 2.821490730111447, + "learning_rate": 2.4609409004074934e-09, + "loss": 0.3994, + "step": 2168 + }, + { + "epoch": 0.9786802030456853, + "grad_norm": 3.2813541645652347, + "learning_rate": 2.3595097358525275e-09, + "loss": 0.5495, + "step": 2169 + }, + { + "epoch": 0.979131415679639, + "grad_norm": 3.1682355394847694, + "learning_rate": 2.26021063857218e-09, + "loss": 0.4805, + "step": 2170 + }, + { + "epoch": 0.9795826283135928, + "grad_norm": 3.940594922294758, + "learning_rate": 2.1630438207795864e-09, + "loss": 0.5344, + "step": 2171 + }, + { + "epoch": 0.9800338409475465, + "grad_norm": 2.8957814567912292, + "learning_rate": 2.068009490130862e-09, + "loss": 0.4688, + "step": 2172 + }, + { + "epoch": 0.9804850535815003, + "grad_norm": 3.2705425141515643, + "learning_rate": 1.9751078497248776e-09, + "loss": 0.5038, + "step": 2173 + }, + { + "epoch": 0.980936266215454, + "grad_norm": 3.2831548812730134, + "learning_rate": 1.884339098102483e-09, + "loss": 0.4829, + "step": 2174 + }, + { + "epoch": 0.9813874788494078, + "grad_norm": 3.0060524899921393, + "learning_rate": 1.79570342924662e-09, + "loss": 0.5734, + "step": 2175 + }, + { + "epoch": 0.9818386914833616, + "grad_norm": 3.2596135761529332, + "learning_rate": 1.709201032581431e-09, + "loss": 0.4523, + "step": 2176 + }, + { + "epoch": 0.9822899041173153, + "grad_norm": 2.987843850321189, + "learning_rate": 1.6248320929719283e-09, + "loss": 0.5607, + "step": 2177 + }, + { + "epoch": 0.9827411167512691, + "grad_norm": 2.9922698251969893, + "learning_rate": 1.5425967907239933e-09, + "loss": 0.5095, + "step": 2178 + }, + { + "epoch": 0.9831923293852228, + "grad_norm": 3.1200632797078933, + "learning_rate": 1.4624953015832663e-09, + "loss": 0.5743, + "step": 2179 + }, + { + "epoch": 0.9836435420191766, + "grad_norm": 3.0528829193084914, + "learning_rate": 1.3845277967355905e-09, + "loss": 0.4337, + "step": 2180 + }, + { + "epoch": 0.9840947546531302, + "grad_norm": 2.9733715654870614, + "learning_rate": 1.3086944428060132e-09, + "loss": 0.3556, + "step": 2181 + }, + { + "epoch": 0.984545967287084, + "grad_norm": 2.9735610218703026, + "learning_rate": 1.234995401858785e-09, + "loss": 0.5374, + "step": 2182 + }, + { + "epoch": 0.9849971799210377, + "grad_norm": 3.3255247596982183, + "learning_rate": 1.1634308313966944e-09, + "loss": 0.5705, + "step": 2183 + }, + { + "epoch": 0.9854483925549915, + "grad_norm": 3.379807226305083, + "learning_rate": 1.0940008843612903e-09, + "loss": 0.6434, + "step": 2184 + }, + { + "epoch": 0.9858996051889453, + "grad_norm": 3.2662985750115894, + "learning_rate": 1.026705709131992e-09, + "loss": 0.5889, + "step": 2185 + }, + { + "epoch": 0.986350817822899, + "grad_norm": 3.4188254818839927, + "learning_rate": 9.61545449525758e-10, + "loss": 0.4827, + "step": 2186 + }, + { + "epoch": 0.9868020304568528, + "grad_norm": 3.3483494345972713, + "learning_rate": 8.985202447974183e-10, + "loss": 0.5504, + "step": 2187 + }, + { + "epoch": 0.9872532430908065, + "grad_norm": 3.2171372630578676, + "learning_rate": 8.376302296387861e-10, + "loss": 0.5418, + "step": 2188 + }, + { + "epoch": 0.9877044557247603, + "grad_norm": 3.6688593817328794, + "learning_rate": 7.788755341783249e-10, + "loss": 0.5123, + "step": 2189 + }, + { + "epoch": 0.988155668358714, + "grad_norm": 3.6499281007897153, + "learning_rate": 7.222562839813706e-10, + "loss": 0.6186, + "step": 2190 + }, + { + "epoch": 0.9886068809926678, + "grad_norm": 3.39857073628445, + "learning_rate": 6.677726000494655e-10, + "loss": 0.5808, + "step": 2191 + }, + { + "epoch": 0.9890580936266216, + "grad_norm": 3.5449673321180204, + "learning_rate": 6.154245988202466e-10, + "loss": 0.7051, + "step": 2192 + }, + { + "epoch": 0.9895093062605753, + "grad_norm": 3.248475022142295, + "learning_rate": 5.652123921672247e-10, + "loss": 0.5403, + "step": 2193 + }, + { + "epoch": 0.9899605188945291, + "grad_norm": 3.3216368867145736, + "learning_rate": 5.171360873991171e-10, + "loss": 0.5873, + "step": 2194 + }, + { + "epoch": 0.9904117315284828, + "grad_norm": 3.0891117032715267, + "learning_rate": 4.711957872606254e-10, + "loss": 0.5611, + "step": 2195 + }, + { + "epoch": 0.9908629441624366, + "grad_norm": 3.340990779373879, + "learning_rate": 4.273915899309921e-10, + "loss": 0.5511, + "step": 2196 + }, + { + "epoch": 0.9913141567963903, + "grad_norm": 3.062648006172101, + "learning_rate": 3.857235890245558e-10, + "loss": 0.4507, + "step": 2197 + }, + { + "epoch": 0.9917653694303441, + "grad_norm": 3.168752716935158, + "learning_rate": 3.461918735905289e-10, + "loss": 0.5381, + "step": 2198 + }, + { + "epoch": 0.9922165820642977, + "grad_norm": 3.215884817371043, + "learning_rate": 3.0879652811255376e-10, + "loss": 0.5669, + "step": 2199 + }, + { + "epoch": 0.9926677946982515, + "grad_norm": 3.3933100109929524, + "learning_rate": 2.735376325084804e-10, + "loss": 0.5129, + "step": 2200 + }, + { + "epoch": 0.9931190073322053, + "grad_norm": 3.2902465372899896, + "learning_rate": 2.404152621305888e-10, + "loss": 0.5563, + "step": 2201 + }, + { + "epoch": 0.993570219966159, + "grad_norm": 3.2517872002551265, + "learning_rate": 2.0942948776481173e-10, + "loss": 0.4794, + "step": 2202 + }, + { + "epoch": 0.9940214326001128, + "grad_norm": 3.0936052923048227, + "learning_rate": 1.805803756314006e-10, + "loss": 0.53, + "step": 2203 + }, + { + "epoch": 0.9944726452340665, + "grad_norm": 3.3848557451197743, + "learning_rate": 1.5386798738381557e-10, + "loss": 0.5361, + "step": 2204 + }, + { + "epoch": 0.9949238578680203, + "grad_norm": 2.5261957634383094, + "learning_rate": 1.292923801096135e-10, + "loss": 0.3873, + "step": 2205 + }, + { + "epoch": 0.995375070501974, + "grad_norm": 3.2421625947937587, + "learning_rate": 1.0685360632933793e-10, + "loss": 0.5038, + "step": 2206 + }, + { + "epoch": 0.9958262831359278, + "grad_norm": 3.283331275472657, + "learning_rate": 8.655171399718497e-11, + "loss": 0.568, + "step": 2207 + }, + { + "epoch": 0.9962774957698816, + "grad_norm": 2.9967877566234473, + "learning_rate": 6.838674650067044e-11, + "loss": 0.4354, + "step": 2208 + }, + { + "epoch": 0.9967287084038353, + "grad_norm": 3.2686080869856142, + "learning_rate": 5.235874266018569e-11, + "loss": 0.4931, + "step": 2209 + }, + { + "epoch": 0.9971799210377891, + "grad_norm": 2.926365790766811, + "learning_rate": 3.846773672933068e-11, + "loss": 0.6223, + "step": 2210 + }, + { + "epoch": 0.9976311336717428, + "grad_norm": 3.0651076068440113, + "learning_rate": 2.6713758394802943e-11, + "loss": 0.6735, + "step": 2211 + }, + { + "epoch": 0.9980823463056966, + "grad_norm": 3.317938299846236, + "learning_rate": 1.709683277606455e-11, + "loss": 0.5608, + "step": 2212 + }, + { + "epoch": 0.9985335589396503, + "grad_norm": 3.170578430333791, + "learning_rate": 9.616980425453113e-12, + "loss": 0.569, + "step": 2213 + }, + { + "epoch": 0.9989847715736041, + "grad_norm": 2.805405937442547, + "learning_rate": 4.274217328514851e-12, + "loss": 0.5771, + "step": 2214 + }, + { + "epoch": 0.9994359842075579, + "grad_norm": 3.2152320900605185, + "learning_rate": 1.068554903005392e-12, + "loss": 0.4857, + "step": 2215 + }, + { + "epoch": 0.9998871968415116, + "grad_norm": 3.2758141104619267, + "learning_rate": 0.0, + "loss": 0.479, + "step": 2216 + }, + { + "epoch": 0.9998871968415116, + "step": 2216, + "total_flos": 1474645333573632.0, + "train_loss": 0.580713667661382, + "train_runtime": 104901.7766, + "train_samples_per_second": 1.352, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 1.0, + "max_steps": 2216, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1474645333573632.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}