diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,73920 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5000236910684672, + "eval_steps": 10553, + "global_step": 10553, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.738213693437574e-05, + "grad_norm": 0.265625, + "learning_rate": 2e-05, + "loss": 1.5032, + "step": 1 + }, + { + "epoch": 4.738213693437574e-05, + "eval_loss": 1.6555964946746826, + "eval_runtime": 1297.243, + "eval_samples_per_second": 1.808, + "eval_steps_per_second": 1.808, + "step": 1 + }, + { + "epoch": 9.476427386875148e-05, + "grad_norm": 0.455078125, + "learning_rate": 4e-05, + "loss": 1.7934, + "step": 2 + }, + { + "epoch": 0.00014214641080312722, + "grad_norm": 0.232421875, + "learning_rate": 6e-05, + "loss": 1.5472, + "step": 3 + }, + { + "epoch": 0.00018952854773750296, + "grad_norm": 0.35546875, + "learning_rate": 8e-05, + "loss": 1.7256, + "step": 4 + }, + { + "epoch": 0.00023691068467187872, + "grad_norm": 0.296875, + "learning_rate": 0.0001, + "loss": 1.2578, + "step": 5 + }, + { + "epoch": 0.00028429282160625445, + "grad_norm": 0.2431640625, + "learning_rate": 0.00012, + "loss": 1.3301, + "step": 6 + }, + { + "epoch": 0.0003316749585406302, + "grad_norm": 0.306640625, + "learning_rate": 0.00014, + "loss": 1.5469, + "step": 7 + }, + { + "epoch": 0.0003790570954750059, + "grad_norm": 0.35546875, + "learning_rate": 0.00016, + "loss": 1.4694, + "step": 8 + }, + { + "epoch": 0.00042643923240938164, + "grad_norm": 0.267578125, + "learning_rate": 0.00018, + "loss": 0.9671, + "step": 9 + }, + { + "epoch": 0.00047382136934375743, + "grad_norm": 0.431640625, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 10 + }, + { + "epoch": 0.0005212035062781332, + "grad_norm": 0.4453125, + "learning_rate": 0.00019999999889105325, + "loss": 1.5963, + "step": 11 + }, + { + "epoch": 0.0005685856432125089, + "grad_norm": 0.5, + "learning_rate": 0.00019999999556421307, + "loss": 1.694, + "step": 12 + }, + { + "epoch": 0.0006159677801468846, + "grad_norm": 1.3046875, + "learning_rate": 0.00019999999001947948, + "loss": 2.0886, + "step": 13 + }, + { + "epoch": 0.0006633499170812604, + "grad_norm": 0.5703125, + "learning_rate": 0.00019999998225685262, + "loss": 1.9375, + "step": 14 + }, + { + "epoch": 0.0007107320540156361, + "grad_norm": 0.396484375, + "learning_rate": 0.00019999997227633268, + "loss": 1.429, + "step": 15 + }, + { + "epoch": 0.0007581141909500118, + "grad_norm": 0.458984375, + "learning_rate": 0.00019999996007791988, + "loss": 1.2525, + "step": 16 + }, + { + "epoch": 0.0008054963278843876, + "grad_norm": 0.47265625, + "learning_rate": 0.00019999994566161444, + "loss": 0.9333, + "step": 17 + }, + { + "epoch": 0.0008528784648187633, + "grad_norm": 0.75, + "learning_rate": 0.00019999992902741678, + "loss": 1.07, + "step": 18 + }, + { + "epoch": 0.000900260601753139, + "grad_norm": 0.7890625, + "learning_rate": 0.00019999991017532716, + "loss": 1.0739, + "step": 19 + }, + { + "epoch": 0.0009476427386875149, + "grad_norm": 0.640625, + "learning_rate": 0.00019999988910534606, + "loss": 0.9974, + "step": 20 + }, + { + "epoch": 0.0009950248756218905, + "grad_norm": 0.416015625, + "learning_rate": 0.00019999986581747394, + "loss": 1.147, + "step": 21 + }, + { + "epoch": 0.0010424070125562663, + "grad_norm": 0.43359375, + "learning_rate": 0.0001999998403117113, + "loss": 1.2521, + "step": 22 + }, + { + "epoch": 0.001089789149490642, + "grad_norm": 1.4296875, + "learning_rate": 0.00019999981258805874, + "loss": 0.9572, + "step": 23 + }, + { + "epoch": 0.0011371712864250178, + "grad_norm": 0.54296875, + "learning_rate": 0.00019999978264651684, + "loss": 1.0038, + "step": 24 + }, + { + "epoch": 0.0011845534233593934, + "grad_norm": 0.53125, + "learning_rate": 0.00019999975048708626, + "loss": 1.1757, + "step": 25 + }, + { + "epoch": 0.0012319355602937693, + "grad_norm": 0.6171875, + "learning_rate": 0.00019999971610976775, + "loss": 0.8609, + "step": 26 + }, + { + "epoch": 0.001279317697228145, + "grad_norm": 0.734375, + "learning_rate": 0.00019999967951456204, + "loss": 1.7934, + "step": 27 + }, + { + "epoch": 0.0013266998341625207, + "grad_norm": 0.51171875, + "learning_rate": 0.00019999964070146998, + "loss": 1.3382, + "step": 28 + }, + { + "epoch": 0.0013740819710968966, + "grad_norm": 0.427734375, + "learning_rate": 0.00019999959967049237, + "loss": 1.0583, + "step": 29 + }, + { + "epoch": 0.0014214641080312722, + "grad_norm": 0.482421875, + "learning_rate": 0.00019999955642163015, + "loss": 1.0192, + "step": 30 + }, + { + "epoch": 0.001468846244965648, + "grad_norm": 0.45703125, + "learning_rate": 0.00019999951095488433, + "loss": 1.5542, + "step": 31 + }, + { + "epoch": 0.0015162283819000236, + "grad_norm": 0.44921875, + "learning_rate": 0.00019999946327025584, + "loss": 1.3448, + "step": 32 + }, + { + "epoch": 0.0015636105188343995, + "grad_norm": 0.474609375, + "learning_rate": 0.00019999941336774576, + "loss": 1.6881, + "step": 33 + }, + { + "epoch": 0.0016109926557687751, + "grad_norm": 0.65625, + "learning_rate": 0.00019999936124735524, + "loss": 0.7713, + "step": 34 + }, + { + "epoch": 0.001658374792703151, + "grad_norm": 0.44921875, + "learning_rate": 0.00019999930690908535, + "loss": 1.1188, + "step": 35 + }, + { + "epoch": 0.0017057569296375266, + "grad_norm": 0.859375, + "learning_rate": 0.00019999925035293738, + "loss": 1.0596, + "step": 36 + }, + { + "epoch": 0.0017531390665719024, + "grad_norm": 0.5546875, + "learning_rate": 0.00019999919157891256, + "loss": 1.3195, + "step": 37 + }, + { + "epoch": 0.001800521203506278, + "grad_norm": 0.33984375, + "learning_rate": 0.00019999913058701217, + "loss": 0.9574, + "step": 38 + }, + { + "epoch": 0.0018479033404406539, + "grad_norm": 0.93359375, + "learning_rate": 0.00019999906737723757, + "loss": 0.7413, + "step": 39 + }, + { + "epoch": 0.0018952854773750297, + "grad_norm": 0.6875, + "learning_rate": 0.00019999900194959017, + "loss": 1.1054, + "step": 40 + }, + { + "epoch": 0.0019426676143094053, + "grad_norm": 0.5234375, + "learning_rate": 0.00019999893430407145, + "loss": 0.8824, + "step": 41 + }, + { + "epoch": 0.001990049751243781, + "grad_norm": 0.39453125, + "learning_rate": 0.00019999886444068286, + "loss": 1.0459, + "step": 42 + }, + { + "epoch": 0.002037431888178157, + "grad_norm": 0.455078125, + "learning_rate": 0.000199998792359426, + "loss": 0.7952, + "step": 43 + }, + { + "epoch": 0.0020848140251125327, + "grad_norm": 0.7265625, + "learning_rate": 0.00019999871806030239, + "loss": 1.1249, + "step": 44 + }, + { + "epoch": 0.0021321961620469083, + "grad_norm": 0.462890625, + "learning_rate": 0.00019999864154331376, + "loss": 1.4, + "step": 45 + }, + { + "epoch": 0.002179578298981284, + "grad_norm": 0.4296875, + "learning_rate": 0.00019999856280846173, + "loss": 1.5702, + "step": 46 + }, + { + "epoch": 0.00222696043591566, + "grad_norm": 0.578125, + "learning_rate": 0.00019999848185574815, + "loss": 1.3921, + "step": 47 + }, + { + "epoch": 0.0022743425728500356, + "grad_norm": 0.5546875, + "learning_rate": 0.00019999839868517475, + "loss": 1.0725, + "step": 48 + }, + { + "epoch": 0.002321724709784411, + "grad_norm": 0.51171875, + "learning_rate": 0.00019999831329674334, + "loss": 1.0056, + "step": 49 + }, + { + "epoch": 0.002369106846718787, + "grad_norm": 0.53125, + "learning_rate": 0.00019999822569045589, + "loss": 1.0241, + "step": 50 + }, + { + "epoch": 0.002416488983653163, + "grad_norm": 0.498046875, + "learning_rate": 0.00019999813586631427, + "loss": 1.0889, + "step": 51 + }, + { + "epoch": 0.0024638711205875385, + "grad_norm": 0.5234375, + "learning_rate": 0.00019999804382432053, + "loss": 1.1529, + "step": 52 + }, + { + "epoch": 0.002511253257521914, + "grad_norm": 0.5234375, + "learning_rate": 0.00019999794956447673, + "loss": 1.4482, + "step": 53 + }, + { + "epoch": 0.00255863539445629, + "grad_norm": 0.5390625, + "learning_rate": 0.00019999785308678488, + "loss": 0.7506, + "step": 54 + }, + { + "epoch": 0.002606017531390666, + "grad_norm": 0.6015625, + "learning_rate": 0.00019999775439124716, + "loss": 0.9041, + "step": 55 + }, + { + "epoch": 0.0026533996683250414, + "grad_norm": 0.57421875, + "learning_rate": 0.00019999765347786578, + "loss": 1.0598, + "step": 56 + }, + { + "epoch": 0.002700781805259417, + "grad_norm": 0.64453125, + "learning_rate": 0.00019999755034664295, + "loss": 0.9802, + "step": 57 + }, + { + "epoch": 0.002748163942193793, + "grad_norm": 0.5078125, + "learning_rate": 0.00019999744499758096, + "loss": 0.7913, + "step": 58 + }, + { + "epoch": 0.0027955460791281687, + "grad_norm": 0.515625, + "learning_rate": 0.00019999733743068215, + "loss": 0.9532, + "step": 59 + }, + { + "epoch": 0.0028429282160625444, + "grad_norm": 0.54296875, + "learning_rate": 0.0001999972276459489, + "loss": 1.6101, + "step": 60 + }, + { + "epoch": 0.00289031035299692, + "grad_norm": 0.515625, + "learning_rate": 0.00019999711564338367, + "loss": 1.2862, + "step": 61 + }, + { + "epoch": 0.002937692489931296, + "grad_norm": 0.72265625, + "learning_rate": 0.0001999970014229889, + "loss": 0.1047, + "step": 62 + }, + { + "epoch": 0.0029850746268656717, + "grad_norm": 0.609375, + "learning_rate": 0.0001999968849847672, + "loss": 1.4375, + "step": 63 + }, + { + "epoch": 0.0030324567638000473, + "grad_norm": 0.5390625, + "learning_rate": 0.00019999676632872108, + "loss": 1.3146, + "step": 64 + }, + { + "epoch": 0.003079838900734423, + "grad_norm": 0.7890625, + "learning_rate": 0.0001999966454548532, + "loss": 0.2704, + "step": 65 + }, + { + "epoch": 0.003127221037668799, + "grad_norm": 0.46875, + "learning_rate": 0.0001999965223631662, + "loss": 1.0367, + "step": 66 + }, + { + "epoch": 0.0031746031746031746, + "grad_norm": 0.54296875, + "learning_rate": 0.0001999963970536629, + "loss": 0.7727, + "step": 67 + }, + { + "epoch": 0.0032219853115375502, + "grad_norm": 0.43359375, + "learning_rate": 0.00019999626952634599, + "loss": 1.244, + "step": 68 + }, + { + "epoch": 0.0032693674484719263, + "grad_norm": 0.45703125, + "learning_rate": 0.00019999613978121834, + "loss": 1.08, + "step": 69 + }, + { + "epoch": 0.003316749585406302, + "grad_norm": 0.50390625, + "learning_rate": 0.0001999960078182828, + "loss": 1.1897, + "step": 70 + }, + { + "epoch": 0.0033641317223406775, + "grad_norm": 0.51953125, + "learning_rate": 0.00019999587363754234, + "loss": 1.0152, + "step": 71 + }, + { + "epoch": 0.003411513859275053, + "grad_norm": 0.50390625, + "learning_rate": 0.00019999573723899992, + "loss": 1.1051, + "step": 72 + }, + { + "epoch": 0.003458895996209429, + "grad_norm": 0.421875, + "learning_rate": 0.00019999559862265856, + "loss": 0.8628, + "step": 73 + }, + { + "epoch": 0.003506278133143805, + "grad_norm": 0.408203125, + "learning_rate": 0.00019999545778852132, + "loss": 0.9969, + "step": 74 + }, + { + "epoch": 0.0035536602700781805, + "grad_norm": 0.458984375, + "learning_rate": 0.00019999531473659135, + "loss": 1.3045, + "step": 75 + }, + { + "epoch": 0.003601042407012556, + "grad_norm": 0.59765625, + "learning_rate": 0.0001999951694668718, + "loss": 0.7187, + "step": 76 + }, + { + "epoch": 0.003648424543946932, + "grad_norm": 0.53515625, + "learning_rate": 0.0001999950219793659, + "loss": 1.2484, + "step": 77 + }, + { + "epoch": 0.0036958066808813078, + "grad_norm": 0.58984375, + "learning_rate": 0.0001999948722740769, + "loss": 1.1648, + "step": 78 + }, + { + "epoch": 0.0037431888178156834, + "grad_norm": 0.73046875, + "learning_rate": 0.0001999947203510082, + "loss": 1.1182, + "step": 79 + }, + { + "epoch": 0.0037905709547500594, + "grad_norm": 0.52734375, + "learning_rate": 0.0001999945662101631, + "loss": 0.9652, + "step": 80 + }, + { + "epoch": 0.003837953091684435, + "grad_norm": 0.390625, + "learning_rate": 0.00019999440985154498, + "loss": 1.1329, + "step": 81 + }, + { + "epoch": 0.0038853352286188107, + "grad_norm": 0.400390625, + "learning_rate": 0.0001999942512751574, + "loss": 0.6673, + "step": 82 + }, + { + "epoch": 0.003932717365553186, + "grad_norm": 0.57421875, + "learning_rate": 0.00019999409048100382, + "loss": 0.743, + "step": 83 + }, + { + "epoch": 0.003980099502487562, + "grad_norm": 0.400390625, + "learning_rate": 0.0001999939274690878, + "loss": 0.8757, + "step": 84 + }, + { + "epoch": 0.0040274816394219376, + "grad_norm": 0.66015625, + "learning_rate": 0.000199993762239413, + "loss": 1.2065, + "step": 85 + }, + { + "epoch": 0.004074863776356314, + "grad_norm": 0.56640625, + "learning_rate": 0.0001999935947919831, + "loss": 1.5393, + "step": 86 + }, + { + "epoch": 0.00412224591329069, + "grad_norm": 0.302734375, + "learning_rate": 0.00019999342512680172, + "loss": 0.6055, + "step": 87 + }, + { + "epoch": 0.004169628050225065, + "grad_norm": 0.388671875, + "learning_rate": 0.0001999932532438727, + "loss": 0.963, + "step": 88 + }, + { + "epoch": 0.004217010187159441, + "grad_norm": 0.482421875, + "learning_rate": 0.00019999307914319981, + "loss": 0.5133, + "step": 89 + }, + { + "epoch": 0.0042643923240938165, + "grad_norm": 0.56640625, + "learning_rate": 0.00019999290282478698, + "loss": 1.1098, + "step": 90 + }, + { + "epoch": 0.004311774461028192, + "grad_norm": 0.515625, + "learning_rate": 0.00019999272428863804, + "loss": 0.9357, + "step": 91 + }, + { + "epoch": 0.004359156597962568, + "grad_norm": 0.4375, + "learning_rate": 0.00019999254353475702, + "loss": 0.9475, + "step": 92 + }, + { + "epoch": 0.004406538734896943, + "grad_norm": 0.56640625, + "learning_rate": 0.00019999236056314783, + "loss": 0.9209, + "step": 93 + }, + { + "epoch": 0.00445392087183132, + "grad_norm": 0.71484375, + "learning_rate": 0.00019999217537381464, + "loss": 1.2401, + "step": 94 + }, + { + "epoch": 0.0045013030087656955, + "grad_norm": 0.421875, + "learning_rate": 0.0001999919879667615, + "loss": 1.2056, + "step": 95 + }, + { + "epoch": 0.004548685145700071, + "grad_norm": 0.484375, + "learning_rate": 0.00019999179834199256, + "loss": 0.8911, + "step": 96 + }, + { + "epoch": 0.004596067282634447, + "grad_norm": 0.71484375, + "learning_rate": 0.00019999160649951202, + "loss": 1.5826, + "step": 97 + }, + { + "epoch": 0.004643449419568822, + "grad_norm": 0.55078125, + "learning_rate": 0.00019999141243932418, + "loss": 1.5075, + "step": 98 + }, + { + "epoch": 0.004690831556503198, + "grad_norm": 0.51171875, + "learning_rate": 0.00019999121616143332, + "loss": 1.2497, + "step": 99 + }, + { + "epoch": 0.004738213693437574, + "grad_norm": 0.439453125, + "learning_rate": 0.00019999101766584378, + "loss": 1.0032, + "step": 100 + }, + { + "epoch": 0.00478559583037195, + "grad_norm": 0.419921875, + "learning_rate": 0.00019999081695255998, + "loss": 1.241, + "step": 101 + }, + { + "epoch": 0.004832977967306326, + "grad_norm": 0.6484375, + "learning_rate": 0.00019999061402158636, + "loss": 0.1521, + "step": 102 + }, + { + "epoch": 0.004880360104240701, + "grad_norm": 0.42578125, + "learning_rate": 0.00019999040887292745, + "loss": 1.5866, + "step": 103 + }, + { + "epoch": 0.004927742241175077, + "grad_norm": 0.41015625, + "learning_rate": 0.00019999020150658774, + "loss": 1.6, + "step": 104 + }, + { + "epoch": 0.004975124378109453, + "grad_norm": 0.546875, + "learning_rate": 0.00019998999192257188, + "loss": 1.1149, + "step": 105 + }, + { + "epoch": 0.005022506515043828, + "grad_norm": 0.447265625, + "learning_rate": 0.0001999897801208845, + "loss": 0.9298, + "step": 106 + }, + { + "epoch": 0.005069888651978204, + "grad_norm": 0.359375, + "learning_rate": 0.0001999895661015303, + "loss": 1.2095, + "step": 107 + }, + { + "epoch": 0.00511727078891258, + "grad_norm": 0.59375, + "learning_rate": 0.00019998934986451404, + "loss": 0.8567, + "step": 108 + }, + { + "epoch": 0.005164652925846956, + "grad_norm": 0.421875, + "learning_rate": 0.0001999891314098405, + "loss": 0.9423, + "step": 109 + }, + { + "epoch": 0.005212035062781332, + "grad_norm": 0.392578125, + "learning_rate": 0.00019998891073751452, + "loss": 0.8099, + "step": 110 + }, + { + "epoch": 0.005259417199715707, + "grad_norm": 0.51171875, + "learning_rate": 0.00019998868784754103, + "loss": 1.0169, + "step": 111 + }, + { + "epoch": 0.005306799336650083, + "grad_norm": 0.546875, + "learning_rate": 0.00019998846273992492, + "loss": 1.4406, + "step": 112 + }, + { + "epoch": 0.0053541814735844585, + "grad_norm": 0.52734375, + "learning_rate": 0.00019998823541467122, + "loss": 0.9936, + "step": 113 + }, + { + "epoch": 0.005401563610518834, + "grad_norm": 0.37890625, + "learning_rate": 0.00019998800587178495, + "loss": 1.3192, + "step": 114 + }, + { + "epoch": 0.00544894574745321, + "grad_norm": 0.4296875, + "learning_rate": 0.00019998777411127123, + "loss": 0.8875, + "step": 115 + }, + { + "epoch": 0.005496327884387586, + "grad_norm": 0.494140625, + "learning_rate": 0.00019998754013313515, + "loss": 0.9446, + "step": 116 + }, + { + "epoch": 0.005543710021321962, + "grad_norm": 0.63671875, + "learning_rate": 0.00019998730393738198, + "loss": 1.1031, + "step": 117 + }, + { + "epoch": 0.0055910921582563375, + "grad_norm": 0.4453125, + "learning_rate": 0.0001999870655240169, + "loss": 1.1828, + "step": 118 + }, + { + "epoch": 0.005638474295190713, + "grad_norm": 0.48828125, + "learning_rate": 0.00019998682489304515, + "loss": 1.3674, + "step": 119 + }, + { + "epoch": 0.005685856432125089, + "grad_norm": 0.50390625, + "learning_rate": 0.00019998658204447217, + "loss": 0.8399, + "step": 120 + }, + { + "epoch": 0.005733238569059464, + "grad_norm": 0.5, + "learning_rate": 0.0001999863369783033, + "loss": 1.0854, + "step": 121 + }, + { + "epoch": 0.00578062070599384, + "grad_norm": 0.59375, + "learning_rate": 0.000199986089694544, + "loss": 0.4969, + "step": 122 + }, + { + "epoch": 0.0058280028429282165, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001999858401931997, + "loss": 0.0215, + "step": 123 + }, + { + "epoch": 0.005875384979862592, + "grad_norm": 0.70703125, + "learning_rate": 0.00019998558847427597, + "loss": 1.1571, + "step": 124 + }, + { + "epoch": 0.005922767116796968, + "grad_norm": 0.515625, + "learning_rate": 0.00019998533453777838, + "loss": 1.3631, + "step": 125 + }, + { + "epoch": 0.005970149253731343, + "grad_norm": 0.515625, + "learning_rate": 0.0001999850783837126, + "loss": 0.6508, + "step": 126 + }, + { + "epoch": 0.006017531390665719, + "grad_norm": 0.63671875, + "learning_rate": 0.00019998482001208425, + "loss": 0.8974, + "step": 127 + }, + { + "epoch": 0.006064913527600095, + "grad_norm": 1.078125, + "learning_rate": 0.00019998455942289912, + "loss": 1.0397, + "step": 128 + }, + { + "epoch": 0.00611229566453447, + "grad_norm": 0.7578125, + "learning_rate": 0.00019998429661616292, + "loss": 0.8684, + "step": 129 + }, + { + "epoch": 0.006159677801468846, + "grad_norm": 0.51953125, + "learning_rate": 0.00019998403159188154, + "loss": 1.4368, + "step": 130 + }, + { + "epoch": 0.006207059938403222, + "grad_norm": 0.451171875, + "learning_rate": 0.00019998376435006082, + "loss": 0.6405, + "step": 131 + }, + { + "epoch": 0.006254442075337598, + "grad_norm": 0.65234375, + "learning_rate": 0.00019998349489070677, + "loss": 0.6771, + "step": 132 + }, + { + "epoch": 0.006301824212271974, + "grad_norm": 0.70703125, + "learning_rate": 0.00019998322321382523, + "loss": 1.0881, + "step": 133 + }, + { + "epoch": 0.006349206349206349, + "grad_norm": 0.30859375, + "learning_rate": 0.00019998294931942233, + "loss": 0.6219, + "step": 134 + }, + { + "epoch": 0.006396588486140725, + "grad_norm": 0.45703125, + "learning_rate": 0.0001999826732075041, + "loss": 0.9019, + "step": 135 + }, + { + "epoch": 0.0064439706230751004, + "grad_norm": 0.47265625, + "learning_rate": 0.00019998239487807666, + "loss": 1.1105, + "step": 136 + }, + { + "epoch": 0.006491352760009476, + "grad_norm": 0.50390625, + "learning_rate": 0.00019998211433114622, + "loss": 1.1331, + "step": 137 + }, + { + "epoch": 0.0065387348969438526, + "grad_norm": 1.3828125, + "learning_rate": 0.00019998183156671898, + "loss": 0.9425, + "step": 138 + }, + { + "epoch": 0.006586117033878228, + "grad_norm": 0.51171875, + "learning_rate": 0.00019998154658480122, + "loss": 0.9189, + "step": 139 + }, + { + "epoch": 0.006633499170812604, + "grad_norm": 1.140625, + "learning_rate": 0.00019998125938539924, + "loss": 0.8435, + "step": 140 + }, + { + "epoch": 0.0066808813077469794, + "grad_norm": 0.5625, + "learning_rate": 0.0001999809699685194, + "loss": 1.3486, + "step": 141 + }, + { + "epoch": 0.006728263444681355, + "grad_norm": 0.408203125, + "learning_rate": 0.00019998067833416815, + "loss": 1.1616, + "step": 142 + }, + { + "epoch": 0.006775645581615731, + "grad_norm": 0.8515625, + "learning_rate": 0.00019998038448235195, + "loss": 1.0803, + "step": 143 + }, + { + "epoch": 0.006823027718550106, + "grad_norm": 0.703125, + "learning_rate": 0.00019998008841307736, + "loss": 0.9512, + "step": 144 + }, + { + "epoch": 0.006870409855484482, + "grad_norm": 1.046875, + "learning_rate": 0.00019997979012635085, + "loss": 0.2863, + "step": 145 + }, + { + "epoch": 0.006917791992418858, + "grad_norm": 0.41796875, + "learning_rate": 0.00019997948962217912, + "loss": 0.7258, + "step": 146 + }, + { + "epoch": 0.006965174129353234, + "grad_norm": 0.43359375, + "learning_rate": 0.0001999791869005688, + "loss": 1.2192, + "step": 147 + }, + { + "epoch": 0.00701255626628761, + "grad_norm": 0.58203125, + "learning_rate": 0.0001999788819615266, + "loss": 1.2817, + "step": 148 + }, + { + "epoch": 0.007059938403221985, + "grad_norm": 0.419921875, + "learning_rate": 0.00019997857480505928, + "loss": 0.8469, + "step": 149 + }, + { + "epoch": 0.007107320540156361, + "grad_norm": 0.51171875, + "learning_rate": 0.0001999782654311737, + "loss": 0.6144, + "step": 150 + }, + { + "epoch": 0.0071547026770907365, + "grad_norm": 0.466796875, + "learning_rate": 0.00019997795383987663, + "loss": 1.7007, + "step": 151 + }, + { + "epoch": 0.007202084814025112, + "grad_norm": 0.380859375, + "learning_rate": 0.00019997764003117509, + "loss": 0.8697, + "step": 152 + }, + { + "epoch": 0.007249466950959489, + "grad_norm": 0.35546875, + "learning_rate": 0.00019997732400507597, + "loss": 1.0246, + "step": 153 + }, + { + "epoch": 0.007296849087893864, + "grad_norm": 0.4453125, + "learning_rate": 0.00019997700576158628, + "loss": 0.9222, + "step": 154 + }, + { + "epoch": 0.00734423122482824, + "grad_norm": 0.83984375, + "learning_rate": 0.00019997668530071308, + "loss": 1.0439, + "step": 155 + }, + { + "epoch": 0.0073916133617626155, + "grad_norm": 0.439453125, + "learning_rate": 0.0001999763626224635, + "loss": 1.2715, + "step": 156 + }, + { + "epoch": 0.007438995498696991, + "grad_norm": 0.33203125, + "learning_rate": 0.0001999760377268447, + "loss": 0.5978, + "step": 157 + }, + { + "epoch": 0.007486377635631367, + "grad_norm": 0.423828125, + "learning_rate": 0.00019997571061386386, + "loss": 1.2903, + "step": 158 + }, + { + "epoch": 0.007533759772565742, + "grad_norm": 0.5234375, + "learning_rate": 0.00019997538128352826, + "loss": 0.6958, + "step": 159 + }, + { + "epoch": 0.007581141909500119, + "grad_norm": 0.48828125, + "learning_rate": 0.0001999750497358452, + "loss": 1.2971, + "step": 160 + }, + { + "epoch": 0.0076285240464344945, + "grad_norm": 0.62890625, + "learning_rate": 0.000199974715970822, + "loss": 0.7183, + "step": 161 + }, + { + "epoch": 0.00767590618336887, + "grad_norm": 0.5703125, + "learning_rate": 0.0001999743799884661, + "loss": 1.6736, + "step": 162 + }, + { + "epoch": 0.007723288320303246, + "grad_norm": 0.376953125, + "learning_rate": 0.00019997404178878495, + "loss": 0.7535, + "step": 163 + }, + { + "epoch": 0.007770670457237621, + "grad_norm": 0.423828125, + "learning_rate": 0.00019997370137178603, + "loss": 1.0898, + "step": 164 + }, + { + "epoch": 0.007818052594171997, + "grad_norm": 0.298828125, + "learning_rate": 0.0001999733587374769, + "loss": 0.5674, + "step": 165 + }, + { + "epoch": 0.007865434731106373, + "grad_norm": 0.58203125, + "learning_rate": 0.00019997301388586519, + "loss": 0.8733, + "step": 166 + }, + { + "epoch": 0.007912816868040748, + "grad_norm": 0.890625, + "learning_rate": 0.00019997266681695845, + "loss": 0.6212, + "step": 167 + }, + { + "epoch": 0.007960199004975124, + "grad_norm": 0.435546875, + "learning_rate": 0.00019997231753076452, + "loss": 1.2933, + "step": 168 + }, + { + "epoch": 0.0080075811419095, + "grad_norm": 0.75, + "learning_rate": 0.00019997196602729102, + "loss": 0.9396, + "step": 169 + }, + { + "epoch": 0.008054963278843875, + "grad_norm": 0.482421875, + "learning_rate": 0.0001999716123065458, + "loss": 0.7431, + "step": 170 + }, + { + "epoch": 0.00810234541577825, + "grad_norm": 0.9296875, + "learning_rate": 0.00019997125636853676, + "loss": 1.022, + "step": 171 + }, + { + "epoch": 0.008149727552712628, + "grad_norm": 0.3359375, + "learning_rate": 0.00019997089821327172, + "loss": 0.5743, + "step": 172 + }, + { + "epoch": 0.008197109689647004, + "grad_norm": 0.58203125, + "learning_rate": 0.00019997053784075858, + "loss": 0.4988, + "step": 173 + }, + { + "epoch": 0.00824449182658138, + "grad_norm": 0.53125, + "learning_rate": 0.00019997017525100546, + "loss": 0.9723, + "step": 174 + }, + { + "epoch": 0.008291873963515755, + "grad_norm": 0.52734375, + "learning_rate": 0.00019996981044402033, + "loss": 1.2186, + "step": 175 + }, + { + "epoch": 0.00833925610045013, + "grad_norm": 0.5234375, + "learning_rate": 0.00019996944341981124, + "loss": 0.628, + "step": 176 + }, + { + "epoch": 0.008386638237384506, + "grad_norm": 0.3203125, + "learning_rate": 0.00019996907417838642, + "loss": 1.0701, + "step": 177 + }, + { + "epoch": 0.008434020374318882, + "grad_norm": 0.400390625, + "learning_rate": 0.00019996870271975402, + "loss": 0.8977, + "step": 178 + }, + { + "epoch": 0.008481402511253257, + "grad_norm": 0.4375, + "learning_rate": 0.00019996832904392226, + "loss": 1.2282, + "step": 179 + }, + { + "epoch": 0.008528784648187633, + "grad_norm": 0.44140625, + "learning_rate": 0.0001999679531508994, + "loss": 1.3241, + "step": 180 + }, + { + "epoch": 0.008576166785122009, + "grad_norm": 0.466796875, + "learning_rate": 0.0001999675750406939, + "loss": 0.7242, + "step": 181 + }, + { + "epoch": 0.008623548922056384, + "grad_norm": 0.50390625, + "learning_rate": 0.00019996719471331403, + "loss": 0.5549, + "step": 182 + }, + { + "epoch": 0.00867093105899076, + "grad_norm": 0.412109375, + "learning_rate": 0.00019996681216876826, + "loss": 0.8259, + "step": 183 + }, + { + "epoch": 0.008718313195925136, + "grad_norm": 0.5, + "learning_rate": 0.00019996642740706508, + "loss": 1.0661, + "step": 184 + }, + { + "epoch": 0.008765695332859511, + "grad_norm": 0.45703125, + "learning_rate": 0.000199966040428213, + "loss": 1.3544, + "step": 185 + }, + { + "epoch": 0.008813077469793887, + "grad_norm": 0.515625, + "learning_rate": 0.00019996565123222066, + "loss": 1.1912, + "step": 186 + }, + { + "epoch": 0.008860459606728264, + "grad_norm": 0.44140625, + "learning_rate": 0.00019996525981909663, + "loss": 0.4306, + "step": 187 + }, + { + "epoch": 0.00890784174366264, + "grad_norm": 0.734375, + "learning_rate": 0.0001999648661888496, + "loss": 0.7892, + "step": 188 + }, + { + "epoch": 0.008955223880597015, + "grad_norm": 0.455078125, + "learning_rate": 0.00019996447034148837, + "loss": 1.3539, + "step": 189 + }, + { + "epoch": 0.009002606017531391, + "grad_norm": 0.486328125, + "learning_rate": 0.00019996407227702162, + "loss": 1.5831, + "step": 190 + }, + { + "epoch": 0.009049988154465767, + "grad_norm": 0.57421875, + "learning_rate": 0.00019996367199545824, + "loss": 0.1309, + "step": 191 + }, + { + "epoch": 0.009097370291400142, + "grad_norm": 0.310546875, + "learning_rate": 0.00019996326949680708, + "loss": 1.0598, + "step": 192 + }, + { + "epoch": 0.009144752428334518, + "grad_norm": 0.40234375, + "learning_rate": 0.00019996286478107708, + "loss": 0.7607, + "step": 193 + }, + { + "epoch": 0.009192134565268894, + "grad_norm": 0.48828125, + "learning_rate": 0.00019996245784827723, + "loss": 0.9167, + "step": 194 + }, + { + "epoch": 0.00923951670220327, + "grad_norm": 0.396484375, + "learning_rate": 0.00019996204869841654, + "loss": 0.5669, + "step": 195 + }, + { + "epoch": 0.009286898839137645, + "grad_norm": 0.42578125, + "learning_rate": 0.00019996163733150408, + "loss": 0.2394, + "step": 196 + }, + { + "epoch": 0.00933428097607202, + "grad_norm": 0.515625, + "learning_rate": 0.00019996122374754896, + "loss": 0.5832, + "step": 197 + }, + { + "epoch": 0.009381663113006396, + "grad_norm": 0.365234375, + "learning_rate": 0.00019996080794656038, + "loss": 0.8465, + "step": 198 + }, + { + "epoch": 0.009429045249940772, + "grad_norm": 0.6875, + "learning_rate": 0.00019996038992854757, + "loss": 0.631, + "step": 199 + }, + { + "epoch": 0.009476427386875147, + "grad_norm": 0.6484375, + "learning_rate": 0.00019995996969351978, + "loss": 1.4833, + "step": 200 + }, + { + "epoch": 0.009523809523809525, + "grad_norm": 0.5234375, + "learning_rate": 0.0001999595472414863, + "loss": 1.349, + "step": 201 + }, + { + "epoch": 0.0095711916607439, + "grad_norm": 0.431640625, + "learning_rate": 0.0001999591225724566, + "loss": 0.9398, + "step": 202 + }, + { + "epoch": 0.009618573797678276, + "grad_norm": 0.396484375, + "learning_rate": 0.00019995869568643996, + "loss": 0.6837, + "step": 203 + }, + { + "epoch": 0.009665955934612652, + "grad_norm": 0.37890625, + "learning_rate": 0.000199958266583446, + "loss": 0.6042, + "step": 204 + }, + { + "epoch": 0.009713338071547027, + "grad_norm": 0.421875, + "learning_rate": 0.0001999578352634841, + "loss": 1.0077, + "step": 205 + }, + { + "epoch": 0.009760720208481403, + "grad_norm": 1.3828125, + "learning_rate": 0.00019995740172656386, + "loss": 0.671, + "step": 206 + }, + { + "epoch": 0.009808102345415778, + "grad_norm": 1.046875, + "learning_rate": 0.00019995696597269498, + "loss": 0.7752, + "step": 207 + }, + { + "epoch": 0.009855484482350154, + "grad_norm": 0.52734375, + "learning_rate": 0.00019995652800188705, + "loss": 1.3379, + "step": 208 + }, + { + "epoch": 0.00990286661928453, + "grad_norm": 1.5234375, + "learning_rate": 0.00019995608781414977, + "loss": 1.0481, + "step": 209 + }, + { + "epoch": 0.009950248756218905, + "grad_norm": 0.4609375, + "learning_rate": 0.00019995564540949297, + "loss": 1.1031, + "step": 210 + }, + { + "epoch": 0.009997630893153281, + "grad_norm": 0.49609375, + "learning_rate": 0.0001999552007879264, + "loss": 0.9984, + "step": 211 + }, + { + "epoch": 0.010045013030087657, + "grad_norm": 0.498046875, + "learning_rate": 0.00019995475394945996, + "loss": 1.442, + "step": 212 + }, + { + "epoch": 0.010092395167022032, + "grad_norm": 0.58203125, + "learning_rate": 0.00019995430489410353, + "loss": 1.1315, + "step": 213 + }, + { + "epoch": 0.010139777303956408, + "grad_norm": 0.5, + "learning_rate": 0.00019995385362186707, + "loss": 1.4601, + "step": 214 + }, + { + "epoch": 0.010187159440890783, + "grad_norm": 0.48828125, + "learning_rate": 0.00019995340013276064, + "loss": 1.091, + "step": 215 + }, + { + "epoch": 0.01023454157782516, + "grad_norm": 0.412109375, + "learning_rate": 0.00019995294442679421, + "loss": 0.94, + "step": 216 + }, + { + "epoch": 0.010281923714759536, + "grad_norm": 0.55078125, + "learning_rate": 0.00019995248650397799, + "loss": 1.067, + "step": 217 + }, + { + "epoch": 0.010329305851693912, + "grad_norm": 0.84765625, + "learning_rate": 0.00019995202636432203, + "loss": 1.0678, + "step": 218 + }, + { + "epoch": 0.010376687988628288, + "grad_norm": 0.49609375, + "learning_rate": 0.00019995156400783663, + "loss": 0.968, + "step": 219 + }, + { + "epoch": 0.010424070125562663, + "grad_norm": 0.52734375, + "learning_rate": 0.00019995109943453198, + "loss": 1.2878, + "step": 220 + }, + { + "epoch": 0.010471452262497039, + "grad_norm": 0.408203125, + "learning_rate": 0.00019995063264441844, + "loss": 1.0963, + "step": 221 + }, + { + "epoch": 0.010518834399431415, + "grad_norm": 0.50390625, + "learning_rate": 0.0001999501636375063, + "loss": 1.3468, + "step": 222 + }, + { + "epoch": 0.01056621653636579, + "grad_norm": 0.375, + "learning_rate": 0.000199949692413806, + "loss": 0.6686, + "step": 223 + }, + { + "epoch": 0.010613598673300166, + "grad_norm": 0.4453125, + "learning_rate": 0.000199949218973328, + "loss": 1.2847, + "step": 224 + }, + { + "epoch": 0.010660980810234541, + "grad_norm": 0.5, + "learning_rate": 0.0001999487433160828, + "loss": 1.4145, + "step": 225 + }, + { + "epoch": 0.010708362947168917, + "grad_norm": 0.58203125, + "learning_rate": 0.00019994826544208086, + "loss": 1.1932, + "step": 226 + }, + { + "epoch": 0.010755745084103293, + "grad_norm": 0.57421875, + "learning_rate": 0.00019994778535133292, + "loss": 1.2457, + "step": 227 + }, + { + "epoch": 0.010803127221037668, + "grad_norm": 0.388671875, + "learning_rate": 0.00019994730304384955, + "loss": 0.5284, + "step": 228 + }, + { + "epoch": 0.010850509357972044, + "grad_norm": 0.625, + "learning_rate": 0.00019994681851964144, + "loss": 1.2445, + "step": 229 + }, + { + "epoch": 0.01089789149490642, + "grad_norm": 0.447265625, + "learning_rate": 0.00019994633177871935, + "loss": 1.1842, + "step": 230 + }, + { + "epoch": 0.010945273631840797, + "grad_norm": 0.482421875, + "learning_rate": 0.0001999458428210941, + "loss": 1.0567, + "step": 231 + }, + { + "epoch": 0.010992655768775172, + "grad_norm": 0.59375, + "learning_rate": 0.00019994535164677651, + "loss": 0.9081, + "step": 232 + }, + { + "epoch": 0.011040037905709548, + "grad_norm": 0.390625, + "learning_rate": 0.00019994485825577748, + "loss": 0.788, + "step": 233 + }, + { + "epoch": 0.011087420042643924, + "grad_norm": 0.44921875, + "learning_rate": 0.0001999443626481079, + "loss": 1.2978, + "step": 234 + }, + { + "epoch": 0.0111348021795783, + "grad_norm": 0.498046875, + "learning_rate": 0.00019994386482377887, + "loss": 1.8736, + "step": 235 + }, + { + "epoch": 0.011182184316512675, + "grad_norm": 0.400390625, + "learning_rate": 0.00019994336478280138, + "loss": 0.6525, + "step": 236 + }, + { + "epoch": 0.01122956645344705, + "grad_norm": 0.4765625, + "learning_rate": 0.00019994286252518646, + "loss": 1.6496, + "step": 237 + }, + { + "epoch": 0.011276948590381426, + "grad_norm": 0.53125, + "learning_rate": 0.00019994235805094536, + "loss": 0.8822, + "step": 238 + }, + { + "epoch": 0.011324330727315802, + "grad_norm": 0.68359375, + "learning_rate": 0.00019994185136008917, + "loss": 0.8822, + "step": 239 + }, + { + "epoch": 0.011371712864250177, + "grad_norm": 0.435546875, + "learning_rate": 0.00019994134245262922, + "loss": 1.0044, + "step": 240 + }, + { + "epoch": 0.011419095001184553, + "grad_norm": 0.41796875, + "learning_rate": 0.0001999408313285767, + "loss": 1.0126, + "step": 241 + }, + { + "epoch": 0.011466477138118929, + "grad_norm": 0.369140625, + "learning_rate": 0.00019994031798794298, + "loss": 1.2154, + "step": 242 + }, + { + "epoch": 0.011513859275053304, + "grad_norm": 0.482421875, + "learning_rate": 0.0001999398024307395, + "loss": 1.3496, + "step": 243 + }, + { + "epoch": 0.01156124141198768, + "grad_norm": 0.34375, + "learning_rate": 0.00019993928465697765, + "loss": 1.4015, + "step": 244 + }, + { + "epoch": 0.011608623548922056, + "grad_norm": 0.44140625, + "learning_rate": 0.0001999387646666689, + "loss": 1.2544, + "step": 245 + }, + { + "epoch": 0.011656005685856433, + "grad_norm": 0.330078125, + "learning_rate": 0.0001999382424598248, + "loss": 0.6131, + "step": 246 + }, + { + "epoch": 0.011703387822790809, + "grad_norm": 0.58984375, + "learning_rate": 0.00019993771803645695, + "loss": 0.7921, + "step": 247 + }, + { + "epoch": 0.011750769959725184, + "grad_norm": 0.48046875, + "learning_rate": 0.00019993719139657694, + "loss": 1.1962, + "step": 248 + }, + { + "epoch": 0.01179815209665956, + "grad_norm": 0.4609375, + "learning_rate": 0.00019993666254019648, + "loss": 1.268, + "step": 249 + }, + { + "epoch": 0.011845534233593935, + "grad_norm": 0.458984375, + "learning_rate": 0.0001999361314673273, + "loss": 0.8002, + "step": 250 + }, + { + "epoch": 0.011892916370528311, + "grad_norm": 0.5, + "learning_rate": 0.00019993559817798118, + "loss": 0.6962, + "step": 251 + }, + { + "epoch": 0.011940298507462687, + "grad_norm": 0.50390625, + "learning_rate": 0.00019993506267216993, + "loss": 0.9077, + "step": 252 + }, + { + "epoch": 0.011987680644397062, + "grad_norm": 0.6640625, + "learning_rate": 0.00019993452494990543, + "loss": 0.9264, + "step": 253 + }, + { + "epoch": 0.012035062781331438, + "grad_norm": 0.375, + "learning_rate": 0.0001999339850111996, + "loss": 1.1098, + "step": 254 + }, + { + "epoch": 0.012082444918265814, + "grad_norm": 0.35546875, + "learning_rate": 0.00019993344285606447, + "loss": 1.0, + "step": 255 + }, + { + "epoch": 0.01212982705520019, + "grad_norm": 0.404296875, + "learning_rate": 0.00019993289848451197, + "loss": 1.2045, + "step": 256 + }, + { + "epoch": 0.012177209192134565, + "grad_norm": 0.453125, + "learning_rate": 0.00019993235189655426, + "loss": 0.9974, + "step": 257 + }, + { + "epoch": 0.01222459132906894, + "grad_norm": 0.70703125, + "learning_rate": 0.0001999318030922034, + "loss": 0.9157, + "step": 258 + }, + { + "epoch": 0.012271973466003316, + "grad_norm": 0.73046875, + "learning_rate": 0.0001999312520714716, + "loss": 0.7541, + "step": 259 + }, + { + "epoch": 0.012319355602937692, + "grad_norm": 0.498046875, + "learning_rate": 0.0001999306988343711, + "loss": 0.8848, + "step": 260 + }, + { + "epoch": 0.012366737739872069, + "grad_norm": 0.4375, + "learning_rate": 0.00019993014338091412, + "loss": 1.1219, + "step": 261 + }, + { + "epoch": 0.012414119876806445, + "grad_norm": 0.51953125, + "learning_rate": 0.000199929585711113, + "loss": 1.0632, + "step": 262 + }, + { + "epoch": 0.01246150201374082, + "grad_norm": 0.4140625, + "learning_rate": 0.0001999290258249801, + "loss": 0.7072, + "step": 263 + }, + { + "epoch": 0.012508884150675196, + "grad_norm": 0.384765625, + "learning_rate": 0.00019992846372252787, + "loss": 1.2486, + "step": 264 + }, + { + "epoch": 0.012556266287609572, + "grad_norm": 0.478515625, + "learning_rate": 0.00019992789940376872, + "loss": 0.6055, + "step": 265 + }, + { + "epoch": 0.012603648424543947, + "grad_norm": 0.61328125, + "learning_rate": 0.00019992733286871523, + "loss": 0.7229, + "step": 266 + }, + { + "epoch": 0.012651030561478323, + "grad_norm": 0.76953125, + "learning_rate": 0.00019992676411737992, + "loss": 0.4119, + "step": 267 + }, + { + "epoch": 0.012698412698412698, + "grad_norm": 0.384765625, + "learning_rate": 0.00019992619314977543, + "loss": 1.1707, + "step": 268 + }, + { + "epoch": 0.012745794835347074, + "grad_norm": 0.68359375, + "learning_rate": 0.0001999256199659144, + "loss": 0.8128, + "step": 269 + }, + { + "epoch": 0.01279317697228145, + "grad_norm": 0.46875, + "learning_rate": 0.0001999250445658096, + "loss": 1.109, + "step": 270 + }, + { + "epoch": 0.012840559109215825, + "grad_norm": 0.462890625, + "learning_rate": 0.00019992446694947367, + "loss": 1.1046, + "step": 271 + }, + { + "epoch": 0.012887941246150201, + "grad_norm": 0.64453125, + "learning_rate": 0.00019992388711691955, + "loss": 0.9036, + "step": 272 + }, + { + "epoch": 0.012935323383084577, + "grad_norm": 0.443359375, + "learning_rate": 0.00019992330506816001, + "loss": 1.2072, + "step": 273 + }, + { + "epoch": 0.012982705520018952, + "grad_norm": 1.0859375, + "learning_rate": 0.00019992272080320803, + "loss": 0.0806, + "step": 274 + }, + { + "epoch": 0.013030087656953328, + "grad_norm": 0.59375, + "learning_rate": 0.00019992213432207655, + "loss": 1.1417, + "step": 275 + }, + { + "epoch": 0.013077469793887705, + "grad_norm": 0.609375, + "learning_rate": 0.00019992154562477855, + "loss": 1.5429, + "step": 276 + }, + { + "epoch": 0.01312485193082208, + "grad_norm": 0.41015625, + "learning_rate": 0.00019992095471132705, + "loss": 0.7147, + "step": 277 + }, + { + "epoch": 0.013172234067756456, + "grad_norm": 0.42578125, + "learning_rate": 0.00019992036158173525, + "loss": 1.4001, + "step": 278 + }, + { + "epoch": 0.013219616204690832, + "grad_norm": 0.43359375, + "learning_rate": 0.00019991976623601624, + "loss": 0.8493, + "step": 279 + }, + { + "epoch": 0.013266998341625208, + "grad_norm": 0.466796875, + "learning_rate": 0.00019991916867418327, + "loss": 0.8411, + "step": 280 + }, + { + "epoch": 0.013314380478559583, + "grad_norm": 0.98046875, + "learning_rate": 0.00019991856889624957, + "loss": 1.1472, + "step": 281 + }, + { + "epoch": 0.013361762615493959, + "grad_norm": 0.6171875, + "learning_rate": 0.00019991796690222843, + "loss": 1.1025, + "step": 282 + }, + { + "epoch": 0.013409144752428334, + "grad_norm": 0.53515625, + "learning_rate": 0.00019991736269213322, + "loss": 1.2855, + "step": 283 + }, + { + "epoch": 0.01345652688936271, + "grad_norm": 0.462890625, + "learning_rate": 0.0001999167562659773, + "loss": 0.8279, + "step": 284 + }, + { + "epoch": 0.013503909026297086, + "grad_norm": 0.515625, + "learning_rate": 0.00019991614762377417, + "loss": 1.1669, + "step": 285 + }, + { + "epoch": 0.013551291163231461, + "grad_norm": 0.7734375, + "learning_rate": 0.00019991553676553734, + "loss": 0.5494, + "step": 286 + }, + { + "epoch": 0.013598673300165837, + "grad_norm": 0.423828125, + "learning_rate": 0.0001999149236912803, + "loss": 0.7617, + "step": 287 + }, + { + "epoch": 0.013646055437100213, + "grad_norm": 0.40234375, + "learning_rate": 0.00019991430840101668, + "loss": 0.9924, + "step": 288 + }, + { + "epoch": 0.013693437574034588, + "grad_norm": 0.462890625, + "learning_rate": 0.00019991369089476013, + "loss": 0.924, + "step": 289 + }, + { + "epoch": 0.013740819710968964, + "grad_norm": 0.48828125, + "learning_rate": 0.00019991307117252433, + "loss": 1.1997, + "step": 290 + }, + { + "epoch": 0.013788201847903341, + "grad_norm": 0.62890625, + "learning_rate": 0.00019991244923432303, + "loss": 0.4997, + "step": 291 + }, + { + "epoch": 0.013835583984837717, + "grad_norm": 0.4921875, + "learning_rate": 0.00019991182508017006, + "loss": 0.5573, + "step": 292 + }, + { + "epoch": 0.013882966121772092, + "grad_norm": 0.59765625, + "learning_rate": 0.0001999111987100792, + "loss": 1.2159, + "step": 293 + }, + { + "epoch": 0.013930348258706468, + "grad_norm": 0.80859375, + "learning_rate": 0.0001999105701240644, + "loss": 0.5836, + "step": 294 + }, + { + "epoch": 0.013977730395640844, + "grad_norm": 0.30078125, + "learning_rate": 0.00019990993932213956, + "loss": 0.5885, + "step": 295 + }, + { + "epoch": 0.01402511253257522, + "grad_norm": 0.25390625, + "learning_rate": 0.00019990930630431865, + "loss": 0.0234, + "step": 296 + }, + { + "epoch": 0.014072494669509595, + "grad_norm": 0.443359375, + "learning_rate": 0.0001999086710706158, + "loss": 1.3111, + "step": 297 + }, + { + "epoch": 0.01411987680644397, + "grad_norm": 0.490234375, + "learning_rate": 0.000199908033621045, + "loss": 0.678, + "step": 298 + }, + { + "epoch": 0.014167258943378346, + "grad_norm": 0.419921875, + "learning_rate": 0.00019990739395562047, + "loss": 1.1531, + "step": 299 + }, + { + "epoch": 0.014214641080312722, + "grad_norm": 0.28125, + "learning_rate": 0.00019990675207435634, + "loss": 0.0211, + "step": 300 + }, + { + "epoch": 0.014262023217247097, + "grad_norm": 0.875, + "learning_rate": 0.00019990610797726688, + "loss": 0.8344, + "step": 301 + }, + { + "epoch": 0.014309405354181473, + "grad_norm": 0.43359375, + "learning_rate": 0.00019990546166436635, + "loss": 1.1513, + "step": 302 + }, + { + "epoch": 0.014356787491115849, + "grad_norm": 0.56640625, + "learning_rate": 0.00019990481313566906, + "loss": 1.1517, + "step": 303 + }, + { + "epoch": 0.014404169628050224, + "grad_norm": 0.478515625, + "learning_rate": 0.0001999041623911895, + "loss": 1.2003, + "step": 304 + }, + { + "epoch": 0.0144515517649846, + "grad_norm": 0.359375, + "learning_rate": 0.00019990350943094196, + "loss": 0.6667, + "step": 305 + }, + { + "epoch": 0.014498933901918977, + "grad_norm": 0.49609375, + "learning_rate": 0.00019990285425494105, + "loss": 1.274, + "step": 306 + }, + { + "epoch": 0.014546316038853353, + "grad_norm": 0.515625, + "learning_rate": 0.0001999021968632012, + "loss": 1.4139, + "step": 307 + }, + { + "epoch": 0.014593698175787729, + "grad_norm": 0.5546875, + "learning_rate": 0.00019990153725573705, + "loss": 0.7986, + "step": 308 + }, + { + "epoch": 0.014641080312722104, + "grad_norm": 0.3359375, + "learning_rate": 0.00019990087543256323, + "loss": 0.6945, + "step": 309 + }, + { + "epoch": 0.01468846244965648, + "grad_norm": 0.609375, + "learning_rate": 0.00019990021139369436, + "loss": 0.6369, + "step": 310 + }, + { + "epoch": 0.014735844586590855, + "grad_norm": 0.50390625, + "learning_rate": 0.00019989954513914527, + "loss": 0.8179, + "step": 311 + }, + { + "epoch": 0.014783226723525231, + "grad_norm": 0.435546875, + "learning_rate": 0.00019989887666893062, + "loss": 0.7585, + "step": 312 + }, + { + "epoch": 0.014830608860459607, + "grad_norm": 0.388671875, + "learning_rate": 0.00019989820598306532, + "loss": 1.2037, + "step": 313 + }, + { + "epoch": 0.014877990997393982, + "grad_norm": 0.9453125, + "learning_rate": 0.00019989753308156423, + "loss": 0.9347, + "step": 314 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 0.44921875, + "learning_rate": 0.00019989685796444225, + "loss": 1.1497, + "step": 315 + }, + { + "epoch": 0.014972755271262734, + "grad_norm": 0.890625, + "learning_rate": 0.00019989618063171436, + "loss": 0.8473, + "step": 316 + }, + { + "epoch": 0.01502013740819711, + "grad_norm": 0.8671875, + "learning_rate": 0.0001998955010833956, + "loss": 0.8583, + "step": 317 + }, + { + "epoch": 0.015067519545131485, + "grad_norm": 0.69921875, + "learning_rate": 0.00019989481931950102, + "loss": 0.7101, + "step": 318 + }, + { + "epoch": 0.01511490168206586, + "grad_norm": 0.453125, + "learning_rate": 0.00019989413534004575, + "loss": 0.9949, + "step": 319 + }, + { + "epoch": 0.015162283819000238, + "grad_norm": 0.515625, + "learning_rate": 0.00019989344914504497, + "loss": 1.3381, + "step": 320 + }, + { + "epoch": 0.015209665955934613, + "grad_norm": 0.390625, + "learning_rate": 0.0001998927607345139, + "loss": 0.9605, + "step": 321 + }, + { + "epoch": 0.015257048092868989, + "grad_norm": 0.96484375, + "learning_rate": 0.00019989207010846777, + "loss": 0.8851, + "step": 322 + }, + { + "epoch": 0.015304430229803365, + "grad_norm": 0.423828125, + "learning_rate": 0.00019989137726692194, + "loss": 1.1357, + "step": 323 + }, + { + "epoch": 0.01535181236673774, + "grad_norm": 0.4921875, + "learning_rate": 0.00019989068220989175, + "loss": 0.9308, + "step": 324 + }, + { + "epoch": 0.015399194503672116, + "grad_norm": 0.462890625, + "learning_rate": 0.00019988998493739263, + "loss": 1.1891, + "step": 325 + }, + { + "epoch": 0.015446576640606492, + "grad_norm": 0.5703125, + "learning_rate": 0.00019988928544944007, + "loss": 0.6323, + "step": 326 + }, + { + "epoch": 0.015493958777540867, + "grad_norm": 0.578125, + "learning_rate": 0.00019988858374604953, + "loss": 1.0981, + "step": 327 + }, + { + "epoch": 0.015541340914475243, + "grad_norm": 0.39453125, + "learning_rate": 0.0001998878798272366, + "loss": 1.325, + "step": 328 + }, + { + "epoch": 0.015588723051409618, + "grad_norm": 0.36328125, + "learning_rate": 0.00019988717369301688, + "loss": 0.0412, + "step": 329 + }, + { + "epoch": 0.015636105188343994, + "grad_norm": 0.51953125, + "learning_rate": 0.00019988646534340606, + "loss": 0.5804, + "step": 330 + }, + { + "epoch": 0.01568348732527837, + "grad_norm": 0.37890625, + "learning_rate": 0.00019988575477841985, + "loss": 1.3774, + "step": 331 + }, + { + "epoch": 0.015730869462212745, + "grad_norm": 0.28125, + "learning_rate": 0.00019988504199807395, + "loss": 0.6024, + "step": 332 + }, + { + "epoch": 0.01577825159914712, + "grad_norm": 0.423828125, + "learning_rate": 0.00019988432700238424, + "loss": 1.3106, + "step": 333 + }, + { + "epoch": 0.015825633736081497, + "grad_norm": 0.69921875, + "learning_rate": 0.00019988360979136655, + "loss": 0.9213, + "step": 334 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 0.484375, + "learning_rate": 0.0001998828903650368, + "loss": 1.7168, + "step": 335 + }, + { + "epoch": 0.015920398009950248, + "grad_norm": 0.5, + "learning_rate": 0.0001998821687234109, + "loss": 0.8373, + "step": 336 + }, + { + "epoch": 0.015967780146884623, + "grad_norm": 0.494140625, + "learning_rate": 0.00019988144486650491, + "loss": 1.3647, + "step": 337 + }, + { + "epoch": 0.016015162283819, + "grad_norm": 0.427734375, + "learning_rate": 0.00019988071879433483, + "loss": 0.8493, + "step": 338 + }, + { + "epoch": 0.016062544420753375, + "grad_norm": 0.578125, + "learning_rate": 0.0001998799905069168, + "loss": 0.9083, + "step": 339 + }, + { + "epoch": 0.01610992655768775, + "grad_norm": 0.427734375, + "learning_rate": 0.00019987926000426703, + "loss": 1.4788, + "step": 340 + }, + { + "epoch": 0.016157308694622126, + "grad_norm": 0.34375, + "learning_rate": 0.0001998785272864016, + "loss": 0.0531, + "step": 341 + }, + { + "epoch": 0.0162046908315565, + "grad_norm": 0.482421875, + "learning_rate": 0.00019987779235333683, + "loss": 0.5946, + "step": 342 + }, + { + "epoch": 0.01625207296849088, + "grad_norm": 0.44921875, + "learning_rate": 0.000199877055205089, + "loss": 1.3108, + "step": 343 + }, + { + "epoch": 0.016299455105425256, + "grad_norm": 0.8359375, + "learning_rate": 0.0001998763158416745, + "loss": 0.6112, + "step": 344 + }, + { + "epoch": 0.016346837242359632, + "grad_norm": 0.421875, + "learning_rate": 0.0001998755742631097, + "loss": 0.7658, + "step": 345 + }, + { + "epoch": 0.016394219379294007, + "grad_norm": 0.625, + "learning_rate": 0.000199874830469411, + "loss": 0.6044, + "step": 346 + }, + { + "epoch": 0.016441601516228383, + "grad_norm": 0.53125, + "learning_rate": 0.000199874084460595, + "loss": 1.118, + "step": 347 + }, + { + "epoch": 0.01648898365316276, + "grad_norm": 0.6015625, + "learning_rate": 0.00019987333623667814, + "loss": 0.4469, + "step": 348 + }, + { + "epoch": 0.016536365790097134, + "grad_norm": 0.49609375, + "learning_rate": 0.0001998725857976771, + "loss": 0.9925, + "step": 349 + }, + { + "epoch": 0.01658374792703151, + "grad_norm": 0.515625, + "learning_rate": 0.00019987183314360848, + "loss": 1.8437, + "step": 350 + }, + { + "epoch": 0.016631130063965886, + "grad_norm": 0.41796875, + "learning_rate": 0.00019987107827448895, + "loss": 1.2108, + "step": 351 + }, + { + "epoch": 0.01667851220090026, + "grad_norm": 0.451171875, + "learning_rate": 0.00019987032119033528, + "loss": 1.193, + "step": 352 + }, + { + "epoch": 0.016725894337834637, + "grad_norm": 0.59375, + "learning_rate": 0.0001998695618911643, + "loss": 1.4704, + "step": 353 + }, + { + "epoch": 0.016773276474769012, + "grad_norm": 0.4921875, + "learning_rate": 0.00019986880037699278, + "loss": 1.2658, + "step": 354 + }, + { + "epoch": 0.016820658611703388, + "grad_norm": 0.29296875, + "learning_rate": 0.00019986803664783767, + "loss": 0.5875, + "step": 355 + }, + { + "epoch": 0.016868040748637764, + "grad_norm": 0.384765625, + "learning_rate": 0.00019986727070371587, + "loss": 1.1199, + "step": 356 + }, + { + "epoch": 0.01691542288557214, + "grad_norm": 0.412109375, + "learning_rate": 0.00019986650254464437, + "loss": 0.5635, + "step": 357 + }, + { + "epoch": 0.016962805022506515, + "grad_norm": 0.494140625, + "learning_rate": 0.00019986573217064022, + "loss": 1.0004, + "step": 358 + }, + { + "epoch": 0.01701018715944089, + "grad_norm": 0.55078125, + "learning_rate": 0.00019986495958172054, + "loss": 1.3553, + "step": 359 + }, + { + "epoch": 0.017057569296375266, + "grad_norm": 0.90625, + "learning_rate": 0.00019986418477790235, + "loss": 0.6413, + "step": 360 + }, + { + "epoch": 0.017104951433309642, + "grad_norm": 0.0869140625, + "learning_rate": 0.00019986340775920297, + "loss": 0.0047, + "step": 361 + }, + { + "epoch": 0.017152333570244017, + "grad_norm": 0.92578125, + "learning_rate": 0.00019986262852563958, + "loss": 0.7254, + "step": 362 + }, + { + "epoch": 0.017199715707178393, + "grad_norm": 1.3203125, + "learning_rate": 0.00019986184707722945, + "loss": 0.4382, + "step": 363 + }, + { + "epoch": 0.01724709784411277, + "grad_norm": 0.408203125, + "learning_rate": 0.00019986106341398992, + "loss": 0.7554, + "step": 364 + }, + { + "epoch": 0.017294479981047144, + "grad_norm": 1.0546875, + "learning_rate": 0.00019986027753593835, + "loss": 0.866, + "step": 365 + }, + { + "epoch": 0.01734186211798152, + "grad_norm": 0.46875, + "learning_rate": 0.00019985948944309221, + "loss": 1.6714, + "step": 366 + }, + { + "epoch": 0.017389244254915896, + "grad_norm": 0.765625, + "learning_rate": 0.00019985869913546894, + "loss": 0.4531, + "step": 367 + }, + { + "epoch": 0.01743662639185027, + "grad_norm": 0.384765625, + "learning_rate": 0.00019985790661308613, + "loss": 0.7205, + "step": 368 + }, + { + "epoch": 0.017484008528784647, + "grad_norm": 0.609375, + "learning_rate": 0.00019985711187596125, + "loss": 0.3726, + "step": 369 + }, + { + "epoch": 0.017531390665719022, + "grad_norm": 0.765625, + "learning_rate": 0.00019985631492411206, + "loss": 1.2968, + "step": 370 + }, + { + "epoch": 0.017578772802653398, + "grad_norm": 0.859375, + "learning_rate": 0.00019985551575755613, + "loss": 0.7039, + "step": 371 + }, + { + "epoch": 0.017626154939587774, + "grad_norm": 0.390625, + "learning_rate": 0.0001998547143763112, + "loss": 1.1429, + "step": 372 + }, + { + "epoch": 0.017673537076522153, + "grad_norm": 0.451171875, + "learning_rate": 0.00019985391078039514, + "loss": 0.865, + "step": 373 + }, + { + "epoch": 0.01772091921345653, + "grad_norm": 0.357421875, + "learning_rate": 0.00019985310496982564, + "loss": 1.5814, + "step": 374 + }, + { + "epoch": 0.017768301350390904, + "grad_norm": 0.478515625, + "learning_rate": 0.00019985229694462065, + "loss": 1.4083, + "step": 375 + }, + { + "epoch": 0.01781568348732528, + "grad_norm": 0.703125, + "learning_rate": 0.00019985148670479804, + "loss": 0.3757, + "step": 376 + }, + { + "epoch": 0.017863065624259655, + "grad_norm": 0.45703125, + "learning_rate": 0.00019985067425037583, + "loss": 1.2033, + "step": 377 + }, + { + "epoch": 0.01791044776119403, + "grad_norm": 0.466796875, + "learning_rate": 0.00019984985958137203, + "loss": 1.2772, + "step": 378 + }, + { + "epoch": 0.017957829898128407, + "grad_norm": 1.328125, + "learning_rate": 0.0001998490426978047, + "loss": 1.0075, + "step": 379 + }, + { + "epoch": 0.018005212035062782, + "grad_norm": 0.7578125, + "learning_rate": 0.00019984822359969196, + "loss": 0.7024, + "step": 380 + }, + { + "epoch": 0.018052594171997158, + "grad_norm": 0.474609375, + "learning_rate": 0.00019984740228705196, + "loss": 1.2398, + "step": 381 + }, + { + "epoch": 0.018099976308931533, + "grad_norm": 0.416015625, + "learning_rate": 0.00019984657875990296, + "loss": 0.8402, + "step": 382 + }, + { + "epoch": 0.01814735844586591, + "grad_norm": 0.5703125, + "learning_rate": 0.00019984575301826315, + "loss": 1.4791, + "step": 383 + }, + { + "epoch": 0.018194740582800285, + "grad_norm": 0.490234375, + "learning_rate": 0.00019984492506215092, + "loss": 1.2764, + "step": 384 + }, + { + "epoch": 0.01824212271973466, + "grad_norm": 0.47265625, + "learning_rate": 0.0001998440948915846, + "loss": 0.9856, + "step": 385 + }, + { + "epoch": 0.018289504856669036, + "grad_norm": 0.56640625, + "learning_rate": 0.0001998432625065826, + "loss": 0.8699, + "step": 386 + }, + { + "epoch": 0.01833688699360341, + "grad_norm": 0.486328125, + "learning_rate": 0.00019984242790716339, + "loss": 0.9452, + "step": 387 + }, + { + "epoch": 0.018384269130537787, + "grad_norm": 0.08740234375, + "learning_rate": 0.00019984159109334547, + "loss": 0.0059, + "step": 388 + }, + { + "epoch": 0.018431651267472163, + "grad_norm": 0.458984375, + "learning_rate": 0.00019984075206514742, + "loss": 1.2753, + "step": 389 + }, + { + "epoch": 0.01847903340440654, + "grad_norm": 0.361328125, + "learning_rate": 0.0001998399108225878, + "loss": 1.1387, + "step": 390 + }, + { + "epoch": 0.018526415541340914, + "grad_norm": 0.54296875, + "learning_rate": 0.00019983906736568532, + "loss": 1.1879, + "step": 391 + }, + { + "epoch": 0.01857379767827529, + "grad_norm": 0.7421875, + "learning_rate": 0.00019983822169445867, + "loss": 0.5234, + "step": 392 + }, + { + "epoch": 0.018621179815209665, + "grad_norm": 0.62109375, + "learning_rate": 0.00019983737380892662, + "loss": 0.5158, + "step": 393 + }, + { + "epoch": 0.01866856195214404, + "grad_norm": 0.396484375, + "learning_rate": 0.00019983652370910796, + "loss": 0.9222, + "step": 394 + }, + { + "epoch": 0.018715944089078417, + "grad_norm": 0.4609375, + "learning_rate": 0.00019983567139502152, + "loss": 0.7296, + "step": 395 + }, + { + "epoch": 0.018763326226012792, + "grad_norm": 0.48828125, + "learning_rate": 0.00019983481686668627, + "loss": 0.8614, + "step": 396 + }, + { + "epoch": 0.018810708362947168, + "grad_norm": 0.35546875, + "learning_rate": 0.00019983396012412109, + "loss": 0.6279, + "step": 397 + }, + { + "epoch": 0.018858090499881543, + "grad_norm": 0.431640625, + "learning_rate": 0.00019983310116734502, + "loss": 1.1019, + "step": 398 + }, + { + "epoch": 0.01890547263681592, + "grad_norm": 0.56640625, + "learning_rate": 0.0001998322399963771, + "loss": 1.1183, + "step": 399 + }, + { + "epoch": 0.018952854773750295, + "grad_norm": 0.396484375, + "learning_rate": 0.00019983137661123642, + "loss": 0.9234, + "step": 400 + }, + { + "epoch": 0.01900023691068467, + "grad_norm": 0.51953125, + "learning_rate": 0.00019983051101194217, + "loss": 1.2332, + "step": 401 + }, + { + "epoch": 0.01904761904761905, + "grad_norm": 0.439453125, + "learning_rate": 0.00019982964319851352, + "loss": 0.9963, + "step": 402 + }, + { + "epoch": 0.019095001184553425, + "grad_norm": 0.3984375, + "learning_rate": 0.0001998287731709697, + "loss": 0.875, + "step": 403 + }, + { + "epoch": 0.0191423833214878, + "grad_norm": 0.470703125, + "learning_rate": 0.00019982790092933002, + "loss": 1.4477, + "step": 404 + }, + { + "epoch": 0.019189765458422176, + "grad_norm": 0.40234375, + "learning_rate": 0.00019982702647361385, + "loss": 1.0502, + "step": 405 + }, + { + "epoch": 0.019237147595356552, + "grad_norm": 0.51171875, + "learning_rate": 0.00019982614980384056, + "loss": 1.3743, + "step": 406 + }, + { + "epoch": 0.019284529732290927, + "grad_norm": 0.55859375, + "learning_rate": 0.0001998252709200296, + "loss": 0.9722, + "step": 407 + }, + { + "epoch": 0.019331911869225303, + "grad_norm": 0.69140625, + "learning_rate": 0.00019982438982220043, + "loss": 0.9775, + "step": 408 + }, + { + "epoch": 0.01937929400615968, + "grad_norm": 0.6875, + "learning_rate": 0.00019982350651037264, + "loss": 0.4177, + "step": 409 + }, + { + "epoch": 0.019426676143094054, + "grad_norm": 0.3984375, + "learning_rate": 0.00019982262098456582, + "loss": 1.3982, + "step": 410 + }, + { + "epoch": 0.01947405828002843, + "grad_norm": 0.359375, + "learning_rate": 0.00019982173324479955, + "loss": 0.9668, + "step": 411 + }, + { + "epoch": 0.019521440416962806, + "grad_norm": 0.369140625, + "learning_rate": 0.0001998208432910936, + "loss": 0.5722, + "step": 412 + }, + { + "epoch": 0.01956882255389718, + "grad_norm": 0.46875, + "learning_rate": 0.00019981995112346764, + "loss": 0.0664, + "step": 413 + }, + { + "epoch": 0.019616204690831557, + "grad_norm": 0.49609375, + "learning_rate": 0.00019981905674194153, + "loss": 1.4617, + "step": 414 + }, + { + "epoch": 0.019663586827765932, + "grad_norm": 0.34765625, + "learning_rate": 0.00019981816014653502, + "loss": 1.0365, + "step": 415 + }, + { + "epoch": 0.019710968964700308, + "grad_norm": 0.421875, + "learning_rate": 0.00019981726133726807, + "loss": 0.9083, + "step": 416 + }, + { + "epoch": 0.019758351101634684, + "grad_norm": 0.8828125, + "learning_rate": 0.0001998163603141606, + "loss": 0.5751, + "step": 417 + }, + { + "epoch": 0.01980573323856906, + "grad_norm": 0.921875, + "learning_rate": 0.00019981545707723256, + "loss": 0.3827, + "step": 418 + }, + { + "epoch": 0.019853115375503435, + "grad_norm": 0.72265625, + "learning_rate": 0.00019981455162650398, + "loss": 0.7891, + "step": 419 + }, + { + "epoch": 0.01990049751243781, + "grad_norm": 0.7734375, + "learning_rate": 0.00019981364396199497, + "loss": 1.0511, + "step": 420 + }, + { + "epoch": 0.019947879649372186, + "grad_norm": 1.078125, + "learning_rate": 0.00019981273408372564, + "loss": 1.1038, + "step": 421 + }, + { + "epoch": 0.019995261786306562, + "grad_norm": 0.455078125, + "learning_rate": 0.00019981182199171622, + "loss": 1.2424, + "step": 422 + }, + { + "epoch": 0.020042643923240937, + "grad_norm": 0.6171875, + "learning_rate": 0.0001998109076859869, + "loss": 0.5426, + "step": 423 + }, + { + "epoch": 0.020090026060175313, + "grad_norm": 1.203125, + "learning_rate": 0.00019980999116655794, + "loss": 1.1897, + "step": 424 + }, + { + "epoch": 0.02013740819710969, + "grad_norm": 0.7890625, + "learning_rate": 0.00019980907243344968, + "loss": 1.1586, + "step": 425 + }, + { + "epoch": 0.020184790334044064, + "grad_norm": 0.3671875, + "learning_rate": 0.00019980815148668251, + "loss": 0.6897, + "step": 426 + }, + { + "epoch": 0.02023217247097844, + "grad_norm": 0.482421875, + "learning_rate": 0.0001998072283262769, + "loss": 1.2089, + "step": 427 + }, + { + "epoch": 0.020279554607912816, + "grad_norm": 0.85546875, + "learning_rate": 0.00019980630295225323, + "loss": 0.7024, + "step": 428 + }, + { + "epoch": 0.02032693674484719, + "grad_norm": 0.416015625, + "learning_rate": 0.00019980537536463207, + "loss": 0.3366, + "step": 429 + }, + { + "epoch": 0.020374318881781567, + "grad_norm": 0.796875, + "learning_rate": 0.000199804445563434, + "loss": 1.0042, + "step": 430 + }, + { + "epoch": 0.020421701018715942, + "grad_norm": 0.419921875, + "learning_rate": 0.00019980351354867963, + "loss": 0.5644, + "step": 431 + }, + { + "epoch": 0.02046908315565032, + "grad_norm": 0.37109375, + "learning_rate": 0.00019980257932038966, + "loss": 1.0856, + "step": 432 + }, + { + "epoch": 0.020516465292584697, + "grad_norm": 0.53515625, + "learning_rate": 0.00019980164287858475, + "loss": 0.4346, + "step": 433 + }, + { + "epoch": 0.020563847429519073, + "grad_norm": 0.5234375, + "learning_rate": 0.00019980070422328573, + "loss": 0.3099, + "step": 434 + }, + { + "epoch": 0.02061122956645345, + "grad_norm": 0.42578125, + "learning_rate": 0.00019979976335451338, + "loss": 0.991, + "step": 435 + }, + { + "epoch": 0.020658611703387824, + "grad_norm": 0.56640625, + "learning_rate": 0.0001997988202722886, + "loss": 1.0886, + "step": 436 + }, + { + "epoch": 0.0207059938403222, + "grad_norm": 0.44921875, + "learning_rate": 0.00019979787497663228, + "loss": 1.0824, + "step": 437 + }, + { + "epoch": 0.020753375977256575, + "grad_norm": 0.451171875, + "learning_rate": 0.00019979692746756536, + "loss": 1.2677, + "step": 438 + }, + { + "epoch": 0.02080075811419095, + "grad_norm": 0.5625, + "learning_rate": 0.00019979597774510892, + "loss": 0.7553, + "step": 439 + }, + { + "epoch": 0.020848140251125327, + "grad_norm": 0.466796875, + "learning_rate": 0.000199795025809284, + "loss": 1.0373, + "step": 440 + }, + { + "epoch": 0.020895522388059702, + "grad_norm": 0.498046875, + "learning_rate": 0.00019979407166011165, + "loss": 1.2302, + "step": 441 + }, + { + "epoch": 0.020942904524994078, + "grad_norm": 0.51171875, + "learning_rate": 0.00019979311529761312, + "loss": 0.3575, + "step": 442 + }, + { + "epoch": 0.020990286661928453, + "grad_norm": 0.390625, + "learning_rate": 0.00019979215672180962, + "loss": 0.9975, + "step": 443 + }, + { + "epoch": 0.02103766879886283, + "grad_norm": 0.490234375, + "learning_rate": 0.00019979119593272236, + "loss": 0.4121, + "step": 444 + }, + { + "epoch": 0.021085050935797205, + "grad_norm": 0.44140625, + "learning_rate": 0.00019979023293037265, + "loss": 1.1058, + "step": 445 + }, + { + "epoch": 0.02113243307273158, + "grad_norm": 0.43359375, + "learning_rate": 0.00019978926771478187, + "loss": 1.045, + "step": 446 + }, + { + "epoch": 0.021179815209665956, + "grad_norm": 0.78515625, + "learning_rate": 0.00019978830028597141, + "loss": 1.5996, + "step": 447 + }, + { + "epoch": 0.02122719734660033, + "grad_norm": 0.7734375, + "learning_rate": 0.00019978733064396277, + "loss": 0.6733, + "step": 448 + }, + { + "epoch": 0.021274579483534707, + "grad_norm": 0.451171875, + "learning_rate": 0.00019978635878877742, + "loss": 1.1244, + "step": 449 + }, + { + "epoch": 0.021321961620469083, + "grad_norm": 0.546875, + "learning_rate": 0.00019978538472043692, + "loss": 1.5741, + "step": 450 + }, + { + "epoch": 0.02136934375740346, + "grad_norm": 0.419921875, + "learning_rate": 0.00019978440843896285, + "loss": 0.7887, + "step": 451 + }, + { + "epoch": 0.021416725894337834, + "grad_norm": 0.5625, + "learning_rate": 0.00019978342994437688, + "loss": 0.4479, + "step": 452 + }, + { + "epoch": 0.02146410803127221, + "grad_norm": 0.50390625, + "learning_rate": 0.00019978244923670076, + "loss": 1.525, + "step": 453 + }, + { + "epoch": 0.021511490168206585, + "grad_norm": 0.38671875, + "learning_rate": 0.00019978146631595615, + "loss": 1.161, + "step": 454 + }, + { + "epoch": 0.02155887230514096, + "grad_norm": 0.462890625, + "learning_rate": 0.00019978048118216496, + "loss": 1.2118, + "step": 455 + }, + { + "epoch": 0.021606254442075336, + "grad_norm": 0.625, + "learning_rate": 0.00019977949383534894, + "loss": 0.6029, + "step": 456 + }, + { + "epoch": 0.021653636579009712, + "grad_norm": 0.48046875, + "learning_rate": 0.00019977850427553, + "loss": 0.9705, + "step": 457 + }, + { + "epoch": 0.021701018715944088, + "grad_norm": 0.39453125, + "learning_rate": 0.0001997775125027302, + "loss": 1.2315, + "step": 458 + }, + { + "epoch": 0.021748400852878463, + "grad_norm": 0.455078125, + "learning_rate": 0.00019977651851697136, + "loss": 1.1845, + "step": 459 + }, + { + "epoch": 0.02179578298981284, + "grad_norm": 0.310546875, + "learning_rate": 0.00019977552231827564, + "loss": 0.4129, + "step": 460 + }, + { + "epoch": 0.021843165126747215, + "grad_norm": 0.70703125, + "learning_rate": 0.00019977452390666515, + "loss": 0.3194, + "step": 461 + }, + { + "epoch": 0.021890547263681594, + "grad_norm": 0.578125, + "learning_rate": 0.00019977352328216197, + "loss": 0.784, + "step": 462 + }, + { + "epoch": 0.02193792940061597, + "grad_norm": 0.431640625, + "learning_rate": 0.0001997725204447883, + "loss": 1.1318, + "step": 463 + }, + { + "epoch": 0.021985311537550345, + "grad_norm": 0.400390625, + "learning_rate": 0.00019977151539456642, + "loss": 1.6012, + "step": 464 + }, + { + "epoch": 0.02203269367448472, + "grad_norm": 0.69921875, + "learning_rate": 0.0001997705081315186, + "loss": 0.3294, + "step": 465 + }, + { + "epoch": 0.022080075811419096, + "grad_norm": 0.9453125, + "learning_rate": 0.0001997694986556672, + "loss": 0.5496, + "step": 466 + }, + { + "epoch": 0.022127457948353472, + "grad_norm": 0.69921875, + "learning_rate": 0.00019976848696703456, + "loss": 1.0092, + "step": 467 + }, + { + "epoch": 0.022174840085287847, + "grad_norm": 0.375, + "learning_rate": 0.00019976747306564314, + "loss": 0.9297, + "step": 468 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 0.451171875, + "learning_rate": 0.00019976645695151546, + "loss": 1.193, + "step": 469 + }, + { + "epoch": 0.0222696043591566, + "grad_norm": 0.48828125, + "learning_rate": 0.00019976543862467404, + "loss": 1.4561, + "step": 470 + }, + { + "epoch": 0.022316986496090974, + "grad_norm": 1.0234375, + "learning_rate": 0.0001997644180851414, + "loss": 0.9823, + "step": 471 + }, + { + "epoch": 0.02236436863302535, + "grad_norm": 0.7578125, + "learning_rate": 0.00019976339533294028, + "loss": 0.5589, + "step": 472 + }, + { + "epoch": 0.022411750769959726, + "grad_norm": 0.54296875, + "learning_rate": 0.00019976237036809332, + "loss": 0.8081, + "step": 473 + }, + { + "epoch": 0.0224591329068941, + "grad_norm": 0.59375, + "learning_rate": 0.00019976134319062323, + "loss": 0.6358, + "step": 474 + }, + { + "epoch": 0.022506515043828477, + "grad_norm": 0.7421875, + "learning_rate": 0.0001997603138005528, + "loss": 0.8958, + "step": 475 + }, + { + "epoch": 0.022553897180762852, + "grad_norm": 0.3828125, + "learning_rate": 0.0001997592821979049, + "loss": 1.0532, + "step": 476 + }, + { + "epoch": 0.022601279317697228, + "grad_norm": 0.400390625, + "learning_rate": 0.00019975824838270234, + "loss": 1.1853, + "step": 477 + }, + { + "epoch": 0.022648661454631604, + "grad_norm": 0.640625, + "learning_rate": 0.00019975721235496811, + "loss": 0.4938, + "step": 478 + }, + { + "epoch": 0.02269604359156598, + "grad_norm": 0.52734375, + "learning_rate": 0.00019975617411472518, + "loss": 1.1591, + "step": 479 + }, + { + "epoch": 0.022743425728500355, + "grad_norm": 0.359375, + "learning_rate": 0.00019975513366199654, + "loss": 1.3403, + "step": 480 + }, + { + "epoch": 0.02279080786543473, + "grad_norm": 0.6015625, + "learning_rate": 0.0001997540909968053, + "loss": 1.0315, + "step": 481 + }, + { + "epoch": 0.022838190002369106, + "grad_norm": 0.92578125, + "learning_rate": 0.00019975304611917456, + "loss": 0.3196, + "step": 482 + }, + { + "epoch": 0.022885572139303482, + "grad_norm": 0.349609375, + "learning_rate": 0.00019975199902912754, + "loss": 0.6876, + "step": 483 + }, + { + "epoch": 0.022932954276237857, + "grad_norm": 0.55859375, + "learning_rate": 0.0001997509497266874, + "loss": 1.6713, + "step": 484 + }, + { + "epoch": 0.022980336413172233, + "grad_norm": 0.8046875, + "learning_rate": 0.00019974989821187745, + "loss": 0.1393, + "step": 485 + }, + { + "epoch": 0.02302771855010661, + "grad_norm": 0.66796875, + "learning_rate": 0.00019974884448472103, + "loss": 0.2337, + "step": 486 + }, + { + "epoch": 0.023075100687040984, + "grad_norm": 0.64453125, + "learning_rate": 0.00019974778854524148, + "loss": 0.1739, + "step": 487 + }, + { + "epoch": 0.02312248282397536, + "grad_norm": 0.486328125, + "learning_rate": 0.00019974673039346223, + "loss": 1.2593, + "step": 488 + }, + { + "epoch": 0.023169864960909736, + "grad_norm": 0.6640625, + "learning_rate": 0.00019974567002940675, + "loss": 1.5246, + "step": 489 + }, + { + "epoch": 0.02321724709784411, + "grad_norm": 0.419921875, + "learning_rate": 0.0001997446074530985, + "loss": 0.9928, + "step": 490 + }, + { + "epoch": 0.023264629234778487, + "grad_norm": 0.5, + "learning_rate": 0.00019974354266456116, + "loss": 0.9088, + "step": 491 + }, + { + "epoch": 0.023312011371712866, + "grad_norm": 0.486328125, + "learning_rate": 0.00019974247566381824, + "loss": 1.3176, + "step": 492 + }, + { + "epoch": 0.02335939350864724, + "grad_norm": 0.51953125, + "learning_rate": 0.00019974140645089347, + "loss": 0.9397, + "step": 493 + }, + { + "epoch": 0.023406775645581617, + "grad_norm": 0.48828125, + "learning_rate": 0.00019974033502581054, + "loss": 0.8935, + "step": 494 + }, + { + "epoch": 0.023454157782515993, + "grad_norm": 0.46875, + "learning_rate": 0.00019973926138859324, + "loss": 1.5524, + "step": 495 + }, + { + "epoch": 0.02350153991945037, + "grad_norm": 0.4921875, + "learning_rate": 0.00019973818553926535, + "loss": 1.1607, + "step": 496 + }, + { + "epoch": 0.023548922056384744, + "grad_norm": 0.44140625, + "learning_rate": 0.00019973710747785075, + "loss": 1.2721, + "step": 497 + }, + { + "epoch": 0.02359630419331912, + "grad_norm": 0.375, + "learning_rate": 0.00019973602720437336, + "loss": 0.7482, + "step": 498 + }, + { + "epoch": 0.023643686330253495, + "grad_norm": 0.458984375, + "learning_rate": 0.00019973494471885707, + "loss": 1.1923, + "step": 499 + }, + { + "epoch": 0.02369106846718787, + "grad_norm": 0.3828125, + "learning_rate": 0.00019973386002132597, + "loss": 1.0473, + "step": 500 + }, + { + "epoch": 0.023738450604122247, + "grad_norm": 0.462890625, + "learning_rate": 0.00019973277311180409, + "loss": 1.5173, + "step": 501 + }, + { + "epoch": 0.023785832741056622, + "grad_norm": 0.84765625, + "learning_rate": 0.00019973168399031548, + "loss": 0.5056, + "step": 502 + }, + { + "epoch": 0.023833214877990998, + "grad_norm": 0.6484375, + "learning_rate": 0.00019973059265688438, + "loss": 0.9238, + "step": 503 + }, + { + "epoch": 0.023880597014925373, + "grad_norm": 0.984375, + "learning_rate": 0.000199729499111535, + "loss": 0.3908, + "step": 504 + }, + { + "epoch": 0.02392797915185975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019972840335429152, + "loss": 1.0612, + "step": 505 + }, + { + "epoch": 0.023975361288794125, + "grad_norm": 0.412109375, + "learning_rate": 0.00019972730538517827, + "loss": 1.0104, + "step": 506 + }, + { + "epoch": 0.0240227434257285, + "grad_norm": 0.38671875, + "learning_rate": 0.00019972620520421964, + "loss": 1.2123, + "step": 507 + }, + { + "epoch": 0.024070125562662876, + "grad_norm": 0.41015625, + "learning_rate": 0.00019972510281144, + "loss": 1.2562, + "step": 508 + }, + { + "epoch": 0.02411750769959725, + "grad_norm": 0.75, + "learning_rate": 0.00019972399820686378, + "loss": 0.3947, + "step": 509 + }, + { + "epoch": 0.024164889836531627, + "grad_norm": 0.333984375, + "learning_rate": 0.00019972289139051551, + "loss": 0.5638, + "step": 510 + }, + { + "epoch": 0.024212271973466003, + "grad_norm": 0.341796875, + "learning_rate": 0.00019972178236241973, + "loss": 0.053, + "step": 511 + }, + { + "epoch": 0.02425965411040038, + "grad_norm": 0.451171875, + "learning_rate": 0.00019972067112260103, + "loss": 0.997, + "step": 512 + }, + { + "epoch": 0.024307036247334754, + "grad_norm": 0.43359375, + "learning_rate": 0.0001997195576710841, + "loss": 1.133, + "step": 513 + }, + { + "epoch": 0.02435441838426913, + "grad_norm": 0.51953125, + "learning_rate": 0.00019971844200789357, + "loss": 0.9191, + "step": 514 + }, + { + "epoch": 0.024401800521203505, + "grad_norm": 0.45703125, + "learning_rate": 0.0001997173241330542, + "loss": 0.9256, + "step": 515 + }, + { + "epoch": 0.02444918265813788, + "grad_norm": 0.64453125, + "learning_rate": 0.0001997162040465908, + "loss": 1.015, + "step": 516 + }, + { + "epoch": 0.024496564795072256, + "grad_norm": 0.625, + "learning_rate": 0.00019971508174852822, + "loss": 0.9148, + "step": 517 + }, + { + "epoch": 0.024543946932006632, + "grad_norm": 0.478515625, + "learning_rate": 0.00019971395723889133, + "loss": 1.2158, + "step": 518 + }, + { + "epoch": 0.024591329068941008, + "grad_norm": 0.4921875, + "learning_rate": 0.0001997128305177051, + "loss": 0.9604, + "step": 519 + }, + { + "epoch": 0.024638711205875383, + "grad_norm": 0.41015625, + "learning_rate": 0.00019971170158499443, + "loss": 1.173, + "step": 520 + }, + { + "epoch": 0.024686093342809762, + "grad_norm": 0.462890625, + "learning_rate": 0.0001997105704407845, + "loss": 1.0204, + "step": 521 + }, + { + "epoch": 0.024733475479744138, + "grad_norm": 0.427734375, + "learning_rate": 0.0001997094370851003, + "loss": 1.2085, + "step": 522 + }, + { + "epoch": 0.024780857616678514, + "grad_norm": 0.408203125, + "learning_rate": 0.00019970830151796697, + "loss": 0.9763, + "step": 523 + }, + { + "epoch": 0.02482823975361289, + "grad_norm": 0.5625, + "learning_rate": 0.00019970716373940976, + "loss": 1.2341, + "step": 524 + }, + { + "epoch": 0.024875621890547265, + "grad_norm": 0.9140625, + "learning_rate": 0.0001997060237494538, + "loss": 0.5408, + "step": 525 + }, + { + "epoch": 0.02492300402748164, + "grad_norm": 0.12353515625, + "learning_rate": 0.0001997048815481245, + "loss": 0.0094, + "step": 526 + }, + { + "epoch": 0.024970386164416016, + "grad_norm": 0.5703125, + "learning_rate": 0.0001997037371354471, + "loss": 1.1377, + "step": 527 + }, + { + "epoch": 0.025017768301350392, + "grad_norm": 0.4140625, + "learning_rate": 0.000199702590511447, + "loss": 0.3099, + "step": 528 + }, + { + "epoch": 0.025065150438284767, + "grad_norm": 0.396484375, + "learning_rate": 0.00019970144167614967, + "loss": 1.0221, + "step": 529 + }, + { + "epoch": 0.025112532575219143, + "grad_norm": 0.64453125, + "learning_rate": 0.00019970029062958054, + "loss": 0.4058, + "step": 530 + }, + { + "epoch": 0.02515991471215352, + "grad_norm": 0.408203125, + "learning_rate": 0.00019969913737176515, + "loss": 1.2623, + "step": 531 + }, + { + "epoch": 0.025207296849087894, + "grad_norm": 0.58203125, + "learning_rate": 0.0001996979819027291, + "loss": 1.618, + "step": 532 + }, + { + "epoch": 0.02525467898602227, + "grad_norm": 1.046875, + "learning_rate": 0.00019969682422249803, + "loss": 0.4917, + "step": 533 + }, + { + "epoch": 0.025302061122956646, + "grad_norm": 0.470703125, + "learning_rate": 0.00019969566433109757, + "loss": 1.1174, + "step": 534 + }, + { + "epoch": 0.02534944325989102, + "grad_norm": 0.5078125, + "learning_rate": 0.00019969450222855347, + "loss": 1.2709, + "step": 535 + }, + { + "epoch": 0.025396825396825397, + "grad_norm": 0.435546875, + "learning_rate": 0.0001996933379148915, + "loss": 1.5649, + "step": 536 + }, + { + "epoch": 0.025444207533759772, + "grad_norm": 0.30859375, + "learning_rate": 0.00019969217139013745, + "loss": 0.5722, + "step": 537 + }, + { + "epoch": 0.025491589670694148, + "grad_norm": 0.361328125, + "learning_rate": 0.00019969100265431727, + "loss": 1.0741, + "step": 538 + }, + { + "epoch": 0.025538971807628524, + "grad_norm": 0.396484375, + "learning_rate": 0.0001996898317074568, + "loss": 0.2779, + "step": 539 + }, + { + "epoch": 0.0255863539445629, + "grad_norm": 0.412109375, + "learning_rate": 0.00019968865854958208, + "loss": 0.9093, + "step": 540 + }, + { + "epoch": 0.025633736081497275, + "grad_norm": 0.478515625, + "learning_rate": 0.00019968748318071908, + "loss": 1.1398, + "step": 541 + }, + { + "epoch": 0.02568111821843165, + "grad_norm": 0.54296875, + "learning_rate": 0.0001996863056008939, + "loss": 1.2883, + "step": 542 + }, + { + "epoch": 0.025728500355366026, + "grad_norm": 0.5546875, + "learning_rate": 0.0001996851258101326, + "loss": 1.5229, + "step": 543 + }, + { + "epoch": 0.025775882492300402, + "grad_norm": 0.47265625, + "learning_rate": 0.00019968394380846146, + "loss": 0.4906, + "step": 544 + }, + { + "epoch": 0.025823264629234777, + "grad_norm": 0.5390625, + "learning_rate": 0.0001996827595959066, + "loss": 0.8691, + "step": 545 + }, + { + "epoch": 0.025870646766169153, + "grad_norm": 0.46484375, + "learning_rate": 0.00019968157317249428, + "loss": 0.9585, + "step": 546 + }, + { + "epoch": 0.02591802890310353, + "grad_norm": 0.4296875, + "learning_rate": 0.00019968038453825084, + "loss": 0.9995, + "step": 547 + }, + { + "epoch": 0.025965411040037904, + "grad_norm": 0.54296875, + "learning_rate": 0.00019967919369320267, + "loss": 0.9304, + "step": 548 + }, + { + "epoch": 0.02601279317697228, + "grad_norm": 0.6328125, + "learning_rate": 0.00019967800063737617, + "loss": 0.2167, + "step": 549 + }, + { + "epoch": 0.026060175313906656, + "grad_norm": 0.56640625, + "learning_rate": 0.00019967680537079775, + "loss": 0.8259, + "step": 550 + }, + { + "epoch": 0.026107557450841035, + "grad_norm": 0.2421875, + "learning_rate": 0.000199675607893494, + "loss": 0.0206, + "step": 551 + }, + { + "epoch": 0.02615493958777541, + "grad_norm": 0.455078125, + "learning_rate": 0.0001996744082054914, + "loss": 1.3086, + "step": 552 + }, + { + "epoch": 0.026202321724709786, + "grad_norm": 0.427734375, + "learning_rate": 0.0001996732063068166, + "loss": 1.187, + "step": 553 + }, + { + "epoch": 0.02624970386164416, + "grad_norm": 0.97265625, + "learning_rate": 0.00019967200219749628, + "loss": 0.6433, + "step": 554 + }, + { + "epoch": 0.026297085998578537, + "grad_norm": 0.671875, + "learning_rate": 0.0001996707958775571, + "loss": 0.8348, + "step": 555 + }, + { + "epoch": 0.026344468135512913, + "grad_norm": 0.58984375, + "learning_rate": 0.00019966958734702584, + "loss": 1.079, + "step": 556 + }, + { + "epoch": 0.02639185027244729, + "grad_norm": 0.373046875, + "learning_rate": 0.00019966837660592926, + "loss": 1.209, + "step": 557 + }, + { + "epoch": 0.026439232409381664, + "grad_norm": 0.39453125, + "learning_rate": 0.0001996671636542943, + "loss": 0.5099, + "step": 558 + }, + { + "epoch": 0.02648661454631604, + "grad_norm": 0.43359375, + "learning_rate": 0.0001996659484921478, + "loss": 1.0275, + "step": 559 + }, + { + "epoch": 0.026533996683250415, + "grad_norm": 0.84765625, + "learning_rate": 0.00019966473111951669, + "loss": 0.6466, + "step": 560 + }, + { + "epoch": 0.02658137882018479, + "grad_norm": 0.41015625, + "learning_rate": 0.000199663511536428, + "loss": 1.2751, + "step": 561 + }, + { + "epoch": 0.026628760957119166, + "grad_norm": 0.76953125, + "learning_rate": 0.0001996622897429088, + "loss": 0.6194, + "step": 562 + }, + { + "epoch": 0.026676143094053542, + "grad_norm": 0.48828125, + "learning_rate": 0.00019966106573898618, + "loss": 0.6814, + "step": 563 + }, + { + "epoch": 0.026723525230987918, + "grad_norm": 0.4375, + "learning_rate": 0.00019965983952468727, + "loss": 0.4884, + "step": 564 + }, + { + "epoch": 0.026770907367922293, + "grad_norm": 0.53515625, + "learning_rate": 0.00019965861110003927, + "loss": 0.7638, + "step": 565 + }, + { + "epoch": 0.02681828950485667, + "grad_norm": 0.5390625, + "learning_rate": 0.00019965738046506945, + "loss": 1.5112, + "step": 566 + }, + { + "epoch": 0.026865671641791045, + "grad_norm": 0.3828125, + "learning_rate": 0.000199656147619805, + "loss": 0.6674, + "step": 567 + }, + { + "epoch": 0.02691305377872542, + "grad_norm": 0.486328125, + "learning_rate": 0.00019965491256427345, + "loss": 1.3322, + "step": 568 + }, + { + "epoch": 0.026960435915659796, + "grad_norm": 0.46484375, + "learning_rate": 0.000199653675298502, + "loss": 1.0886, + "step": 569 + }, + { + "epoch": 0.02700781805259417, + "grad_norm": 0.484375, + "learning_rate": 0.00019965243582251824, + "loss": 0.4547, + "step": 570 + }, + { + "epoch": 0.027055200189528547, + "grad_norm": 0.64453125, + "learning_rate": 0.00019965119413634956, + "loss": 0.7786, + "step": 571 + }, + { + "epoch": 0.027102582326462923, + "grad_norm": 0.41796875, + "learning_rate": 0.0001996499502400236, + "loss": 1.0901, + "step": 572 + }, + { + "epoch": 0.0271499644633973, + "grad_norm": 0.408203125, + "learning_rate": 0.00019964870413356783, + "loss": 0.6203, + "step": 573 + }, + { + "epoch": 0.027197346600331674, + "grad_norm": 0.5625, + "learning_rate": 0.00019964745581700993, + "loss": 0.309, + "step": 574 + }, + { + "epoch": 0.02724472873726605, + "grad_norm": 0.423828125, + "learning_rate": 0.00019964620529037763, + "loss": 0.8369, + "step": 575 + }, + { + "epoch": 0.027292110874200425, + "grad_norm": 0.486328125, + "learning_rate": 0.0001996449525536986, + "loss": 1.0702, + "step": 576 + }, + { + "epoch": 0.0273394930111348, + "grad_norm": 0.5234375, + "learning_rate": 0.00019964369760700073, + "loss": 0.7614, + "step": 577 + }, + { + "epoch": 0.027386875148069176, + "grad_norm": 0.6875, + "learning_rate": 0.0001996424404503117, + "loss": 0.8197, + "step": 578 + }, + { + "epoch": 0.027434257285003552, + "grad_norm": 0.451171875, + "learning_rate": 0.00019964118108365954, + "loss": 0.9399, + "step": 579 + }, + { + "epoch": 0.027481639421937928, + "grad_norm": 0.400390625, + "learning_rate": 0.0001996399195070721, + "loss": 1.1644, + "step": 580 + }, + { + "epoch": 0.027529021558872307, + "grad_norm": 0.439453125, + "learning_rate": 0.00019963865572057734, + "loss": 1.0886, + "step": 581 + }, + { + "epoch": 0.027576403695806682, + "grad_norm": 0.93359375, + "learning_rate": 0.00019963738972420336, + "loss": 1.1885, + "step": 582 + }, + { + "epoch": 0.027623785832741058, + "grad_norm": 0.498046875, + "learning_rate": 0.00019963612151797819, + "loss": 0.2672, + "step": 583 + }, + { + "epoch": 0.027671167969675434, + "grad_norm": 0.4609375, + "learning_rate": 0.00019963485110193, + "loss": 1.0301, + "step": 584 + }, + { + "epoch": 0.02771855010660981, + "grad_norm": 0.59765625, + "learning_rate": 0.00019963357847608692, + "loss": 0.1862, + "step": 585 + }, + { + "epoch": 0.027765932243544185, + "grad_norm": 0.431640625, + "learning_rate": 0.0001996323036404772, + "loss": 0.9836, + "step": 586 + }, + { + "epoch": 0.02781331438047856, + "grad_norm": 0.54296875, + "learning_rate": 0.00019963102659512912, + "loss": 1.0559, + "step": 587 + }, + { + "epoch": 0.027860696517412936, + "grad_norm": 0.396484375, + "learning_rate": 0.00019962974734007095, + "loss": 0.7318, + "step": 588 + }, + { + "epoch": 0.027908078654347312, + "grad_norm": 0.419921875, + "learning_rate": 0.00019962846587533113, + "loss": 0.8617, + "step": 589 + }, + { + "epoch": 0.027955460791281687, + "grad_norm": 0.462890625, + "learning_rate": 0.00019962718220093806, + "loss": 1.2558, + "step": 590 + }, + { + "epoch": 0.028002842928216063, + "grad_norm": 0.578125, + "learning_rate": 0.0001996258963169202, + "loss": 1.2238, + "step": 591 + }, + { + "epoch": 0.02805022506515044, + "grad_norm": 0.5234375, + "learning_rate": 0.00019962460822330608, + "loss": 0.9095, + "step": 592 + }, + { + "epoch": 0.028097607202084814, + "grad_norm": 0.40625, + "learning_rate": 0.00019962331792012426, + "loss": 1.1602, + "step": 593 + }, + { + "epoch": 0.02814498933901919, + "grad_norm": 0.4375, + "learning_rate": 0.0001996220254074034, + "loss": 0.6363, + "step": 594 + }, + { + "epoch": 0.028192371475953566, + "grad_norm": 0.498046875, + "learning_rate": 0.00019962073068517205, + "loss": 0.1952, + "step": 595 + }, + { + "epoch": 0.02823975361288794, + "grad_norm": 0.51953125, + "learning_rate": 0.0001996194337534591, + "loss": 0.2863, + "step": 596 + }, + { + "epoch": 0.028287135749822317, + "grad_norm": 0.44140625, + "learning_rate": 0.00019961813461229314, + "loss": 1.1668, + "step": 597 + }, + { + "epoch": 0.028334517886756692, + "grad_norm": 0.478515625, + "learning_rate": 0.0001996168332617031, + "loss": 1.333, + "step": 598 + }, + { + "epoch": 0.028381900023691068, + "grad_norm": 0.392578125, + "learning_rate": 0.00019961552970171778, + "loss": 1.0673, + "step": 599 + }, + { + "epoch": 0.028429282160625444, + "grad_norm": 0.296875, + "learning_rate": 0.00019961422393236617, + "loss": 1.4091, + "step": 600 + }, + { + "epoch": 0.02847666429755982, + "grad_norm": 0.51171875, + "learning_rate": 0.00019961291595367714, + "loss": 1.3616, + "step": 601 + }, + { + "epoch": 0.028524046434494195, + "grad_norm": 0.45703125, + "learning_rate": 0.00019961160576567973, + "loss": 1.1052, + "step": 602 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 0.9453125, + "learning_rate": 0.00019961029336840302, + "loss": 0.095, + "step": 603 + }, + { + "epoch": 0.028618810708362946, + "grad_norm": 0.43359375, + "learning_rate": 0.0001996089787618761, + "loss": 1.1025, + "step": 604 + }, + { + "epoch": 0.028666192845297322, + "grad_norm": 0.466796875, + "learning_rate": 0.00019960766194612815, + "loss": 0.6186, + "step": 605 + }, + { + "epoch": 0.028713574982231697, + "grad_norm": 0.6484375, + "learning_rate": 0.0001996063429211883, + "loss": 0.8258, + "step": 606 + }, + { + "epoch": 0.028760957119166073, + "grad_norm": 1.046875, + "learning_rate": 0.0001996050216870859, + "loss": 1.029, + "step": 607 + }, + { + "epoch": 0.02880833925610045, + "grad_norm": 0.84375, + "learning_rate": 0.0001996036982438502, + "loss": 0.6272, + "step": 608 + }, + { + "epoch": 0.028855721393034824, + "grad_norm": 0.421875, + "learning_rate": 0.0001996023725915106, + "loss": 0.7262, + "step": 609 + }, + { + "epoch": 0.0289031035299692, + "grad_norm": 0.453125, + "learning_rate": 0.00019960104473009643, + "loss": 1.2872, + "step": 610 + }, + { + "epoch": 0.02895048566690358, + "grad_norm": 0.4453125, + "learning_rate": 0.0001995997146596372, + "loss": 0.0761, + "step": 611 + }, + { + "epoch": 0.028997867803837955, + "grad_norm": 0.52734375, + "learning_rate": 0.0001995983823801624, + "loss": 1.1936, + "step": 612 + }, + { + "epoch": 0.02904524994077233, + "grad_norm": 0.47265625, + "learning_rate": 0.00019959704789170152, + "loss": 1.2569, + "step": 613 + }, + { + "epoch": 0.029092632077706706, + "grad_norm": 0.890625, + "learning_rate": 0.0001995957111942842, + "loss": 0.5025, + "step": 614 + }, + { + "epoch": 0.02914001421464108, + "grad_norm": 0.443359375, + "learning_rate": 0.00019959437228794013, + "loss": 0.9133, + "step": 615 + }, + { + "epoch": 0.029187396351575457, + "grad_norm": 0.62109375, + "learning_rate": 0.00019959303117269897, + "loss": 0.635, + "step": 616 + }, + { + "epoch": 0.029234778488509833, + "grad_norm": 0.416015625, + "learning_rate": 0.00019959168784859044, + "loss": 0.3316, + "step": 617 + }, + { + "epoch": 0.02928216062544421, + "grad_norm": 1.171875, + "learning_rate": 0.00019959034231564434, + "loss": 0.7864, + "step": 618 + }, + { + "epoch": 0.029329542762378584, + "grad_norm": 0.44921875, + "learning_rate": 0.00019958899457389056, + "loss": 0.9831, + "step": 619 + }, + { + "epoch": 0.02937692489931296, + "grad_norm": 0.48046875, + "learning_rate": 0.00019958764462335894, + "loss": 1.1185, + "step": 620 + }, + { + "epoch": 0.029424307036247335, + "grad_norm": 0.3046875, + "learning_rate": 0.00019958629246407945, + "loss": 0.511, + "step": 621 + }, + { + "epoch": 0.02947168917318171, + "grad_norm": 0.546875, + "learning_rate": 0.00019958493809608206, + "loss": 0.1045, + "step": 622 + }, + { + "epoch": 0.029519071310116086, + "grad_norm": 0.37109375, + "learning_rate": 0.0001995835815193968, + "loss": 0.8508, + "step": 623 + }, + { + "epoch": 0.029566453447050462, + "grad_norm": 0.578125, + "learning_rate": 0.0001995822227340538, + "loss": 1.005, + "step": 624 + }, + { + "epoch": 0.029613835583984838, + "grad_norm": 0.4609375, + "learning_rate": 0.00019958086174008314, + "loss": 0.9452, + "step": 625 + }, + { + "epoch": 0.029661217720919213, + "grad_norm": 0.0322265625, + "learning_rate": 0.00019957949853751506, + "loss": 0.0025, + "step": 626 + }, + { + "epoch": 0.02970859985785359, + "grad_norm": 0.60546875, + "learning_rate": 0.00019957813312637977, + "loss": 0.6284, + "step": 627 + }, + { + "epoch": 0.029755981994787965, + "grad_norm": 0.5078125, + "learning_rate": 0.00019957676550670753, + "loss": 0.4623, + "step": 628 + }, + { + "epoch": 0.02980336413172234, + "grad_norm": 0.5234375, + "learning_rate": 0.00019957539567852872, + "loss": 1.3487, + "step": 629 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 0.84765625, + "learning_rate": 0.00019957402364187367, + "loss": 0.3309, + "step": 630 + }, + { + "epoch": 0.02989812840559109, + "grad_norm": 0.73828125, + "learning_rate": 0.00019957264939677287, + "loss": 0.4242, + "step": 631 + }, + { + "epoch": 0.029945510542525467, + "grad_norm": 0.404296875, + "learning_rate": 0.00019957127294325676, + "loss": 0.2819, + "step": 632 + }, + { + "epoch": 0.029992892679459843, + "grad_norm": 1.3671875, + "learning_rate": 0.00019956989428135584, + "loss": 0.5646, + "step": 633 + }, + { + "epoch": 0.03004027481639422, + "grad_norm": 0.6015625, + "learning_rate": 0.00019956851341110075, + "loss": 1.522, + "step": 634 + }, + { + "epoch": 0.030087656953328594, + "grad_norm": 0.52734375, + "learning_rate": 0.00019956713033252211, + "loss": 1.0439, + "step": 635 + }, + { + "epoch": 0.03013503909026297, + "grad_norm": 1.2109375, + "learning_rate": 0.00019956574504565054, + "loss": 0.4926, + "step": 636 + }, + { + "epoch": 0.030182421227197345, + "grad_norm": 0.265625, + "learning_rate": 0.0001995643575505168, + "loss": 0.0841, + "step": 637 + }, + { + "epoch": 0.03022980336413172, + "grad_norm": 0.1298828125, + "learning_rate": 0.00019956296784715168, + "loss": 0.0148, + "step": 638 + }, + { + "epoch": 0.030277185501066096, + "grad_norm": 0.7421875, + "learning_rate": 0.00019956157593558596, + "loss": 0.3761, + "step": 639 + }, + { + "epoch": 0.030324567638000476, + "grad_norm": 0.55859375, + "learning_rate": 0.00019956018181585054, + "loss": 1.1064, + "step": 640 + }, + { + "epoch": 0.03037194977493485, + "grad_norm": 0.5078125, + "learning_rate": 0.00019955878548797636, + "loss": 0.987, + "step": 641 + }, + { + "epoch": 0.030419331911869227, + "grad_norm": 0.3828125, + "learning_rate": 0.00019955738695199432, + "loss": 0.3002, + "step": 642 + }, + { + "epoch": 0.030466714048803602, + "grad_norm": 0.41796875, + "learning_rate": 0.00019955598620793552, + "loss": 0.7511, + "step": 643 + }, + { + "epoch": 0.030514096185737978, + "grad_norm": 0.3359375, + "learning_rate": 0.00019955458325583096, + "loss": 0.936, + "step": 644 + }, + { + "epoch": 0.030561478322672354, + "grad_norm": 0.498046875, + "learning_rate": 0.00019955317809571178, + "loss": 1.0025, + "step": 645 + }, + { + "epoch": 0.03060886045960673, + "grad_norm": 0.52734375, + "learning_rate": 0.0001995517707276092, + "loss": 1.0406, + "step": 646 + }, + { + "epoch": 0.030656242596541105, + "grad_norm": 0.828125, + "learning_rate": 0.00019955036115155435, + "loss": 1.1344, + "step": 647 + }, + { + "epoch": 0.03070362473347548, + "grad_norm": 0.77734375, + "learning_rate": 0.0001995489493675785, + "loss": 0.1149, + "step": 648 + }, + { + "epoch": 0.030751006870409856, + "grad_norm": 0.45703125, + "learning_rate": 0.00019954753537571303, + "loss": 1.2429, + "step": 649 + }, + { + "epoch": 0.030798389007344232, + "grad_norm": 0.3359375, + "learning_rate": 0.00019954611917598922, + "loss": 0.0303, + "step": 650 + }, + { + "epoch": 0.030845771144278607, + "grad_norm": 0.65234375, + "learning_rate": 0.00019954470076843854, + "loss": 0.8687, + "step": 651 + }, + { + "epoch": 0.030893153281212983, + "grad_norm": 0.7109375, + "learning_rate": 0.00019954328015309243, + "loss": 0.2572, + "step": 652 + }, + { + "epoch": 0.03094053541814736, + "grad_norm": 0.78515625, + "learning_rate": 0.0001995418573299824, + "loss": 0.5588, + "step": 653 + }, + { + "epoch": 0.030987917555081734, + "grad_norm": 0.640625, + "learning_rate": 0.00019954043229913996, + "loss": 0.1976, + "step": 654 + }, + { + "epoch": 0.03103529969201611, + "grad_norm": 0.376953125, + "learning_rate": 0.0001995390050605968, + "loss": 0.5403, + "step": 655 + }, + { + "epoch": 0.031082681828950486, + "grad_norm": 0.49609375, + "learning_rate": 0.00019953757561438454, + "loss": 1.1379, + "step": 656 + }, + { + "epoch": 0.03113006396588486, + "grad_norm": 0.6953125, + "learning_rate": 0.00019953614396053487, + "loss": 0.4124, + "step": 657 + }, + { + "epoch": 0.031177446102819237, + "grad_norm": 0.51171875, + "learning_rate": 0.00019953471009907952, + "loss": 1.3503, + "step": 658 + }, + { + "epoch": 0.031224828239753612, + "grad_norm": 0.478515625, + "learning_rate": 0.00019953327403005034, + "loss": 0.8431, + "step": 659 + }, + { + "epoch": 0.03127221037668799, + "grad_norm": 0.42578125, + "learning_rate": 0.00019953183575347914, + "loss": 0.8172, + "step": 660 + }, + { + "epoch": 0.03131959251362237, + "grad_norm": 0.69140625, + "learning_rate": 0.00019953039526939784, + "loss": 0.2071, + "step": 661 + }, + { + "epoch": 0.03136697465055674, + "grad_norm": 0.51171875, + "learning_rate": 0.00019952895257783842, + "loss": 0.2853, + "step": 662 + }, + { + "epoch": 0.03141435678749112, + "grad_norm": 0.39453125, + "learning_rate": 0.00019952750767883283, + "loss": 0.8138, + "step": 663 + }, + { + "epoch": 0.03146173892442549, + "grad_norm": 0.494140625, + "learning_rate": 0.00019952606057241313, + "loss": 1.1287, + "step": 664 + }, + { + "epoch": 0.03150912106135987, + "grad_norm": 0.40234375, + "learning_rate": 0.0001995246112586114, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.03155650319829424, + "grad_norm": 1.3671875, + "learning_rate": 0.00019952315973745984, + "loss": 0.6365, + "step": 666 + }, + { + "epoch": 0.03160388533522862, + "grad_norm": 0.51171875, + "learning_rate": 0.00019952170600899058, + "loss": 1.1044, + "step": 667 + }, + { + "epoch": 0.03165126747216299, + "grad_norm": 0.859375, + "learning_rate": 0.00019952025007323587, + "loss": 0.631, + "step": 668 + }, + { + "epoch": 0.03169864960909737, + "grad_norm": 0.423828125, + "learning_rate": 0.00019951879193022806, + "loss": 0.6363, + "step": 669 + }, + { + "epoch": 0.031746031746031744, + "grad_norm": 1.0703125, + "learning_rate": 0.0001995173315799994, + "loss": 0.662, + "step": 670 + }, + { + "epoch": 0.03179341388296612, + "grad_norm": 0.71484375, + "learning_rate": 0.00019951586902258237, + "loss": 0.2984, + "step": 671 + }, + { + "epoch": 0.031840796019900496, + "grad_norm": 0.55078125, + "learning_rate": 0.00019951440425800934, + "loss": 0.2706, + "step": 672 + }, + { + "epoch": 0.031888178156834875, + "grad_norm": 0.341796875, + "learning_rate": 0.00019951293728631282, + "loss": 0.9611, + "step": 673 + }, + { + "epoch": 0.03193556029376925, + "grad_norm": 0.376953125, + "learning_rate": 0.00019951146810752535, + "loss": 0.2135, + "step": 674 + }, + { + "epoch": 0.031982942430703626, + "grad_norm": 0.41796875, + "learning_rate": 0.0001995099967216795, + "loss": 0.224, + "step": 675 + }, + { + "epoch": 0.032030324567638, + "grad_norm": 1.0703125, + "learning_rate": 0.00019950852312880795, + "loss": 0.7542, + "step": 676 + }, + { + "epoch": 0.03207770670457238, + "grad_norm": 0.42578125, + "learning_rate": 0.0001995070473289433, + "loss": 0.7266, + "step": 677 + }, + { + "epoch": 0.03212508884150675, + "grad_norm": 0.373046875, + "learning_rate": 0.0001995055693221184, + "loss": 0.2165, + "step": 678 + }, + { + "epoch": 0.03217247097844113, + "grad_norm": 1.2421875, + "learning_rate": 0.0001995040891083659, + "loss": 1.2225, + "step": 679 + }, + { + "epoch": 0.0322198531153755, + "grad_norm": 0.60546875, + "learning_rate": 0.00019950260668771873, + "loss": 1.2573, + "step": 680 + }, + { + "epoch": 0.03226723525230988, + "grad_norm": 0.3828125, + "learning_rate": 0.0001995011220602097, + "loss": 0.1674, + "step": 681 + }, + { + "epoch": 0.03231461738924425, + "grad_norm": 0.3515625, + "learning_rate": 0.00019949963522587178, + "loss": 0.4874, + "step": 682 + }, + { + "epoch": 0.03236199952617863, + "grad_norm": 0.52734375, + "learning_rate": 0.00019949814618473792, + "loss": 1.2075, + "step": 683 + }, + { + "epoch": 0.032409381663113, + "grad_norm": 0.427734375, + "learning_rate": 0.00019949665493684119, + "loss": 1.2133, + "step": 684 + }, + { + "epoch": 0.03245676380004738, + "grad_norm": 0.431640625, + "learning_rate": 0.00019949516148221462, + "loss": 1.1786, + "step": 685 + }, + { + "epoch": 0.03250414593698176, + "grad_norm": 0.408203125, + "learning_rate": 0.00019949366582089134, + "loss": 1.2238, + "step": 686 + }, + { + "epoch": 0.03255152807391613, + "grad_norm": 0.734375, + "learning_rate": 0.00019949216795290452, + "loss": 0.5681, + "step": 687 + }, + { + "epoch": 0.03259891021085051, + "grad_norm": 0.41796875, + "learning_rate": 0.00019949066787828737, + "loss": 1.0104, + "step": 688 + }, + { + "epoch": 0.032646292347784885, + "grad_norm": 0.4375, + "learning_rate": 0.0001994891655970732, + "loss": 1.0594, + "step": 689 + }, + { + "epoch": 0.032693674484719264, + "grad_norm": 0.8515625, + "learning_rate": 0.00019948766110929533, + "loss": 1.0748, + "step": 690 + }, + { + "epoch": 0.032741056621653636, + "grad_norm": 0.71875, + "learning_rate": 0.00019948615441498708, + "loss": 0.7086, + "step": 691 + }, + { + "epoch": 0.032788438758588015, + "grad_norm": 0.419921875, + "learning_rate": 0.0001994846455141819, + "loss": 0.9541, + "step": 692 + }, + { + "epoch": 0.03283582089552239, + "grad_norm": 0.5078125, + "learning_rate": 0.00019948313440691325, + "loss": 1.1669, + "step": 693 + }, + { + "epoch": 0.032883203032456766, + "grad_norm": 0.44921875, + "learning_rate": 0.00019948162109321464, + "loss": 1.1247, + "step": 694 + }, + { + "epoch": 0.03293058516939114, + "grad_norm": 0.53125, + "learning_rate": 0.00019948010557311964, + "loss": 1.3189, + "step": 695 + }, + { + "epoch": 0.03297796730632552, + "grad_norm": 0.390625, + "learning_rate": 0.00019947858784666187, + "loss": 0.8425, + "step": 696 + }, + { + "epoch": 0.03302534944325989, + "grad_norm": 0.478515625, + "learning_rate": 0.00019947706791387498, + "loss": 1.0511, + "step": 697 + }, + { + "epoch": 0.03307273158019427, + "grad_norm": 0.6328125, + "learning_rate": 0.00019947554577479267, + "loss": 0.6043, + "step": 698 + }, + { + "epoch": 0.03312011371712864, + "grad_norm": 0.44921875, + "learning_rate": 0.00019947402142944868, + "loss": 0.9978, + "step": 699 + }, + { + "epoch": 0.03316749585406302, + "grad_norm": 0.435546875, + "learning_rate": 0.00019947249487787692, + "loss": 1.0231, + "step": 700 + }, + { + "epoch": 0.03321487799099739, + "grad_norm": 0.4296875, + "learning_rate": 0.00019947096612011112, + "loss": 0.7844, + "step": 701 + }, + { + "epoch": 0.03326226012793177, + "grad_norm": 0.78125, + "learning_rate": 0.00019946943515618525, + "loss": 0.6324, + "step": 702 + }, + { + "epoch": 0.03330964226486614, + "grad_norm": 0.357421875, + "learning_rate": 0.0001994679019861333, + "loss": 0.88, + "step": 703 + }, + { + "epoch": 0.03335702440180052, + "grad_norm": 1.046875, + "learning_rate": 0.0001994663666099892, + "loss": 0.0794, + "step": 704 + }, + { + "epoch": 0.033404406538734895, + "grad_norm": 0.427734375, + "learning_rate": 0.00019946482902778704, + "loss": 1.3594, + "step": 705 + }, + { + "epoch": 0.033451788675669274, + "grad_norm": 0.45703125, + "learning_rate": 0.00019946328923956092, + "loss": 1.3723, + "step": 706 + }, + { + "epoch": 0.033499170812603646, + "grad_norm": 0.484375, + "learning_rate": 0.00019946174724534498, + "loss": 0.7756, + "step": 707 + }, + { + "epoch": 0.033546552949538025, + "grad_norm": 1.2421875, + "learning_rate": 0.00019946020304517347, + "loss": 0.5788, + "step": 708 + }, + { + "epoch": 0.0335939350864724, + "grad_norm": 0.60546875, + "learning_rate": 0.00019945865663908055, + "loss": 0.1196, + "step": 709 + }, + { + "epoch": 0.033641317223406776, + "grad_norm": 0.921875, + "learning_rate": 0.00019945710802710056, + "loss": 0.7798, + "step": 710 + }, + { + "epoch": 0.03368869936034115, + "grad_norm": 0.396484375, + "learning_rate": 0.00019945555720926787, + "loss": 1.0431, + "step": 711 + }, + { + "epoch": 0.03373608149727553, + "grad_norm": 0.71875, + "learning_rate": 0.00019945400418561686, + "loss": 0.2399, + "step": 712 + }, + { + "epoch": 0.0337834636342099, + "grad_norm": 0.71875, + "learning_rate": 0.00019945244895618194, + "loss": 0.7883, + "step": 713 + }, + { + "epoch": 0.03383084577114428, + "grad_norm": 0.328125, + "learning_rate": 0.00019945089152099765, + "loss": 0.5802, + "step": 714 + }, + { + "epoch": 0.03387822790807866, + "grad_norm": 0.54296875, + "learning_rate": 0.00019944933188009855, + "loss": 1.1913, + "step": 715 + }, + { + "epoch": 0.03392561004501303, + "grad_norm": 0.431640625, + "learning_rate": 0.00019944777003351916, + "loss": 0.844, + "step": 716 + }, + { + "epoch": 0.03397299218194741, + "grad_norm": 0.46484375, + "learning_rate": 0.00019944620598129418, + "loss": 1.5475, + "step": 717 + }, + { + "epoch": 0.03402037431888178, + "grad_norm": 0.46484375, + "learning_rate": 0.00019944463972345827, + "loss": 1.2742, + "step": 718 + }, + { + "epoch": 0.03406775645581616, + "grad_norm": 1.2109375, + "learning_rate": 0.00019944307126004614, + "loss": 0.3556, + "step": 719 + }, + { + "epoch": 0.03411513859275053, + "grad_norm": 0.462890625, + "learning_rate": 0.00019944150059109266, + "loss": 0.7456, + "step": 720 + }, + { + "epoch": 0.03416252072968491, + "grad_norm": 0.56640625, + "learning_rate": 0.0001994399277166326, + "loss": 1.1194, + "step": 721 + }, + { + "epoch": 0.034209902866619284, + "grad_norm": 0.59765625, + "learning_rate": 0.00019943835263670084, + "loss": 1.0139, + "step": 722 + }, + { + "epoch": 0.03425728500355366, + "grad_norm": 0.494140625, + "learning_rate": 0.0001994367753513324, + "loss": 1.2507, + "step": 723 + }, + { + "epoch": 0.034304667140488035, + "grad_norm": 0.3828125, + "learning_rate": 0.00019943519586056212, + "loss": 0.8208, + "step": 724 + }, + { + "epoch": 0.034352049277422414, + "grad_norm": 0.53125, + "learning_rate": 0.00019943361416442515, + "loss": 1.0066, + "step": 725 + }, + { + "epoch": 0.034399431414356786, + "grad_norm": 0.62109375, + "learning_rate": 0.0001994320302629565, + "loss": 1.3527, + "step": 726 + }, + { + "epoch": 0.034446813551291165, + "grad_norm": 0.57421875, + "learning_rate": 0.00019943044415619138, + "loss": 0.8505, + "step": 727 + }, + { + "epoch": 0.03449419568822554, + "grad_norm": 1.15625, + "learning_rate": 0.00019942885584416488, + "loss": 0.097, + "step": 728 + }, + { + "epoch": 0.034541577825159916, + "grad_norm": 0.55078125, + "learning_rate": 0.00019942726532691228, + "loss": 0.2142, + "step": 729 + }, + { + "epoch": 0.03458895996209429, + "grad_norm": 0.484375, + "learning_rate": 0.00019942567260446885, + "loss": 0.8172, + "step": 730 + }, + { + "epoch": 0.03463634209902867, + "grad_norm": 0.55078125, + "learning_rate": 0.0001994240776768699, + "loss": 0.2967, + "step": 731 + }, + { + "epoch": 0.03468372423596304, + "grad_norm": 0.287109375, + "learning_rate": 0.0001994224805441508, + "loss": 0.0132, + "step": 732 + }, + { + "epoch": 0.03473110637289742, + "grad_norm": 0.5078125, + "learning_rate": 0.00019942088120634694, + "loss": 1.1554, + "step": 733 + }, + { + "epoch": 0.03477848850983179, + "grad_norm": 0.5, + "learning_rate": 0.00019941927966349388, + "loss": 0.7384, + "step": 734 + }, + { + "epoch": 0.03482587064676617, + "grad_norm": 0.5078125, + "learning_rate": 0.0001994176759156271, + "loss": 1.2028, + "step": 735 + }, + { + "epoch": 0.03487325278370054, + "grad_norm": 0.478515625, + "learning_rate": 0.00019941606996278215, + "loss": 0.3103, + "step": 736 + }, + { + "epoch": 0.03492063492063492, + "grad_norm": 0.5625, + "learning_rate": 0.00019941446180499466, + "loss": 0.557, + "step": 737 + }, + { + "epoch": 0.034968017057569294, + "grad_norm": 0.462890625, + "learning_rate": 0.00019941285144230029, + "loss": 0.9375, + "step": 738 + }, + { + "epoch": 0.03501539919450367, + "grad_norm": 0.55859375, + "learning_rate": 0.00019941123887473475, + "loss": 1.1975, + "step": 739 + }, + { + "epoch": 0.035062781331438045, + "grad_norm": 1.3203125, + "learning_rate": 0.00019940962410233386, + "loss": 0.589, + "step": 740 + }, + { + "epoch": 0.035110163468372424, + "grad_norm": 0.5078125, + "learning_rate": 0.0001994080071251334, + "loss": 1.0395, + "step": 741 + }, + { + "epoch": 0.035157545605306796, + "grad_norm": 0.58203125, + "learning_rate": 0.0001994063879431692, + "loss": 1.1947, + "step": 742 + }, + { + "epoch": 0.035204927742241175, + "grad_norm": 0.5234375, + "learning_rate": 0.0001994047665564772, + "loss": 1.1165, + "step": 743 + }, + { + "epoch": 0.03525230987917555, + "grad_norm": 0.65234375, + "learning_rate": 0.00019940314296509337, + "loss": 0.31, + "step": 744 + }, + { + "epoch": 0.035299692016109926, + "grad_norm": 0.81640625, + "learning_rate": 0.00019940151716905371, + "loss": 0.68, + "step": 745 + }, + { + "epoch": 0.035347074153044306, + "grad_norm": 1.1328125, + "learning_rate": 0.00019939988916839425, + "loss": 0.7773, + "step": 746 + }, + { + "epoch": 0.03539445628997868, + "grad_norm": 0.453125, + "learning_rate": 0.00019939825896315115, + "loss": 1.0553, + "step": 747 + }, + { + "epoch": 0.03544183842691306, + "grad_norm": 0.380859375, + "learning_rate": 0.00019939662655336053, + "loss": 0.7158, + "step": 748 + }, + { + "epoch": 0.03548922056384743, + "grad_norm": 0.443359375, + "learning_rate": 0.00019939499193905862, + "loss": 0.2265, + "step": 749 + }, + { + "epoch": 0.03553660270078181, + "grad_norm": 0.53125, + "learning_rate": 0.00019939335512028164, + "loss": 0.96, + "step": 750 + }, + { + "epoch": 0.03558398483771618, + "grad_norm": 0.27734375, + "learning_rate": 0.0001993917160970659, + "loss": 0.7222, + "step": 751 + }, + { + "epoch": 0.03563136697465056, + "grad_norm": 0.423828125, + "learning_rate": 0.0001993900748694478, + "loss": 1.0403, + "step": 752 + }, + { + "epoch": 0.03567874911158493, + "grad_norm": 0.44140625, + "learning_rate": 0.00019938843143746369, + "loss": 0.9728, + "step": 753 + }, + { + "epoch": 0.03572613124851931, + "grad_norm": 0.484375, + "learning_rate": 0.00019938678580115005, + "loss": 1.0014, + "step": 754 + }, + { + "epoch": 0.03577351338545368, + "grad_norm": 0.46484375, + "learning_rate": 0.00019938513796054333, + "loss": 1.2322, + "step": 755 + }, + { + "epoch": 0.03582089552238806, + "grad_norm": 0.51171875, + "learning_rate": 0.00019938348791568013, + "loss": 1.2852, + "step": 756 + }, + { + "epoch": 0.035868277659322434, + "grad_norm": 0.58984375, + "learning_rate": 0.00019938183566659703, + "loss": 1.0591, + "step": 757 + }, + { + "epoch": 0.03591565979625681, + "grad_norm": 0.482421875, + "learning_rate": 0.00019938018121333064, + "loss": 1.7937, + "step": 758 + }, + { + "epoch": 0.035963041933191185, + "grad_norm": 0.59765625, + "learning_rate": 0.00019937852455591772, + "loss": 1.0624, + "step": 759 + }, + { + "epoch": 0.036010424070125564, + "grad_norm": 1.09375, + "learning_rate": 0.00019937686569439496, + "loss": 0.564, + "step": 760 + }, + { + "epoch": 0.036057806207059936, + "grad_norm": 1.2265625, + "learning_rate": 0.00019937520462879918, + "loss": 0.9257, + "step": 761 + }, + { + "epoch": 0.036105188343994316, + "grad_norm": 0.38671875, + "learning_rate": 0.00019937354135916721, + "loss": 1.023, + "step": 762 + }, + { + "epoch": 0.03615257048092869, + "grad_norm": 0.94921875, + "learning_rate": 0.00019937187588553595, + "loss": 0.7796, + "step": 763 + }, + { + "epoch": 0.03619995261786307, + "grad_norm": 0.41796875, + "learning_rate": 0.00019937020820794233, + "loss": 0.0903, + "step": 764 + }, + { + "epoch": 0.03624733475479744, + "grad_norm": 2.0, + "learning_rate": 0.00019936853832642332, + "loss": 0.5821, + "step": 765 + }, + { + "epoch": 0.03629471689173182, + "grad_norm": 0.9140625, + "learning_rate": 0.00019936686624101596, + "loss": 0.3511, + "step": 766 + }, + { + "epoch": 0.03634209902866619, + "grad_norm": 0.48046875, + "learning_rate": 0.0001993651919517574, + "loss": 0.314, + "step": 767 + }, + { + "epoch": 0.03638948116560057, + "grad_norm": 0.48046875, + "learning_rate": 0.00019936351545868467, + "loss": 0.8487, + "step": 768 + }, + { + "epoch": 0.03643686330253494, + "grad_norm": 0.427734375, + "learning_rate": 0.000199361836761835, + "loss": 0.6685, + "step": 769 + }, + { + "epoch": 0.03648424543946932, + "grad_norm": 0.83984375, + "learning_rate": 0.0001993601558612457, + "loss": 0.7398, + "step": 770 + }, + { + "epoch": 0.03653162757640369, + "grad_norm": 0.5, + "learning_rate": 0.00019935847275695393, + "loss": 1.2331, + "step": 771 + }, + { + "epoch": 0.03657900971333807, + "grad_norm": 0.439453125, + "learning_rate": 0.00019935678744899705, + "loss": 1.2426, + "step": 772 + }, + { + "epoch": 0.036626391850272444, + "grad_norm": 0.1435546875, + "learning_rate": 0.00019935509993741245, + "loss": 0.0053, + "step": 773 + }, + { + "epoch": 0.03667377398720682, + "grad_norm": 0.470703125, + "learning_rate": 0.0001993534102222376, + "loss": 0.9312, + "step": 774 + }, + { + "epoch": 0.0367211561241412, + "grad_norm": 0.65234375, + "learning_rate": 0.0001993517183035099, + "loss": 1.0097, + "step": 775 + }, + { + "epoch": 0.036768538261075574, + "grad_norm": 0.052734375, + "learning_rate": 0.00019935002418126693, + "loss": 0.004, + "step": 776 + }, + { + "epoch": 0.03681592039800995, + "grad_norm": 0.56640625, + "learning_rate": 0.00019934832785554625, + "loss": 1.0161, + "step": 777 + }, + { + "epoch": 0.036863302534944326, + "grad_norm": 0.515625, + "learning_rate": 0.00019934662932638548, + "loss": 0.9238, + "step": 778 + }, + { + "epoch": 0.036910684671878705, + "grad_norm": 0.5078125, + "learning_rate": 0.00019934492859382226, + "loss": 0.1305, + "step": 779 + }, + { + "epoch": 0.03695806680881308, + "grad_norm": 0.5546875, + "learning_rate": 0.00019934322565789438, + "loss": 1.173, + "step": 780 + }, + { + "epoch": 0.037005448945747456, + "grad_norm": 0.83984375, + "learning_rate": 0.00019934152051863957, + "loss": 0.2643, + "step": 781 + }, + { + "epoch": 0.03705283108268183, + "grad_norm": 0.451171875, + "learning_rate": 0.00019933981317609562, + "loss": 1.4096, + "step": 782 + }, + { + "epoch": 0.03710021321961621, + "grad_norm": 0.4765625, + "learning_rate": 0.00019933810363030046, + "loss": 0.2747, + "step": 783 + }, + { + "epoch": 0.03714759535655058, + "grad_norm": 0.5078125, + "learning_rate": 0.00019933639188129195, + "loss": 0.9685, + "step": 784 + }, + { + "epoch": 0.03719497749348496, + "grad_norm": 0.6484375, + "learning_rate": 0.00019933467792910805, + "loss": 1.578, + "step": 785 + }, + { + "epoch": 0.03724235963041933, + "grad_norm": 0.68359375, + "learning_rate": 0.00019933296177378684, + "loss": 1.4234, + "step": 786 + }, + { + "epoch": 0.03728974176735371, + "grad_norm": 0.52734375, + "learning_rate": 0.00019933124341536633, + "loss": 1.0826, + "step": 787 + }, + { + "epoch": 0.03733712390428808, + "grad_norm": 0.59375, + "learning_rate": 0.00019932952285388463, + "loss": 0.0578, + "step": 788 + }, + { + "epoch": 0.03738450604122246, + "grad_norm": 0.392578125, + "learning_rate": 0.00019932780008937993, + "loss": 1.0469, + "step": 789 + }, + { + "epoch": 0.03743188817815683, + "grad_norm": 0.57421875, + "learning_rate": 0.00019932607512189042, + "loss": 0.0993, + "step": 790 + }, + { + "epoch": 0.03747927031509121, + "grad_norm": 0.83203125, + "learning_rate": 0.00019932434795145437, + "loss": 0.4258, + "step": 791 + }, + { + "epoch": 0.037526652452025584, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001993226185781101, + "loss": 0.0317, + "step": 792 + }, + { + "epoch": 0.03757403458895996, + "grad_norm": 0.65625, + "learning_rate": 0.0001993208870018959, + "loss": 0.2297, + "step": 793 + }, + { + "epoch": 0.037621416725894335, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019931915322285025, + "loss": 0.0231, + "step": 794 + }, + { + "epoch": 0.037668798862828715, + "grad_norm": 0.470703125, + "learning_rate": 0.00019931741724101153, + "loss": 0.7809, + "step": 795 + }, + { + "epoch": 0.03771618099976309, + "grad_norm": 0.48828125, + "learning_rate": 0.00019931567905641834, + "loss": 0.8968, + "step": 796 + }, + { + "epoch": 0.037763563136697466, + "grad_norm": 0.7578125, + "learning_rate": 0.00019931393866910914, + "loss": 0.3473, + "step": 797 + }, + { + "epoch": 0.03781094527363184, + "grad_norm": 0.33203125, + "learning_rate": 0.00019931219607912258, + "loss": 0.0396, + "step": 798 + }, + { + "epoch": 0.03785832741056622, + "grad_norm": 0.59765625, + "learning_rate": 0.00019931045128649725, + "loss": 0.4401, + "step": 799 + }, + { + "epoch": 0.03790570954750059, + "grad_norm": 0.5078125, + "learning_rate": 0.00019930870429127193, + "loss": 0.345, + "step": 800 + }, + { + "epoch": 0.03795309168443497, + "grad_norm": 0.52734375, + "learning_rate": 0.00019930695509348534, + "loss": 1.2678, + "step": 801 + }, + { + "epoch": 0.03800047382136934, + "grad_norm": 0.498046875, + "learning_rate": 0.00019930520369317622, + "loss": 0.1882, + "step": 802 + }, + { + "epoch": 0.03804785595830372, + "grad_norm": 0.59765625, + "learning_rate": 0.00019930345009038351, + "loss": 0.4099, + "step": 803 + }, + { + "epoch": 0.0380952380952381, + "grad_norm": 0.40234375, + "learning_rate": 0.000199301694285146, + "loss": 0.2211, + "step": 804 + }, + { + "epoch": 0.03814262023217247, + "grad_norm": 0.490234375, + "learning_rate": 0.00019929993627750272, + "loss": 0.8162, + "step": 805 + }, + { + "epoch": 0.03819000236910685, + "grad_norm": 0.435546875, + "learning_rate": 0.0001992981760674926, + "loss": 1.9835, + "step": 806 + }, + { + "epoch": 0.03823738450604122, + "grad_norm": 0.431640625, + "learning_rate": 0.00019929641365515474, + "loss": 0.9136, + "step": 807 + }, + { + "epoch": 0.0382847666429756, + "grad_norm": 1.3125, + "learning_rate": 0.00019929464904052812, + "loss": 0.6974, + "step": 808 + }, + { + "epoch": 0.03833214877990997, + "grad_norm": 0.4609375, + "learning_rate": 0.00019929288222365202, + "loss": 0.7456, + "step": 809 + }, + { + "epoch": 0.03837953091684435, + "grad_norm": 0.484375, + "learning_rate": 0.0001992911132045655, + "loss": 1.1999, + "step": 810 + }, + { + "epoch": 0.038426913053778725, + "grad_norm": 0.53125, + "learning_rate": 0.0001992893419833079, + "loss": 0.7235, + "step": 811 + }, + { + "epoch": 0.038474295190713104, + "grad_norm": 0.5234375, + "learning_rate": 0.0001992875685599184, + "loss": 1.0206, + "step": 812 + }, + { + "epoch": 0.038521677327647476, + "grad_norm": 0.890625, + "learning_rate": 0.0001992857929344364, + "loss": 0.3944, + "step": 813 + }, + { + "epoch": 0.038569059464581855, + "grad_norm": 0.52734375, + "learning_rate": 0.0001992840151069013, + "loss": 1.2379, + "step": 814 + }, + { + "epoch": 0.03861644160151623, + "grad_norm": 0.69140625, + "learning_rate": 0.00019928223507735248, + "loss": 1.1114, + "step": 815 + }, + { + "epoch": 0.038663823738450606, + "grad_norm": 0.37890625, + "learning_rate": 0.00019928045284582941, + "loss": 0.2301, + "step": 816 + }, + { + "epoch": 0.03871120587538498, + "grad_norm": 0.578125, + "learning_rate": 0.00019927866841237167, + "loss": 1.174, + "step": 817 + }, + { + "epoch": 0.03875858801231936, + "grad_norm": 0.55078125, + "learning_rate": 0.00019927688177701883, + "loss": 0.6894, + "step": 818 + }, + { + "epoch": 0.03880597014925373, + "grad_norm": 1.1875, + "learning_rate": 0.00019927509293981048, + "loss": 1.3543, + "step": 819 + }, + { + "epoch": 0.03885335228618811, + "grad_norm": 0.384765625, + "learning_rate": 0.0001992733019007863, + "loss": 0.2246, + "step": 820 + }, + { + "epoch": 0.03890073442312248, + "grad_norm": 0.78515625, + "learning_rate": 0.00019927150865998604, + "loss": 0.6539, + "step": 821 + }, + { + "epoch": 0.03894811656005686, + "grad_norm": 0.43359375, + "learning_rate": 0.00019926971321744942, + "loss": 1.4651, + "step": 822 + }, + { + "epoch": 0.03899549869699123, + "grad_norm": 0.423828125, + "learning_rate": 0.00019926791557321635, + "loss": 0.8091, + "step": 823 + }, + { + "epoch": 0.03904288083392561, + "grad_norm": 0.55078125, + "learning_rate": 0.00019926611572732662, + "loss": 1.3761, + "step": 824 + }, + { + "epoch": 0.03909026297085998, + "grad_norm": 0.625, + "learning_rate": 0.0001992643136798202, + "loss": 0.3683, + "step": 825 + }, + { + "epoch": 0.03913764510779436, + "grad_norm": 0.412109375, + "learning_rate": 0.00019926250943073698, + "loss": 0.5105, + "step": 826 + }, + { + "epoch": 0.039185027244728735, + "grad_norm": 1.390625, + "learning_rate": 0.0001992607029801171, + "loss": 0.542, + "step": 827 + }, + { + "epoch": 0.039232409381663114, + "grad_norm": 0.51953125, + "learning_rate": 0.0001992588943280005, + "loss": 0.2755, + "step": 828 + }, + { + "epoch": 0.039279791518597486, + "grad_norm": 0.3828125, + "learning_rate": 0.0001992570834744274, + "loss": 0.2822, + "step": 829 + }, + { + "epoch": 0.039327173655531865, + "grad_norm": 0.8515625, + "learning_rate": 0.0001992552704194379, + "loss": 0.9901, + "step": 830 + }, + { + "epoch": 0.03937455579246624, + "grad_norm": 0.0615234375, + "learning_rate": 0.00019925345516307217, + "loss": 0.0039, + "step": 831 + }, + { + "epoch": 0.039421937929400616, + "grad_norm": 0.478515625, + "learning_rate": 0.00019925163770537059, + "loss": 1.0106, + "step": 832 + }, + { + "epoch": 0.03946932006633499, + "grad_norm": 0.034423828125, + "learning_rate": 0.00019924981804637337, + "loss": 0.0026, + "step": 833 + }, + { + "epoch": 0.03951670220326937, + "grad_norm": 0.5234375, + "learning_rate": 0.0001992479961861209, + "loss": 0.9969, + "step": 834 + }, + { + "epoch": 0.039564084340203746, + "grad_norm": 1.125, + "learning_rate": 0.0001992461721246536, + "loss": 0.2964, + "step": 835 + }, + { + "epoch": 0.03961146647713812, + "grad_norm": 0.42578125, + "learning_rate": 0.00019924434586201191, + "loss": 1.2602, + "step": 836 + }, + { + "epoch": 0.0396588486140725, + "grad_norm": 0.396484375, + "learning_rate": 0.00019924251739823637, + "loss": 0.7571, + "step": 837 + }, + { + "epoch": 0.03970623075100687, + "grad_norm": 0.115234375, + "learning_rate": 0.00019924068673336746, + "loss": 0.0111, + "step": 838 + }, + { + "epoch": 0.03975361288794125, + "grad_norm": 0.3046875, + "learning_rate": 0.00019923885386744582, + "loss": 0.02, + "step": 839 + }, + { + "epoch": 0.03980099502487562, + "grad_norm": 0.5546875, + "learning_rate": 0.00019923701880051212, + "loss": 1.8461, + "step": 840 + }, + { + "epoch": 0.03984837716181, + "grad_norm": 0.486328125, + "learning_rate": 0.00019923518153260706, + "loss": 0.3594, + "step": 841 + }, + { + "epoch": 0.03989575929874437, + "grad_norm": 0.408203125, + "learning_rate": 0.00019923334206377135, + "loss": 0.2136, + "step": 842 + }, + { + "epoch": 0.03994314143567875, + "grad_norm": 0.388671875, + "learning_rate": 0.00019923150039404582, + "loss": 0.7874, + "step": 843 + }, + { + "epoch": 0.039990523572613124, + "grad_norm": 0.451171875, + "learning_rate": 0.00019922965652347134, + "loss": 1.266, + "step": 844 + }, + { + "epoch": 0.0400379057095475, + "grad_norm": 0.51953125, + "learning_rate": 0.00019922781045208875, + "loss": 1.0366, + "step": 845 + }, + { + "epoch": 0.040085287846481875, + "grad_norm": 0.4609375, + "learning_rate": 0.000199225962179939, + "loss": 1.495, + "step": 846 + }, + { + "epoch": 0.040132669983416254, + "grad_norm": 0.5390625, + "learning_rate": 0.00019922411170706313, + "loss": 0.9435, + "step": 847 + }, + { + "epoch": 0.040180052120350626, + "grad_norm": 0.4609375, + "learning_rate": 0.00019922225903350212, + "loss": 1.0605, + "step": 848 + }, + { + "epoch": 0.040227434257285005, + "grad_norm": 0.482421875, + "learning_rate": 0.0001992204041592971, + "loss": 1.4564, + "step": 849 + }, + { + "epoch": 0.04027481639421938, + "grad_norm": 0.59765625, + "learning_rate": 0.0001992185470844892, + "loss": 0.6306, + "step": 850 + }, + { + "epoch": 0.040322198531153756, + "grad_norm": 0.474609375, + "learning_rate": 0.00019921668780911963, + "loss": 0.2552, + "step": 851 + }, + { + "epoch": 0.04036958066808813, + "grad_norm": 0.310546875, + "learning_rate": 0.0001992148263332296, + "loss": 0.1815, + "step": 852 + }, + { + "epoch": 0.04041696280502251, + "grad_norm": 0.427734375, + "learning_rate": 0.0001992129626568604, + "loss": 1.2073, + "step": 853 + }, + { + "epoch": 0.04046434494195688, + "grad_norm": 0.04296875, + "learning_rate": 0.00019921109678005335, + "loss": 0.0031, + "step": 854 + }, + { + "epoch": 0.04051172707889126, + "grad_norm": 0.6875, + "learning_rate": 0.00019920922870284984, + "loss": 1.0907, + "step": 855 + }, + { + "epoch": 0.04055910921582563, + "grad_norm": 0.447265625, + "learning_rate": 0.0001992073584252913, + "loss": 0.8843, + "step": 856 + }, + { + "epoch": 0.04060649135276001, + "grad_norm": 0.400390625, + "learning_rate": 0.00019920548594741928, + "loss": 1.1248, + "step": 857 + }, + { + "epoch": 0.04065387348969438, + "grad_norm": 0.388671875, + "learning_rate": 0.00019920361126927522, + "loss": 0.2766, + "step": 858 + }, + { + "epoch": 0.04070125562662876, + "grad_norm": 0.439453125, + "learning_rate": 0.00019920173439090072, + "loss": 0.9496, + "step": 859 + }, + { + "epoch": 0.040748637763563134, + "grad_norm": 0.45703125, + "learning_rate": 0.00019919985531233742, + "loss": 0.2032, + "step": 860 + }, + { + "epoch": 0.04079601990049751, + "grad_norm": 0.435546875, + "learning_rate": 0.000199197974033627, + "loss": 0.7034, + "step": 861 + }, + { + "epoch": 0.040843402037431885, + "grad_norm": 0.455078125, + "learning_rate": 0.00019919609055481116, + "loss": 1.1872, + "step": 862 + }, + { + "epoch": 0.040890784174366264, + "grad_norm": 0.50390625, + "learning_rate": 0.0001991942048759317, + "loss": 1.3044, + "step": 863 + }, + { + "epoch": 0.04093816631130064, + "grad_norm": 0.58984375, + "learning_rate": 0.00019919231699703046, + "loss": 1.1731, + "step": 864 + }, + { + "epoch": 0.040985548448235015, + "grad_norm": 0.8984375, + "learning_rate": 0.00019919042691814924, + "loss": 0.2639, + "step": 865 + }, + { + "epoch": 0.041032930585169394, + "grad_norm": 0.4453125, + "learning_rate": 0.00019918853463933003, + "loss": 0.83, + "step": 866 + }, + { + "epoch": 0.041080312722103766, + "grad_norm": 0.435546875, + "learning_rate": 0.00019918664016061474, + "loss": 1.0699, + "step": 867 + }, + { + "epoch": 0.041127694859038146, + "grad_norm": 0.435546875, + "learning_rate": 0.00019918474348204544, + "loss": 1.2956, + "step": 868 + }, + { + "epoch": 0.04117507699597252, + "grad_norm": 0.51171875, + "learning_rate": 0.0001991828446036642, + "loss": 0.8589, + "step": 869 + }, + { + "epoch": 0.0412224591329069, + "grad_norm": 0.5390625, + "learning_rate": 0.00019918094352551312, + "loss": 0.6539, + "step": 870 + }, + { + "epoch": 0.04126984126984127, + "grad_norm": 0.318359375, + "learning_rate": 0.00019917904024763428, + "loss": 0.4389, + "step": 871 + }, + { + "epoch": 0.04131722340677565, + "grad_norm": 0.474609375, + "learning_rate": 0.00019917713477007003, + "loss": 1.028, + "step": 872 + }, + { + "epoch": 0.04136460554371002, + "grad_norm": 0.447265625, + "learning_rate": 0.00019917522709286256, + "loss": 0.2418, + "step": 873 + }, + { + "epoch": 0.0414119876806444, + "grad_norm": 0.318359375, + "learning_rate": 0.00019917331721605418, + "loss": 0.2146, + "step": 874 + }, + { + "epoch": 0.04145936981757877, + "grad_norm": 0.44921875, + "learning_rate": 0.00019917140513968725, + "loss": 1.2755, + "step": 875 + }, + { + "epoch": 0.04150675195451315, + "grad_norm": 0.43359375, + "learning_rate": 0.0001991694908638042, + "loss": 0.8287, + "step": 876 + }, + { + "epoch": 0.04155413409144752, + "grad_norm": 0.6484375, + "learning_rate": 0.0001991675743884475, + "loss": 0.2201, + "step": 877 + }, + { + "epoch": 0.0416015162283819, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001991656557136596, + "loss": 0.0061, + "step": 878 + }, + { + "epoch": 0.041648898365316274, + "grad_norm": 0.5703125, + "learning_rate": 0.00019916373483948308, + "loss": 1.242, + "step": 879 + }, + { + "epoch": 0.04169628050225065, + "grad_norm": 0.5625, + "learning_rate": 0.00019916181176596055, + "loss": 1.2381, + "step": 880 + }, + { + "epoch": 0.041743662639185025, + "grad_norm": 0.474609375, + "learning_rate": 0.00019915988649313467, + "loss": 1.2693, + "step": 881 + }, + { + "epoch": 0.041791044776119404, + "grad_norm": 0.515625, + "learning_rate": 0.0001991579590210481, + "loss": 1.3847, + "step": 882 + }, + { + "epoch": 0.041838426913053776, + "grad_norm": 0.48828125, + "learning_rate": 0.00019915602934974364, + "loss": 1.032, + "step": 883 + }, + { + "epoch": 0.041885809049988156, + "grad_norm": 0.6015625, + "learning_rate": 0.00019915409747926405, + "loss": 1.0213, + "step": 884 + }, + { + "epoch": 0.04193319118692253, + "grad_norm": 0.30859375, + "learning_rate": 0.0001991521634096522, + "loss": 0.7964, + "step": 885 + }, + { + "epoch": 0.04198057332385691, + "grad_norm": 0.447265625, + "learning_rate": 0.00019915022714095098, + "loss": 0.8511, + "step": 886 + }, + { + "epoch": 0.04202795546079128, + "grad_norm": 0.478515625, + "learning_rate": 0.00019914828867320335, + "loss": 0.734, + "step": 887 + }, + { + "epoch": 0.04207533759772566, + "grad_norm": 0.5546875, + "learning_rate": 0.00019914634800645225, + "loss": 0.9853, + "step": 888 + }, + { + "epoch": 0.04212271973466003, + "grad_norm": 0.451171875, + "learning_rate": 0.00019914440514074078, + "loss": 0.9674, + "step": 889 + }, + { + "epoch": 0.04217010187159441, + "grad_norm": 0.408203125, + "learning_rate": 0.000199142460076112, + "loss": 1.0232, + "step": 890 + }, + { + "epoch": 0.04221748400852878, + "grad_norm": 0.39453125, + "learning_rate": 0.00019914051281260905, + "loss": 0.8152, + "step": 891 + }, + { + "epoch": 0.04226486614546316, + "grad_norm": 0.298828125, + "learning_rate": 0.00019913856335027514, + "loss": 0.0387, + "step": 892 + }, + { + "epoch": 0.04231224828239753, + "grad_norm": 0.58984375, + "learning_rate": 0.0001991366116891535, + "loss": 0.8438, + "step": 893 + }, + { + "epoch": 0.04235963041933191, + "grad_norm": 0.74609375, + "learning_rate": 0.00019913465782928736, + "loss": 0.1963, + "step": 894 + }, + { + "epoch": 0.04240701255626629, + "grad_norm": 0.443359375, + "learning_rate": 0.00019913270177072015, + "loss": 0.8991, + "step": 895 + }, + { + "epoch": 0.04245439469320066, + "grad_norm": 0.26953125, + "learning_rate": 0.0001991307435134952, + "loss": 0.6034, + "step": 896 + }, + { + "epoch": 0.04250177683013504, + "grad_norm": 0.490234375, + "learning_rate": 0.00019912878305765593, + "loss": 1.2035, + "step": 897 + }, + { + "epoch": 0.042549158967069414, + "grad_norm": 0.46875, + "learning_rate": 0.00019912682040324587, + "loss": 1.2343, + "step": 898 + }, + { + "epoch": 0.04259654110400379, + "grad_norm": 0.62109375, + "learning_rate": 0.0001991248555503085, + "loss": 0.1443, + "step": 899 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 0.416015625, + "learning_rate": 0.00019912288849888743, + "loss": 1.281, + "step": 900 + }, + { + "epoch": 0.042691305377872545, + "grad_norm": 0.64453125, + "learning_rate": 0.00019912091924902624, + "loss": 0.0633, + "step": 901 + }, + { + "epoch": 0.04273868751480692, + "grad_norm": 0.470703125, + "learning_rate": 0.00019911894780076867, + "loss": 1.2478, + "step": 902 + }, + { + "epoch": 0.042786069651741296, + "grad_norm": 0.51171875, + "learning_rate": 0.0001991169741541584, + "loss": 0.7873, + "step": 903 + }, + { + "epoch": 0.04283345178867567, + "grad_norm": 0.3984375, + "learning_rate": 0.00019911499830923922, + "loss": 0.9015, + "step": 904 + }, + { + "epoch": 0.04288083392561005, + "grad_norm": 0.46484375, + "learning_rate": 0.00019911302026605495, + "loss": 1.2971, + "step": 905 + }, + { + "epoch": 0.04292821606254442, + "grad_norm": 0.421875, + "learning_rate": 0.00019911104002464947, + "loss": 0.965, + "step": 906 + }, + { + "epoch": 0.0429755981994788, + "grad_norm": 0.5546875, + "learning_rate": 0.00019910905758506667, + "loss": 1.2898, + "step": 907 + }, + { + "epoch": 0.04302298033641317, + "grad_norm": 0.6875, + "learning_rate": 0.00019910707294735057, + "loss": 0.9892, + "step": 908 + }, + { + "epoch": 0.04307036247334755, + "grad_norm": 0.5078125, + "learning_rate": 0.00019910508611154515, + "loss": 0.2499, + "step": 909 + }, + { + "epoch": 0.04311774461028192, + "grad_norm": 0.51171875, + "learning_rate": 0.00019910309707769447, + "loss": 0.1086, + "step": 910 + }, + { + "epoch": 0.0431651267472163, + "grad_norm": 0.43359375, + "learning_rate": 0.00019910110584584265, + "loss": 0.824, + "step": 911 + }, + { + "epoch": 0.04321250888415067, + "grad_norm": 0.451171875, + "learning_rate": 0.00019909911241603386, + "loss": 1.3195, + "step": 912 + }, + { + "epoch": 0.04325989102108505, + "grad_norm": 0.58203125, + "learning_rate": 0.0001990971167883123, + "loss": 0.1024, + "step": 913 + }, + { + "epoch": 0.043307273158019424, + "grad_norm": 0.42578125, + "learning_rate": 0.00019909511896272229, + "loss": 0.6601, + "step": 914 + }, + { + "epoch": 0.0433546552949538, + "grad_norm": 0.7734375, + "learning_rate": 0.00019909311893930807, + "loss": 0.3921, + "step": 915 + }, + { + "epoch": 0.043402037431888175, + "grad_norm": 0.51171875, + "learning_rate": 0.00019909111671811402, + "loss": 0.7832, + "step": 916 + }, + { + "epoch": 0.043449419568822555, + "grad_norm": 0.578125, + "learning_rate": 0.00019908911229918452, + "loss": 1.384, + "step": 917 + }, + { + "epoch": 0.04349680170575693, + "grad_norm": 1.1171875, + "learning_rate": 0.0001990871056825641, + "loss": 1.0337, + "step": 918 + }, + { + "epoch": 0.043544183842691306, + "grad_norm": 0.7578125, + "learning_rate": 0.0001990850968682972, + "loss": 0.2049, + "step": 919 + }, + { + "epoch": 0.04359156597962568, + "grad_norm": 1.6640625, + "learning_rate": 0.00019908308585642838, + "loss": 0.6315, + "step": 920 + }, + { + "epoch": 0.04363894811656006, + "grad_norm": 0.51171875, + "learning_rate": 0.00019908107264700225, + "loss": 1.143, + "step": 921 + }, + { + "epoch": 0.04368633025349443, + "grad_norm": 0.6640625, + "learning_rate": 0.0001990790572400635, + "loss": 0.3487, + "step": 922 + }, + { + "epoch": 0.04373371239042881, + "grad_norm": 0.427734375, + "learning_rate": 0.00019907703963565677, + "loss": 1.0009, + "step": 923 + }, + { + "epoch": 0.04378109452736319, + "grad_norm": 0.337890625, + "learning_rate": 0.00019907501983382683, + "loss": 0.0066, + "step": 924 + }, + { + "epoch": 0.04382847666429756, + "grad_norm": 0.74609375, + "learning_rate": 0.00019907299783461852, + "loss": 0.513, + "step": 925 + }, + { + "epoch": 0.04387585880123194, + "grad_norm": 0.416015625, + "learning_rate": 0.0001990709736380766, + "loss": 0.1052, + "step": 926 + }, + { + "epoch": 0.04392324093816631, + "grad_norm": 0.447265625, + "learning_rate": 0.000199068947244246, + "loss": 1.0993, + "step": 927 + }, + { + "epoch": 0.04397062307510069, + "grad_norm": 0.42578125, + "learning_rate": 0.00019906691865317173, + "loss": 0.859, + "step": 928 + }, + { + "epoch": 0.04401800521203506, + "grad_norm": 0.51171875, + "learning_rate": 0.00019906488786489867, + "loss": 1.018, + "step": 929 + }, + { + "epoch": 0.04406538734896944, + "grad_norm": 0.51171875, + "learning_rate": 0.00019906285487947197, + "loss": 1.2107, + "step": 930 + }, + { + "epoch": 0.04411276948590381, + "grad_norm": 0.625, + "learning_rate": 0.0001990608196969366, + "loss": 1.1488, + "step": 931 + }, + { + "epoch": 0.04416015162283819, + "grad_norm": 0.5390625, + "learning_rate": 0.00019905878231733781, + "loss": 1.1261, + "step": 932 + }, + { + "epoch": 0.044207533759772565, + "grad_norm": 0.71875, + "learning_rate": 0.00019905674274072076, + "loss": 0.1464, + "step": 933 + }, + { + "epoch": 0.044254915896706944, + "grad_norm": 1.265625, + "learning_rate": 0.0001990547009671306, + "loss": 0.7288, + "step": 934 + }, + { + "epoch": 0.044302298033641316, + "grad_norm": 0.375, + "learning_rate": 0.00019905265699661273, + "loss": 0.3238, + "step": 935 + }, + { + "epoch": 0.044349680170575695, + "grad_norm": 0.458984375, + "learning_rate": 0.00019905061082921242, + "loss": 0.8499, + "step": 936 + }, + { + "epoch": 0.04439706230751007, + "grad_norm": 0.53125, + "learning_rate": 0.00019904856246497508, + "loss": 1.4673, + "step": 937 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.494140625, + "learning_rate": 0.0001990465119039461, + "loss": 1.1882, + "step": 938 + }, + { + "epoch": 0.04449182658137882, + "grad_norm": 0.470703125, + "learning_rate": 0.000199044459146171, + "loss": 1.2626, + "step": 939 + }, + { + "epoch": 0.0445392087183132, + "grad_norm": 0.263671875, + "learning_rate": 0.0001990424041916953, + "loss": 0.0356, + "step": 940 + }, + { + "epoch": 0.04458659085524757, + "grad_norm": 0.60546875, + "learning_rate": 0.00019904034704056454, + "loss": 0.2159, + "step": 941 + }, + { + "epoch": 0.04463397299218195, + "grad_norm": 0.40234375, + "learning_rate": 0.0001990382876928244, + "loss": 0.8416, + "step": 942 + }, + { + "epoch": 0.04468135512911632, + "grad_norm": 0.4453125, + "learning_rate": 0.00019903622614852055, + "loss": 0.7343, + "step": 943 + }, + { + "epoch": 0.0447287372660507, + "grad_norm": 0.439453125, + "learning_rate": 0.00019903416240769865, + "loss": 0.2001, + "step": 944 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 0.50390625, + "learning_rate": 0.00019903209647040458, + "loss": 0.928, + "step": 945 + }, + { + "epoch": 0.04482350153991945, + "grad_norm": 0.34765625, + "learning_rate": 0.00019903002833668402, + "loss": 0.2404, + "step": 946 + }, + { + "epoch": 0.04487088367685382, + "grad_norm": 0.1201171875, + "learning_rate": 0.00019902795800658295, + "loss": 0.0152, + "step": 947 + }, + { + "epoch": 0.0449182658137882, + "grad_norm": 0.4765625, + "learning_rate": 0.00019902588548014724, + "loss": 0.8822, + "step": 948 + }, + { + "epoch": 0.044965647950722575, + "grad_norm": 0.439453125, + "learning_rate": 0.0001990238107574229, + "loss": 0.8807, + "step": 949 + }, + { + "epoch": 0.045013030087656954, + "grad_norm": 0.455078125, + "learning_rate": 0.00019902173383845587, + "loss": 0.8176, + "step": 950 + }, + { + "epoch": 0.045060412224591326, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019901965472329228, + "loss": 0.0183, + "step": 951 + }, + { + "epoch": 0.045107794361525705, + "grad_norm": 0.90625, + "learning_rate": 0.0001990175734119782, + "loss": 0.7378, + "step": 952 + }, + { + "epoch": 0.045155176498460084, + "grad_norm": 0.51171875, + "learning_rate": 0.0001990154899045598, + "loss": 1.3953, + "step": 953 + }, + { + "epoch": 0.045202558635394456, + "grad_norm": 0.56640625, + "learning_rate": 0.00019901340420108333, + "loss": 1.292, + "step": 954 + }, + { + "epoch": 0.045249940772328835, + "grad_norm": 0.5390625, + "learning_rate": 0.00019901131630159502, + "loss": 1.3624, + "step": 955 + }, + { + "epoch": 0.04529732290926321, + "grad_norm": 0.55078125, + "learning_rate": 0.00019900922620614119, + "loss": 1.2252, + "step": 956 + }, + { + "epoch": 0.045344705046197586, + "grad_norm": 0.7578125, + "learning_rate": 0.00019900713391476815, + "loss": 0.4908, + "step": 957 + }, + { + "epoch": 0.04539208718313196, + "grad_norm": 0.55859375, + "learning_rate": 0.00019900503942752235, + "loss": 1.1443, + "step": 958 + }, + { + "epoch": 0.04543946932006634, + "grad_norm": 0.46484375, + "learning_rate": 0.0001990029427444502, + "loss": 0.9862, + "step": 959 + }, + { + "epoch": 0.04548685145700071, + "grad_norm": 0.11279296875, + "learning_rate": 0.00019900084386559826, + "loss": 0.0205, + "step": 960 + }, + { + "epoch": 0.04553423359393509, + "grad_norm": 0.5703125, + "learning_rate": 0.00019899874279101306, + "loss": 0.1988, + "step": 961 + }, + { + "epoch": 0.04558161573086946, + "grad_norm": 0.55859375, + "learning_rate": 0.00019899663952074122, + "loss": 1.3923, + "step": 962 + }, + { + "epoch": 0.04562899786780384, + "grad_norm": 0.60546875, + "learning_rate": 0.00019899453405482933, + "loss": 1.2035, + "step": 963 + }, + { + "epoch": 0.04567638000473821, + "grad_norm": 1.0390625, + "learning_rate": 0.00019899242639332413, + "loss": 0.1353, + "step": 964 + }, + { + "epoch": 0.04572376214167259, + "grad_norm": 0.51171875, + "learning_rate": 0.0001989903165362723, + "loss": 0.1484, + "step": 965 + }, + { + "epoch": 0.045771144278606964, + "grad_norm": 0.5078125, + "learning_rate": 0.00019898820448372074, + "loss": 1.3354, + "step": 966 + }, + { + "epoch": 0.04581852641554134, + "grad_norm": 0.79296875, + "learning_rate": 0.00019898609023571626, + "loss": 0.4631, + "step": 967 + }, + { + "epoch": 0.045865908552475715, + "grad_norm": 1.078125, + "learning_rate": 0.0001989839737923057, + "loss": 0.4975, + "step": 968 + }, + { + "epoch": 0.045913290689410094, + "grad_norm": 0.431640625, + "learning_rate": 0.00019898185515353608, + "loss": 1.0471, + "step": 969 + }, + { + "epoch": 0.045960672826344466, + "grad_norm": 0.494140625, + "learning_rate": 0.0001989797343194543, + "loss": 0.8438, + "step": 970 + }, + { + "epoch": 0.046008054963278845, + "grad_norm": 0.578125, + "learning_rate": 0.00019897761129010743, + "loss": 1.1028, + "step": 971 + }, + { + "epoch": 0.04605543710021322, + "grad_norm": 0.400390625, + "learning_rate": 0.00019897548606554258, + "loss": 0.7336, + "step": 972 + }, + { + "epoch": 0.046102819237147596, + "grad_norm": 0.482421875, + "learning_rate": 0.0001989733586458069, + "loss": 0.954, + "step": 973 + }, + { + "epoch": 0.04615020137408197, + "grad_norm": 0.423828125, + "learning_rate": 0.00019897122903094752, + "loss": 1.1067, + "step": 974 + }, + { + "epoch": 0.04619758351101635, + "grad_norm": 0.53125, + "learning_rate": 0.0001989690972210117, + "loss": 1.6079, + "step": 975 + }, + { + "epoch": 0.04624496564795072, + "grad_norm": 0.6796875, + "learning_rate": 0.00019896696321604674, + "loss": 0.3096, + "step": 976 + }, + { + "epoch": 0.0462923477848851, + "grad_norm": 0.482421875, + "learning_rate": 0.00019896482701609993, + "loss": 0.9771, + "step": 977 + }, + { + "epoch": 0.04633972992181947, + "grad_norm": 0.474609375, + "learning_rate": 0.00019896268862121868, + "loss": 0.9484, + "step": 978 + }, + { + "epoch": 0.04638711205875385, + "grad_norm": 0.34375, + "learning_rate": 0.00019896054803145039, + "loss": 0.2152, + "step": 979 + }, + { + "epoch": 0.04643449419568822, + "grad_norm": 0.51171875, + "learning_rate": 0.00019895840524684257, + "loss": 0.9165, + "step": 980 + }, + { + "epoch": 0.0464818763326226, + "grad_norm": 0.486328125, + "learning_rate": 0.0001989562602674427, + "loss": 1.0838, + "step": 981 + }, + { + "epoch": 0.046529258469556974, + "grad_norm": 0.515625, + "learning_rate": 0.00019895411309329845, + "loss": 1.2539, + "step": 982 + }, + { + "epoch": 0.04657664060649135, + "grad_norm": 0.59765625, + "learning_rate": 0.0001989519637244573, + "loss": 1.1238, + "step": 983 + }, + { + "epoch": 0.04662402274342573, + "grad_norm": 0.48828125, + "learning_rate": 0.00019894981216096703, + "loss": 1.024, + "step": 984 + }, + { + "epoch": 0.046671404880360104, + "grad_norm": 0.50390625, + "learning_rate": 0.00019894765840287532, + "loss": 1.2483, + "step": 985 + }, + { + "epoch": 0.04671878701729448, + "grad_norm": 0.43359375, + "learning_rate": 0.00019894550245022993, + "loss": 1.0601, + "step": 986 + }, + { + "epoch": 0.046766169154228855, + "grad_norm": 0.59375, + "learning_rate": 0.00019894334430307868, + "loss": 0.9888, + "step": 987 + }, + { + "epoch": 0.046813551291163234, + "grad_norm": 0.44921875, + "learning_rate": 0.0001989411839614695, + "loss": 0.8472, + "step": 988 + }, + { + "epoch": 0.046860933428097606, + "grad_norm": 0.39453125, + "learning_rate": 0.00019893902142545015, + "loss": 0.0727, + "step": 989 + }, + { + "epoch": 0.046908315565031986, + "grad_norm": 0.80078125, + "learning_rate": 0.00019893685669506876, + "loss": 0.3552, + "step": 990 + }, + { + "epoch": 0.04695569770196636, + "grad_norm": 0.50390625, + "learning_rate": 0.00019893468977037325, + "loss": 0.9933, + "step": 991 + }, + { + "epoch": 0.04700307983890074, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001989325206514117, + "loss": 0.0244, + "step": 992 + }, + { + "epoch": 0.04705046197583511, + "grad_norm": 0.58203125, + "learning_rate": 0.00019893034933823222, + "loss": 0.9056, + "step": 993 + }, + { + "epoch": 0.04709784411276949, + "grad_norm": 1.015625, + "learning_rate": 0.000198928175830883, + "loss": 0.3743, + "step": 994 + }, + { + "epoch": 0.04714522624970386, + "grad_norm": 0.44921875, + "learning_rate": 0.00019892600012941217, + "loss": 0.8073, + "step": 995 + }, + { + "epoch": 0.04719260838663824, + "grad_norm": 0.490234375, + "learning_rate": 0.00019892382223386806, + "loss": 0.947, + "step": 996 + }, + { + "epoch": 0.04723999052357261, + "grad_norm": 0.498046875, + "learning_rate": 0.0001989216421442989, + "loss": 1.0194, + "step": 997 + }, + { + "epoch": 0.04728737266050699, + "grad_norm": 0.58984375, + "learning_rate": 0.0001989194598607531, + "loss": 1.2811, + "step": 998 + }, + { + "epoch": 0.04733475479744136, + "grad_norm": 0.48828125, + "learning_rate": 0.0001989172753832791, + "loss": 0.7862, + "step": 999 + }, + { + "epoch": 0.04738213693437574, + "grad_norm": 0.640625, + "learning_rate": 0.00019891508871192523, + "loss": 1.246, + "step": 1000 + }, + { + "epoch": 0.047429519071310114, + "grad_norm": 0.373046875, + "learning_rate": 0.0001989128998467401, + "loss": 0.2165, + "step": 1001 + }, + { + "epoch": 0.04747690120824449, + "grad_norm": 0.419921875, + "learning_rate": 0.00019891070878777213, + "loss": 1.1898, + "step": 1002 + }, + { + "epoch": 0.047524283345178865, + "grad_norm": 0.435546875, + "learning_rate": 0.00019890851553507006, + "loss": 0.3253, + "step": 1003 + }, + { + "epoch": 0.047571665482113244, + "grad_norm": 0.61328125, + "learning_rate": 0.00019890632008868244, + "loss": 0.8785, + "step": 1004 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.9921875, + "learning_rate": 0.000198904122448658, + "loss": 0.6022, + "step": 1005 + }, + { + "epoch": 0.047666429755981995, + "grad_norm": 0.48046875, + "learning_rate": 0.00019890192261504548, + "loss": 1.3854, + "step": 1006 + }, + { + "epoch": 0.04771381189291637, + "grad_norm": 0.23828125, + "learning_rate": 0.00019889972058789366, + "loss": 0.0278, + "step": 1007 + }, + { + "epoch": 0.04776119402985075, + "grad_norm": 0.59375, + "learning_rate": 0.00019889751636725138, + "loss": 1.2147, + "step": 1008 + }, + { + "epoch": 0.04780857616678512, + "grad_norm": 0.51171875, + "learning_rate": 0.00019889530995316753, + "loss": 1.0106, + "step": 1009 + }, + { + "epoch": 0.0478559583037195, + "grad_norm": 0.59765625, + "learning_rate": 0.00019889310134569104, + "loss": 0.8297, + "step": 1010 + }, + { + "epoch": 0.04790334044065387, + "grad_norm": 0.37109375, + "learning_rate": 0.00019889089054487088, + "loss": 0.8559, + "step": 1011 + }, + { + "epoch": 0.04795072257758825, + "grad_norm": 0.71484375, + "learning_rate": 0.00019888867755075613, + "loss": 0.322, + "step": 1012 + }, + { + "epoch": 0.04799810471452263, + "grad_norm": 0.0576171875, + "learning_rate": 0.00019888646236339584, + "loss": 0.0064, + "step": 1013 + }, + { + "epoch": 0.048045486851457, + "grad_norm": 1.1484375, + "learning_rate": 0.00019888424498283914, + "loss": 0.1653, + "step": 1014 + }, + { + "epoch": 0.04809286898839138, + "grad_norm": 1.453125, + "learning_rate": 0.00019888202540913522, + "loss": 0.8939, + "step": 1015 + }, + { + "epoch": 0.04814025112532575, + "grad_norm": 0.6015625, + "learning_rate": 0.00019887980364233327, + "loss": 0.6065, + "step": 1016 + }, + { + "epoch": 0.04818763326226013, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019887757968248263, + "loss": 0.007, + "step": 1017 + }, + { + "epoch": 0.0482350153991945, + "grad_norm": 0.875, + "learning_rate": 0.00019887535352963257, + "loss": 1.1738, + "step": 1018 + }, + { + "epoch": 0.04828239753612888, + "grad_norm": 0.74609375, + "learning_rate": 0.0001988731251838325, + "loss": 0.4478, + "step": 1019 + }, + { + "epoch": 0.048329779673063254, + "grad_norm": 0.82421875, + "learning_rate": 0.00019887089464513182, + "loss": 1.1147, + "step": 1020 + }, + { + "epoch": 0.04837716180999763, + "grad_norm": 0.490234375, + "learning_rate": 0.00019886866191358, + "loss": 0.625, + "step": 1021 + }, + { + "epoch": 0.048424543946932005, + "grad_norm": 0.458984375, + "learning_rate": 0.0001988664269892266, + "loss": 0.8374, + "step": 1022 + }, + { + "epoch": 0.048471926083866385, + "grad_norm": 0.453125, + "learning_rate": 0.00019886418987212113, + "loss": 0.7784, + "step": 1023 + }, + { + "epoch": 0.04851930822080076, + "grad_norm": 0.65625, + "learning_rate": 0.00019886195056231326, + "loss": 0.1788, + "step": 1024 + }, + { + "epoch": 0.048566690357735136, + "grad_norm": 1.0859375, + "learning_rate": 0.0001988597090598526, + "loss": 1.3, + "step": 1025 + }, + { + "epoch": 0.04861407249466951, + "grad_norm": 0.416015625, + "learning_rate": 0.0001988574653647889, + "loss": 0.0471, + "step": 1026 + }, + { + "epoch": 0.04866145463160389, + "grad_norm": 0.6484375, + "learning_rate": 0.00019885521947717193, + "loss": 0.835, + "step": 1027 + }, + { + "epoch": 0.04870883676853826, + "grad_norm": 0.466796875, + "learning_rate": 0.0001988529713970515, + "loss": 1.1535, + "step": 1028 + }, + { + "epoch": 0.04875621890547264, + "grad_norm": 0.435546875, + "learning_rate": 0.00019885072112447741, + "loss": 0.7651, + "step": 1029 + }, + { + "epoch": 0.04880360104240701, + "grad_norm": 0.6640625, + "learning_rate": 0.00019884846865949967, + "loss": 0.3877, + "step": 1030 + }, + { + "epoch": 0.04885098317934139, + "grad_norm": 0.3828125, + "learning_rate": 0.00019884621400216815, + "loss": 0.2153, + "step": 1031 + }, + { + "epoch": 0.04889836531627576, + "grad_norm": 0.67578125, + "learning_rate": 0.0001988439571525329, + "loss": 1.0758, + "step": 1032 + }, + { + "epoch": 0.04894574745321014, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019884169811064395, + "loss": 0.015, + "step": 1033 + }, + { + "epoch": 0.04899312959014451, + "grad_norm": 0.322265625, + "learning_rate": 0.00019883943687655143, + "loss": 0.0581, + "step": 1034 + }, + { + "epoch": 0.04904051172707889, + "grad_norm": 0.48828125, + "learning_rate": 0.0001988371734503055, + "loss": 1.1398, + "step": 1035 + }, + { + "epoch": 0.049087893864013264, + "grad_norm": 0.515625, + "learning_rate": 0.00019883490783195628, + "loss": 1.2845, + "step": 1036 + }, + { + "epoch": 0.04913527600094764, + "grad_norm": 0.58203125, + "learning_rate": 0.00019883264002155414, + "loss": 1.0436, + "step": 1037 + }, + { + "epoch": 0.049182658137882015, + "grad_norm": 0.66796875, + "learning_rate": 0.0001988303700191493, + "loss": 0.2564, + "step": 1038 + }, + { + "epoch": 0.049230040274816395, + "grad_norm": 0.494140625, + "learning_rate": 0.0001988280978247921, + "loss": 1.0179, + "step": 1039 + }, + { + "epoch": 0.04927742241175077, + "grad_norm": 0.53515625, + "learning_rate": 0.00019882582343853298, + "loss": 0.7744, + "step": 1040 + }, + { + "epoch": 0.049324804548685146, + "grad_norm": 0.65625, + "learning_rate": 0.00019882354686042236, + "loss": 1.5745, + "step": 1041 + }, + { + "epoch": 0.049372186685619525, + "grad_norm": 0.609375, + "learning_rate": 0.00019882126809051071, + "loss": 0.6615, + "step": 1042 + }, + { + "epoch": 0.0494195688225539, + "grad_norm": 0.4375, + "learning_rate": 0.0001988189871288486, + "loss": 0.9245, + "step": 1043 + }, + { + "epoch": 0.049466950959488276, + "grad_norm": 0.470703125, + "learning_rate": 0.00019881670397548664, + "loss": 0.8864, + "step": 1044 + }, + { + "epoch": 0.04951433309642265, + "grad_norm": 0.4140625, + "learning_rate": 0.00019881441863047543, + "loss": 0.6943, + "step": 1045 + }, + { + "epoch": 0.04956171523335703, + "grad_norm": 0.462890625, + "learning_rate": 0.00019881213109386567, + "loss": 1.1239, + "step": 1046 + }, + { + "epoch": 0.0496090973702914, + "grad_norm": 0.384765625, + "learning_rate": 0.00019880984136570805, + "loss": 1.1835, + "step": 1047 + }, + { + "epoch": 0.04965647950722578, + "grad_norm": 0.73046875, + "learning_rate": 0.00019880754944605344, + "loss": 0.058, + "step": 1048 + }, + { + "epoch": 0.04970386164416015, + "grad_norm": 0.455078125, + "learning_rate": 0.00019880525533495262, + "loss": 0.9848, + "step": 1049 + }, + { + "epoch": 0.04975124378109453, + "grad_norm": 0.478515625, + "learning_rate": 0.0001988029590324565, + "loss": 0.0888, + "step": 1050 + }, + { + "epoch": 0.0497986259180289, + "grad_norm": 0.490234375, + "learning_rate": 0.00019880066053861594, + "loss": 1.3071, + "step": 1051 + }, + { + "epoch": 0.04984600805496328, + "grad_norm": 0.70703125, + "learning_rate": 0.00019879835985348198, + "loss": 0.1956, + "step": 1052 + }, + { + "epoch": 0.04989339019189765, + "grad_norm": 0.52734375, + "learning_rate": 0.00019879605697710565, + "loss": 0.8406, + "step": 1053 + }, + { + "epoch": 0.04994077232883203, + "grad_norm": 0.306640625, + "learning_rate": 0.00019879375190953803, + "loss": 0.2062, + "step": 1054 + }, + { + "epoch": 0.049988154465766405, + "grad_norm": 0.85546875, + "learning_rate": 0.00019879144465083018, + "loss": 1.3425, + "step": 1055 + }, + { + "epoch": 0.050035536602700784, + "grad_norm": 0.546875, + "learning_rate": 0.00019878913520103334, + "loss": 0.5386, + "step": 1056 + }, + { + "epoch": 0.050082918739635156, + "grad_norm": 0.546875, + "learning_rate": 0.00019878682356019872, + "loss": 0.7189, + "step": 1057 + }, + { + "epoch": 0.050130300876569535, + "grad_norm": 0.5234375, + "learning_rate": 0.00019878450972837753, + "loss": 1.426, + "step": 1058 + }, + { + "epoch": 0.05017768301350391, + "grad_norm": 0.33203125, + "learning_rate": 0.00019878219370562117, + "loss": 0.7201, + "step": 1059 + }, + { + "epoch": 0.050225065150438286, + "grad_norm": 0.99609375, + "learning_rate": 0.00019877987549198097, + "loss": 0.0785, + "step": 1060 + }, + { + "epoch": 0.05027244728737266, + "grad_norm": 0.68359375, + "learning_rate": 0.00019877755508750832, + "loss": 0.1805, + "step": 1061 + }, + { + "epoch": 0.05031982942430704, + "grad_norm": 0.515625, + "learning_rate": 0.00019877523249225477, + "loss": 1.2644, + "step": 1062 + }, + { + "epoch": 0.05036721156124141, + "grad_norm": 0.431640625, + "learning_rate": 0.0001987729077062717, + "loss": 0.9881, + "step": 1063 + }, + { + "epoch": 0.05041459369817579, + "grad_norm": 0.453125, + "learning_rate": 0.0001987705807296108, + "loss": 0.7509, + "step": 1064 + }, + { + "epoch": 0.05046197583511016, + "grad_norm": 0.123046875, + "learning_rate": 0.0001987682515623236, + "loss": 0.0115, + "step": 1065 + }, + { + "epoch": 0.05050935797204454, + "grad_norm": 0.421875, + "learning_rate": 0.00019876592020446178, + "loss": 0.2251, + "step": 1066 + }, + { + "epoch": 0.05055674010897891, + "grad_norm": 0.55078125, + "learning_rate": 0.00019876358665607706, + "loss": 1.3754, + "step": 1067 + }, + { + "epoch": 0.05060412224591329, + "grad_norm": 0.53515625, + "learning_rate": 0.0001987612509172212, + "loss": 0.1373, + "step": 1068 + }, + { + "epoch": 0.05065150438284766, + "grad_norm": 0.185546875, + "learning_rate": 0.00019875891298794596, + "loss": 0.0087, + "step": 1069 + }, + { + "epoch": 0.05069888651978204, + "grad_norm": 0.54296875, + "learning_rate": 0.00019875657286830324, + "loss": 1.1929, + "step": 1070 + }, + { + "epoch": 0.050746268656716415, + "grad_norm": 0.515625, + "learning_rate": 0.00019875423055834492, + "loss": 0.0568, + "step": 1071 + }, + { + "epoch": 0.050793650793650794, + "grad_norm": 0.7578125, + "learning_rate": 0.00019875188605812297, + "loss": 0.7779, + "step": 1072 + }, + { + "epoch": 0.05084103293058517, + "grad_norm": 0.50390625, + "learning_rate": 0.00019874953936768936, + "loss": 1.5075, + "step": 1073 + }, + { + "epoch": 0.050888415067519545, + "grad_norm": 0.392578125, + "learning_rate": 0.00019874719048709616, + "loss": 1.1444, + "step": 1074 + }, + { + "epoch": 0.050935797204453924, + "grad_norm": 0.51171875, + "learning_rate": 0.00019874483941639546, + "loss": 1.0448, + "step": 1075 + }, + { + "epoch": 0.050983179341388296, + "grad_norm": 0.5546875, + "learning_rate": 0.00019874248615563936, + "loss": 1.245, + "step": 1076 + }, + { + "epoch": 0.051030561478322675, + "grad_norm": 0.50390625, + "learning_rate": 0.00019874013070488014, + "loss": 0.8945, + "step": 1077 + }, + { + "epoch": 0.05107794361525705, + "grad_norm": 0.484375, + "learning_rate": 0.00019873777306416996, + "loss": 0.7757, + "step": 1078 + }, + { + "epoch": 0.051125325752191426, + "grad_norm": 0.5078125, + "learning_rate": 0.00019873541323356118, + "loss": 1.0556, + "step": 1079 + }, + { + "epoch": 0.0511727078891258, + "grad_norm": 0.5703125, + "learning_rate": 0.00019873305121310609, + "loss": 0.1753, + "step": 1080 + }, + { + "epoch": 0.05122009002606018, + "grad_norm": 0.5078125, + "learning_rate": 0.00019873068700285704, + "loss": 1.4021, + "step": 1081 + }, + { + "epoch": 0.05126747216299455, + "grad_norm": 0.9140625, + "learning_rate": 0.00019872832060286656, + "loss": 0.8074, + "step": 1082 + }, + { + "epoch": 0.05131485429992893, + "grad_norm": 0.44921875, + "learning_rate": 0.00019872595201318708, + "loss": 0.9757, + "step": 1083 + }, + { + "epoch": 0.0513622364368633, + "grad_norm": 0.302734375, + "learning_rate": 0.00019872358123387116, + "loss": 0.1936, + "step": 1084 + }, + { + "epoch": 0.05140961857379768, + "grad_norm": 0.484375, + "learning_rate": 0.00019872120826497136, + "loss": 1.2161, + "step": 1085 + }, + { + "epoch": 0.05145700071073205, + "grad_norm": 0.53515625, + "learning_rate": 0.00019871883310654031, + "loss": 1.0108, + "step": 1086 + }, + { + "epoch": 0.05150438284766643, + "grad_norm": 0.44921875, + "learning_rate": 0.0001987164557586307, + "loss": 0.9926, + "step": 1087 + }, + { + "epoch": 0.051551764984600804, + "grad_norm": 0.65625, + "learning_rate": 0.00019871407622129523, + "loss": 1.2252, + "step": 1088 + }, + { + "epoch": 0.05159914712153518, + "grad_norm": 0.64453125, + "learning_rate": 0.0001987116944945867, + "loss": 0.2188, + "step": 1089 + }, + { + "epoch": 0.051646529258469555, + "grad_norm": 0.036865234375, + "learning_rate": 0.00019870931057855792, + "loss": 0.0031, + "step": 1090 + }, + { + "epoch": 0.051693911395403934, + "grad_norm": 0.51171875, + "learning_rate": 0.0001987069244732618, + "loss": 1.4619, + "step": 1091 + }, + { + "epoch": 0.051741293532338306, + "grad_norm": 0.5703125, + "learning_rate": 0.00019870453617875123, + "loss": 0.9762, + "step": 1092 + }, + { + "epoch": 0.051788675669272685, + "grad_norm": 1.5390625, + "learning_rate": 0.00019870214569507914, + "loss": 0.3705, + "step": 1093 + }, + { + "epoch": 0.05183605780620706, + "grad_norm": 0.53125, + "learning_rate": 0.00019869975302229864, + "loss": 1.2158, + "step": 1094 + }, + { + "epoch": 0.051883439943141436, + "grad_norm": 0.4296875, + "learning_rate": 0.0001986973581604627, + "loss": 0.9881, + "step": 1095 + }, + { + "epoch": 0.05193082208007581, + "grad_norm": 0.494140625, + "learning_rate": 0.00019869496110962452, + "loss": 1.101, + "step": 1096 + }, + { + "epoch": 0.05197820421701019, + "grad_norm": 0.9921875, + "learning_rate": 0.00019869256186983724, + "loss": 0.5085, + "step": 1097 + }, + { + "epoch": 0.05202558635394456, + "grad_norm": 0.41015625, + "learning_rate": 0.00019869016044115405, + "loss": 0.767, + "step": 1098 + }, + { + "epoch": 0.05207296849087894, + "grad_norm": 0.640625, + "learning_rate": 0.0001986877568236282, + "loss": 0.0881, + "step": 1099 + }, + { + "epoch": 0.05212035062781331, + "grad_norm": 0.86328125, + "learning_rate": 0.00019868535101731305, + "loss": 0.2183, + "step": 1100 + }, + { + "epoch": 0.05216773276474769, + "grad_norm": 0.4296875, + "learning_rate": 0.00019868294302226192, + "loss": 1.084, + "step": 1101 + }, + { + "epoch": 0.05221511490168207, + "grad_norm": 1.6953125, + "learning_rate": 0.0001986805328385282, + "loss": 0.9669, + "step": 1102 + }, + { + "epoch": 0.05226249703861644, + "grad_norm": 0.07177734375, + "learning_rate": 0.0001986781204661654, + "loss": 0.0065, + "step": 1103 + }, + { + "epoch": 0.05230987917555082, + "grad_norm": 0.640625, + "learning_rate": 0.00019867570590522698, + "loss": 1.0672, + "step": 1104 + }, + { + "epoch": 0.05235726131248519, + "grad_norm": 0.5078125, + "learning_rate": 0.0001986732891557665, + "loss": 0.205, + "step": 1105 + }, + { + "epoch": 0.05240464344941957, + "grad_norm": 0.69140625, + "learning_rate": 0.0001986708702178376, + "loss": 0.5019, + "step": 1106 + }, + { + "epoch": 0.052452025586353944, + "grad_norm": 0.419921875, + "learning_rate": 0.00019866844909149388, + "loss": 0.9741, + "step": 1107 + }, + { + "epoch": 0.05249940772328832, + "grad_norm": 0.5234375, + "learning_rate": 0.00019866602577678905, + "loss": 1.3385, + "step": 1108 + }, + { + "epoch": 0.052546789860222695, + "grad_norm": 0.306640625, + "learning_rate": 0.00019866360027377686, + "loss": 0.6885, + "step": 1109 + }, + { + "epoch": 0.052594171997157074, + "grad_norm": 1.09375, + "learning_rate": 0.00019866117258251112, + "loss": 0.1899, + "step": 1110 + }, + { + "epoch": 0.052641554134091446, + "grad_norm": 0.5390625, + "learning_rate": 0.00019865874270304565, + "loss": 0.7239, + "step": 1111 + }, + { + "epoch": 0.052688936271025825, + "grad_norm": 0.04638671875, + "learning_rate": 0.00019865631063543438, + "loss": 0.0036, + "step": 1112 + }, + { + "epoch": 0.0527363184079602, + "grad_norm": 0.55078125, + "learning_rate": 0.0001986538763797312, + "loss": 0.9317, + "step": 1113 + }, + { + "epoch": 0.05278370054489458, + "grad_norm": 0.29296875, + "learning_rate": 0.00019865143993599014, + "loss": 0.0368, + "step": 1114 + }, + { + "epoch": 0.05283108268182895, + "grad_norm": 0.5, + "learning_rate": 0.0001986490013042652, + "loss": 1.0516, + "step": 1115 + }, + { + "epoch": 0.05287846481876333, + "grad_norm": 0.4765625, + "learning_rate": 0.0001986465604846105, + "loss": 1.0051, + "step": 1116 + }, + { + "epoch": 0.0529258469556977, + "grad_norm": 0.6484375, + "learning_rate": 0.00019864411747708016, + "loss": 0.5711, + "step": 1117 + }, + { + "epoch": 0.05297322909263208, + "grad_norm": 0.51953125, + "learning_rate": 0.00019864167228172836, + "loss": 1.2666, + "step": 1118 + }, + { + "epoch": 0.05302061122956645, + "grad_norm": 0.5625, + "learning_rate": 0.00019863922489860936, + "loss": 1.4304, + "step": 1119 + }, + { + "epoch": 0.05306799336650083, + "grad_norm": 0.48046875, + "learning_rate": 0.00019863677532777738, + "loss": 0.9424, + "step": 1120 + }, + { + "epoch": 0.0531153755034352, + "grad_norm": 0.59765625, + "learning_rate": 0.0001986343235692868, + "loss": 1.1107, + "step": 1121 + }, + { + "epoch": 0.05316275764036958, + "grad_norm": 0.4609375, + "learning_rate": 0.000198631869623192, + "loss": 1.2194, + "step": 1122 + }, + { + "epoch": 0.053210139777303954, + "grad_norm": 0.5546875, + "learning_rate": 0.00019862941348954737, + "loss": 0.9687, + "step": 1123 + }, + { + "epoch": 0.05325752191423833, + "grad_norm": 0.423828125, + "learning_rate": 0.00019862695516840743, + "loss": 0.7604, + "step": 1124 + }, + { + "epoch": 0.053304904051172705, + "grad_norm": 0.470703125, + "learning_rate": 0.00019862449465982666, + "loss": 1.1728, + "step": 1125 + }, + { + "epoch": 0.053352286188107084, + "grad_norm": 0.5625, + "learning_rate": 0.00019862203196385964, + "loss": 1.0629, + "step": 1126 + }, + { + "epoch": 0.053399668325041456, + "grad_norm": 0.734375, + "learning_rate": 0.000198619567080561, + "loss": 0.3663, + "step": 1127 + }, + { + "epoch": 0.053447050461975835, + "grad_norm": 0.59375, + "learning_rate": 0.0001986171000099854, + "loss": 1.1313, + "step": 1128 + }, + { + "epoch": 0.05349443259891021, + "grad_norm": 0.494140625, + "learning_rate": 0.0001986146307521876, + "loss": 0.9301, + "step": 1129 + }, + { + "epoch": 0.05354181473584459, + "grad_norm": 0.55078125, + "learning_rate": 0.00019861215930722234, + "loss": 0.9257, + "step": 1130 + }, + { + "epoch": 0.05358919687277896, + "grad_norm": 0.52734375, + "learning_rate": 0.00019860968567514436, + "loss": 1.0074, + "step": 1131 + }, + { + "epoch": 0.05363657900971334, + "grad_norm": 0.57421875, + "learning_rate": 0.0001986072098560086, + "loss": 0.9213, + "step": 1132 + }, + { + "epoch": 0.05368396114664772, + "grad_norm": 0.4765625, + "learning_rate": 0.00019860473184987, + "loss": 0.7373, + "step": 1133 + }, + { + "epoch": 0.05373134328358209, + "grad_norm": 0.50390625, + "learning_rate": 0.00019860225165678345, + "loss": 1.1619, + "step": 1134 + }, + { + "epoch": 0.05377872542051647, + "grad_norm": 0.91015625, + "learning_rate": 0.00019859976927680397, + "loss": 1.2205, + "step": 1135 + }, + { + "epoch": 0.05382610755745084, + "grad_norm": 0.578125, + "learning_rate": 0.00019859728470998666, + "loss": 1.1632, + "step": 1136 + }, + { + "epoch": 0.05387348969438522, + "grad_norm": 0.06591796875, + "learning_rate": 0.00019859479795638658, + "loss": 0.0056, + "step": 1137 + }, + { + "epoch": 0.05392087183131959, + "grad_norm": 0.138671875, + "learning_rate": 0.0001985923090160589, + "loss": 0.0159, + "step": 1138 + }, + { + "epoch": 0.05396825396825397, + "grad_norm": 0.73828125, + "learning_rate": 0.00019858981788905883, + "loss": 1.1557, + "step": 1139 + }, + { + "epoch": 0.05401563610518834, + "grad_norm": 0.578125, + "learning_rate": 0.0001985873245754416, + "loss": 0.7595, + "step": 1140 + }, + { + "epoch": 0.05406301824212272, + "grad_norm": 0.53125, + "learning_rate": 0.00019858482907526254, + "loss": 1.2538, + "step": 1141 + }, + { + "epoch": 0.054110400379057094, + "grad_norm": 0.73828125, + "learning_rate": 0.00019858233138857697, + "loss": 0.5855, + "step": 1142 + }, + { + "epoch": 0.05415778251599147, + "grad_norm": 0.55859375, + "learning_rate": 0.0001985798315154403, + "loss": 0.5724, + "step": 1143 + }, + { + "epoch": 0.054205164652925845, + "grad_norm": 0.29296875, + "learning_rate": 0.00019857732945590794, + "loss": 0.2013, + "step": 1144 + }, + { + "epoch": 0.054252546789860225, + "grad_norm": 0.388671875, + "learning_rate": 0.00019857482521003545, + "loss": 1.0684, + "step": 1145 + }, + { + "epoch": 0.0542999289267946, + "grad_norm": 0.5390625, + "learning_rate": 0.0001985723187778783, + "loss": 0.821, + "step": 1146 + }, + { + "epoch": 0.054347311063728976, + "grad_norm": 0.50390625, + "learning_rate": 0.00019856981015949215, + "loss": 0.8093, + "step": 1147 + }, + { + "epoch": 0.05439469320066335, + "grad_norm": 0.5078125, + "learning_rate": 0.00019856729935493258, + "loss": 0.831, + "step": 1148 + }, + { + "epoch": 0.05444207533759773, + "grad_norm": 0.439453125, + "learning_rate": 0.0001985647863642553, + "loss": 0.9024, + "step": 1149 + }, + { + "epoch": 0.0544894574745321, + "grad_norm": 0.9140625, + "learning_rate": 0.00019856227118751605, + "loss": 0.8989, + "step": 1150 + }, + { + "epoch": 0.05453683961146648, + "grad_norm": 0.65234375, + "learning_rate": 0.0001985597538247706, + "loss": 0.4668, + "step": 1151 + }, + { + "epoch": 0.05458422174840085, + "grad_norm": 0.248046875, + "learning_rate": 0.00019855723427607483, + "loss": 0.0173, + "step": 1152 + }, + { + "epoch": 0.05463160388533523, + "grad_norm": 0.5390625, + "learning_rate": 0.0001985547125414845, + "loss": 1.0429, + "step": 1153 + }, + { + "epoch": 0.0546789860222696, + "grad_norm": 0.41015625, + "learning_rate": 0.00019855218862105568, + "loss": 1.1953, + "step": 1154 + }, + { + "epoch": 0.05472636815920398, + "grad_norm": 0.54296875, + "learning_rate": 0.0001985496625148443, + "loss": 0.9374, + "step": 1155 + }, + { + "epoch": 0.05477375029613835, + "grad_norm": 0.5234375, + "learning_rate": 0.00019854713422290637, + "loss": 1.2047, + "step": 1156 + }, + { + "epoch": 0.05482113243307273, + "grad_norm": 0.57421875, + "learning_rate": 0.00019854460374529794, + "loss": 1.514, + "step": 1157 + }, + { + "epoch": 0.054868514570007104, + "grad_norm": 0.859375, + "learning_rate": 0.00019854207108207521, + "loss": 0.2, + "step": 1158 + }, + { + "epoch": 0.05491589670694148, + "grad_norm": 0.953125, + "learning_rate": 0.00019853953623329428, + "loss": 0.8601, + "step": 1159 + }, + { + "epoch": 0.054963278843875855, + "grad_norm": 0.5703125, + "learning_rate": 0.00019853699919901137, + "loss": 0.6545, + "step": 1160 + }, + { + "epoch": 0.055010660980810235, + "grad_norm": 0.50390625, + "learning_rate": 0.0001985344599792828, + "loss": 1.0417, + "step": 1161 + }, + { + "epoch": 0.055058043117744614, + "grad_norm": 0.51953125, + "learning_rate": 0.00019853191857416487, + "loss": 1.3017, + "step": 1162 + }, + { + "epoch": 0.055105425254678986, + "grad_norm": 0.85546875, + "learning_rate": 0.00019852937498371394, + "loss": 0.6266, + "step": 1163 + }, + { + "epoch": 0.055152807391613365, + "grad_norm": 0.515625, + "learning_rate": 0.0001985268292079864, + "loss": 0.9562, + "step": 1164 + }, + { + "epoch": 0.05520018952854774, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019852428124703876, + "loss": 0.0217, + "step": 1165 + }, + { + "epoch": 0.055247571665482116, + "grad_norm": 0.5078125, + "learning_rate": 0.0001985217311009275, + "loss": 0.8682, + "step": 1166 + }, + { + "epoch": 0.05529495380241649, + "grad_norm": 0.447265625, + "learning_rate": 0.00019851917876970916, + "loss": 1.2461, + "step": 1167 + }, + { + "epoch": 0.05534233593935087, + "grad_norm": 0.396484375, + "learning_rate": 0.00019851662425344037, + "loss": 0.0585, + "step": 1168 + }, + { + "epoch": 0.05538971807628524, + "grad_norm": 0.6015625, + "learning_rate": 0.0001985140675521778, + "loss": 0.1793, + "step": 1169 + }, + { + "epoch": 0.05543710021321962, + "grad_norm": 0.341796875, + "learning_rate": 0.00019851150866597816, + "loss": 0.1714, + "step": 1170 + }, + { + "epoch": 0.05548448235015399, + "grad_norm": 0.546875, + "learning_rate": 0.0001985089475948982, + "loss": 0.1487, + "step": 1171 + }, + { + "epoch": 0.05553186448708837, + "grad_norm": 0.5390625, + "learning_rate": 0.00019850638433899467, + "loss": 0.7608, + "step": 1172 + }, + { + "epoch": 0.05557924662402274, + "grad_norm": 0.5234375, + "learning_rate": 0.00019850381889832447, + "loss": 1.2617, + "step": 1173 + }, + { + "epoch": 0.05562662876095712, + "grad_norm": 0.5546875, + "learning_rate": 0.0001985012512729445, + "loss": 1.1914, + "step": 1174 + }, + { + "epoch": 0.05567401089789149, + "grad_norm": 0.46484375, + "learning_rate": 0.00019849868146291168, + "loss": 1.0033, + "step": 1175 + }, + { + "epoch": 0.05572139303482587, + "grad_norm": 0.59765625, + "learning_rate": 0.00019849610946828306, + "loss": 0.8603, + "step": 1176 + }, + { + "epoch": 0.055768775171760245, + "grad_norm": 0.462890625, + "learning_rate": 0.0001984935352891156, + "loss": 0.8712, + "step": 1177 + }, + { + "epoch": 0.055816157308694624, + "grad_norm": 0.484375, + "learning_rate": 0.00019849095892546646, + "loss": 0.648, + "step": 1178 + }, + { + "epoch": 0.055863539445628996, + "grad_norm": 0.6171875, + "learning_rate": 0.00019848838037739275, + "loss": 1.5737, + "step": 1179 + }, + { + "epoch": 0.055910921582563375, + "grad_norm": 0.416015625, + "learning_rate": 0.0001984857996449517, + "loss": 0.2043, + "step": 1180 + }, + { + "epoch": 0.05595830371949775, + "grad_norm": 0.546875, + "learning_rate": 0.00019848321672820047, + "loss": 0.8374, + "step": 1181 + }, + { + "epoch": 0.056005685856432126, + "grad_norm": 0.490234375, + "learning_rate": 0.0001984806316271964, + "loss": 0.6035, + "step": 1182 + }, + { + "epoch": 0.0560530679933665, + "grad_norm": 1.1015625, + "learning_rate": 0.00019847804434199685, + "loss": 0.3609, + "step": 1183 + }, + { + "epoch": 0.05610045013030088, + "grad_norm": 0.42578125, + "learning_rate": 0.00019847545487265914, + "loss": 0.8299, + "step": 1184 + }, + { + "epoch": 0.05614783226723525, + "grad_norm": 0.396484375, + "learning_rate": 0.00019847286321924074, + "loss": 0.2107, + "step": 1185 + }, + { + "epoch": 0.05619521440416963, + "grad_norm": 0.65625, + "learning_rate": 0.00019847026938179914, + "loss": 1.2353, + "step": 1186 + }, + { + "epoch": 0.056242596541104, + "grad_norm": 0.09765625, + "learning_rate": 0.00019846767336039183, + "loss": 0.0066, + "step": 1187 + }, + { + "epoch": 0.05628997867803838, + "grad_norm": 1.609375, + "learning_rate": 0.0001984650751550764, + "loss": 0.1846, + "step": 1188 + }, + { + "epoch": 0.05633736081497275, + "grad_norm": 0.490234375, + "learning_rate": 0.0001984624747659105, + "loss": 0.9159, + "step": 1189 + }, + { + "epoch": 0.05638474295190713, + "grad_norm": 0.73046875, + "learning_rate": 0.0001984598721929518, + "loss": 0.0875, + "step": 1190 + }, + { + "epoch": 0.05643212508884151, + "grad_norm": 0.48828125, + "learning_rate": 0.00019845726743625798, + "loss": 1.0923, + "step": 1191 + }, + { + "epoch": 0.05647950722577588, + "grad_norm": 0.498046875, + "learning_rate": 0.00019845466049588686, + "loss": 1.2815, + "step": 1192 + }, + { + "epoch": 0.05652688936271026, + "grad_norm": 0.6015625, + "learning_rate": 0.00019845205137189624, + "loss": 1.1373, + "step": 1193 + }, + { + "epoch": 0.056574271499644634, + "grad_norm": 0.51171875, + "learning_rate": 0.00019844944006434397, + "loss": 1.4879, + "step": 1194 + }, + { + "epoch": 0.05662165363657901, + "grad_norm": 0.43359375, + "learning_rate": 0.000198446826573288, + "loss": 0.7608, + "step": 1195 + }, + { + "epoch": 0.056669035773513385, + "grad_norm": 1.078125, + "learning_rate": 0.00019844421089878627, + "loss": 0.2131, + "step": 1196 + }, + { + "epoch": 0.056716417910447764, + "grad_norm": 0.671875, + "learning_rate": 0.0001984415930408968, + "loss": 1.0517, + "step": 1197 + }, + { + "epoch": 0.056763800047382136, + "grad_norm": 0.60546875, + "learning_rate": 0.00019843897299967765, + "loss": 0.9982, + "step": 1198 + }, + { + "epoch": 0.056811182184316515, + "grad_norm": 0.52734375, + "learning_rate": 0.00019843635077518691, + "loss": 1.1964, + "step": 1199 + }, + { + "epoch": 0.05685856432125089, + "grad_norm": 0.65625, + "learning_rate": 0.00019843372636748282, + "loss": 1.0349, + "step": 1200 + }, + { + "epoch": 0.056905946458185266, + "grad_norm": 0.490234375, + "learning_rate": 0.00019843109977662348, + "loss": 0.8252, + "step": 1201 + }, + { + "epoch": 0.05695332859511964, + "grad_norm": 0.5234375, + "learning_rate": 0.00019842847100266718, + "loss": 0.7719, + "step": 1202 + }, + { + "epoch": 0.05700071073205402, + "grad_norm": 0.4609375, + "learning_rate": 0.00019842584004567225, + "loss": 1.2267, + "step": 1203 + }, + { + "epoch": 0.05704809286898839, + "grad_norm": 0.53125, + "learning_rate": 0.00019842320690569704, + "loss": 1.3288, + "step": 1204 + }, + { + "epoch": 0.05709547500592277, + "grad_norm": 0.52734375, + "learning_rate": 0.00019842057158279993, + "loss": 1.3921, + "step": 1205 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.62890625, + "learning_rate": 0.0001984179340770394, + "loss": 0.2586, + "step": 1206 + }, + { + "epoch": 0.05719023927979152, + "grad_norm": 0.8671875, + "learning_rate": 0.00019841529438847387, + "loss": 0.8311, + "step": 1207 + }, + { + "epoch": 0.05723762141672589, + "grad_norm": 0.443359375, + "learning_rate": 0.00019841265251716196, + "loss": 0.0966, + "step": 1208 + }, + { + "epoch": 0.05728500355366027, + "grad_norm": 0.44921875, + "learning_rate": 0.00019841000846316224, + "loss": 1.0222, + "step": 1209 + }, + { + "epoch": 0.057332385690594644, + "grad_norm": 0.67578125, + "learning_rate": 0.00019840736222653335, + "loss": 0.2326, + "step": 1210 + }, + { + "epoch": 0.05737976782752902, + "grad_norm": 0.46484375, + "learning_rate": 0.00019840471380733402, + "loss": 1.1139, + "step": 1211 + }, + { + "epoch": 0.057427149964463395, + "grad_norm": 0.419921875, + "learning_rate": 0.00019840206320562295, + "loss": 0.9745, + "step": 1212 + }, + { + "epoch": 0.057474532101397774, + "grad_norm": 0.4453125, + "learning_rate": 0.00019839941042145887, + "loss": 0.6654, + "step": 1213 + }, + { + "epoch": 0.057521914238332146, + "grad_norm": 0.6015625, + "learning_rate": 0.00019839675545490072, + "loss": 0.2254, + "step": 1214 + }, + { + "epoch": 0.057569296375266525, + "grad_norm": 0.482421875, + "learning_rate": 0.00019839409830600733, + "loss": 0.9748, + "step": 1215 + }, + { + "epoch": 0.0576166785122009, + "grad_norm": 0.34765625, + "learning_rate": 0.00019839143897483767, + "loss": 0.2524, + "step": 1216 + }, + { + "epoch": 0.057664060649135276, + "grad_norm": 0.330078125, + "learning_rate": 0.00019838877746145066, + "loss": 0.1985, + "step": 1217 + }, + { + "epoch": 0.05771144278606965, + "grad_norm": 0.39453125, + "learning_rate": 0.00019838611376590538, + "loss": 0.1742, + "step": 1218 + }, + { + "epoch": 0.05775882492300403, + "grad_norm": 0.486328125, + "learning_rate": 0.0001983834478882609, + "loss": 1.2711, + "step": 1219 + }, + { + "epoch": 0.0578062070599384, + "grad_norm": 0.515625, + "learning_rate": 0.00019838077982857634, + "loss": 0.7279, + "step": 1220 + }, + { + "epoch": 0.05785358919687278, + "grad_norm": 0.447265625, + "learning_rate": 0.00019837810958691087, + "loss": 0.6179, + "step": 1221 + }, + { + "epoch": 0.05790097133380716, + "grad_norm": 0.546875, + "learning_rate": 0.0001983754371633237, + "loss": 0.9773, + "step": 1222 + }, + { + "epoch": 0.05794835347074153, + "grad_norm": 0.404296875, + "learning_rate": 0.00019837276255787415, + "loss": 0.3931, + "step": 1223 + }, + { + "epoch": 0.05799573560767591, + "grad_norm": 0.515625, + "learning_rate": 0.0001983700857706215, + "loss": 1.3234, + "step": 1224 + }, + { + "epoch": 0.05804311774461028, + "grad_norm": 0.39453125, + "learning_rate": 0.00019836740680162512, + "loss": 0.8854, + "step": 1225 + }, + { + "epoch": 0.05809049988154466, + "grad_norm": 0.53515625, + "learning_rate": 0.00019836472565094442, + "loss": 0.9522, + "step": 1226 + }, + { + "epoch": 0.05813788201847903, + "grad_norm": 0.44921875, + "learning_rate": 0.00019836204231863888, + "loss": 1.0914, + "step": 1227 + }, + { + "epoch": 0.05818526415541341, + "grad_norm": 0.7890625, + "learning_rate": 0.00019835935680476803, + "loss": 1.223, + "step": 1228 + }, + { + "epoch": 0.058232646292347784, + "grad_norm": 0.439453125, + "learning_rate": 0.00019835666910939138, + "loss": 1.3825, + "step": 1229 + }, + { + "epoch": 0.05828002842928216, + "grad_norm": 0.5625, + "learning_rate": 0.0001983539792325686, + "loss": 1.5757, + "step": 1230 + }, + { + "epoch": 0.058327410566216535, + "grad_norm": 0.54296875, + "learning_rate": 0.00019835128717435934, + "loss": 0.8738, + "step": 1231 + }, + { + "epoch": 0.058374792703150914, + "grad_norm": 0.423828125, + "learning_rate": 0.00019834859293482328, + "loss": 0.8583, + "step": 1232 + }, + { + "epoch": 0.058422174840085286, + "grad_norm": 0.453125, + "learning_rate": 0.0001983458965140202, + "loss": 0.2097, + "step": 1233 + }, + { + "epoch": 0.058469556977019665, + "grad_norm": 0.46875, + "learning_rate": 0.00019834319791200983, + "loss": 0.9326, + "step": 1234 + }, + { + "epoch": 0.05851693911395404, + "grad_norm": 0.45703125, + "learning_rate": 0.0001983404971288521, + "loss": 1.0178, + "step": 1235 + }, + { + "epoch": 0.05856432125088842, + "grad_norm": 0.55078125, + "learning_rate": 0.00019833779416460692, + "loss": 1.1593, + "step": 1236 + }, + { + "epoch": 0.05861170338782279, + "grad_norm": 0.50390625, + "learning_rate": 0.0001983350890193342, + "loss": 1.1695, + "step": 1237 + }, + { + "epoch": 0.05865908552475717, + "grad_norm": 1.203125, + "learning_rate": 0.00019833238169309395, + "loss": 0.4912, + "step": 1238 + }, + { + "epoch": 0.05870646766169154, + "grad_norm": 0.64453125, + "learning_rate": 0.0001983296721859462, + "loss": 0.3485, + "step": 1239 + }, + { + "epoch": 0.05875384979862592, + "grad_norm": 0.5390625, + "learning_rate": 0.00019832696049795108, + "loss": 0.9958, + "step": 1240 + }, + { + "epoch": 0.05880123193556029, + "grad_norm": 0.49609375, + "learning_rate": 0.0001983242466291687, + "loss": 1.262, + "step": 1241 + }, + { + "epoch": 0.05884861407249467, + "grad_norm": 0.7265625, + "learning_rate": 0.00019832153057965926, + "loss": 0.1754, + "step": 1242 + }, + { + "epoch": 0.05889599620942904, + "grad_norm": 0.72265625, + "learning_rate": 0.00019831881234948296, + "loss": 0.0748, + "step": 1243 + }, + { + "epoch": 0.05894337834636342, + "grad_norm": 0.353515625, + "learning_rate": 0.00019831609193870015, + "loss": 0.2001, + "step": 1244 + }, + { + "epoch": 0.058990760483297794, + "grad_norm": 0.474609375, + "learning_rate": 0.00019831336934737117, + "loss": 1.4368, + "step": 1245 + }, + { + "epoch": 0.05903814262023217, + "grad_norm": 1.2265625, + "learning_rate": 0.00019831064457555636, + "loss": 0.8099, + "step": 1246 + }, + { + "epoch": 0.059085524757166545, + "grad_norm": 0.53125, + "learning_rate": 0.00019830791762331617, + "loss": 1.5033, + "step": 1247 + }, + { + "epoch": 0.059132906894100924, + "grad_norm": 0.57421875, + "learning_rate": 0.0001983051884907111, + "loss": 1.082, + "step": 1248 + }, + { + "epoch": 0.059180289031035296, + "grad_norm": 0.357421875, + "learning_rate": 0.0001983024571778016, + "loss": 0.442, + "step": 1249 + }, + { + "epoch": 0.059227671167969675, + "grad_norm": 0.4453125, + "learning_rate": 0.00019829972368464835, + "loss": 0.855, + "step": 1250 + }, + { + "epoch": 0.059275053304904055, + "grad_norm": 0.03271484375, + "learning_rate": 0.00019829698801131194, + "loss": 0.0025, + "step": 1251 + }, + { + "epoch": 0.05932243544183843, + "grad_norm": 0.69921875, + "learning_rate": 0.00019829425015785305, + "loss": 1.1192, + "step": 1252 + }, + { + "epoch": 0.059369817578772806, + "grad_norm": 0.3203125, + "learning_rate": 0.00019829151012433233, + "loss": 0.1857, + "step": 1253 + }, + { + "epoch": 0.05941719971570718, + "grad_norm": 0.419921875, + "learning_rate": 0.00019828876791081065, + "loss": 0.0894, + "step": 1254 + }, + { + "epoch": 0.05946458185264156, + "grad_norm": 0.41796875, + "learning_rate": 0.00019828602351734879, + "loss": 1.186, + "step": 1255 + }, + { + "epoch": 0.05951196398957593, + "grad_norm": 0.435546875, + "learning_rate": 0.0001982832769440076, + "loss": 0.7192, + "step": 1256 + }, + { + "epoch": 0.05955934612651031, + "grad_norm": 0.51953125, + "learning_rate": 0.00019828052819084803, + "loss": 1.0787, + "step": 1257 + }, + { + "epoch": 0.05960672826344468, + "grad_norm": 0.451171875, + "learning_rate": 0.00019827777725793104, + "loss": 1.3895, + "step": 1258 + }, + { + "epoch": 0.05965411040037906, + "grad_norm": 0.43359375, + "learning_rate": 0.0001982750241453176, + "loss": 0.7737, + "step": 1259 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 0.44140625, + "learning_rate": 0.0001982722688530688, + "loss": 0.8477, + "step": 1260 + }, + { + "epoch": 0.05974887467424781, + "grad_norm": 0.380859375, + "learning_rate": 0.00019826951138124578, + "loss": 0.0817, + "step": 1261 + }, + { + "epoch": 0.05979625681118218, + "grad_norm": 0.62109375, + "learning_rate": 0.00019826675172990966, + "loss": 1.0333, + "step": 1262 + }, + { + "epoch": 0.05984363894811656, + "grad_norm": 0.474609375, + "learning_rate": 0.00019826398989912167, + "loss": 0.9203, + "step": 1263 + }, + { + "epoch": 0.059891021085050934, + "grad_norm": 0.6171875, + "learning_rate": 0.00019826122588894305, + "loss": 0.1183, + "step": 1264 + }, + { + "epoch": 0.05993840322198531, + "grad_norm": 0.49609375, + "learning_rate": 0.00019825845969943505, + "loss": 1.0712, + "step": 1265 + }, + { + "epoch": 0.059985785358919685, + "grad_norm": 0.53515625, + "learning_rate": 0.00019825569133065913, + "loss": 0.9647, + "step": 1266 + }, + { + "epoch": 0.060033167495854065, + "grad_norm": 0.326171875, + "learning_rate": 0.0001982529207826766, + "loss": 0.3805, + "step": 1267 + }, + { + "epoch": 0.06008054963278844, + "grad_norm": 0.26953125, + "learning_rate": 0.000198250148055549, + "loss": 0.5593, + "step": 1268 + }, + { + "epoch": 0.060127931769722816, + "grad_norm": 0.474609375, + "learning_rate": 0.0001982473731493377, + "loss": 0.9391, + "step": 1269 + }, + { + "epoch": 0.06017531390665719, + "grad_norm": 0.48828125, + "learning_rate": 0.00019824459606410436, + "loss": 0.6489, + "step": 1270 + }, + { + "epoch": 0.06022269604359157, + "grad_norm": 0.447265625, + "learning_rate": 0.0001982418167999105, + "loss": 0.7921, + "step": 1271 + }, + { + "epoch": 0.06027007818052594, + "grad_norm": 0.6015625, + "learning_rate": 0.00019823903535681777, + "loss": 0.7933, + "step": 1272 + }, + { + "epoch": 0.06031746031746032, + "grad_norm": 0.69140625, + "learning_rate": 0.0001982362517348879, + "loss": 0.8601, + "step": 1273 + }, + { + "epoch": 0.06036484245439469, + "grad_norm": 0.55078125, + "learning_rate": 0.0001982334659341826, + "loss": 0.7573, + "step": 1274 + }, + { + "epoch": 0.06041222459132907, + "grad_norm": 0.58203125, + "learning_rate": 0.00019823067795476365, + "loss": 0.2582, + "step": 1275 + }, + { + "epoch": 0.06045960672826344, + "grad_norm": 0.42578125, + "learning_rate": 0.0001982278877966929, + "loss": 0.8929, + "step": 1276 + }, + { + "epoch": 0.06050698886519782, + "grad_norm": 0.427734375, + "learning_rate": 0.0001982250954600322, + "loss": 0.9258, + "step": 1277 + }, + { + "epoch": 0.06055437100213219, + "grad_norm": 0.443359375, + "learning_rate": 0.00019822230094484355, + "loss": 1.0504, + "step": 1278 + }, + { + "epoch": 0.06060175313906657, + "grad_norm": 0.5234375, + "learning_rate": 0.00019821950425118887, + "loss": 0.8885, + "step": 1279 + }, + { + "epoch": 0.06064913527600095, + "grad_norm": 0.478515625, + "learning_rate": 0.00019821670537913022, + "loss": 0.1725, + "step": 1280 + }, + { + "epoch": 0.06069651741293532, + "grad_norm": 0.408203125, + "learning_rate": 0.0001982139043287296, + "loss": 0.65, + "step": 1281 + }, + { + "epoch": 0.0607438995498697, + "grad_norm": 0.64453125, + "learning_rate": 0.0001982111011000493, + "loss": 0.0851, + "step": 1282 + }, + { + "epoch": 0.060791281686804075, + "grad_norm": 0.52734375, + "learning_rate": 0.0001982082956931513, + "loss": 1.096, + "step": 1283 + }, + { + "epoch": 0.060838663823738454, + "grad_norm": 0.302734375, + "learning_rate": 0.0001982054881080979, + "loss": 0.0401, + "step": 1284 + }, + { + "epoch": 0.060886045960672826, + "grad_norm": 0.515625, + "learning_rate": 0.00019820267834495144, + "loss": 1.1779, + "step": 1285 + }, + { + "epoch": 0.060933428097607205, + "grad_norm": 0.5859375, + "learning_rate": 0.00019819986640377414, + "loss": 1.2213, + "step": 1286 + }, + { + "epoch": 0.06098081023454158, + "grad_norm": 0.53515625, + "learning_rate": 0.00019819705228462842, + "loss": 1.2332, + "step": 1287 + }, + { + "epoch": 0.061028192371475956, + "grad_norm": 0.515625, + "learning_rate": 0.00019819423598757668, + "loss": 1.1858, + "step": 1288 + }, + { + "epoch": 0.06107557450841033, + "grad_norm": 0.60546875, + "learning_rate": 0.00019819141751268134, + "loss": 0.0969, + "step": 1289 + }, + { + "epoch": 0.06112295664534471, + "grad_norm": 0.453125, + "learning_rate": 0.00019818859686000496, + "loss": 1.1394, + "step": 1290 + }, + { + "epoch": 0.06117033878227908, + "grad_norm": 0.51953125, + "learning_rate": 0.00019818577402961013, + "loss": 1.091, + "step": 1291 + }, + { + "epoch": 0.06121772091921346, + "grad_norm": 0.81640625, + "learning_rate": 0.00019818294902155937, + "loss": 0.5775, + "step": 1292 + }, + { + "epoch": 0.06126510305614783, + "grad_norm": 0.5859375, + "learning_rate": 0.00019818012183591538, + "loss": 0.8449, + "step": 1293 + }, + { + "epoch": 0.06131248519308221, + "grad_norm": 0.61328125, + "learning_rate": 0.0001981772924727409, + "loss": 1.2039, + "step": 1294 + }, + { + "epoch": 0.06135986733001658, + "grad_norm": 0.515625, + "learning_rate": 0.0001981744609320986, + "loss": 0.0453, + "step": 1295 + }, + { + "epoch": 0.06140724946695096, + "grad_norm": 0.671875, + "learning_rate": 0.00019817162721405134, + "loss": 0.814, + "step": 1296 + }, + { + "epoch": 0.06145463160388533, + "grad_norm": 0.52734375, + "learning_rate": 0.00019816879131866197, + "loss": 0.6862, + "step": 1297 + }, + { + "epoch": 0.06150201374081971, + "grad_norm": 0.4765625, + "learning_rate": 0.00019816595324599335, + "loss": 0.1128, + "step": 1298 + }, + { + "epoch": 0.061549395877754084, + "grad_norm": 0.21875, + "learning_rate": 0.0001981631129961085, + "loss": 0.0284, + "step": 1299 + }, + { + "epoch": 0.061596778014688464, + "grad_norm": 0.61328125, + "learning_rate": 0.00019816027056907034, + "loss": 1.3758, + "step": 1300 + }, + { + "epoch": 0.061644160151622836, + "grad_norm": 0.8515625, + "learning_rate": 0.00019815742596494192, + "loss": 0.558, + "step": 1301 + }, + { + "epoch": 0.061691542288557215, + "grad_norm": 0.53125, + "learning_rate": 0.00019815457918378635, + "loss": 0.3337, + "step": 1302 + }, + { + "epoch": 0.06173892442549159, + "grad_norm": 0.63671875, + "learning_rate": 0.00019815173022566675, + "loss": 1.1537, + "step": 1303 + }, + { + "epoch": 0.061786306562425966, + "grad_norm": 0.52734375, + "learning_rate": 0.00019814887909064632, + "loss": 1.2278, + "step": 1304 + }, + { + "epoch": 0.06183368869936034, + "grad_norm": 0.5078125, + "learning_rate": 0.00019814602577878835, + "loss": 1.1332, + "step": 1305 + }, + { + "epoch": 0.06188107083629472, + "grad_norm": 0.546875, + "learning_rate": 0.000198143170290156, + "loss": 1.1716, + "step": 1306 + }, + { + "epoch": 0.06192845297322909, + "grad_norm": 0.515625, + "learning_rate": 0.0001981403126248127, + "loss": 0.3662, + "step": 1307 + }, + { + "epoch": 0.06197583511016347, + "grad_norm": 0.4609375, + "learning_rate": 0.0001981374527828218, + "loss": 1.4196, + "step": 1308 + }, + { + "epoch": 0.06202321724709784, + "grad_norm": 1.546875, + "learning_rate": 0.00019813459076424672, + "loss": 0.5595, + "step": 1309 + }, + { + "epoch": 0.06207059938403222, + "grad_norm": 0.4609375, + "learning_rate": 0.00019813172656915094, + "loss": 0.2389, + "step": 1310 + }, + { + "epoch": 0.0621179815209666, + "grad_norm": 0.45703125, + "learning_rate": 0.000198128860197598, + "loss": 0.6669, + "step": 1311 + }, + { + "epoch": 0.06216536365790097, + "grad_norm": 0.86328125, + "learning_rate": 0.00019812599164965148, + "loss": 0.5671, + "step": 1312 + }, + { + "epoch": 0.06221274579483535, + "grad_norm": 0.416015625, + "learning_rate": 0.00019812312092537497, + "loss": 1.0529, + "step": 1313 + }, + { + "epoch": 0.06226012793176972, + "grad_norm": 0.470703125, + "learning_rate": 0.00019812024802483212, + "loss": 0.9078, + "step": 1314 + }, + { + "epoch": 0.0623075100687041, + "grad_norm": 0.55859375, + "learning_rate": 0.00019811737294808673, + "loss": 1.1995, + "step": 1315 + }, + { + "epoch": 0.062354892205638474, + "grad_norm": 0.50390625, + "learning_rate": 0.00019811449569520252, + "loss": 1.0019, + "step": 1316 + }, + { + "epoch": 0.06240227434257285, + "grad_norm": 0.486328125, + "learning_rate": 0.00019811161626624328, + "loss": 0.8394, + "step": 1317 + }, + { + "epoch": 0.062449656479507225, + "grad_norm": 0.58203125, + "learning_rate": 0.00019810873466127288, + "loss": 1.4806, + "step": 1318 + }, + { + "epoch": 0.062497038616441604, + "grad_norm": 0.5703125, + "learning_rate": 0.00019810585088035526, + "loss": 0.7276, + "step": 1319 + }, + { + "epoch": 0.06254442075337598, + "grad_norm": 0.5078125, + "learning_rate": 0.0001981029649235544, + "loss": 0.8709, + "step": 1320 + }, + { + "epoch": 0.06259180289031036, + "grad_norm": 0.412109375, + "learning_rate": 0.00019810007679093422, + "loss": 0.0994, + "step": 1321 + }, + { + "epoch": 0.06263918502724473, + "grad_norm": 0.57421875, + "learning_rate": 0.00019809718648255888, + "loss": 1.3156, + "step": 1322 + }, + { + "epoch": 0.0626865671641791, + "grad_norm": 0.4765625, + "learning_rate": 0.00019809429399849238, + "loss": 0.8856, + "step": 1323 + }, + { + "epoch": 0.06273394930111348, + "grad_norm": 0.486328125, + "learning_rate": 0.00019809139933879897, + "loss": 1.0598, + "step": 1324 + }, + { + "epoch": 0.06278133143804786, + "grad_norm": 0.59375, + "learning_rate": 0.00019808850250354278, + "loss": 1.2765, + "step": 1325 + }, + { + "epoch": 0.06282871357498224, + "grad_norm": 0.53125, + "learning_rate": 0.00019808560349278808, + "loss": 1.4646, + "step": 1326 + }, + { + "epoch": 0.0628760957119166, + "grad_norm": 0.470703125, + "learning_rate": 0.0001980827023065992, + "loss": 1.0001, + "step": 1327 + }, + { + "epoch": 0.06292347784885098, + "grad_norm": 0.275390625, + "learning_rate": 0.00019807979894504043, + "loss": 0.1755, + "step": 1328 + }, + { + "epoch": 0.06297085998578536, + "grad_norm": 0.5, + "learning_rate": 0.00019807689340817618, + "loss": 1.2925, + "step": 1329 + }, + { + "epoch": 0.06301824212271974, + "grad_norm": 0.65234375, + "learning_rate": 0.0001980739856960709, + "loss": 0.8089, + "step": 1330 + }, + { + "epoch": 0.0630656242596541, + "grad_norm": 0.609375, + "learning_rate": 0.0001980710758087891, + "loss": 1.3729, + "step": 1331 + }, + { + "epoch": 0.06311300639658848, + "grad_norm": 0.35546875, + "learning_rate": 0.0001980681637463953, + "loss": 0.2018, + "step": 1332 + }, + { + "epoch": 0.06316038853352286, + "grad_norm": 0.84765625, + "learning_rate": 0.00019806524950895406, + "loss": 0.626, + "step": 1333 + }, + { + "epoch": 0.06320777067045724, + "grad_norm": 0.50390625, + "learning_rate": 0.00019806233309653008, + "loss": 1.0182, + "step": 1334 + }, + { + "epoch": 0.06325515280739161, + "grad_norm": 0.640625, + "learning_rate": 0.00019805941450918798, + "loss": 1.1142, + "step": 1335 + }, + { + "epoch": 0.06330253494432599, + "grad_norm": 0.91015625, + "learning_rate": 0.0001980564937469925, + "loss": 0.4311, + "step": 1336 + }, + { + "epoch": 0.06334991708126037, + "grad_norm": 0.3125, + "learning_rate": 0.00019805357081000845, + "loss": 0.3452, + "step": 1337 + }, + { + "epoch": 0.06339729921819474, + "grad_norm": 0.451171875, + "learning_rate": 0.00019805064569830067, + "loss": 0.0286, + "step": 1338 + }, + { + "epoch": 0.06344468135512911, + "grad_norm": 0.671875, + "learning_rate": 0.000198047718411934, + "loss": 1.5651, + "step": 1339 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 0.478515625, + "learning_rate": 0.00019804478895097335, + "loss": 0.9273, + "step": 1340 + }, + { + "epoch": 0.06353944562899787, + "grad_norm": 0.361328125, + "learning_rate": 0.00019804185731548367, + "loss": 0.4678, + "step": 1341 + }, + { + "epoch": 0.06358682776593225, + "grad_norm": 0.5859375, + "learning_rate": 0.0001980389235055301, + "loss": 0.9688, + "step": 1342 + }, + { + "epoch": 0.06363420990286663, + "grad_norm": 1.09375, + "learning_rate": 0.0001980359875211776, + "loss": 0.4255, + "step": 1343 + }, + { + "epoch": 0.06368159203980099, + "grad_norm": 0.5546875, + "learning_rate": 0.00019803304936249133, + "loss": 1.298, + "step": 1344 + }, + { + "epoch": 0.06372897417673537, + "grad_norm": 0.5625, + "learning_rate": 0.00019803010902953643, + "loss": 0.4256, + "step": 1345 + }, + { + "epoch": 0.06377635631366975, + "grad_norm": 0.5078125, + "learning_rate": 0.00019802716652237811, + "loss": 1.1817, + "step": 1346 + }, + { + "epoch": 0.06382373845060413, + "grad_norm": 0.6640625, + "learning_rate": 0.0001980242218410817, + "loss": 0.2303, + "step": 1347 + }, + { + "epoch": 0.0638711205875385, + "grad_norm": 0.470703125, + "learning_rate": 0.00019802127498571244, + "loss": 0.8684, + "step": 1348 + }, + { + "epoch": 0.06391850272447287, + "grad_norm": 0.5078125, + "learning_rate": 0.00019801832595633568, + "loss": 1.2578, + "step": 1349 + }, + { + "epoch": 0.06396588486140725, + "grad_norm": 0.48046875, + "learning_rate": 0.0001980153747530169, + "loss": 0.9623, + "step": 1350 + }, + { + "epoch": 0.06401326699834163, + "grad_norm": 0.39453125, + "learning_rate": 0.00019801242137582148, + "loss": 0.8354, + "step": 1351 + }, + { + "epoch": 0.064060649135276, + "grad_norm": 0.52734375, + "learning_rate": 0.00019800946582481497, + "loss": 0.9811, + "step": 1352 + }, + { + "epoch": 0.06410803127221038, + "grad_norm": 0.4765625, + "learning_rate": 0.0001980065081000629, + "loss": 0.7545, + "step": 1353 + }, + { + "epoch": 0.06415541340914475, + "grad_norm": 0.5859375, + "learning_rate": 0.00019800354820163088, + "loss": 0.7627, + "step": 1354 + }, + { + "epoch": 0.06420279554607913, + "grad_norm": 0.470703125, + "learning_rate": 0.00019800058612958453, + "loss": 0.6435, + "step": 1355 + }, + { + "epoch": 0.0642501776830135, + "grad_norm": 1.421875, + "learning_rate": 0.00019799762188398953, + "loss": 0.9989, + "step": 1356 + }, + { + "epoch": 0.06429755981994788, + "grad_norm": 0.484375, + "learning_rate": 0.00019799465546491173, + "loss": 0.8533, + "step": 1357 + }, + { + "epoch": 0.06434494195688226, + "grad_norm": 0.306640625, + "learning_rate": 0.0001979916868724168, + "loss": 0.2298, + "step": 1358 + }, + { + "epoch": 0.06439232409381664, + "grad_norm": 0.51171875, + "learning_rate": 0.00019798871610657068, + "loss": 1.1798, + "step": 1359 + }, + { + "epoch": 0.064439706230751, + "grad_norm": 0.408203125, + "learning_rate": 0.00019798574316743915, + "loss": 0.8694, + "step": 1360 + }, + { + "epoch": 0.06448708836768538, + "grad_norm": 0.4609375, + "learning_rate": 0.00019798276805508826, + "loss": 1.2998, + "step": 1361 + }, + { + "epoch": 0.06453447050461976, + "grad_norm": 0.609375, + "learning_rate": 0.0001979797907695839, + "loss": 0.8954, + "step": 1362 + }, + { + "epoch": 0.06458185264155414, + "grad_norm": 0.51171875, + "learning_rate": 0.0001979768113109922, + "loss": 1.4044, + "step": 1363 + }, + { + "epoch": 0.0646292347784885, + "grad_norm": 0.5, + "learning_rate": 0.00019797382967937912, + "loss": 0.919, + "step": 1364 + }, + { + "epoch": 0.06467661691542288, + "grad_norm": 0.390625, + "learning_rate": 0.0001979708458748109, + "loss": 0.0239, + "step": 1365 + }, + { + "epoch": 0.06472399905235726, + "grad_norm": 0.53125, + "learning_rate": 0.00019796785989735364, + "loss": 1.1159, + "step": 1366 + }, + { + "epoch": 0.06477138118929164, + "grad_norm": 0.5625, + "learning_rate": 0.00019796487174707363, + "loss": 0.0594, + "step": 1367 + }, + { + "epoch": 0.064818763326226, + "grad_norm": 0.486328125, + "learning_rate": 0.00019796188142403708, + "loss": 1.0021, + "step": 1368 + }, + { + "epoch": 0.06486614546316039, + "grad_norm": 0.78125, + "learning_rate": 0.0001979588889283104, + "loss": 0.4291, + "step": 1369 + }, + { + "epoch": 0.06491352760009476, + "grad_norm": 0.478515625, + "learning_rate": 0.00019795589425995985, + "loss": 1.3191, + "step": 1370 + }, + { + "epoch": 0.06496090973702914, + "grad_norm": 0.5390625, + "learning_rate": 0.0001979528974190519, + "loss": 1.1967, + "step": 1371 + }, + { + "epoch": 0.06500829187396352, + "grad_norm": 0.458984375, + "learning_rate": 0.00019794989840565307, + "loss": 1.1394, + "step": 1372 + }, + { + "epoch": 0.06505567401089789, + "grad_norm": 0.8359375, + "learning_rate": 0.00019794689721982977, + "loss": 0.9012, + "step": 1373 + }, + { + "epoch": 0.06510305614783227, + "grad_norm": 0.453125, + "learning_rate": 0.00019794389386164864, + "loss": 0.1004, + "step": 1374 + }, + { + "epoch": 0.06515043828476665, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019794088833117627, + "loss": 0.4262, + "step": 1375 + }, + { + "epoch": 0.06519782042170102, + "grad_norm": 0.546875, + "learning_rate": 0.00019793788062847932, + "loss": 1.051, + "step": 1376 + }, + { + "epoch": 0.06524520255863539, + "grad_norm": 0.419921875, + "learning_rate": 0.00019793487075362448, + "loss": 0.4535, + "step": 1377 + }, + { + "epoch": 0.06529258469556977, + "grad_norm": 0.828125, + "learning_rate": 0.00019793185870667856, + "loss": 0.2997, + "step": 1378 + }, + { + "epoch": 0.06533996683250415, + "grad_norm": 0.458984375, + "learning_rate": 0.00019792884448770827, + "loss": 0.7945, + "step": 1379 + }, + { + "epoch": 0.06538734896943853, + "grad_norm": 0.484375, + "learning_rate": 0.00019792582809678057, + "loss": 0.6099, + "step": 1380 + }, + { + "epoch": 0.06543473110637289, + "grad_norm": 0.85546875, + "learning_rate": 0.0001979228095339623, + "loss": 0.5786, + "step": 1381 + }, + { + "epoch": 0.06548211324330727, + "grad_norm": 0.23828125, + "learning_rate": 0.00019791978879932042, + "loss": 0.0349, + "step": 1382 + }, + { + "epoch": 0.06552949538024165, + "grad_norm": 0.62890625, + "learning_rate": 0.00019791676589292189, + "loss": 0.3752, + "step": 1383 + }, + { + "epoch": 0.06557687751717603, + "grad_norm": 0.5078125, + "learning_rate": 0.0001979137408148338, + "loss": 0.8369, + "step": 1384 + }, + { + "epoch": 0.0656242596541104, + "grad_norm": 0.052978515625, + "learning_rate": 0.00019791071356512326, + "loss": 0.0036, + "step": 1385 + }, + { + "epoch": 0.06567164179104477, + "grad_norm": 0.3125, + "learning_rate": 0.00019790768414385736, + "loss": 0.0213, + "step": 1386 + }, + { + "epoch": 0.06571902392797915, + "grad_norm": 0.353515625, + "learning_rate": 0.00019790465255110334, + "loss": 0.1986, + "step": 1387 + }, + { + "epoch": 0.06576640606491353, + "grad_norm": 0.61328125, + "learning_rate": 0.00019790161878692836, + "loss": 1.0587, + "step": 1388 + }, + { + "epoch": 0.0658137882018479, + "grad_norm": 0.47265625, + "learning_rate": 0.0001978985828513998, + "loss": 1.2339, + "step": 1389 + }, + { + "epoch": 0.06586117033878228, + "grad_norm": 0.66015625, + "learning_rate": 0.00019789554474458493, + "loss": 1.2089, + "step": 1390 + }, + { + "epoch": 0.06590855247571666, + "grad_norm": 0.546875, + "learning_rate": 0.00019789250446655116, + "loss": 1.0333, + "step": 1391 + }, + { + "epoch": 0.06595593461265103, + "grad_norm": 1.296875, + "learning_rate": 0.0001978894620173659, + "loss": 0.8205, + "step": 1392 + }, + { + "epoch": 0.0660033167495854, + "grad_norm": 1.4765625, + "learning_rate": 0.00019788641739709663, + "loss": 0.6403, + "step": 1393 + }, + { + "epoch": 0.06605069888651978, + "grad_norm": 0.5546875, + "learning_rate": 0.00019788337060581092, + "loss": 0.7673, + "step": 1394 + }, + { + "epoch": 0.06609808102345416, + "grad_norm": 0.392578125, + "learning_rate": 0.00019788032164357627, + "loss": 0.7747, + "step": 1395 + }, + { + "epoch": 0.06614546316038854, + "grad_norm": 0.412109375, + "learning_rate": 0.0001978772705104604, + "loss": 0.916, + "step": 1396 + }, + { + "epoch": 0.0661928452973229, + "grad_norm": 0.30859375, + "learning_rate": 0.00019787421720653087, + "loss": 0.1706, + "step": 1397 + }, + { + "epoch": 0.06624022743425728, + "grad_norm": 0.5546875, + "learning_rate": 0.00019787116173185546, + "loss": 1.6059, + "step": 1398 + }, + { + "epoch": 0.06628760957119166, + "grad_norm": 0.44921875, + "learning_rate": 0.00019786810408650195, + "loss": 0.4818, + "step": 1399 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 0.7578125, + "learning_rate": 0.00019786504427053814, + "loss": 0.3894, + "step": 1400 + }, + { + "epoch": 0.06638237384506042, + "grad_norm": 0.61328125, + "learning_rate": 0.0001978619822840319, + "loss": 0.8414, + "step": 1401 + }, + { + "epoch": 0.06642975598199478, + "grad_norm": 0.546875, + "learning_rate": 0.00019785891812705106, + "loss": 0.6433, + "step": 1402 + }, + { + "epoch": 0.06647713811892916, + "grad_norm": 0.63671875, + "learning_rate": 0.00019785585179966372, + "loss": 1.1861, + "step": 1403 + }, + { + "epoch": 0.06652452025586354, + "grad_norm": 0.43359375, + "learning_rate": 0.0001978527833019378, + "loss": 0.5291, + "step": 1404 + }, + { + "epoch": 0.06657190239279792, + "grad_norm": 0.51171875, + "learning_rate": 0.00019784971263394136, + "loss": 1.133, + "step": 1405 + }, + { + "epoch": 0.06661928452973229, + "grad_norm": 0.63671875, + "learning_rate": 0.00019784663979574254, + "loss": 0.9909, + "step": 1406 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.9453125, + "learning_rate": 0.00019784356478740945, + "loss": 0.7536, + "step": 1407 + }, + { + "epoch": 0.06671404880360104, + "grad_norm": 0.15625, + "learning_rate": 0.00019784048760901031, + "loss": 0.0098, + "step": 1408 + }, + { + "epoch": 0.06676143094053542, + "grad_norm": 0.404296875, + "learning_rate": 0.0001978374082606134, + "loss": 1.0359, + "step": 1409 + }, + { + "epoch": 0.06680881307746979, + "grad_norm": 0.55078125, + "learning_rate": 0.00019783432674228696, + "loss": 0.9342, + "step": 1410 + }, + { + "epoch": 0.06685619521440417, + "grad_norm": 1.2421875, + "learning_rate": 0.0001978312430540994, + "loss": 1.0898, + "step": 1411 + }, + { + "epoch": 0.06690357735133855, + "grad_norm": 0.57421875, + "learning_rate": 0.000197828157196119, + "loss": 1.1065, + "step": 1412 + }, + { + "epoch": 0.06695095948827293, + "grad_norm": 1.0078125, + "learning_rate": 0.00019782506916841435, + "loss": 0.5028, + "step": 1413 + }, + { + "epoch": 0.06699834162520729, + "grad_norm": 0.578125, + "learning_rate": 0.00019782197897105384, + "loss": 1.3315, + "step": 1414 + }, + { + "epoch": 0.06704572376214167, + "grad_norm": 0.83984375, + "learning_rate": 0.00019781888660410602, + "loss": 0.0758, + "step": 1415 + }, + { + "epoch": 0.06709310589907605, + "grad_norm": 0.55078125, + "learning_rate": 0.0001978157920676395, + "loss": 1.1478, + "step": 1416 + }, + { + "epoch": 0.06714048803601043, + "grad_norm": 0.451171875, + "learning_rate": 0.00019781269536172288, + "loss": 0.1899, + "step": 1417 + }, + { + "epoch": 0.0671878701729448, + "grad_norm": 0.4609375, + "learning_rate": 0.00019780959648642489, + "loss": 1.3329, + "step": 1418 + }, + { + "epoch": 0.06723525230987917, + "grad_norm": 0.451171875, + "learning_rate": 0.00019780649544181423, + "loss": 1.0074, + "step": 1419 + }, + { + "epoch": 0.06728263444681355, + "grad_norm": 0.609375, + "learning_rate": 0.00019780339222795964, + "loss": 1.6396, + "step": 1420 + }, + { + "epoch": 0.06733001658374793, + "grad_norm": 0.6953125, + "learning_rate": 0.00019780028684493, + "loss": 0.5259, + "step": 1421 + }, + { + "epoch": 0.0673773987206823, + "grad_norm": 0.431640625, + "learning_rate": 0.00019779717929279422, + "loss": 1.2355, + "step": 1422 + }, + { + "epoch": 0.06742478085761668, + "grad_norm": 0.54296875, + "learning_rate": 0.0001977940695716211, + "loss": 0.1711, + "step": 1423 + }, + { + "epoch": 0.06747216299455105, + "grad_norm": 0.89453125, + "learning_rate": 0.00019779095768147973, + "loss": 0.0939, + "step": 1424 + }, + { + "epoch": 0.06751954513148543, + "grad_norm": 0.33203125, + "learning_rate": 0.0001977878436224391, + "loss": 0.0777, + "step": 1425 + }, + { + "epoch": 0.0675669272684198, + "grad_norm": 0.7890625, + "learning_rate": 0.0001977847273945682, + "loss": 0.2649, + "step": 1426 + }, + { + "epoch": 0.06761430940535418, + "grad_norm": 0.6875, + "learning_rate": 0.00019778160899793624, + "loss": 1.2112, + "step": 1427 + }, + { + "epoch": 0.06766169154228856, + "grad_norm": 0.5859375, + "learning_rate": 0.00019777848843261232, + "loss": 1.1193, + "step": 1428 + }, + { + "epoch": 0.06770907367922294, + "grad_norm": 0.216796875, + "learning_rate": 0.0001977753656986657, + "loss": 0.0224, + "step": 1429 + }, + { + "epoch": 0.06775645581615732, + "grad_norm": 0.515625, + "learning_rate": 0.00019777224079616562, + "loss": 1.253, + "step": 1430 + }, + { + "epoch": 0.06780383795309168, + "grad_norm": 0.81640625, + "learning_rate": 0.00019776911372518135, + "loss": 0.5958, + "step": 1431 + }, + { + "epoch": 0.06785122009002606, + "grad_norm": 1.375, + "learning_rate": 0.00019776598448578229, + "loss": 1.0039, + "step": 1432 + }, + { + "epoch": 0.06789860222696044, + "grad_norm": 0.46875, + "learning_rate": 0.00019776285307803782, + "loss": 1.2828, + "step": 1433 + }, + { + "epoch": 0.06794598436389482, + "grad_norm": 0.494140625, + "learning_rate": 0.00019775971950201742, + "loss": 1.4507, + "step": 1434 + }, + { + "epoch": 0.06799336650082918, + "grad_norm": 0.443359375, + "learning_rate": 0.00019775658375779056, + "loss": 0.9144, + "step": 1435 + }, + { + "epoch": 0.06804074863776356, + "grad_norm": 0.490234375, + "learning_rate": 0.0001977534458454268, + "loss": 1.1339, + "step": 1436 + }, + { + "epoch": 0.06808813077469794, + "grad_norm": 0.64453125, + "learning_rate": 0.00019775030576499574, + "loss": 0.8736, + "step": 1437 + }, + { + "epoch": 0.06813551291163232, + "grad_norm": 0.040283203125, + "learning_rate": 0.000197747163516567, + "loss": 0.0034, + "step": 1438 + }, + { + "epoch": 0.06818289504856669, + "grad_norm": 0.47265625, + "learning_rate": 0.00019774401910021031, + "loss": 0.9527, + "step": 1439 + }, + { + "epoch": 0.06823027718550106, + "grad_norm": 0.36328125, + "learning_rate": 0.00019774087251599537, + "loss": 0.6436, + "step": 1440 + }, + { + "epoch": 0.06827765932243544, + "grad_norm": 0.310546875, + "learning_rate": 0.000197737723763992, + "loss": 0.2099, + "step": 1441 + }, + { + "epoch": 0.06832504145936982, + "grad_norm": 0.5078125, + "learning_rate": 0.00019773457284427, + "loss": 1.2293, + "step": 1442 + }, + { + "epoch": 0.06837242359630419, + "grad_norm": 0.5, + "learning_rate": 0.0001977314197568993, + "loss": 0.7126, + "step": 1443 + }, + { + "epoch": 0.06841980573323857, + "grad_norm": 0.515625, + "learning_rate": 0.00019772826450194982, + "loss": 0.0341, + "step": 1444 + }, + { + "epoch": 0.06846718787017295, + "grad_norm": 0.45703125, + "learning_rate": 0.00019772510707949154, + "loss": 1.6576, + "step": 1445 + }, + { + "epoch": 0.06851457000710733, + "grad_norm": 0.58984375, + "learning_rate": 0.00019772194748959442, + "loss": 1.482, + "step": 1446 + }, + { + "epoch": 0.06856195214404169, + "grad_norm": 0.478515625, + "learning_rate": 0.0001977187857323286, + "loss": 0.8749, + "step": 1447 + }, + { + "epoch": 0.06860933428097607, + "grad_norm": 0.625, + "learning_rate": 0.00019771562180776424, + "loss": 0.2753, + "step": 1448 + }, + { + "epoch": 0.06865671641791045, + "grad_norm": 0.5625, + "learning_rate": 0.00019771245571597142, + "loss": 0.1535, + "step": 1449 + }, + { + "epoch": 0.06870409855484483, + "grad_norm": 0.828125, + "learning_rate": 0.00019770928745702047, + "loss": 0.5296, + "step": 1450 + }, + { + "epoch": 0.0687514806917792, + "grad_norm": 0.30078125, + "learning_rate": 0.00019770611703098155, + "loss": 0.1664, + "step": 1451 + }, + { + "epoch": 0.06879886282871357, + "grad_norm": 0.51953125, + "learning_rate": 0.00019770294443792507, + "loss": 1.132, + "step": 1452 + }, + { + "epoch": 0.06884624496564795, + "grad_norm": 0.09033203125, + "learning_rate": 0.00019769976967792132, + "loss": 0.0082, + "step": 1453 + }, + { + "epoch": 0.06889362710258233, + "grad_norm": 0.498046875, + "learning_rate": 0.00019769659275104074, + "loss": 1.4106, + "step": 1454 + }, + { + "epoch": 0.0689410092395167, + "grad_norm": 1.0234375, + "learning_rate": 0.00019769341365735382, + "loss": 0.3698, + "step": 1455 + }, + { + "epoch": 0.06898839137645107, + "grad_norm": 0.421875, + "learning_rate": 0.00019769023239693103, + "loss": 0.9214, + "step": 1456 + }, + { + "epoch": 0.06903577351338545, + "grad_norm": 0.44921875, + "learning_rate": 0.00019768704896984293, + "loss": 0.7105, + "step": 1457 + }, + { + "epoch": 0.06908315565031983, + "grad_norm": 0.63671875, + "learning_rate": 0.00019768386337616013, + "loss": 1.3286, + "step": 1458 + }, + { + "epoch": 0.06913053778725421, + "grad_norm": 0.451171875, + "learning_rate": 0.0001976806756159533, + "loss": 1.117, + "step": 1459 + }, + { + "epoch": 0.06917791992418858, + "grad_norm": 0.68359375, + "learning_rate": 0.00019767748568929317, + "loss": 0.8185, + "step": 1460 + }, + { + "epoch": 0.06922530206112296, + "grad_norm": 0.494140625, + "learning_rate": 0.00019767429359625037, + "loss": 1.0807, + "step": 1461 + }, + { + "epoch": 0.06927268419805734, + "grad_norm": 0.60546875, + "learning_rate": 0.00019767109933689584, + "loss": 0.8929, + "step": 1462 + }, + { + "epoch": 0.06932006633499171, + "grad_norm": 0.482421875, + "learning_rate": 0.00019766790291130036, + "loss": 0.708, + "step": 1463 + }, + { + "epoch": 0.06936744847192608, + "grad_norm": 0.04248046875, + "learning_rate": 0.00019766470431953478, + "loss": 0.0032, + "step": 1464 + }, + { + "epoch": 0.06941483060886046, + "grad_norm": 0.388671875, + "learning_rate": 0.0001976615035616701, + "loss": 0.58, + "step": 1465 + }, + { + "epoch": 0.06946221274579484, + "grad_norm": 0.326171875, + "learning_rate": 0.00019765830063777734, + "loss": 0.0398, + "step": 1466 + }, + { + "epoch": 0.06950959488272922, + "grad_norm": 0.50390625, + "learning_rate": 0.00019765509554792746, + "loss": 1.1605, + "step": 1467 + }, + { + "epoch": 0.06955697701966358, + "grad_norm": 0.5859375, + "learning_rate": 0.00019765188829219156, + "loss": 1.0195, + "step": 1468 + }, + { + "epoch": 0.06960435915659796, + "grad_norm": 0.54296875, + "learning_rate": 0.0001976486788706408, + "loss": 1.1521, + "step": 1469 + }, + { + "epoch": 0.06965174129353234, + "grad_norm": 0.482421875, + "learning_rate": 0.00019764546728334636, + "loss": 0.8, + "step": 1470 + }, + { + "epoch": 0.06969912343046672, + "grad_norm": 0.61328125, + "learning_rate": 0.00019764225353037946, + "loss": 1.2107, + "step": 1471 + }, + { + "epoch": 0.06974650556740108, + "grad_norm": 0.59765625, + "learning_rate": 0.0001976390376118114, + "loss": 1.4391, + "step": 1472 + }, + { + "epoch": 0.06979388770433546, + "grad_norm": 1.234375, + "learning_rate": 0.00019763581952771347, + "loss": 0.6741, + "step": 1473 + }, + { + "epoch": 0.06984126984126984, + "grad_norm": 0.431640625, + "learning_rate": 0.00019763259927815704, + "loss": 0.7293, + "step": 1474 + }, + { + "epoch": 0.06988865197820422, + "grad_norm": 0.49609375, + "learning_rate": 0.0001976293768632136, + "loss": 0.9597, + "step": 1475 + }, + { + "epoch": 0.06993603411513859, + "grad_norm": 0.435546875, + "learning_rate": 0.0001976261522829545, + "loss": 0.9556, + "step": 1476 + }, + { + "epoch": 0.06998341625207297, + "grad_norm": 0.6015625, + "learning_rate": 0.00019762292553745142, + "loss": 1.0678, + "step": 1477 + }, + { + "epoch": 0.07003079838900735, + "grad_norm": 0.5234375, + "learning_rate": 0.00019761969662677578, + "loss": 1.1374, + "step": 1478 + }, + { + "epoch": 0.07007818052594172, + "grad_norm": 0.55859375, + "learning_rate": 0.00019761646555099924, + "loss": 1.0522, + "step": 1479 + }, + { + "epoch": 0.07012556266287609, + "grad_norm": 0.486328125, + "learning_rate": 0.00019761323231019348, + "loss": 1.129, + "step": 1480 + }, + { + "epoch": 0.07017294479981047, + "grad_norm": 0.330078125, + "learning_rate": 0.00019760999690443022, + "loss": 0.1712, + "step": 1481 + }, + { + "epoch": 0.07022032693674485, + "grad_norm": 0.46484375, + "learning_rate": 0.0001976067593337812, + "loss": 1.0533, + "step": 1482 + }, + { + "epoch": 0.07026770907367923, + "grad_norm": 0.455078125, + "learning_rate": 0.00019760351959831817, + "loss": 1.1679, + "step": 1483 + }, + { + "epoch": 0.07031509121061359, + "grad_norm": 0.5703125, + "learning_rate": 0.0001976002776981131, + "loss": 0.9336, + "step": 1484 + }, + { + "epoch": 0.07036247334754797, + "grad_norm": 0.369140625, + "learning_rate": 0.0001975970336332378, + "loss": 0.0366, + "step": 1485 + }, + { + "epoch": 0.07040985548448235, + "grad_norm": 0.5625, + "learning_rate": 0.00019759378740376426, + "loss": 0.9989, + "step": 1486 + }, + { + "epoch": 0.07045723762141673, + "grad_norm": 0.91796875, + "learning_rate": 0.00019759053900976446, + "loss": 0.8363, + "step": 1487 + }, + { + "epoch": 0.0705046197583511, + "grad_norm": 1.2109375, + "learning_rate": 0.00019758728845131044, + "loss": 0.4774, + "step": 1488 + }, + { + "epoch": 0.07055200189528547, + "grad_norm": 1.171875, + "learning_rate": 0.00019758403572847435, + "loss": 0.1674, + "step": 1489 + }, + { + "epoch": 0.07059938403221985, + "grad_norm": 0.609375, + "learning_rate": 0.00019758078084132827, + "loss": 1.1625, + "step": 1490 + }, + { + "epoch": 0.07064676616915423, + "grad_norm": 0.47265625, + "learning_rate": 0.00019757752378994438, + "loss": 1.334, + "step": 1491 + }, + { + "epoch": 0.07069414830608861, + "grad_norm": 0.53515625, + "learning_rate": 0.00019757426457439497, + "loss": 1.1302, + "step": 1492 + }, + { + "epoch": 0.07074153044302298, + "grad_norm": 0.91796875, + "learning_rate": 0.00019757100319475233, + "loss": 0.3671, + "step": 1493 + }, + { + "epoch": 0.07078891257995736, + "grad_norm": 1.7734375, + "learning_rate": 0.00019756773965108875, + "loss": 0.384, + "step": 1494 + }, + { + "epoch": 0.07083629471689173, + "grad_norm": 0.484375, + "learning_rate": 0.0001975644739434766, + "loss": 0.8549, + "step": 1495 + }, + { + "epoch": 0.07088367685382611, + "grad_norm": 0.455078125, + "learning_rate": 0.00019756120607198835, + "loss": 0.8868, + "step": 1496 + }, + { + "epoch": 0.07093105899076048, + "grad_norm": 0.48828125, + "learning_rate": 0.0001975579360366965, + "loss": 1.2682, + "step": 1497 + }, + { + "epoch": 0.07097844112769486, + "grad_norm": 0.5546875, + "learning_rate": 0.0001975546638376735, + "loss": 0.968, + "step": 1498 + }, + { + "epoch": 0.07102582326462924, + "grad_norm": 0.58203125, + "learning_rate": 0.000197551389474992, + "loss": 1.1436, + "step": 1499 + }, + { + "epoch": 0.07107320540156362, + "grad_norm": 0.498046875, + "learning_rate": 0.00019754811294872456, + "loss": 0.1859, + "step": 1500 + }, + { + "epoch": 0.07112058753849798, + "grad_norm": 0.462890625, + "learning_rate": 0.0001975448342589439, + "loss": 0.9387, + "step": 1501 + }, + { + "epoch": 0.07116796967543236, + "grad_norm": 0.458984375, + "learning_rate": 0.00019754155340572272, + "loss": 0.7409, + "step": 1502 + }, + { + "epoch": 0.07121535181236674, + "grad_norm": 0.478515625, + "learning_rate": 0.00019753827038913375, + "loss": 1.5493, + "step": 1503 + }, + { + "epoch": 0.07126273394930112, + "grad_norm": 0.375, + "learning_rate": 0.00019753498520924987, + "loss": 0.1718, + "step": 1504 + }, + { + "epoch": 0.07131011608623548, + "grad_norm": 0.64453125, + "learning_rate": 0.0001975316978661439, + "loss": 1.1719, + "step": 1505 + }, + { + "epoch": 0.07135749822316986, + "grad_norm": 0.6953125, + "learning_rate": 0.00019752840835988872, + "loss": 1.3937, + "step": 1506 + }, + { + "epoch": 0.07140488036010424, + "grad_norm": 0.53515625, + "learning_rate": 0.00019752511669055738, + "loss": 1.2344, + "step": 1507 + }, + { + "epoch": 0.07145226249703862, + "grad_norm": 0.474609375, + "learning_rate": 0.0001975218228582228, + "loss": 0.1861, + "step": 1508 + }, + { + "epoch": 0.07149964463397299, + "grad_norm": 0.63671875, + "learning_rate": 0.00019751852686295806, + "loss": 1.4951, + "step": 1509 + }, + { + "epoch": 0.07154702677090737, + "grad_norm": 0.384765625, + "learning_rate": 0.0001975152287048363, + "loss": 0.6003, + "step": 1510 + }, + { + "epoch": 0.07159440890784174, + "grad_norm": 0.54296875, + "learning_rate": 0.00019751192838393062, + "loss": 1.3992, + "step": 1511 + }, + { + "epoch": 0.07164179104477612, + "grad_norm": 0.80859375, + "learning_rate": 0.00019750862590031424, + "loss": 0.9387, + "step": 1512 + }, + { + "epoch": 0.07168917318171049, + "grad_norm": 0.462890625, + "learning_rate": 0.00019750532125406035, + "loss": 1.4259, + "step": 1513 + }, + { + "epoch": 0.07173655531864487, + "grad_norm": 0.49609375, + "learning_rate": 0.00019750201444524235, + "loss": 1.1457, + "step": 1514 + }, + { + "epoch": 0.07178393745557925, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001974987054739335, + "loss": 0.0032, + "step": 1515 + }, + { + "epoch": 0.07183131959251363, + "grad_norm": 0.470703125, + "learning_rate": 0.0001974953943402072, + "loss": 0.745, + "step": 1516 + }, + { + "epoch": 0.07187870172944799, + "grad_norm": 0.515625, + "learning_rate": 0.00019749208104413694, + "loss": 1.2319, + "step": 1517 + }, + { + "epoch": 0.07192608386638237, + "grad_norm": 0.6796875, + "learning_rate": 0.00019748876558579612, + "loss": 1.3171, + "step": 1518 + }, + { + "epoch": 0.07197346600331675, + "grad_norm": 0.490234375, + "learning_rate": 0.00019748544796525835, + "loss": 0.0755, + "step": 1519 + }, + { + "epoch": 0.07202084814025113, + "grad_norm": 0.6171875, + "learning_rate": 0.0001974821281825972, + "loss": 1.3535, + "step": 1520 + }, + { + "epoch": 0.07206823027718551, + "grad_norm": 0.435546875, + "learning_rate": 0.00019747880623788625, + "loss": 0.876, + "step": 1521 + }, + { + "epoch": 0.07211561241411987, + "grad_norm": 0.6484375, + "learning_rate": 0.00019747548213119918, + "loss": 0.7843, + "step": 1522 + }, + { + "epoch": 0.07216299455105425, + "grad_norm": 0.52734375, + "learning_rate": 0.00019747215586260982, + "loss": 1.4776, + "step": 1523 + }, + { + "epoch": 0.07221037668798863, + "grad_norm": 0.4765625, + "learning_rate": 0.0001974688274321918, + "loss": 0.6193, + "step": 1524 + }, + { + "epoch": 0.07225775882492301, + "grad_norm": 0.5625, + "learning_rate": 0.00019746549684001902, + "loss": 0.9984, + "step": 1525 + }, + { + "epoch": 0.07230514096185738, + "grad_norm": 0.494140625, + "learning_rate": 0.00019746216408616536, + "loss": 1.0317, + "step": 1526 + }, + { + "epoch": 0.07235252309879175, + "grad_norm": 0.5546875, + "learning_rate": 0.0001974588291707047, + "loss": 0.0613, + "step": 1527 + }, + { + "epoch": 0.07239990523572613, + "grad_norm": 0.408203125, + "learning_rate": 0.000197455492093711, + "loss": 0.7384, + "step": 1528 + }, + { + "epoch": 0.07244728737266051, + "grad_norm": 0.431640625, + "learning_rate": 0.0001974521528552583, + "loss": 0.0511, + "step": 1529 + }, + { + "epoch": 0.07249466950959488, + "grad_norm": 0.48046875, + "learning_rate": 0.00019744881145542068, + "loss": 0.969, + "step": 1530 + }, + { + "epoch": 0.07254205164652926, + "grad_norm": 1.09375, + "learning_rate": 0.0001974454678942722, + "loss": 0.6165, + "step": 1531 + }, + { + "epoch": 0.07258943378346364, + "grad_norm": 0.42578125, + "learning_rate": 0.000197442122171887, + "loss": 0.8376, + "step": 1532 + }, + { + "epoch": 0.07263681592039802, + "grad_norm": 0.73046875, + "learning_rate": 0.00019743877428833934, + "loss": 0.1542, + "step": 1533 + }, + { + "epoch": 0.07268419805733238, + "grad_norm": 0.63671875, + "learning_rate": 0.00019743542424370346, + "loss": 0.0372, + "step": 1534 + }, + { + "epoch": 0.07273158019426676, + "grad_norm": 0.4765625, + "learning_rate": 0.00019743207203805368, + "loss": 1.7193, + "step": 1535 + }, + { + "epoch": 0.07277896233120114, + "grad_norm": 0.80078125, + "learning_rate": 0.00019742871767146428, + "loss": 1.0667, + "step": 1536 + }, + { + "epoch": 0.07282634446813552, + "grad_norm": 0.50390625, + "learning_rate": 0.00019742536114400973, + "loss": 0.1448, + "step": 1537 + }, + { + "epoch": 0.07287372660506988, + "grad_norm": 0.60546875, + "learning_rate": 0.00019742200245576443, + "loss": 1.2105, + "step": 1538 + }, + { + "epoch": 0.07292110874200426, + "grad_norm": 0.54296875, + "learning_rate": 0.0001974186416068029, + "loss": 1.3034, + "step": 1539 + }, + { + "epoch": 0.07296849087893864, + "grad_norm": 0.58203125, + "learning_rate": 0.00019741527859719966, + "loss": 1.3104, + "step": 1540 + }, + { + "epoch": 0.07301587301587302, + "grad_norm": 0.484375, + "learning_rate": 0.00019741191342702928, + "loss": 0.1806, + "step": 1541 + }, + { + "epoch": 0.07306325515280739, + "grad_norm": 0.57421875, + "learning_rate": 0.00019740854609636644, + "loss": 1.0215, + "step": 1542 + }, + { + "epoch": 0.07311063728974176, + "grad_norm": 0.47265625, + "learning_rate": 0.00019740517660528579, + "loss": 1.149, + "step": 1543 + }, + { + "epoch": 0.07315801942667614, + "grad_norm": 0.5625, + "learning_rate": 0.0001974018049538621, + "loss": 0.8894, + "step": 1544 + }, + { + "epoch": 0.07320540156361052, + "grad_norm": 0.59375, + "learning_rate": 0.00019739843114217006, + "loss": 1.1542, + "step": 1545 + }, + { + "epoch": 0.07325278370054489, + "grad_norm": 0.498046875, + "learning_rate": 0.00019739505517028463, + "loss": 1.0084, + "step": 1546 + }, + { + "epoch": 0.07330016583747927, + "grad_norm": 0.443359375, + "learning_rate": 0.00019739167703828058, + "loss": 0.6863, + "step": 1547 + }, + { + "epoch": 0.07334754797441365, + "grad_norm": 0.53515625, + "learning_rate": 0.0001973882967462329, + "loss": 0.8286, + "step": 1548 + }, + { + "epoch": 0.07339493011134803, + "grad_norm": 0.48828125, + "learning_rate": 0.0001973849142942165, + "loss": 0.8003, + "step": 1549 + }, + { + "epoch": 0.0734423122482824, + "grad_norm": 0.55859375, + "learning_rate": 0.00019738152968230645, + "loss": 0.8031, + "step": 1550 + }, + { + "epoch": 0.07348969438521677, + "grad_norm": 1.578125, + "learning_rate": 0.0001973781429105778, + "loss": 0.6852, + "step": 1551 + }, + { + "epoch": 0.07353707652215115, + "grad_norm": 0.369140625, + "learning_rate": 0.00019737475397910563, + "loss": 0.027, + "step": 1552 + }, + { + "epoch": 0.07358445865908553, + "grad_norm": 0.53515625, + "learning_rate": 0.00019737136288796515, + "loss": 0.1264, + "step": 1553 + }, + { + "epoch": 0.0736318407960199, + "grad_norm": 0.48046875, + "learning_rate": 0.0001973679696372316, + "loss": 0.6384, + "step": 1554 + }, + { + "epoch": 0.07367922293295427, + "grad_norm": 0.55078125, + "learning_rate": 0.00019736457422698015, + "loss": 0.8887, + "step": 1555 + }, + { + "epoch": 0.07372660506988865, + "grad_norm": 0.494140625, + "learning_rate": 0.00019736117665728617, + "loss": 0.9889, + "step": 1556 + }, + { + "epoch": 0.07377398720682303, + "grad_norm": 0.5078125, + "learning_rate": 0.000197357776928225, + "loss": 1.2968, + "step": 1557 + }, + { + "epoch": 0.07382136934375741, + "grad_norm": 0.4921875, + "learning_rate": 0.00019735437503987202, + "loss": 0.8644, + "step": 1558 + }, + { + "epoch": 0.07386875148069177, + "grad_norm": 0.5078125, + "learning_rate": 0.0001973509709923027, + "loss": 1.1251, + "step": 1559 + }, + { + "epoch": 0.07391613361762615, + "grad_norm": 0.439453125, + "learning_rate": 0.00019734756478559254, + "loss": 0.2686, + "step": 1560 + }, + { + "epoch": 0.07396351575456053, + "grad_norm": 0.44921875, + "learning_rate": 0.0001973441564198171, + "loss": 0.7783, + "step": 1561 + }, + { + "epoch": 0.07401089789149491, + "grad_norm": 0.44140625, + "learning_rate": 0.00019734074589505195, + "loss": 1.1619, + "step": 1562 + }, + { + "epoch": 0.07405828002842928, + "grad_norm": 0.515625, + "learning_rate": 0.00019733733321137273, + "loss": 0.8184, + "step": 1563 + }, + { + "epoch": 0.07410566216536366, + "grad_norm": 0.546875, + "learning_rate": 0.00019733391836885514, + "loss": 1.2703, + "step": 1564 + }, + { + "epoch": 0.07415304430229804, + "grad_norm": 0.83203125, + "learning_rate": 0.0001973305013675749, + "loss": 0.1781, + "step": 1565 + }, + { + "epoch": 0.07420042643923241, + "grad_norm": 0.8359375, + "learning_rate": 0.00019732708220760782, + "loss": 1.2109, + "step": 1566 + }, + { + "epoch": 0.07424780857616678, + "grad_norm": 0.498046875, + "learning_rate": 0.00019732366088902976, + "loss": 0.7686, + "step": 1567 + }, + { + "epoch": 0.07429519071310116, + "grad_norm": 0.90625, + "learning_rate": 0.00019732023741191653, + "loss": 0.5418, + "step": 1568 + }, + { + "epoch": 0.07434257285003554, + "grad_norm": 0.43359375, + "learning_rate": 0.00019731681177634412, + "loss": 0.6996, + "step": 1569 + }, + { + "epoch": 0.07438995498696992, + "grad_norm": 0.515625, + "learning_rate": 0.00019731338398238846, + "loss": 1.1675, + "step": 1570 + }, + { + "epoch": 0.07443733712390428, + "grad_norm": 0.396484375, + "learning_rate": 0.0001973099540301256, + "loss": 0.891, + "step": 1571 + }, + { + "epoch": 0.07448471926083866, + "grad_norm": 0.66796875, + "learning_rate": 0.0001973065219196316, + "loss": 1.2264, + "step": 1572 + }, + { + "epoch": 0.07453210139777304, + "grad_norm": 0.890625, + "learning_rate": 0.00019730308765098263, + "loss": 1.1571, + "step": 1573 + }, + { + "epoch": 0.07457948353470742, + "grad_norm": 0.376953125, + "learning_rate": 0.0001972996512242548, + "loss": 0.6126, + "step": 1574 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.4765625, + "learning_rate": 0.0001972962126395243, + "loss": 0.6741, + "step": 1575 + }, + { + "epoch": 0.07467424780857616, + "grad_norm": 0.875, + "learning_rate": 0.00019729277189686748, + "loss": 0.4545, + "step": 1576 + }, + { + "epoch": 0.07472162994551054, + "grad_norm": 0.404296875, + "learning_rate": 0.0001972893289963606, + "loss": 0.7829, + "step": 1577 + }, + { + "epoch": 0.07476901208244492, + "grad_norm": 0.56640625, + "learning_rate": 0.00019728588393808005, + "loss": 1.5995, + "step": 1578 + }, + { + "epoch": 0.0748163942193793, + "grad_norm": 0.423828125, + "learning_rate": 0.00019728243672210223, + "loss": 0.4342, + "step": 1579 + }, + { + "epoch": 0.07486377635631367, + "grad_norm": 0.478515625, + "learning_rate": 0.00019727898734850355, + "loss": 0.7056, + "step": 1580 + }, + { + "epoch": 0.07491115849324805, + "grad_norm": 0.46484375, + "learning_rate": 0.00019727553581736054, + "loss": 0.0762, + "step": 1581 + }, + { + "epoch": 0.07495854063018242, + "grad_norm": 0.7109375, + "learning_rate": 0.00019727208212874978, + "loss": 0.3199, + "step": 1582 + }, + { + "epoch": 0.0750059227671168, + "grad_norm": 0.5390625, + "learning_rate": 0.00019726862628274784, + "loss": 1.4885, + "step": 1583 + }, + { + "epoch": 0.07505330490405117, + "grad_norm": 0.022216796875, + "learning_rate": 0.0001972651682794314, + "loss": 0.0019, + "step": 1584 + }, + { + "epoch": 0.07510068704098555, + "grad_norm": 0.54296875, + "learning_rate": 0.0001972617081188771, + "loss": 0.8026, + "step": 1585 + }, + { + "epoch": 0.07514806917791993, + "grad_norm": 0.546875, + "learning_rate": 0.00019725824580116172, + "loss": 0.9925, + "step": 1586 + }, + { + "epoch": 0.0751954513148543, + "grad_norm": 0.5234375, + "learning_rate": 0.00019725478132636207, + "loss": 1.1016, + "step": 1587 + }, + { + "epoch": 0.07524283345178867, + "grad_norm": 0.5625, + "learning_rate": 0.0001972513146945549, + "loss": 0.6538, + "step": 1588 + }, + { + "epoch": 0.07529021558872305, + "grad_norm": 0.65234375, + "learning_rate": 0.0001972478459058172, + "loss": 1.4446, + "step": 1589 + }, + { + "epoch": 0.07533759772565743, + "grad_norm": 1.2578125, + "learning_rate": 0.00019724437496022588, + "loss": 0.8096, + "step": 1590 + }, + { + "epoch": 0.07538497986259181, + "grad_norm": 0.53515625, + "learning_rate": 0.00019724090185785787, + "loss": 0.0483, + "step": 1591 + }, + { + "epoch": 0.07543236199952617, + "grad_norm": 0.5390625, + "learning_rate": 0.00019723742659879024, + "loss": 1.3905, + "step": 1592 + }, + { + "epoch": 0.07547974413646055, + "grad_norm": 0.5703125, + "learning_rate": 0.00019723394918310007, + "loss": 0.8611, + "step": 1593 + }, + { + "epoch": 0.07552712627339493, + "grad_norm": 0.4921875, + "learning_rate": 0.00019723046961086444, + "loss": 1.3968, + "step": 1594 + }, + { + "epoch": 0.07557450841032931, + "grad_norm": 0.494140625, + "learning_rate": 0.0001972269878821606, + "loss": 1.1161, + "step": 1595 + }, + { + "epoch": 0.07562189054726368, + "grad_norm": 0.56640625, + "learning_rate": 0.0001972235039970657, + "loss": 1.2614, + "step": 1596 + }, + { + "epoch": 0.07566927268419806, + "grad_norm": 0.5546875, + "learning_rate": 0.00019722001795565705, + "loss": 1.1092, + "step": 1597 + }, + { + "epoch": 0.07571665482113243, + "grad_norm": 0.498046875, + "learning_rate": 0.00019721652975801196, + "loss": 1.2873, + "step": 1598 + }, + { + "epoch": 0.07576403695806681, + "grad_norm": 0.466796875, + "learning_rate": 0.0001972130394042078, + "loss": 1.0667, + "step": 1599 + }, + { + "epoch": 0.07581141909500118, + "grad_norm": 0.498046875, + "learning_rate": 0.00019720954689432199, + "loss": 0.9517, + "step": 1600 + }, + { + "epoch": 0.07585880123193556, + "grad_norm": 0.5078125, + "learning_rate": 0.00019720605222843193, + "loss": 1.1561, + "step": 1601 + }, + { + "epoch": 0.07590618336886994, + "grad_norm": 0.4921875, + "learning_rate": 0.0001972025554066152, + "loss": 1.052, + "step": 1602 + }, + { + "epoch": 0.07595356550580432, + "grad_norm": 0.73828125, + "learning_rate": 0.0001971990564289493, + "loss": 0.2228, + "step": 1603 + }, + { + "epoch": 0.07600094764273868, + "grad_norm": 0.58984375, + "learning_rate": 0.0001971955552955119, + "loss": 1.0376, + "step": 1604 + }, + { + "epoch": 0.07604832977967306, + "grad_norm": 0.2734375, + "learning_rate": 0.00019719205200638057, + "loss": 0.0396, + "step": 1605 + }, + { + "epoch": 0.07609571191660744, + "grad_norm": 0.546875, + "learning_rate": 0.00019718854656163308, + "loss": 0.8643, + "step": 1606 + }, + { + "epoch": 0.07614309405354182, + "grad_norm": 0.431640625, + "learning_rate": 0.00019718503896134712, + "loss": 1.0733, + "step": 1607 + }, + { + "epoch": 0.0761904761904762, + "grad_norm": 0.482421875, + "learning_rate": 0.00019718152920560056, + "loss": 1.1269, + "step": 1608 + }, + { + "epoch": 0.07623785832741056, + "grad_norm": 0.83984375, + "learning_rate": 0.00019717801729447114, + "loss": 0.4496, + "step": 1609 + }, + { + "epoch": 0.07628524046434494, + "grad_norm": 0.458984375, + "learning_rate": 0.00019717450322803684, + "loss": 1.1536, + "step": 1610 + }, + { + "epoch": 0.07633262260127932, + "grad_norm": 0.8125, + "learning_rate": 0.00019717098700637554, + "loss": 0.6839, + "step": 1611 + }, + { + "epoch": 0.0763800047382137, + "grad_norm": 0.515625, + "learning_rate": 0.00019716746862956527, + "loss": 1.3748, + "step": 1612 + }, + { + "epoch": 0.07642738687514807, + "grad_norm": 0.494140625, + "learning_rate": 0.00019716394809768403, + "loss": 1.0119, + "step": 1613 + }, + { + "epoch": 0.07647476901208244, + "grad_norm": 0.55859375, + "learning_rate": 0.00019716042541080992, + "loss": 1.0805, + "step": 1614 + }, + { + "epoch": 0.07652215114901682, + "grad_norm": 0.6171875, + "learning_rate": 0.00019715690056902108, + "loss": 1.3178, + "step": 1615 + }, + { + "epoch": 0.0765695332859512, + "grad_norm": 0.57421875, + "learning_rate": 0.00019715337357239566, + "loss": 1.1677, + "step": 1616 + }, + { + "epoch": 0.07661691542288557, + "grad_norm": 0.55078125, + "learning_rate": 0.0001971498444210119, + "loss": 0.7856, + "step": 1617 + }, + { + "epoch": 0.07666429755981995, + "grad_norm": 0.396484375, + "learning_rate": 0.00019714631311494807, + "loss": 0.8945, + "step": 1618 + }, + { + "epoch": 0.07671167969675433, + "grad_norm": 0.59765625, + "learning_rate": 0.0001971427796542825, + "loss": 1.3548, + "step": 1619 + }, + { + "epoch": 0.0767590618336887, + "grad_norm": 0.48828125, + "learning_rate": 0.00019713924403909352, + "loss": 1.1525, + "step": 1620 + }, + { + "epoch": 0.07680644397062307, + "grad_norm": 0.5078125, + "learning_rate": 0.0001971357062694596, + "loss": 0.0897, + "step": 1621 + }, + { + "epoch": 0.07685382610755745, + "grad_norm": 0.474609375, + "learning_rate": 0.0001971321663454592, + "loss": 0.826, + "step": 1622 + }, + { + "epoch": 0.07690120824449183, + "grad_norm": 0.48828125, + "learning_rate": 0.00019712862426717075, + "loss": 0.8226, + "step": 1623 + }, + { + "epoch": 0.07694859038142621, + "grad_norm": 0.33203125, + "learning_rate": 0.0001971250800346729, + "loss": 0.8715, + "step": 1624 + }, + { + "epoch": 0.07699597251836057, + "grad_norm": 0.375, + "learning_rate": 0.00019712153364804424, + "loss": 0.9103, + "step": 1625 + }, + { + "epoch": 0.07704335465529495, + "grad_norm": 0.416015625, + "learning_rate": 0.0001971179851073634, + "loss": 1.124, + "step": 1626 + }, + { + "epoch": 0.07709073679222933, + "grad_norm": 0.57421875, + "learning_rate": 0.00019711443441270912, + "loss": 0.8573, + "step": 1627 + }, + { + "epoch": 0.07713811892916371, + "grad_norm": 0.3984375, + "learning_rate": 0.00019711088156416012, + "loss": 0.524, + "step": 1628 + }, + { + "epoch": 0.07718550106609808, + "grad_norm": 0.54296875, + "learning_rate": 0.00019710732656179518, + "loss": 1.0545, + "step": 1629 + }, + { + "epoch": 0.07723288320303245, + "grad_norm": 0.56640625, + "learning_rate": 0.00019710376940569317, + "loss": 0.681, + "step": 1630 + }, + { + "epoch": 0.07728026533996683, + "grad_norm": 0.62109375, + "learning_rate": 0.000197100210095933, + "loss": 1.4263, + "step": 1631 + }, + { + "epoch": 0.07732764747690121, + "grad_norm": 0.392578125, + "learning_rate": 0.00019709664863259358, + "loss": 0.2156, + "step": 1632 + }, + { + "epoch": 0.07737502961383558, + "grad_norm": 0.78125, + "learning_rate": 0.00019709308501575398, + "loss": 1.1149, + "step": 1633 + }, + { + "epoch": 0.07742241175076996, + "grad_norm": 0.515625, + "learning_rate": 0.00019708951924549307, + "loss": 1.2251, + "step": 1634 + }, + { + "epoch": 0.07746979388770434, + "grad_norm": 0.55078125, + "learning_rate": 0.0001970859513218901, + "loss": 0.7541, + "step": 1635 + }, + { + "epoch": 0.07751717602463871, + "grad_norm": 0.373046875, + "learning_rate": 0.00019708238124502417, + "loss": 1.5339, + "step": 1636 + }, + { + "epoch": 0.07756455816157308, + "grad_norm": 0.64453125, + "learning_rate": 0.00019707880901497437, + "loss": 1.0511, + "step": 1637 + }, + { + "epoch": 0.07761194029850746, + "grad_norm": 0.240234375, + "learning_rate": 0.00019707523463182, + "loss": 0.0204, + "step": 1638 + }, + { + "epoch": 0.07765932243544184, + "grad_norm": 0.55078125, + "learning_rate": 0.00019707165809564034, + "loss": 0.9405, + "step": 1639 + }, + { + "epoch": 0.07770670457237622, + "grad_norm": 0.439453125, + "learning_rate": 0.00019706807940651473, + "loss": 1.1919, + "step": 1640 + }, + { + "epoch": 0.0777540867093106, + "grad_norm": 0.482421875, + "learning_rate": 0.00019706449856452248, + "loss": 0.7183, + "step": 1641 + }, + { + "epoch": 0.07780146884624496, + "grad_norm": 0.67578125, + "learning_rate": 0.00019706091556974303, + "loss": 1.4933, + "step": 1642 + }, + { + "epoch": 0.07784885098317934, + "grad_norm": 0.4140625, + "learning_rate": 0.00019705733042225588, + "loss": 1.0473, + "step": 1643 + }, + { + "epoch": 0.07789623312011372, + "grad_norm": 0.01409912109375, + "learning_rate": 0.0001970537431221405, + "loss": 0.001, + "step": 1644 + }, + { + "epoch": 0.0779436152570481, + "grad_norm": 0.36328125, + "learning_rate": 0.00019705015366947647, + "loss": 0.9944, + "step": 1645 + }, + { + "epoch": 0.07799099739398246, + "grad_norm": 0.435546875, + "learning_rate": 0.00019704656206434343, + "loss": 0.9846, + "step": 1646 + }, + { + "epoch": 0.07803837953091684, + "grad_norm": 0.3671875, + "learning_rate": 0.00019704296830682098, + "loss": 0.9973, + "step": 1647 + }, + { + "epoch": 0.07808576166785122, + "grad_norm": 0.271484375, + "learning_rate": 0.0001970393723969889, + "loss": 0.0165, + "step": 1648 + }, + { + "epoch": 0.0781331438047856, + "grad_norm": 0.62890625, + "learning_rate": 0.00019703577433492688, + "loss": 1.2609, + "step": 1649 + }, + { + "epoch": 0.07818052594171997, + "grad_norm": 0.314453125, + "learning_rate": 0.00019703217412071476, + "loss": 0.2079, + "step": 1650 + }, + { + "epoch": 0.07822790807865435, + "grad_norm": 0.9453125, + "learning_rate": 0.00019702857175443234, + "loss": 0.2338, + "step": 1651 + }, + { + "epoch": 0.07827529021558872, + "grad_norm": 0.259765625, + "learning_rate": 0.00019702496723615956, + "loss": 0.1863, + "step": 1652 + }, + { + "epoch": 0.0783226723525231, + "grad_norm": 0.427734375, + "learning_rate": 0.00019702136056597635, + "loss": 0.7679, + "step": 1653 + }, + { + "epoch": 0.07837005448945747, + "grad_norm": 0.53125, + "learning_rate": 0.00019701775174396272, + "loss": 1.2124, + "step": 1654 + }, + { + "epoch": 0.07841743662639185, + "grad_norm": 0.232421875, + "learning_rate": 0.00019701414077019871, + "loss": 0.0344, + "step": 1655 + }, + { + "epoch": 0.07846481876332623, + "grad_norm": 1.5859375, + "learning_rate": 0.00019701052764476437, + "loss": 1.0308, + "step": 1656 + }, + { + "epoch": 0.0785122009002606, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019700691236773987, + "loss": 0.0132, + "step": 1657 + }, + { + "epoch": 0.07855958303719497, + "grad_norm": 0.453125, + "learning_rate": 0.00019700329493920534, + "loss": 1.3492, + "step": 1658 + }, + { + "epoch": 0.07860696517412935, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019699967535924108, + "loss": 0.1897, + "step": 1659 + }, + { + "epoch": 0.07865434731106373, + "grad_norm": 0.6171875, + "learning_rate": 0.00019699605362792736, + "loss": 1.41, + "step": 1660 + }, + { + "epoch": 0.07870172944799811, + "grad_norm": 0.5390625, + "learning_rate": 0.00019699242974534445, + "loss": 1.0693, + "step": 1661 + }, + { + "epoch": 0.07874911158493247, + "grad_norm": 0.6015625, + "learning_rate": 0.0001969888037115728, + "loss": 1.2154, + "step": 1662 + }, + { + "epoch": 0.07879649372186685, + "grad_norm": 0.455078125, + "learning_rate": 0.00019698517552669276, + "loss": 0.7537, + "step": 1663 + }, + { + "epoch": 0.07884387585880123, + "grad_norm": 0.4921875, + "learning_rate": 0.00019698154519078484, + "loss": 1.3872, + "step": 1664 + }, + { + "epoch": 0.07889125799573561, + "grad_norm": 0.515625, + "learning_rate": 0.00019697791270392956, + "loss": 1.2254, + "step": 1665 + }, + { + "epoch": 0.07893864013266998, + "grad_norm": 0.01055908203125, + "learning_rate": 0.00019697427806620744, + "loss": 0.0009, + "step": 1666 + }, + { + "epoch": 0.07898602226960436, + "grad_norm": 0.5859375, + "learning_rate": 0.00019697064127769916, + "loss": 0.8843, + "step": 1667 + }, + { + "epoch": 0.07903340440653873, + "grad_norm": 0.5234375, + "learning_rate": 0.00019696700233848532, + "loss": 0.954, + "step": 1668 + }, + { + "epoch": 0.07908078654347311, + "grad_norm": 0.515625, + "learning_rate": 0.00019696336124864667, + "loss": 0.8493, + "step": 1669 + }, + { + "epoch": 0.07912816868040749, + "grad_norm": 0.287109375, + "learning_rate": 0.00019695971800826395, + "loss": 0.1835, + "step": 1670 + }, + { + "epoch": 0.07917555081734186, + "grad_norm": 0.6171875, + "learning_rate": 0.00019695607261741794, + "loss": 0.2005, + "step": 1671 + }, + { + "epoch": 0.07922293295427624, + "grad_norm": 0.427734375, + "learning_rate": 0.00019695242507618952, + "loss": 0.7346, + "step": 1672 + }, + { + "epoch": 0.07927031509121062, + "grad_norm": 0.65234375, + "learning_rate": 0.0001969487753846596, + "loss": 0.7522, + "step": 1673 + }, + { + "epoch": 0.079317697228145, + "grad_norm": 0.46484375, + "learning_rate": 0.00019694512354290908, + "loss": 0.6101, + "step": 1674 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 0.58203125, + "learning_rate": 0.000196941469551019, + "loss": 0.7977, + "step": 1675 + }, + { + "epoch": 0.07941246150201374, + "grad_norm": 0.62890625, + "learning_rate": 0.00019693781340907038, + "loss": 0.2747, + "step": 1676 + }, + { + "epoch": 0.07945984363894812, + "grad_norm": 0.44921875, + "learning_rate": 0.00019693415511714432, + "loss": 0.9906, + "step": 1677 + }, + { + "epoch": 0.0795072257758825, + "grad_norm": 0.67578125, + "learning_rate": 0.00019693049467532192, + "loss": 1.2076, + "step": 1678 + }, + { + "epoch": 0.07955460791281686, + "grad_norm": 0.3828125, + "learning_rate": 0.00019692683208368444, + "loss": 0.1237, + "step": 1679 + }, + { + "epoch": 0.07960199004975124, + "grad_norm": 0.65234375, + "learning_rate": 0.00019692316734231302, + "loss": 0.645, + "step": 1680 + }, + { + "epoch": 0.07964937218668562, + "grad_norm": 1.03125, + "learning_rate": 0.00019691950045128901, + "loss": 1.1272, + "step": 1681 + }, + { + "epoch": 0.07969675432362, + "grad_norm": 0.474609375, + "learning_rate": 0.00019691583141069372, + "loss": 1.2546, + "step": 1682 + }, + { + "epoch": 0.07974413646055437, + "grad_norm": 0.890625, + "learning_rate": 0.0001969121602206085, + "loss": 0.1895, + "step": 1683 + }, + { + "epoch": 0.07979151859748874, + "grad_norm": 0.6015625, + "learning_rate": 0.0001969084868811148, + "loss": 0.8877, + "step": 1684 + }, + { + "epoch": 0.07983890073442312, + "grad_norm": 0.82421875, + "learning_rate": 0.0001969048113922941, + "loss": 0.7107, + "step": 1685 + }, + { + "epoch": 0.0798862828713575, + "grad_norm": 0.416015625, + "learning_rate": 0.00019690113375422789, + "loss": 0.0773, + "step": 1686 + }, + { + "epoch": 0.07993366500829187, + "grad_norm": 0.59765625, + "learning_rate": 0.0001968974539669977, + "loss": 1.6357, + "step": 1687 + }, + { + "epoch": 0.07998104714522625, + "grad_norm": 0.51171875, + "learning_rate": 0.00019689377203068524, + "loss": 1.1769, + "step": 1688 + }, + { + "epoch": 0.08002842928216063, + "grad_norm": 0.51171875, + "learning_rate": 0.00019689008794537212, + "loss": 0.2874, + "step": 1689 + }, + { + "epoch": 0.080075811419095, + "grad_norm": 0.5859375, + "learning_rate": 0.00019688640171114006, + "loss": 1.636, + "step": 1690 + }, + { + "epoch": 0.08012319355602937, + "grad_norm": 0.6015625, + "learning_rate": 0.0001968827133280708, + "loss": 0.9001, + "step": 1691 + }, + { + "epoch": 0.08017057569296375, + "grad_norm": 0.447265625, + "learning_rate": 0.00019687902279624616, + "loss": 0.9937, + "step": 1692 + }, + { + "epoch": 0.08021795782989813, + "grad_norm": 0.40234375, + "learning_rate": 0.00019687533011574797, + "loss": 0.0951, + "step": 1693 + }, + { + "epoch": 0.08026533996683251, + "grad_norm": 0.56640625, + "learning_rate": 0.00019687163528665815, + "loss": 1.5956, + "step": 1694 + }, + { + "epoch": 0.08031272210376687, + "grad_norm": 0.298828125, + "learning_rate": 0.00019686793830905864, + "loss": 0.0098, + "step": 1695 + }, + { + "epoch": 0.08036010424070125, + "grad_norm": 0.5625, + "learning_rate": 0.00019686423918303144, + "loss": 0.219, + "step": 1696 + }, + { + "epoch": 0.08040748637763563, + "grad_norm": 0.55078125, + "learning_rate": 0.0001968605379086586, + "loss": 0.236, + "step": 1697 + }, + { + "epoch": 0.08045486851457001, + "grad_norm": 0.48828125, + "learning_rate": 0.0001968568344860222, + "loss": 0.1505, + "step": 1698 + }, + { + "epoch": 0.08050225065150439, + "grad_norm": 0.5390625, + "learning_rate": 0.00019685312891520437, + "loss": 0.9413, + "step": 1699 + }, + { + "epoch": 0.08054963278843875, + "grad_norm": 0.49609375, + "learning_rate": 0.0001968494211962873, + "loss": 0.0553, + "step": 1700 + }, + { + "epoch": 0.08059701492537313, + "grad_norm": 0.58203125, + "learning_rate": 0.00019684571132935324, + "loss": 1.17, + "step": 1701 + }, + { + "epoch": 0.08064439706230751, + "grad_norm": 0.609375, + "learning_rate": 0.00019684199931448444, + "loss": 1.0493, + "step": 1702 + }, + { + "epoch": 0.08069177919924189, + "grad_norm": 0.5546875, + "learning_rate": 0.00019683828515176325, + "loss": 0.108, + "step": 1703 + }, + { + "epoch": 0.08073916133617626, + "grad_norm": 0.65234375, + "learning_rate": 0.00019683456884127205, + "loss": 0.8528, + "step": 1704 + }, + { + "epoch": 0.08078654347311064, + "grad_norm": 0.5625, + "learning_rate": 0.00019683085038309326, + "loss": 0.2052, + "step": 1705 + }, + { + "epoch": 0.08083392561004502, + "grad_norm": 0.515625, + "learning_rate": 0.00019682712977730935, + "loss": 0.7148, + "step": 1706 + }, + { + "epoch": 0.0808813077469794, + "grad_norm": 0.90625, + "learning_rate": 0.00019682340702400285, + "loss": 0.2189, + "step": 1707 + }, + { + "epoch": 0.08092868988391376, + "grad_norm": 0.671875, + "learning_rate": 0.00019681968212325628, + "loss": 0.773, + "step": 1708 + }, + { + "epoch": 0.08097607202084814, + "grad_norm": 0.71875, + "learning_rate": 0.0001968159550751523, + "loss": 0.2618, + "step": 1709 + }, + { + "epoch": 0.08102345415778252, + "grad_norm": 0.4296875, + "learning_rate": 0.00019681222587977356, + "loss": 0.6582, + "step": 1710 + }, + { + "epoch": 0.0810708362947169, + "grad_norm": 0.578125, + "learning_rate": 0.00019680849453720275, + "loss": 1.3603, + "step": 1711 + }, + { + "epoch": 0.08111821843165126, + "grad_norm": 0.66015625, + "learning_rate": 0.00019680476104752269, + "loss": 0.078, + "step": 1712 + }, + { + "epoch": 0.08116560056858564, + "grad_norm": 0.7421875, + "learning_rate": 0.00019680102541081614, + "loss": 1.651, + "step": 1713 + }, + { + "epoch": 0.08121298270552002, + "grad_norm": 0.5625, + "learning_rate": 0.00019679728762716592, + "loss": 0.8839, + "step": 1714 + }, + { + "epoch": 0.0812603648424544, + "grad_norm": 0.66015625, + "learning_rate": 0.00019679354769665497, + "loss": 0.1678, + "step": 1715 + }, + { + "epoch": 0.08130774697938876, + "grad_norm": 0.69140625, + "learning_rate": 0.00019678980561936625, + "loss": 0.1759, + "step": 1716 + }, + { + "epoch": 0.08135512911632314, + "grad_norm": 0.51171875, + "learning_rate": 0.00019678606139538274, + "loss": 1.0799, + "step": 1717 + }, + { + "epoch": 0.08140251125325752, + "grad_norm": 0.53515625, + "learning_rate": 0.00019678231502478745, + "loss": 0.816, + "step": 1718 + }, + { + "epoch": 0.0814498933901919, + "grad_norm": 0.44140625, + "learning_rate": 0.00019677856650766353, + "loss": 1.0411, + "step": 1719 + }, + { + "epoch": 0.08149727552712627, + "grad_norm": 0.2421875, + "learning_rate": 0.00019677481584409406, + "loss": 0.0126, + "step": 1720 + }, + { + "epoch": 0.08154465766406065, + "grad_norm": 0.9921875, + "learning_rate": 0.00019677106303416227, + "loss": 1.2617, + "step": 1721 + }, + { + "epoch": 0.08159203980099503, + "grad_norm": 0.5234375, + "learning_rate": 0.0001967673080779514, + "loss": 0.6694, + "step": 1722 + }, + { + "epoch": 0.0816394219379294, + "grad_norm": 0.53125, + "learning_rate": 0.00019676355097554468, + "loss": 1.0204, + "step": 1723 + }, + { + "epoch": 0.08168680407486377, + "grad_norm": 0.51171875, + "learning_rate": 0.0001967597917270255, + "loss": 1.0117, + "step": 1724 + }, + { + "epoch": 0.08173418621179815, + "grad_norm": 1.03125, + "learning_rate": 0.00019675603033247717, + "loss": 0.4565, + "step": 1725 + }, + { + "epoch": 0.08178156834873253, + "grad_norm": 0.5390625, + "learning_rate": 0.0001967522667919832, + "loss": 1.0873, + "step": 1726 + }, + { + "epoch": 0.08182895048566691, + "grad_norm": 0.224609375, + "learning_rate": 0.00019674850110562692, + "loss": 0.0202, + "step": 1727 + }, + { + "epoch": 0.08187633262260129, + "grad_norm": 0.2412109375, + "learning_rate": 0.000196744733273492, + "loss": 0.1749, + "step": 1728 + }, + { + "epoch": 0.08192371475953565, + "grad_norm": 0.484375, + "learning_rate": 0.00019674096329566192, + "loss": 0.2461, + "step": 1729 + }, + { + "epoch": 0.08197109689647003, + "grad_norm": 0.6796875, + "learning_rate": 0.0001967371911722203, + "loss": 0.9353, + "step": 1730 + }, + { + "epoch": 0.08201847903340441, + "grad_norm": 0.515625, + "learning_rate": 0.00019673341690325087, + "loss": 1.253, + "step": 1731 + }, + { + "epoch": 0.08206586117033879, + "grad_norm": 0.462890625, + "learning_rate": 0.00019672964048883727, + "loss": 0.072, + "step": 1732 + }, + { + "epoch": 0.08211324330727315, + "grad_norm": 0.32421875, + "learning_rate": 0.00019672586192906325, + "loss": 0.829, + "step": 1733 + }, + { + "epoch": 0.08216062544420753, + "grad_norm": 0.3984375, + "learning_rate": 0.00019672208122401268, + "loss": 0.2471, + "step": 1734 + }, + { + "epoch": 0.08220800758114191, + "grad_norm": 0.5546875, + "learning_rate": 0.00019671829837376935, + "loss": 0.9004, + "step": 1735 + }, + { + "epoch": 0.08225538971807629, + "grad_norm": 0.515625, + "learning_rate": 0.00019671451337841718, + "loss": 1.1285, + "step": 1736 + }, + { + "epoch": 0.08230277185501066, + "grad_norm": 0.3828125, + "learning_rate": 0.00019671072623804012, + "loss": 0.0457, + "step": 1737 + }, + { + "epoch": 0.08235015399194504, + "grad_norm": 0.64453125, + "learning_rate": 0.00019670693695272216, + "loss": 1.2687, + "step": 1738 + }, + { + "epoch": 0.08239753612887941, + "grad_norm": 0.69140625, + "learning_rate": 0.00019670314552254736, + "loss": 0.8366, + "step": 1739 + }, + { + "epoch": 0.0824449182658138, + "grad_norm": 0.5234375, + "learning_rate": 0.00019669935194759978, + "loss": 1.333, + "step": 1740 + }, + { + "epoch": 0.08249230040274816, + "grad_norm": 0.6015625, + "learning_rate": 0.00019669555622796358, + "loss": 1.0499, + "step": 1741 + }, + { + "epoch": 0.08253968253968254, + "grad_norm": 0.009033203125, + "learning_rate": 0.00019669175836372293, + "loss": 0.001, + "step": 1742 + }, + { + "epoch": 0.08258706467661692, + "grad_norm": 0.5546875, + "learning_rate": 0.0001966879583549621, + "loss": 0.8029, + "step": 1743 + }, + { + "epoch": 0.0826344468135513, + "grad_norm": 0.5078125, + "learning_rate": 0.0001966841562017653, + "loss": 0.8387, + "step": 1744 + }, + { + "epoch": 0.08268182895048566, + "grad_norm": 0.80859375, + "learning_rate": 0.00019668035190421694, + "loss": 0.2109, + "step": 1745 + }, + { + "epoch": 0.08272921108742004, + "grad_norm": 0.91796875, + "learning_rate": 0.00019667654546240133, + "loss": 0.2607, + "step": 1746 + }, + { + "epoch": 0.08277659322435442, + "grad_norm": 0.54296875, + "learning_rate": 0.00019667273687640292, + "loss": 1.5496, + "step": 1747 + }, + { + "epoch": 0.0828239753612888, + "grad_norm": 0.53125, + "learning_rate": 0.00019666892614630618, + "loss": 0.4689, + "step": 1748 + }, + { + "epoch": 0.08287135749822316, + "grad_norm": 0.44921875, + "learning_rate": 0.00019666511327219563, + "loss": 0.9652, + "step": 1749 + }, + { + "epoch": 0.08291873963515754, + "grad_norm": 0.51171875, + "learning_rate": 0.00019666129825415582, + "loss": 0.789, + "step": 1750 + }, + { + "epoch": 0.08296612177209192, + "grad_norm": 0.5625, + "learning_rate": 0.0001966574810922714, + "loss": 0.3402, + "step": 1751 + }, + { + "epoch": 0.0830135039090263, + "grad_norm": 0.76171875, + "learning_rate": 0.00019665366178662697, + "loss": 0.1491, + "step": 1752 + }, + { + "epoch": 0.08306088604596067, + "grad_norm": 0.703125, + "learning_rate": 0.0001966498403373073, + "loss": 0.0646, + "step": 1753 + }, + { + "epoch": 0.08310826818289505, + "grad_norm": 0.68359375, + "learning_rate": 0.00019664601674439708, + "loss": 0.2229, + "step": 1754 + }, + { + "epoch": 0.08315565031982942, + "grad_norm": 0.5390625, + "learning_rate": 0.00019664219100798118, + "loss": 0.0445, + "step": 1755 + }, + { + "epoch": 0.0832030324567638, + "grad_norm": 0.45703125, + "learning_rate": 0.0001966383631281444, + "loss": 0.9004, + "step": 1756 + }, + { + "epoch": 0.08325041459369818, + "grad_norm": 0.546875, + "learning_rate": 0.0001966345331049717, + "loss": 1.1838, + "step": 1757 + }, + { + "epoch": 0.08329779673063255, + "grad_norm": 0.416015625, + "learning_rate": 0.00019663070093854797, + "loss": 0.8954, + "step": 1758 + }, + { + "epoch": 0.08334517886756693, + "grad_norm": 0.57421875, + "learning_rate": 0.00019662686662895822, + "loss": 0.9397, + "step": 1759 + }, + { + "epoch": 0.0833925610045013, + "grad_norm": 0.515625, + "learning_rate": 0.00019662303017628746, + "loss": 1.0675, + "step": 1760 + }, + { + "epoch": 0.08343994314143569, + "grad_norm": 0.388671875, + "learning_rate": 0.00019661919158062084, + "loss": 0.0825, + "step": 1761 + }, + { + "epoch": 0.08348732527837005, + "grad_norm": 0.53125, + "learning_rate": 0.00019661535084204346, + "loss": 0.9899, + "step": 1762 + }, + { + "epoch": 0.08353470741530443, + "grad_norm": 0.4296875, + "learning_rate": 0.0001966115079606405, + "loss": 0.5338, + "step": 1763 + }, + { + "epoch": 0.08358208955223881, + "grad_norm": 0.8984375, + "learning_rate": 0.00019660766293649718, + "loss": 0.8799, + "step": 1764 + }, + { + "epoch": 0.08362947168917319, + "grad_norm": 0.55859375, + "learning_rate": 0.00019660381576969886, + "loss": 0.9777, + "step": 1765 + }, + { + "epoch": 0.08367685382610755, + "grad_norm": 0.455078125, + "learning_rate": 0.00019659996646033076, + "loss": 0.7217, + "step": 1766 + }, + { + "epoch": 0.08372423596304193, + "grad_norm": 0.5625, + "learning_rate": 0.0001965961150084783, + "loss": 0.978, + "step": 1767 + }, + { + "epoch": 0.08377161809997631, + "grad_norm": 0.6484375, + "learning_rate": 0.0001965922614142269, + "loss": 0.4204, + "step": 1768 + }, + { + "epoch": 0.08381900023691069, + "grad_norm": 0.5546875, + "learning_rate": 0.00019658840567766205, + "loss": 0.9237, + "step": 1769 + }, + { + "epoch": 0.08386638237384506, + "grad_norm": 0.482421875, + "learning_rate": 0.0001965845477988692, + "loss": 1.1636, + "step": 1770 + }, + { + "epoch": 0.08391376451077943, + "grad_norm": 0.62890625, + "learning_rate": 0.000196580687777934, + "loss": 0.9592, + "step": 1771 + }, + { + "epoch": 0.08396114664771381, + "grad_norm": 0.55859375, + "learning_rate": 0.00019657682561494198, + "loss": 0.9794, + "step": 1772 + }, + { + "epoch": 0.08400852878464819, + "grad_norm": 0.5859375, + "learning_rate": 0.00019657296130997886, + "loss": 1.0554, + "step": 1773 + }, + { + "epoch": 0.08405591092158256, + "grad_norm": 0.34765625, + "learning_rate": 0.00019656909486313033, + "loss": 0.0215, + "step": 1774 + }, + { + "epoch": 0.08410329305851694, + "grad_norm": 1.0546875, + "learning_rate": 0.00019656522627448212, + "loss": 0.4928, + "step": 1775 + }, + { + "epoch": 0.08415067519545132, + "grad_norm": 0.443359375, + "learning_rate": 0.00019656135554412004, + "loss": 1.4965, + "step": 1776 + }, + { + "epoch": 0.0841980573323857, + "grad_norm": 0.34375, + "learning_rate": 0.00019655748267212998, + "loss": 0.1763, + "step": 1777 + }, + { + "epoch": 0.08424543946932006, + "grad_norm": 0.486328125, + "learning_rate": 0.00019655360765859778, + "loss": 1.0559, + "step": 1778 + }, + { + "epoch": 0.08429282160625444, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019654973050360942, + "loss": 0.0206, + "step": 1779 + }, + { + "epoch": 0.08434020374318882, + "grad_norm": 0.6953125, + "learning_rate": 0.00019654585120725085, + "loss": 1.4635, + "step": 1780 + }, + { + "epoch": 0.0843875858801232, + "grad_norm": 0.51171875, + "learning_rate": 0.00019654196976960818, + "loss": 1.0031, + "step": 1781 + }, + { + "epoch": 0.08443496801705756, + "grad_norm": 0.5546875, + "learning_rate": 0.0001965380861907674, + "loss": 0.6398, + "step": 1782 + }, + { + "epoch": 0.08448235015399194, + "grad_norm": 0.6015625, + "learning_rate": 0.00019653420047081473, + "loss": 1.2147, + "step": 1783 + }, + { + "epoch": 0.08452973229092632, + "grad_norm": 0.7421875, + "learning_rate": 0.0001965303126098363, + "loss": 0.259, + "step": 1784 + }, + { + "epoch": 0.0845771144278607, + "grad_norm": 0.609375, + "learning_rate": 0.00019652642260791837, + "loss": 0.988, + "step": 1785 + }, + { + "epoch": 0.08462449656479507, + "grad_norm": 0.419921875, + "learning_rate": 0.0001965225304651472, + "loss": 0.7848, + "step": 1786 + }, + { + "epoch": 0.08467187870172944, + "grad_norm": 0.3046875, + "learning_rate": 0.0001965186361816091, + "loss": 0.022, + "step": 1787 + }, + { + "epoch": 0.08471926083866382, + "grad_norm": 0.251953125, + "learning_rate": 0.00019651473975739047, + "loss": 0.0087, + "step": 1788 + }, + { + "epoch": 0.0847666429755982, + "grad_norm": 0.58984375, + "learning_rate": 0.0001965108411925777, + "loss": 1.2164, + "step": 1789 + }, + { + "epoch": 0.08481402511253258, + "grad_norm": 1.015625, + "learning_rate": 0.00019650694048725732, + "loss": 0.8642, + "step": 1790 + }, + { + "epoch": 0.08486140724946695, + "grad_norm": 0.57421875, + "learning_rate": 0.00019650303764151574, + "loss": 1.1453, + "step": 1791 + }, + { + "epoch": 0.08490878938640133, + "grad_norm": 0.14453125, + "learning_rate": 0.0001964991326554396, + "loss": 0.0116, + "step": 1792 + }, + { + "epoch": 0.0849561715233357, + "grad_norm": 0.32421875, + "learning_rate": 0.00019649522552911547, + "loss": 0.1656, + "step": 1793 + }, + { + "epoch": 0.08500355366027008, + "grad_norm": 0.447265625, + "learning_rate": 0.00019649131626263002, + "loss": 0.791, + "step": 1794 + }, + { + "epoch": 0.08505093579720445, + "grad_norm": 0.421875, + "learning_rate": 0.00019648740485606996, + "loss": 0.0354, + "step": 1795 + }, + { + "epoch": 0.08509831793413883, + "grad_norm": 0.48828125, + "learning_rate": 0.00019648349130952207, + "loss": 0.9545, + "step": 1796 + }, + { + "epoch": 0.08514570007107321, + "grad_norm": 0.6796875, + "learning_rate": 0.00019647957562307305, + "loss": 0.036, + "step": 1797 + }, + { + "epoch": 0.08519308220800759, + "grad_norm": 0.296875, + "learning_rate": 0.00019647565779680983, + "loss": 0.1932, + "step": 1798 + }, + { + "epoch": 0.08524046434494195, + "grad_norm": 0.51953125, + "learning_rate": 0.00019647173783081932, + "loss": 0.8699, + "step": 1799 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 0.0556640625, + "learning_rate": 0.00019646781572518838, + "loss": 0.0034, + "step": 1800 + }, + { + "epoch": 0.08533522861881071, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019646389148000404, + "loss": 0.0264, + "step": 1801 + }, + { + "epoch": 0.08538261075574509, + "grad_norm": 0.4296875, + "learning_rate": 0.00019645996509535334, + "loss": 0.0255, + "step": 1802 + }, + { + "epoch": 0.08542999289267945, + "grad_norm": 0.474609375, + "learning_rate": 0.00019645603657132335, + "loss": 1.1372, + "step": 1803 + }, + { + "epoch": 0.08547737502961383, + "grad_norm": 0.455078125, + "learning_rate": 0.00019645210590800124, + "loss": 1.076, + "step": 1804 + }, + { + "epoch": 0.08552475716654821, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019644817310547416, + "loss": 0.0127, + "step": 1805 + }, + { + "epoch": 0.08557213930348259, + "grad_norm": 0.76171875, + "learning_rate": 0.0001964442381638293, + "loss": 0.7601, + "step": 1806 + }, + { + "epoch": 0.08561952144041696, + "grad_norm": 0.625, + "learning_rate": 0.00019644030108315397, + "loss": 1.2834, + "step": 1807 + }, + { + "epoch": 0.08566690357735134, + "grad_norm": 0.63671875, + "learning_rate": 0.0001964363618635355, + "loss": 0.9358, + "step": 1808 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.4375, + "learning_rate": 0.00019643242050506124, + "loss": 0.1022, + "step": 1809 + }, + { + "epoch": 0.0857616678512201, + "grad_norm": 0.6171875, + "learning_rate": 0.0001964284770078186, + "loss": 1.0071, + "step": 1810 + }, + { + "epoch": 0.08580904998815446, + "grad_norm": 1.203125, + "learning_rate": 0.00019642453137189505, + "loss": 0.0967, + "step": 1811 + }, + { + "epoch": 0.08585643212508884, + "grad_norm": 0.9921875, + "learning_rate": 0.00019642058359737813, + "loss": 0.9234, + "step": 1812 + }, + { + "epoch": 0.08590381426202322, + "grad_norm": 0.60546875, + "learning_rate": 0.00019641663368435533, + "loss": 1.2388, + "step": 1813 + }, + { + "epoch": 0.0859511963989576, + "grad_norm": 0.1484375, + "learning_rate": 0.0001964126816329143, + "loss": 0.007, + "step": 1814 + }, + { + "epoch": 0.08599857853589196, + "grad_norm": 0.953125, + "learning_rate": 0.0001964087274431427, + "loss": 0.0974, + "step": 1815 + }, + { + "epoch": 0.08604596067282634, + "grad_norm": 1.234375, + "learning_rate": 0.0001964047711151282, + "loss": 0.4457, + "step": 1816 + }, + { + "epoch": 0.08609334280976072, + "grad_norm": 0.44921875, + "learning_rate": 0.00019640081264895857, + "loss": 1.1292, + "step": 1817 + }, + { + "epoch": 0.0861407249466951, + "grad_norm": 0.5390625, + "learning_rate": 0.0001963968520447216, + "loss": 0.7719, + "step": 1818 + }, + { + "epoch": 0.08618810708362948, + "grad_norm": 0.52734375, + "learning_rate": 0.00019639288930250516, + "loss": 0.7508, + "step": 1819 + }, + { + "epoch": 0.08623548922056384, + "grad_norm": 0.384765625, + "learning_rate": 0.00019638892442239706, + "loss": 1.1699, + "step": 1820 + }, + { + "epoch": 0.08628287135749822, + "grad_norm": 0.78515625, + "learning_rate": 0.00019638495740448528, + "loss": 0.9526, + "step": 1821 + }, + { + "epoch": 0.0863302534944326, + "grad_norm": 0.5234375, + "learning_rate": 0.00019638098824885784, + "loss": 1.1326, + "step": 1822 + }, + { + "epoch": 0.08637763563136698, + "grad_norm": 0.470703125, + "learning_rate": 0.00019637701695560274, + "loss": 0.1677, + "step": 1823 + }, + { + "epoch": 0.08642501776830135, + "grad_norm": 0.462890625, + "learning_rate": 0.00019637304352480806, + "loss": 1.0371, + "step": 1824 + }, + { + "epoch": 0.08647239990523573, + "grad_norm": 0.34375, + "learning_rate": 0.0001963690679565619, + "loss": 0.208, + "step": 1825 + }, + { + "epoch": 0.0865197820421701, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019636509025095248, + "loss": 0.0226, + "step": 1826 + }, + { + "epoch": 0.08656716417910448, + "grad_norm": 0.333984375, + "learning_rate": 0.00019636111040806798, + "loss": 0.1305, + "step": 1827 + }, + { + "epoch": 0.08661454631603885, + "grad_norm": 0.54296875, + "learning_rate": 0.00019635712842799673, + "loss": 1.0012, + "step": 1828 + }, + { + "epoch": 0.08666192845297323, + "grad_norm": 0.6015625, + "learning_rate": 0.00019635314431082698, + "loss": 0.9523, + "step": 1829 + }, + { + "epoch": 0.0867093105899076, + "grad_norm": 0.89453125, + "learning_rate": 0.00019634915805664713, + "loss": 0.7971, + "step": 1830 + }, + { + "epoch": 0.08675669272684199, + "grad_norm": 0.54296875, + "learning_rate": 0.0001963451696655456, + "loss": 0.8661, + "step": 1831 + }, + { + "epoch": 0.08680407486377635, + "grad_norm": 0.51953125, + "learning_rate": 0.0001963411791376108, + "loss": 1.1867, + "step": 1832 + }, + { + "epoch": 0.08685145700071073, + "grad_norm": 0.5703125, + "learning_rate": 0.00019633718647293127, + "loss": 0.7809, + "step": 1833 + }, + { + "epoch": 0.08689883913764511, + "grad_norm": 0.60546875, + "learning_rate": 0.00019633319167159557, + "loss": 0.9822, + "step": 1834 + }, + { + "epoch": 0.08694622127457949, + "grad_norm": 0.58984375, + "learning_rate": 0.00019632919473369228, + "loss": 1.0346, + "step": 1835 + }, + { + "epoch": 0.08699360341151385, + "grad_norm": 0.265625, + "learning_rate": 0.00019632519565931006, + "loss": 0.1719, + "step": 1836 + }, + { + "epoch": 0.08704098554844823, + "grad_norm": 0.498046875, + "learning_rate": 0.00019632119444853757, + "loss": 0.3176, + "step": 1837 + }, + { + "epoch": 0.08708836768538261, + "grad_norm": 0.59375, + "learning_rate": 0.00019631719110146362, + "loss": 1.0714, + "step": 1838 + }, + { + "epoch": 0.08713574982231699, + "grad_norm": 0.330078125, + "learning_rate": 0.00019631318561817696, + "loss": 0.2127, + "step": 1839 + }, + { + "epoch": 0.08718313195925136, + "grad_norm": 0.44140625, + "learning_rate": 0.00019630917799876642, + "loss": 1.0841, + "step": 1840 + }, + { + "epoch": 0.08723051409618574, + "grad_norm": 0.69921875, + "learning_rate": 0.0001963051682433209, + "loss": 0.55, + "step": 1841 + }, + { + "epoch": 0.08727789623312011, + "grad_norm": 0.48046875, + "learning_rate": 0.00019630115635192933, + "loss": 0.7196, + "step": 1842 + }, + { + "epoch": 0.0873252783700545, + "grad_norm": 0.5859375, + "learning_rate": 0.0001962971423246807, + "loss": 1.0728, + "step": 1843 + }, + { + "epoch": 0.08737266050698886, + "grad_norm": 0.24609375, + "learning_rate": 0.000196293126161664, + "loss": 0.0204, + "step": 1844 + }, + { + "epoch": 0.08742004264392324, + "grad_norm": 0.6171875, + "learning_rate": 0.00019628910786296833, + "loss": 0.126, + "step": 1845 + }, + { + "epoch": 0.08746742478085762, + "grad_norm": 0.55078125, + "learning_rate": 0.00019628508742868285, + "loss": 1.0929, + "step": 1846 + }, + { + "epoch": 0.087514806917792, + "grad_norm": 0.498046875, + "learning_rate": 0.00019628106485889663, + "loss": 1.3432, + "step": 1847 + }, + { + "epoch": 0.08756218905472637, + "grad_norm": 0.6328125, + "learning_rate": 0.000196277040153699, + "loss": 1.217, + "step": 1848 + }, + { + "epoch": 0.08760957119166074, + "grad_norm": 0.57421875, + "learning_rate": 0.0001962730133131791, + "loss": 1.0241, + "step": 1849 + }, + { + "epoch": 0.08765695332859512, + "grad_norm": 0.01507568359375, + "learning_rate": 0.00019626898433742636, + "loss": 0.0011, + "step": 1850 + }, + { + "epoch": 0.0877043354655295, + "grad_norm": 0.55859375, + "learning_rate": 0.0001962649532265301, + "loss": 1.0595, + "step": 1851 + }, + { + "epoch": 0.08775171760246388, + "grad_norm": 0.51953125, + "learning_rate": 0.0001962609199805797, + "loss": 0.9597, + "step": 1852 + }, + { + "epoch": 0.08779909973939824, + "grad_norm": 0.74609375, + "learning_rate": 0.00019625688459966464, + "loss": 0.3966, + "step": 1853 + }, + { + "epoch": 0.08784648187633262, + "grad_norm": 0.58984375, + "learning_rate": 0.00019625284708387435, + "loss": 0.17, + "step": 1854 + }, + { + "epoch": 0.087893864013267, + "grad_norm": 0.0927734375, + "learning_rate": 0.00019624880743329847, + "loss": 0.006, + "step": 1855 + }, + { + "epoch": 0.08794124615020138, + "grad_norm": 0.58203125, + "learning_rate": 0.00019624476564802657, + "loss": 0.2036, + "step": 1856 + }, + { + "epoch": 0.08798862828713575, + "grad_norm": 0.59765625, + "learning_rate": 0.00019624072172814828, + "loss": 1.4628, + "step": 1857 + }, + { + "epoch": 0.08803601042407012, + "grad_norm": 0.93359375, + "learning_rate": 0.0001962366756737533, + "loss": 0.377, + "step": 1858 + }, + { + "epoch": 0.0880833925610045, + "grad_norm": 0.65234375, + "learning_rate": 0.00019623262748493135, + "loss": 1.082, + "step": 1859 + }, + { + "epoch": 0.08813077469793888, + "grad_norm": 0.435546875, + "learning_rate": 0.00019622857716177224, + "loss": 0.1617, + "step": 1860 + }, + { + "epoch": 0.08817815683487325, + "grad_norm": 0.474609375, + "learning_rate": 0.00019622452470436578, + "loss": 1.0647, + "step": 1861 + }, + { + "epoch": 0.08822553897180763, + "grad_norm": 0.5234375, + "learning_rate": 0.00019622047011280184, + "loss": 1.0775, + "step": 1862 + }, + { + "epoch": 0.088272921108742, + "grad_norm": 0.5703125, + "learning_rate": 0.00019621641338717037, + "loss": 1.3918, + "step": 1863 + }, + { + "epoch": 0.08832030324567638, + "grad_norm": 0.796875, + "learning_rate": 0.00019621235452756136, + "loss": 0.1497, + "step": 1864 + }, + { + "epoch": 0.08836768538261075, + "grad_norm": 0.373046875, + "learning_rate": 0.00019620829353406477, + "loss": 1.1239, + "step": 1865 + }, + { + "epoch": 0.08841506751954513, + "grad_norm": 0.53515625, + "learning_rate": 0.00019620423040677074, + "loss": 0.9789, + "step": 1866 + }, + { + "epoch": 0.08846244965647951, + "grad_norm": 0.51171875, + "learning_rate": 0.00019620016514576932, + "loss": 1.0354, + "step": 1867 + }, + { + "epoch": 0.08850983179341389, + "grad_norm": 0.5, + "learning_rate": 0.00019619609775115072, + "loss": 1.2406, + "step": 1868 + }, + { + "epoch": 0.08855721393034825, + "grad_norm": 0.458984375, + "learning_rate": 0.00019619202822300513, + "loss": 1.5745, + "step": 1869 + }, + { + "epoch": 0.08860459606728263, + "grad_norm": 0.61328125, + "learning_rate": 0.0001961879565614228, + "loss": 1.1745, + "step": 1870 + }, + { + "epoch": 0.08865197820421701, + "grad_norm": 1.0234375, + "learning_rate": 0.00019618388276649407, + "loss": 0.2836, + "step": 1871 + }, + { + "epoch": 0.08869936034115139, + "grad_norm": 1.21875, + "learning_rate": 0.00019617980683830926, + "loss": 0.379, + "step": 1872 + }, + { + "epoch": 0.08874674247808576, + "grad_norm": 0.81640625, + "learning_rate": 0.00019617572877695882, + "loss": 0.1641, + "step": 1873 + }, + { + "epoch": 0.08879412461502013, + "grad_norm": 0.33984375, + "learning_rate": 0.00019617164858253311, + "loss": 0.0742, + "step": 1874 + }, + { + "epoch": 0.08884150675195451, + "grad_norm": 0.49609375, + "learning_rate": 0.0001961675662551227, + "loss": 1.1144, + "step": 1875 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.5546875, + "learning_rate": 0.00019616348179481808, + "loss": 0.0538, + "step": 1876 + }, + { + "epoch": 0.08893627102582327, + "grad_norm": 0.44921875, + "learning_rate": 0.00019615939520170988, + "loss": 0.9442, + "step": 1877 + }, + { + "epoch": 0.08898365316275764, + "grad_norm": 0.6640625, + "learning_rate": 0.00019615530647588872, + "loss": 1.5793, + "step": 1878 + }, + { + "epoch": 0.08903103529969202, + "grad_norm": 0.478515625, + "learning_rate": 0.00019615121561744524, + "loss": 0.7052, + "step": 1879 + }, + { + "epoch": 0.0890784174366264, + "grad_norm": 0.60546875, + "learning_rate": 0.00019614712262647026, + "loss": 1.4601, + "step": 1880 + }, + { + "epoch": 0.08912579957356077, + "grad_norm": 0.65625, + "learning_rate": 0.00019614302750305452, + "loss": 1.1465, + "step": 1881 + }, + { + "epoch": 0.08917318171049514, + "grad_norm": 0.83984375, + "learning_rate": 0.0001961389302472888, + "loss": 1.4365, + "step": 1882 + }, + { + "epoch": 0.08922056384742952, + "grad_norm": 0.427734375, + "learning_rate": 0.000196134830859264, + "loss": 1.0184, + "step": 1883 + }, + { + "epoch": 0.0892679459843639, + "grad_norm": 0.65625, + "learning_rate": 0.0001961307293390711, + "loss": 1.4051, + "step": 1884 + }, + { + "epoch": 0.08931532812129828, + "grad_norm": 0.498046875, + "learning_rate": 0.00019612662568680098, + "loss": 0.7338, + "step": 1885 + }, + { + "epoch": 0.08936271025823264, + "grad_norm": 0.490234375, + "learning_rate": 0.0001961225199025447, + "loss": 1.3631, + "step": 1886 + }, + { + "epoch": 0.08941009239516702, + "grad_norm": 0.5234375, + "learning_rate": 0.00019611841198639333, + "loss": 0.8304, + "step": 1887 + }, + { + "epoch": 0.0894574745321014, + "grad_norm": 0.58203125, + "learning_rate": 0.00019611430193843793, + "loss": 0.8106, + "step": 1888 + }, + { + "epoch": 0.08950485666903578, + "grad_norm": 0.66015625, + "learning_rate": 0.00019611018975876972, + "loss": 0.0576, + "step": 1889 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.48046875, + "learning_rate": 0.00019610607544747987, + "loss": 0.5524, + "step": 1890 + }, + { + "epoch": 0.08959962094290452, + "grad_norm": 0.58984375, + "learning_rate": 0.00019610195900465963, + "loss": 1.0035, + "step": 1891 + }, + { + "epoch": 0.0896470030798389, + "grad_norm": 0.53125, + "learning_rate": 0.00019609784043040032, + "loss": 1.0234, + "step": 1892 + }, + { + "epoch": 0.08969438521677328, + "grad_norm": 0.62109375, + "learning_rate": 0.0001960937197247932, + "loss": 1.0955, + "step": 1893 + }, + { + "epoch": 0.08974176735370765, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001960895968879298, + "loss": 0.0322, + "step": 1894 + }, + { + "epoch": 0.08978914949064203, + "grad_norm": 0.48828125, + "learning_rate": 0.00019608547191990146, + "loss": 0.7134, + "step": 1895 + }, + { + "epoch": 0.0898365316275764, + "grad_norm": 0.498046875, + "learning_rate": 0.00019608134482079973, + "loss": 0.8305, + "step": 1896 + }, + { + "epoch": 0.08988391376451078, + "grad_norm": 0.703125, + "learning_rate": 0.00019607721559071608, + "loss": 1.1003, + "step": 1897 + }, + { + "epoch": 0.08993129590144515, + "grad_norm": 0.625, + "learning_rate": 0.00019607308422974216, + "loss": 0.9224, + "step": 1898 + }, + { + "epoch": 0.08997867803837953, + "grad_norm": 0.47265625, + "learning_rate": 0.0001960689507379695, + "loss": 0.2233, + "step": 1899 + }, + { + "epoch": 0.09002606017531391, + "grad_norm": 0.5546875, + "learning_rate": 0.0001960648151154899, + "loss": 0.1031, + "step": 1900 + }, + { + "epoch": 0.09007344231224829, + "grad_norm": 0.3359375, + "learning_rate": 0.000196060677362395, + "loss": 0.1767, + "step": 1901 + }, + { + "epoch": 0.09012082444918265, + "grad_norm": 0.042236328125, + "learning_rate": 0.0001960565374787766, + "loss": 0.0024, + "step": 1902 + }, + { + "epoch": 0.09016820658611703, + "grad_norm": 0.51953125, + "learning_rate": 0.0001960523954647265, + "loss": 1.0201, + "step": 1903 + }, + { + "epoch": 0.09021558872305141, + "grad_norm": 0.5390625, + "learning_rate": 0.00019604825132033663, + "loss": 1.044, + "step": 1904 + }, + { + "epoch": 0.09026297085998579, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001960441050456988, + "loss": 0.0299, + "step": 1905 + }, + { + "epoch": 0.09031035299692017, + "grad_norm": 0.498046875, + "learning_rate": 0.000196039956640905, + "loss": 1.0385, + "step": 1906 + }, + { + "epoch": 0.09035773513385453, + "grad_norm": 0.53125, + "learning_rate": 0.0001960358061060473, + "loss": 1.0603, + "step": 1907 + }, + { + "epoch": 0.09040511727078891, + "grad_norm": 0.38671875, + "learning_rate": 0.00019603165344121774, + "loss": 0.8072, + "step": 1908 + }, + { + "epoch": 0.09045249940772329, + "grad_norm": 0.8046875, + "learning_rate": 0.00019602749864650835, + "loss": 0.3386, + "step": 1909 + }, + { + "epoch": 0.09049988154465767, + "grad_norm": 0.6875, + "learning_rate": 0.00019602334172201137, + "loss": 1.4393, + "step": 1910 + }, + { + "epoch": 0.09054726368159204, + "grad_norm": 0.6015625, + "learning_rate": 0.0001960191826678189, + "loss": 1.5312, + "step": 1911 + }, + { + "epoch": 0.09059464581852641, + "grad_norm": 0.96484375, + "learning_rate": 0.00019601502148402323, + "loss": 0.3824, + "step": 1912 + }, + { + "epoch": 0.0906420279554608, + "grad_norm": 0.80078125, + "learning_rate": 0.0001960108581707167, + "loss": 0.2593, + "step": 1913 + }, + { + "epoch": 0.09068941009239517, + "grad_norm": 0.51171875, + "learning_rate": 0.00019600669272799157, + "loss": 0.102, + "step": 1914 + }, + { + "epoch": 0.09073679222932954, + "grad_norm": 0.62109375, + "learning_rate": 0.00019600252515594027, + "loss": 0.9024, + "step": 1915 + }, + { + "epoch": 0.09078417436626392, + "grad_norm": 0.59765625, + "learning_rate": 0.0001959983554546552, + "loss": 0.6981, + "step": 1916 + }, + { + "epoch": 0.0908315565031983, + "grad_norm": 0.5234375, + "learning_rate": 0.0001959941836242289, + "loss": 1.0576, + "step": 1917 + }, + { + "epoch": 0.09087893864013268, + "grad_norm": 0.5390625, + "learning_rate": 0.00019599000966475383, + "loss": 0.6964, + "step": 1918 + }, + { + "epoch": 0.09092632077706704, + "grad_norm": 0.53515625, + "learning_rate": 0.00019598583357632258, + "loss": 0.0821, + "step": 1919 + }, + { + "epoch": 0.09097370291400142, + "grad_norm": 0.5546875, + "learning_rate": 0.00019598165535902778, + "loss": 0.8816, + "step": 1920 + }, + { + "epoch": 0.0910210850509358, + "grad_norm": 0.55859375, + "learning_rate": 0.0001959774750129621, + "loss": 1.0731, + "step": 1921 + }, + { + "epoch": 0.09106846718787018, + "grad_norm": 0.58984375, + "learning_rate": 0.0001959732925382183, + "loss": 0.7151, + "step": 1922 + }, + { + "epoch": 0.09111584932480454, + "grad_norm": 0.67578125, + "learning_rate": 0.00019596910793488902, + "loss": 1.176, + "step": 1923 + }, + { + "epoch": 0.09116323146173892, + "grad_norm": 0.85546875, + "learning_rate": 0.0001959649212030672, + "loss": 0.3397, + "step": 1924 + }, + { + "epoch": 0.0912106135986733, + "grad_norm": 0.484375, + "learning_rate": 0.00019596073234284561, + "loss": 1.1506, + "step": 1925 + }, + { + "epoch": 0.09125799573560768, + "grad_norm": 0.58203125, + "learning_rate": 0.00019595654135431723, + "loss": 0.7399, + "step": 1926 + }, + { + "epoch": 0.09130537787254205, + "grad_norm": 0.58203125, + "learning_rate": 0.00019595234823757495, + "loss": 0.1751, + "step": 1927 + }, + { + "epoch": 0.09135276000947642, + "grad_norm": 0.5625, + "learning_rate": 0.0001959481529927118, + "loss": 1.2673, + "step": 1928 + }, + { + "epoch": 0.0914001421464108, + "grad_norm": 0.53125, + "learning_rate": 0.00019594395561982081, + "loss": 0.9565, + "step": 1929 + }, + { + "epoch": 0.09144752428334518, + "grad_norm": 0.34375, + "learning_rate": 0.00019593975611899506, + "loss": 0.4006, + "step": 1930 + }, + { + "epoch": 0.09149490642027955, + "grad_norm": 0.451171875, + "learning_rate": 0.00019593555449032773, + "loss": 0.62, + "step": 1931 + }, + { + "epoch": 0.09154228855721393, + "grad_norm": 0.671875, + "learning_rate": 0.000195931350733912, + "loss": 1.1285, + "step": 1932 + }, + { + "epoch": 0.0915896706941483, + "grad_norm": 0.447265625, + "learning_rate": 0.00019592714484984106, + "loss": 0.8639, + "step": 1933 + }, + { + "epoch": 0.09163705283108269, + "grad_norm": 0.55859375, + "learning_rate": 0.00019592293683820826, + "loss": 1.1909, + "step": 1934 + }, + { + "epoch": 0.09168443496801706, + "grad_norm": 0.546875, + "learning_rate": 0.0001959187266991069, + "loss": 1.2091, + "step": 1935 + }, + { + "epoch": 0.09173181710495143, + "grad_norm": 0.57421875, + "learning_rate": 0.0001959145144326303, + "loss": 1.3906, + "step": 1936 + }, + { + "epoch": 0.09177919924188581, + "grad_norm": 0.54296875, + "learning_rate": 0.00019591030003887196, + "loss": 0.248, + "step": 1937 + }, + { + "epoch": 0.09182658137882019, + "grad_norm": 0.5625, + "learning_rate": 0.00019590608351792533, + "loss": 1.1041, + "step": 1938 + }, + { + "epoch": 0.09187396351575457, + "grad_norm": 0.609375, + "learning_rate": 0.00019590186486988391, + "loss": 1.1581, + "step": 1939 + }, + { + "epoch": 0.09192134565268893, + "grad_norm": 0.46875, + "learning_rate": 0.0001958976440948413, + "loss": 0.9681, + "step": 1940 + }, + { + "epoch": 0.09196872778962331, + "grad_norm": 0.447265625, + "learning_rate": 0.00019589342119289105, + "loss": 0.8862, + "step": 1941 + }, + { + "epoch": 0.09201610992655769, + "grad_norm": 0.6171875, + "learning_rate": 0.0001958891961641269, + "loss": 1.1146, + "step": 1942 + }, + { + "epoch": 0.09206349206349207, + "grad_norm": 0.439453125, + "learning_rate": 0.0001958849690086425, + "loss": 0.5821, + "step": 1943 + }, + { + "epoch": 0.09211087420042643, + "grad_norm": 1.140625, + "learning_rate": 0.0001958807397265316, + "loss": 0.6119, + "step": 1944 + }, + { + "epoch": 0.09215825633736081, + "grad_norm": 0.88671875, + "learning_rate": 0.00019587650831788805, + "loss": 0.5591, + "step": 1945 + }, + { + "epoch": 0.09220563847429519, + "grad_norm": 0.546875, + "learning_rate": 0.00019587227478280562, + "loss": 0.3989, + "step": 1946 + }, + { + "epoch": 0.09225302061122957, + "grad_norm": 0.57421875, + "learning_rate": 0.0001958680391213783, + "loss": 1.3922, + "step": 1947 + }, + { + "epoch": 0.09230040274816394, + "grad_norm": 0.50390625, + "learning_rate": 0.00019586380133369998, + "loss": 0.1463, + "step": 1948 + }, + { + "epoch": 0.09234778488509832, + "grad_norm": 1.1484375, + "learning_rate": 0.00019585956141986467, + "loss": 0.922, + "step": 1949 + }, + { + "epoch": 0.0923951670220327, + "grad_norm": 0.3671875, + "learning_rate": 0.00019585531937996636, + "loss": 0.2105, + "step": 1950 + }, + { + "epoch": 0.09244254915896707, + "grad_norm": 0.50390625, + "learning_rate": 0.00019585107521409917, + "loss": 0.9343, + "step": 1951 + }, + { + "epoch": 0.09248993129590144, + "grad_norm": 0.44921875, + "learning_rate": 0.00019584682892235725, + "loss": 1.093, + "step": 1952 + }, + { + "epoch": 0.09253731343283582, + "grad_norm": 0.58203125, + "learning_rate": 0.00019584258050483474, + "loss": 1.3191, + "step": 1953 + }, + { + "epoch": 0.0925846955697702, + "grad_norm": 0.259765625, + "learning_rate": 0.0001958383299616259, + "loss": 0.0516, + "step": 1954 + }, + { + "epoch": 0.09263207770670458, + "grad_norm": 0.4765625, + "learning_rate": 0.00019583407729282498, + "loss": 0.8796, + "step": 1955 + }, + { + "epoch": 0.09267945984363894, + "grad_norm": 0.3984375, + "learning_rate": 0.00019582982249852625, + "loss": 0.5609, + "step": 1956 + }, + { + "epoch": 0.09272684198057332, + "grad_norm": 0.404296875, + "learning_rate": 0.0001958255655788242, + "loss": 0.6923, + "step": 1957 + }, + { + "epoch": 0.0927742241175077, + "grad_norm": 0.43359375, + "learning_rate": 0.00019582130653381314, + "loss": 0.2134, + "step": 1958 + }, + { + "epoch": 0.09282160625444208, + "grad_norm": 0.671875, + "learning_rate": 0.00019581704536358755, + "loss": 1.4877, + "step": 1959 + }, + { + "epoch": 0.09286898839137644, + "grad_norm": 0.5625, + "learning_rate": 0.000195812782068242, + "loss": 0.8046, + "step": 1960 + }, + { + "epoch": 0.09291637052831082, + "grad_norm": 0.52734375, + "learning_rate": 0.00019580851664787098, + "loss": 1.401, + "step": 1961 + }, + { + "epoch": 0.0929637526652452, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001958042491025691, + "loss": 0.017, + "step": 1962 + }, + { + "epoch": 0.09301113480217958, + "grad_norm": 0.53515625, + "learning_rate": 0.00019579997943243102, + "loss": 0.961, + "step": 1963 + }, + { + "epoch": 0.09305851693911395, + "grad_norm": 0.5859375, + "learning_rate": 0.00019579570763755144, + "loss": 0.157, + "step": 1964 + }, + { + "epoch": 0.09310589907604833, + "grad_norm": 0.37890625, + "learning_rate": 0.0001957914337180251, + "loss": 0.1928, + "step": 1965 + }, + { + "epoch": 0.0931532812129827, + "grad_norm": 0.55859375, + "learning_rate": 0.00019578715767394682, + "loss": 0.2222, + "step": 1966 + }, + { + "epoch": 0.09320066334991708, + "grad_norm": 0.30078125, + "learning_rate": 0.00019578287950541137, + "loss": 0.2013, + "step": 1967 + }, + { + "epoch": 0.09324804548685146, + "grad_norm": 0.4765625, + "learning_rate": 0.00019577859921251372, + "loss": 0.1711, + "step": 1968 + }, + { + "epoch": 0.09329542762378583, + "grad_norm": 0.5, + "learning_rate": 0.00019577431679534876, + "loss": 1.3195, + "step": 1969 + }, + { + "epoch": 0.09334280976072021, + "grad_norm": 0.625, + "learning_rate": 0.00019577003225401142, + "loss": 0.9525, + "step": 1970 + }, + { + "epoch": 0.09339019189765459, + "grad_norm": 0.5390625, + "learning_rate": 0.00019576574558859678, + "loss": 0.8443, + "step": 1971 + }, + { + "epoch": 0.09343757403458897, + "grad_norm": 0.56640625, + "learning_rate": 0.00019576145679919992, + "loss": 0.8663, + "step": 1972 + }, + { + "epoch": 0.09348495617152333, + "grad_norm": 0.62109375, + "learning_rate": 0.00019575716588591597, + "loss": 0.1549, + "step": 1973 + }, + { + "epoch": 0.09353233830845771, + "grad_norm": 0.490234375, + "learning_rate": 0.00019575287284884005, + "loss": 0.211, + "step": 1974 + }, + { + "epoch": 0.09357972044539209, + "grad_norm": 0.494140625, + "learning_rate": 0.00019574857768806742, + "loss": 0.5792, + "step": 1975 + }, + { + "epoch": 0.09362710258232647, + "grad_norm": 0.53125, + "learning_rate": 0.0001957442804036933, + "loss": 1.4567, + "step": 1976 + }, + { + "epoch": 0.09367448471926083, + "grad_norm": 0.125, + "learning_rate": 0.00019573998099581304, + "loss": 0.012, + "step": 1977 + }, + { + "epoch": 0.09372186685619521, + "grad_norm": 0.404296875, + "learning_rate": 0.00019573567946452197, + "loss": 0.8247, + "step": 1978 + }, + { + "epoch": 0.09376924899312959, + "grad_norm": 0.18359375, + "learning_rate": 0.0001957313758099155, + "loss": 0.0189, + "step": 1979 + }, + { + "epoch": 0.09381663113006397, + "grad_norm": 0.58203125, + "learning_rate": 0.00019572707003208913, + "loss": 1.1493, + "step": 1980 + }, + { + "epoch": 0.09386401326699834, + "grad_norm": 0.58984375, + "learning_rate": 0.00019572276213113824, + "loss": 1.4851, + "step": 1981 + }, + { + "epoch": 0.09391139540393272, + "grad_norm": 0.58203125, + "learning_rate": 0.00019571845210715849, + "loss": 1.2883, + "step": 1982 + }, + { + "epoch": 0.0939587775408671, + "grad_norm": 0.5859375, + "learning_rate": 0.00019571413996024542, + "loss": 1.0645, + "step": 1983 + }, + { + "epoch": 0.09400615967780147, + "grad_norm": 0.71484375, + "learning_rate": 0.00019570982569049467, + "loss": 0.0704, + "step": 1984 + }, + { + "epoch": 0.09405354181473584, + "grad_norm": 0.5078125, + "learning_rate": 0.00019570550929800192, + "loss": 0.9932, + "step": 1985 + }, + { + "epoch": 0.09410092395167022, + "grad_norm": 0.57421875, + "learning_rate": 0.00019570119078286293, + "loss": 0.9952, + "step": 1986 + }, + { + "epoch": 0.0941483060886046, + "grad_norm": 0.34765625, + "learning_rate": 0.00019569687014517348, + "loss": 0.2108, + "step": 1987 + }, + { + "epoch": 0.09419568822553898, + "grad_norm": 0.46875, + "learning_rate": 0.0001956925473850294, + "loss": 0.8497, + "step": 1988 + }, + { + "epoch": 0.09424307036247334, + "grad_norm": 0.59375, + "learning_rate": 0.00019568822250252648, + "loss": 1.1613, + "step": 1989 + }, + { + "epoch": 0.09429045249940772, + "grad_norm": 0.498046875, + "learning_rate": 0.0001956838954977607, + "loss": 0.5845, + "step": 1990 + }, + { + "epoch": 0.0943378346363421, + "grad_norm": 0.6640625, + "learning_rate": 0.0001956795663708281, + "loss": 1.0806, + "step": 1991 + }, + { + "epoch": 0.09438521677327648, + "grad_norm": 0.498046875, + "learning_rate": 0.0001956752351218246, + "loss": 0.8906, + "step": 1992 + }, + { + "epoch": 0.09443259891021084, + "grad_norm": 0.609375, + "learning_rate": 0.00019567090175084633, + "loss": 1.265, + "step": 1993 + }, + { + "epoch": 0.09447998104714522, + "grad_norm": 0.7890625, + "learning_rate": 0.0001956665662579893, + "loss": 0.295, + "step": 1994 + }, + { + "epoch": 0.0945273631840796, + "grad_norm": 0.6640625, + "learning_rate": 0.0001956622286433498, + "loss": 0.0936, + "step": 1995 + }, + { + "epoch": 0.09457474532101398, + "grad_norm": 0.345703125, + "learning_rate": 0.0001956578889070239, + "loss": 0.4778, + "step": 1996 + }, + { + "epoch": 0.09462212745794836, + "grad_norm": 0.55859375, + "learning_rate": 0.00019565354704910792, + "loss": 1.3791, + "step": 1997 + }, + { + "epoch": 0.09466950959488273, + "grad_norm": 0.55078125, + "learning_rate": 0.00019564920306969818, + "loss": 1.0183, + "step": 1998 + }, + { + "epoch": 0.0947168917318171, + "grad_norm": 1.2265625, + "learning_rate": 0.00019564485696889098, + "loss": 1.125, + "step": 1999 + }, + { + "epoch": 0.09476427386875148, + "grad_norm": 0.59375, + "learning_rate": 0.00019564050874678276, + "loss": 0.881, + "step": 2000 + }, + { + "epoch": 0.09481165600568586, + "grad_norm": 0.07861328125, + "learning_rate": 0.0001956361584034699, + "loss": 0.0042, + "step": 2001 + }, + { + "epoch": 0.09485903814262023, + "grad_norm": 0.55078125, + "learning_rate": 0.0001956318059390489, + "loss": 0.9589, + "step": 2002 + }, + { + "epoch": 0.0949064202795546, + "grad_norm": 0.53125, + "learning_rate": 0.00019562745135361635, + "loss": 0.8589, + "step": 2003 + }, + { + "epoch": 0.09495380241648899, + "grad_norm": 0.427734375, + "learning_rate": 0.00019562309464726875, + "loss": 0.8546, + "step": 2004 + }, + { + "epoch": 0.09500118455342337, + "grad_norm": 0.59765625, + "learning_rate": 0.0001956187358201028, + "loss": 1.1615, + "step": 2005 + }, + { + "epoch": 0.09504856669035773, + "grad_norm": 0.76171875, + "learning_rate": 0.0001956143748722151, + "loss": 1.1912, + "step": 2006 + }, + { + "epoch": 0.09509594882729211, + "grad_norm": 0.56640625, + "learning_rate": 0.0001956100118037024, + "loss": 1.4969, + "step": 2007 + }, + { + "epoch": 0.09514333096422649, + "grad_norm": 0.5, + "learning_rate": 0.0001956056466146615, + "loss": 1.0647, + "step": 2008 + }, + { + "epoch": 0.09519071310116087, + "grad_norm": 0.6328125, + "learning_rate": 0.00019560127930518922, + "loss": 0.7003, + "step": 2009 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.7109375, + "learning_rate": 0.00019559690987538234, + "loss": 0.4448, + "step": 2010 + }, + { + "epoch": 0.09528547737502961, + "grad_norm": 0.5390625, + "learning_rate": 0.00019559253832533786, + "loss": 1.2646, + "step": 2011 + }, + { + "epoch": 0.09533285951196399, + "grad_norm": 0.546875, + "learning_rate": 0.0001955881646551527, + "loss": 1.2954, + "step": 2012 + }, + { + "epoch": 0.09538024164889837, + "grad_norm": 0.5703125, + "learning_rate": 0.00019558378886492387, + "loss": 0.9051, + "step": 2013 + }, + { + "epoch": 0.09542762378583274, + "grad_norm": 0.5078125, + "learning_rate": 0.0001955794109547484, + "loss": 0.9611, + "step": 2014 + }, + { + "epoch": 0.09547500592276711, + "grad_norm": 0.68359375, + "learning_rate": 0.0001955750309247234, + "loss": 1.608, + "step": 2015 + }, + { + "epoch": 0.0955223880597015, + "grad_norm": 0.0086669921875, + "learning_rate": 0.000195570648774946, + "loss": 0.0008, + "step": 2016 + }, + { + "epoch": 0.09556977019663587, + "grad_norm": 0.67578125, + "learning_rate": 0.00019556626450551343, + "loss": 1.1494, + "step": 2017 + }, + { + "epoch": 0.09561715233357024, + "grad_norm": 0.470703125, + "learning_rate": 0.0001955618781165229, + "loss": 1.3059, + "step": 2018 + }, + { + "epoch": 0.09566453447050462, + "grad_norm": 0.470703125, + "learning_rate": 0.0001955574896080717, + "loss": 1.0875, + "step": 2019 + }, + { + "epoch": 0.095711916607439, + "grad_norm": 0.5234375, + "learning_rate": 0.00019555309898025718, + "loss": 0.706, + "step": 2020 + }, + { + "epoch": 0.09575929874437338, + "grad_norm": 0.61328125, + "learning_rate": 0.00019554870623317668, + "loss": 0.9968, + "step": 2021 + }, + { + "epoch": 0.09580668088130774, + "grad_norm": 0.451171875, + "learning_rate": 0.00019554431136692765, + "loss": 0.8222, + "step": 2022 + }, + { + "epoch": 0.09585406301824212, + "grad_norm": 0.53515625, + "learning_rate": 0.0001955399143816076, + "loss": 1.1134, + "step": 2023 + }, + { + "epoch": 0.0959014451551765, + "grad_norm": 0.64453125, + "learning_rate": 0.00019553551527731397, + "loss": 0.9769, + "step": 2024 + }, + { + "epoch": 0.09594882729211088, + "grad_norm": 0.341796875, + "learning_rate": 0.0001955311140541444, + "loss": 0.0852, + "step": 2025 + }, + { + "epoch": 0.09599620942904526, + "grad_norm": 0.474609375, + "learning_rate": 0.00019552671071219644, + "loss": 1.1401, + "step": 2026 + }, + { + "epoch": 0.09604359156597962, + "grad_norm": 0.453125, + "learning_rate": 0.00019552230525156784, + "loss": 0.9021, + "step": 2027 + }, + { + "epoch": 0.096090973702914, + "grad_norm": 0.4765625, + "learning_rate": 0.0001955178976723562, + "loss": 0.5925, + "step": 2028 + }, + { + "epoch": 0.09613835583984838, + "grad_norm": 0.91015625, + "learning_rate": 0.00019551348797465935, + "loss": 1.123, + "step": 2029 + }, + { + "epoch": 0.09618573797678276, + "grad_norm": 0.66015625, + "learning_rate": 0.00019550907615857507, + "loss": 0.0715, + "step": 2030 + }, + { + "epoch": 0.09623312011371712, + "grad_norm": 0.515625, + "learning_rate": 0.00019550466222420125, + "loss": 1.0904, + "step": 2031 + }, + { + "epoch": 0.0962805022506515, + "grad_norm": 0.61328125, + "learning_rate": 0.0001955002461716357, + "loss": 1.1416, + "step": 2032 + }, + { + "epoch": 0.09632788438758588, + "grad_norm": 0.486328125, + "learning_rate": 0.00019549582800097644, + "loss": 1.2338, + "step": 2033 + }, + { + "epoch": 0.09637526652452026, + "grad_norm": 0.4375, + "learning_rate": 0.00019549140771232143, + "loss": 0.8136, + "step": 2034 + }, + { + "epoch": 0.09642264866145463, + "grad_norm": 0.51171875, + "learning_rate": 0.0001954869853057687, + "loss": 0.857, + "step": 2035 + }, + { + "epoch": 0.096470030798389, + "grad_norm": 1.8828125, + "learning_rate": 0.00019548256078141636, + "loss": 1.0993, + "step": 2036 + }, + { + "epoch": 0.09651741293532339, + "grad_norm": 0.76953125, + "learning_rate": 0.0001954781341393625, + "loss": 0.6322, + "step": 2037 + }, + { + "epoch": 0.09656479507225776, + "grad_norm": 0.5625, + "learning_rate": 0.00019547370537970532, + "loss": 0.1113, + "step": 2038 + }, + { + "epoch": 0.09661217720919213, + "grad_norm": 0.62890625, + "learning_rate": 0.0001954692745025431, + "loss": 0.9889, + "step": 2039 + }, + { + "epoch": 0.09665955934612651, + "grad_norm": 0.6015625, + "learning_rate": 0.00019546484150797402, + "loss": 1.0365, + "step": 2040 + }, + { + "epoch": 0.09670694148306089, + "grad_norm": 0.462890625, + "learning_rate": 0.00019546040639609644, + "loss": 1.5881, + "step": 2041 + }, + { + "epoch": 0.09675432361999527, + "grad_norm": 0.546875, + "learning_rate": 0.00019545596916700872, + "loss": 1.3077, + "step": 2042 + }, + { + "epoch": 0.09680170575692963, + "grad_norm": 0.62890625, + "learning_rate": 0.00019545152982080932, + "loss": 1.132, + "step": 2043 + }, + { + "epoch": 0.09684908789386401, + "grad_norm": 0.6015625, + "learning_rate": 0.00019544708835759662, + "loss": 1.2448, + "step": 2044 + }, + { + "epoch": 0.09689647003079839, + "grad_norm": 0.59375, + "learning_rate": 0.0001954426447774692, + "loss": 1.0699, + "step": 2045 + }, + { + "epoch": 0.09694385216773277, + "grad_norm": 0.78515625, + "learning_rate": 0.00019543819908052554, + "loss": 0.4107, + "step": 2046 + }, + { + "epoch": 0.09699123430466713, + "grad_norm": 0.083984375, + "learning_rate": 0.0001954337512668643, + "loss": 0.0083, + "step": 2047 + }, + { + "epoch": 0.09703861644160151, + "grad_norm": 0.7578125, + "learning_rate": 0.00019542930133658408, + "loss": 0.2417, + "step": 2048 + }, + { + "epoch": 0.09708599857853589, + "grad_norm": 0.5859375, + "learning_rate": 0.00019542484928978363, + "loss": 1.1712, + "step": 2049 + }, + { + "epoch": 0.09713338071547027, + "grad_norm": 0.486328125, + "learning_rate": 0.00019542039512656167, + "loss": 1.1497, + "step": 2050 + }, + { + "epoch": 0.09718076285240464, + "grad_norm": 0.3984375, + "learning_rate": 0.00019541593884701697, + "loss": 0.7131, + "step": 2051 + }, + { + "epoch": 0.09722814498933902, + "grad_norm": 0.58203125, + "learning_rate": 0.0001954114804512484, + "loss": 0.7727, + "step": 2052 + }, + { + "epoch": 0.0972755271262734, + "grad_norm": 0.56640625, + "learning_rate": 0.0001954070199393548, + "loss": 0.9297, + "step": 2053 + }, + { + "epoch": 0.09732290926320777, + "grad_norm": 0.56640625, + "learning_rate": 0.00019540255731143513, + "loss": 0.1139, + "step": 2054 + }, + { + "epoch": 0.09737029140014215, + "grad_norm": 0.0693359375, + "learning_rate": 0.00019539809256758836, + "loss": 0.0037, + "step": 2055 + }, + { + "epoch": 0.09741767353707652, + "grad_norm": 0.294921875, + "learning_rate": 0.00019539362570791352, + "loss": 0.208, + "step": 2056 + }, + { + "epoch": 0.0974650556740109, + "grad_norm": 0.62890625, + "learning_rate": 0.00019538915673250964, + "loss": 1.1762, + "step": 2057 + }, + { + "epoch": 0.09751243781094528, + "grad_norm": 0.52734375, + "learning_rate": 0.00019538468564147588, + "loss": 0.9383, + "step": 2058 + }, + { + "epoch": 0.09755981994787966, + "grad_norm": 0.4921875, + "learning_rate": 0.0001953802124349114, + "loss": 1.2892, + "step": 2059 + }, + { + "epoch": 0.09760720208481402, + "grad_norm": 1.0078125, + "learning_rate": 0.0001953757371129154, + "loss": 0.3101, + "step": 2060 + }, + { + "epoch": 0.0976545842217484, + "grad_norm": 0.416015625, + "learning_rate": 0.00019537125967558714, + "loss": 0.0716, + "step": 2061 + }, + { + "epoch": 0.09770196635868278, + "grad_norm": 0.64453125, + "learning_rate": 0.00019536678012302592, + "loss": 1.0844, + "step": 2062 + }, + { + "epoch": 0.09774934849561716, + "grad_norm": 0.6796875, + "learning_rate": 0.0001953622984553311, + "loss": 0.9733, + "step": 2063 + }, + { + "epoch": 0.09779673063255152, + "grad_norm": 0.39453125, + "learning_rate": 0.00019535781467260206, + "loss": 0.0277, + "step": 2064 + }, + { + "epoch": 0.0978441127694859, + "grad_norm": 0.66796875, + "learning_rate": 0.0001953533287749383, + "loss": 0.0644, + "step": 2065 + }, + { + "epoch": 0.09789149490642028, + "grad_norm": 0.59375, + "learning_rate": 0.00019534884076243922, + "loss": 0.8489, + "step": 2066 + }, + { + "epoch": 0.09793887704335466, + "grad_norm": 0.55859375, + "learning_rate": 0.00019534435063520446, + "loss": 0.8202, + "step": 2067 + }, + { + "epoch": 0.09798625918028903, + "grad_norm": 0.75, + "learning_rate": 0.00019533985839333354, + "loss": 0.9201, + "step": 2068 + }, + { + "epoch": 0.0980336413172234, + "grad_norm": 0.25390625, + "learning_rate": 0.0001953353640369261, + "loss": 0.0525, + "step": 2069 + }, + { + "epoch": 0.09808102345415778, + "grad_norm": 0.43359375, + "learning_rate": 0.00019533086756608187, + "loss": 0.9008, + "step": 2070 + }, + { + "epoch": 0.09812840559109216, + "grad_norm": 0.62109375, + "learning_rate": 0.0001953263689809005, + "loss": 1.6623, + "step": 2071 + }, + { + "epoch": 0.09817578772802653, + "grad_norm": 0.5546875, + "learning_rate": 0.00019532186828148183, + "loss": 1.2798, + "step": 2072 + }, + { + "epoch": 0.09822316986496091, + "grad_norm": 0.48046875, + "learning_rate": 0.00019531736546792562, + "loss": 0.8696, + "step": 2073 + }, + { + "epoch": 0.09827055200189529, + "grad_norm": 0.65234375, + "learning_rate": 0.0001953128605403318, + "loss": 1.2022, + "step": 2074 + }, + { + "epoch": 0.09831793413882967, + "grad_norm": 0.47265625, + "learning_rate": 0.00019530835349880027, + "loss": 0.8575, + "step": 2075 + }, + { + "epoch": 0.09836531627576403, + "grad_norm": 0.55859375, + "learning_rate": 0.00019530384434343097, + "loss": 1.1681, + "step": 2076 + }, + { + "epoch": 0.09841269841269841, + "grad_norm": 0.765625, + "learning_rate": 0.0001952993330743239, + "loss": 1.0974, + "step": 2077 + }, + { + "epoch": 0.09846008054963279, + "grad_norm": 0.451171875, + "learning_rate": 0.00019529481969157912, + "loss": 0.135, + "step": 2078 + }, + { + "epoch": 0.09850746268656717, + "grad_norm": 0.58203125, + "learning_rate": 0.00019529030419529675, + "loss": 1.3588, + "step": 2079 + }, + { + "epoch": 0.09855484482350153, + "grad_norm": 0.482421875, + "learning_rate": 0.00019528578658557696, + "loss": 1.0843, + "step": 2080 + }, + { + "epoch": 0.09860222696043591, + "grad_norm": 0.5703125, + "learning_rate": 0.00019528126686251988, + "loss": 0.9951, + "step": 2081 + }, + { + "epoch": 0.09864960909737029, + "grad_norm": 1.0078125, + "learning_rate": 0.0001952767450262258, + "loss": 0.3888, + "step": 2082 + }, + { + "epoch": 0.09869699123430467, + "grad_norm": 0.5390625, + "learning_rate": 0.00019527222107679502, + "loss": 1.2278, + "step": 2083 + }, + { + "epoch": 0.09874437337123905, + "grad_norm": 0.10546875, + "learning_rate": 0.00019526769501432785, + "loss": 0.0112, + "step": 2084 + }, + { + "epoch": 0.09879175550817342, + "grad_norm": 0.8046875, + "learning_rate": 0.00019526316683892464, + "loss": 0.9511, + "step": 2085 + }, + { + "epoch": 0.0988391376451078, + "grad_norm": 0.5, + "learning_rate": 0.00019525863655068593, + "loss": 1.0531, + "step": 2086 + }, + { + "epoch": 0.09888651978204217, + "grad_norm": 0.51953125, + "learning_rate": 0.00019525410414971206, + "loss": 1.47, + "step": 2087 + }, + { + "epoch": 0.09893390191897655, + "grad_norm": 0.37109375, + "learning_rate": 0.00019524956963610365, + "loss": 0.2093, + "step": 2088 + }, + { + "epoch": 0.09898128405591092, + "grad_norm": 1.2734375, + "learning_rate": 0.00019524503300996125, + "loss": 0.4769, + "step": 2089 + }, + { + "epoch": 0.0990286661928453, + "grad_norm": 0.494140625, + "learning_rate": 0.00019524049427138544, + "loss": 1.1945, + "step": 2090 + }, + { + "epoch": 0.09907604832977968, + "grad_norm": 0.458984375, + "learning_rate": 0.00019523595342047694, + "loss": 0.7347, + "step": 2091 + }, + { + "epoch": 0.09912343046671405, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019523141045733642, + "loss": 0.0159, + "step": 2092 + }, + { + "epoch": 0.09917081260364842, + "grad_norm": 0.58984375, + "learning_rate": 0.00019522686538206465, + "loss": 1.2201, + "step": 2093 + }, + { + "epoch": 0.0992181947405828, + "grad_norm": 0.515625, + "learning_rate": 0.00019522231819476244, + "loss": 0.8225, + "step": 2094 + }, + { + "epoch": 0.09926557687751718, + "grad_norm": 0.75390625, + "learning_rate": 0.00019521776889553066, + "loss": 0.9317, + "step": 2095 + }, + { + "epoch": 0.09931295901445156, + "grad_norm": 0.306640625, + "learning_rate": 0.00019521321748447015, + "loss": 0.1652, + "step": 2096 + }, + { + "epoch": 0.09936034115138592, + "grad_norm": 0.91015625, + "learning_rate": 0.0001952086639616819, + "loss": 0.2254, + "step": 2097 + }, + { + "epoch": 0.0994077232883203, + "grad_norm": 0.1318359375, + "learning_rate": 0.00019520410832726693, + "loss": 0.0149, + "step": 2098 + }, + { + "epoch": 0.09945510542525468, + "grad_norm": 0.53125, + "learning_rate": 0.0001951995505813262, + "loss": 1.0392, + "step": 2099 + }, + { + "epoch": 0.09950248756218906, + "grad_norm": 0.44140625, + "learning_rate": 0.00019519499072396087, + "loss": 0.2113, + "step": 2100 + }, + { + "epoch": 0.09954986969912342, + "grad_norm": 0.443359375, + "learning_rate": 0.00019519042875527202, + "loss": 0.7009, + "step": 2101 + }, + { + "epoch": 0.0995972518360578, + "grad_norm": 0.64453125, + "learning_rate": 0.00019518586467536086, + "loss": 1.1118, + "step": 2102 + }, + { + "epoch": 0.09964463397299218, + "grad_norm": 0.4765625, + "learning_rate": 0.0001951812984843286, + "loss": 0.8469, + "step": 2103 + }, + { + "epoch": 0.09969201610992656, + "grad_norm": 0.275390625, + "learning_rate": 0.00019517673018227654, + "loss": 0.0265, + "step": 2104 + }, + { + "epoch": 0.09973939824686093, + "grad_norm": 0.53515625, + "learning_rate": 0.000195172159769306, + "loss": 0.7084, + "step": 2105 + }, + { + "epoch": 0.0997867803837953, + "grad_norm": 0.67578125, + "learning_rate": 0.0001951675872455183, + "loss": 1.1953, + "step": 2106 + }, + { + "epoch": 0.09983416252072969, + "grad_norm": 0.95703125, + "learning_rate": 0.0001951630126110149, + "loss": 1.233, + "step": 2107 + }, + { + "epoch": 0.09988154465766406, + "grad_norm": 0.451171875, + "learning_rate": 0.0001951584358658972, + "loss": 1.1104, + "step": 2108 + }, + { + "epoch": 0.09992892679459843, + "grad_norm": 0.00634765625, + "learning_rate": 0.0001951538570102668, + "loss": 0.0004, + "step": 2109 + }, + { + "epoch": 0.09997630893153281, + "grad_norm": 0.427734375, + "learning_rate": 0.0001951492760442252, + "loss": 0.917, + "step": 2110 + }, + { + "epoch": 0.10002369106846719, + "grad_norm": 1.0546875, + "learning_rate": 0.000195144692967874, + "loss": 0.4697, + "step": 2111 + }, + { + "epoch": 0.10007107320540157, + "grad_norm": 0.6015625, + "learning_rate": 0.00019514010778131483, + "loss": 1.0373, + "step": 2112 + }, + { + "epoch": 0.10011845534233593, + "grad_norm": 0.62109375, + "learning_rate": 0.00019513552048464942, + "loss": 0.9711, + "step": 2113 + }, + { + "epoch": 0.10016583747927031, + "grad_norm": 0.625, + "learning_rate": 0.0001951309310779795, + "loss": 1.4549, + "step": 2114 + }, + { + "epoch": 0.10021321961620469, + "grad_norm": 0.0244140625, + "learning_rate": 0.00019512633956140688, + "loss": 0.002, + "step": 2115 + }, + { + "epoch": 0.10026060175313907, + "grad_norm": 0.37890625, + "learning_rate": 0.00019512174593503336, + "loss": 0.0166, + "step": 2116 + }, + { + "epoch": 0.10030798389007345, + "grad_norm": 0.55078125, + "learning_rate": 0.00019511715019896082, + "loss": 1.2642, + "step": 2117 + }, + { + "epoch": 0.10035536602700781, + "grad_norm": 0.38671875, + "learning_rate": 0.0001951125523532912, + "loss": 1.1373, + "step": 2118 + }, + { + "epoch": 0.1004027481639422, + "grad_norm": 0.66015625, + "learning_rate": 0.0001951079523981265, + "loss": 0.2338, + "step": 2119 + }, + { + "epoch": 0.10045013030087657, + "grad_norm": 0.47265625, + "learning_rate": 0.00019510335033356873, + "loss": 0.5619, + "step": 2120 + }, + { + "epoch": 0.10049751243781095, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019509874615971992, + "loss": 0.018, + "step": 2121 + }, + { + "epoch": 0.10054489457474532, + "grad_norm": 0.416015625, + "learning_rate": 0.00019509413987668222, + "loss": 0.939, + "step": 2122 + }, + { + "epoch": 0.1005922767116797, + "grad_norm": 0.63671875, + "learning_rate": 0.0001950895314845578, + "loss": 1.0705, + "step": 2123 + }, + { + "epoch": 0.10063965884861407, + "grad_norm": 0.6484375, + "learning_rate": 0.00019508492098344884, + "loss": 1.0411, + "step": 2124 + }, + { + "epoch": 0.10068704098554845, + "grad_norm": 0.6484375, + "learning_rate": 0.00019508030837345764, + "loss": 1.0597, + "step": 2125 + }, + { + "epoch": 0.10073442312248282, + "grad_norm": 0.36328125, + "learning_rate": 0.00019507569365468644, + "loss": 0.0604, + "step": 2126 + }, + { + "epoch": 0.1007818052594172, + "grad_norm": 0.51953125, + "learning_rate": 0.00019507107682723764, + "loss": 1.1313, + "step": 2127 + }, + { + "epoch": 0.10082918739635158, + "grad_norm": 0.07080078125, + "learning_rate": 0.00019506645789121364, + "loss": 0.0049, + "step": 2128 + }, + { + "epoch": 0.10087656953328596, + "grad_norm": 0.51953125, + "learning_rate": 0.00019506183684671685, + "loss": 0.6684, + "step": 2129 + }, + { + "epoch": 0.10092395167022032, + "grad_norm": 0.4140625, + "learning_rate": 0.00019505721369384975, + "loss": 0.1914, + "step": 2130 + }, + { + "epoch": 0.1009713338071547, + "grad_norm": 0.306640625, + "learning_rate": 0.0001950525884327149, + "loss": 0.2069, + "step": 2131 + }, + { + "epoch": 0.10101871594408908, + "grad_norm": 0.392578125, + "learning_rate": 0.00019504796106341492, + "loss": 0.9878, + "step": 2132 + }, + { + "epoch": 0.10106609808102346, + "grad_norm": 0.53125, + "learning_rate": 0.0001950433315860524, + "loss": 1.1931, + "step": 2133 + }, + { + "epoch": 0.10111348021795782, + "grad_norm": 0.498046875, + "learning_rate": 0.00019503870000073001, + "loss": 1.2078, + "step": 2134 + }, + { + "epoch": 0.1011608623548922, + "grad_norm": 0.69921875, + "learning_rate": 0.00019503406630755048, + "loss": 0.0691, + "step": 2135 + }, + { + "epoch": 0.10120824449182658, + "grad_norm": 0.6015625, + "learning_rate": 0.0001950294305066166, + "loss": 1.4365, + "step": 2136 + }, + { + "epoch": 0.10125562662876096, + "grad_norm": 0.7109375, + "learning_rate": 0.00019502479259803117, + "loss": 1.1509, + "step": 2137 + }, + { + "epoch": 0.10130300876569533, + "grad_norm": 0.453125, + "learning_rate": 0.00019502015258189703, + "loss": 0.3573, + "step": 2138 + }, + { + "epoch": 0.1013503909026297, + "grad_norm": 0.5859375, + "learning_rate": 0.00019501551045831715, + "loss": 1.3825, + "step": 2139 + }, + { + "epoch": 0.10139777303956408, + "grad_norm": 0.494140625, + "learning_rate": 0.00019501086622739444, + "loss": 0.8767, + "step": 2140 + }, + { + "epoch": 0.10144515517649846, + "grad_norm": 0.45703125, + "learning_rate": 0.00019500621988923191, + "loss": 0.9961, + "step": 2141 + }, + { + "epoch": 0.10149253731343283, + "grad_norm": 0.53515625, + "learning_rate": 0.0001950015714439326, + "loss": 0.7829, + "step": 2142 + }, + { + "epoch": 0.10153991945036721, + "grad_norm": 0.578125, + "learning_rate": 0.0001949969208915997, + "loss": 0.8525, + "step": 2143 + }, + { + "epoch": 0.10158730158730159, + "grad_norm": 0.56640625, + "learning_rate": 0.0001949922682323362, + "loss": 1.2719, + "step": 2144 + }, + { + "epoch": 0.10163468372423597, + "grad_norm": 0.44140625, + "learning_rate": 0.00019498761346624542, + "loss": 0.6178, + "step": 2145 + }, + { + "epoch": 0.10168206586117035, + "grad_norm": 0.455078125, + "learning_rate": 0.0001949829565934305, + "loss": 0.7717, + "step": 2146 + }, + { + "epoch": 0.10172944799810471, + "grad_norm": 0.439453125, + "learning_rate": 0.00019497829761399484, + "loss": 0.7072, + "step": 2147 + }, + { + "epoch": 0.10177683013503909, + "grad_norm": 0.5234375, + "learning_rate": 0.00019497363652804168, + "loss": 0.8323, + "step": 2148 + }, + { + "epoch": 0.10182421227197347, + "grad_norm": 0.65625, + "learning_rate": 0.0001949689733356744, + "loss": 0.1043, + "step": 2149 + }, + { + "epoch": 0.10187159440890785, + "grad_norm": 0.5234375, + "learning_rate": 0.00019496430803699646, + "loss": 1.0638, + "step": 2150 + }, + { + "epoch": 0.10191897654584221, + "grad_norm": 0.5703125, + "learning_rate": 0.00019495964063211135, + "loss": 0.8882, + "step": 2151 + }, + { + "epoch": 0.10196635868277659, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019495497112112254, + "loss": 0.146, + "step": 2152 + }, + { + "epoch": 0.10201374081971097, + "grad_norm": 0.796875, + "learning_rate": 0.0001949502995041336, + "loss": 0.4443, + "step": 2153 + }, + { + "epoch": 0.10206112295664535, + "grad_norm": 0.58984375, + "learning_rate": 0.00019494562578124813, + "loss": 0.9381, + "step": 2154 + }, + { + "epoch": 0.10210850509357972, + "grad_norm": 0.640625, + "learning_rate": 0.00019494094995256984, + "loss": 1.013, + "step": 2155 + }, + { + "epoch": 0.1021558872305141, + "grad_norm": 0.6015625, + "learning_rate": 0.00019493627201820242, + "loss": 0.9014, + "step": 2156 + }, + { + "epoch": 0.10220326936744847, + "grad_norm": 0.453125, + "learning_rate": 0.0001949315919782496, + "loss": 0.9222, + "step": 2157 + }, + { + "epoch": 0.10225065150438285, + "grad_norm": 0.89453125, + "learning_rate": 0.00019492690983281515, + "loss": 0.0349, + "step": 2158 + }, + { + "epoch": 0.10229803364131722, + "grad_norm": 0.26171875, + "learning_rate": 0.000194922225582003, + "loss": 0.1476, + "step": 2159 + }, + { + "epoch": 0.1023454157782516, + "grad_norm": 0.7578125, + "learning_rate": 0.00019491753922591695, + "loss": 0.1474, + "step": 2160 + }, + { + "epoch": 0.10239279791518598, + "grad_norm": 0.62890625, + "learning_rate": 0.00019491285076466102, + "loss": 0.107, + "step": 2161 + }, + { + "epoch": 0.10244018005212036, + "grad_norm": 0.06591796875, + "learning_rate": 0.00019490816019833914, + "loss": 0.0058, + "step": 2162 + }, + { + "epoch": 0.10248756218905472, + "grad_norm": 0.4765625, + "learning_rate": 0.00019490346752705536, + "loss": 0.2579, + "step": 2163 + }, + { + "epoch": 0.1025349443259891, + "grad_norm": 0.482421875, + "learning_rate": 0.00019489877275091378, + "loss": 1.0398, + "step": 2164 + }, + { + "epoch": 0.10258232646292348, + "grad_norm": 0.546875, + "learning_rate": 0.0001948940758700185, + "loss": 1.6905, + "step": 2165 + }, + { + "epoch": 0.10262970859985786, + "grad_norm": 0.330078125, + "learning_rate": 0.00019488937688447365, + "loss": 0.2404, + "step": 2166 + }, + { + "epoch": 0.10267709073679222, + "grad_norm": 0.56640625, + "learning_rate": 0.0001948846757943835, + "loss": 0.8703, + "step": 2167 + }, + { + "epoch": 0.1027244728737266, + "grad_norm": 0.66015625, + "learning_rate": 0.00019487997259985234, + "loss": 1.3456, + "step": 2168 + }, + { + "epoch": 0.10277185501066098, + "grad_norm": 0.00750732421875, + "learning_rate": 0.00019487526730098441, + "loss": 0.0007, + "step": 2169 + }, + { + "epoch": 0.10281923714759536, + "grad_norm": 0.41796875, + "learning_rate": 0.00019487055989788417, + "loss": 1.2684, + "step": 2170 + }, + { + "epoch": 0.10286661928452973, + "grad_norm": 0.51953125, + "learning_rate": 0.0001948658503906559, + "loss": 0.1757, + "step": 2171 + }, + { + "epoch": 0.1029140014214641, + "grad_norm": 0.412109375, + "learning_rate": 0.00019486113877940415, + "loss": 0.1336, + "step": 2172 + }, + { + "epoch": 0.10296138355839848, + "grad_norm": 0.56640625, + "learning_rate": 0.00019485642506423338, + "loss": 0.6852, + "step": 2173 + }, + { + "epoch": 0.10300876569533286, + "grad_norm": 0.59765625, + "learning_rate": 0.00019485170924524813, + "loss": 1.1825, + "step": 2174 + }, + { + "epoch": 0.10305614783226724, + "grad_norm": 0.400390625, + "learning_rate": 0.000194846991322553, + "loss": 0.2251, + "step": 2175 + }, + { + "epoch": 0.10310352996920161, + "grad_norm": 0.69140625, + "learning_rate": 0.00019484227129625264, + "loss": 0.3324, + "step": 2176 + }, + { + "epoch": 0.10315091210613599, + "grad_norm": 0.38671875, + "learning_rate": 0.00019483754916645168, + "loss": 0.0443, + "step": 2177 + }, + { + "epoch": 0.10319829424307037, + "grad_norm": 0.55859375, + "learning_rate": 0.00019483282493325493, + "loss": 0.9885, + "step": 2178 + }, + { + "epoch": 0.10324567638000474, + "grad_norm": 0.03125, + "learning_rate": 0.00019482809859676717, + "loss": 0.0012, + "step": 2179 + }, + { + "epoch": 0.10329305851693911, + "grad_norm": 0.4296875, + "learning_rate": 0.00019482337015709313, + "loss": 0.7013, + "step": 2180 + }, + { + "epoch": 0.10334044065387349, + "grad_norm": 0.65625, + "learning_rate": 0.00019481863961433776, + "loss": 1.396, + "step": 2181 + }, + { + "epoch": 0.10338782279080787, + "grad_norm": 0.435546875, + "learning_rate": 0.00019481390696860596, + "loss": 0.5346, + "step": 2182 + }, + { + "epoch": 0.10343520492774225, + "grad_norm": 0.64453125, + "learning_rate": 0.00019480917222000272, + "loss": 1.4599, + "step": 2183 + }, + { + "epoch": 0.10348258706467661, + "grad_norm": 0.494140625, + "learning_rate": 0.000194804435368633, + "loss": 0.0461, + "step": 2184 + }, + { + "epoch": 0.10352996920161099, + "grad_norm": 0.5859375, + "learning_rate": 0.0001947996964146019, + "loss": 1.3128, + "step": 2185 + }, + { + "epoch": 0.10357735133854537, + "grad_norm": 0.68359375, + "learning_rate": 0.0001947949553580145, + "loss": 1.5236, + "step": 2186 + }, + { + "epoch": 0.10362473347547975, + "grad_norm": 0.462890625, + "learning_rate": 0.00019479021219897594, + "loss": 0.6163, + "step": 2187 + }, + { + "epoch": 0.10367211561241411, + "grad_norm": 0.498046875, + "learning_rate": 0.00019478546693759148, + "loss": 0.9992, + "step": 2188 + }, + { + "epoch": 0.1037194977493485, + "grad_norm": 0.45703125, + "learning_rate": 0.00019478071957396628, + "loss": 1.2316, + "step": 2189 + }, + { + "epoch": 0.10376687988628287, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001947759701082057, + "loss": 0.0197, + "step": 2190 + }, + { + "epoch": 0.10381426202321725, + "grad_norm": 1.0859375, + "learning_rate": 0.0001947712185404151, + "loss": 0.1965, + "step": 2191 + }, + { + "epoch": 0.10386164416015162, + "grad_norm": 0.4921875, + "learning_rate": 0.00019476646487069977, + "loss": 0.1305, + "step": 2192 + }, + { + "epoch": 0.103909026297086, + "grad_norm": 0.5625, + "learning_rate": 0.00019476170909916518, + "loss": 0.6922, + "step": 2193 + }, + { + "epoch": 0.10395640843402038, + "grad_norm": 0.486328125, + "learning_rate": 0.0001947569512259168, + "loss": 1.1491, + "step": 2194 + }, + { + "epoch": 0.10400379057095475, + "grad_norm": 0.7578125, + "learning_rate": 0.00019475219125106023, + "loss": 1.0979, + "step": 2195 + }, + { + "epoch": 0.10405117270788912, + "grad_norm": 0.78515625, + "learning_rate": 0.00019474742917470096, + "loss": 0.2263, + "step": 2196 + }, + { + "epoch": 0.1040985548448235, + "grad_norm": 0.55078125, + "learning_rate": 0.00019474266499694464, + "loss": 0.2181, + "step": 2197 + }, + { + "epoch": 0.10414593698175788, + "grad_norm": 0.546875, + "learning_rate": 0.0001947378987178969, + "loss": 0.9503, + "step": 2198 + }, + { + "epoch": 0.10419331911869226, + "grad_norm": 0.671875, + "learning_rate": 0.0001947331303376635, + "loss": 1.2979, + "step": 2199 + }, + { + "epoch": 0.10424070125562662, + "grad_norm": 0.64453125, + "learning_rate": 0.00019472835985635017, + "loss": 1.5636, + "step": 2200 + }, + { + "epoch": 0.104288083392561, + "grad_norm": 0.6484375, + "learning_rate": 0.0001947235872740627, + "loss": 0.915, + "step": 2201 + }, + { + "epoch": 0.10433546552949538, + "grad_norm": 0.83203125, + "learning_rate": 0.00019471881259090697, + "loss": 0.8837, + "step": 2202 + }, + { + "epoch": 0.10438284766642976, + "grad_norm": 0.57421875, + "learning_rate": 0.00019471403580698885, + "loss": 1.2649, + "step": 2203 + }, + { + "epoch": 0.10443022980336414, + "grad_norm": 0.1142578125, + "learning_rate": 0.00019470925692241436, + "loss": 0.0166, + "step": 2204 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 0.92578125, + "learning_rate": 0.0001947044759372894, + "loss": 0.2473, + "step": 2205 + }, + { + "epoch": 0.10452499407723288, + "grad_norm": 0.5234375, + "learning_rate": 0.00019469969285172, + "loss": 1.3336, + "step": 2206 + }, + { + "epoch": 0.10457237621416726, + "grad_norm": 0.79296875, + "learning_rate": 0.0001946949076658123, + "loss": 0.089, + "step": 2207 + }, + { + "epoch": 0.10461975835110164, + "grad_norm": 0.55859375, + "learning_rate": 0.00019469012037967245, + "loss": 0.2336, + "step": 2208 + }, + { + "epoch": 0.104667140488036, + "grad_norm": 0.451171875, + "learning_rate": 0.00019468533099340656, + "loss": 0.5127, + "step": 2209 + }, + { + "epoch": 0.10471452262497039, + "grad_norm": 0.625, + "learning_rate": 0.00019468053950712086, + "loss": 1.1701, + "step": 2210 + }, + { + "epoch": 0.10476190476190476, + "grad_norm": 1.390625, + "learning_rate": 0.00019467574592092168, + "loss": 0.7627, + "step": 2211 + }, + { + "epoch": 0.10480928689883914, + "grad_norm": 0.52734375, + "learning_rate": 0.00019467095023491528, + "loss": 1.283, + "step": 2212 + }, + { + "epoch": 0.10485666903577351, + "grad_norm": 0.02294921875, + "learning_rate": 0.00019466615244920807, + "loss": 0.0013, + "step": 2213 + }, + { + "epoch": 0.10490405117270789, + "grad_norm": 0.166015625, + "learning_rate": 0.0001946613525639064, + "loss": 0.0062, + "step": 2214 + }, + { + "epoch": 0.10495143330964227, + "grad_norm": 0.69140625, + "learning_rate": 0.00019465655057911678, + "loss": 0.9722, + "step": 2215 + }, + { + "epoch": 0.10499881544657665, + "grad_norm": 0.61328125, + "learning_rate": 0.0001946517464949457, + "loss": 1.1721, + "step": 2216 + }, + { + "epoch": 0.10504619758351101, + "grad_norm": 0.435546875, + "learning_rate": 0.00019464694031149968, + "loss": 0.5812, + "step": 2217 + }, + { + "epoch": 0.10509357972044539, + "grad_norm": 0.42578125, + "learning_rate": 0.00019464213202888535, + "loss": 0.8201, + "step": 2218 + }, + { + "epoch": 0.10514096185737977, + "grad_norm": 0.6015625, + "learning_rate": 0.0001946373216472093, + "loss": 1.0211, + "step": 2219 + }, + { + "epoch": 0.10518834399431415, + "grad_norm": 0.62890625, + "learning_rate": 0.00019463250916657834, + "loss": 1.2849, + "step": 2220 + }, + { + "epoch": 0.10523572613124851, + "grad_norm": 0.82421875, + "learning_rate": 0.00019462769458709905, + "loss": 0.2717, + "step": 2221 + }, + { + "epoch": 0.10528310826818289, + "grad_norm": 0.51171875, + "learning_rate": 0.00019462287790887833, + "loss": 1.056, + "step": 2222 + }, + { + "epoch": 0.10533049040511727, + "grad_norm": 0.53125, + "learning_rate": 0.00019461805913202293, + "loss": 0.1818, + "step": 2223 + }, + { + "epoch": 0.10537787254205165, + "grad_norm": 0.62890625, + "learning_rate": 0.0001946132382566398, + "loss": 1.3601, + "step": 2224 + }, + { + "epoch": 0.10542525467898602, + "grad_norm": 0.6640625, + "learning_rate": 0.0001946084152828358, + "loss": 1.411, + "step": 2225 + }, + { + "epoch": 0.1054726368159204, + "grad_norm": 0.00494384765625, + "learning_rate": 0.00019460359021071793, + "loss": 0.0003, + "step": 2226 + }, + { + "epoch": 0.10552001895285477, + "grad_norm": 0.5390625, + "learning_rate": 0.0001945987630403932, + "loss": 0.1902, + "step": 2227 + }, + { + "epoch": 0.10556740108978915, + "grad_norm": 0.59375, + "learning_rate": 0.00019459393377196864, + "loss": 1.4867, + "step": 2228 + }, + { + "epoch": 0.10561478322672352, + "grad_norm": 0.703125, + "learning_rate": 0.0001945891024055514, + "loss": 0.1368, + "step": 2229 + }, + { + "epoch": 0.1056621653636579, + "grad_norm": 1.0625, + "learning_rate": 0.00019458426894124864, + "loss": 0.4235, + "step": 2230 + }, + { + "epoch": 0.10570954750059228, + "grad_norm": 0.5, + "learning_rate": 0.0001945794333791675, + "loss": 1.5729, + "step": 2231 + }, + { + "epoch": 0.10575692963752666, + "grad_norm": 0.494140625, + "learning_rate": 0.00019457459571941532, + "loss": 0.5732, + "step": 2232 + }, + { + "epoch": 0.10580431177446104, + "grad_norm": 0.59375, + "learning_rate": 0.00019456975596209933, + "loss": 1.2299, + "step": 2233 + }, + { + "epoch": 0.1058516939113954, + "grad_norm": 0.48046875, + "learning_rate": 0.00019456491410732688, + "loss": 0.8259, + "step": 2234 + }, + { + "epoch": 0.10589907604832978, + "grad_norm": 0.61328125, + "learning_rate": 0.00019456007015520533, + "loss": 0.0567, + "step": 2235 + }, + { + "epoch": 0.10594645818526416, + "grad_norm": 0.5859375, + "learning_rate": 0.00019455522410584218, + "loss": 0.9271, + "step": 2236 + }, + { + "epoch": 0.10599384032219854, + "grad_norm": 0.6796875, + "learning_rate": 0.00019455037595934486, + "loss": 1.4491, + "step": 2237 + }, + { + "epoch": 0.1060412224591329, + "grad_norm": 0.302734375, + "learning_rate": 0.0001945455257158209, + "loss": 0.2037, + "step": 2238 + }, + { + "epoch": 0.10608860459606728, + "grad_norm": 0.5546875, + "learning_rate": 0.0001945406733753779, + "loss": 1.2239, + "step": 2239 + }, + { + "epoch": 0.10613598673300166, + "grad_norm": 0.435546875, + "learning_rate": 0.0001945358189381235, + "loss": 0.7899, + "step": 2240 + }, + { + "epoch": 0.10618336886993604, + "grad_norm": 0.55078125, + "learning_rate": 0.00019453096240416528, + "loss": 0.9769, + "step": 2241 + }, + { + "epoch": 0.1062307510068704, + "grad_norm": 0.68359375, + "learning_rate": 0.00019452610377361103, + "loss": 1.6305, + "step": 2242 + }, + { + "epoch": 0.10627813314380478, + "grad_norm": 0.419921875, + "learning_rate": 0.00019452124304656846, + "loss": 0.6719, + "step": 2243 + }, + { + "epoch": 0.10632551528073916, + "grad_norm": 0.482421875, + "learning_rate": 0.00019451638022314541, + "loss": 0.8991, + "step": 2244 + }, + { + "epoch": 0.10637289741767354, + "grad_norm": 1.4140625, + "learning_rate": 0.00019451151530344973, + "loss": 1.1208, + "step": 2245 + }, + { + "epoch": 0.10642027955460791, + "grad_norm": 1.0234375, + "learning_rate": 0.0001945066482875893, + "loss": 0.0206, + "step": 2246 + }, + { + "epoch": 0.10646766169154229, + "grad_norm": 0.0026397705078125, + "learning_rate": 0.00019450177917567206, + "loss": 0.0002, + "step": 2247 + }, + { + "epoch": 0.10651504382847667, + "grad_norm": 0.58984375, + "learning_rate": 0.00019449690796780606, + "loss": 0.8428, + "step": 2248 + }, + { + "epoch": 0.10656242596541105, + "grad_norm": 0.65625, + "learning_rate": 0.00019449203466409928, + "loss": 1.1472, + "step": 2249 + }, + { + "epoch": 0.10660980810234541, + "grad_norm": 0.4140625, + "learning_rate": 0.00019448715926465978, + "loss": 0.9232, + "step": 2250 + }, + { + "epoch": 0.10665719023927979, + "grad_norm": 2.34375, + "learning_rate": 0.00019448228176959577, + "loss": 0.5728, + "step": 2251 + }, + { + "epoch": 0.10670457237621417, + "grad_norm": 0.64453125, + "learning_rate": 0.0001944774021790154, + "loss": 0.9843, + "step": 2252 + }, + { + "epoch": 0.10675195451314855, + "grad_norm": 0.63671875, + "learning_rate": 0.00019447252049302686, + "loss": 0.2289, + "step": 2253 + }, + { + "epoch": 0.10679933665008291, + "grad_norm": 0.58203125, + "learning_rate": 0.00019446763671173843, + "loss": 1.2277, + "step": 2254 + }, + { + "epoch": 0.10684671878701729, + "grad_norm": 0.5625, + "learning_rate": 0.00019446275083525848, + "loss": 0.5088, + "step": 2255 + }, + { + "epoch": 0.10689410092395167, + "grad_norm": 0.55078125, + "learning_rate": 0.00019445786286369527, + "loss": 0.1824, + "step": 2256 + }, + { + "epoch": 0.10694148306088605, + "grad_norm": 0.63671875, + "learning_rate": 0.0001944529727971573, + "loss": 0.7188, + "step": 2257 + }, + { + "epoch": 0.10698886519782042, + "grad_norm": 0.5078125, + "learning_rate": 0.00019444808063575302, + "loss": 1.2987, + "step": 2258 + }, + { + "epoch": 0.1070362473347548, + "grad_norm": 0.51953125, + "learning_rate": 0.00019444318637959091, + "loss": 0.9129, + "step": 2259 + }, + { + "epoch": 0.10708362947168917, + "grad_norm": 0.50390625, + "learning_rate": 0.00019443829002877951, + "loss": 0.5481, + "step": 2260 + }, + { + "epoch": 0.10713101160862355, + "grad_norm": 0.490234375, + "learning_rate": 0.00019443339158342745, + "loss": 0.6164, + "step": 2261 + }, + { + "epoch": 0.10717839374555792, + "grad_norm": 0.341796875, + "learning_rate": 0.00019442849104364334, + "loss": 0.0807, + "step": 2262 + }, + { + "epoch": 0.1072257758824923, + "grad_norm": 0.51171875, + "learning_rate": 0.00019442358840953588, + "loss": 1.028, + "step": 2263 + }, + { + "epoch": 0.10727315801942668, + "grad_norm": 0.6640625, + "learning_rate": 0.0001944186836812138, + "loss": 1.1778, + "step": 2264 + }, + { + "epoch": 0.10732054015636106, + "grad_norm": 0.73828125, + "learning_rate": 0.00019441377685878587, + "loss": 1.3525, + "step": 2265 + }, + { + "epoch": 0.10736792229329543, + "grad_norm": 0.7265625, + "learning_rate": 0.00019440886794236097, + "loss": 0.7213, + "step": 2266 + }, + { + "epoch": 0.1074153044302298, + "grad_norm": 0.45703125, + "learning_rate": 0.0001944039569320479, + "loss": 0.8769, + "step": 2267 + }, + { + "epoch": 0.10746268656716418, + "grad_norm": 0.45703125, + "learning_rate": 0.00019439904382795564, + "loss": 0.8678, + "step": 2268 + }, + { + "epoch": 0.10751006870409856, + "grad_norm": 0.2265625, + "learning_rate": 0.00019439412863019314, + "loss": 0.1564, + "step": 2269 + }, + { + "epoch": 0.10755745084103294, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001943892113388694, + "loss": 0.0198, + "step": 2270 + }, + { + "epoch": 0.1076048329779673, + "grad_norm": 0.34765625, + "learning_rate": 0.00019438429195409352, + "loss": 0.0609, + "step": 2271 + }, + { + "epoch": 0.10765221511490168, + "grad_norm": 0.4375, + "learning_rate": 0.00019437937047597455, + "loss": 0.508, + "step": 2272 + }, + { + "epoch": 0.10769959725183606, + "grad_norm": 0.94140625, + "learning_rate": 0.0001943744469046217, + "loss": 0.3306, + "step": 2273 + }, + { + "epoch": 0.10774697938877044, + "grad_norm": 0.296875, + "learning_rate": 0.0001943695212401441, + "loss": 0.0205, + "step": 2274 + }, + { + "epoch": 0.1077943615257048, + "grad_norm": 0.5, + "learning_rate": 0.00019436459348265106, + "loss": 0.569, + "step": 2275 + }, + { + "epoch": 0.10784174366263918, + "grad_norm": 0.38671875, + "learning_rate": 0.0001943596636322518, + "loss": 0.0908, + "step": 2276 + }, + { + "epoch": 0.10788912579957356, + "grad_norm": 0.734375, + "learning_rate": 0.00019435473168905577, + "loss": 1.1299, + "step": 2277 + }, + { + "epoch": 0.10793650793650794, + "grad_norm": 0.234375, + "learning_rate": 0.0001943497976531723, + "loss": 0.0519, + "step": 2278 + }, + { + "epoch": 0.1079838900734423, + "grad_norm": 0.63671875, + "learning_rate": 0.00019434486152471075, + "loss": 0.9495, + "step": 2279 + }, + { + "epoch": 0.10803127221037669, + "grad_norm": 0.388671875, + "learning_rate": 0.00019433992330378073, + "loss": 0.0155, + "step": 2280 + }, + { + "epoch": 0.10807865434731107, + "grad_norm": 0.5625, + "learning_rate": 0.00019433498299049168, + "loss": 0.94, + "step": 2281 + }, + { + "epoch": 0.10812603648424544, + "grad_norm": 0.5234375, + "learning_rate": 0.00019433004058495317, + "loss": 0.8623, + "step": 2282 + }, + { + "epoch": 0.10817341862117981, + "grad_norm": 0.66796875, + "learning_rate": 0.00019432509608727485, + "loss": 1.4621, + "step": 2283 + }, + { + "epoch": 0.10822080075811419, + "grad_norm": 0.045654296875, + "learning_rate": 0.00019432014949756637, + "loss": 0.0021, + "step": 2284 + }, + { + "epoch": 0.10826818289504857, + "grad_norm": 0.69140625, + "learning_rate": 0.00019431520081593742, + "loss": 1.0244, + "step": 2285 + }, + { + "epoch": 0.10831556503198295, + "grad_norm": 0.59765625, + "learning_rate": 0.0001943102500424978, + "loss": 1.214, + "step": 2286 + }, + { + "epoch": 0.10836294716891731, + "grad_norm": 0.65625, + "learning_rate": 0.00019430529717735727, + "loss": 1.6764, + "step": 2287 + }, + { + "epoch": 0.10841032930585169, + "grad_norm": 0.62109375, + "learning_rate": 0.00019430034222062573, + "loss": 0.8461, + "step": 2288 + }, + { + "epoch": 0.10845771144278607, + "grad_norm": 0.62109375, + "learning_rate": 0.00019429538517241302, + "loss": 1.0771, + "step": 2289 + }, + { + "epoch": 0.10850509357972045, + "grad_norm": 0.21875, + "learning_rate": 0.0001942904260328291, + "loss": 0.1528, + "step": 2290 + }, + { + "epoch": 0.10855247571665481, + "grad_norm": 0.6015625, + "learning_rate": 0.00019428546480198397, + "loss": 1.1033, + "step": 2291 + }, + { + "epoch": 0.1085998578535892, + "grad_norm": 0.703125, + "learning_rate": 0.00019428050147998765, + "loss": 1.2995, + "step": 2292 + }, + { + "epoch": 0.10864723999052357, + "grad_norm": 0.65234375, + "learning_rate": 0.00019427553606695024, + "loss": 0.656, + "step": 2293 + }, + { + "epoch": 0.10869462212745795, + "grad_norm": 0.51171875, + "learning_rate": 0.00019427056856298185, + "loss": 0.9073, + "step": 2294 + }, + { + "epoch": 0.10874200426439233, + "grad_norm": 0.25390625, + "learning_rate": 0.0001942655989681927, + "loss": 0.1786, + "step": 2295 + }, + { + "epoch": 0.1087893864013267, + "grad_norm": 0.71875, + "learning_rate": 0.0001942606272826929, + "loss": 0.4551, + "step": 2296 + }, + { + "epoch": 0.10883676853826108, + "grad_norm": 0.5859375, + "learning_rate": 0.00019425565350659286, + "loss": 0.9024, + "step": 2297 + }, + { + "epoch": 0.10888415067519545, + "grad_norm": 0.416015625, + "learning_rate": 0.00019425067764000276, + "loss": 0.028, + "step": 2298 + }, + { + "epoch": 0.10893153281212983, + "grad_norm": 0.48828125, + "learning_rate": 0.0001942456996830331, + "loss": 0.9282, + "step": 2299 + }, + { + "epoch": 0.1089789149490642, + "grad_norm": 0.55859375, + "learning_rate": 0.00019424071963579414, + "loss": 1.1809, + "step": 2300 + }, + { + "epoch": 0.10902629708599858, + "grad_norm": 0.62890625, + "learning_rate": 0.00019423573749839643, + "loss": 0.8025, + "step": 2301 + }, + { + "epoch": 0.10907367922293296, + "grad_norm": 0.59765625, + "learning_rate": 0.0001942307532709504, + "loss": 1.4061, + "step": 2302 + }, + { + "epoch": 0.10912106135986734, + "grad_norm": 0.578125, + "learning_rate": 0.00019422576695356667, + "loss": 1.2611, + "step": 2303 + }, + { + "epoch": 0.1091684434968017, + "grad_norm": 0.5234375, + "learning_rate": 0.0001942207785463558, + "loss": 1.0934, + "step": 2304 + }, + { + "epoch": 0.10921582563373608, + "grad_norm": 0.59765625, + "learning_rate": 0.00019421578804942842, + "loss": 1.3257, + "step": 2305 + }, + { + "epoch": 0.10926320777067046, + "grad_norm": 0.62109375, + "learning_rate": 0.00019421079546289518, + "loss": 0.4124, + "step": 2306 + }, + { + "epoch": 0.10931058990760484, + "grad_norm": 0.4609375, + "learning_rate": 0.00019420580078686689, + "loss": 1.3487, + "step": 2307 + }, + { + "epoch": 0.1093579720445392, + "grad_norm": 0.6640625, + "learning_rate": 0.00019420080402145424, + "loss": 0.8191, + "step": 2308 + }, + { + "epoch": 0.10940535418147358, + "grad_norm": 0.73828125, + "learning_rate": 0.0001941958051667681, + "loss": 1.0877, + "step": 2309 + }, + { + "epoch": 0.10945273631840796, + "grad_norm": 0.4921875, + "learning_rate": 0.00019419080422291936, + "loss": 1.1972, + "step": 2310 + }, + { + "epoch": 0.10950011845534234, + "grad_norm": 0.259765625, + "learning_rate": 0.00019418580119001888, + "loss": 0.0473, + "step": 2311 + }, + { + "epoch": 0.1095475005922767, + "grad_norm": 0.64453125, + "learning_rate": 0.00019418079606817767, + "loss": 0.0514, + "step": 2312 + }, + { + "epoch": 0.10959488272921108, + "grad_norm": 0.5859375, + "learning_rate": 0.00019417578885750673, + "loss": 0.1785, + "step": 2313 + }, + { + "epoch": 0.10964226486614546, + "grad_norm": 0.609375, + "learning_rate": 0.00019417077955811708, + "loss": 1.6825, + "step": 2314 + }, + { + "epoch": 0.10968964700307984, + "grad_norm": 0.5625, + "learning_rate": 0.00019416576817011988, + "loss": 1.0818, + "step": 2315 + }, + { + "epoch": 0.10973702914001421, + "grad_norm": 0.7265625, + "learning_rate": 0.0001941607546936262, + "loss": 1.2398, + "step": 2316 + }, + { + "epoch": 0.10978441127694859, + "grad_norm": 0.37109375, + "learning_rate": 0.00019415573912874733, + "loss": 0.0085, + "step": 2317 + }, + { + "epoch": 0.10983179341388297, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001941507214755944, + "loss": 0.0102, + "step": 2318 + }, + { + "epoch": 0.10987917555081735, + "grad_norm": 0.466796875, + "learning_rate": 0.0001941457017342788, + "loss": 0.8173, + "step": 2319 + }, + { + "epoch": 0.10992655768775171, + "grad_norm": 0.59375, + "learning_rate": 0.00019414067990491178, + "loss": 0.8912, + "step": 2320 + }, + { + "epoch": 0.10997393982468609, + "grad_norm": 0.5234375, + "learning_rate": 0.00019413565598760477, + "loss": 0.9078, + "step": 2321 + }, + { + "epoch": 0.11002132196162047, + "grad_norm": 0.41015625, + "learning_rate": 0.00019413062998246917, + "loss": 0.1027, + "step": 2322 + }, + { + "epoch": 0.11006870409855485, + "grad_norm": 0.703125, + "learning_rate": 0.00019412560188961648, + "loss": 0.0374, + "step": 2323 + }, + { + "epoch": 0.11011608623548923, + "grad_norm": 0.458984375, + "learning_rate": 0.0001941205717091582, + "loss": 0.0314, + "step": 2324 + }, + { + "epoch": 0.11016346837242359, + "grad_norm": 0.6328125, + "learning_rate": 0.00019411553944120584, + "loss": 1.018, + "step": 2325 + }, + { + "epoch": 0.11021085050935797, + "grad_norm": 0.703125, + "learning_rate": 0.0001941105050858711, + "loss": 1.2728, + "step": 2326 + }, + { + "epoch": 0.11025823264629235, + "grad_norm": 0.76171875, + "learning_rate": 0.0001941054686432656, + "loss": 0.3018, + "step": 2327 + }, + { + "epoch": 0.11030561478322673, + "grad_norm": 0.67578125, + "learning_rate": 0.00019410043011350102, + "loss": 1.0628, + "step": 2328 + }, + { + "epoch": 0.1103529969201611, + "grad_norm": 0.62890625, + "learning_rate": 0.00019409538949668916, + "loss": 0.2092, + "step": 2329 + }, + { + "epoch": 0.11040037905709547, + "grad_norm": 1.40625, + "learning_rate": 0.0001940903467929418, + "loss": 0.3697, + "step": 2330 + }, + { + "epoch": 0.11044776119402985, + "grad_norm": 0.51953125, + "learning_rate": 0.00019408530200237074, + "loss": 1.355, + "step": 2331 + }, + { + "epoch": 0.11049514333096423, + "grad_norm": 0.4921875, + "learning_rate": 0.0001940802551250879, + "loss": 1.1421, + "step": 2332 + }, + { + "epoch": 0.1105425254678986, + "grad_norm": 0.5078125, + "learning_rate": 0.00019407520616120523, + "loss": 0.7896, + "step": 2333 + }, + { + "epoch": 0.11058990760483298, + "grad_norm": 0.5625, + "learning_rate": 0.00019407015511083465, + "loss": 1.1293, + "step": 2334 + }, + { + "epoch": 0.11063728974176736, + "grad_norm": 0.53515625, + "learning_rate": 0.00019406510197408826, + "loss": 0.8314, + "step": 2335 + }, + { + "epoch": 0.11068467187870173, + "grad_norm": 0.478515625, + "learning_rate": 0.00019406004675107813, + "loss": 1.0182, + "step": 2336 + }, + { + "epoch": 0.1107320540156361, + "grad_norm": 0.498046875, + "learning_rate": 0.0001940549894419163, + "loss": 0.9839, + "step": 2337 + }, + { + "epoch": 0.11077943615257048, + "grad_norm": 0.55078125, + "learning_rate": 0.000194049930046715, + "loss": 1.1501, + "step": 2338 + }, + { + "epoch": 0.11082681828950486, + "grad_norm": 0.419921875, + "learning_rate": 0.00019404486856558644, + "loss": 0.4599, + "step": 2339 + }, + { + "epoch": 0.11087420042643924, + "grad_norm": 0.6328125, + "learning_rate": 0.00019403980499864285, + "loss": 1.2662, + "step": 2340 + }, + { + "epoch": 0.1109215825633736, + "grad_norm": 0.15234375, + "learning_rate": 0.00019403473934599655, + "loss": 0.0025, + "step": 2341 + }, + { + "epoch": 0.11096896470030798, + "grad_norm": 0.48046875, + "learning_rate": 0.0001940296716077599, + "loss": 0.6028, + "step": 2342 + }, + { + "epoch": 0.11101634683724236, + "grad_norm": 0.58984375, + "learning_rate": 0.00019402460178404534, + "loss": 0.8875, + "step": 2343 + }, + { + "epoch": 0.11106372897417674, + "grad_norm": 0.51171875, + "learning_rate": 0.0001940195298749652, + "loss": 1.0461, + "step": 2344 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.248046875, + "learning_rate": 0.00019401445588063205, + "loss": 0.0401, + "step": 2345 + }, + { + "epoch": 0.11115849324804548, + "grad_norm": 0.474609375, + "learning_rate": 0.0001940093798011584, + "loss": 0.9364, + "step": 2346 + }, + { + "epoch": 0.11120587538497986, + "grad_norm": 0.59375, + "learning_rate": 0.00019400430163665685, + "loss": 0.7033, + "step": 2347 + }, + { + "epoch": 0.11125325752191424, + "grad_norm": 0.62109375, + "learning_rate": 0.00019399922138724004, + "loss": 1.3435, + "step": 2348 + }, + { + "epoch": 0.11130063965884861, + "grad_norm": 0.5703125, + "learning_rate": 0.0001939941390530206, + "loss": 0.2642, + "step": 2349 + }, + { + "epoch": 0.11134802179578299, + "grad_norm": 1.2890625, + "learning_rate": 0.00019398905463411124, + "loss": 0.9699, + "step": 2350 + }, + { + "epoch": 0.11139540393271737, + "grad_norm": 0.71484375, + "learning_rate": 0.00019398396813062482, + "loss": 1.213, + "step": 2351 + }, + { + "epoch": 0.11144278606965174, + "grad_norm": 0.3203125, + "learning_rate": 0.00019397887954267408, + "loss": 0.1429, + "step": 2352 + }, + { + "epoch": 0.11149016820658612, + "grad_norm": 0.6171875, + "learning_rate": 0.00019397378887037187, + "loss": 1.3877, + "step": 2353 + }, + { + "epoch": 0.11153755034352049, + "grad_norm": 0.361328125, + "learning_rate": 0.00019396869611383114, + "loss": 0.1812, + "step": 2354 + }, + { + "epoch": 0.11158493248045487, + "grad_norm": 0.34375, + "learning_rate": 0.00019396360127316482, + "loss": 0.0505, + "step": 2355 + }, + { + "epoch": 0.11163231461738925, + "grad_norm": 0.05078125, + "learning_rate": 0.0001939585043484859, + "loss": 0.0035, + "step": 2356 + }, + { + "epoch": 0.11167969675432363, + "grad_norm": 0.51953125, + "learning_rate": 0.00019395340533990744, + "loss": 0.2011, + "step": 2357 + }, + { + "epoch": 0.11172707889125799, + "grad_norm": 0.7109375, + "learning_rate": 0.00019394830424754252, + "loss": 0.8884, + "step": 2358 + }, + { + "epoch": 0.11177446102819237, + "grad_norm": 0.53125, + "learning_rate": 0.00019394320107150428, + "loss": 0.7974, + "step": 2359 + }, + { + "epoch": 0.11182184316512675, + "grad_norm": 0.494140625, + "learning_rate": 0.0001939380958119059, + "loss": 0.2071, + "step": 2360 + }, + { + "epoch": 0.11186922530206113, + "grad_norm": 0.267578125, + "learning_rate": 0.00019393298846886062, + "loss": 0.1647, + "step": 2361 + }, + { + "epoch": 0.1119166074389955, + "grad_norm": 0.66015625, + "learning_rate": 0.0001939278790424817, + "loss": 0.9444, + "step": 2362 + }, + { + "epoch": 0.11196398957592987, + "grad_norm": 0.64453125, + "learning_rate": 0.00019392276753288248, + "loss": 0.9224, + "step": 2363 + }, + { + "epoch": 0.11201137171286425, + "grad_norm": 0.49609375, + "learning_rate": 0.0001939176539401763, + "loss": 0.7519, + "step": 2364 + }, + { + "epoch": 0.11205875384979863, + "grad_norm": 0.3203125, + "learning_rate": 0.00019391253826447663, + "loss": 0.1332, + "step": 2365 + }, + { + "epoch": 0.112106135986733, + "grad_norm": 0.53125, + "learning_rate": 0.00019390742050589687, + "loss": 0.548, + "step": 2366 + }, + { + "epoch": 0.11215351812366738, + "grad_norm": 0.482421875, + "learning_rate": 0.00019390230066455058, + "loss": 1.065, + "step": 2367 + }, + { + "epoch": 0.11220090026060175, + "grad_norm": 0.30078125, + "learning_rate": 0.00019389717874055125, + "loss": 0.1695, + "step": 2368 + }, + { + "epoch": 0.11224828239753613, + "grad_norm": 0.734375, + "learning_rate": 0.0001938920547340125, + "loss": 0.9457, + "step": 2369 + }, + { + "epoch": 0.1122956645344705, + "grad_norm": 0.76171875, + "learning_rate": 0.00019388692864504798, + "loss": 0.2065, + "step": 2370 + }, + { + "epoch": 0.11234304667140488, + "grad_norm": 1.1875, + "learning_rate": 0.00019388180047377144, + "loss": 0.761, + "step": 2371 + }, + { + "epoch": 0.11239042880833926, + "grad_norm": 0.56640625, + "learning_rate": 0.0001938766702202965, + "loss": 0.9179, + "step": 2372 + }, + { + "epoch": 0.11243781094527364, + "grad_norm": 0.54296875, + "learning_rate": 0.00019387153788473705, + "loss": 1.2718, + "step": 2373 + }, + { + "epoch": 0.112485193082208, + "grad_norm": 0.6015625, + "learning_rate": 0.00019386640346720686, + "loss": 0.8863, + "step": 2374 + }, + { + "epoch": 0.11253257521914238, + "grad_norm": 0.6328125, + "learning_rate": 0.00019386126696781982, + "loss": 0.0903, + "step": 2375 + }, + { + "epoch": 0.11257995735607676, + "grad_norm": 0.474609375, + "learning_rate": 0.00019385612838668985, + "loss": 0.3469, + "step": 2376 + }, + { + "epoch": 0.11262733949301114, + "grad_norm": 0.498046875, + "learning_rate": 0.00019385098772393096, + "loss": 0.8277, + "step": 2377 + }, + { + "epoch": 0.1126747216299455, + "grad_norm": 0.55078125, + "learning_rate": 0.0001938458449796571, + "loss": 0.7903, + "step": 2378 + }, + { + "epoch": 0.11272210376687988, + "grad_norm": 0.58203125, + "learning_rate": 0.00019384070015398235, + "loss": 0.6248, + "step": 2379 + }, + { + "epoch": 0.11276948590381426, + "grad_norm": 0.455078125, + "learning_rate": 0.00019383555324702082, + "loss": 0.5589, + "step": 2380 + }, + { + "epoch": 0.11281686804074864, + "grad_norm": 0.388671875, + "learning_rate": 0.00019383040425888668, + "loss": 0.4301, + "step": 2381 + }, + { + "epoch": 0.11286425017768302, + "grad_norm": 0.490234375, + "learning_rate": 0.00019382525318969413, + "loss": 1.2382, + "step": 2382 + }, + { + "epoch": 0.11291163231461739, + "grad_norm": 0.58984375, + "learning_rate": 0.0001938201000395574, + "loss": 0.9135, + "step": 2383 + }, + { + "epoch": 0.11295901445155176, + "grad_norm": 0.5390625, + "learning_rate": 0.00019381494480859076, + "loss": 0.4733, + "step": 2384 + }, + { + "epoch": 0.11300639658848614, + "grad_norm": 0.59375, + "learning_rate": 0.0001938097874969086, + "loss": 1.1656, + "step": 2385 + }, + { + "epoch": 0.11305377872542052, + "grad_norm": 0.234375, + "learning_rate": 0.00019380462810462525, + "loss": 0.0332, + "step": 2386 + }, + { + "epoch": 0.11310116086235489, + "grad_norm": 0.83984375, + "learning_rate": 0.0001937994666318552, + "loss": 1.0793, + "step": 2387 + }, + { + "epoch": 0.11314854299928927, + "grad_norm": 0.55859375, + "learning_rate": 0.00019379430307871288, + "loss": 0.7872, + "step": 2388 + }, + { + "epoch": 0.11319592513622365, + "grad_norm": 0.8046875, + "learning_rate": 0.0001937891374453128, + "loss": 0.0647, + "step": 2389 + }, + { + "epoch": 0.11324330727315803, + "grad_norm": 0.455078125, + "learning_rate": 0.00019378396973176955, + "loss": 0.4569, + "step": 2390 + }, + { + "epoch": 0.11329068941009239, + "grad_norm": 0.478515625, + "learning_rate": 0.00019377879993819777, + "loss": 0.8047, + "step": 2391 + }, + { + "epoch": 0.11333807154702677, + "grad_norm": 0.53125, + "learning_rate": 0.00019377362806471208, + "loss": 1.0586, + "step": 2392 + }, + { + "epoch": 0.11338545368396115, + "grad_norm": 0.474609375, + "learning_rate": 0.0001937684541114272, + "loss": 1.0862, + "step": 2393 + }, + { + "epoch": 0.11343283582089553, + "grad_norm": 0.625, + "learning_rate": 0.00019376327807845792, + "loss": 1.7054, + "step": 2394 + }, + { + "epoch": 0.1134802179578299, + "grad_norm": 0.53125, + "learning_rate": 0.00019375809996591896, + "loss": 1.0919, + "step": 2395 + }, + { + "epoch": 0.11352760009476427, + "grad_norm": 0.54296875, + "learning_rate": 0.00019375291977392523, + "loss": 0.983, + "step": 2396 + }, + { + "epoch": 0.11357498223169865, + "grad_norm": 0.4609375, + "learning_rate": 0.0001937477375025916, + "loss": 0.7082, + "step": 2397 + }, + { + "epoch": 0.11362236436863303, + "grad_norm": 0.4921875, + "learning_rate": 0.000193742553152033, + "loss": 0.8129, + "step": 2398 + }, + { + "epoch": 0.1136697465055674, + "grad_norm": 0.69921875, + "learning_rate": 0.00019373736672236445, + "loss": 1.3653, + "step": 2399 + }, + { + "epoch": 0.11371712864250177, + "grad_norm": 0.484375, + "learning_rate": 0.0001937321782137009, + "loss": 1.09, + "step": 2400 + }, + { + "epoch": 0.11376451077943615, + "grad_norm": 0.58984375, + "learning_rate": 0.0001937269876261575, + "loss": 0.521, + "step": 2401 + }, + { + "epoch": 0.11381189291637053, + "grad_norm": 0.375, + "learning_rate": 0.00019372179495984936, + "loss": 0.9942, + "step": 2402 + }, + { + "epoch": 0.1138592750533049, + "grad_norm": 0.61328125, + "learning_rate": 0.00019371660021489162, + "loss": 1.1217, + "step": 2403 + }, + { + "epoch": 0.11390665719023928, + "grad_norm": 0.5234375, + "learning_rate": 0.00019371140339139952, + "loss": 1.177, + "step": 2404 + }, + { + "epoch": 0.11395403932717366, + "grad_norm": 0.453125, + "learning_rate": 0.0001937062044894883, + "loss": 0.0618, + "step": 2405 + }, + { + "epoch": 0.11400142146410804, + "grad_norm": 0.515625, + "learning_rate": 0.00019370100350927328, + "loss": 1.4012, + "step": 2406 + }, + { + "epoch": 0.1140488036010424, + "grad_norm": 0.6796875, + "learning_rate": 0.0001936958004508698, + "loss": 0.2863, + "step": 2407 + }, + { + "epoch": 0.11409618573797678, + "grad_norm": 0.7734375, + "learning_rate": 0.00019369059531439332, + "loss": 1.1873, + "step": 2408 + }, + { + "epoch": 0.11414356787491116, + "grad_norm": 0.44921875, + "learning_rate": 0.00019368538809995918, + "loss": 1.0045, + "step": 2409 + }, + { + "epoch": 0.11419095001184554, + "grad_norm": 1.0703125, + "learning_rate": 0.0001936801788076829, + "loss": 0.8343, + "step": 2410 + }, + { + "epoch": 0.1142383321487799, + "grad_norm": 0.40625, + "learning_rate": 0.00019367496743768009, + "loss": 0.5111, + "step": 2411 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.271484375, + "learning_rate": 0.00019366975399006626, + "loss": 0.1618, + "step": 2412 + }, + { + "epoch": 0.11433309642264866, + "grad_norm": 0.61328125, + "learning_rate": 0.00019366453846495705, + "loss": 0.8744, + "step": 2413 + }, + { + "epoch": 0.11438047855958304, + "grad_norm": 0.78125, + "learning_rate": 0.00019365932086246813, + "loss": 0.7495, + "step": 2414 + }, + { + "epoch": 0.11442786069651742, + "grad_norm": 0.341796875, + "learning_rate": 0.00019365410118271528, + "loss": 0.179, + "step": 2415 + }, + { + "epoch": 0.11447524283345178, + "grad_norm": 0.56640625, + "learning_rate": 0.00019364887942581417, + "loss": 0.8303, + "step": 2416 + }, + { + "epoch": 0.11452262497038616, + "grad_norm": 0.515625, + "learning_rate": 0.0001936436555918807, + "loss": 0.5321, + "step": 2417 + }, + { + "epoch": 0.11457000710732054, + "grad_norm": 0.64453125, + "learning_rate": 0.0001936384296810307, + "loss": 0.6623, + "step": 2418 + }, + { + "epoch": 0.11461738924425492, + "grad_norm": 0.75390625, + "learning_rate": 0.00019363320169338004, + "loss": 0.2194, + "step": 2419 + }, + { + "epoch": 0.11466477138118929, + "grad_norm": 0.326171875, + "learning_rate": 0.0001936279716290447, + "loss": 0.5661, + "step": 2420 + }, + { + "epoch": 0.11471215351812367, + "grad_norm": 0.6640625, + "learning_rate": 0.00019362273948814068, + "loss": 1.2285, + "step": 2421 + }, + { + "epoch": 0.11475953565505805, + "grad_norm": 0.609375, + "learning_rate": 0.00019361750527078405, + "loss": 1.3478, + "step": 2422 + }, + { + "epoch": 0.11480691779199242, + "grad_norm": 0.52734375, + "learning_rate": 0.00019361226897709086, + "loss": 1.6165, + "step": 2423 + }, + { + "epoch": 0.11485429992892679, + "grad_norm": 0.6953125, + "learning_rate": 0.00019360703060717724, + "loss": 0.9164, + "step": 2424 + }, + { + "epoch": 0.11490168206586117, + "grad_norm": 0.609375, + "learning_rate": 0.0001936017901611594, + "loss": 0.9924, + "step": 2425 + }, + { + "epoch": 0.11494906420279555, + "grad_norm": 0.52734375, + "learning_rate": 0.00019359654763915354, + "loss": 0.7366, + "step": 2426 + }, + { + "epoch": 0.11499644633972993, + "grad_norm": 0.48046875, + "learning_rate": 0.00019359130304127595, + "loss": 0.0893, + "step": 2427 + }, + { + "epoch": 0.11504382847666429, + "grad_norm": 0.58203125, + "learning_rate": 0.00019358605636764296, + "loss": 0.8715, + "step": 2428 + }, + { + "epoch": 0.11509121061359867, + "grad_norm": 0.578125, + "learning_rate": 0.0001935808076183709, + "loss": 1.0793, + "step": 2429 + }, + { + "epoch": 0.11513859275053305, + "grad_norm": 0.69140625, + "learning_rate": 0.00019357555679357623, + "loss": 0.9252, + "step": 2430 + }, + { + "epoch": 0.11518597488746743, + "grad_norm": 0.5390625, + "learning_rate": 0.0001935703038933754, + "loss": 0.8074, + "step": 2431 + }, + { + "epoch": 0.1152333570244018, + "grad_norm": 0.6640625, + "learning_rate": 0.00019356504891788486, + "loss": 1.7112, + "step": 2432 + }, + { + "epoch": 0.11528073916133617, + "grad_norm": 0.55859375, + "learning_rate": 0.00019355979186722118, + "loss": 1.0211, + "step": 2433 + }, + { + "epoch": 0.11532812129827055, + "grad_norm": 0.2578125, + "learning_rate": 0.000193554532741501, + "loss": 0.1586, + "step": 2434 + }, + { + "epoch": 0.11537550343520493, + "grad_norm": 0.56640625, + "learning_rate": 0.00019354927154084095, + "loss": 0.1016, + "step": 2435 + }, + { + "epoch": 0.1154228855721393, + "grad_norm": 0.57421875, + "learning_rate": 0.00019354400826535767, + "loss": 1.0657, + "step": 2436 + }, + { + "epoch": 0.11547026770907368, + "grad_norm": 0.42578125, + "learning_rate": 0.00019353874291516793, + "loss": 0.4119, + "step": 2437 + }, + { + "epoch": 0.11551764984600806, + "grad_norm": 0.70703125, + "learning_rate": 0.0001935334754903885, + "loss": 0.0556, + "step": 2438 + }, + { + "epoch": 0.11556503198294243, + "grad_norm": 0.05908203125, + "learning_rate": 0.00019352820599113622, + "loss": 0.0062, + "step": 2439 + }, + { + "epoch": 0.1156124141198768, + "grad_norm": 0.60546875, + "learning_rate": 0.00019352293441752798, + "loss": 1.1129, + "step": 2440 + }, + { + "epoch": 0.11565979625681118, + "grad_norm": 0.640625, + "learning_rate": 0.00019351766076968065, + "loss": 1.6857, + "step": 2441 + }, + { + "epoch": 0.11570717839374556, + "grad_norm": 0.54296875, + "learning_rate": 0.0001935123850477112, + "loss": 1.5279, + "step": 2442 + }, + { + "epoch": 0.11575456053067994, + "grad_norm": 0.484375, + "learning_rate": 0.00019350710725173667, + "loss": 0.855, + "step": 2443 + }, + { + "epoch": 0.11580194266761432, + "grad_norm": 0.515625, + "learning_rate": 0.0001935018273818741, + "loss": 1.1485, + "step": 2444 + }, + { + "epoch": 0.11584932480454868, + "grad_norm": 1.1171875, + "learning_rate": 0.00019349654543824059, + "loss": 0.5004, + "step": 2445 + }, + { + "epoch": 0.11589670694148306, + "grad_norm": 1.1484375, + "learning_rate": 0.00019349126142095328, + "loss": 0.9367, + "step": 2446 + }, + { + "epoch": 0.11594408907841744, + "grad_norm": 0.65625, + "learning_rate": 0.00019348597533012937, + "loss": 0.3459, + "step": 2447 + }, + { + "epoch": 0.11599147121535182, + "grad_norm": 0.435546875, + "learning_rate": 0.00019348068716588615, + "loss": 0.1398, + "step": 2448 + }, + { + "epoch": 0.11603885335228618, + "grad_norm": 0.6796875, + "learning_rate": 0.0001934753969283408, + "loss": 0.9748, + "step": 2449 + }, + { + "epoch": 0.11608623548922056, + "grad_norm": 0.58203125, + "learning_rate": 0.00019347010461761075, + "loss": 1.0431, + "step": 2450 + }, + { + "epoch": 0.11613361762615494, + "grad_norm": 0.5859375, + "learning_rate": 0.00019346481023381333, + "loss": 1.1636, + "step": 2451 + }, + { + "epoch": 0.11618099976308932, + "grad_norm": 0.6953125, + "learning_rate": 0.00019345951377706597, + "loss": 0.1638, + "step": 2452 + }, + { + "epoch": 0.11622838190002369, + "grad_norm": 0.5625, + "learning_rate": 0.00019345421524748614, + "loss": 1.0393, + "step": 2453 + }, + { + "epoch": 0.11627576403695807, + "grad_norm": 0.59375, + "learning_rate": 0.00019344891464519138, + "loss": 0.4738, + "step": 2454 + }, + { + "epoch": 0.11632314617389244, + "grad_norm": 0.58984375, + "learning_rate": 0.00019344361197029918, + "loss": 0.9189, + "step": 2455 + }, + { + "epoch": 0.11637052831082682, + "grad_norm": 0.51171875, + "learning_rate": 0.00019343830722292726, + "loss": 0.9371, + "step": 2456 + }, + { + "epoch": 0.11641791044776119, + "grad_norm": 0.62109375, + "learning_rate": 0.00019343300040319317, + "loss": 0.3964, + "step": 2457 + }, + { + "epoch": 0.11646529258469557, + "grad_norm": 0.46875, + "learning_rate": 0.00019342769151121467, + "loss": 0.8681, + "step": 2458 + }, + { + "epoch": 0.11651267472162995, + "grad_norm": 0.5625, + "learning_rate": 0.0001934223805471095, + "loss": 0.8003, + "step": 2459 + }, + { + "epoch": 0.11656005685856433, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019341706751099542, + "loss": 0.0212, + "step": 2460 + }, + { + "epoch": 0.11660743899549869, + "grad_norm": 0.08837890625, + "learning_rate": 0.00019341175240299028, + "loss": 0.0065, + "step": 2461 + }, + { + "epoch": 0.11665482113243307, + "grad_norm": 0.53125, + "learning_rate": 0.000193406435223212, + "loss": 1.2321, + "step": 2462 + }, + { + "epoch": 0.11670220326936745, + "grad_norm": 0.69140625, + "learning_rate": 0.00019340111597177843, + "loss": 1.1462, + "step": 2463 + }, + { + "epoch": 0.11674958540630183, + "grad_norm": 0.578125, + "learning_rate": 0.00019339579464880763, + "loss": 0.9835, + "step": 2464 + }, + { + "epoch": 0.1167969675432362, + "grad_norm": 0.41015625, + "learning_rate": 0.00019339047125441756, + "loss": 0.8235, + "step": 2465 + }, + { + "epoch": 0.11684434968017057, + "grad_norm": 0.56640625, + "learning_rate": 0.00019338514578872633, + "loss": 0.8367, + "step": 2466 + }, + { + "epoch": 0.11689173181710495, + "grad_norm": 0.65234375, + "learning_rate": 0.00019337981825185202, + "loss": 1.147, + "step": 2467 + }, + { + "epoch": 0.11693911395403933, + "grad_norm": 0.52734375, + "learning_rate": 0.00019337448864391283, + "loss": 0.4575, + "step": 2468 + }, + { + "epoch": 0.1169864960909737, + "grad_norm": 0.05712890625, + "learning_rate": 0.00019336915696502693, + "loss": 0.0063, + "step": 2469 + }, + { + "epoch": 0.11703387822790808, + "grad_norm": 0.8359375, + "learning_rate": 0.0001933638232153126, + "loss": 1.6739, + "step": 2470 + }, + { + "epoch": 0.11708126036484245, + "grad_norm": 0.04345703125, + "learning_rate": 0.00019335848739488807, + "loss": 0.0051, + "step": 2471 + }, + { + "epoch": 0.11712864250177683, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019335314950387174, + "loss": 0.0252, + "step": 2472 + }, + { + "epoch": 0.11717602463871121, + "grad_norm": 0.72265625, + "learning_rate": 0.00019334780954238204, + "loss": 1.0257, + "step": 2473 + }, + { + "epoch": 0.11722340677564558, + "grad_norm": 0.474609375, + "learning_rate": 0.0001933424675105373, + "loss": 0.0673, + "step": 2474 + }, + { + "epoch": 0.11727078891257996, + "grad_norm": 0.53515625, + "learning_rate": 0.00019333712340845608, + "loss": 1.181, + "step": 2475 + }, + { + "epoch": 0.11731817104951434, + "grad_norm": 0.53515625, + "learning_rate": 0.0001933317772362569, + "loss": 1.1553, + "step": 2476 + }, + { + "epoch": 0.11736555318644872, + "grad_norm": 0.609375, + "learning_rate": 0.00019332642899405825, + "loss": 0.8072, + "step": 2477 + }, + { + "epoch": 0.11741293532338308, + "grad_norm": 0.5078125, + "learning_rate": 0.00019332107868197886, + "loss": 0.0409, + "step": 2478 + }, + { + "epoch": 0.11746031746031746, + "grad_norm": 0.4609375, + "learning_rate": 0.00019331572630013736, + "loss": 0.2116, + "step": 2479 + }, + { + "epoch": 0.11750769959725184, + "grad_norm": 0.53125, + "learning_rate": 0.0001933103718486524, + "loss": 0.1154, + "step": 2480 + }, + { + "epoch": 0.11755508173418622, + "grad_norm": 0.458984375, + "learning_rate": 0.00019330501532764283, + "loss": 0.9118, + "step": 2481 + }, + { + "epoch": 0.11760246387112058, + "grad_norm": 0.443359375, + "learning_rate": 0.00019329965673722737, + "loss": 0.5453, + "step": 2482 + }, + { + "epoch": 0.11764984600805496, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001932942960775249, + "loss": 0.026, + "step": 2483 + }, + { + "epoch": 0.11769722814498934, + "grad_norm": 0.51171875, + "learning_rate": 0.00019328893334865431, + "loss": 1.1812, + "step": 2484 + }, + { + "epoch": 0.11774461028192372, + "grad_norm": 0.51953125, + "learning_rate": 0.0001932835685507346, + "loss": 1.4514, + "step": 2485 + }, + { + "epoch": 0.11779199241885809, + "grad_norm": 0.6171875, + "learning_rate": 0.00019327820168388464, + "loss": 0.6865, + "step": 2486 + }, + { + "epoch": 0.11783937455579246, + "grad_norm": 0.69921875, + "learning_rate": 0.00019327283274822357, + "loss": 1.2399, + "step": 2487 + }, + { + "epoch": 0.11788675669272684, + "grad_norm": 1.03125, + "learning_rate": 0.00019326746174387038, + "loss": 0.2101, + "step": 2488 + }, + { + "epoch": 0.11793413882966122, + "grad_norm": 0.318359375, + "learning_rate": 0.00019326208867094424, + "loss": 0.0275, + "step": 2489 + }, + { + "epoch": 0.11798152096659559, + "grad_norm": 0.59375, + "learning_rate": 0.0001932567135295643, + "loss": 0.8127, + "step": 2490 + }, + { + "epoch": 0.11802890310352997, + "grad_norm": 0.52734375, + "learning_rate": 0.00019325133631984981, + "loss": 0.7025, + "step": 2491 + }, + { + "epoch": 0.11807628524046435, + "grad_norm": 0.69140625, + "learning_rate": 0.00019324595704192, + "loss": 0.1387, + "step": 2492 + }, + { + "epoch": 0.11812366737739873, + "grad_norm": 0.56640625, + "learning_rate": 0.0001932405756958942, + "loss": 1.3282, + "step": 2493 + }, + { + "epoch": 0.11817104951433309, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019323519228189173, + "loss": 0.1715, + "step": 2494 + }, + { + "epoch": 0.11821843165126747, + "grad_norm": 0.26953125, + "learning_rate": 0.000193229806800032, + "loss": 0.1249, + "step": 2495 + }, + { + "epoch": 0.11826581378820185, + "grad_norm": 0.62109375, + "learning_rate": 0.00019322441925043448, + "loss": 1.3299, + "step": 2496 + }, + { + "epoch": 0.11831319592513623, + "grad_norm": 0.59375, + "learning_rate": 0.00019321902963321863, + "loss": 1.0653, + "step": 2497 + }, + { + "epoch": 0.11836057806207059, + "grad_norm": 0.458984375, + "learning_rate": 0.00019321363794850397, + "loss": 0.9062, + "step": 2498 + }, + { + "epoch": 0.11840796019900497, + "grad_norm": 0.6484375, + "learning_rate": 0.00019320824419641016, + "loss": 1.5567, + "step": 2499 + }, + { + "epoch": 0.11845534233593935, + "grad_norm": 0.5546875, + "learning_rate": 0.00019320284837705671, + "loss": 0.9094, + "step": 2500 + }, + { + "epoch": 0.11850272447287373, + "grad_norm": 0.6484375, + "learning_rate": 0.0001931974504905634, + "loss": 1.2802, + "step": 2501 + }, + { + "epoch": 0.11855010660980811, + "grad_norm": 0.55859375, + "learning_rate": 0.00019319205053704993, + "loss": 1.1052, + "step": 2502 + }, + { + "epoch": 0.11859748874674247, + "grad_norm": 0.34765625, + "learning_rate": 0.00019318664851663599, + "loss": 0.3962, + "step": 2503 + }, + { + "epoch": 0.11864487088367685, + "grad_norm": 0.48828125, + "learning_rate": 0.00019318124442944146, + "loss": 1.1065, + "step": 2504 + }, + { + "epoch": 0.11869225302061123, + "grad_norm": 0.5625, + "learning_rate": 0.0001931758382755862, + "loss": 1.4303, + "step": 2505 + }, + { + "epoch": 0.11873963515754561, + "grad_norm": 0.640625, + "learning_rate": 0.00019317043005519008, + "loss": 0.8571, + "step": 2506 + }, + { + "epoch": 0.11878701729447998, + "grad_norm": 0.478515625, + "learning_rate": 0.00019316501976837308, + "loss": 1.3223, + "step": 2507 + }, + { + "epoch": 0.11883439943141436, + "grad_norm": 0.98046875, + "learning_rate": 0.00019315960741525512, + "loss": 0.8867, + "step": 2508 + }, + { + "epoch": 0.11888178156834874, + "grad_norm": 0.55078125, + "learning_rate": 0.00019315419299595633, + "loss": 0.8417, + "step": 2509 + }, + { + "epoch": 0.11892916370528311, + "grad_norm": 0.57421875, + "learning_rate": 0.00019314877651059676, + "loss": 0.8918, + "step": 2510 + }, + { + "epoch": 0.11897654584221748, + "grad_norm": 0.51953125, + "learning_rate": 0.0001931433579592966, + "loss": 0.7333, + "step": 2511 + }, + { + "epoch": 0.11902392797915186, + "grad_norm": 0.5078125, + "learning_rate": 0.0001931379373421759, + "loss": 1.0359, + "step": 2512 + }, + { + "epoch": 0.11907131011608624, + "grad_norm": 0.828125, + "learning_rate": 0.00019313251465935498, + "loss": 1.1578, + "step": 2513 + }, + { + "epoch": 0.11911869225302062, + "grad_norm": 0.5859375, + "learning_rate": 0.00019312708991095408, + "loss": 1.3204, + "step": 2514 + }, + { + "epoch": 0.11916607438995498, + "grad_norm": 1.0625, + "learning_rate": 0.00019312166309709352, + "loss": 0.3033, + "step": 2515 + }, + { + "epoch": 0.11921345652688936, + "grad_norm": 0.5625, + "learning_rate": 0.00019311623421789368, + "loss": 1.3375, + "step": 2516 + }, + { + "epoch": 0.11926083866382374, + "grad_norm": 0.68359375, + "learning_rate": 0.00019311080327347492, + "loss": 1.2571, + "step": 2517 + }, + { + "epoch": 0.11930822080075812, + "grad_norm": 0.62890625, + "learning_rate": 0.00019310537026395773, + "loss": 0.9547, + "step": 2518 + }, + { + "epoch": 0.11935560293769248, + "grad_norm": 0.302734375, + "learning_rate": 0.00019309993518946264, + "loss": 0.0299, + "step": 2519 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 0.408203125, + "learning_rate": 0.0001930944980501101, + "loss": 0.7248, + "step": 2520 + }, + { + "epoch": 0.11945036721156124, + "grad_norm": 0.515625, + "learning_rate": 0.00019308905884602074, + "loss": 0.7792, + "step": 2521 + }, + { + "epoch": 0.11949774934849562, + "grad_norm": 0.546875, + "learning_rate": 0.00019308361757731528, + "loss": 1.0552, + "step": 2522 + }, + { + "epoch": 0.11954513148542999, + "grad_norm": 0.61328125, + "learning_rate": 0.00019307817424411426, + "loss": 0.7193, + "step": 2523 + }, + { + "epoch": 0.11959251362236437, + "grad_norm": 0.431640625, + "learning_rate": 0.0001930727288465385, + "loss": 1.0973, + "step": 2524 + }, + { + "epoch": 0.11963989575929874, + "grad_norm": 0.75, + "learning_rate": 0.00019306728138470877, + "loss": 0.8714, + "step": 2525 + }, + { + "epoch": 0.11968727789623312, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019306183185874585, + "loss": 0.0116, + "step": 2526 + }, + { + "epoch": 0.11973466003316749, + "grad_norm": 0.236328125, + "learning_rate": 0.00019305638026877064, + "loss": 0.1626, + "step": 2527 + }, + { + "epoch": 0.11978204217010187, + "grad_norm": 0.37890625, + "learning_rate": 0.00019305092661490406, + "loss": 0.0571, + "step": 2528 + }, + { + "epoch": 0.11982942430703625, + "grad_norm": 0.625, + "learning_rate": 0.000193045470897267, + "loss": 0.9813, + "step": 2529 + }, + { + "epoch": 0.11987680644397063, + "grad_norm": 0.55078125, + "learning_rate": 0.0001930400131159805, + "loss": 1.069, + "step": 2530 + }, + { + "epoch": 0.119924188580905, + "grad_norm": 0.5234375, + "learning_rate": 0.0001930345532711656, + "loss": 1.0806, + "step": 2531 + }, + { + "epoch": 0.11997157071783937, + "grad_norm": 1.0703125, + "learning_rate": 0.00019302909136294344, + "loss": 0.8771, + "step": 2532 + }, + { + "epoch": 0.12001895285477375, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019302362739143512, + "loss": 0.1197, + "step": 2533 + }, + { + "epoch": 0.12006633499170813, + "grad_norm": 0.62890625, + "learning_rate": 0.00019301816135676182, + "loss": 1.5407, + "step": 2534 + }, + { + "epoch": 0.12011371712864251, + "grad_norm": 0.62890625, + "learning_rate": 0.00019301269325904476, + "loss": 0.9555, + "step": 2535 + }, + { + "epoch": 0.12016109926557687, + "grad_norm": 0.053466796875, + "learning_rate": 0.00019300722309840526, + "loss": 0.0034, + "step": 2536 + }, + { + "epoch": 0.12020848140251125, + "grad_norm": 0.9765625, + "learning_rate": 0.00019300175087496463, + "loss": 0.8896, + "step": 2537 + }, + { + "epoch": 0.12025586353944563, + "grad_norm": 0.1376953125, + "learning_rate": 0.00019299627658884421, + "loss": 0.0083, + "step": 2538 + }, + { + "epoch": 0.12030324567638001, + "grad_norm": 0.60546875, + "learning_rate": 0.00019299080024016543, + "loss": 0.7692, + "step": 2539 + }, + { + "epoch": 0.12035062781331438, + "grad_norm": 0.49609375, + "learning_rate": 0.00019298532182904975, + "loss": 0.9235, + "step": 2540 + }, + { + "epoch": 0.12039800995024875, + "grad_norm": 0.66015625, + "learning_rate": 0.00019297984135561866, + "loss": 1.2387, + "step": 2541 + }, + { + "epoch": 0.12044539208718313, + "grad_norm": 0.32421875, + "learning_rate": 0.00019297435881999376, + "loss": 0.1512, + "step": 2542 + }, + { + "epoch": 0.12049277422411751, + "grad_norm": 0.5234375, + "learning_rate": 0.0001929688742222966, + "loss": 0.6726, + "step": 2543 + }, + { + "epoch": 0.12054015636105188, + "grad_norm": 0.65234375, + "learning_rate": 0.00019296338756264882, + "loss": 1.4926, + "step": 2544 + }, + { + "epoch": 0.12058753849798626, + "grad_norm": 0.54296875, + "learning_rate": 0.00019295789884117212, + "loss": 1.0649, + "step": 2545 + }, + { + "epoch": 0.12063492063492064, + "grad_norm": 0.298828125, + "learning_rate": 0.00019295240805798826, + "loss": 0.0948, + "step": 2546 + }, + { + "epoch": 0.12068230277185502, + "grad_norm": 0.35546875, + "learning_rate": 0.000192946915213219, + "loss": 0.7311, + "step": 2547 + }, + { + "epoch": 0.12072968490878938, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019294142030698615, + "loss": 0.1531, + "step": 2548 + }, + { + "epoch": 0.12077706704572376, + "grad_norm": 0.58984375, + "learning_rate": 0.00019293592333941158, + "loss": 0.6935, + "step": 2549 + }, + { + "epoch": 0.12082444918265814, + "grad_norm": 0.49609375, + "learning_rate": 0.00019293042431061725, + "loss": 1.1029, + "step": 2550 + }, + { + "epoch": 0.12087183131959252, + "grad_norm": 0.79296875, + "learning_rate": 0.00019292492322072507, + "loss": 1.3596, + "step": 2551 + }, + { + "epoch": 0.12091921345652688, + "grad_norm": 0.019775390625, + "learning_rate": 0.0001929194200698571, + "loss": 0.0017, + "step": 2552 + }, + { + "epoch": 0.12096659559346126, + "grad_norm": 1.03125, + "learning_rate": 0.00019291391485813533, + "loss": 0.8052, + "step": 2553 + }, + { + "epoch": 0.12101397773039564, + "grad_norm": 0.33203125, + "learning_rate": 0.00019290840758568194, + "loss": 0.0633, + "step": 2554 + }, + { + "epoch": 0.12106135986733002, + "grad_norm": 0.57421875, + "learning_rate": 0.000192902898252619, + "loss": 0.4026, + "step": 2555 + }, + { + "epoch": 0.12110874200426439, + "grad_norm": 0.7578125, + "learning_rate": 0.00019289738685906874, + "loss": 1.0573, + "step": 2556 + }, + { + "epoch": 0.12115612414119876, + "grad_norm": 0.07861328125, + "learning_rate": 0.0001928918734051534, + "loss": 0.008, + "step": 2557 + }, + { + "epoch": 0.12120350627813314, + "grad_norm": 0.8515625, + "learning_rate": 0.00019288635789099524, + "loss": 1.0692, + "step": 2558 + }, + { + "epoch": 0.12125088841506752, + "grad_norm": 0.6953125, + "learning_rate": 0.0001928808403167166, + "loss": 1.4574, + "step": 2559 + }, + { + "epoch": 0.1212982705520019, + "grad_norm": 0.53125, + "learning_rate": 0.00019287532068243984, + "loss": 0.7272, + "step": 2560 + }, + { + "epoch": 0.12134565268893627, + "grad_norm": 1.578125, + "learning_rate": 0.00019286979898828742, + "loss": 0.0126, + "step": 2561 + }, + { + "epoch": 0.12139303482587065, + "grad_norm": 0.56640625, + "learning_rate": 0.00019286427523438178, + "loss": 1.0641, + "step": 2562 + }, + { + "epoch": 0.12144041696280503, + "grad_norm": 0.88671875, + "learning_rate": 0.0001928587494208454, + "loss": 0.1086, + "step": 2563 + }, + { + "epoch": 0.1214877990997394, + "grad_norm": 0.58203125, + "learning_rate": 0.0001928532215478009, + "loss": 1.4355, + "step": 2564 + }, + { + "epoch": 0.12153518123667377, + "grad_norm": 0.53515625, + "learning_rate": 0.00019284769161537083, + "loss": 0.7239, + "step": 2565 + }, + { + "epoch": 0.12158256337360815, + "grad_norm": 0.8203125, + "learning_rate": 0.00019284215962367786, + "loss": 1.2765, + "step": 2566 + }, + { + "epoch": 0.12162994551054253, + "grad_norm": 0.478515625, + "learning_rate": 0.0001928366255728447, + "loss": 0.2377, + "step": 2567 + }, + { + "epoch": 0.12167732764747691, + "grad_norm": 0.439453125, + "learning_rate": 0.00019283108946299403, + "loss": 0.0398, + "step": 2568 + }, + { + "epoch": 0.12172470978441127, + "grad_norm": 0.5234375, + "learning_rate": 0.0001928255512942487, + "loss": 0.9694, + "step": 2569 + }, + { + "epoch": 0.12177209192134565, + "grad_norm": 0.474609375, + "learning_rate": 0.00019282001106673153, + "loss": 1.048, + "step": 2570 + }, + { + "epoch": 0.12181947405828003, + "grad_norm": 0.76953125, + "learning_rate": 0.00019281446878056534, + "loss": 0.5022, + "step": 2571 + }, + { + "epoch": 0.12186685619521441, + "grad_norm": 0.10791015625, + "learning_rate": 0.00019280892443587316, + "loss": 0.0089, + "step": 2572 + }, + { + "epoch": 0.12191423833214877, + "grad_norm": 0.416015625, + "learning_rate": 0.00019280337803277781, + "loss": 0.6116, + "step": 2573 + }, + { + "epoch": 0.12196162046908315, + "grad_norm": 0.6484375, + "learning_rate": 0.00019279782957140243, + "loss": 0.2022, + "step": 2574 + }, + { + "epoch": 0.12200900260601753, + "grad_norm": 0.06689453125, + "learning_rate": 0.00019279227905187005, + "loss": 0.0068, + "step": 2575 + }, + { + "epoch": 0.12205638474295191, + "grad_norm": 1.359375, + "learning_rate": 0.00019278672647430375, + "loss": 0.4393, + "step": 2576 + }, + { + "epoch": 0.12210376687988628, + "grad_norm": 0.64453125, + "learning_rate": 0.00019278117183882664, + "loss": 1.1396, + "step": 2577 + }, + { + "epoch": 0.12215114901682066, + "grad_norm": 1.265625, + "learning_rate": 0.00019277561514556202, + "loss": 0.3896, + "step": 2578 + }, + { + "epoch": 0.12219853115375504, + "grad_norm": 0.55859375, + "learning_rate": 0.00019277005639463304, + "loss": 1.1313, + "step": 2579 + }, + { + "epoch": 0.12224591329068941, + "grad_norm": 0.58984375, + "learning_rate": 0.00019276449558616306, + "loss": 0.6474, + "step": 2580 + }, + { + "epoch": 0.12229329542762378, + "grad_norm": 0.341796875, + "learning_rate": 0.00019275893272027535, + "loss": 0.0849, + "step": 2581 + }, + { + "epoch": 0.12234067756455816, + "grad_norm": 0.6640625, + "learning_rate": 0.0001927533677970933, + "loss": 1.3711, + "step": 2582 + }, + { + "epoch": 0.12238805970149254, + "grad_norm": 0.7421875, + "learning_rate": 0.0001927478008167404, + "loss": 0.8757, + "step": 2583 + }, + { + "epoch": 0.12243544183842692, + "grad_norm": 0.412109375, + "learning_rate": 0.00019274223177934, + "loss": 0.7932, + "step": 2584 + }, + { + "epoch": 0.12248282397536128, + "grad_norm": 0.65625, + "learning_rate": 0.00019273666068501575, + "loss": 0.963, + "step": 2585 + }, + { + "epoch": 0.12253020611229566, + "grad_norm": 0.875, + "learning_rate": 0.0001927310875338911, + "loss": 1.1819, + "step": 2586 + }, + { + "epoch": 0.12257758824923004, + "grad_norm": 0.466796875, + "learning_rate": 0.00019272551232608974, + "loss": 0.968, + "step": 2587 + }, + { + "epoch": 0.12262497038616442, + "grad_norm": 0.4609375, + "learning_rate": 0.00019271993506173526, + "loss": 0.5281, + "step": 2588 + }, + { + "epoch": 0.12267235252309878, + "grad_norm": 0.490234375, + "learning_rate": 0.00019271435574095137, + "loss": 0.8354, + "step": 2589 + }, + { + "epoch": 0.12271973466003316, + "grad_norm": 0.5234375, + "learning_rate": 0.00019270877436386186, + "loss": 1.0137, + "step": 2590 + }, + { + "epoch": 0.12276711679696754, + "grad_norm": 0.6484375, + "learning_rate": 0.00019270319093059044, + "loss": 0.9656, + "step": 2591 + }, + { + "epoch": 0.12281449893390192, + "grad_norm": 0.4765625, + "learning_rate": 0.000192697605441261, + "loss": 0.6587, + "step": 2592 + }, + { + "epoch": 0.1228618810708363, + "grad_norm": 0.314453125, + "learning_rate": 0.00019269201789599743, + "loss": 0.191, + "step": 2593 + }, + { + "epoch": 0.12290926320777067, + "grad_norm": 0.423828125, + "learning_rate": 0.00019268642829492363, + "loss": 0.1004, + "step": 2594 + }, + { + "epoch": 0.12295664534470505, + "grad_norm": 0.296875, + "learning_rate": 0.00019268083663816357, + "loss": 0.0405, + "step": 2595 + }, + { + "epoch": 0.12300402748163942, + "grad_norm": 0.1201171875, + "learning_rate": 0.00019267524292584126, + "loss": 0.0198, + "step": 2596 + }, + { + "epoch": 0.1230514096185738, + "grad_norm": 0.103515625, + "learning_rate": 0.00019266964715808078, + "loss": 0.0101, + "step": 2597 + }, + { + "epoch": 0.12309879175550817, + "grad_norm": 0.48828125, + "learning_rate": 0.00019266404933500624, + "loss": 0.7614, + "step": 2598 + }, + { + "epoch": 0.12314617389244255, + "grad_norm": 0.384765625, + "learning_rate": 0.00019265844945674177, + "loss": 0.6976, + "step": 2599 + }, + { + "epoch": 0.12319355602937693, + "grad_norm": 0.53125, + "learning_rate": 0.00019265284752341158, + "loss": 0.7968, + "step": 2600 + }, + { + "epoch": 0.1232409381663113, + "grad_norm": 0.392578125, + "learning_rate": 0.00019264724353514, + "loss": 0.6219, + "step": 2601 + }, + { + "epoch": 0.12328832030324567, + "grad_norm": 0.625, + "learning_rate": 0.00019264163749205116, + "loss": 1.4453, + "step": 2602 + }, + { + "epoch": 0.12333570244018005, + "grad_norm": 0.734375, + "learning_rate": 0.0001926360293942695, + "loss": 1.453, + "step": 2603 + }, + { + "epoch": 0.12338308457711443, + "grad_norm": 0.267578125, + "learning_rate": 0.00019263041924191937, + "loss": 0.0269, + "step": 2604 + }, + { + "epoch": 0.12343046671404881, + "grad_norm": 0.51953125, + "learning_rate": 0.00019262480703512521, + "loss": 0.6972, + "step": 2605 + }, + { + "epoch": 0.12347784885098317, + "grad_norm": 0.54296875, + "learning_rate": 0.00019261919277401154, + "loss": 0.6406, + "step": 2606 + }, + { + "epoch": 0.12352523098791755, + "grad_norm": 0.333984375, + "learning_rate": 0.00019261357645870275, + "loss": 0.2155, + "step": 2607 + }, + { + "epoch": 0.12357261312485193, + "grad_norm": 0.48828125, + "learning_rate": 0.00019260795808932353, + "loss": 0.0741, + "step": 2608 + }, + { + "epoch": 0.12361999526178631, + "grad_norm": 0.53125, + "learning_rate": 0.00019260233766599843, + "loss": 1.0198, + "step": 2609 + }, + { + "epoch": 0.12366737739872068, + "grad_norm": 0.6640625, + "learning_rate": 0.0001925967151888521, + "loss": 1.0583, + "step": 2610 + }, + { + "epoch": 0.12371475953565506, + "grad_norm": 0.84765625, + "learning_rate": 0.0001925910906580093, + "loss": 1.1802, + "step": 2611 + }, + { + "epoch": 0.12376214167258943, + "grad_norm": 0.412109375, + "learning_rate": 0.00019258546407359468, + "loss": 0.5909, + "step": 2612 + }, + { + "epoch": 0.12380952380952381, + "grad_norm": 0.72265625, + "learning_rate": 0.0001925798354357331, + "loss": 1.1352, + "step": 2613 + }, + { + "epoch": 0.12385690594645818, + "grad_norm": 1.265625, + "learning_rate": 0.0001925742047445494, + "loss": 0.2152, + "step": 2614 + }, + { + "epoch": 0.12390428808339256, + "grad_norm": 0.546875, + "learning_rate": 0.00019256857200016845, + "loss": 0.5009, + "step": 2615 + }, + { + "epoch": 0.12395167022032694, + "grad_norm": 0.474609375, + "learning_rate": 0.00019256293720271514, + "loss": 0.6322, + "step": 2616 + }, + { + "epoch": 0.12399905235726132, + "grad_norm": 0.470703125, + "learning_rate": 0.0001925573003523145, + "loss": 0.7375, + "step": 2617 + }, + { + "epoch": 0.12404643449419568, + "grad_norm": 0.5703125, + "learning_rate": 0.00019255166144909152, + "loss": 0.8515, + "step": 2618 + }, + { + "epoch": 0.12409381663113006, + "grad_norm": 0.5390625, + "learning_rate": 0.00019254602049317127, + "loss": 0.8582, + "step": 2619 + }, + { + "epoch": 0.12414119876806444, + "grad_norm": 0.66796875, + "learning_rate": 0.00019254037748467885, + "loss": 0.7536, + "step": 2620 + }, + { + "epoch": 0.12418858090499882, + "grad_norm": 0.74609375, + "learning_rate": 0.00019253473242373945, + "loss": 1.3561, + "step": 2621 + }, + { + "epoch": 0.1242359630419332, + "grad_norm": 0.75390625, + "learning_rate": 0.00019252908531047823, + "loss": 1.0946, + "step": 2622 + }, + { + "epoch": 0.12428334517886756, + "grad_norm": 0.50390625, + "learning_rate": 0.00019252343614502046, + "loss": 1.1829, + "step": 2623 + }, + { + "epoch": 0.12433072731580194, + "grad_norm": 0.93359375, + "learning_rate": 0.00019251778492749145, + "loss": 0.5284, + "step": 2624 + }, + { + "epoch": 0.12437810945273632, + "grad_norm": 0.431640625, + "learning_rate": 0.00019251213165801648, + "loss": 0.7203, + "step": 2625 + }, + { + "epoch": 0.1244254915896707, + "grad_norm": 0.5234375, + "learning_rate": 0.000192506476336721, + "loss": 1.3022, + "step": 2626 + }, + { + "epoch": 0.12447287372660507, + "grad_norm": 0.5625, + "learning_rate": 0.00019250081896373037, + "loss": 1.3903, + "step": 2627 + }, + { + "epoch": 0.12452025586353944, + "grad_norm": 0.490234375, + "learning_rate": 0.00019249515953917014, + "loss": 1.207, + "step": 2628 + }, + { + "epoch": 0.12456763800047382, + "grad_norm": 0.75, + "learning_rate": 0.00019248949806316578, + "loss": 0.473, + "step": 2629 + }, + { + "epoch": 0.1246150201374082, + "grad_norm": 0.32421875, + "learning_rate": 0.00019248383453584288, + "loss": 0.1061, + "step": 2630 + }, + { + "epoch": 0.12466240227434257, + "grad_norm": 0.73046875, + "learning_rate": 0.000192478168957327, + "loss": 0.4016, + "step": 2631 + }, + { + "epoch": 0.12470978441127695, + "grad_norm": 0.734375, + "learning_rate": 0.0001924725013277439, + "loss": 0.8508, + "step": 2632 + }, + { + "epoch": 0.12475716654821133, + "grad_norm": 0.65625, + "learning_rate": 0.00019246683164721916, + "loss": 0.8059, + "step": 2633 + }, + { + "epoch": 0.1248045486851457, + "grad_norm": 0.58984375, + "learning_rate": 0.00019246115991587858, + "loss": 1.0854, + "step": 2634 + }, + { + "epoch": 0.12485193082208007, + "grad_norm": 0.45703125, + "learning_rate": 0.000192455486133848, + "loss": 1.3106, + "step": 2635 + }, + { + "epoch": 0.12489931295901445, + "grad_norm": 0.53515625, + "learning_rate": 0.0001924498103012532, + "loss": 1.2176, + "step": 2636 + }, + { + "epoch": 0.12494669509594883, + "grad_norm": 0.484375, + "learning_rate": 0.00019244413241822008, + "loss": 0.8235, + "step": 2637 + }, + { + "epoch": 0.12499407723288321, + "grad_norm": 0.54296875, + "learning_rate": 0.00019243845248487456, + "loss": 1.1107, + "step": 2638 + }, + { + "epoch": 0.1250414593698176, + "grad_norm": 0.57421875, + "learning_rate": 0.00019243277050134266, + "loss": 0.9661, + "step": 2639 + }, + { + "epoch": 0.12508884150675195, + "grad_norm": 0.59375, + "learning_rate": 0.00019242708646775034, + "loss": 0.8563, + "step": 2640 + }, + { + "epoch": 0.12513622364368632, + "grad_norm": 0.6015625, + "learning_rate": 0.0001924214003842237, + "loss": 1.1063, + "step": 2641 + }, + { + "epoch": 0.1251836057806207, + "grad_norm": 0.49609375, + "learning_rate": 0.0001924157122508888, + "loss": 0.0833, + "step": 2642 + }, + { + "epoch": 0.12523098791755508, + "grad_norm": 0.75, + "learning_rate": 0.0001924100220678719, + "loss": 1.1375, + "step": 2643 + }, + { + "epoch": 0.12527837005448947, + "grad_norm": 0.6640625, + "learning_rate": 0.0001924043298352991, + "loss": 0.0631, + "step": 2644 + }, + { + "epoch": 0.12532575219142383, + "grad_norm": 0.7421875, + "learning_rate": 0.00019239863555329671, + "loss": 1.0201, + "step": 2645 + }, + { + "epoch": 0.1253731343283582, + "grad_norm": 0.94140625, + "learning_rate": 0.000192392939221991, + "loss": 0.2895, + "step": 2646 + }, + { + "epoch": 0.1254205164652926, + "grad_norm": 0.578125, + "learning_rate": 0.0001923872408415083, + "loss": 0.9183, + "step": 2647 + }, + { + "epoch": 0.12546789860222696, + "grad_norm": 0.185546875, + "learning_rate": 0.000192381540411975, + "loss": 0.0157, + "step": 2648 + }, + { + "epoch": 0.12551528073916132, + "grad_norm": 0.5078125, + "learning_rate": 0.00019237583793351758, + "loss": 0.6224, + "step": 2649 + }, + { + "epoch": 0.12556266287609572, + "grad_norm": 0.44921875, + "learning_rate": 0.0001923701334062624, + "loss": 0.5273, + "step": 2650 + }, + { + "epoch": 0.12561004501303008, + "grad_norm": 0.56640625, + "learning_rate": 0.0001923644268303361, + "loss": 0.8104, + "step": 2651 + }, + { + "epoch": 0.12565742714996447, + "grad_norm": 0.609375, + "learning_rate": 0.00019235871820586517, + "loss": 0.8764, + "step": 2652 + }, + { + "epoch": 0.12570480928689884, + "grad_norm": 0.60546875, + "learning_rate": 0.00019235300753297626, + "loss": 0.928, + "step": 2653 + }, + { + "epoch": 0.1257521914238332, + "grad_norm": 0.51171875, + "learning_rate": 0.000192347294811796, + "loss": 0.51, + "step": 2654 + }, + { + "epoch": 0.1257995735607676, + "grad_norm": 0.5859375, + "learning_rate": 0.00019234158004245112, + "loss": 1.4853, + "step": 2655 + }, + { + "epoch": 0.12584695569770196, + "grad_norm": 0.8046875, + "learning_rate": 0.00019233586322506832, + "loss": 1.1049, + "step": 2656 + }, + { + "epoch": 0.12589433783463636, + "grad_norm": 0.39453125, + "learning_rate": 0.00019233014435977448, + "loss": 0.4638, + "step": 2657 + }, + { + "epoch": 0.12594171997157072, + "grad_norm": 0.8046875, + "learning_rate": 0.00019232442344669634, + "loss": 0.6833, + "step": 2658 + }, + { + "epoch": 0.12598910210850509, + "grad_norm": 0.50390625, + "learning_rate": 0.00019231870048596085, + "loss": 0.0987, + "step": 2659 + }, + { + "epoch": 0.12603648424543948, + "grad_norm": 0.84375, + "learning_rate": 0.00019231297547769492, + "loss": 0.349, + "step": 2660 + }, + { + "epoch": 0.12608386638237384, + "grad_norm": 0.546875, + "learning_rate": 0.0001923072484220255, + "loss": 1.1312, + "step": 2661 + }, + { + "epoch": 0.1261312485193082, + "grad_norm": 0.5, + "learning_rate": 0.00019230151931907963, + "loss": 1.0402, + "step": 2662 + }, + { + "epoch": 0.1261786306562426, + "grad_norm": 0.53125, + "learning_rate": 0.0001922957881689844, + "loss": 1.4235, + "step": 2663 + }, + { + "epoch": 0.12622601279317697, + "grad_norm": 0.6484375, + "learning_rate": 0.00019229005497186687, + "loss": 1.2282, + "step": 2664 + }, + { + "epoch": 0.12627339493011136, + "grad_norm": 0.671875, + "learning_rate": 0.00019228431972785425, + "loss": 0.1748, + "step": 2665 + }, + { + "epoch": 0.12632077706704573, + "grad_norm": 0.38671875, + "learning_rate": 0.00019227858243707371, + "loss": 0.4302, + "step": 2666 + }, + { + "epoch": 0.1263681592039801, + "grad_norm": 0.55859375, + "learning_rate": 0.0001922728430996525, + "loss": 0.7094, + "step": 2667 + }, + { + "epoch": 0.12641554134091448, + "grad_norm": 0.58984375, + "learning_rate": 0.00019226710171571791, + "loss": 0.7875, + "step": 2668 + }, + { + "epoch": 0.12646292347784885, + "grad_norm": 0.5546875, + "learning_rate": 0.00019226135828539728, + "loss": 0.8099, + "step": 2669 + }, + { + "epoch": 0.12651030561478321, + "grad_norm": 0.291015625, + "learning_rate": 0.000192255612808818, + "loss": 0.1927, + "step": 2670 + }, + { + "epoch": 0.1265576877517176, + "grad_norm": 0.78125, + "learning_rate": 0.0001922498652861075, + "loss": 0.7767, + "step": 2671 + }, + { + "epoch": 0.12660506988865197, + "grad_norm": 0.6171875, + "learning_rate": 0.0001922441157173932, + "loss": 0.8704, + "step": 2672 + }, + { + "epoch": 0.12665245202558637, + "grad_norm": 0.59375, + "learning_rate": 0.00019223836410280274, + "loss": 1.3345, + "step": 2673 + }, + { + "epoch": 0.12669983416252073, + "grad_norm": 0.5625, + "learning_rate": 0.00019223261044246356, + "loss": 0.2152, + "step": 2674 + }, + { + "epoch": 0.1267472162994551, + "grad_norm": 0.498046875, + "learning_rate": 0.00019222685473650338, + "loss": 0.6204, + "step": 2675 + }, + { + "epoch": 0.1267945984363895, + "grad_norm": 0.71875, + "learning_rate": 0.00019222109698504975, + "loss": 0.3218, + "step": 2676 + }, + { + "epoch": 0.12684198057332385, + "grad_norm": 0.65234375, + "learning_rate": 0.00019221533718823044, + "loss": 1.485, + "step": 2677 + }, + { + "epoch": 0.12688936271025822, + "grad_norm": 0.431640625, + "learning_rate": 0.00019220957534617314, + "loss": 0.0332, + "step": 2678 + }, + { + "epoch": 0.1269367448471926, + "grad_norm": 0.515625, + "learning_rate": 0.0001922038114590057, + "loss": 1.5897, + "step": 2679 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 0.53125, + "learning_rate": 0.00019219804552685595, + "loss": 0.7048, + "step": 2680 + }, + { + "epoch": 0.12703150912106137, + "grad_norm": 0.328125, + "learning_rate": 0.00019219227754985175, + "loss": 0.0491, + "step": 2681 + }, + { + "epoch": 0.12707889125799574, + "grad_norm": 1.359375, + "learning_rate": 0.000192186507528121, + "loss": 0.8848, + "step": 2682 + }, + { + "epoch": 0.1271262733949301, + "grad_norm": 0.69140625, + "learning_rate": 0.00019218073546179172, + "loss": 1.0105, + "step": 2683 + }, + { + "epoch": 0.1271736555318645, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001921749613509919, + "loss": 0.1505, + "step": 2684 + }, + { + "epoch": 0.12722103766879886, + "grad_norm": 0.53515625, + "learning_rate": 0.00019216918519584966, + "loss": 0.5415, + "step": 2685 + }, + { + "epoch": 0.12726841980573325, + "grad_norm": 0.2265625, + "learning_rate": 0.00019216340699649304, + "loss": 0.145, + "step": 2686 + }, + { + "epoch": 0.12731580194266762, + "grad_norm": 0.55078125, + "learning_rate": 0.0001921576267530502, + "loss": 1.0475, + "step": 2687 + }, + { + "epoch": 0.12736318407960198, + "grad_norm": 0.69140625, + "learning_rate": 0.00019215184446564942, + "loss": 0.1473, + "step": 2688 + }, + { + "epoch": 0.12741056621653638, + "grad_norm": 0.5078125, + "learning_rate": 0.00019214606013441885, + "loss": 0.7821, + "step": 2689 + }, + { + "epoch": 0.12745794835347074, + "grad_norm": 0.609375, + "learning_rate": 0.0001921402737594868, + "loss": 0.5792, + "step": 2690 + }, + { + "epoch": 0.1275053304904051, + "grad_norm": 0.65625, + "learning_rate": 0.00019213448534098165, + "loss": 0.8598, + "step": 2691 + }, + { + "epoch": 0.1275527126273395, + "grad_norm": 0.298828125, + "learning_rate": 0.00019212869487903174, + "loss": 0.1843, + "step": 2692 + }, + { + "epoch": 0.12760009476427386, + "grad_norm": 0.431640625, + "learning_rate": 0.0001921229023737655, + "loss": 0.3087, + "step": 2693 + }, + { + "epoch": 0.12764747690120826, + "grad_norm": 0.625, + "learning_rate": 0.00019211710782531143, + "loss": 0.8627, + "step": 2694 + }, + { + "epoch": 0.12769485903814262, + "grad_norm": 0.040771484375, + "learning_rate": 0.00019211131123379803, + "loss": 0.0027, + "step": 2695 + }, + { + "epoch": 0.127742241175077, + "grad_norm": 0.5390625, + "learning_rate": 0.00019210551259935384, + "loss": 0.7263, + "step": 2696 + }, + { + "epoch": 0.12778962331201138, + "grad_norm": 0.7265625, + "learning_rate": 0.00019209971192210747, + "loss": 1.5114, + "step": 2697 + }, + { + "epoch": 0.12783700544894575, + "grad_norm": 0.5078125, + "learning_rate": 0.0001920939092021876, + "loss": 1.0862, + "step": 2698 + }, + { + "epoch": 0.1278843875858801, + "grad_norm": 0.62890625, + "learning_rate": 0.00019208810443972296, + "loss": 1.0623, + "step": 2699 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 0.51953125, + "learning_rate": 0.00019208229763484222, + "loss": 1.2576, + "step": 2700 + }, + { + "epoch": 0.12797915185974887, + "grad_norm": 0.51171875, + "learning_rate": 0.00019207648878767422, + "loss": 0.7411, + "step": 2701 + }, + { + "epoch": 0.12802653399668326, + "grad_norm": 0.57421875, + "learning_rate": 0.00019207067789834775, + "loss": 1.1092, + "step": 2702 + }, + { + "epoch": 0.12807391613361763, + "grad_norm": 0.828125, + "learning_rate": 0.0001920648649669917, + "loss": 1.7907, + "step": 2703 + }, + { + "epoch": 0.128121298270552, + "grad_norm": 0.7109375, + "learning_rate": 0.00019205904999373502, + "loss": 1.2354, + "step": 2704 + }, + { + "epoch": 0.12816868040748639, + "grad_norm": 0.86328125, + "learning_rate": 0.00019205323297870667, + "loss": 0.4706, + "step": 2705 + }, + { + "epoch": 0.12821606254442075, + "grad_norm": 0.609375, + "learning_rate": 0.00019204741392203568, + "loss": 1.2389, + "step": 2706 + }, + { + "epoch": 0.12826344468135512, + "grad_norm": 0.66796875, + "learning_rate": 0.00019204159282385107, + "loss": 1.2217, + "step": 2707 + }, + { + "epoch": 0.1283108268182895, + "grad_norm": 0.3671875, + "learning_rate": 0.000192035769684282, + "loss": 0.4758, + "step": 2708 + }, + { + "epoch": 0.12835820895522387, + "grad_norm": 0.42578125, + "learning_rate": 0.00019202994450345757, + "loss": 1.1666, + "step": 2709 + }, + { + "epoch": 0.12840559109215827, + "grad_norm": 0.61328125, + "learning_rate": 0.00019202411728150702, + "loss": 1.061, + "step": 2710 + }, + { + "epoch": 0.12845297322909263, + "grad_norm": 1.046875, + "learning_rate": 0.0001920182880185595, + "loss": 1.5762, + "step": 2711 + }, + { + "epoch": 0.128500355366027, + "grad_norm": 0.5625, + "learning_rate": 0.00019201245671474445, + "loss": 1.1611, + "step": 2712 + }, + { + "epoch": 0.1285477375029614, + "grad_norm": 0.5625, + "learning_rate": 0.00019200662337019107, + "loss": 1.3035, + "step": 2713 + }, + { + "epoch": 0.12859511963989576, + "grad_norm": 0.3203125, + "learning_rate": 0.0001920007879850288, + "loss": 0.1655, + "step": 2714 + }, + { + "epoch": 0.12864250177683015, + "grad_norm": 0.5390625, + "learning_rate": 0.00019199495055938703, + "loss": 0.8907, + "step": 2715 + }, + { + "epoch": 0.1286898839137645, + "grad_norm": 0.55078125, + "learning_rate": 0.00019198911109339525, + "loss": 0.8073, + "step": 2716 + }, + { + "epoch": 0.12873726605069888, + "grad_norm": 0.43359375, + "learning_rate": 0.000191983269587183, + "loss": 0.2654, + "step": 2717 + }, + { + "epoch": 0.12878464818763327, + "grad_norm": 0.6171875, + "learning_rate": 0.0001919774260408798, + "loss": 0.3346, + "step": 2718 + }, + { + "epoch": 0.12883203032456764, + "grad_norm": 0.68359375, + "learning_rate": 0.00019197158045461524, + "loss": 0.1698, + "step": 2719 + }, + { + "epoch": 0.128879412461502, + "grad_norm": 0.353515625, + "learning_rate": 0.00019196573282851897, + "loss": 0.0602, + "step": 2720 + }, + { + "epoch": 0.1289267945984364, + "grad_norm": 0.251953125, + "learning_rate": 0.00019195988316272073, + "loss": 0.1667, + "step": 2721 + }, + { + "epoch": 0.12897417673537076, + "grad_norm": 0.73046875, + "learning_rate": 0.00019195403145735025, + "loss": 0.5652, + "step": 2722 + }, + { + "epoch": 0.12902155887230515, + "grad_norm": 0.431640625, + "learning_rate": 0.00019194817771253726, + "loss": 0.7821, + "step": 2723 + }, + { + "epoch": 0.12906894100923952, + "grad_norm": 0.57421875, + "learning_rate": 0.00019194232192841165, + "loss": 0.9453, + "step": 2724 + }, + { + "epoch": 0.12911632314617388, + "grad_norm": 0.91796875, + "learning_rate": 0.00019193646410510325, + "loss": 0.2774, + "step": 2725 + }, + { + "epoch": 0.12916370528310828, + "grad_norm": 0.49609375, + "learning_rate": 0.00019193060424274204, + "loss": 0.0525, + "step": 2726 + }, + { + "epoch": 0.12921108742004264, + "grad_norm": 0.3125, + "learning_rate": 0.0001919247423414579, + "loss": 0.1739, + "step": 2727 + }, + { + "epoch": 0.129258469556977, + "grad_norm": 0.52734375, + "learning_rate": 0.00019191887840138092, + "loss": 0.6484, + "step": 2728 + }, + { + "epoch": 0.1293058516939114, + "grad_norm": 1.3828125, + "learning_rate": 0.0001919130124226411, + "loss": 1.1583, + "step": 2729 + }, + { + "epoch": 0.12935323383084577, + "grad_norm": 0.3828125, + "learning_rate": 0.0001919071444053686, + "loss": 0.7938, + "step": 2730 + }, + { + "epoch": 0.12940061596778016, + "grad_norm": 0.62890625, + "learning_rate": 0.0001919012743496935, + "loss": 0.8336, + "step": 2731 + }, + { + "epoch": 0.12944799810471452, + "grad_norm": 0.80859375, + "learning_rate": 0.00019189540225574608, + "loss": 0.5599, + "step": 2732 + }, + { + "epoch": 0.1294953802416489, + "grad_norm": 0.61328125, + "learning_rate": 0.00019188952812365645, + "loss": 1.0495, + "step": 2733 + }, + { + "epoch": 0.12954276237858328, + "grad_norm": 0.1865234375, + "learning_rate": 0.000191883651953555, + "loss": 0.0177, + "step": 2734 + }, + { + "epoch": 0.12959014451551765, + "grad_norm": 0.54296875, + "learning_rate": 0.000191877773745572, + "loss": 0.4574, + "step": 2735 + }, + { + "epoch": 0.129637526652452, + "grad_norm": 0.69921875, + "learning_rate": 0.00019187189349983787, + "loss": 0.2015, + "step": 2736 + }, + { + "epoch": 0.1296849087893864, + "grad_norm": 0.62109375, + "learning_rate": 0.000191866011216483, + "loss": 1.0366, + "step": 2737 + }, + { + "epoch": 0.12973229092632077, + "grad_norm": 0.494140625, + "learning_rate": 0.00019186012689563785, + "loss": 0.2846, + "step": 2738 + }, + { + "epoch": 0.12977967306325516, + "grad_norm": 0.8046875, + "learning_rate": 0.0001918542405374329, + "loss": 1.2374, + "step": 2739 + }, + { + "epoch": 0.12982705520018953, + "grad_norm": 0.2734375, + "learning_rate": 0.00019184835214199874, + "loss": 0.2086, + "step": 2740 + }, + { + "epoch": 0.1298744373371239, + "grad_norm": 0.6484375, + "learning_rate": 0.00019184246170946597, + "loss": 1.0787, + "step": 2741 + }, + { + "epoch": 0.1299218194740583, + "grad_norm": 0.55078125, + "learning_rate": 0.00019183656923996525, + "loss": 0.8791, + "step": 2742 + }, + { + "epoch": 0.12996920161099265, + "grad_norm": 0.447265625, + "learning_rate": 0.00019183067473362722, + "loss": 0.7641, + "step": 2743 + }, + { + "epoch": 0.13001658374792704, + "grad_norm": 0.376953125, + "learning_rate": 0.00019182477819058264, + "loss": 0.735, + "step": 2744 + }, + { + "epoch": 0.1300639658848614, + "grad_norm": 0.7890625, + "learning_rate": 0.0001918188796109623, + "loss": 0.598, + "step": 2745 + }, + { + "epoch": 0.13011134802179578, + "grad_norm": 0.5703125, + "learning_rate": 0.00019181297899489698, + "loss": 0.1809, + "step": 2746 + }, + { + "epoch": 0.13015873015873017, + "grad_norm": 0.6328125, + "learning_rate": 0.00019180707634251762, + "loss": 1.3001, + "step": 2747 + }, + { + "epoch": 0.13020611229566453, + "grad_norm": 0.0234375, + "learning_rate": 0.0001918011716539551, + "loss": 0.0017, + "step": 2748 + }, + { + "epoch": 0.1302534944325989, + "grad_norm": 1.0, + "learning_rate": 0.00019179526492934032, + "loss": 0.2991, + "step": 2749 + }, + { + "epoch": 0.1303008765695333, + "grad_norm": 0.47265625, + "learning_rate": 0.00019178935616880435, + "loss": 0.6896, + "step": 2750 + }, + { + "epoch": 0.13034825870646766, + "grad_norm": 0.6796875, + "learning_rate": 0.00019178344537247827, + "loss": 1.1772, + "step": 2751 + }, + { + "epoch": 0.13039564084340205, + "grad_norm": 0.71875, + "learning_rate": 0.0001917775325404931, + "loss": 0.8827, + "step": 2752 + }, + { + "epoch": 0.13044302298033641, + "grad_norm": 0.87890625, + "learning_rate": 0.00019177161767298004, + "loss": 0.3459, + "step": 2753 + }, + { + "epoch": 0.13049040511727078, + "grad_norm": 0.3203125, + "learning_rate": 0.00019176570077007025, + "loss": 0.0461, + "step": 2754 + }, + { + "epoch": 0.13053778725420517, + "grad_norm": 0.59375, + "learning_rate": 0.00019175978183189494, + "loss": 1.1708, + "step": 2755 + }, + { + "epoch": 0.13058516939113954, + "grad_norm": 0.515625, + "learning_rate": 0.00019175386085858542, + "loss": 0.1037, + "step": 2756 + }, + { + "epoch": 0.1306325515280739, + "grad_norm": 0.59375, + "learning_rate": 0.000191747937850273, + "loss": 1.2245, + "step": 2757 + }, + { + "epoch": 0.1306799336650083, + "grad_norm": 0.5390625, + "learning_rate": 0.00019174201280708903, + "loss": 0.968, + "step": 2758 + }, + { + "epoch": 0.13072731580194266, + "grad_norm": 0.609375, + "learning_rate": 0.00019173608572916497, + "loss": 0.7292, + "step": 2759 + }, + { + "epoch": 0.13077469793887705, + "grad_norm": 0.578125, + "learning_rate": 0.00019173015661663222, + "loss": 1.3887, + "step": 2760 + }, + { + "epoch": 0.13082208007581142, + "grad_norm": 0.5703125, + "learning_rate": 0.0001917242254696223, + "loss": 1.1602, + "step": 2761 + }, + { + "epoch": 0.13086946221274579, + "grad_norm": 0.62890625, + "learning_rate": 0.00019171829228826676, + "loss": 1.1013, + "step": 2762 + }, + { + "epoch": 0.13091684434968018, + "grad_norm": 0.388671875, + "learning_rate": 0.00019171235707269717, + "loss": 0.5124, + "step": 2763 + }, + { + "epoch": 0.13096422648661454, + "grad_norm": 0.322265625, + "learning_rate": 0.00019170641982304524, + "loss": 0.2174, + "step": 2764 + }, + { + "epoch": 0.1310116086235489, + "grad_norm": 0.578125, + "learning_rate": 0.00019170048053944255, + "loss": 0.9312, + "step": 2765 + }, + { + "epoch": 0.1310589907604833, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0001916945392220209, + "loss": 0.0015, + "step": 2766 + }, + { + "epoch": 0.13110637289741767, + "grad_norm": 1.171875, + "learning_rate": 0.00019168859587091205, + "loss": 0.5787, + "step": 2767 + }, + { + "epoch": 0.13115375503435206, + "grad_norm": 0.515625, + "learning_rate": 0.00019168265048624775, + "loss": 0.7287, + "step": 2768 + }, + { + "epoch": 0.13120113717128642, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019167670306815998, + "loss": 0.0285, + "step": 2769 + }, + { + "epoch": 0.1312485193082208, + "grad_norm": 0.58984375, + "learning_rate": 0.00019167075361678054, + "loss": 0.8246, + "step": 2770 + }, + { + "epoch": 0.13129590144515518, + "grad_norm": 0.486328125, + "learning_rate": 0.00019166480213224146, + "loss": 0.5416, + "step": 2771 + }, + { + "epoch": 0.13134328358208955, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019165884861467472, + "loss": 0.0086, + "step": 2772 + }, + { + "epoch": 0.13139066571902394, + "grad_norm": 0.59375, + "learning_rate": 0.00019165289306421232, + "loss": 1.7462, + "step": 2773 + }, + { + "epoch": 0.1314380478559583, + "grad_norm": 0.59375, + "learning_rate": 0.0001916469354809864, + "loss": 1.0287, + "step": 2774 + }, + { + "epoch": 0.13148542999289267, + "grad_norm": 0.10546875, + "learning_rate": 0.00019164097586512902, + "loss": 0.0097, + "step": 2775 + }, + { + "epoch": 0.13153281212982706, + "grad_norm": 0.5703125, + "learning_rate": 0.00019163501421677243, + "loss": 1.4429, + "step": 2776 + }, + { + "epoch": 0.13158019426676143, + "grad_norm": 0.75, + "learning_rate": 0.00019162905053604886, + "loss": 1.0607, + "step": 2777 + }, + { + "epoch": 0.1316275764036958, + "grad_norm": 0.41015625, + "learning_rate": 0.00019162308482309053, + "loss": 0.9388, + "step": 2778 + }, + { + "epoch": 0.1316749585406302, + "grad_norm": 0.75390625, + "learning_rate": 0.00019161711707802977, + "loss": 0.1465, + "step": 2779 + }, + { + "epoch": 0.13172234067756455, + "grad_norm": 0.625, + "learning_rate": 0.00019161114730099893, + "loss": 1.1465, + "step": 2780 + }, + { + "epoch": 0.13176972281449895, + "grad_norm": 0.314453125, + "learning_rate": 0.00019160517549213042, + "loss": 0.1622, + "step": 2781 + }, + { + "epoch": 0.1318171049514333, + "grad_norm": 0.578125, + "learning_rate": 0.00019159920165155668, + "loss": 1.3999, + "step": 2782 + }, + { + "epoch": 0.13186448708836768, + "grad_norm": 0.828125, + "learning_rate": 0.00019159322577941024, + "loss": 0.3647, + "step": 2783 + }, + { + "epoch": 0.13191186922530207, + "grad_norm": 0.578125, + "learning_rate": 0.00019158724787582363, + "loss": 1.1849, + "step": 2784 + }, + { + "epoch": 0.13195925136223643, + "grad_norm": 0.54296875, + "learning_rate": 0.0001915812679409294, + "loss": 0.7259, + "step": 2785 + }, + { + "epoch": 0.1320066334991708, + "grad_norm": 0.54296875, + "learning_rate": 0.00019157528597486017, + "loss": 0.9717, + "step": 2786 + }, + { + "epoch": 0.1320540156361052, + "grad_norm": 0.380859375, + "learning_rate": 0.00019156930197774866, + "loss": 0.0891, + "step": 2787 + }, + { + "epoch": 0.13210139777303956, + "grad_norm": 1.0859375, + "learning_rate": 0.00019156331594972757, + "loss": 0.2622, + "step": 2788 + }, + { + "epoch": 0.13214877990997395, + "grad_norm": 0.53515625, + "learning_rate": 0.00019155732789092965, + "loss": 1.1889, + "step": 2789 + }, + { + "epoch": 0.13219616204690832, + "grad_norm": 0.5078125, + "learning_rate": 0.00019155133780148772, + "loss": 0.9256, + "step": 2790 + }, + { + "epoch": 0.13224354418384268, + "grad_norm": 0.6015625, + "learning_rate": 0.00019154534568153464, + "loss": 0.9825, + "step": 2791 + }, + { + "epoch": 0.13229092632077707, + "grad_norm": 0.578125, + "learning_rate": 0.0001915393515312033, + "loss": 1.0119, + "step": 2792 + }, + { + "epoch": 0.13233830845771144, + "grad_norm": 0.546875, + "learning_rate": 0.00019153335535062666, + "loss": 0.9351, + "step": 2793 + }, + { + "epoch": 0.1323856905946458, + "grad_norm": 0.50390625, + "learning_rate": 0.00019152735713993767, + "loss": 0.7316, + "step": 2794 + }, + { + "epoch": 0.1324330727315802, + "grad_norm": 0.60546875, + "learning_rate": 0.0001915213568992694, + "loss": 0.9324, + "step": 2795 + }, + { + "epoch": 0.13248045486851456, + "grad_norm": 0.5234375, + "learning_rate": 0.00019151535462875495, + "loss": 0.7162, + "step": 2796 + }, + { + "epoch": 0.13252783700544896, + "grad_norm": 0.470703125, + "learning_rate": 0.00019150935032852736, + "loss": 0.7289, + "step": 2797 + }, + { + "epoch": 0.13257521914238332, + "grad_norm": 0.61328125, + "learning_rate": 0.0001915033439987199, + "loss": 1.1233, + "step": 2798 + }, + { + "epoch": 0.1326226012793177, + "grad_norm": 0.73046875, + "learning_rate": 0.00019149733563946568, + "loss": 0.9656, + "step": 2799 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 0.5546875, + "learning_rate": 0.00019149132525089806, + "loss": 1.158, + "step": 2800 + }, + { + "epoch": 0.13271736555318644, + "grad_norm": 0.58984375, + "learning_rate": 0.00019148531283315028, + "loss": 0.8042, + "step": 2801 + }, + { + "epoch": 0.13276474769012084, + "grad_norm": 0.6953125, + "learning_rate": 0.00019147929838635573, + "loss": 0.2657, + "step": 2802 + }, + { + "epoch": 0.1328121298270552, + "grad_norm": 0.443359375, + "learning_rate": 0.00019147328191064774, + "loss": 0.6868, + "step": 2803 + }, + { + "epoch": 0.13285951196398957, + "grad_norm": 0.5625, + "learning_rate": 0.0001914672634061598, + "loss": 0.9452, + "step": 2804 + }, + { + "epoch": 0.13290689410092396, + "grad_norm": 0.408203125, + "learning_rate": 0.0001914612428730254, + "loss": 0.5108, + "step": 2805 + }, + { + "epoch": 0.13295427623785833, + "grad_norm": 0.486328125, + "learning_rate": 0.00019145522031137807, + "loss": 0.863, + "step": 2806 + }, + { + "epoch": 0.1330016583747927, + "grad_norm": 0.51171875, + "learning_rate": 0.00019144919572135135, + "loss": 0.0661, + "step": 2807 + }, + { + "epoch": 0.13304904051172708, + "grad_norm": 0.5234375, + "learning_rate": 0.00019144316910307883, + "loss": 0.8761, + "step": 2808 + }, + { + "epoch": 0.13309642264866145, + "grad_norm": 0.447265625, + "learning_rate": 0.00019143714045669427, + "loss": 0.0686, + "step": 2809 + }, + { + "epoch": 0.13314380478559584, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019143110978233133, + "loss": 0.0148, + "step": 2810 + }, + { + "epoch": 0.1331911869225302, + "grad_norm": 0.49609375, + "learning_rate": 0.00019142507708012376, + "loss": 0.0215, + "step": 2811 + }, + { + "epoch": 0.13323856905946457, + "grad_norm": 0.10595703125, + "learning_rate": 0.00019141904235020537, + "loss": 0.0066, + "step": 2812 + }, + { + "epoch": 0.13328595119639897, + "grad_norm": 0.53125, + "learning_rate": 0.00019141300559270995, + "loss": 0.6714, + "step": 2813 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.51171875, + "learning_rate": 0.00019140696680777146, + "loss": 0.7004, + "step": 2814 + }, + { + "epoch": 0.1333807154702677, + "grad_norm": 0.8671875, + "learning_rate": 0.00019140092599552385, + "loss": 1.147, + "step": 2815 + }, + { + "epoch": 0.1334280976072021, + "grad_norm": 0.10400390625, + "learning_rate": 0.000191394883156101, + "loss": 0.0055, + "step": 2816 + }, + { + "epoch": 0.13347547974413645, + "grad_norm": 0.87890625, + "learning_rate": 0.000191388838289637, + "loss": 0.4192, + "step": 2817 + }, + { + "epoch": 0.13352286188107085, + "grad_norm": 1.90625, + "learning_rate": 0.00019138279139626594, + "loss": 0.714, + "step": 2818 + }, + { + "epoch": 0.1335702440180052, + "grad_norm": 0.59375, + "learning_rate": 0.00019137674247612186, + "loss": 0.732, + "step": 2819 + }, + { + "epoch": 0.13361762615493958, + "grad_norm": 0.455078125, + "learning_rate": 0.000191370691529339, + "loss": 0.4689, + "step": 2820 + }, + { + "epoch": 0.13366500829187397, + "grad_norm": 0.62890625, + "learning_rate": 0.0001913646385560515, + "loss": 0.087, + "step": 2821 + }, + { + "epoch": 0.13371239042880834, + "grad_norm": 0.55859375, + "learning_rate": 0.00019135858355639367, + "loss": 0.7453, + "step": 2822 + }, + { + "epoch": 0.1337597725657427, + "grad_norm": 0.6015625, + "learning_rate": 0.00019135252653049975, + "loss": 0.992, + "step": 2823 + }, + { + "epoch": 0.1338071547026771, + "grad_norm": 0.67578125, + "learning_rate": 0.00019134646747850408, + "loss": 0.8448, + "step": 2824 + }, + { + "epoch": 0.13385453683961146, + "grad_norm": 0.57421875, + "learning_rate": 0.00019134040640054106, + "loss": 0.6233, + "step": 2825 + }, + { + "epoch": 0.13390191897654585, + "grad_norm": 0.01019287109375, + "learning_rate": 0.00019133434329674512, + "loss": 0.0008, + "step": 2826 + }, + { + "epoch": 0.13394930111348022, + "grad_norm": 0.6640625, + "learning_rate": 0.00019132827816725073, + "loss": 1.0276, + "step": 2827 + }, + { + "epoch": 0.13399668325041458, + "grad_norm": 0.546875, + "learning_rate": 0.00019132221101219243, + "loss": 1.1445, + "step": 2828 + }, + { + "epoch": 0.13404406538734898, + "grad_norm": 0.337890625, + "learning_rate": 0.00019131614183170477, + "loss": 0.0918, + "step": 2829 + }, + { + "epoch": 0.13409144752428334, + "grad_norm": 0.4765625, + "learning_rate": 0.00019131007062592233, + "loss": 0.8945, + "step": 2830 + }, + { + "epoch": 0.13413882966121773, + "grad_norm": 0.453125, + "learning_rate": 0.00019130399739497977, + "loss": 0.3067, + "step": 2831 + }, + { + "epoch": 0.1341862117981521, + "grad_norm": 0.609375, + "learning_rate": 0.00019129792213901182, + "loss": 1.005, + "step": 2832 + }, + { + "epoch": 0.13423359393508646, + "grad_norm": 0.302734375, + "learning_rate": 0.0001912918448581532, + "loss": 0.0211, + "step": 2833 + }, + { + "epoch": 0.13428097607202086, + "grad_norm": 0.72265625, + "learning_rate": 0.00019128576555253868, + "loss": 1.0393, + "step": 2834 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 0.5546875, + "learning_rate": 0.00019127968422230315, + "loss": 0.6882, + "step": 2835 + }, + { + "epoch": 0.1343757403458896, + "grad_norm": 0.0927734375, + "learning_rate": 0.0001912736008675814, + "loss": 0.0056, + "step": 2836 + }, + { + "epoch": 0.13442312248282398, + "grad_norm": 0.63671875, + "learning_rate": 0.00019126751548850844, + "loss": 1.6895, + "step": 2837 + }, + { + "epoch": 0.13447050461975835, + "grad_norm": 0.84375, + "learning_rate": 0.00019126142808521918, + "loss": 0.7938, + "step": 2838 + }, + { + "epoch": 0.13451788675669274, + "grad_norm": 0.609375, + "learning_rate": 0.00019125533865784868, + "loss": 1.026, + "step": 2839 + }, + { + "epoch": 0.1345652688936271, + "grad_norm": 0.0155029296875, + "learning_rate": 0.00019124924720653196, + "loss": 0.0009, + "step": 2840 + }, + { + "epoch": 0.13461265103056147, + "grad_norm": 0.578125, + "learning_rate": 0.0001912431537314041, + "loss": 0.8351, + "step": 2841 + }, + { + "epoch": 0.13466003316749586, + "grad_norm": 0.4921875, + "learning_rate": 0.00019123705823260033, + "loss": 0.245, + "step": 2842 + }, + { + "epoch": 0.13470741530443023, + "grad_norm": 0.48046875, + "learning_rate": 0.00019123096071025576, + "loss": 0.8535, + "step": 2843 + }, + { + "epoch": 0.1347547974413646, + "grad_norm": 0.59375, + "learning_rate": 0.00019122486116450566, + "loss": 0.469, + "step": 2844 + }, + { + "epoch": 0.134802179578299, + "grad_norm": 0.640625, + "learning_rate": 0.0001912187595954853, + "loss": 0.3835, + "step": 2845 + }, + { + "epoch": 0.13484956171523335, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019121265600333004, + "loss": 0.022, + "step": 2846 + }, + { + "epoch": 0.13489694385216774, + "grad_norm": 0.546875, + "learning_rate": 0.0001912065503881752, + "loss": 0.1573, + "step": 2847 + }, + { + "epoch": 0.1349443259891021, + "grad_norm": 0.208984375, + "learning_rate": 0.00019120044275015624, + "loss": 0.1412, + "step": 2848 + }, + { + "epoch": 0.13499170812603647, + "grad_norm": 0.54296875, + "learning_rate": 0.00019119433308940857, + "loss": 1.1661, + "step": 2849 + }, + { + "epoch": 0.13503909026297087, + "grad_norm": 0.76953125, + "learning_rate": 0.00019118822140606776, + "loss": 1.1511, + "step": 2850 + }, + { + "epoch": 0.13508647239990523, + "grad_norm": 1.1875, + "learning_rate": 0.0001911821077002693, + "loss": 0.3723, + "step": 2851 + }, + { + "epoch": 0.1351338545368396, + "grad_norm": 0.8046875, + "learning_rate": 0.00019117599197214884, + "loss": 0.1053, + "step": 2852 + }, + { + "epoch": 0.135181236673774, + "grad_norm": 0.26171875, + "learning_rate": 0.00019116987422184198, + "loss": 0.1539, + "step": 2853 + }, + { + "epoch": 0.13522861881070836, + "grad_norm": 0.59765625, + "learning_rate": 0.0001911637544494844, + "loss": 0.8968, + "step": 2854 + }, + { + "epoch": 0.13527600094764275, + "grad_norm": 0.69921875, + "learning_rate": 0.0001911576326552119, + "loss": 0.7788, + "step": 2855 + }, + { + "epoch": 0.13532338308457711, + "grad_norm": 0.002777099609375, + "learning_rate": 0.00019115150883916018, + "loss": 0.0002, + "step": 2856 + }, + { + "epoch": 0.13537076522151148, + "grad_norm": 0.41015625, + "learning_rate": 0.00019114538300146508, + "loss": 0.1204, + "step": 2857 + }, + { + "epoch": 0.13541814735844587, + "grad_norm": 0.400390625, + "learning_rate": 0.00019113925514226246, + "loss": 0.0465, + "step": 2858 + }, + { + "epoch": 0.13546552949538024, + "grad_norm": 0.55078125, + "learning_rate": 0.00019113312526168827, + "loss": 0.9245, + "step": 2859 + }, + { + "epoch": 0.13551291163231463, + "grad_norm": 0.796875, + "learning_rate": 0.00019112699335987842, + "loss": 0.3414, + "step": 2860 + }, + { + "epoch": 0.135560293769249, + "grad_norm": 0.49609375, + "learning_rate": 0.00019112085943696888, + "loss": 0.7549, + "step": 2861 + }, + { + "epoch": 0.13560767590618336, + "grad_norm": 0.5859375, + "learning_rate": 0.00019111472349309577, + "loss": 0.6086, + "step": 2862 + }, + { + "epoch": 0.13565505804311775, + "grad_norm": 0.470703125, + "learning_rate": 0.00019110858552839517, + "loss": 0.5047, + "step": 2863 + }, + { + "epoch": 0.13570244018005212, + "grad_norm": 0.55859375, + "learning_rate": 0.00019110244554300314, + "loss": 0.9906, + "step": 2864 + }, + { + "epoch": 0.13574982231698648, + "grad_norm": 1.015625, + "learning_rate": 0.00019109630353705592, + "loss": 0.1466, + "step": 2865 + }, + { + "epoch": 0.13579720445392088, + "grad_norm": 0.15234375, + "learning_rate": 0.00019109015951068975, + "loss": 0.0174, + "step": 2866 + }, + { + "epoch": 0.13584458659085524, + "grad_norm": 0.2421875, + "learning_rate": 0.00019108401346404084, + "loss": 0.0408, + "step": 2867 + }, + { + "epoch": 0.13589196872778964, + "grad_norm": 0.69140625, + "learning_rate": 0.00019107786539724554, + "loss": 1.2381, + "step": 2868 + }, + { + "epoch": 0.135939350864724, + "grad_norm": 0.5859375, + "learning_rate": 0.00019107171531044018, + "loss": 0.967, + "step": 2869 + }, + { + "epoch": 0.13598673300165837, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001910655632037612, + "loss": 0.0303, + "step": 2870 + }, + { + "epoch": 0.13603411513859276, + "grad_norm": 0.65234375, + "learning_rate": 0.00019105940907734498, + "loss": 0.6367, + "step": 2871 + }, + { + "epoch": 0.13608149727552712, + "grad_norm": 0.67578125, + "learning_rate": 0.00019105325293132812, + "loss": 0.944, + "step": 2872 + }, + { + "epoch": 0.1361288794124615, + "grad_norm": 0.78515625, + "learning_rate": 0.00019104709476584707, + "loss": 1.4161, + "step": 2873 + }, + { + "epoch": 0.13617626154939588, + "grad_norm": 0.44921875, + "learning_rate": 0.00019104093458103841, + "loss": 0.6082, + "step": 2874 + }, + { + "epoch": 0.13622364368633025, + "grad_norm": 0.59375, + "learning_rate": 0.00019103477237703885, + "loss": 1.1064, + "step": 2875 + }, + { + "epoch": 0.13627102582326464, + "grad_norm": 0.02099609375, + "learning_rate": 0.00019102860815398495, + "loss": 0.0011, + "step": 2876 + }, + { + "epoch": 0.136318407960199, + "grad_norm": 0.482421875, + "learning_rate": 0.0001910224419120135, + "loss": 0.8008, + "step": 2877 + }, + { + "epoch": 0.13636579009713337, + "grad_norm": 0.451171875, + "learning_rate": 0.00019101627365126125, + "loss": 0.0692, + "step": 2878 + }, + { + "epoch": 0.13641317223406776, + "grad_norm": 0.2177734375, + "learning_rate": 0.000191010103371865, + "loss": 0.0145, + "step": 2879 + }, + { + "epoch": 0.13646055437100213, + "grad_norm": 1.1328125, + "learning_rate": 0.00019100393107396158, + "loss": 0.2141, + "step": 2880 + }, + { + "epoch": 0.1365079365079365, + "grad_norm": 0.3203125, + "learning_rate": 0.00019099775675768791, + "loss": 0.0824, + "step": 2881 + }, + { + "epoch": 0.1365553186448709, + "grad_norm": 0.609375, + "learning_rate": 0.00019099158042318096, + "loss": 0.1288, + "step": 2882 + }, + { + "epoch": 0.13660270078180525, + "grad_norm": 0.00469970703125, + "learning_rate": 0.0001909854020705776, + "loss": 0.0003, + "step": 2883 + }, + { + "epoch": 0.13665008291873965, + "grad_norm": 0.671875, + "learning_rate": 0.00019097922170001498, + "loss": 0.8375, + "step": 2884 + }, + { + "epoch": 0.136697465055674, + "grad_norm": 0.48828125, + "learning_rate": 0.00019097303931163014, + "loss": 1.0394, + "step": 2885 + }, + { + "epoch": 0.13674484719260838, + "grad_norm": 0.52734375, + "learning_rate": 0.00019096685490556018, + "loss": 0.7349, + "step": 2886 + }, + { + "epoch": 0.13679222932954277, + "grad_norm": 0.74609375, + "learning_rate": 0.00019096066848194225, + "loss": 1.3313, + "step": 2887 + }, + { + "epoch": 0.13683961146647713, + "grad_norm": 0.74609375, + "learning_rate": 0.00019095448004091358, + "loss": 1.1952, + "step": 2888 + }, + { + "epoch": 0.13688699360341153, + "grad_norm": 0.044189453125, + "learning_rate": 0.00019094828958261145, + "loss": 0.002, + "step": 2889 + }, + { + "epoch": 0.1369343757403459, + "grad_norm": 0.625, + "learning_rate": 0.0001909420971071731, + "loss": 1.0378, + "step": 2890 + }, + { + "epoch": 0.13698175787728026, + "grad_norm": 0.55859375, + "learning_rate": 0.00019093590261473592, + "loss": 1.2131, + "step": 2891 + }, + { + "epoch": 0.13702914001421465, + "grad_norm": 1.078125, + "learning_rate": 0.0001909297061054373, + "loss": 0.2519, + "step": 2892 + }, + { + "epoch": 0.13707652215114902, + "grad_norm": 0.36328125, + "learning_rate": 0.00019092350757941463, + "loss": 0.1355, + "step": 2893 + }, + { + "epoch": 0.13712390428808338, + "grad_norm": 0.55078125, + "learning_rate": 0.0001909173070368054, + "loss": 0.0684, + "step": 2894 + }, + { + "epoch": 0.13717128642501777, + "grad_norm": 0.59765625, + "learning_rate": 0.0001909111044777472, + "loss": 1.0816, + "step": 2895 + }, + { + "epoch": 0.13721866856195214, + "grad_norm": 0.59765625, + "learning_rate": 0.00019090489990237746, + "loss": 0.1926, + "step": 2896 + }, + { + "epoch": 0.13726605069888653, + "grad_norm": 0.5546875, + "learning_rate": 0.0001908986933108339, + "loss": 0.7542, + "step": 2897 + }, + { + "epoch": 0.1373134328358209, + "grad_norm": 1.171875, + "learning_rate": 0.00019089248470325414, + "loss": 1.1729, + "step": 2898 + }, + { + "epoch": 0.13736081497275526, + "grad_norm": 0.5859375, + "learning_rate": 0.00019088627407977588, + "loss": 1.0235, + "step": 2899 + }, + { + "epoch": 0.13740819710968966, + "grad_norm": 0.482421875, + "learning_rate": 0.00019088006144053686, + "loss": 1.096, + "step": 2900 + }, + { + "epoch": 0.13745557924662402, + "grad_norm": 0.294921875, + "learning_rate": 0.0001908738467856749, + "loss": 0.1479, + "step": 2901 + }, + { + "epoch": 0.1375029613835584, + "grad_norm": 0.5234375, + "learning_rate": 0.0001908676301153278, + "loss": 0.6035, + "step": 2902 + }, + { + "epoch": 0.13755034352049278, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001908614114296335, + "loss": 0.0202, + "step": 2903 + }, + { + "epoch": 0.13759772565742714, + "grad_norm": 0.271484375, + "learning_rate": 0.0001908551907287298, + "loss": 0.1763, + "step": 2904 + }, + { + "epoch": 0.13764510779436154, + "grad_norm": 0.640625, + "learning_rate": 0.00019084896801275478, + "loss": 1.3041, + "step": 2905 + }, + { + "epoch": 0.1376924899312959, + "grad_norm": 0.498046875, + "learning_rate": 0.0001908427432818464, + "loss": 1.2016, + "step": 2906 + }, + { + "epoch": 0.13773987206823027, + "grad_norm": 0.59765625, + "learning_rate": 0.00019083651653614277, + "loss": 1.1691, + "step": 2907 + }, + { + "epoch": 0.13778725420516466, + "grad_norm": 0.47265625, + "learning_rate": 0.00019083028777578192, + "loss": 0.8341, + "step": 2908 + }, + { + "epoch": 0.13783463634209903, + "grad_norm": 0.244140625, + "learning_rate": 0.00019082405700090207, + "loss": 0.1651, + "step": 2909 + }, + { + "epoch": 0.1378820184790334, + "grad_norm": 0.578125, + "learning_rate": 0.00019081782421164137, + "loss": 1.4576, + "step": 2910 + }, + { + "epoch": 0.13792940061596778, + "grad_norm": 0.5859375, + "learning_rate": 0.00019081158940813806, + "loss": 0.9515, + "step": 2911 + }, + { + "epoch": 0.13797678275290215, + "grad_norm": 0.435546875, + "learning_rate": 0.0001908053525905304, + "loss": 0.2158, + "step": 2912 + }, + { + "epoch": 0.13802416488983654, + "grad_norm": 0.7265625, + "learning_rate": 0.00019079911375895675, + "loss": 1.3892, + "step": 2913 + }, + { + "epoch": 0.1380715470267709, + "grad_norm": 0.5703125, + "learning_rate": 0.0001907928729135555, + "loss": 0.9338, + "step": 2914 + }, + { + "epoch": 0.13811892916370527, + "grad_norm": 0.361328125, + "learning_rate": 0.000190786630054465, + "loss": 0.0649, + "step": 2915 + }, + { + "epoch": 0.13816631130063967, + "grad_norm": 0.58984375, + "learning_rate": 0.00019078038518182376, + "loss": 0.9949, + "step": 2916 + }, + { + "epoch": 0.13821369343757403, + "grad_norm": 0.62890625, + "learning_rate": 0.0001907741382957703, + "loss": 1.1079, + "step": 2917 + }, + { + "epoch": 0.13826107557450842, + "grad_norm": 0.482421875, + "learning_rate": 0.00019076788939644313, + "loss": 0.1373, + "step": 2918 + }, + { + "epoch": 0.1383084577114428, + "grad_norm": 0.49609375, + "learning_rate": 0.0001907616384839808, + "loss": 0.8246, + "step": 2919 + }, + { + "epoch": 0.13835583984837715, + "grad_norm": 0.56640625, + "learning_rate": 0.00019075538555852207, + "loss": 1.201, + "step": 2920 + }, + { + "epoch": 0.13840322198531155, + "grad_norm": 0.69140625, + "learning_rate": 0.00019074913062020553, + "loss": 0.1198, + "step": 2921 + }, + { + "epoch": 0.1384506041222459, + "grad_norm": 0.50390625, + "learning_rate": 0.00019074287366916995, + "loss": 0.9153, + "step": 2922 + }, + { + "epoch": 0.13849798625918028, + "grad_norm": 0.6328125, + "learning_rate": 0.00019073661470555406, + "loss": 1.2473, + "step": 2923 + }, + { + "epoch": 0.13854536839611467, + "grad_norm": 0.53125, + "learning_rate": 0.00019073035372949671, + "loss": 0.9549, + "step": 2924 + }, + { + "epoch": 0.13859275053304904, + "grad_norm": 0.97265625, + "learning_rate": 0.00019072409074113677, + "loss": 0.7476, + "step": 2925 + }, + { + "epoch": 0.13864013266998343, + "grad_norm": 0.462890625, + "learning_rate": 0.00019071782574061312, + "loss": 0.6304, + "step": 2926 + }, + { + "epoch": 0.1386875148069178, + "grad_norm": 0.57421875, + "learning_rate": 0.00019071155872806475, + "loss": 1.4026, + "step": 2927 + }, + { + "epoch": 0.13873489694385216, + "grad_norm": 0.58203125, + "learning_rate": 0.0001907052897036306, + "loss": 0.1602, + "step": 2928 + }, + { + "epoch": 0.13878227908078655, + "grad_norm": 0.52734375, + "learning_rate": 0.00019069901866744976, + "loss": 0.7066, + "step": 2929 + }, + { + "epoch": 0.13882966121772092, + "grad_norm": 0.478515625, + "learning_rate": 0.00019069274561966127, + "loss": 0.9224, + "step": 2930 + }, + { + "epoch": 0.13887704335465528, + "grad_norm": 0.546875, + "learning_rate": 0.0001906864705604043, + "loss": 1.169, + "step": 2931 + }, + { + "epoch": 0.13892442549158968, + "grad_norm": 0.64453125, + "learning_rate": 0.000190680193489818, + "loss": 0.5324, + "step": 2932 + }, + { + "epoch": 0.13897180762852404, + "grad_norm": 0.46875, + "learning_rate": 0.00019067391440804162, + "loss": 0.424, + "step": 2933 + }, + { + "epoch": 0.13901918976545843, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019066763331521437, + "loss": 0.0024, + "step": 2934 + }, + { + "epoch": 0.1390665719023928, + "grad_norm": 0.53125, + "learning_rate": 0.0001906613502114756, + "loss": 0.7701, + "step": 2935 + }, + { + "epoch": 0.13911395403932716, + "grad_norm": 0.625, + "learning_rate": 0.00019065506509696465, + "loss": 1.1735, + "step": 2936 + }, + { + "epoch": 0.13916133617626156, + "grad_norm": 0.62890625, + "learning_rate": 0.0001906487779718209, + "loss": 1.383, + "step": 2937 + }, + { + "epoch": 0.13920871831319592, + "grad_norm": 0.4453125, + "learning_rate": 0.00019064248883618383, + "loss": 0.4393, + "step": 2938 + }, + { + "epoch": 0.1392561004501303, + "grad_norm": 0.73046875, + "learning_rate": 0.0001906361976901929, + "loss": 0.4204, + "step": 2939 + }, + { + "epoch": 0.13930348258706468, + "grad_norm": 0.5625, + "learning_rate": 0.00019062990453398766, + "loss": 1.2174, + "step": 2940 + }, + { + "epoch": 0.13935086472399905, + "grad_norm": 0.5234375, + "learning_rate": 0.00019062360936770767, + "loss": 0.6505, + "step": 2941 + }, + { + "epoch": 0.13939824686093344, + "grad_norm": 0.796875, + "learning_rate": 0.00019061731219149251, + "loss": 1.3342, + "step": 2942 + }, + { + "epoch": 0.1394456289978678, + "grad_norm": 0.625, + "learning_rate": 0.00019061101300548195, + "loss": 1.5339, + "step": 2943 + }, + { + "epoch": 0.13949301113480217, + "grad_norm": 0.953125, + "learning_rate": 0.00019060471180981558, + "loss": 1.3235, + "step": 2944 + }, + { + "epoch": 0.13954039327173656, + "grad_norm": 0.6796875, + "learning_rate": 0.00019059840860463325, + "loss": 1.2126, + "step": 2945 + }, + { + "epoch": 0.13958777540867093, + "grad_norm": 0.61328125, + "learning_rate": 0.0001905921033900747, + "loss": 1.1159, + "step": 2946 + }, + { + "epoch": 0.13963515754560532, + "grad_norm": 0.423828125, + "learning_rate": 0.0001905857961662798, + "loss": 0.1903, + "step": 2947 + }, + { + "epoch": 0.13968253968253969, + "grad_norm": 0.70703125, + "learning_rate": 0.00019057948693338843, + "loss": 0.8658, + "step": 2948 + }, + { + "epoch": 0.13972992181947405, + "grad_norm": 0.58203125, + "learning_rate": 0.0001905731756915405, + "loss": 0.0776, + "step": 2949 + }, + { + "epoch": 0.13977730395640844, + "grad_norm": 0.455078125, + "learning_rate": 0.00019056686244087602, + "loss": 0.8222, + "step": 2950 + }, + { + "epoch": 0.1398246860933428, + "grad_norm": 0.66015625, + "learning_rate": 0.000190560547181535, + "loss": 1.0425, + "step": 2951 + }, + { + "epoch": 0.13987206823027717, + "grad_norm": 0.62109375, + "learning_rate": 0.00019055422991365754, + "loss": 0.9886, + "step": 2952 + }, + { + "epoch": 0.13991945036721157, + "grad_norm": 0.48828125, + "learning_rate": 0.00019054791063738365, + "loss": 0.6058, + "step": 2953 + }, + { + "epoch": 0.13996683250414593, + "grad_norm": 0.53125, + "learning_rate": 0.00019054158935285357, + "loss": 0.2055, + "step": 2954 + }, + { + "epoch": 0.14001421464108033, + "grad_norm": 1.15625, + "learning_rate": 0.0001905352660602075, + "loss": 1.2486, + "step": 2955 + }, + { + "epoch": 0.1400615967780147, + "grad_norm": 0.3046875, + "learning_rate": 0.00019052894075958566, + "loss": 0.1538, + "step": 2956 + }, + { + "epoch": 0.14010897891494906, + "grad_norm": 0.47265625, + "learning_rate": 0.0001905226134511283, + "loss": 0.6826, + "step": 2957 + }, + { + "epoch": 0.14015636105188345, + "grad_norm": 0.640625, + "learning_rate": 0.00019051628413497583, + "loss": 1.2466, + "step": 2958 + }, + { + "epoch": 0.14020374318881781, + "grad_norm": 0.4609375, + "learning_rate": 0.0001905099528112686, + "loss": 0.7137, + "step": 2959 + }, + { + "epoch": 0.14025112532575218, + "grad_norm": 1.0625, + "learning_rate": 0.00019050361948014702, + "loss": 0.0606, + "step": 2960 + }, + { + "epoch": 0.14029850746268657, + "grad_norm": 0.58984375, + "learning_rate": 0.00019049728414175153, + "loss": 0.8559, + "step": 2961 + }, + { + "epoch": 0.14034588959962094, + "grad_norm": 0.578125, + "learning_rate": 0.00019049094679622267, + "loss": 0.6058, + "step": 2962 + }, + { + "epoch": 0.14039327173655533, + "grad_norm": 0.65234375, + "learning_rate": 0.00019048460744370104, + "loss": 0.1514, + "step": 2963 + }, + { + "epoch": 0.1404406538734897, + "grad_norm": 0.50390625, + "learning_rate": 0.00019047826608432718, + "loss": 1.0782, + "step": 2964 + }, + { + "epoch": 0.14048803601042406, + "grad_norm": 0.4765625, + "learning_rate": 0.00019047192271824174, + "loss": 0.1652, + "step": 2965 + }, + { + "epoch": 0.14053541814735845, + "grad_norm": 0.66015625, + "learning_rate": 0.00019046557734558542, + "loss": 0.8274, + "step": 2966 + }, + { + "epoch": 0.14058280028429282, + "grad_norm": 0.5, + "learning_rate": 0.00019045922996649897, + "loss": 0.9063, + "step": 2967 + }, + { + "epoch": 0.14063018242122718, + "grad_norm": 0.62109375, + "learning_rate": 0.00019045288058112313, + "loss": 1.4254, + "step": 2968 + }, + { + "epoch": 0.14067756455816158, + "grad_norm": 0.435546875, + "learning_rate": 0.00019044652918959876, + "loss": 0.1689, + "step": 2969 + }, + { + "epoch": 0.14072494669509594, + "grad_norm": 1.109375, + "learning_rate": 0.00019044017579206672, + "loss": 0.1385, + "step": 2970 + }, + { + "epoch": 0.14077232883203034, + "grad_norm": 0.6796875, + "learning_rate": 0.00019043382038866789, + "loss": 0.9092, + "step": 2971 + }, + { + "epoch": 0.1408197109689647, + "grad_norm": 0.2490234375, + "learning_rate": 0.00019042746297954328, + "loss": 0.1566, + "step": 2972 + }, + { + "epoch": 0.14086709310589907, + "grad_norm": 0.5859375, + "learning_rate": 0.00019042110356483382, + "loss": 1.2021, + "step": 2973 + }, + { + "epoch": 0.14091447524283346, + "grad_norm": 0.7265625, + "learning_rate": 0.0001904147421446806, + "loss": 1.2113, + "step": 2974 + }, + { + "epoch": 0.14096185737976782, + "grad_norm": 0.6328125, + "learning_rate": 0.00019040837871922475, + "loss": 1.3793, + "step": 2975 + }, + { + "epoch": 0.1410092395167022, + "grad_norm": 0.95703125, + "learning_rate": 0.00019040201328860733, + "loss": 0.2159, + "step": 2976 + }, + { + "epoch": 0.14105662165363658, + "grad_norm": 0.99609375, + "learning_rate": 0.00019039564585296957, + "loss": 0.1552, + "step": 2977 + }, + { + "epoch": 0.14110400379057095, + "grad_norm": 0.640625, + "learning_rate": 0.00019038927641245264, + "loss": 1.4, + "step": 2978 + }, + { + "epoch": 0.14115138592750534, + "grad_norm": 0.5390625, + "learning_rate": 0.0001903829049671978, + "loss": 0.9316, + "step": 2979 + }, + { + "epoch": 0.1411987680644397, + "grad_norm": 0.6171875, + "learning_rate": 0.00019037653151734646, + "loss": 0.8791, + "step": 2980 + }, + { + "epoch": 0.14124615020137407, + "grad_norm": 0.5546875, + "learning_rate": 0.00019037015606303991, + "loss": 1.2623, + "step": 2981 + }, + { + "epoch": 0.14129353233830846, + "grad_norm": 0.5390625, + "learning_rate": 0.00019036377860441955, + "loss": 0.7812, + "step": 2982 + }, + { + "epoch": 0.14134091447524283, + "grad_norm": 0.92578125, + "learning_rate": 0.00019035739914162683, + "loss": 0.2049, + "step": 2983 + }, + { + "epoch": 0.14138829661217722, + "grad_norm": 0.5625, + "learning_rate": 0.00019035101767480322, + "loss": 1.0255, + "step": 2984 + }, + { + "epoch": 0.1414356787491116, + "grad_norm": 0.4921875, + "learning_rate": 0.0001903446342040903, + "loss": 0.916, + "step": 2985 + }, + { + "epoch": 0.14148306088604595, + "grad_norm": 0.7578125, + "learning_rate": 0.00019033824872962963, + "loss": 1.4644, + "step": 2986 + }, + { + "epoch": 0.14153044302298035, + "grad_norm": 0.66015625, + "learning_rate": 0.00019033186125156282, + "loss": 1.173, + "step": 2987 + }, + { + "epoch": 0.1415778251599147, + "grad_norm": 0.016845703125, + "learning_rate": 0.00019032547177003152, + "loss": 0.0009, + "step": 2988 + }, + { + "epoch": 0.14162520729684908, + "grad_norm": 0.482421875, + "learning_rate": 0.0001903190802851775, + "loss": 0.8895, + "step": 2989 + }, + { + "epoch": 0.14167258943378347, + "grad_norm": 0.6171875, + "learning_rate": 0.0001903126867971425, + "loss": 0.7037, + "step": 2990 + }, + { + "epoch": 0.14171997157071783, + "grad_norm": 1.328125, + "learning_rate": 0.00019030629130606825, + "loss": 0.5518, + "step": 2991 + }, + { + "epoch": 0.14176735370765223, + "grad_norm": 0.369140625, + "learning_rate": 0.00019029989381209669, + "loss": 0.968, + "step": 2992 + }, + { + "epoch": 0.1418147358445866, + "grad_norm": 0.59765625, + "learning_rate": 0.00019029349431536963, + "loss": 0.8648, + "step": 2993 + }, + { + "epoch": 0.14186211798152096, + "grad_norm": 0.51171875, + "learning_rate": 0.0001902870928160291, + "loss": 0.2046, + "step": 2994 + }, + { + "epoch": 0.14190950011845535, + "grad_norm": 0.6953125, + "learning_rate": 0.00019028068931421695, + "loss": 1.3475, + "step": 2995 + }, + { + "epoch": 0.14195688225538972, + "grad_norm": 0.478515625, + "learning_rate": 0.00019027428381007534, + "loss": 0.3664, + "step": 2996 + }, + { + "epoch": 0.14200426439232408, + "grad_norm": 0.546875, + "learning_rate": 0.00019026787630374621, + "loss": 1.3609, + "step": 2997 + }, + { + "epoch": 0.14205164652925847, + "grad_norm": 0.6328125, + "learning_rate": 0.00019026146679537175, + "loss": 1.3, + "step": 2998 + }, + { + "epoch": 0.14209902866619284, + "grad_norm": 0.58984375, + "learning_rate": 0.00019025505528509414, + "loss": 0.9183, + "step": 2999 + }, + { + "epoch": 0.14214641080312723, + "grad_norm": 0.7109375, + "learning_rate": 0.00019024864177305547, + "loss": 0.1665, + "step": 3000 + }, + { + "epoch": 0.1421937929400616, + "grad_norm": 0.69921875, + "learning_rate": 0.00019024222625939812, + "loss": 0.9524, + "step": 3001 + }, + { + "epoch": 0.14224117507699596, + "grad_norm": 0.8984375, + "learning_rate": 0.0001902358087442643, + "loss": 0.9756, + "step": 3002 + }, + { + "epoch": 0.14228855721393036, + "grad_norm": 1.296875, + "learning_rate": 0.00019022938922779633, + "loss": 0.3226, + "step": 3003 + }, + { + "epoch": 0.14233593935086472, + "grad_norm": 0.65625, + "learning_rate": 0.00019022296771013664, + "loss": 0.9146, + "step": 3004 + }, + { + "epoch": 0.14238332148779909, + "grad_norm": 0.55078125, + "learning_rate": 0.0001902165441914276, + "loss": 1.1521, + "step": 3005 + }, + { + "epoch": 0.14243070362473348, + "grad_norm": 0.53515625, + "learning_rate": 0.0001902101186718117, + "loss": 0.9258, + "step": 3006 + }, + { + "epoch": 0.14247808576166784, + "grad_norm": 0.63671875, + "learning_rate": 0.0001902036911514315, + "loss": 1.0645, + "step": 3007 + }, + { + "epoch": 0.14252546789860224, + "grad_norm": 0.6875, + "learning_rate": 0.0001901972616304295, + "loss": 1.4107, + "step": 3008 + }, + { + "epoch": 0.1425728500355366, + "grad_norm": 0.71875, + "learning_rate": 0.0001901908301089483, + "loss": 1.3563, + "step": 3009 + }, + { + "epoch": 0.14262023217247097, + "grad_norm": 0.75, + "learning_rate": 0.00019018439658713055, + "loss": 0.1836, + "step": 3010 + }, + { + "epoch": 0.14266761430940536, + "grad_norm": 0.74609375, + "learning_rate": 0.00019017796106511893, + "loss": 0.0152, + "step": 3011 + }, + { + "epoch": 0.14271499644633973, + "grad_norm": 0.63671875, + "learning_rate": 0.0001901715235430562, + "loss": 1.1516, + "step": 3012 + }, + { + "epoch": 0.14276237858327412, + "grad_norm": 0.46484375, + "learning_rate": 0.00019016508402108513, + "loss": 1.2557, + "step": 3013 + }, + { + "epoch": 0.14280976072020848, + "grad_norm": 0.6171875, + "learning_rate": 0.00019015864249934854, + "loss": 1.0156, + "step": 3014 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.65234375, + "learning_rate": 0.00019015219897798927, + "loss": 1.5077, + "step": 3015 + }, + { + "epoch": 0.14290452499407724, + "grad_norm": 0.53125, + "learning_rate": 0.00019014575345715028, + "loss": 0.8094, + "step": 3016 + }, + { + "epoch": 0.1429519071310116, + "grad_norm": 0.59375, + "learning_rate": 0.0001901393059369745, + "loss": 1.1134, + "step": 3017 + }, + { + "epoch": 0.14299928926794597, + "grad_norm": 0.37109375, + "learning_rate": 0.0001901328564176049, + "loss": 0.5888, + "step": 3018 + }, + { + "epoch": 0.14304667140488037, + "grad_norm": 0.5390625, + "learning_rate": 0.00019012640489918456, + "loss": 1.2123, + "step": 3019 + }, + { + "epoch": 0.14309405354181473, + "grad_norm": 0.640625, + "learning_rate": 0.00019011995138185656, + "loss": 0.0636, + "step": 3020 + }, + { + "epoch": 0.14314143567874912, + "grad_norm": 0.5234375, + "learning_rate": 0.00019011349586576403, + "loss": 0.9244, + "step": 3021 + }, + { + "epoch": 0.1431888178156835, + "grad_norm": 0.66796875, + "learning_rate": 0.00019010703835105016, + "loss": 1.1969, + "step": 3022 + }, + { + "epoch": 0.14323619995261785, + "grad_norm": 0.4921875, + "learning_rate": 0.0001901005788378581, + "loss": 0.0429, + "step": 3023 + }, + { + "epoch": 0.14328358208955225, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019009411732633124, + "loss": 0.128, + "step": 3024 + }, + { + "epoch": 0.1433309642264866, + "grad_norm": 0.58984375, + "learning_rate": 0.0001900876538166128, + "loss": 1.0117, + "step": 3025 + }, + { + "epoch": 0.14337834636342098, + "grad_norm": 0.20703125, + "learning_rate": 0.00019008118830884615, + "loss": 0.1456, + "step": 3026 + }, + { + "epoch": 0.14342572850035537, + "grad_norm": 0.48828125, + "learning_rate": 0.0001900747208031747, + "loss": 1.0432, + "step": 3027 + }, + { + "epoch": 0.14347311063728974, + "grad_norm": 0.54296875, + "learning_rate": 0.00019006825129974186, + "loss": 0.7482, + "step": 3028 + }, + { + "epoch": 0.14352049277422413, + "grad_norm": 0.61328125, + "learning_rate": 0.00019006177979869116, + "loss": 1.1897, + "step": 3029 + }, + { + "epoch": 0.1435678749111585, + "grad_norm": 0.6328125, + "learning_rate": 0.00019005530630016612, + "loss": 0.0816, + "step": 3030 + }, + { + "epoch": 0.14361525704809286, + "grad_norm": 0.703125, + "learning_rate": 0.00019004883080431034, + "loss": 1.1493, + "step": 3031 + }, + { + "epoch": 0.14366263918502725, + "grad_norm": 0.5234375, + "learning_rate": 0.00019004235331126736, + "loss": 0.0867, + "step": 3032 + }, + { + "epoch": 0.14371002132196162, + "grad_norm": 0.5625, + "learning_rate": 0.00019003587382118088, + "loss": 1.1148, + "step": 3033 + }, + { + "epoch": 0.14375740345889598, + "grad_norm": 0.0164794921875, + "learning_rate": 0.00019002939233419465, + "loss": 0.0006, + "step": 3034 + }, + { + "epoch": 0.14380478559583038, + "grad_norm": 0.61328125, + "learning_rate": 0.00019002290885045242, + "loss": 0.1768, + "step": 3035 + }, + { + "epoch": 0.14385216773276474, + "grad_norm": 0.62890625, + "learning_rate": 0.00019001642337009792, + "loss": 0.8938, + "step": 3036 + }, + { + "epoch": 0.14389954986969913, + "grad_norm": 0.61328125, + "learning_rate": 0.00019000993589327503, + "loss": 0.9925, + "step": 3037 + }, + { + "epoch": 0.1439469320066335, + "grad_norm": 0.5390625, + "learning_rate": 0.00019000344642012765, + "loss": 0.9377, + "step": 3038 + }, + { + "epoch": 0.14399431414356786, + "grad_norm": 0.72265625, + "learning_rate": 0.0001899969549507997, + "loss": 1.0997, + "step": 3039 + }, + { + "epoch": 0.14404169628050226, + "grad_norm": 0.5703125, + "learning_rate": 0.00018999046148543514, + "loss": 0.2018, + "step": 3040 + }, + { + "epoch": 0.14408907841743662, + "grad_norm": 0.5078125, + "learning_rate": 0.000189983966024178, + "loss": 0.7018, + "step": 3041 + }, + { + "epoch": 0.14413646055437102, + "grad_norm": 0.220703125, + "learning_rate": 0.00018997746856717234, + "loss": 0.0291, + "step": 3042 + }, + { + "epoch": 0.14418384269130538, + "grad_norm": 0.78125, + "learning_rate": 0.00018997096911456228, + "loss": 0.0811, + "step": 3043 + }, + { + "epoch": 0.14423122482823975, + "grad_norm": 0.419921875, + "learning_rate": 0.00018996446766649192, + "loss": 0.0217, + "step": 3044 + }, + { + "epoch": 0.14427860696517414, + "grad_norm": 0.66015625, + "learning_rate": 0.00018995796422310553, + "loss": 0.9543, + "step": 3045 + }, + { + "epoch": 0.1443259891021085, + "grad_norm": 0.50390625, + "learning_rate": 0.0001899514587845473, + "loss": 0.0619, + "step": 3046 + }, + { + "epoch": 0.14437337123904287, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018994495135096152, + "loss": 0.1531, + "step": 3047 + }, + { + "epoch": 0.14442075337597726, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018993844192249256, + "loss": 0.0108, + "step": 3048 + }, + { + "epoch": 0.14446813551291163, + "grad_norm": 0.89453125, + "learning_rate": 0.0001899319304992847, + "loss": 1.1234, + "step": 3049 + }, + { + "epoch": 0.14451551764984602, + "grad_norm": 0.6015625, + "learning_rate": 0.00018992541708148246, + "loss": 1.077, + "step": 3050 + }, + { + "epoch": 0.14456289978678039, + "grad_norm": 0.53515625, + "learning_rate": 0.0001899189016692302, + "loss": 0.5022, + "step": 3051 + }, + { + "epoch": 0.14461028192371475, + "grad_norm": 0.5859375, + "learning_rate": 0.00018991238426267252, + "loss": 0.6068, + "step": 3052 + }, + { + "epoch": 0.14465766406064914, + "grad_norm": 0.61328125, + "learning_rate": 0.00018990586486195392, + "loss": 1.5448, + "step": 3053 + }, + { + "epoch": 0.1447050461975835, + "grad_norm": 0.404296875, + "learning_rate": 0.00018989934346721901, + "loss": 0.582, + "step": 3054 + }, + { + "epoch": 0.14475242833451787, + "grad_norm": 0.490234375, + "learning_rate": 0.0001898928200786124, + "loss": 0.877, + "step": 3055 + }, + { + "epoch": 0.14479981047145227, + "grad_norm": 0.9765625, + "learning_rate": 0.0001898862946962788, + "loss": 1.1089, + "step": 3056 + }, + { + "epoch": 0.14484719260838663, + "grad_norm": 0.5703125, + "learning_rate": 0.00018987976732036293, + "loss": 0.8184, + "step": 3057 + }, + { + "epoch": 0.14489457474532103, + "grad_norm": 0.462890625, + "learning_rate": 0.00018987323795100955, + "loss": 0.9716, + "step": 3058 + }, + { + "epoch": 0.1449419568822554, + "grad_norm": 0.5546875, + "learning_rate": 0.00018986670658836346, + "loss": 0.8909, + "step": 3059 + }, + { + "epoch": 0.14498933901918976, + "grad_norm": 0.64453125, + "learning_rate": 0.0001898601732325696, + "loss": 1.1241, + "step": 3060 + }, + { + "epoch": 0.14503672115612415, + "grad_norm": 0.6328125, + "learning_rate": 0.00018985363788377274, + "loss": 0.8768, + "step": 3061 + }, + { + "epoch": 0.14508410329305851, + "grad_norm": 0.045654296875, + "learning_rate": 0.00018984710054211794, + "loss": 0.0018, + "step": 3062 + }, + { + "epoch": 0.14513148542999288, + "grad_norm": 0.55078125, + "learning_rate": 0.00018984056120775015, + "loss": 1.1582, + "step": 3063 + }, + { + "epoch": 0.14517886756692727, + "grad_norm": 0.7109375, + "learning_rate": 0.0001898340198808144, + "loss": 1.1511, + "step": 3064 + }, + { + "epoch": 0.14522624970386164, + "grad_norm": 0.75390625, + "learning_rate": 0.0001898274765614558, + "loss": 0.1091, + "step": 3065 + }, + { + "epoch": 0.14527363184079603, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018982093124981941, + "loss": 0.0181, + "step": 3066 + }, + { + "epoch": 0.1453210139777304, + "grad_norm": 0.73828125, + "learning_rate": 0.00018981438394605044, + "loss": 1.2479, + "step": 3067 + }, + { + "epoch": 0.14536839611466476, + "grad_norm": 0.494140625, + "learning_rate": 0.00018980783465029412, + "loss": 0.6365, + "step": 3068 + }, + { + "epoch": 0.14541577825159915, + "grad_norm": 0.71484375, + "learning_rate": 0.0001898012833626957, + "loss": 1.0946, + "step": 3069 + }, + { + "epoch": 0.14546316038853352, + "grad_norm": 0.66796875, + "learning_rate": 0.00018979473008340044, + "loss": 1.1289, + "step": 3070 + }, + { + "epoch": 0.1455105425254679, + "grad_norm": 0.251953125, + "learning_rate": 0.0001897881748125537, + "loss": 0.1688, + "step": 3071 + }, + { + "epoch": 0.14555792466240228, + "grad_norm": 0.578125, + "learning_rate": 0.00018978161755030094, + "loss": 1.2244, + "step": 3072 + }, + { + "epoch": 0.14560530679933664, + "grad_norm": 0.72265625, + "learning_rate": 0.00018977505829678747, + "loss": 1.4977, + "step": 3073 + }, + { + "epoch": 0.14565268893627104, + "grad_norm": 0.7109375, + "learning_rate": 0.00018976849705215883, + "loss": 1.0672, + "step": 3074 + }, + { + "epoch": 0.1457000710732054, + "grad_norm": 0.65234375, + "learning_rate": 0.0001897619338165606, + "loss": 0.1644, + "step": 3075 + }, + { + "epoch": 0.14574745321013977, + "grad_norm": 0.7109375, + "learning_rate": 0.00018975536859013826, + "loss": 1.2112, + "step": 3076 + }, + { + "epoch": 0.14579483534707416, + "grad_norm": 0.734375, + "learning_rate": 0.00018974880137303743, + "loss": 1.2029, + "step": 3077 + }, + { + "epoch": 0.14584221748400852, + "grad_norm": 0.57421875, + "learning_rate": 0.00018974223216540378, + "loss": 0.9227, + "step": 3078 + }, + { + "epoch": 0.14588959962094292, + "grad_norm": 0.6328125, + "learning_rate": 0.00018973566096738305, + "loss": 1.0532, + "step": 3079 + }, + { + "epoch": 0.14593698175787728, + "grad_norm": 0.99609375, + "learning_rate": 0.0001897290877791209, + "loss": 0.6377, + "step": 3080 + }, + { + "epoch": 0.14598436389481165, + "grad_norm": 0.6328125, + "learning_rate": 0.00018972251260076316, + "loss": 1.3745, + "step": 3081 + }, + { + "epoch": 0.14603174603174604, + "grad_norm": 0.546875, + "learning_rate": 0.0001897159354324557, + "loss": 0.8157, + "step": 3082 + }, + { + "epoch": 0.1460791281686804, + "grad_norm": 0.01007080078125, + "learning_rate": 0.00018970935627434432, + "loss": 0.0007, + "step": 3083 + }, + { + "epoch": 0.14612651030561477, + "grad_norm": 0.640625, + "learning_rate": 0.00018970277512657497, + "loss": 0.8798, + "step": 3084 + }, + { + "epoch": 0.14617389244254916, + "grad_norm": 1.15625, + "learning_rate": 0.00018969619198929363, + "loss": 0.2542, + "step": 3085 + }, + { + "epoch": 0.14622127457948353, + "grad_norm": 0.76953125, + "learning_rate": 0.00018968960686264628, + "loss": 1.2204, + "step": 3086 + }, + { + "epoch": 0.14626865671641792, + "grad_norm": 0.6328125, + "learning_rate": 0.00018968301974677898, + "loss": 1.1266, + "step": 3087 + }, + { + "epoch": 0.1463160388533523, + "grad_norm": 0.56640625, + "learning_rate": 0.00018967643064183784, + "loss": 0.9517, + "step": 3088 + }, + { + "epoch": 0.14636342099028665, + "grad_norm": 0.60546875, + "learning_rate": 0.00018966983954796896, + "loss": 0.1777, + "step": 3089 + }, + { + "epoch": 0.14641080312722105, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001896632464653186, + "loss": 0.0187, + "step": 3090 + }, + { + "epoch": 0.1464581852641554, + "grad_norm": 0.453125, + "learning_rate": 0.0001896566513940329, + "loss": 1.1612, + "step": 3091 + }, + { + "epoch": 0.14650556740108978, + "grad_norm": 0.6484375, + "learning_rate": 0.0001896500543342582, + "loss": 1.0695, + "step": 3092 + }, + { + "epoch": 0.14655294953802417, + "grad_norm": 0.62890625, + "learning_rate": 0.00018964345528614077, + "loss": 1.1036, + "step": 3093 + }, + { + "epoch": 0.14660033167495853, + "grad_norm": 0.62890625, + "learning_rate": 0.000189636854249827, + "loss": 1.5296, + "step": 3094 + }, + { + "epoch": 0.14664771381189293, + "grad_norm": 1.1953125, + "learning_rate": 0.00018963025122546324, + "loss": 0.0533, + "step": 3095 + }, + { + "epoch": 0.1466950959488273, + "grad_norm": 0.69140625, + "learning_rate": 0.000189623646213196, + "loss": 1.1633, + "step": 3096 + }, + { + "epoch": 0.14674247808576166, + "grad_norm": 0.279296875, + "learning_rate": 0.00018961703921317174, + "loss": 0.1973, + "step": 3097 + }, + { + "epoch": 0.14678986022269605, + "grad_norm": 0.62109375, + "learning_rate": 0.00018961043022553705, + "loss": 1.3722, + "step": 3098 + }, + { + "epoch": 0.14683724235963042, + "grad_norm": 0.2265625, + "learning_rate": 0.00018960381925043842, + "loss": 0.1504, + "step": 3099 + }, + { + "epoch": 0.1468846244965648, + "grad_norm": 0.52734375, + "learning_rate": 0.00018959720628802253, + "loss": 0.2417, + "step": 3100 + }, + { + "epoch": 0.14693200663349917, + "grad_norm": 0.5703125, + "learning_rate": 0.00018959059133843607, + "loss": 0.8952, + "step": 3101 + }, + { + "epoch": 0.14697938877043354, + "grad_norm": 0.703125, + "learning_rate": 0.00018958397440182567, + "loss": 1.1358, + "step": 3102 + }, + { + "epoch": 0.14702677090736793, + "grad_norm": 0.53125, + "learning_rate": 0.00018957735547833816, + "loss": 1.035, + "step": 3103 + }, + { + "epoch": 0.1470741530443023, + "grad_norm": 0.8671875, + "learning_rate": 0.00018957073456812032, + "loss": 0.1452, + "step": 3104 + }, + { + "epoch": 0.14712153518123666, + "grad_norm": 0.4609375, + "learning_rate": 0.000189564111671319, + "loss": 0.9038, + "step": 3105 + }, + { + "epoch": 0.14716891731817106, + "grad_norm": 0.498046875, + "learning_rate": 0.00018955748678808106, + "loss": 0.7211, + "step": 3106 + }, + { + "epoch": 0.14721629945510542, + "grad_norm": 0.69921875, + "learning_rate": 0.0001895508599185535, + "loss": 0.0434, + "step": 3107 + }, + { + "epoch": 0.1472636815920398, + "grad_norm": 0.6328125, + "learning_rate": 0.00018954423106288322, + "loss": 0.996, + "step": 3108 + }, + { + "epoch": 0.14731106372897418, + "grad_norm": 0.59375, + "learning_rate": 0.0001895376002212173, + "loss": 1.033, + "step": 3109 + }, + { + "epoch": 0.14735844586590854, + "grad_norm": 0.65625, + "learning_rate": 0.00018953096739370272, + "loss": 1.0574, + "step": 3110 + }, + { + "epoch": 0.14740582800284294, + "grad_norm": 0.53515625, + "learning_rate": 0.0001895243325804867, + "loss": 1.3177, + "step": 3111 + }, + { + "epoch": 0.1474532101397773, + "grad_norm": 0.46484375, + "learning_rate": 0.0001895176957817163, + "loss": 0.1774, + "step": 3112 + }, + { + "epoch": 0.14750059227671167, + "grad_norm": 0.47265625, + "learning_rate": 0.0001895110569975388, + "loss": 0.6744, + "step": 3113 + }, + { + "epoch": 0.14754797441364606, + "grad_norm": 0.44140625, + "learning_rate": 0.0001895044162281014, + "loss": 0.9408, + "step": 3114 + }, + { + "epoch": 0.14759535655058043, + "grad_norm": 0.75, + "learning_rate": 0.00018949777347355138, + "loss": 0.8715, + "step": 3115 + }, + { + "epoch": 0.14764273868751482, + "grad_norm": 1.2265625, + "learning_rate": 0.00018949112873403604, + "loss": 0.6799, + "step": 3116 + }, + { + "epoch": 0.14769012082444918, + "grad_norm": 0.6640625, + "learning_rate": 0.0001894844820097028, + "loss": 0.7846, + "step": 3117 + }, + { + "epoch": 0.14773750296138355, + "grad_norm": 0.5390625, + "learning_rate": 0.00018947783330069908, + "loss": 0.7918, + "step": 3118 + }, + { + "epoch": 0.14778488509831794, + "grad_norm": 0.58984375, + "learning_rate": 0.0001894711826071723, + "loss": 1.1212, + "step": 3119 + }, + { + "epoch": 0.1478322672352523, + "grad_norm": 1.421875, + "learning_rate": 0.00018946452992927, + "loss": 1.0106, + "step": 3120 + }, + { + "epoch": 0.14787964937218667, + "grad_norm": 0.81640625, + "learning_rate": 0.00018945787526713974, + "loss": 0.8408, + "step": 3121 + }, + { + "epoch": 0.14792703150912107, + "grad_norm": 0.228515625, + "learning_rate": 0.00018945121862092907, + "loss": 0.1438, + "step": 3122 + }, + { + "epoch": 0.14797441364605543, + "grad_norm": 0.6875, + "learning_rate": 0.00018944455999078566, + "loss": 1.0483, + "step": 3123 + }, + { + "epoch": 0.14802179578298982, + "grad_norm": 0.76953125, + "learning_rate": 0.00018943789937685718, + "loss": 0.7446, + "step": 3124 + }, + { + "epoch": 0.1480691779199242, + "grad_norm": 0.625, + "learning_rate": 0.00018943123677929135, + "loss": 1.4493, + "step": 3125 + }, + { + "epoch": 0.14811656005685855, + "grad_norm": 0.62890625, + "learning_rate": 0.00018942457219823594, + "loss": 1.049, + "step": 3126 + }, + { + "epoch": 0.14816394219379295, + "grad_norm": 0.48828125, + "learning_rate": 0.0001894179056338388, + "loss": 1.1085, + "step": 3127 + }, + { + "epoch": 0.1482113243307273, + "grad_norm": 0.337890625, + "learning_rate": 0.00018941123708624772, + "loss": 0.0172, + "step": 3128 + }, + { + "epoch": 0.1482587064676617, + "grad_norm": 0.60546875, + "learning_rate": 0.00018940456655561064, + "loss": 0.0563, + "step": 3129 + }, + { + "epoch": 0.14830608860459607, + "grad_norm": 0.94921875, + "learning_rate": 0.00018939789404207556, + "loss": 0.3327, + "step": 3130 + }, + { + "epoch": 0.14835347074153044, + "grad_norm": 0.458984375, + "learning_rate": 0.00018939121954579036, + "loss": 0.8707, + "step": 3131 + }, + { + "epoch": 0.14840085287846483, + "grad_norm": 1.0703125, + "learning_rate": 0.00018938454306690315, + "loss": 0.489, + "step": 3132 + }, + { + "epoch": 0.1484482350153992, + "grad_norm": 0.421875, + "learning_rate": 0.00018937786460556196, + "loss": 0.1558, + "step": 3133 + }, + { + "epoch": 0.14849561715233356, + "grad_norm": 0.578125, + "learning_rate": 0.00018937118416191494, + "loss": 1.3409, + "step": 3134 + }, + { + "epoch": 0.14854299928926795, + "grad_norm": 0.287109375, + "learning_rate": 0.00018936450173611026, + "loss": 0.0315, + "step": 3135 + }, + { + "epoch": 0.14859038142620232, + "grad_norm": 0.203125, + "learning_rate": 0.0001893578173282961, + "loss": 0.1632, + "step": 3136 + }, + { + "epoch": 0.1486377635631367, + "grad_norm": 0.357421875, + "learning_rate": 0.00018935113093862074, + "loss": 0.031, + "step": 3137 + }, + { + "epoch": 0.14868514570007108, + "grad_norm": 0.6484375, + "learning_rate": 0.00018934444256723246, + "loss": 0.6992, + "step": 3138 + }, + { + "epoch": 0.14873252783700544, + "grad_norm": 0.419921875, + "learning_rate": 0.00018933775221427964, + "loss": 0.453, + "step": 3139 + }, + { + "epoch": 0.14877990997393983, + "grad_norm": 0.5078125, + "learning_rate": 0.0001893310598799106, + "loss": 0.9826, + "step": 3140 + }, + { + "epoch": 0.1488272921108742, + "grad_norm": 0.6484375, + "learning_rate": 0.00018932436556427383, + "loss": 0.1852, + "step": 3141 + }, + { + "epoch": 0.14887467424780856, + "grad_norm": 0.578125, + "learning_rate": 0.00018931766926751778, + "loss": 1.0186, + "step": 3142 + }, + { + "epoch": 0.14892205638474296, + "grad_norm": 0.58984375, + "learning_rate": 0.00018931097098979095, + "loss": 0.8663, + "step": 3143 + }, + { + "epoch": 0.14896943852167732, + "grad_norm": 0.52734375, + "learning_rate": 0.00018930427073124187, + "loss": 1.1418, + "step": 3144 + }, + { + "epoch": 0.14901682065861172, + "grad_norm": 0.423828125, + "learning_rate": 0.00018929756849201925, + "loss": 0.0588, + "step": 3145 + }, + { + "epoch": 0.14906420279554608, + "grad_norm": 0.412109375, + "learning_rate": 0.00018929086427227164, + "loss": 0.5915, + "step": 3146 + }, + { + "epoch": 0.14911158493248045, + "grad_norm": 0.66015625, + "learning_rate": 0.00018928415807214778, + "loss": 0.9467, + "step": 3147 + }, + { + "epoch": 0.14915896706941484, + "grad_norm": 0.73828125, + "learning_rate": 0.0001892774498917964, + "loss": 0.8641, + "step": 3148 + }, + { + "epoch": 0.1492063492063492, + "grad_norm": 0.236328125, + "learning_rate": 0.00018927073973136626, + "loss": 0.1645, + "step": 3149 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5859375, + "learning_rate": 0.00018926402759100622, + "loss": 1.561, + "step": 3150 + }, + { + "epoch": 0.14930111348021796, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018925731347086512, + "loss": 0.0236, + "step": 3151 + }, + { + "epoch": 0.14934849561715233, + "grad_norm": 0.51171875, + "learning_rate": 0.00018925059737109188, + "loss": 0.66, + "step": 3152 + }, + { + "epoch": 0.14939587775408672, + "grad_norm": 0.57421875, + "learning_rate": 0.00018924387929183546, + "loss": 1.2031, + "step": 3153 + }, + { + "epoch": 0.14944325989102109, + "grad_norm": 0.5859375, + "learning_rate": 0.00018923715923324484, + "loss": 0.9212, + "step": 3154 + }, + { + "epoch": 0.14949064202795545, + "grad_norm": 0.96484375, + "learning_rate": 0.0001892304371954691, + "loss": 0.5071, + "step": 3155 + }, + { + "epoch": 0.14953802416488984, + "grad_norm": 0.65625, + "learning_rate": 0.0001892237131786573, + "loss": 1.53, + "step": 3156 + }, + { + "epoch": 0.1495854063018242, + "grad_norm": 0.08544921875, + "learning_rate": 0.00018921698718295856, + "loss": 0.003, + "step": 3157 + }, + { + "epoch": 0.1496327884387586, + "grad_norm": 0.67578125, + "learning_rate": 0.0001892102592085221, + "loss": 0.8592, + "step": 3158 + }, + { + "epoch": 0.14968017057569297, + "grad_norm": 0.6171875, + "learning_rate": 0.00018920352925549708, + "loss": 1.1312, + "step": 3159 + }, + { + "epoch": 0.14972755271262733, + "grad_norm": 0.57421875, + "learning_rate": 0.00018919679732403284, + "loss": 0.9752, + "step": 3160 + }, + { + "epoch": 0.14977493484956172, + "grad_norm": 0.66796875, + "learning_rate": 0.0001891900634142786, + "loss": 0.942, + "step": 3161 + }, + { + "epoch": 0.1498223169864961, + "grad_norm": 0.65625, + "learning_rate": 0.0001891833275263838, + "loss": 1.1579, + "step": 3162 + }, + { + "epoch": 0.14986969912343046, + "grad_norm": 0.345703125, + "learning_rate": 0.00018917658966049778, + "loss": 0.173, + "step": 3163 + }, + { + "epoch": 0.14991708126036485, + "grad_norm": 0.7421875, + "learning_rate": 0.00018916984981676995, + "loss": 1.3253, + "step": 3164 + }, + { + "epoch": 0.1499644633972992, + "grad_norm": 0.1416015625, + "learning_rate": 0.00018916310799534986, + "loss": 0.0086, + "step": 3165 + }, + { + "epoch": 0.1500118455342336, + "grad_norm": 0.51953125, + "learning_rate": 0.00018915636419638703, + "loss": 1.1239, + "step": 3166 + }, + { + "epoch": 0.15005922767116797, + "grad_norm": 0.59375, + "learning_rate": 0.00018914961842003095, + "loss": 1.4062, + "step": 3167 + }, + { + "epoch": 0.15010660980810234, + "grad_norm": 1.515625, + "learning_rate": 0.00018914287066643134, + "loss": 0.9106, + "step": 3168 + }, + { + "epoch": 0.15015399194503673, + "grad_norm": 0.5390625, + "learning_rate": 0.00018913612093573778, + "loss": 0.9539, + "step": 3169 + }, + { + "epoch": 0.1502013740819711, + "grad_norm": 0.216796875, + "learning_rate": 0.0001891293692281, + "loss": 0.1564, + "step": 3170 + }, + { + "epoch": 0.15024875621890546, + "grad_norm": 0.36328125, + "learning_rate": 0.00018912261554366778, + "loss": 0.1612, + "step": 3171 + }, + { + "epoch": 0.15029613835583985, + "grad_norm": 0.66015625, + "learning_rate": 0.00018911585988259084, + "loss": 0.8047, + "step": 3172 + }, + { + "epoch": 0.15034352049277422, + "grad_norm": 0.67578125, + "learning_rate": 0.00018910910224501906, + "loss": 1.1706, + "step": 3173 + }, + { + "epoch": 0.1503909026297086, + "grad_norm": 0.66796875, + "learning_rate": 0.0001891023426311023, + "loss": 0.8821, + "step": 3174 + }, + { + "epoch": 0.15043828476664298, + "grad_norm": 0.984375, + "learning_rate": 0.0001890955810409905, + "loss": 0.4706, + "step": 3175 + }, + { + "epoch": 0.15048566690357734, + "grad_norm": 0.54296875, + "learning_rate": 0.00018908881747483364, + "loss": 0.8414, + "step": 3176 + }, + { + "epoch": 0.15053304904051173, + "grad_norm": 0.5234375, + "learning_rate": 0.00018908205193278165, + "loss": 0.9074, + "step": 3177 + }, + { + "epoch": 0.1505804311774461, + "grad_norm": 0.5234375, + "learning_rate": 0.00018907528441498465, + "loss": 0.1279, + "step": 3178 + }, + { + "epoch": 0.15062781331438047, + "grad_norm": 0.57421875, + "learning_rate": 0.00018906851492159275, + "loss": 0.0764, + "step": 3179 + }, + { + "epoch": 0.15067519545131486, + "grad_norm": 0.54296875, + "learning_rate": 0.00018906174345275603, + "loss": 0.9166, + "step": 3180 + }, + { + "epoch": 0.15072257758824922, + "grad_norm": 0.7578125, + "learning_rate": 0.00018905497000862474, + "loss": 0.9086, + "step": 3181 + }, + { + "epoch": 0.15076995972518362, + "grad_norm": 0.546875, + "learning_rate": 0.000189048194589349, + "loss": 0.5789, + "step": 3182 + }, + { + "epoch": 0.15081734186211798, + "grad_norm": 0.69140625, + "learning_rate": 0.00018904141719507922, + "loss": 0.7374, + "step": 3183 + }, + { + "epoch": 0.15086472399905235, + "grad_norm": 0.7578125, + "learning_rate": 0.00018903463782596563, + "loss": 0.9864, + "step": 3184 + }, + { + "epoch": 0.15091210613598674, + "grad_norm": 0.58203125, + "learning_rate": 0.00018902785648215858, + "loss": 0.9219, + "step": 3185 + }, + { + "epoch": 0.1509594882729211, + "grad_norm": 0.10302734375, + "learning_rate": 0.00018902107316380855, + "loss": 0.0079, + "step": 3186 + }, + { + "epoch": 0.1510068704098555, + "grad_norm": 0.71484375, + "learning_rate": 0.00018901428787106592, + "loss": 0.0924, + "step": 3187 + }, + { + "epoch": 0.15105425254678986, + "grad_norm": 0.6640625, + "learning_rate": 0.00018900750060408118, + "loss": 0.8376, + "step": 3188 + }, + { + "epoch": 0.15110163468372423, + "grad_norm": 0.6484375, + "learning_rate": 0.00018900071136300492, + "loss": 1.1438, + "step": 3189 + }, + { + "epoch": 0.15114901682065862, + "grad_norm": 0.171875, + "learning_rate": 0.00018899392014798766, + "loss": 0.009, + "step": 3190 + }, + { + "epoch": 0.151196398957593, + "grad_norm": 0.27734375, + "learning_rate": 0.00018898712695918004, + "loss": 0.1356, + "step": 3191 + }, + { + "epoch": 0.15124378109452735, + "grad_norm": 0.63671875, + "learning_rate": 0.00018898033179673274, + "loss": 1.1841, + "step": 3192 + }, + { + "epoch": 0.15129116323146174, + "grad_norm": 0.58984375, + "learning_rate": 0.00018897353466079645, + "loss": 1.0475, + "step": 3193 + }, + { + "epoch": 0.1513385453683961, + "grad_norm": 0.80078125, + "learning_rate": 0.00018896673555152195, + "loss": 0.2508, + "step": 3194 + }, + { + "epoch": 0.1513859275053305, + "grad_norm": 0.703125, + "learning_rate": 0.00018895993446905998, + "loss": 0.8208, + "step": 3195 + }, + { + "epoch": 0.15143330964226487, + "grad_norm": 0.333984375, + "learning_rate": 0.00018895313141356143, + "loss": 0.0463, + "step": 3196 + }, + { + "epoch": 0.15148069177919923, + "grad_norm": 0.69140625, + "learning_rate": 0.00018894632638517716, + "loss": 0.0668, + "step": 3197 + }, + { + "epoch": 0.15152807391613363, + "grad_norm": 0.53515625, + "learning_rate": 0.00018893951938405817, + "loss": 0.8999, + "step": 3198 + }, + { + "epoch": 0.151575456053068, + "grad_norm": 0.65625, + "learning_rate": 0.00018893271041035533, + "loss": 1.1003, + "step": 3199 + }, + { + "epoch": 0.15162283819000236, + "grad_norm": 0.6015625, + "learning_rate": 0.0001889258994642197, + "loss": 1.0037, + "step": 3200 + }, + { + "epoch": 0.15167022032693675, + "grad_norm": 0.6484375, + "learning_rate": 0.00018891908654580238, + "loss": 0.8504, + "step": 3201 + }, + { + "epoch": 0.15171760246387112, + "grad_norm": 0.46484375, + "learning_rate": 0.00018891227165525437, + "loss": 0.1001, + "step": 3202 + }, + { + "epoch": 0.1517649846008055, + "grad_norm": 0.66796875, + "learning_rate": 0.00018890545479272692, + "loss": 0.9516, + "step": 3203 + }, + { + "epoch": 0.15181236673773987, + "grad_norm": 0.5703125, + "learning_rate": 0.00018889863595837118, + "loss": 1.1313, + "step": 3204 + }, + { + "epoch": 0.15185974887467424, + "grad_norm": 0.76953125, + "learning_rate": 0.00018889181515233835, + "loss": 1.1998, + "step": 3205 + }, + { + "epoch": 0.15190713101160863, + "grad_norm": 0.578125, + "learning_rate": 0.0001888849923747798, + "loss": 0.2061, + "step": 3206 + }, + { + "epoch": 0.151954513148543, + "grad_norm": 0.5546875, + "learning_rate": 0.00018887816762584676, + "loss": 0.8141, + "step": 3207 + }, + { + "epoch": 0.15200189528547736, + "grad_norm": 0.65234375, + "learning_rate": 0.00018887134090569063, + "loss": 1.5754, + "step": 3208 + }, + { + "epoch": 0.15204927742241175, + "grad_norm": 0.6640625, + "learning_rate": 0.00018886451221446283, + "loss": 1.2315, + "step": 3209 + }, + { + "epoch": 0.15209665955934612, + "grad_norm": 0.61328125, + "learning_rate": 0.00018885768155231481, + "loss": 1.3574, + "step": 3210 + }, + { + "epoch": 0.1521440416962805, + "grad_norm": 0.62109375, + "learning_rate": 0.00018885084891939803, + "loss": 1.0823, + "step": 3211 + }, + { + "epoch": 0.15219142383321488, + "grad_norm": 0.62890625, + "learning_rate": 0.00018884401431586408, + "loss": 0.7045, + "step": 3212 + }, + { + "epoch": 0.15223880597014924, + "grad_norm": 0.609375, + "learning_rate": 0.00018883717774186454, + "loss": 0.9636, + "step": 3213 + }, + { + "epoch": 0.15228618810708364, + "grad_norm": 0.248046875, + "learning_rate": 0.000188830339197551, + "loss": 0.1879, + "step": 3214 + }, + { + "epoch": 0.152333570244018, + "grad_norm": 0.83203125, + "learning_rate": 0.00018882349868307516, + "loss": 0.955, + "step": 3215 + }, + { + "epoch": 0.1523809523809524, + "grad_norm": 0.365234375, + "learning_rate": 0.00018881665619858873, + "loss": 0.2655, + "step": 3216 + }, + { + "epoch": 0.15242833451788676, + "grad_norm": 0.45703125, + "learning_rate": 0.00018880981174424348, + "loss": 0.8976, + "step": 3217 + }, + { + "epoch": 0.15247571665482113, + "grad_norm": 1.109375, + "learning_rate": 0.0001888029653201912, + "loss": 0.0646, + "step": 3218 + }, + { + "epoch": 0.15252309879175552, + "grad_norm": 0.88671875, + "learning_rate": 0.00018879611692658373, + "loss": 0.2538, + "step": 3219 + }, + { + "epoch": 0.15257048092868988, + "grad_norm": 0.578125, + "learning_rate": 0.00018878926656357297, + "loss": 1.1655, + "step": 3220 + }, + { + "epoch": 0.15261786306562425, + "grad_norm": 0.6640625, + "learning_rate": 0.00018878241423131084, + "loss": 1.2484, + "step": 3221 + }, + { + "epoch": 0.15266524520255864, + "grad_norm": 0.5390625, + "learning_rate": 0.00018877555992994935, + "loss": 0.7451, + "step": 3222 + }, + { + "epoch": 0.152712627339493, + "grad_norm": 0.83984375, + "learning_rate": 0.00018876870365964048, + "loss": 0.2073, + "step": 3223 + }, + { + "epoch": 0.1527600094764274, + "grad_norm": 0.6171875, + "learning_rate": 0.00018876184542053633, + "loss": 1.1962, + "step": 3224 + }, + { + "epoch": 0.15280739161336176, + "grad_norm": 0.59375, + "learning_rate": 0.000188754985212789, + "loss": 1.0728, + "step": 3225 + }, + { + "epoch": 0.15285477375029613, + "grad_norm": 0.40625, + "learning_rate": 0.0001887481230365506, + "loss": 0.671, + "step": 3226 + }, + { + "epoch": 0.15290215588723052, + "grad_norm": 0.4453125, + "learning_rate": 0.00018874125889197337, + "loss": 1.2873, + "step": 3227 + }, + { + "epoch": 0.1529495380241649, + "grad_norm": 0.8046875, + "learning_rate": 0.00018873439277920957, + "loss": 0.7527, + "step": 3228 + }, + { + "epoch": 0.15299692016109925, + "grad_norm": 0.8125, + "learning_rate": 0.00018872752469841145, + "loss": 0.3701, + "step": 3229 + }, + { + "epoch": 0.15304430229803365, + "grad_norm": 0.447265625, + "learning_rate": 0.0001887206546497313, + "loss": 1.3629, + "step": 3230 + }, + { + "epoch": 0.153091684434968, + "grad_norm": 0.5078125, + "learning_rate": 0.0001887137826333216, + "loss": 0.8888, + "step": 3231 + }, + { + "epoch": 0.1531390665719024, + "grad_norm": 0.61328125, + "learning_rate": 0.00018870690864933467, + "loss": 1.2509, + "step": 3232 + }, + { + "epoch": 0.15318644870883677, + "grad_norm": 0.453125, + "learning_rate": 0.00018870003269792296, + "loss": 0.5475, + "step": 3233 + }, + { + "epoch": 0.15323383084577114, + "grad_norm": 0.59375, + "learning_rate": 0.00018869315477923904, + "loss": 1.0475, + "step": 3234 + }, + { + "epoch": 0.15328121298270553, + "grad_norm": 0.3125, + "learning_rate": 0.00018868627489343538, + "loss": 0.1699, + "step": 3235 + }, + { + "epoch": 0.1533285951196399, + "grad_norm": 0.072265625, + "learning_rate": 0.00018867939304066463, + "loss": 0.0069, + "step": 3236 + }, + { + "epoch": 0.15337597725657426, + "grad_norm": 0.56640625, + "learning_rate": 0.0001886725092210794, + "loss": 0.9158, + "step": 3237 + }, + { + "epoch": 0.15342335939350865, + "grad_norm": 1.5234375, + "learning_rate": 0.00018866562343483237, + "loss": 0.7676, + "step": 3238 + }, + { + "epoch": 0.15347074153044302, + "grad_norm": 0.484375, + "learning_rate": 0.00018865873568207623, + "loss": 0.9575, + "step": 3239 + }, + { + "epoch": 0.1535181236673774, + "grad_norm": 0.3828125, + "learning_rate": 0.0001886518459629638, + "loss": 0.2172, + "step": 3240 + }, + { + "epoch": 0.15356550580431177, + "grad_norm": 0.71484375, + "learning_rate": 0.00018864495427764783, + "loss": 0.9754, + "step": 3241 + }, + { + "epoch": 0.15361288794124614, + "grad_norm": 0.625, + "learning_rate": 0.0001886380606262812, + "loss": 1.0084, + "step": 3242 + }, + { + "epoch": 0.15366027007818053, + "grad_norm": 0.59375, + "learning_rate": 0.00018863116500901682, + "loss": 1.1972, + "step": 3243 + }, + { + "epoch": 0.1537076522151149, + "grad_norm": 0.578125, + "learning_rate": 0.00018862426742600756, + "loss": 0.046, + "step": 3244 + }, + { + "epoch": 0.1537550343520493, + "grad_norm": 0.384765625, + "learning_rate": 0.0001886173678774065, + "loss": 0.0177, + "step": 3245 + }, + { + "epoch": 0.15380241648898366, + "grad_norm": 0.66015625, + "learning_rate": 0.00018861046636336657, + "loss": 1.08, + "step": 3246 + }, + { + "epoch": 0.15384979862591802, + "grad_norm": 0.6796875, + "learning_rate": 0.0001886035628840409, + "loss": 0.9977, + "step": 3247 + }, + { + "epoch": 0.15389718076285241, + "grad_norm": 0.48828125, + "learning_rate": 0.0001885966574395826, + "loss": 0.55, + "step": 3248 + }, + { + "epoch": 0.15394456289978678, + "grad_norm": 0.6640625, + "learning_rate": 0.00018858975003014476, + "loss": 1.112, + "step": 3249 + }, + { + "epoch": 0.15399194503672115, + "grad_norm": 0.5625, + "learning_rate": 0.00018858284065588065, + "loss": 1.4552, + "step": 3250 + }, + { + "epoch": 0.15403932717365554, + "grad_norm": 0.6796875, + "learning_rate": 0.00018857592931694348, + "loss": 1.1829, + "step": 3251 + }, + { + "epoch": 0.1540867093105899, + "grad_norm": 0.78125, + "learning_rate": 0.00018856901601348657, + "loss": 0.1491, + "step": 3252 + }, + { + "epoch": 0.1541340914475243, + "grad_norm": 0.5390625, + "learning_rate": 0.0001885621007456632, + "loss": 0.7196, + "step": 3253 + }, + { + "epoch": 0.15418147358445866, + "grad_norm": 0.59765625, + "learning_rate": 0.00018855518351362677, + "loss": 1.1786, + "step": 3254 + }, + { + "epoch": 0.15422885572139303, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001885482643175307, + "loss": 0.1435, + "step": 3255 + }, + { + "epoch": 0.15427623785832742, + "grad_norm": 0.59765625, + "learning_rate": 0.00018854134315752846, + "loss": 0.7636, + "step": 3256 + }, + { + "epoch": 0.15432361999526178, + "grad_norm": 0.400390625, + "learning_rate": 0.00018853442003377352, + "loss": 0.7926, + "step": 3257 + }, + { + "epoch": 0.15437100213219615, + "grad_norm": 0.6328125, + "learning_rate": 0.00018852749494641943, + "loss": 1.0515, + "step": 3258 + }, + { + "epoch": 0.15441838426913054, + "grad_norm": 1.2265625, + "learning_rate": 0.00018852056789561982, + "loss": 0.7984, + "step": 3259 + }, + { + "epoch": 0.1544657664060649, + "grad_norm": 0.82421875, + "learning_rate": 0.00018851363888152832, + "loss": 1.0246, + "step": 3260 + }, + { + "epoch": 0.1545131485429993, + "grad_norm": 0.5625, + "learning_rate": 0.00018850670790429854, + "loss": 0.9434, + "step": 3261 + }, + { + "epoch": 0.15456053067993367, + "grad_norm": 0.5859375, + "learning_rate": 0.0001884997749640843, + "loss": 0.8916, + "step": 3262 + }, + { + "epoch": 0.15460791281686803, + "grad_norm": 1.03125, + "learning_rate": 0.0001884928400610393, + "loss": 0.81, + "step": 3263 + }, + { + "epoch": 0.15465529495380242, + "grad_norm": 0.8203125, + "learning_rate": 0.00018848590319531735, + "loss": 1.3236, + "step": 3264 + }, + { + "epoch": 0.1547026770907368, + "grad_norm": 0.5703125, + "learning_rate": 0.00018847896436707234, + "loss": 1.135, + "step": 3265 + }, + { + "epoch": 0.15475005922767116, + "grad_norm": 0.380859375, + "learning_rate": 0.00018847202357645814, + "loss": 0.0196, + "step": 3266 + }, + { + "epoch": 0.15479744136460555, + "grad_norm": 0.53125, + "learning_rate": 0.00018846508082362865, + "loss": 0.8033, + "step": 3267 + }, + { + "epoch": 0.1548448235015399, + "grad_norm": 0.51171875, + "learning_rate": 0.00018845813610873796, + "loss": 0.871, + "step": 3268 + }, + { + "epoch": 0.1548922056384743, + "grad_norm": 0.3828125, + "learning_rate": 0.00018845118943194, + "loss": 0.0952, + "step": 3269 + }, + { + "epoch": 0.15493958777540867, + "grad_norm": 0.47265625, + "learning_rate": 0.0001884442407933889, + "loss": 1.1118, + "step": 3270 + }, + { + "epoch": 0.15498696991234304, + "grad_norm": 1.0234375, + "learning_rate": 0.0001884372901932387, + "loss": 0.2118, + "step": 3271 + }, + { + "epoch": 0.15503435204927743, + "grad_norm": 0.5390625, + "learning_rate": 0.00018843033763164363, + "loss": 0.1658, + "step": 3272 + }, + { + "epoch": 0.1550817341862118, + "grad_norm": 0.443359375, + "learning_rate": 0.00018842338310875786, + "loss": 0.8745, + "step": 3273 + }, + { + "epoch": 0.15512911632314616, + "grad_norm": 0.546875, + "learning_rate": 0.00018841642662473565, + "loss": 0.0557, + "step": 3274 + }, + { + "epoch": 0.15517649846008055, + "grad_norm": 0.8046875, + "learning_rate": 0.00018840946817973126, + "loss": 0.1246, + "step": 3275 + }, + { + "epoch": 0.15522388059701492, + "grad_norm": 0.625, + "learning_rate": 0.00018840250777389902, + "loss": 0.8924, + "step": 3276 + }, + { + "epoch": 0.1552712627339493, + "grad_norm": 0.48828125, + "learning_rate": 0.00018839554540739335, + "loss": 0.5771, + "step": 3277 + }, + { + "epoch": 0.15531864487088368, + "grad_norm": 0.65234375, + "learning_rate": 0.00018838858108036864, + "loss": 1.1268, + "step": 3278 + }, + { + "epoch": 0.15536602700781804, + "grad_norm": 0.609375, + "learning_rate": 0.00018838161479297933, + "loss": 1.1074, + "step": 3279 + }, + { + "epoch": 0.15541340914475243, + "grad_norm": 0.65234375, + "learning_rate": 0.00018837464654537995, + "loss": 1.6613, + "step": 3280 + }, + { + "epoch": 0.1554607912816868, + "grad_norm": 0.5234375, + "learning_rate": 0.00018836767633772502, + "loss": 1.0114, + "step": 3281 + }, + { + "epoch": 0.1555081734186212, + "grad_norm": 0.67578125, + "learning_rate": 0.0001883607041701692, + "loss": 1.2269, + "step": 3282 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.54296875, + "learning_rate": 0.00018835373004286702, + "loss": 1.402, + "step": 3283 + }, + { + "epoch": 0.15560293769248992, + "grad_norm": 0.59375, + "learning_rate": 0.00018834675395597324, + "loss": 1.0193, + "step": 3284 + }, + { + "epoch": 0.15565031982942432, + "grad_norm": 0.5859375, + "learning_rate": 0.00018833977590964257, + "loss": 1.3892, + "step": 3285 + }, + { + "epoch": 0.15569770196635868, + "grad_norm": 0.376953125, + "learning_rate": 0.0001883327959040298, + "loss": 0.4987, + "step": 3286 + }, + { + "epoch": 0.15574508410329305, + "grad_norm": 0.60546875, + "learning_rate": 0.00018832581393928965, + "loss": 1.3572, + "step": 3287 + }, + { + "epoch": 0.15579246624022744, + "grad_norm": 0.50390625, + "learning_rate": 0.00018831883001557706, + "loss": 1.2606, + "step": 3288 + }, + { + "epoch": 0.1558398483771618, + "grad_norm": 0.49609375, + "learning_rate": 0.0001883118441330469, + "loss": 1.7267, + "step": 3289 + }, + { + "epoch": 0.1558872305140962, + "grad_norm": 0.56640625, + "learning_rate": 0.00018830485629185405, + "loss": 1.2203, + "step": 3290 + }, + { + "epoch": 0.15593461265103056, + "grad_norm": 0.91796875, + "learning_rate": 0.00018829786649215358, + "loss": 0.3889, + "step": 3291 + }, + { + "epoch": 0.15598199478796493, + "grad_norm": 0.67578125, + "learning_rate": 0.00018829087473410048, + "loss": 1.3834, + "step": 3292 + }, + { + "epoch": 0.15602937692489932, + "grad_norm": 0.7109375, + "learning_rate": 0.00018828388101784985, + "loss": 1.2501, + "step": 3293 + }, + { + "epoch": 0.1560767590618337, + "grad_norm": 0.4609375, + "learning_rate": 0.0001882768853435567, + "loss": 0.1653, + "step": 3294 + }, + { + "epoch": 0.15612414119876805, + "grad_norm": 0.578125, + "learning_rate": 0.00018826988771137635, + "loss": 0.2747, + "step": 3295 + }, + { + "epoch": 0.15617152333570244, + "grad_norm": 0.5390625, + "learning_rate": 0.00018826288812146387, + "loss": 0.7825, + "step": 3296 + }, + { + "epoch": 0.1562189054726368, + "grad_norm": 0.62109375, + "learning_rate": 0.00018825588657397452, + "loss": 0.854, + "step": 3297 + }, + { + "epoch": 0.1562662876095712, + "grad_norm": 0.443359375, + "learning_rate": 0.00018824888306906366, + "loss": 1.0428, + "step": 3298 + }, + { + "epoch": 0.15631366974650557, + "grad_norm": 0.71484375, + "learning_rate": 0.00018824187760688654, + "loss": 1.6231, + "step": 3299 + }, + { + "epoch": 0.15636105188343993, + "grad_norm": 0.478515625, + "learning_rate": 0.0001882348701875986, + "loss": 0.2232, + "step": 3300 + }, + { + "epoch": 0.15640843402037433, + "grad_norm": 0.53515625, + "learning_rate": 0.00018822786081135518, + "loss": 0.8423, + "step": 3301 + }, + { + "epoch": 0.1564558161573087, + "grad_norm": 0.61328125, + "learning_rate": 0.00018822084947831181, + "loss": 0.9263, + "step": 3302 + }, + { + "epoch": 0.15650319829424306, + "grad_norm": 0.376953125, + "learning_rate": 0.00018821383618862397, + "loss": 0.1621, + "step": 3303 + }, + { + "epoch": 0.15655058043117745, + "grad_norm": 0.58984375, + "learning_rate": 0.00018820682094244717, + "loss": 1.0773, + "step": 3304 + }, + { + "epoch": 0.15659796256811181, + "grad_norm": 0.216796875, + "learning_rate": 0.00018819980373993705, + "loss": 0.0241, + "step": 3305 + }, + { + "epoch": 0.1566453447050462, + "grad_norm": 0.5859375, + "learning_rate": 0.00018819278458124923, + "loss": 1.2362, + "step": 3306 + }, + { + "epoch": 0.15669272684198057, + "grad_norm": 0.62109375, + "learning_rate": 0.0001881857634665394, + "loss": 1.1092, + "step": 3307 + }, + { + "epoch": 0.15674010897891494, + "grad_norm": 0.73046875, + "learning_rate": 0.00018817874039596326, + "loss": 0.7047, + "step": 3308 + }, + { + "epoch": 0.15678749111584933, + "grad_norm": 0.095703125, + "learning_rate": 0.00018817171536967658, + "loss": 0.0124, + "step": 3309 + }, + { + "epoch": 0.1568348732527837, + "grad_norm": 1.046875, + "learning_rate": 0.00018816468838783518, + "loss": 0.4429, + "step": 3310 + }, + { + "epoch": 0.1568822553897181, + "grad_norm": 0.51171875, + "learning_rate": 0.0001881576594505949, + "loss": 0.8604, + "step": 3311 + }, + { + "epoch": 0.15692963752665245, + "grad_norm": 0.3359375, + "learning_rate": 0.0001881506285581116, + "loss": 0.155, + "step": 3312 + }, + { + "epoch": 0.15697701966358682, + "grad_norm": 0.546875, + "learning_rate": 0.0001881435957105413, + "loss": 0.875, + "step": 3313 + }, + { + "epoch": 0.1570244018005212, + "grad_norm": 0.8984375, + "learning_rate": 0.00018813656090803992, + "loss": 1.6006, + "step": 3314 + }, + { + "epoch": 0.15707178393745558, + "grad_norm": 0.251953125, + "learning_rate": 0.00018812952415076347, + "loss": 0.1606, + "step": 3315 + }, + { + "epoch": 0.15711916607438994, + "grad_norm": 0.25, + "learning_rate": 0.00018812248543886807, + "loss": 0.0366, + "step": 3316 + }, + { + "epoch": 0.15716654821132434, + "grad_norm": 1.171875, + "learning_rate": 0.0001881154447725098, + "loss": 0.5749, + "step": 3317 + }, + { + "epoch": 0.1572139303482587, + "grad_norm": 0.69921875, + "learning_rate": 0.00018810840215184485, + "loss": 0.1889, + "step": 3318 + }, + { + "epoch": 0.1572613124851931, + "grad_norm": 0.59765625, + "learning_rate": 0.00018810135757702935, + "loss": 1.1963, + "step": 3319 + }, + { + "epoch": 0.15730869462212746, + "grad_norm": 0.6328125, + "learning_rate": 0.0001880943110482196, + "loss": 1.172, + "step": 3320 + }, + { + "epoch": 0.15735607675906182, + "grad_norm": 0.478515625, + "learning_rate": 0.00018808726256557185, + "loss": 0.8667, + "step": 3321 + }, + { + "epoch": 0.15740345889599622, + "grad_norm": 0.6640625, + "learning_rate": 0.00018808021212924245, + "loss": 1.3381, + "step": 3322 + }, + { + "epoch": 0.15745084103293058, + "grad_norm": 0.6484375, + "learning_rate": 0.00018807315973938776, + "loss": 1.1065, + "step": 3323 + }, + { + "epoch": 0.15749822316986495, + "grad_norm": 0.53125, + "learning_rate": 0.00018806610539616423, + "loss": 1.1935, + "step": 3324 + }, + { + "epoch": 0.15754560530679934, + "grad_norm": 0.361328125, + "learning_rate": 0.00018805904909972824, + "loss": 0.7319, + "step": 3325 + }, + { + "epoch": 0.1575929874437337, + "grad_norm": 0.51953125, + "learning_rate": 0.00018805199085023637, + "loss": 1.0317, + "step": 3326 + }, + { + "epoch": 0.1576403695806681, + "grad_norm": 0.21484375, + "learning_rate": 0.00018804493064784511, + "loss": 0.1413, + "step": 3327 + }, + { + "epoch": 0.15768775171760246, + "grad_norm": 0.65234375, + "learning_rate": 0.00018803786849271107, + "loss": 1.0061, + "step": 3328 + }, + { + "epoch": 0.15773513385453683, + "grad_norm": 0.486328125, + "learning_rate": 0.0001880308043849909, + "loss": 0.4742, + "step": 3329 + }, + { + "epoch": 0.15778251599147122, + "grad_norm": 0.412109375, + "learning_rate": 0.00018802373832484123, + "loss": 0.0569, + "step": 3330 + }, + { + "epoch": 0.1578298981284056, + "grad_norm": 0.6875, + "learning_rate": 0.00018801667031241882, + "loss": 0.3141, + "step": 3331 + }, + { + "epoch": 0.15787728026533995, + "grad_norm": 0.6640625, + "learning_rate": 0.00018800960034788043, + "loss": 0.4049, + "step": 3332 + }, + { + "epoch": 0.15792466240227435, + "grad_norm": 0.6796875, + "learning_rate": 0.00018800252843138282, + "loss": 1.3762, + "step": 3333 + }, + { + "epoch": 0.1579720445392087, + "grad_norm": 0.546875, + "learning_rate": 0.00018799545456308289, + "loss": 1.1696, + "step": 3334 + }, + { + "epoch": 0.1580194266761431, + "grad_norm": 0.203125, + "learning_rate": 0.00018798837874313748, + "loss": 0.1247, + "step": 3335 + }, + { + "epoch": 0.15806680881307747, + "grad_norm": 0.486328125, + "learning_rate": 0.00018798130097170358, + "loss": 0.962, + "step": 3336 + }, + { + "epoch": 0.15811419095001183, + "grad_norm": 0.7734375, + "learning_rate": 0.00018797422124893807, + "loss": 0.8894, + "step": 3337 + }, + { + "epoch": 0.15816157308694623, + "grad_norm": 0.66015625, + "learning_rate": 0.0001879671395749981, + "loss": 0.8901, + "step": 3338 + }, + { + "epoch": 0.1582089552238806, + "grad_norm": 0.486328125, + "learning_rate": 0.00018796005595004067, + "loss": 0.9047, + "step": 3339 + }, + { + "epoch": 0.15825633736081499, + "grad_norm": 0.65234375, + "learning_rate": 0.0001879529703742229, + "loss": 1.0445, + "step": 3340 + }, + { + "epoch": 0.15830371949774935, + "grad_norm": 0.68359375, + "learning_rate": 0.00018794588284770186, + "loss": 1.084, + "step": 3341 + }, + { + "epoch": 0.15835110163468372, + "grad_norm": 0.53125, + "learning_rate": 0.00018793879337063488, + "loss": 1.3039, + "step": 3342 + }, + { + "epoch": 0.1583984837716181, + "grad_norm": 0.765625, + "learning_rate": 0.00018793170194317913, + "loss": 1.1952, + "step": 3343 + }, + { + "epoch": 0.15844586590855247, + "grad_norm": 0.5859375, + "learning_rate": 0.00018792460856549185, + "loss": 1.0394, + "step": 3344 + }, + { + "epoch": 0.15849324804548684, + "grad_norm": 0.4609375, + "learning_rate": 0.00018791751323773043, + "loss": 0.5739, + "step": 3345 + }, + { + "epoch": 0.15854063018242123, + "grad_norm": 0.640625, + "learning_rate": 0.00018791041596005225, + "loss": 0.9165, + "step": 3346 + }, + { + "epoch": 0.1585880123193556, + "grad_norm": 0.8046875, + "learning_rate": 0.00018790331673261464, + "loss": 1.1403, + "step": 3347 + }, + { + "epoch": 0.15863539445629, + "grad_norm": 0.6796875, + "learning_rate": 0.0001878962155555751, + "loss": 1.0392, + "step": 3348 + }, + { + "epoch": 0.15868277659322436, + "grad_norm": 0.7890625, + "learning_rate": 0.00018788911242909113, + "loss": 1.1052, + "step": 3349 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 0.71484375, + "learning_rate": 0.00018788200735332024, + "loss": 1.0922, + "step": 3350 + }, + { + "epoch": 0.15877754086709311, + "grad_norm": 0.55859375, + "learning_rate": 0.00018787490032842006, + "loss": 1.0305, + "step": 3351 + }, + { + "epoch": 0.15882492300402748, + "grad_norm": 0.54296875, + "learning_rate": 0.00018786779135454817, + "loss": 0.1058, + "step": 3352 + }, + { + "epoch": 0.15887230514096184, + "grad_norm": 0.5234375, + "learning_rate": 0.00018786068043186226, + "loss": 0.1585, + "step": 3353 + }, + { + "epoch": 0.15891968727789624, + "grad_norm": 0.62890625, + "learning_rate": 0.00018785356756052007, + "loss": 0.3798, + "step": 3354 + }, + { + "epoch": 0.1589670694148306, + "grad_norm": 0.734375, + "learning_rate": 0.00018784645274067931, + "loss": 0.9803, + "step": 3355 + }, + { + "epoch": 0.159014451551765, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001878393359724978, + "loss": 0.0151, + "step": 3356 + }, + { + "epoch": 0.15906183368869936, + "grad_norm": 0.66015625, + "learning_rate": 0.00018783221725613336, + "loss": 0.762, + "step": 3357 + }, + { + "epoch": 0.15910921582563373, + "grad_norm": 0.21484375, + "learning_rate": 0.00018782509659174393, + "loss": 0.0296, + "step": 3358 + }, + { + "epoch": 0.15915659796256812, + "grad_norm": 0.455078125, + "learning_rate": 0.0001878179739794874, + "loss": 0.0517, + "step": 3359 + }, + { + "epoch": 0.15920398009950248, + "grad_norm": 0.80859375, + "learning_rate": 0.00018781084941952173, + "loss": 0.2991, + "step": 3360 + }, + { + "epoch": 0.15925136223643685, + "grad_norm": 0.7890625, + "learning_rate": 0.00018780372291200497, + "loss": 0.8215, + "step": 3361 + }, + { + "epoch": 0.15929874437337124, + "grad_norm": 0.53515625, + "learning_rate": 0.00018779659445709515, + "loss": 0.6526, + "step": 3362 + }, + { + "epoch": 0.1593461265103056, + "grad_norm": 0.67578125, + "learning_rate": 0.00018778946405495036, + "loss": 0.0389, + "step": 3363 + }, + { + "epoch": 0.15939350864724, + "grad_norm": 0.248046875, + "learning_rate": 0.00018778233170572877, + "loss": 0.1111, + "step": 3364 + }, + { + "epoch": 0.15944089078417437, + "grad_norm": 0.6796875, + "learning_rate": 0.00018777519740958862, + "loss": 1.1919, + "step": 3365 + }, + { + "epoch": 0.15948827292110873, + "grad_norm": 0.1259765625, + "learning_rate": 0.00018776806116668803, + "loss": 0.0036, + "step": 3366 + }, + { + "epoch": 0.15953565505804312, + "grad_norm": 0.5859375, + "learning_rate": 0.00018776092297718535, + "loss": 0.8826, + "step": 3367 + }, + { + "epoch": 0.1595830371949775, + "grad_norm": 0.228515625, + "learning_rate": 0.00018775378284123888, + "loss": 0.0293, + "step": 3368 + }, + { + "epoch": 0.15963041933191188, + "grad_norm": 0.5078125, + "learning_rate": 0.00018774664075900697, + "loss": 0.8727, + "step": 3369 + }, + { + "epoch": 0.15967780146884625, + "grad_norm": 0.48046875, + "learning_rate": 0.00018773949673064804, + "loss": 0.9371, + "step": 3370 + }, + { + "epoch": 0.1597251836057806, + "grad_norm": 0.65234375, + "learning_rate": 0.00018773235075632052, + "loss": 1.1879, + "step": 3371 + }, + { + "epoch": 0.159772565742715, + "grad_norm": 0.55078125, + "learning_rate": 0.0001877252028361829, + "loss": 0.7749, + "step": 3372 + }, + { + "epoch": 0.15981994787964937, + "grad_norm": 0.6171875, + "learning_rate": 0.00018771805297039374, + "loss": 0.829, + "step": 3373 + }, + { + "epoch": 0.15986733001658374, + "grad_norm": 0.62890625, + "learning_rate": 0.0001877109011591116, + "loss": 1.4261, + "step": 3374 + }, + { + "epoch": 0.15991471215351813, + "grad_norm": 0.68359375, + "learning_rate": 0.00018770374740249507, + "loss": 0.3003, + "step": 3375 + }, + { + "epoch": 0.1599620942904525, + "grad_norm": 0.7265625, + "learning_rate": 0.00018769659170070287, + "loss": 0.0686, + "step": 3376 + }, + { + "epoch": 0.1600094764273869, + "grad_norm": 0.85546875, + "learning_rate": 0.00018768943405389366, + "loss": 0.1909, + "step": 3377 + }, + { + "epoch": 0.16005685856432125, + "grad_norm": 0.20703125, + "learning_rate": 0.00018768227446222622, + "loss": 0.1452, + "step": 3378 + }, + { + "epoch": 0.16010424070125562, + "grad_norm": 0.6328125, + "learning_rate": 0.0001876751129258593, + "loss": 0.8432, + "step": 3379 + }, + { + "epoch": 0.16015162283819, + "grad_norm": 0.71484375, + "learning_rate": 0.00018766794944495178, + "loss": 0.8528, + "step": 3380 + }, + { + "epoch": 0.16019900497512438, + "grad_norm": 0.6171875, + "learning_rate": 0.0001876607840196625, + "loss": 0.9679, + "step": 3381 + }, + { + "epoch": 0.16024638711205874, + "grad_norm": 0.59765625, + "learning_rate": 0.00018765361665015043, + "loss": 0.9675, + "step": 3382 + }, + { + "epoch": 0.16029376924899313, + "grad_norm": 0.64453125, + "learning_rate": 0.00018764644733657452, + "loss": 0.1435, + "step": 3383 + }, + { + "epoch": 0.1603411513859275, + "grad_norm": 0.63671875, + "learning_rate": 0.00018763927607909375, + "loss": 0.6409, + "step": 3384 + }, + { + "epoch": 0.1603885335228619, + "grad_norm": 0.337890625, + "learning_rate": 0.0001876321028778672, + "loss": 0.0473, + "step": 3385 + }, + { + "epoch": 0.16043591565979626, + "grad_norm": 0.5859375, + "learning_rate": 0.0001876249277330539, + "loss": 1.3328, + "step": 3386 + }, + { + "epoch": 0.16048329779673062, + "grad_norm": 0.62890625, + "learning_rate": 0.00018761775064481308, + "loss": 0.0951, + "step": 3387 + }, + { + "epoch": 0.16053067993366502, + "grad_norm": 0.734375, + "learning_rate": 0.00018761057161330386, + "loss": 1.1392, + "step": 3388 + }, + { + "epoch": 0.16057806207059938, + "grad_norm": 0.279296875, + "learning_rate": 0.0001876033906386855, + "loss": 0.037, + "step": 3389 + }, + { + "epoch": 0.16062544420753375, + "grad_norm": 0.62109375, + "learning_rate": 0.00018759620772111721, + "loss": 0.0903, + "step": 3390 + }, + { + "epoch": 0.16067282634446814, + "grad_norm": 0.419921875, + "learning_rate": 0.00018758902286075837, + "loss": 0.7807, + "step": 3391 + }, + { + "epoch": 0.1607202084814025, + "grad_norm": 0.6171875, + "learning_rate": 0.00018758183605776827, + "loss": 1.0448, + "step": 3392 + }, + { + "epoch": 0.1607675906183369, + "grad_norm": 0.61328125, + "learning_rate": 0.00018757464731230635, + "loss": 1.4464, + "step": 3393 + }, + { + "epoch": 0.16081497275527126, + "grad_norm": 0.65234375, + "learning_rate": 0.00018756745662453205, + "loss": 1.3067, + "step": 3394 + }, + { + "epoch": 0.16086235489220563, + "grad_norm": 0.57421875, + "learning_rate": 0.0001875602639946048, + "loss": 1.1364, + "step": 3395 + }, + { + "epoch": 0.16090973702914002, + "grad_norm": 0.7734375, + "learning_rate": 0.00018755306942268418, + "loss": 0.1941, + "step": 3396 + }, + { + "epoch": 0.16095711916607439, + "grad_norm": 0.55859375, + "learning_rate": 0.00018754587290892974, + "loss": 0.7959, + "step": 3397 + }, + { + "epoch": 0.16100450130300878, + "grad_norm": 0.77734375, + "learning_rate": 0.00018753867445350108, + "loss": 1.3102, + "step": 3398 + }, + { + "epoch": 0.16105188343994314, + "grad_norm": 0.51953125, + "learning_rate": 0.00018753147405655787, + "loss": 0.9047, + "step": 3399 + }, + { + "epoch": 0.1610992655768775, + "grad_norm": 0.33984375, + "learning_rate": 0.0001875242717182598, + "loss": 0.0682, + "step": 3400 + }, + { + "epoch": 0.1611466477138119, + "grad_norm": 0.59375, + "learning_rate": 0.0001875170674387666, + "loss": 1.1625, + "step": 3401 + }, + { + "epoch": 0.16119402985074627, + "grad_norm": 0.68359375, + "learning_rate": 0.0001875098612182381, + "loss": 1.2987, + "step": 3402 + }, + { + "epoch": 0.16124141198768063, + "grad_norm": 0.59765625, + "learning_rate": 0.00018750265305683404, + "loss": 1.1018, + "step": 3403 + }, + { + "epoch": 0.16128879412461503, + "grad_norm": 0.671875, + "learning_rate": 0.00018749544295471436, + "loss": 1.2829, + "step": 3404 + }, + { + "epoch": 0.1613361762615494, + "grad_norm": 1.3984375, + "learning_rate": 0.00018748823091203892, + "loss": 0.4753, + "step": 3405 + }, + { + "epoch": 0.16138355839848378, + "grad_norm": 0.65625, + "learning_rate": 0.00018748101692896775, + "loss": 1.0085, + "step": 3406 + }, + { + "epoch": 0.16143094053541815, + "grad_norm": 0.53515625, + "learning_rate": 0.0001874738010056608, + "loss": 0.0389, + "step": 3407 + }, + { + "epoch": 0.16147832267235251, + "grad_norm": 0.490234375, + "learning_rate": 0.0001874665831422781, + "loss": 0.6164, + "step": 3408 + }, + { + "epoch": 0.1615257048092869, + "grad_norm": 0.65625, + "learning_rate": 0.00018745936333897976, + "loss": 0.8375, + "step": 3409 + }, + { + "epoch": 0.16157308694622127, + "grad_norm": 0.408203125, + "learning_rate": 0.0001874521415959259, + "loss": 0.0557, + "step": 3410 + }, + { + "epoch": 0.16162046908315564, + "grad_norm": 0.68359375, + "learning_rate": 0.00018744491791327668, + "loss": 1.331, + "step": 3411 + }, + { + "epoch": 0.16166785122009003, + "grad_norm": 0.57421875, + "learning_rate": 0.00018743769229119232, + "loss": 0.8699, + "step": 3412 + }, + { + "epoch": 0.1617152333570244, + "grad_norm": 0.419921875, + "learning_rate": 0.0001874304647298331, + "loss": 0.0352, + "step": 3413 + }, + { + "epoch": 0.1617626154939588, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001874232352293593, + "loss": 0.0239, + "step": 3414 + }, + { + "epoch": 0.16180999763089315, + "grad_norm": 0.62890625, + "learning_rate": 0.00018741600378993124, + "loss": 0.9394, + "step": 3415 + }, + { + "epoch": 0.16185737976782752, + "grad_norm": 0.58203125, + "learning_rate": 0.00018740877041170935, + "loss": 0.6944, + "step": 3416 + }, + { + "epoch": 0.1619047619047619, + "grad_norm": 0.5234375, + "learning_rate": 0.000187401535094854, + "loss": 1.0595, + "step": 3417 + }, + { + "epoch": 0.16195214404169628, + "grad_norm": 0.5078125, + "learning_rate": 0.00018739429783952574, + "loss": 0.993, + "step": 3418 + }, + { + "epoch": 0.16199952617863064, + "grad_norm": 0.470703125, + "learning_rate": 0.000187387058645885, + "loss": 0.6458, + "step": 3419 + }, + { + "epoch": 0.16204690831556504, + "grad_norm": 0.7109375, + "learning_rate": 0.00018737981751409241, + "loss": 0.9488, + "step": 3420 + }, + { + "epoch": 0.1620942904524994, + "grad_norm": 0.52734375, + "learning_rate": 0.00018737257444430855, + "loss": 0.7799, + "step": 3421 + }, + { + "epoch": 0.1621416725894338, + "grad_norm": 0.88671875, + "learning_rate": 0.00018736532943669404, + "loss": 1.1519, + "step": 3422 + }, + { + "epoch": 0.16218905472636816, + "grad_norm": 0.6953125, + "learning_rate": 0.00018735808249140953, + "loss": 1.179, + "step": 3423 + }, + { + "epoch": 0.16223643686330252, + "grad_norm": 0.703125, + "learning_rate": 0.00018735083360861587, + "loss": 0.9812, + "step": 3424 + }, + { + "epoch": 0.16228381900023692, + "grad_norm": 0.60546875, + "learning_rate": 0.00018734358278847376, + "loss": 1.1356, + "step": 3425 + }, + { + "epoch": 0.16233120113717128, + "grad_norm": 0.65625, + "learning_rate": 0.00018733633003114396, + "loss": 1.0417, + "step": 3426 + }, + { + "epoch": 0.16237858327410568, + "grad_norm": 0.65234375, + "learning_rate": 0.00018732907533678742, + "loss": 0.8498, + "step": 3427 + }, + { + "epoch": 0.16242596541104004, + "grad_norm": 0.63671875, + "learning_rate": 0.00018732181870556502, + "loss": 0.1318, + "step": 3428 + }, + { + "epoch": 0.1624733475479744, + "grad_norm": 0.59375, + "learning_rate": 0.0001873145601376377, + "loss": 0.9391, + "step": 3429 + }, + { + "epoch": 0.1625207296849088, + "grad_norm": 0.58984375, + "learning_rate": 0.00018730729963316642, + "loss": 0.9662, + "step": 3430 + }, + { + "epoch": 0.16256811182184316, + "grad_norm": 0.5859375, + "learning_rate": 0.00018730003719231224, + "loss": 1.0315, + "step": 3431 + }, + { + "epoch": 0.16261549395877753, + "grad_norm": 0.69140625, + "learning_rate": 0.0001872927728152362, + "loss": 1.0004, + "step": 3432 + }, + { + "epoch": 0.16266287609571192, + "grad_norm": 0.61328125, + "learning_rate": 0.00018728550650209946, + "loss": 0.0759, + "step": 3433 + }, + { + "epoch": 0.1627102582326463, + "grad_norm": 0.57421875, + "learning_rate": 0.00018727823825306317, + "loss": 0.9821, + "step": 3434 + }, + { + "epoch": 0.16275764036958068, + "grad_norm": 0.6953125, + "learning_rate": 0.00018727096806828847, + "loss": 0.1027, + "step": 3435 + }, + { + "epoch": 0.16280502250651505, + "grad_norm": 0.443359375, + "learning_rate": 0.00018726369594793672, + "loss": 0.5308, + "step": 3436 + }, + { + "epoch": 0.1628524046434494, + "grad_norm": 0.58984375, + "learning_rate": 0.00018725642189216908, + "loss": 1.4648, + "step": 3437 + }, + { + "epoch": 0.1628997867803838, + "grad_norm": 0.6328125, + "learning_rate": 0.000187249145901147, + "loss": 1.4506, + "step": 3438 + }, + { + "epoch": 0.16294716891731817, + "grad_norm": 0.5703125, + "learning_rate": 0.00018724186797503177, + "loss": 0.5585, + "step": 3439 + }, + { + "epoch": 0.16299455105425253, + "grad_norm": 0.58984375, + "learning_rate": 0.00018723458811398483, + "loss": 0.1731, + "step": 3440 + }, + { + "epoch": 0.16304193319118693, + "grad_norm": 0.69921875, + "learning_rate": 0.00018722730631816765, + "loss": 1.1493, + "step": 3441 + }, + { + "epoch": 0.1630893153281213, + "grad_norm": 0.34375, + "learning_rate": 0.00018722002258774172, + "loss": 0.0177, + "step": 3442 + }, + { + "epoch": 0.16313669746505569, + "grad_norm": 0.462890625, + "learning_rate": 0.0001872127369228686, + "loss": 0.3601, + "step": 3443 + }, + { + "epoch": 0.16318407960199005, + "grad_norm": 0.490234375, + "learning_rate": 0.00018720544932370988, + "loss": 0.0731, + "step": 3444 + }, + { + "epoch": 0.16323146173892442, + "grad_norm": 0.458984375, + "learning_rate": 0.00018719815979042714, + "loss": 0.8687, + "step": 3445 + }, + { + "epoch": 0.1632788438758588, + "grad_norm": 0.89453125, + "learning_rate": 0.00018719086832318213, + "loss": 0.9003, + "step": 3446 + }, + { + "epoch": 0.16332622601279317, + "grad_norm": 0.59765625, + "learning_rate": 0.00018718357492213654, + "loss": 0.1883, + "step": 3447 + }, + { + "epoch": 0.16337360814972754, + "grad_norm": 0.498046875, + "learning_rate": 0.0001871762795874521, + "loss": 0.9352, + "step": 3448 + }, + { + "epoch": 0.16342099028666193, + "grad_norm": 0.6484375, + "learning_rate": 0.00018716898231929064, + "loss": 1.4955, + "step": 3449 + }, + { + "epoch": 0.1634683724235963, + "grad_norm": 0.421875, + "learning_rate": 0.000187161683117814, + "loss": 0.0188, + "step": 3450 + }, + { + "epoch": 0.1635157545605307, + "grad_norm": 0.59375, + "learning_rate": 0.00018715438198318407, + "loss": 1.0635, + "step": 3451 + }, + { + "epoch": 0.16356313669746506, + "grad_norm": 0.6953125, + "learning_rate": 0.0001871470789155628, + "loss": 0.8331, + "step": 3452 + }, + { + "epoch": 0.16361051883439942, + "grad_norm": 0.7109375, + "learning_rate": 0.00018713977391511213, + "loss": 1.1757, + "step": 3453 + }, + { + "epoch": 0.16365790097133381, + "grad_norm": 0.61328125, + "learning_rate": 0.00018713246698199407, + "loss": 1.1352, + "step": 3454 + }, + { + "epoch": 0.16370528310826818, + "grad_norm": 0.48828125, + "learning_rate": 0.00018712515811637073, + "loss": 0.7567, + "step": 3455 + }, + { + "epoch": 0.16375266524520257, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018711784731840415, + "loss": 0.1355, + "step": 3456 + }, + { + "epoch": 0.16380004738213694, + "grad_norm": 0.5703125, + "learning_rate": 0.00018711053458825655, + "loss": 1.2875, + "step": 3457 + }, + { + "epoch": 0.1638474295190713, + "grad_norm": 0.6328125, + "learning_rate": 0.00018710321992609008, + "loss": 0.0153, + "step": 3458 + }, + { + "epoch": 0.1638948116560057, + "grad_norm": 0.5234375, + "learning_rate": 0.00018709590333206694, + "loss": 0.0086, + "step": 3459 + }, + { + "epoch": 0.16394219379294006, + "grad_norm": 0.9140625, + "learning_rate": 0.00018708858480634945, + "loss": 0.3582, + "step": 3460 + }, + { + "epoch": 0.16398957592987443, + "grad_norm": 0.58984375, + "learning_rate": 0.00018708126434909992, + "loss": 1.1043, + "step": 3461 + }, + { + "epoch": 0.16403695806680882, + "grad_norm": 0.921875, + "learning_rate": 0.0001870739419604807, + "loss": 0.7051, + "step": 3462 + }, + { + "epoch": 0.16408434020374318, + "grad_norm": 0.57421875, + "learning_rate": 0.00018706661764065417, + "loss": 0.8314, + "step": 3463 + }, + { + "epoch": 0.16413172234067758, + "grad_norm": 0.326171875, + "learning_rate": 0.0001870592913897828, + "loss": 0.0076, + "step": 3464 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 0.5703125, + "learning_rate": 0.0001870519632080291, + "loss": 0.216, + "step": 3465 + }, + { + "epoch": 0.1642264866145463, + "grad_norm": 0.380859375, + "learning_rate": 0.0001870446330955556, + "loss": 0.1688, + "step": 3466 + }, + { + "epoch": 0.1642738687514807, + "grad_norm": 0.5859375, + "learning_rate": 0.00018703730105252482, + "loss": 1.0041, + "step": 3467 + }, + { + "epoch": 0.16432125088841507, + "grad_norm": 0.31640625, + "learning_rate": 0.0001870299670790994, + "loss": 0.0299, + "step": 3468 + }, + { + "epoch": 0.16436863302534943, + "grad_norm": 0.7890625, + "learning_rate": 0.00018702263117544203, + "loss": 0.1967, + "step": 3469 + }, + { + "epoch": 0.16441601516228382, + "grad_norm": 0.6328125, + "learning_rate": 0.0001870152933417154, + "loss": 0.9879, + "step": 3470 + }, + { + "epoch": 0.1644633972992182, + "grad_norm": 0.51171875, + "learning_rate": 0.00018700795357808224, + "loss": 0.6545, + "step": 3471 + }, + { + "epoch": 0.16451077943615258, + "grad_norm": 0.4765625, + "learning_rate": 0.00018700061188470533, + "loss": 1.3322, + "step": 3472 + }, + { + "epoch": 0.16455816157308695, + "grad_norm": 0.63671875, + "learning_rate": 0.00018699326826174755, + "loss": 1.2804, + "step": 3473 + }, + { + "epoch": 0.1646055437100213, + "grad_norm": 0.546875, + "learning_rate": 0.00018698592270937172, + "loss": 1.1649, + "step": 3474 + }, + { + "epoch": 0.1646529258469557, + "grad_norm": 0.56640625, + "learning_rate": 0.0001869785752277408, + "loss": 0.8521, + "step": 3475 + }, + { + "epoch": 0.16470030798389007, + "grad_norm": 0.46484375, + "learning_rate": 0.00018697122581701767, + "loss": 0.6356, + "step": 3476 + }, + { + "epoch": 0.16474769012082444, + "grad_norm": 0.365234375, + "learning_rate": 0.00018696387447736544, + "loss": 0.1956, + "step": 3477 + }, + { + "epoch": 0.16479507225775883, + "grad_norm": 0.62109375, + "learning_rate": 0.0001869565212089471, + "loss": 0.9507, + "step": 3478 + }, + { + "epoch": 0.1648424543946932, + "grad_norm": 0.9453125, + "learning_rate": 0.00018694916601192573, + "loss": 0.1632, + "step": 3479 + }, + { + "epoch": 0.1648898365316276, + "grad_norm": 0.609375, + "learning_rate": 0.00018694180888646447, + "loss": 0.953, + "step": 3480 + }, + { + "epoch": 0.16493721866856195, + "grad_norm": 0.462890625, + "learning_rate": 0.0001869344498327265, + "loss": 1.1483, + "step": 3481 + }, + { + "epoch": 0.16498460080549632, + "grad_norm": 0.9140625, + "learning_rate": 0.00018692708885087504, + "loss": 1.2897, + "step": 3482 + }, + { + "epoch": 0.1650319829424307, + "grad_norm": 0.71484375, + "learning_rate": 0.00018691972594107333, + "loss": 0.8568, + "step": 3483 + }, + { + "epoch": 0.16507936507936508, + "grad_norm": 0.73046875, + "learning_rate": 0.00018691236110348467, + "loss": 0.276, + "step": 3484 + }, + { + "epoch": 0.16512674721629947, + "grad_norm": 0.75, + "learning_rate": 0.00018690499433827244, + "loss": 0.7986, + "step": 3485 + }, + { + "epoch": 0.16517412935323383, + "grad_norm": 0.4453125, + "learning_rate": 0.00018689762564559997, + "loss": 0.0847, + "step": 3486 + }, + { + "epoch": 0.1652215114901682, + "grad_norm": 0.255859375, + "learning_rate": 0.00018689025502563076, + "loss": 0.156, + "step": 3487 + }, + { + "epoch": 0.1652688936271026, + "grad_norm": 0.6640625, + "learning_rate": 0.00018688288247852822, + "loss": 1.1043, + "step": 3488 + }, + { + "epoch": 0.16531627576403696, + "grad_norm": 0.6640625, + "learning_rate": 0.00018687550800445591, + "loss": 1.3915, + "step": 3489 + }, + { + "epoch": 0.16536365790097132, + "grad_norm": 0.640625, + "learning_rate": 0.00018686813160357732, + "loss": 0.8655, + "step": 3490 + }, + { + "epoch": 0.16541104003790572, + "grad_norm": 0.55078125, + "learning_rate": 0.00018686075327605615, + "loss": 1.0847, + "step": 3491 + }, + { + "epoch": 0.16545842217484008, + "grad_norm": 0.7265625, + "learning_rate": 0.000186853373022056, + "loss": 1.0577, + "step": 3492 + }, + { + "epoch": 0.16550580431177447, + "grad_norm": 0.365234375, + "learning_rate": 0.00018684599084174051, + "loss": 0.199, + "step": 3493 + }, + { + "epoch": 0.16555318644870884, + "grad_norm": 0.640625, + "learning_rate": 0.00018683860673527348, + "loss": 1.3364, + "step": 3494 + }, + { + "epoch": 0.1656005685856432, + "grad_norm": 0.51953125, + "learning_rate": 0.0001868312207028186, + "loss": 0.2806, + "step": 3495 + }, + { + "epoch": 0.1656479507225776, + "grad_norm": 0.6328125, + "learning_rate": 0.00018682383274453977, + "loss": 0.9413, + "step": 3496 + }, + { + "epoch": 0.16569533285951196, + "grad_norm": 0.734375, + "learning_rate": 0.00018681644286060083, + "loss": 0.8934, + "step": 3497 + }, + { + "epoch": 0.16574271499644633, + "grad_norm": 0.5390625, + "learning_rate": 0.00018680905105116562, + "loss": 0.0685, + "step": 3498 + }, + { + "epoch": 0.16579009713338072, + "grad_norm": 0.71484375, + "learning_rate": 0.00018680165731639815, + "loss": 0.8225, + "step": 3499 + }, + { + "epoch": 0.16583747927031509, + "grad_norm": 0.52734375, + "learning_rate": 0.00018679426165646237, + "loss": 0.9894, + "step": 3500 + }, + { + "epoch": 0.16588486140724948, + "grad_norm": 0.53125, + "learning_rate": 0.00018678686407152227, + "loss": 1.1245, + "step": 3501 + }, + { + "epoch": 0.16593224354418384, + "grad_norm": 0.498046875, + "learning_rate": 0.00018677946456174206, + "loss": 0.104, + "step": 3502 + }, + { + "epoch": 0.1659796256811182, + "grad_norm": 0.55078125, + "learning_rate": 0.00018677206312728569, + "loss": 0.9713, + "step": 3503 + }, + { + "epoch": 0.1660270078180526, + "grad_norm": 0.79296875, + "learning_rate": 0.0001867646597683174, + "loss": 0.0467, + "step": 3504 + }, + { + "epoch": 0.16607438995498697, + "grad_norm": 0.30859375, + "learning_rate": 0.00018675725448500136, + "loss": 0.1175, + "step": 3505 + }, + { + "epoch": 0.16612177209192133, + "grad_norm": 0.7109375, + "learning_rate": 0.00018674984727750184, + "loss": 0.274, + "step": 3506 + }, + { + "epoch": 0.16616915422885573, + "grad_norm": 0.52734375, + "learning_rate": 0.0001867424381459831, + "loss": 0.8968, + "step": 3507 + }, + { + "epoch": 0.1662165363657901, + "grad_norm": 0.703125, + "learning_rate": 0.0001867350270906095, + "loss": 1.1854, + "step": 3508 + }, + { + "epoch": 0.16626391850272448, + "grad_norm": 0.6484375, + "learning_rate": 0.00018672761411154536, + "loss": 1.1038, + "step": 3509 + }, + { + "epoch": 0.16631130063965885, + "grad_norm": 0.578125, + "learning_rate": 0.00018672019920895513, + "loss": 0.5199, + "step": 3510 + }, + { + "epoch": 0.16635868277659321, + "grad_norm": 0.5546875, + "learning_rate": 0.00018671278238300328, + "loss": 1.3755, + "step": 3511 + }, + { + "epoch": 0.1664060649135276, + "grad_norm": 0.64453125, + "learning_rate": 0.0001867053636338542, + "loss": 0.9573, + "step": 3512 + }, + { + "epoch": 0.16645344705046197, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018669794296167258, + "loss": 0.0389, + "step": 3513 + }, + { + "epoch": 0.16650082918739637, + "grad_norm": 0.111328125, + "learning_rate": 0.0001866905203666229, + "loss": 0.0144, + "step": 3514 + }, + { + "epoch": 0.16654821132433073, + "grad_norm": 0.78515625, + "learning_rate": 0.00018668309584886984, + "loss": 1.4951, + "step": 3515 + }, + { + "epoch": 0.1665955934612651, + "grad_norm": 0.58203125, + "learning_rate": 0.00018667566940857805, + "loss": 0.8611, + "step": 3516 + }, + { + "epoch": 0.1666429755981995, + "grad_norm": 0.484375, + "learning_rate": 0.00018666824104591218, + "loss": 0.9698, + "step": 3517 + }, + { + "epoch": 0.16669035773513385, + "grad_norm": 0.65234375, + "learning_rate": 0.0001866608107610371, + "loss": 1.074, + "step": 3518 + }, + { + "epoch": 0.16673773987206822, + "grad_norm": 0.6484375, + "learning_rate": 0.0001866533785541175, + "loss": 0.5675, + "step": 3519 + }, + { + "epoch": 0.1667851220090026, + "grad_norm": 0.20703125, + "learning_rate": 0.00018664594442531832, + "loss": 0.0248, + "step": 3520 + }, + { + "epoch": 0.16683250414593698, + "grad_norm": 0.38671875, + "learning_rate": 0.0001866385083748043, + "loss": 0.0185, + "step": 3521 + }, + { + "epoch": 0.16687988628287137, + "grad_norm": 0.66015625, + "learning_rate": 0.0001866310704027405, + "loss": 0.7097, + "step": 3522 + }, + { + "epoch": 0.16692726841980574, + "grad_norm": 0.482421875, + "learning_rate": 0.00018662363050929182, + "loss": 0.1294, + "step": 3523 + }, + { + "epoch": 0.1669746505567401, + "grad_norm": 0.58984375, + "learning_rate": 0.00018661618869462328, + "loss": 0.8745, + "step": 3524 + }, + { + "epoch": 0.1670220326936745, + "grad_norm": 0.5859375, + "learning_rate": 0.00018660874495889996, + "loss": 1.0901, + "step": 3525 + }, + { + "epoch": 0.16706941483060886, + "grad_norm": 0.296875, + "learning_rate": 0.00018660129930228695, + "loss": 0.1195, + "step": 3526 + }, + { + "epoch": 0.16711679696754322, + "grad_norm": 0.310546875, + "learning_rate": 0.0001865938517249493, + "loss": 0.0636, + "step": 3527 + }, + { + "epoch": 0.16716417910447762, + "grad_norm": 0.63671875, + "learning_rate": 0.0001865864022270523, + "loss": 0.9913, + "step": 3528 + }, + { + "epoch": 0.16721156124141198, + "grad_norm": 0.6328125, + "learning_rate": 0.00018657895080876109, + "loss": 0.8216, + "step": 3529 + }, + { + "epoch": 0.16725894337834638, + "grad_norm": 0.671875, + "learning_rate": 0.00018657149747024099, + "loss": 1.1584, + "step": 3530 + }, + { + "epoch": 0.16730632551528074, + "grad_norm": 0.78515625, + "learning_rate": 0.00018656404221165728, + "loss": 1.5047, + "step": 3531 + }, + { + "epoch": 0.1673537076522151, + "grad_norm": 0.53125, + "learning_rate": 0.00018655658503317533, + "loss": 1.0045, + "step": 3532 + }, + { + "epoch": 0.1674010897891495, + "grad_norm": 0.53515625, + "learning_rate": 0.0001865491259349605, + "loss": 1.0516, + "step": 3533 + }, + { + "epoch": 0.16744847192608386, + "grad_norm": 0.5234375, + "learning_rate": 0.00018654166491717826, + "loss": 0.742, + "step": 3534 + }, + { + "epoch": 0.16749585406301823, + "grad_norm": 0.58203125, + "learning_rate": 0.0001865342019799941, + "loss": 0.2457, + "step": 3535 + }, + { + "epoch": 0.16754323619995262, + "grad_norm": 0.5234375, + "learning_rate": 0.00018652673712357345, + "loss": 1.1417, + "step": 3536 + }, + { + "epoch": 0.167590618336887, + "grad_norm": 0.3671875, + "learning_rate": 0.00018651927034808198, + "loss": 0.1849, + "step": 3537 + }, + { + "epoch": 0.16763800047382138, + "grad_norm": 0.62109375, + "learning_rate": 0.00018651180165368524, + "loss": 1.0322, + "step": 3538 + }, + { + "epoch": 0.16768538261075575, + "grad_norm": 0.68359375, + "learning_rate": 0.00018650433104054888, + "loss": 1.1652, + "step": 3539 + }, + { + "epoch": 0.1677327647476901, + "grad_norm": 0.6484375, + "learning_rate": 0.0001864968585088386, + "loss": 1.1635, + "step": 3540 + }, + { + "epoch": 0.1677801468846245, + "grad_norm": 0.1728515625, + "learning_rate": 0.00018648938405872015, + "loss": 0.0201, + "step": 3541 + }, + { + "epoch": 0.16782752902155887, + "grad_norm": 0.66015625, + "learning_rate": 0.00018648190769035927, + "loss": 1.3377, + "step": 3542 + }, + { + "epoch": 0.16787491115849326, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001864744294039218, + "loss": 0.0092, + "step": 3543 + }, + { + "epoch": 0.16792229329542763, + "grad_norm": 0.609375, + "learning_rate": 0.00018646694919957356, + "loss": 0.9292, + "step": 3544 + }, + { + "epoch": 0.167969675432362, + "grad_norm": 0.0072021484375, + "learning_rate": 0.00018645946707748052, + "loss": 0.0004, + "step": 3545 + }, + { + "epoch": 0.16801705756929639, + "grad_norm": 0.69140625, + "learning_rate": 0.0001864519830378086, + "loss": 1.5274, + "step": 3546 + }, + { + "epoch": 0.16806443970623075, + "grad_norm": 0.56640625, + "learning_rate": 0.00018644449708072377, + "loss": 0.6784, + "step": 3547 + }, + { + "epoch": 0.16811182184316512, + "grad_norm": 0.6484375, + "learning_rate": 0.00018643700920639207, + "loss": 1.0314, + "step": 3548 + }, + { + "epoch": 0.1681592039800995, + "grad_norm": 0.61328125, + "learning_rate": 0.00018642951941497956, + "loss": 1.3053, + "step": 3549 + }, + { + "epoch": 0.16820658611703387, + "grad_norm": 0.546875, + "learning_rate": 0.00018642202770665237, + "loss": 1.3362, + "step": 3550 + }, + { + "epoch": 0.16825396825396827, + "grad_norm": 0.69140625, + "learning_rate": 0.0001864145340815767, + "loss": 1.524, + "step": 3551 + }, + { + "epoch": 0.16830135039090263, + "grad_norm": 0.6171875, + "learning_rate": 0.00018640703853991869, + "loss": 0.9737, + "step": 3552 + }, + { + "epoch": 0.168348732527837, + "grad_norm": 0.40625, + "learning_rate": 0.00018639954108184457, + "loss": 0.5459, + "step": 3553 + }, + { + "epoch": 0.1683961146647714, + "grad_norm": 0.498046875, + "learning_rate": 0.00018639204170752067, + "loss": 0.8134, + "step": 3554 + }, + { + "epoch": 0.16844349680170576, + "grad_norm": 0.490234375, + "learning_rate": 0.00018638454041711332, + "loss": 1.1741, + "step": 3555 + }, + { + "epoch": 0.16849087893864012, + "grad_norm": 0.484375, + "learning_rate": 0.00018637703721078886, + "loss": 0.6779, + "step": 3556 + }, + { + "epoch": 0.1685382610755745, + "grad_norm": 0.0123291015625, + "learning_rate": 0.00018636953208871372, + "loss": 0.0007, + "step": 3557 + }, + { + "epoch": 0.16858564321250888, + "grad_norm": 0.61328125, + "learning_rate": 0.0001863620250510544, + "loss": 0.7599, + "step": 3558 + }, + { + "epoch": 0.16863302534944327, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001863545160979773, + "loss": 0.0232, + "step": 3559 + }, + { + "epoch": 0.16868040748637764, + "grad_norm": 0.322265625, + "learning_rate": 0.00018634700522964903, + "loss": 0.0911, + "step": 3560 + }, + { + "epoch": 0.168727789623312, + "grad_norm": 0.7734375, + "learning_rate": 0.00018633949244623615, + "loss": 1.1788, + "step": 3561 + }, + { + "epoch": 0.1687751717602464, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018633197774790535, + "loss": 0.0551, + "step": 3562 + }, + { + "epoch": 0.16882255389718076, + "grad_norm": 0.490234375, + "learning_rate": 0.00018632446113482317, + "loss": 0.9242, + "step": 3563 + }, + { + "epoch": 0.16886993603411513, + "grad_norm": 0.5625, + "learning_rate": 0.0001863169426071564, + "loss": 0.9733, + "step": 3564 + }, + { + "epoch": 0.16891731817104952, + "grad_norm": 0.58984375, + "learning_rate": 0.00018630942216507182, + "loss": 1.0315, + "step": 3565 + }, + { + "epoch": 0.16896470030798388, + "grad_norm": 0.91015625, + "learning_rate": 0.00018630189980873617, + "loss": 1.143, + "step": 3566 + }, + { + "epoch": 0.16901208244491828, + "grad_norm": 0.515625, + "learning_rate": 0.00018629437553831628, + "loss": 0.8955, + "step": 3567 + }, + { + "epoch": 0.16905946458185264, + "grad_norm": 0.291015625, + "learning_rate": 0.00018628684935397907, + "loss": 0.0415, + "step": 3568 + }, + { + "epoch": 0.169106846718787, + "grad_norm": 0.10791015625, + "learning_rate": 0.00018627932125589146, + "loss": 0.0113, + "step": 3569 + }, + { + "epoch": 0.1691542288557214, + "grad_norm": 0.447265625, + "learning_rate": 0.0001862717912442204, + "loss": 0.8555, + "step": 3570 + }, + { + "epoch": 0.16920161099265577, + "grad_norm": 0.5625, + "learning_rate": 0.00018626425931913293, + "loss": 0.1145, + "step": 3571 + }, + { + "epoch": 0.16924899312959013, + "grad_norm": 0.546875, + "learning_rate": 0.00018625672548079606, + "loss": 0.8483, + "step": 3572 + }, + { + "epoch": 0.16929637526652452, + "grad_norm": 0.57421875, + "learning_rate": 0.00018624918972937686, + "loss": 1.2921, + "step": 3573 + }, + { + "epoch": 0.1693437574034589, + "grad_norm": 0.62109375, + "learning_rate": 0.00018624165206504253, + "loss": 1.0215, + "step": 3574 + }, + { + "epoch": 0.16939113954039328, + "grad_norm": 0.07958984375, + "learning_rate": 0.0001862341124879602, + "loss": 0.0082, + "step": 3575 + }, + { + "epoch": 0.16943852167732765, + "grad_norm": 0.63671875, + "learning_rate": 0.00018622657099829715, + "loss": 1.2434, + "step": 3576 + }, + { + "epoch": 0.169485903814262, + "grad_norm": 0.5390625, + "learning_rate": 0.00018621902759622056, + "loss": 0.1075, + "step": 3577 + }, + { + "epoch": 0.1695332859511964, + "grad_norm": 0.91015625, + "learning_rate": 0.00018621148228189778, + "loss": 0.5215, + "step": 3578 + }, + { + "epoch": 0.16958066808813077, + "grad_norm": 0.625, + "learning_rate": 0.00018620393505549616, + "loss": 0.1551, + "step": 3579 + }, + { + "epoch": 0.16962805022506516, + "grad_norm": 0.314453125, + "learning_rate": 0.00018619638591718307, + "loss": 0.0896, + "step": 3580 + }, + { + "epoch": 0.16967543236199953, + "grad_norm": 0.5390625, + "learning_rate": 0.00018618883486712595, + "loss": 0.6216, + "step": 3581 + }, + { + "epoch": 0.1697228144989339, + "grad_norm": 0.7578125, + "learning_rate": 0.0001861812819054923, + "loss": 0.8138, + "step": 3582 + }, + { + "epoch": 0.1697701966358683, + "grad_norm": 0.6640625, + "learning_rate": 0.0001861737270324496, + "loss": 1.1879, + "step": 3583 + }, + { + "epoch": 0.16981757877280265, + "grad_norm": 0.6875, + "learning_rate": 0.0001861661702481654, + "loss": 1.7843, + "step": 3584 + }, + { + "epoch": 0.16986496090973702, + "grad_norm": 0.3515625, + "learning_rate": 0.00018615861155280735, + "loss": 0.1884, + "step": 3585 + }, + { + "epoch": 0.1699123430466714, + "grad_norm": 0.52734375, + "learning_rate": 0.0001861510509465431, + "loss": 0.214, + "step": 3586 + }, + { + "epoch": 0.16995972518360578, + "grad_norm": 0.625, + "learning_rate": 0.00018614348842954024, + "loss": 0.0472, + "step": 3587 + }, + { + "epoch": 0.17000710732054017, + "grad_norm": 0.859375, + "learning_rate": 0.0001861359240019666, + "loss": 0.7151, + "step": 3588 + }, + { + "epoch": 0.17005448945747453, + "grad_norm": 0.69140625, + "learning_rate": 0.00018612835766398987, + "loss": 1.2108, + "step": 3589 + }, + { + "epoch": 0.1701018715944089, + "grad_norm": 0.98046875, + "learning_rate": 0.00018612078941577799, + "loss": 1.048, + "step": 3590 + }, + { + "epoch": 0.1701492537313433, + "grad_norm": 0.5390625, + "learning_rate": 0.00018611321925749867, + "loss": 1.0845, + "step": 3591 + }, + { + "epoch": 0.17019663586827766, + "grad_norm": 0.6328125, + "learning_rate": 0.0001861056471893199, + "loss": 1.0258, + "step": 3592 + }, + { + "epoch": 0.17024401800521202, + "grad_norm": 0.625, + "learning_rate": 0.00018609807321140956, + "loss": 1.1531, + "step": 3593 + }, + { + "epoch": 0.17029140014214642, + "grad_norm": 0.412109375, + "learning_rate": 0.0001860904973239357, + "loss": 0.7115, + "step": 3594 + }, + { + "epoch": 0.17033878227908078, + "grad_norm": 0.65234375, + "learning_rate": 0.00018608291952706626, + "loss": 0.7323, + "step": 3595 + }, + { + "epoch": 0.17038616441601517, + "grad_norm": 0.53515625, + "learning_rate": 0.0001860753398209694, + "loss": 1.2005, + "step": 3596 + }, + { + "epoch": 0.17043354655294954, + "grad_norm": 0.375, + "learning_rate": 0.00018606775820581315, + "loss": 0.0186, + "step": 3597 + }, + { + "epoch": 0.1704809286898839, + "grad_norm": 0.62109375, + "learning_rate": 0.0001860601746817657, + "loss": 0.8583, + "step": 3598 + }, + { + "epoch": 0.1705283108268183, + "grad_norm": 0.62890625, + "learning_rate": 0.00018605258924899527, + "loss": 0.8947, + "step": 3599 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 0.482421875, + "learning_rate": 0.00018604500190767007, + "loss": 0.7192, + "step": 3600 + }, + { + "epoch": 0.17062307510068703, + "grad_norm": 0.6640625, + "learning_rate": 0.00018603741265795835, + "loss": 0.0833, + "step": 3601 + }, + { + "epoch": 0.17067045723762142, + "grad_norm": 0.58984375, + "learning_rate": 0.0001860298215000285, + "loss": 0.8278, + "step": 3602 + }, + { + "epoch": 0.17071783937455579, + "grad_norm": 0.83203125, + "learning_rate": 0.00018602222843404882, + "loss": 0.6577, + "step": 3603 + }, + { + "epoch": 0.17076522151149018, + "grad_norm": 0.76953125, + "learning_rate": 0.00018601463346018776, + "loss": 0.9069, + "step": 3604 + }, + { + "epoch": 0.17081260364842454, + "grad_norm": 0.49609375, + "learning_rate": 0.00018600703657861372, + "loss": 0.4135, + "step": 3605 + }, + { + "epoch": 0.1708599857853589, + "grad_norm": 0.65625, + "learning_rate": 0.00018599943778949523, + "loss": 0.2287, + "step": 3606 + }, + { + "epoch": 0.1709073679222933, + "grad_norm": 0.78125, + "learning_rate": 0.0001859918370930008, + "loss": 0.5911, + "step": 3607 + }, + { + "epoch": 0.17095475005922767, + "grad_norm": 0.2099609375, + "learning_rate": 0.00018598423448929906, + "loss": 0.0499, + "step": 3608 + }, + { + "epoch": 0.17100213219616206, + "grad_norm": 0.7421875, + "learning_rate": 0.00018597662997855855, + "loss": 1.2342, + "step": 3609 + }, + { + "epoch": 0.17104951433309643, + "grad_norm": 0.83984375, + "learning_rate": 0.00018596902356094797, + "loss": 1.1811, + "step": 3610 + }, + { + "epoch": 0.1710968964700308, + "grad_norm": 0.58203125, + "learning_rate": 0.00018596141523663601, + "loss": 1.1274, + "step": 3611 + }, + { + "epoch": 0.17114427860696518, + "grad_norm": 0.5546875, + "learning_rate": 0.00018595380500579142, + "loss": 0.9245, + "step": 3612 + }, + { + "epoch": 0.17119166074389955, + "grad_norm": 0.73828125, + "learning_rate": 0.00018594619286858301, + "loss": 0.4514, + "step": 3613 + }, + { + "epoch": 0.1712390428808339, + "grad_norm": 0.275390625, + "learning_rate": 0.00018593857882517957, + "loss": 0.0604, + "step": 3614 + }, + { + "epoch": 0.1712864250177683, + "grad_norm": 0.5859375, + "learning_rate": 0.00018593096287575, + "loss": 0.7656, + "step": 3615 + }, + { + "epoch": 0.17133380715470267, + "grad_norm": 0.059814453125, + "learning_rate": 0.0001859233450204632, + "loss": 0.0035, + "step": 3616 + }, + { + "epoch": 0.17138118929163706, + "grad_norm": 0.4921875, + "learning_rate": 0.00018591572525948814, + "loss": 0.8791, + "step": 3617 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 0.6484375, + "learning_rate": 0.0001859081035929938, + "loss": 1.2135, + "step": 3618 + }, + { + "epoch": 0.1714759535655058, + "grad_norm": 0.6015625, + "learning_rate": 0.0001859004800211492, + "loss": 0.5741, + "step": 3619 + }, + { + "epoch": 0.1715233357024402, + "grad_norm": 0.84765625, + "learning_rate": 0.00018589285454412348, + "loss": 0.5292, + "step": 3620 + }, + { + "epoch": 0.17157071783937455, + "grad_norm": 0.859375, + "learning_rate": 0.00018588522716208575, + "loss": 0.0636, + "step": 3621 + }, + { + "epoch": 0.17161809997630892, + "grad_norm": 0.65234375, + "learning_rate": 0.00018587759787520514, + "loss": 0.921, + "step": 3622 + }, + { + "epoch": 0.1716654821132433, + "grad_norm": 0.6171875, + "learning_rate": 0.0001858699666836509, + "loss": 1.152, + "step": 3623 + }, + { + "epoch": 0.17171286425017768, + "grad_norm": 0.54296875, + "learning_rate": 0.00018586233358759222, + "loss": 0.4444, + "step": 3624 + }, + { + "epoch": 0.17176024638711207, + "grad_norm": 0.625, + "learning_rate": 0.00018585469858719845, + "loss": 0.6908, + "step": 3625 + }, + { + "epoch": 0.17180762852404644, + "grad_norm": 0.67578125, + "learning_rate": 0.00018584706168263895, + "loss": 1.046, + "step": 3626 + }, + { + "epoch": 0.1718550106609808, + "grad_norm": 0.58203125, + "learning_rate": 0.00018583942287408303, + "loss": 1.1907, + "step": 3627 + }, + { + "epoch": 0.1719023927979152, + "grad_norm": 0.58984375, + "learning_rate": 0.00018583178216170017, + "loss": 0.9779, + "step": 3628 + }, + { + "epoch": 0.17194977493484956, + "grad_norm": 0.515625, + "learning_rate": 0.0001858241395456598, + "loss": 1.2669, + "step": 3629 + }, + { + "epoch": 0.17199715707178392, + "grad_norm": 0.39453125, + "learning_rate": 0.00018581649502613138, + "loss": 0.0819, + "step": 3630 + }, + { + "epoch": 0.17204453920871832, + "grad_norm": 0.50390625, + "learning_rate": 0.00018580884860328455, + "loss": 0.0316, + "step": 3631 + }, + { + "epoch": 0.17209192134565268, + "grad_norm": 0.474609375, + "learning_rate": 0.00018580120027728887, + "loss": 1.0357, + "step": 3632 + }, + { + "epoch": 0.17213930348258707, + "grad_norm": 0.6640625, + "learning_rate": 0.00018579355004831393, + "loss": 1.4687, + "step": 3633 + }, + { + "epoch": 0.17218668561952144, + "grad_norm": 0.8515625, + "learning_rate": 0.00018578589791652946, + "loss": 0.9925, + "step": 3634 + }, + { + "epoch": 0.1722340677564558, + "grad_norm": 0.546875, + "learning_rate": 0.00018577824388210515, + "loss": 0.9039, + "step": 3635 + }, + { + "epoch": 0.1722814498933902, + "grad_norm": 0.44921875, + "learning_rate": 0.00018577058794521075, + "loss": 0.0527, + "step": 3636 + }, + { + "epoch": 0.17232883203032456, + "grad_norm": 0.5546875, + "learning_rate": 0.00018576293010601604, + "loss": 0.5258, + "step": 3637 + }, + { + "epoch": 0.17237621416725896, + "grad_norm": 0.451171875, + "learning_rate": 0.0001857552703646909, + "loss": 0.1037, + "step": 3638 + }, + { + "epoch": 0.17242359630419332, + "grad_norm": 0.0888671875, + "learning_rate": 0.00018574760872140523, + "loss": 0.0104, + "step": 3639 + }, + { + "epoch": 0.1724709784411277, + "grad_norm": 0.55078125, + "learning_rate": 0.00018573994517632892, + "loss": 1.4852, + "step": 3640 + }, + { + "epoch": 0.17251836057806208, + "grad_norm": 0.6640625, + "learning_rate": 0.00018573227972963196, + "loss": 1.1529, + "step": 3641 + }, + { + "epoch": 0.17256574271499645, + "grad_norm": 0.6015625, + "learning_rate": 0.00018572461238148432, + "loss": 1.4178, + "step": 3642 + }, + { + "epoch": 0.1726131248519308, + "grad_norm": 1.15625, + "learning_rate": 0.00018571694313205614, + "loss": 0.4535, + "step": 3643 + }, + { + "epoch": 0.1726605069888652, + "grad_norm": 0.6484375, + "learning_rate": 0.00018570927198151743, + "loss": 0.0695, + "step": 3644 + }, + { + "epoch": 0.17270788912579957, + "grad_norm": 1.078125, + "learning_rate": 0.00018570159893003835, + "loss": 0.0868, + "step": 3645 + }, + { + "epoch": 0.17275527126273396, + "grad_norm": 0.54296875, + "learning_rate": 0.0001856939239777891, + "loss": 0.6132, + "step": 3646 + }, + { + "epoch": 0.17280265339966833, + "grad_norm": 0.609375, + "learning_rate": 0.0001856862471249399, + "loss": 1.2419, + "step": 3647 + }, + { + "epoch": 0.1728500355366027, + "grad_norm": 0.482421875, + "learning_rate": 0.00018567856837166104, + "loss": 0.3735, + "step": 3648 + }, + { + "epoch": 0.17289741767353708, + "grad_norm": 0.55078125, + "learning_rate": 0.00018567088771812275, + "loss": 0.7501, + "step": 3649 + }, + { + "epoch": 0.17294479981047145, + "grad_norm": 0.59765625, + "learning_rate": 0.00018566320516449545, + "loss": 1.2527, + "step": 3650 + }, + { + "epoch": 0.17299218194740582, + "grad_norm": 0.5390625, + "learning_rate": 0.00018565552071094947, + "loss": 1.0334, + "step": 3651 + }, + { + "epoch": 0.1730395640843402, + "grad_norm": 0.58203125, + "learning_rate": 0.0001856478343576553, + "loss": 0.0948, + "step": 3652 + }, + { + "epoch": 0.17308694622127457, + "grad_norm": 0.8515625, + "learning_rate": 0.0001856401461047834, + "loss": 0.1733, + "step": 3653 + }, + { + "epoch": 0.17313432835820897, + "grad_norm": 0.55859375, + "learning_rate": 0.00018563245595250427, + "loss": 1.0906, + "step": 3654 + }, + { + "epoch": 0.17318171049514333, + "grad_norm": 0.5546875, + "learning_rate": 0.00018562476390098848, + "loss": 0.8008, + "step": 3655 + }, + { + "epoch": 0.1732290926320777, + "grad_norm": 0.23046875, + "learning_rate": 0.00018561706995040661, + "loss": 0.0755, + "step": 3656 + }, + { + "epoch": 0.1732764747690121, + "grad_norm": 0.609375, + "learning_rate": 0.00018560937410092934, + "loss": 1.0517, + "step": 3657 + }, + { + "epoch": 0.17332385690594646, + "grad_norm": 0.58984375, + "learning_rate": 0.00018560167635272735, + "loss": 1.3768, + "step": 3658 + }, + { + "epoch": 0.17337123904288082, + "grad_norm": 0.234375, + "learning_rate": 0.00018559397670597135, + "loss": 0.0153, + "step": 3659 + }, + { + "epoch": 0.1734186211798152, + "grad_norm": 0.59375, + "learning_rate": 0.00018558627516083212, + "loss": 1.3332, + "step": 3660 + }, + { + "epoch": 0.17346600331674958, + "grad_norm": 0.23828125, + "learning_rate": 0.00018557857171748045, + "loss": 0.1742, + "step": 3661 + }, + { + "epoch": 0.17351338545368397, + "grad_norm": 0.66015625, + "learning_rate": 0.00018557086637608722, + "loss": 0.9789, + "step": 3662 + }, + { + "epoch": 0.17356076759061834, + "grad_norm": 0.36328125, + "learning_rate": 0.00018556315913682335, + "loss": 0.1682, + "step": 3663 + }, + { + "epoch": 0.1736081497275527, + "grad_norm": 0.67578125, + "learning_rate": 0.00018555544999985973, + "loss": 1.0887, + "step": 3664 + }, + { + "epoch": 0.1736555318644871, + "grad_norm": 0.64453125, + "learning_rate": 0.00018554773896536735, + "loss": 1.5074, + "step": 3665 + }, + { + "epoch": 0.17370291400142146, + "grad_norm": 0.060302734375, + "learning_rate": 0.00018554002603351724, + "loss": 0.0026, + "step": 3666 + }, + { + "epoch": 0.17375029613835585, + "grad_norm": 0.375, + "learning_rate": 0.00018553231120448048, + "loss": 0.0545, + "step": 3667 + }, + { + "epoch": 0.17379767827529022, + "grad_norm": 0.6015625, + "learning_rate": 0.00018552459447842813, + "loss": 1.0061, + "step": 3668 + }, + { + "epoch": 0.17384506041222458, + "grad_norm": 0.68359375, + "learning_rate": 0.0001855168758555314, + "loss": 1.1792, + "step": 3669 + }, + { + "epoch": 0.17389244254915898, + "grad_norm": 0.3203125, + "learning_rate": 0.00018550915533596145, + "loss": 0.0447, + "step": 3670 + }, + { + "epoch": 0.17393982468609334, + "grad_norm": 0.6171875, + "learning_rate": 0.0001855014329198895, + "loss": 1.3617, + "step": 3671 + }, + { + "epoch": 0.1739872068230277, + "grad_norm": 0.65234375, + "learning_rate": 0.00018549370860748688, + "loss": 0.8235, + "step": 3672 + }, + { + "epoch": 0.1740345889599621, + "grad_norm": 0.62890625, + "learning_rate": 0.00018548598239892484, + "loss": 1.2566, + "step": 3673 + }, + { + "epoch": 0.17408197109689647, + "grad_norm": 0.5703125, + "learning_rate": 0.00018547825429437474, + "loss": 0.8101, + "step": 3674 + }, + { + "epoch": 0.17412935323383086, + "grad_norm": 0.48828125, + "learning_rate": 0.00018547052429400803, + "loss": 1.1601, + "step": 3675 + }, + { + "epoch": 0.17417673537076522, + "grad_norm": 0.66015625, + "learning_rate": 0.00018546279239799613, + "loss": 0.0429, + "step": 3676 + }, + { + "epoch": 0.1742241175076996, + "grad_norm": 0.65625, + "learning_rate": 0.00018545505860651055, + "loss": 1.0094, + "step": 3677 + }, + { + "epoch": 0.17427149964463398, + "grad_norm": 0.4921875, + "learning_rate": 0.00018544732291972274, + "loss": 1.0445, + "step": 3678 + }, + { + "epoch": 0.17431888178156835, + "grad_norm": 0.64453125, + "learning_rate": 0.00018543958533780437, + "loss": 0.3371, + "step": 3679 + }, + { + "epoch": 0.1743662639185027, + "grad_norm": 0.6484375, + "learning_rate": 0.000185431845860927, + "loss": 0.7795, + "step": 3680 + }, + { + "epoch": 0.1744136460554371, + "grad_norm": 0.5546875, + "learning_rate": 0.00018542410448926227, + "loss": 0.0906, + "step": 3681 + }, + { + "epoch": 0.17446102819237147, + "grad_norm": 0.609375, + "learning_rate": 0.00018541636122298188, + "loss": 0.7835, + "step": 3682 + }, + { + "epoch": 0.17450841032930586, + "grad_norm": 0.40625, + "learning_rate": 0.0001854086160622576, + "loss": 0.36, + "step": 3683 + }, + { + "epoch": 0.17455579246624023, + "grad_norm": 0.34375, + "learning_rate": 0.00018540086900726118, + "loss": 0.0148, + "step": 3684 + }, + { + "epoch": 0.1746031746031746, + "grad_norm": 0.64453125, + "learning_rate": 0.00018539312005816445, + "loss": 1.1809, + "step": 3685 + }, + { + "epoch": 0.174650556740109, + "grad_norm": 0.6484375, + "learning_rate": 0.00018538536921513927, + "loss": 0.9764, + "step": 3686 + }, + { + "epoch": 0.17469793887704335, + "grad_norm": 0.5546875, + "learning_rate": 0.00018537761647835754, + "loss": 0.9014, + "step": 3687 + }, + { + "epoch": 0.17474532101397772, + "grad_norm": 0.7421875, + "learning_rate": 0.00018536986184799123, + "loss": 0.2781, + "step": 3688 + }, + { + "epoch": 0.1747927031509121, + "grad_norm": 0.68359375, + "learning_rate": 0.00018536210532421233, + "loss": 0.8563, + "step": 3689 + }, + { + "epoch": 0.17484008528784648, + "grad_norm": 0.32421875, + "learning_rate": 0.00018535434690719285, + "loss": 0.0401, + "step": 3690 + }, + { + "epoch": 0.17488746742478087, + "grad_norm": 0.81640625, + "learning_rate": 0.00018534658659710487, + "loss": 1.1854, + "step": 3691 + }, + { + "epoch": 0.17493484956171523, + "grad_norm": 0.66015625, + "learning_rate": 0.0001853388243941205, + "loss": 1.3527, + "step": 3692 + }, + { + "epoch": 0.1749822316986496, + "grad_norm": 0.4921875, + "learning_rate": 0.00018533106029841188, + "loss": 1.0618, + "step": 3693 + }, + { + "epoch": 0.175029613835584, + "grad_norm": 0.234375, + "learning_rate": 0.00018532329431015127, + "loss": 0.1632, + "step": 3694 + }, + { + "epoch": 0.17507699597251836, + "grad_norm": 0.609375, + "learning_rate": 0.00018531552642951087, + "loss": 1.3013, + "step": 3695 + }, + { + "epoch": 0.17512437810945275, + "grad_norm": 0.78125, + "learning_rate": 0.00018530775665666295, + "loss": 1.2847, + "step": 3696 + }, + { + "epoch": 0.17517176024638711, + "grad_norm": 0.77734375, + "learning_rate": 0.0001852999849917799, + "loss": 0.2259, + "step": 3697 + }, + { + "epoch": 0.17521914238332148, + "grad_norm": 1.6328125, + "learning_rate": 0.000185292211435034, + "loss": 1.0169, + "step": 3698 + }, + { + "epoch": 0.17526652452025587, + "grad_norm": 0.87890625, + "learning_rate": 0.00018528443598659768, + "loss": 0.9275, + "step": 3699 + }, + { + "epoch": 0.17531390665719024, + "grad_norm": 0.671875, + "learning_rate": 0.00018527665864664344, + "loss": 0.1529, + "step": 3700 + }, + { + "epoch": 0.1753612887941246, + "grad_norm": 0.58203125, + "learning_rate": 0.00018526887941534373, + "loss": 1.0, + "step": 3701 + }, + { + "epoch": 0.175408670931059, + "grad_norm": 0.5078125, + "learning_rate": 0.00018526109829287112, + "loss": 0.6171, + "step": 3702 + }, + { + "epoch": 0.17545605306799336, + "grad_norm": 0.341796875, + "learning_rate": 0.00018525331527939818, + "loss": 0.1511, + "step": 3703 + }, + { + "epoch": 0.17550343520492775, + "grad_norm": 0.099609375, + "learning_rate": 0.00018524553037509747, + "loss": 0.0093, + "step": 3704 + }, + { + "epoch": 0.17555081734186212, + "grad_norm": 0.65234375, + "learning_rate": 0.00018523774358014173, + "loss": 1.238, + "step": 3705 + }, + { + "epoch": 0.17559819947879649, + "grad_norm": 0.51171875, + "learning_rate": 0.0001852299548947036, + "loss": 0.6388, + "step": 3706 + }, + { + "epoch": 0.17564558161573088, + "grad_norm": 0.6015625, + "learning_rate": 0.00018522216431895587, + "loss": 0.8282, + "step": 3707 + }, + { + "epoch": 0.17569296375266524, + "grad_norm": 0.53125, + "learning_rate": 0.00018521437185307132, + "loss": 0.5269, + "step": 3708 + }, + { + "epoch": 0.1757403458895996, + "grad_norm": 0.466796875, + "learning_rate": 0.00018520657749722275, + "loss": 0.666, + "step": 3709 + }, + { + "epoch": 0.175787728026534, + "grad_norm": 0.53125, + "learning_rate": 0.00018519878125158306, + "loss": 0.1391, + "step": 3710 + }, + { + "epoch": 0.17583511016346837, + "grad_norm": 0.43359375, + "learning_rate": 0.00018519098311632512, + "loss": 0.1767, + "step": 3711 + }, + { + "epoch": 0.17588249230040276, + "grad_norm": 0.61328125, + "learning_rate": 0.00018518318309162195, + "loss": 0.7675, + "step": 3712 + }, + { + "epoch": 0.17592987443733712, + "grad_norm": 0.4921875, + "learning_rate": 0.0001851753811776465, + "loss": 0.9559, + "step": 3713 + }, + { + "epoch": 0.1759772565742715, + "grad_norm": 0.65234375, + "learning_rate": 0.00018516757737457182, + "loss": 1.0426, + "step": 3714 + }, + { + "epoch": 0.17602463871120588, + "grad_norm": 1.328125, + "learning_rate": 0.000185159771682571, + "loss": 1.6759, + "step": 3715 + }, + { + "epoch": 0.17607202084814025, + "grad_norm": 0.62109375, + "learning_rate": 0.00018515196410181714, + "loss": 1.2135, + "step": 3716 + }, + { + "epoch": 0.1761194029850746, + "grad_norm": 0.53125, + "learning_rate": 0.00018514415463248343, + "loss": 1.1519, + "step": 3717 + }, + { + "epoch": 0.176166785122009, + "grad_norm": 0.734375, + "learning_rate": 0.00018513634327474305, + "loss": 1.2489, + "step": 3718 + }, + { + "epoch": 0.17621416725894337, + "grad_norm": 0.68359375, + "learning_rate": 0.00018512853002876928, + "loss": 1.2299, + "step": 3719 + }, + { + "epoch": 0.17626154939587776, + "grad_norm": 0.703125, + "learning_rate": 0.00018512071489473536, + "loss": 0.8664, + "step": 3720 + }, + { + "epoch": 0.17630893153281213, + "grad_norm": 0.609375, + "learning_rate": 0.00018511289787281467, + "loss": 0.5929, + "step": 3721 + }, + { + "epoch": 0.1763563136697465, + "grad_norm": 0.515625, + "learning_rate": 0.00018510507896318056, + "loss": 0.1134, + "step": 3722 + }, + { + "epoch": 0.1764036958066809, + "grad_norm": 0.5390625, + "learning_rate": 0.00018509725816600643, + "loss": 0.9311, + "step": 3723 + }, + { + "epoch": 0.17645107794361525, + "grad_norm": 0.58203125, + "learning_rate": 0.0001850894354814658, + "loss": 1.0983, + "step": 3724 + }, + { + "epoch": 0.17649846008054965, + "grad_norm": 0.6953125, + "learning_rate": 0.00018508161090973206, + "loss": 0.3915, + "step": 3725 + }, + { + "epoch": 0.176545842217484, + "grad_norm": 0.515625, + "learning_rate": 0.00018507378445097885, + "loss": 0.7702, + "step": 3726 + }, + { + "epoch": 0.17659322435441838, + "grad_norm": 0.59375, + "learning_rate": 0.0001850659561053797, + "loss": 0.9071, + "step": 3727 + }, + { + "epoch": 0.17664060649135277, + "grad_norm": 0.703125, + "learning_rate": 0.00018505812587310829, + "loss": 1.133, + "step": 3728 + }, + { + "epoch": 0.17668798862828713, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001850502937543382, + "loss": 0.0478, + "step": 3729 + }, + { + "epoch": 0.1767353707652215, + "grad_norm": 0.26171875, + "learning_rate": 0.00018504245974924324, + "loss": 0.0362, + "step": 3730 + }, + { + "epoch": 0.1767827529021559, + "grad_norm": 0.7734375, + "learning_rate": 0.00018503462385799707, + "loss": 1.0033, + "step": 3731 + }, + { + "epoch": 0.17683013503909026, + "grad_norm": 0.54296875, + "learning_rate": 0.00018502678608077355, + "loss": 1.1681, + "step": 3732 + }, + { + "epoch": 0.17687751717602465, + "grad_norm": 0.3828125, + "learning_rate": 0.00018501894641774643, + "loss": 0.0484, + "step": 3733 + }, + { + "epoch": 0.17692489931295902, + "grad_norm": 0.6796875, + "learning_rate": 0.00018501110486908968, + "loss": 0.8182, + "step": 3734 + }, + { + "epoch": 0.17697228144989338, + "grad_norm": 0.74609375, + "learning_rate": 0.00018500326143497715, + "loss": 0.9496, + "step": 3735 + }, + { + "epoch": 0.17701966358682777, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018499541611558283, + "loss": 0.1437, + "step": 3736 + }, + { + "epoch": 0.17706704572376214, + "grad_norm": 0.52734375, + "learning_rate": 0.00018498756891108072, + "loss": 0.1331, + "step": 3737 + }, + { + "epoch": 0.1771144278606965, + "grad_norm": 1.0625, + "learning_rate": 0.00018497971982164483, + "loss": 1.0338, + "step": 3738 + }, + { + "epoch": 0.1771618099976309, + "grad_norm": 0.28515625, + "learning_rate": 0.0001849718688474493, + "loss": 0.0395, + "step": 3739 + }, + { + "epoch": 0.17720919213456526, + "grad_norm": 0.5625, + "learning_rate": 0.0001849640159886682, + "loss": 0.9706, + "step": 3740 + }, + { + "epoch": 0.17725657427149966, + "grad_norm": 0.578125, + "learning_rate": 0.00018495616124547578, + "loss": 0.9039, + "step": 3741 + }, + { + "epoch": 0.17730395640843402, + "grad_norm": 0.384765625, + "learning_rate": 0.00018494830461804617, + "loss": 0.09, + "step": 3742 + }, + { + "epoch": 0.1773513385453684, + "grad_norm": 0.7421875, + "learning_rate": 0.00018494044610655358, + "loss": 0.9534, + "step": 3743 + }, + { + "epoch": 0.17739872068230278, + "grad_norm": 0.58984375, + "learning_rate": 0.00018493258571117244, + "loss": 1.2597, + "step": 3744 + }, + { + "epoch": 0.17744610281923714, + "grad_norm": 0.4453125, + "learning_rate": 0.000184924723432077, + "loss": 0.0954, + "step": 3745 + }, + { + "epoch": 0.1774934849561715, + "grad_norm": 0.427734375, + "learning_rate": 0.00018491685926944165, + "loss": 0.0738, + "step": 3746 + }, + { + "epoch": 0.1775408670931059, + "grad_norm": 0.73828125, + "learning_rate": 0.00018490899322344082, + "loss": 1.2257, + "step": 3747 + }, + { + "epoch": 0.17758824923004027, + "grad_norm": 0.671875, + "learning_rate": 0.00018490112529424897, + "loss": 1.2488, + "step": 3748 + }, + { + "epoch": 0.17763563136697466, + "grad_norm": 0.69140625, + "learning_rate": 0.00018489325548204057, + "loss": 1.0829, + "step": 3749 + }, + { + "epoch": 0.17768301350390903, + "grad_norm": 0.90625, + "learning_rate": 0.00018488538378699017, + "loss": 1.1709, + "step": 3750 + }, + { + "epoch": 0.1777303956408434, + "grad_norm": 0.55859375, + "learning_rate": 0.00018487751020927239, + "loss": 0.1868, + "step": 3751 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.5625, + "learning_rate": 0.00018486963474906187, + "loss": 0.0744, + "step": 3752 + }, + { + "epoch": 0.17782515991471215, + "grad_norm": 0.57421875, + "learning_rate": 0.00018486175740653322, + "loss": 1.1554, + "step": 3753 + }, + { + "epoch": 0.17787254205164654, + "grad_norm": 0.69921875, + "learning_rate": 0.0001848538781818612, + "loss": 1.1292, + "step": 3754 + }, + { + "epoch": 0.1779199241885809, + "grad_norm": 0.65234375, + "learning_rate": 0.00018484599707522053, + "loss": 0.9446, + "step": 3755 + }, + { + "epoch": 0.17796730632551527, + "grad_norm": 0.55859375, + "learning_rate": 0.00018483811408678602, + "loss": 1.2564, + "step": 3756 + }, + { + "epoch": 0.17801468846244967, + "grad_norm": 0.69140625, + "learning_rate": 0.00018483022921673249, + "loss": 1.0344, + "step": 3757 + }, + { + "epoch": 0.17806207059938403, + "grad_norm": 0.9765625, + "learning_rate": 0.00018482234246523487, + "loss": 1.0584, + "step": 3758 + }, + { + "epoch": 0.1781094527363184, + "grad_norm": 0.6328125, + "learning_rate": 0.00018481445383246802, + "loss": 0.8575, + "step": 3759 + }, + { + "epoch": 0.1781568348732528, + "grad_norm": 0.51953125, + "learning_rate": 0.00018480656331860694, + "loss": 0.7026, + "step": 3760 + }, + { + "epoch": 0.17820421701018715, + "grad_norm": 0.9296875, + "learning_rate": 0.0001847986709238266, + "loss": 0.1739, + "step": 3761 + }, + { + "epoch": 0.17825159914712155, + "grad_norm": 1.0703125, + "learning_rate": 0.00018479077664830206, + "loss": 1.1396, + "step": 3762 + }, + { + "epoch": 0.1782989812840559, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018478288049220842, + "loss": 0.1417, + "step": 3763 + }, + { + "epoch": 0.17834636342099028, + "grad_norm": 0.7265625, + "learning_rate": 0.0001847749824557208, + "loss": 1.2362, + "step": 3764 + }, + { + "epoch": 0.17839374555792467, + "grad_norm": 0.4140625, + "learning_rate": 0.00018476708253901435, + "loss": 0.0794, + "step": 3765 + }, + { + "epoch": 0.17844112769485904, + "grad_norm": 0.50390625, + "learning_rate": 0.00018475918074226433, + "loss": 1.0855, + "step": 3766 + }, + { + "epoch": 0.1784885098317934, + "grad_norm": 0.09765625, + "learning_rate": 0.00018475127706564593, + "loss": 0.002, + "step": 3767 + }, + { + "epoch": 0.1785358919687278, + "grad_norm": 0.28515625, + "learning_rate": 0.0001847433715093345, + "loss": 0.0393, + "step": 3768 + }, + { + "epoch": 0.17858327410566216, + "grad_norm": 0.7109375, + "learning_rate": 0.00018473546407350532, + "loss": 0.8788, + "step": 3769 + }, + { + "epoch": 0.17863065624259655, + "grad_norm": 0.7890625, + "learning_rate": 0.00018472755475833384, + "loss": 0.5201, + "step": 3770 + }, + { + "epoch": 0.17867803837953092, + "grad_norm": 0.81640625, + "learning_rate": 0.00018471964356399546, + "loss": 0.8121, + "step": 3771 + }, + { + "epoch": 0.17872542051646528, + "grad_norm": 0.9140625, + "learning_rate": 0.0001847117304906656, + "loss": 1.2936, + "step": 3772 + }, + { + "epoch": 0.17877280265339968, + "grad_norm": 0.02099609375, + "learning_rate": 0.00018470381553851976, + "loss": 0.0016, + "step": 3773 + }, + { + "epoch": 0.17882018479033404, + "grad_norm": 0.625, + "learning_rate": 0.00018469589870773355, + "loss": 1.0161, + "step": 3774 + }, + { + "epoch": 0.1788675669272684, + "grad_norm": 0.546875, + "learning_rate": 0.0001846879799984825, + "loss": 1.019, + "step": 3775 + }, + { + "epoch": 0.1789149490642028, + "grad_norm": 0.97265625, + "learning_rate": 0.00018468005941094226, + "loss": 0.4715, + "step": 3776 + }, + { + "epoch": 0.17896233120113716, + "grad_norm": 0.66796875, + "learning_rate": 0.00018467213694528853, + "loss": 0.1571, + "step": 3777 + }, + { + "epoch": 0.17900971333807156, + "grad_norm": 0.609375, + "learning_rate": 0.00018466421260169695, + "loss": 1.4319, + "step": 3778 + }, + { + "epoch": 0.17905709547500592, + "grad_norm": 0.482421875, + "learning_rate": 0.00018465628638034332, + "loss": 1.1197, + "step": 3779 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.59375, + "learning_rate": 0.00018464835828140347, + "loss": 1.3157, + "step": 3780 + }, + { + "epoch": 0.17915185974887468, + "grad_norm": 0.65234375, + "learning_rate": 0.00018464042830505317, + "loss": 1.264, + "step": 3781 + }, + { + "epoch": 0.17919924188580905, + "grad_norm": 0.48828125, + "learning_rate": 0.00018463249645146834, + "loss": 0.2718, + "step": 3782 + }, + { + "epoch": 0.17924662402274344, + "grad_norm": 0.50390625, + "learning_rate": 0.00018462456272082487, + "loss": 0.7724, + "step": 3783 + }, + { + "epoch": 0.1792940061596778, + "grad_norm": 0.2001953125, + "learning_rate": 0.00018461662711329876, + "loss": 0.0419, + "step": 3784 + }, + { + "epoch": 0.17934138829661217, + "grad_norm": 1.0859375, + "learning_rate": 0.00018460868962906594, + "loss": 0.3372, + "step": 3785 + }, + { + "epoch": 0.17938877043354656, + "grad_norm": 0.435546875, + "learning_rate": 0.00018460075026830252, + "loss": 1.0154, + "step": 3786 + }, + { + "epoch": 0.17943615257048093, + "grad_norm": 0.59765625, + "learning_rate": 0.0001845928090311846, + "loss": 0.8292, + "step": 3787 + }, + { + "epoch": 0.1794835347074153, + "grad_norm": 0.244140625, + "learning_rate": 0.00018458486591788826, + "loss": 0.1723, + "step": 3788 + }, + { + "epoch": 0.17953091684434969, + "grad_norm": 0.5546875, + "learning_rate": 0.0001845769209285897, + "loss": 0.6954, + "step": 3789 + }, + { + "epoch": 0.17957829898128405, + "grad_norm": 0.6640625, + "learning_rate": 0.0001845689740634651, + "loss": 0.8468, + "step": 3790 + }, + { + "epoch": 0.17962568111821844, + "grad_norm": 0.6640625, + "learning_rate": 0.00018456102532269077, + "loss": 1.1859, + "step": 3791 + }, + { + "epoch": 0.1796730632551528, + "grad_norm": 0.208984375, + "learning_rate": 0.00018455307470644294, + "loss": 0.1165, + "step": 3792 + }, + { + "epoch": 0.17972044539208717, + "grad_norm": 0.640625, + "learning_rate": 0.000184545122214898, + "loss": 0.6493, + "step": 3793 + }, + { + "epoch": 0.17976782752902157, + "grad_norm": 0.244140625, + "learning_rate": 0.00018453716784823227, + "loss": 0.1474, + "step": 3794 + }, + { + "epoch": 0.17981520966595593, + "grad_norm": 0.67578125, + "learning_rate": 0.00018452921160662223, + "loss": 0.1255, + "step": 3795 + }, + { + "epoch": 0.1798625918028903, + "grad_norm": 0.58984375, + "learning_rate": 0.0001845212534902443, + "loss": 0.6474, + "step": 3796 + }, + { + "epoch": 0.1799099739398247, + "grad_norm": 0.70703125, + "learning_rate": 0.000184513293499275, + "loss": 0.7447, + "step": 3797 + }, + { + "epoch": 0.17995735607675906, + "grad_norm": 0.5078125, + "learning_rate": 0.00018450533163389085, + "loss": 0.9997, + "step": 3798 + }, + { + "epoch": 0.18000473821369345, + "grad_norm": 0.5703125, + "learning_rate": 0.00018449736789426848, + "loss": 1.0661, + "step": 3799 + }, + { + "epoch": 0.18005212035062781, + "grad_norm": 0.6953125, + "learning_rate": 0.0001844894022805845, + "loss": 1.5189, + "step": 3800 + }, + { + "epoch": 0.18009950248756218, + "grad_norm": 0.2001953125, + "learning_rate": 0.00018448143479301554, + "loss": 0.0322, + "step": 3801 + }, + { + "epoch": 0.18014688462449657, + "grad_norm": 0.76171875, + "learning_rate": 0.00018447346543173836, + "loss": 0.9759, + "step": 3802 + }, + { + "epoch": 0.18019426676143094, + "grad_norm": 0.5703125, + "learning_rate": 0.00018446549419692972, + "loss": 1.4936, + "step": 3803 + }, + { + "epoch": 0.1802416488983653, + "grad_norm": 0.58984375, + "learning_rate": 0.00018445752108876633, + "loss": 1.0951, + "step": 3804 + }, + { + "epoch": 0.1802890310352997, + "grad_norm": 0.70703125, + "learning_rate": 0.00018444954610742512, + "loss": 0.0722, + "step": 3805 + }, + { + "epoch": 0.18033641317223406, + "grad_norm": 0.248046875, + "learning_rate": 0.0001844415692530829, + "loss": 0.0524, + "step": 3806 + }, + { + "epoch": 0.18038379530916845, + "grad_norm": 0.431640625, + "learning_rate": 0.00018443359052591665, + "loss": 0.0333, + "step": 3807 + }, + { + "epoch": 0.18043117744610282, + "grad_norm": 0.67578125, + "learning_rate": 0.0001844256099261033, + "loss": 0.6686, + "step": 3808 + }, + { + "epoch": 0.18047855958303718, + "grad_norm": 0.5859375, + "learning_rate": 0.0001844176274538198, + "loss": 1.0334, + "step": 3809 + }, + { + "epoch": 0.18052594171997158, + "grad_norm": 0.69140625, + "learning_rate": 0.00018440964310924328, + "loss": 0.979, + "step": 3810 + }, + { + "epoch": 0.18057332385690594, + "grad_norm": 0.4921875, + "learning_rate": 0.0001844016568925508, + "loss": 0.6279, + "step": 3811 + }, + { + "epoch": 0.18062070599384034, + "grad_norm": 0.158203125, + "learning_rate": 0.00018439366880391943, + "loss": 0.0123, + "step": 3812 + }, + { + "epoch": 0.1806680881307747, + "grad_norm": 0.51171875, + "learning_rate": 0.0001843856788435264, + "loss": 1.1228, + "step": 3813 + }, + { + "epoch": 0.18071547026770907, + "grad_norm": 0.71875, + "learning_rate": 0.00018437768701154891, + "loss": 1.2088, + "step": 3814 + }, + { + "epoch": 0.18076285240464346, + "grad_norm": 0.67578125, + "learning_rate": 0.00018436969330816417, + "loss": 0.9791, + "step": 3815 + }, + { + "epoch": 0.18081023454157782, + "grad_norm": 0.6484375, + "learning_rate": 0.00018436169773354953, + "loss": 1.1451, + "step": 3816 + }, + { + "epoch": 0.1808576166785122, + "grad_norm": 0.53515625, + "learning_rate": 0.00018435370028788226, + "loss": 0.9227, + "step": 3817 + }, + { + "epoch": 0.18090499881544658, + "grad_norm": 0.578125, + "learning_rate": 0.00018434570097133977, + "loss": 1.1372, + "step": 3818 + }, + { + "epoch": 0.18095238095238095, + "grad_norm": 0.5234375, + "learning_rate": 0.0001843376997840995, + "loss": 0.8091, + "step": 3819 + }, + { + "epoch": 0.18099976308931534, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018432969672633887, + "loss": 0.1405, + "step": 3820 + }, + { + "epoch": 0.1810471452262497, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018432169179823538, + "loss": 0.1652, + "step": 3821 + }, + { + "epoch": 0.18109452736318407, + "grad_norm": 0.28515625, + "learning_rate": 0.00018431368499996658, + "loss": 0.1791, + "step": 3822 + }, + { + "epoch": 0.18114190950011846, + "grad_norm": 0.0069580078125, + "learning_rate": 0.00018430567633171004, + "loss": 0.0005, + "step": 3823 + }, + { + "epoch": 0.18118929163705283, + "grad_norm": 0.58984375, + "learning_rate": 0.00018429766579364343, + "loss": 0.8203, + "step": 3824 + }, + { + "epoch": 0.1812366737739872, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018428965338594436, + "loss": 0.1378, + "step": 3825 + }, + { + "epoch": 0.1812840559109216, + "grad_norm": 0.81640625, + "learning_rate": 0.00018428163910879058, + "loss": 1.7155, + "step": 3826 + }, + { + "epoch": 0.18133143804785595, + "grad_norm": 0.59765625, + "learning_rate": 0.0001842736229623598, + "loss": 0.8864, + "step": 3827 + }, + { + "epoch": 0.18137882018479035, + "grad_norm": 0.65234375, + "learning_rate": 0.00018426560494682982, + "loss": 1.245, + "step": 3828 + }, + { + "epoch": 0.1814262023217247, + "grad_norm": 0.48828125, + "learning_rate": 0.00018425758506237852, + "loss": 0.1062, + "step": 3829 + }, + { + "epoch": 0.18147358445865908, + "grad_norm": 0.263671875, + "learning_rate": 0.00018424956330918369, + "loss": 0.1808, + "step": 3830 + }, + { + "epoch": 0.18152096659559347, + "grad_norm": 0.74609375, + "learning_rate": 0.0001842415396874233, + "loss": 0.4874, + "step": 3831 + }, + { + "epoch": 0.18156834873252783, + "grad_norm": 0.609375, + "learning_rate": 0.0001842335141972753, + "loss": 1.0836, + "step": 3832 + }, + { + "epoch": 0.1816157308694622, + "grad_norm": 1.1640625, + "learning_rate": 0.00018422548683891764, + "loss": 0.5258, + "step": 3833 + }, + { + "epoch": 0.1816631130063966, + "grad_norm": 0.447265625, + "learning_rate": 0.0001842174576125284, + "loss": 0.5478, + "step": 3834 + }, + { + "epoch": 0.18171049514333096, + "grad_norm": 0.51953125, + "learning_rate": 0.0001842094265182857, + "loss": 0.8814, + "step": 3835 + }, + { + "epoch": 0.18175787728026535, + "grad_norm": 0.275390625, + "learning_rate": 0.00018420139355636756, + "loss": 0.0369, + "step": 3836 + }, + { + "epoch": 0.18180525941719972, + "grad_norm": 0.54296875, + "learning_rate": 0.00018419335872695225, + "loss": 0.8465, + "step": 3837 + }, + { + "epoch": 0.18185264155413408, + "grad_norm": 0.55859375, + "learning_rate": 0.00018418532203021788, + "loss": 0.9397, + "step": 3838 + }, + { + "epoch": 0.18190002369106847, + "grad_norm": 0.59375, + "learning_rate": 0.00018417728346634276, + "loss": 1.1063, + "step": 3839 + }, + { + "epoch": 0.18194740582800284, + "grad_norm": 0.0546875, + "learning_rate": 0.00018416924303550516, + "loss": 0.0021, + "step": 3840 + }, + { + "epoch": 0.18199478796493723, + "grad_norm": 0.68359375, + "learning_rate": 0.00018416120073788342, + "loss": 0.9548, + "step": 3841 + }, + { + "epoch": 0.1820421701018716, + "grad_norm": 0.4453125, + "learning_rate": 0.00018415315657365584, + "loss": 0.0584, + "step": 3842 + }, + { + "epoch": 0.18208955223880596, + "grad_norm": 0.57421875, + "learning_rate": 0.0001841451105430009, + "loss": 0.9115, + "step": 3843 + }, + { + "epoch": 0.18213693437574036, + "grad_norm": 0.7265625, + "learning_rate": 0.000184137062646097, + "loss": 1.344, + "step": 3844 + }, + { + "epoch": 0.18218431651267472, + "grad_norm": 0.51953125, + "learning_rate": 0.00018412901288312276, + "loss": 0.7186, + "step": 3845 + }, + { + "epoch": 0.1822316986496091, + "grad_norm": 0.5390625, + "learning_rate": 0.00018412096125425658, + "loss": 0.0939, + "step": 3846 + }, + { + "epoch": 0.18227908078654348, + "grad_norm": 0.609375, + "learning_rate": 0.00018411290775967708, + "loss": 1.3313, + "step": 3847 + }, + { + "epoch": 0.18232646292347784, + "grad_norm": 0.69921875, + "learning_rate": 0.00018410485239956286, + "loss": 0.9501, + "step": 3848 + }, + { + "epoch": 0.18237384506041224, + "grad_norm": 0.515625, + "learning_rate": 0.0001840967951740926, + "loss": 0.5894, + "step": 3849 + }, + { + "epoch": 0.1824212271973466, + "grad_norm": 0.59765625, + "learning_rate": 0.00018408873608344504, + "loss": 0.3189, + "step": 3850 + }, + { + "epoch": 0.18246860933428097, + "grad_norm": 0.3828125, + "learning_rate": 0.00018408067512779884, + "loss": 0.0554, + "step": 3851 + }, + { + "epoch": 0.18251599147121536, + "grad_norm": 0.68359375, + "learning_rate": 0.00018407261230733287, + "loss": 0.9996, + "step": 3852 + }, + { + "epoch": 0.18256337360814973, + "grad_norm": 0.392578125, + "learning_rate": 0.00018406454762222584, + "loss": 0.1408, + "step": 3853 + }, + { + "epoch": 0.1826107557450841, + "grad_norm": 0.6875, + "learning_rate": 0.00018405648107265674, + "loss": 0.6393, + "step": 3854 + }, + { + "epoch": 0.18265813788201848, + "grad_norm": 0.8359375, + "learning_rate": 0.00018404841265880442, + "loss": 1.1064, + "step": 3855 + }, + { + "epoch": 0.18270552001895285, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001840403423808478, + "loss": 0.1576, + "step": 3856 + }, + { + "epoch": 0.18275290215588724, + "grad_norm": 0.62109375, + "learning_rate": 0.00018403227023896593, + "loss": 1.107, + "step": 3857 + }, + { + "epoch": 0.1828002842928216, + "grad_norm": 0.455078125, + "learning_rate": 0.0001840241962333378, + "loss": 0.6858, + "step": 3858 + }, + { + "epoch": 0.18284766642975597, + "grad_norm": 0.765625, + "learning_rate": 0.0001840161203641425, + "loss": 0.9427, + "step": 3859 + }, + { + "epoch": 0.18289504856669037, + "grad_norm": 0.484375, + "learning_rate": 0.00018400804263155913, + "loss": 0.0761, + "step": 3860 + }, + { + "epoch": 0.18294243070362473, + "grad_norm": 0.6484375, + "learning_rate": 0.00018399996303576687, + "loss": 0.9487, + "step": 3861 + }, + { + "epoch": 0.1829898128405591, + "grad_norm": 0.69921875, + "learning_rate": 0.0001839918815769449, + "loss": 1.2183, + "step": 3862 + }, + { + "epoch": 0.1830371949774935, + "grad_norm": 0.63671875, + "learning_rate": 0.00018398379825527246, + "loss": 0.6553, + "step": 3863 + }, + { + "epoch": 0.18308457711442785, + "grad_norm": 0.8046875, + "learning_rate": 0.00018397571307092881, + "loss": 1.3488, + "step": 3864 + }, + { + "epoch": 0.18313195925136225, + "grad_norm": 0.546875, + "learning_rate": 0.00018396762602409332, + "loss": 1.0609, + "step": 3865 + }, + { + "epoch": 0.1831793413882966, + "grad_norm": 1.6953125, + "learning_rate": 0.00018395953711494533, + "loss": 0.2589, + "step": 3866 + }, + { + "epoch": 0.18322672352523098, + "grad_norm": 0.63671875, + "learning_rate": 0.0001839514463436642, + "loss": 1.275, + "step": 3867 + }, + { + "epoch": 0.18327410566216537, + "grad_norm": 0.97265625, + "learning_rate": 0.00018394335371042943, + "loss": 1.2991, + "step": 3868 + }, + { + "epoch": 0.18332148779909974, + "grad_norm": 0.66796875, + "learning_rate": 0.00018393525921542048, + "loss": 1.434, + "step": 3869 + }, + { + "epoch": 0.18336886993603413, + "grad_norm": 0.609375, + "learning_rate": 0.00018392716285881692, + "loss": 1.0181, + "step": 3870 + }, + { + "epoch": 0.1834162520729685, + "grad_norm": 0.5390625, + "learning_rate": 0.00018391906464079822, + "loss": 0.7575, + "step": 3871 + }, + { + "epoch": 0.18346363420990286, + "grad_norm": 0.68359375, + "learning_rate": 0.0001839109645615441, + "loss": 1.0288, + "step": 3872 + }, + { + "epoch": 0.18351101634683725, + "grad_norm": 1.0546875, + "learning_rate": 0.00018390286262123416, + "loss": 0.3225, + "step": 3873 + }, + { + "epoch": 0.18355839848377162, + "grad_norm": 0.60546875, + "learning_rate": 0.0001838947588200481, + "loss": 1.5614, + "step": 3874 + }, + { + "epoch": 0.18360578062070598, + "grad_norm": 0.80859375, + "learning_rate": 0.00018388665315816562, + "loss": 1.0404, + "step": 3875 + }, + { + "epoch": 0.18365316275764038, + "grad_norm": 0.6640625, + "learning_rate": 0.00018387854563576655, + "loss": 0.7009, + "step": 3876 + }, + { + "epoch": 0.18370054489457474, + "grad_norm": 0.55859375, + "learning_rate": 0.00018387043625303068, + "loss": 0.0823, + "step": 3877 + }, + { + "epoch": 0.18374792703150913, + "grad_norm": 0.5703125, + "learning_rate": 0.00018386232501013786, + "loss": 0.7296, + "step": 3878 + }, + { + "epoch": 0.1837953091684435, + "grad_norm": 0.65625, + "learning_rate": 0.000183854211907268, + "loss": 0.1192, + "step": 3879 + }, + { + "epoch": 0.18384269130537786, + "grad_norm": 0.453125, + "learning_rate": 0.00018384609694460102, + "loss": 0.1747, + "step": 3880 + }, + { + "epoch": 0.18389007344231226, + "grad_norm": 0.66796875, + "learning_rate": 0.00018383798012231695, + "loss": 1.5189, + "step": 3881 + }, + { + "epoch": 0.18393745557924662, + "grad_norm": 0.50390625, + "learning_rate": 0.00018382986144059577, + "loss": 0.0617, + "step": 3882 + }, + { + "epoch": 0.183984837716181, + "grad_norm": 0.87109375, + "learning_rate": 0.00018382174089961754, + "loss": 1.3788, + "step": 3883 + }, + { + "epoch": 0.18403221985311538, + "grad_norm": 0.94921875, + "learning_rate": 0.0001838136184995624, + "loss": 1.0907, + "step": 3884 + }, + { + "epoch": 0.18407960199004975, + "grad_norm": 0.8828125, + "learning_rate": 0.00018380549424061045, + "loss": 0.2911, + "step": 3885 + }, + { + "epoch": 0.18412698412698414, + "grad_norm": 0.51171875, + "learning_rate": 0.00018379736812294194, + "loss": 1.1701, + "step": 3886 + }, + { + "epoch": 0.1841743662639185, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018378924014673703, + "loss": 0.0224, + "step": 3887 + }, + { + "epoch": 0.18422174840085287, + "grad_norm": 0.01446533203125, + "learning_rate": 0.00018378111031217605, + "loss": 0.0009, + "step": 3888 + }, + { + "epoch": 0.18426913053778726, + "grad_norm": 0.5390625, + "learning_rate": 0.00018377297861943927, + "loss": 0.7568, + "step": 3889 + }, + { + "epoch": 0.18431651267472163, + "grad_norm": 0.470703125, + "learning_rate": 0.00018376484506870707, + "loss": 0.6754, + "step": 3890 + }, + { + "epoch": 0.184363894811656, + "grad_norm": 0.72265625, + "learning_rate": 0.0001837567096601598, + "loss": 1.0827, + "step": 3891 + }, + { + "epoch": 0.18441127694859039, + "grad_norm": 0.703125, + "learning_rate": 0.00018374857239397794, + "loss": 1.0899, + "step": 3892 + }, + { + "epoch": 0.18445865908552475, + "grad_norm": 0.859375, + "learning_rate": 0.00018374043327034194, + "loss": 1.1447, + "step": 3893 + }, + { + "epoch": 0.18450604122245914, + "grad_norm": 0.79296875, + "learning_rate": 0.0001837322922894323, + "loss": 0.729, + "step": 3894 + }, + { + "epoch": 0.1845534233593935, + "grad_norm": 1.140625, + "learning_rate": 0.00018372414945142963, + "loss": 0.6065, + "step": 3895 + }, + { + "epoch": 0.18460080549632787, + "grad_norm": 0.3671875, + "learning_rate": 0.00018371600475651455, + "loss": 0.0833, + "step": 3896 + }, + { + "epoch": 0.18464818763326227, + "grad_norm": 0.51171875, + "learning_rate": 0.0001837078582048676, + "loss": 1.0352, + "step": 3897 + }, + { + "epoch": 0.18469556977019663, + "grad_norm": 0.59765625, + "learning_rate": 0.00018369970979666952, + "loss": 0.5847, + "step": 3898 + }, + { + "epoch": 0.184742951907131, + "grad_norm": 0.6640625, + "learning_rate": 0.00018369155953210103, + "loss": 0.871, + "step": 3899 + }, + { + "epoch": 0.1847903340440654, + "grad_norm": 0.83203125, + "learning_rate": 0.00018368340741134294, + "loss": 1.4962, + "step": 3900 + }, + { + "epoch": 0.18483771618099976, + "grad_norm": 0.52734375, + "learning_rate": 0.00018367525343457596, + "loss": 0.5314, + "step": 3901 + }, + { + "epoch": 0.18488509831793415, + "grad_norm": 0.69140625, + "learning_rate": 0.000183667097601981, + "loss": 1.46, + "step": 3902 + }, + { + "epoch": 0.18493248045486851, + "grad_norm": 0.294921875, + "learning_rate": 0.00018365893991373892, + "loss": 0.071, + "step": 3903 + }, + { + "epoch": 0.18497986259180288, + "grad_norm": 0.62890625, + "learning_rate": 0.00018365078037003069, + "loss": 0.9815, + "step": 3904 + }, + { + "epoch": 0.18502724472873727, + "grad_norm": 0.7890625, + "learning_rate": 0.00018364261897103724, + "loss": 1.2039, + "step": 3905 + }, + { + "epoch": 0.18507462686567164, + "grad_norm": 0.8828125, + "learning_rate": 0.0001836344557169396, + "loss": 1.0092, + "step": 3906 + }, + { + "epoch": 0.18512200900260603, + "grad_norm": 0.58984375, + "learning_rate": 0.00018362629060791877, + "loss": 1.2131, + "step": 3907 + }, + { + "epoch": 0.1851693911395404, + "grad_norm": 0.01556396484375, + "learning_rate": 0.00018361812364415595, + "loss": 0.001, + "step": 3908 + }, + { + "epoch": 0.18521677327647476, + "grad_norm": 0.404296875, + "learning_rate": 0.0001836099548258322, + "loss": 0.0326, + "step": 3909 + }, + { + "epoch": 0.18526415541340915, + "grad_norm": 0.8359375, + "learning_rate": 0.00018360178415312867, + "loss": 0.6078, + "step": 3910 + }, + { + "epoch": 0.18531153755034352, + "grad_norm": 0.62109375, + "learning_rate": 0.00018359361162622662, + "loss": 1.2607, + "step": 3911 + }, + { + "epoch": 0.18535891968727788, + "grad_norm": 0.671875, + "learning_rate": 0.00018358543724530737, + "loss": 1.049, + "step": 3912 + }, + { + "epoch": 0.18540630182421228, + "grad_norm": 0.32421875, + "learning_rate": 0.00018357726101055207, + "loss": 0.0309, + "step": 3913 + }, + { + "epoch": 0.18545368396114664, + "grad_norm": 0.71484375, + "learning_rate": 0.0001835690829221422, + "loss": 1.0746, + "step": 3914 + }, + { + "epoch": 0.18550106609808104, + "grad_norm": 0.310546875, + "learning_rate": 0.0001835609029802591, + "loss": 0.0499, + "step": 3915 + }, + { + "epoch": 0.1855484482350154, + "grad_norm": 0.70703125, + "learning_rate": 0.00018355272118508414, + "loss": 1.2679, + "step": 3916 + }, + { + "epoch": 0.18559583037194977, + "grad_norm": 0.56640625, + "learning_rate": 0.00018354453753679882, + "loss": 1.0898, + "step": 3917 + }, + { + "epoch": 0.18564321250888416, + "grad_norm": 0.51953125, + "learning_rate": 0.00018353635203558467, + "loss": 0.6901, + "step": 3918 + }, + { + "epoch": 0.18569059464581852, + "grad_norm": 0.828125, + "learning_rate": 0.00018352816468162318, + "loss": 1.3228, + "step": 3919 + }, + { + "epoch": 0.1857379767827529, + "grad_norm": 0.58984375, + "learning_rate": 0.00018351997547509598, + "loss": 1.2299, + "step": 3920 + }, + { + "epoch": 0.18578535891968728, + "grad_norm": 0.62109375, + "learning_rate": 0.00018351178441618467, + "loss": 1.2638, + "step": 3921 + }, + { + "epoch": 0.18583274105662165, + "grad_norm": 0.6328125, + "learning_rate": 0.00018350359150507095, + "loss": 0.7652, + "step": 3922 + }, + { + "epoch": 0.18588012319355604, + "grad_norm": 0.41796875, + "learning_rate": 0.00018349539674193652, + "loss": 0.0936, + "step": 3923 + }, + { + "epoch": 0.1859275053304904, + "grad_norm": 0.515625, + "learning_rate": 0.00018348720012696312, + "loss": 1.1508, + "step": 3924 + }, + { + "epoch": 0.18597488746742477, + "grad_norm": 0.96875, + "learning_rate": 0.00018347900166033253, + "loss": 0.3565, + "step": 3925 + }, + { + "epoch": 0.18602226960435916, + "grad_norm": 0.498046875, + "learning_rate": 0.00018347080134222663, + "loss": 0.5486, + "step": 3926 + }, + { + "epoch": 0.18606965174129353, + "grad_norm": 0.9921875, + "learning_rate": 0.00018346259917282725, + "loss": 0.9104, + "step": 3927 + }, + { + "epoch": 0.1861170338782279, + "grad_norm": 0.609375, + "learning_rate": 0.00018345439515231633, + "loss": 1.1383, + "step": 3928 + }, + { + "epoch": 0.1861644160151623, + "grad_norm": 0.8515625, + "learning_rate": 0.00018344618928087584, + "loss": 0.5591, + "step": 3929 + }, + { + "epoch": 0.18621179815209665, + "grad_norm": 0.3984375, + "learning_rate": 0.0001834379815586877, + "loss": 0.1086, + "step": 3930 + }, + { + "epoch": 0.18625918028903105, + "grad_norm": 0.23046875, + "learning_rate": 0.00018342977198593404, + "loss": 0.0302, + "step": 3931 + }, + { + "epoch": 0.1863065624259654, + "grad_norm": 0.50390625, + "learning_rate": 0.00018342156056279686, + "loss": 0.8353, + "step": 3932 + }, + { + "epoch": 0.18635394456289978, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001834133472894584, + "loss": 0.007, + "step": 3933 + }, + { + "epoch": 0.18640132669983417, + "grad_norm": 0.5546875, + "learning_rate": 0.00018340513216610066, + "loss": 1.2658, + "step": 3934 + }, + { + "epoch": 0.18644870883676853, + "grad_norm": 0.53125, + "learning_rate": 0.00018339691519290597, + "loss": 0.1188, + "step": 3935 + }, + { + "epoch": 0.18649609097370293, + "grad_norm": 0.84765625, + "learning_rate": 0.00018338869637005652, + "loss": 0.4885, + "step": 3936 + }, + { + "epoch": 0.1865434731106373, + "grad_norm": 0.6953125, + "learning_rate": 0.00018338047569773464, + "loss": 1.4999, + "step": 3937 + }, + { + "epoch": 0.18659085524757166, + "grad_norm": 0.5625, + "learning_rate": 0.00018337225317612262, + "loss": 0.925, + "step": 3938 + }, + { + "epoch": 0.18663823738450605, + "grad_norm": 0.51953125, + "learning_rate": 0.00018336402880540277, + "loss": 0.9007, + "step": 3939 + }, + { + "epoch": 0.18668561952144042, + "grad_norm": 0.83203125, + "learning_rate": 0.00018335580258575763, + "loss": 1.3914, + "step": 3940 + }, + { + "epoch": 0.18673300165837478, + "grad_norm": 0.61328125, + "learning_rate": 0.00018334757451736951, + "loss": 1.3489, + "step": 3941 + }, + { + "epoch": 0.18678038379530917, + "grad_norm": 0.53125, + "learning_rate": 0.000183339344600421, + "loss": 0.9137, + "step": 3942 + }, + { + "epoch": 0.18682776593224354, + "grad_norm": 0.4609375, + "learning_rate": 0.00018333111283509458, + "loss": 0.9077, + "step": 3943 + }, + { + "epoch": 0.18687514806917793, + "grad_norm": 0.451171875, + "learning_rate": 0.00018332287922157288, + "loss": 0.8801, + "step": 3944 + }, + { + "epoch": 0.1869225302061123, + "grad_norm": 0.431640625, + "learning_rate": 0.00018331464376003843, + "loss": 0.5744, + "step": 3945 + }, + { + "epoch": 0.18696991234304666, + "grad_norm": 0.546875, + "learning_rate": 0.00018330640645067392, + "loss": 0.6218, + "step": 3946 + }, + { + "epoch": 0.18701729447998106, + "grad_norm": 1.0390625, + "learning_rate": 0.00018329816729366206, + "loss": 0.5785, + "step": 3947 + }, + { + "epoch": 0.18706467661691542, + "grad_norm": 0.58984375, + "learning_rate": 0.00018328992628918557, + "loss": 0.4056, + "step": 3948 + }, + { + "epoch": 0.18711205875384979, + "grad_norm": 0.490234375, + "learning_rate": 0.0001832816834374272, + "loss": 0.8687, + "step": 3949 + }, + { + "epoch": 0.18715944089078418, + "grad_norm": 0.310546875, + "learning_rate": 0.00018327343873856985, + "loss": 0.1306, + "step": 3950 + }, + { + "epoch": 0.18720682302771854, + "grad_norm": 0.55859375, + "learning_rate": 0.00018326519219279632, + "loss": 1.1476, + "step": 3951 + }, + { + "epoch": 0.18725420516465294, + "grad_norm": 0.4453125, + "learning_rate": 0.00018325694380028952, + "loss": 0.6137, + "step": 3952 + }, + { + "epoch": 0.1873015873015873, + "grad_norm": 0.546875, + "learning_rate": 0.00018324869356123238, + "loss": 0.6959, + "step": 3953 + }, + { + "epoch": 0.18734896943852167, + "grad_norm": 0.66015625, + "learning_rate": 0.0001832404414758079, + "loss": 0.6488, + "step": 3954 + }, + { + "epoch": 0.18739635157545606, + "grad_norm": 0.7421875, + "learning_rate": 0.00018323218754419905, + "loss": 0.7129, + "step": 3955 + }, + { + "epoch": 0.18744373371239043, + "grad_norm": 0.67578125, + "learning_rate": 0.00018322393176658898, + "loss": 0.9986, + "step": 3956 + }, + { + "epoch": 0.1874911158493248, + "grad_norm": 0.490234375, + "learning_rate": 0.00018321567414316073, + "loss": 0.7868, + "step": 3957 + }, + { + "epoch": 0.18753849798625918, + "grad_norm": 0.6953125, + "learning_rate": 0.00018320741467409748, + "loss": 0.3172, + "step": 3958 + }, + { + "epoch": 0.18758588012319355, + "grad_norm": 0.76171875, + "learning_rate": 0.0001831991533595824, + "loss": 1.3278, + "step": 3959 + }, + { + "epoch": 0.18763326226012794, + "grad_norm": 0.546875, + "learning_rate": 0.00018319089019979868, + "loss": 0.6262, + "step": 3960 + }, + { + "epoch": 0.1876806443970623, + "grad_norm": 0.578125, + "learning_rate": 0.00018318262519492965, + "loss": 1.1593, + "step": 3961 + }, + { + "epoch": 0.18772802653399667, + "grad_norm": 0.287109375, + "learning_rate": 0.00018317435834515862, + "loss": 0.0484, + "step": 3962 + }, + { + "epoch": 0.18777540867093107, + "grad_norm": 0.57421875, + "learning_rate": 0.00018316608965066887, + "loss": 0.9068, + "step": 3963 + }, + { + "epoch": 0.18782279080786543, + "grad_norm": 0.30859375, + "learning_rate": 0.00018315781911164386, + "loss": 0.1292, + "step": 3964 + }, + { + "epoch": 0.18787017294479982, + "grad_norm": 0.69921875, + "learning_rate": 0.00018314954672826703, + "loss": 0.2688, + "step": 3965 + }, + { + "epoch": 0.1879175550817342, + "grad_norm": 0.55859375, + "learning_rate": 0.00018314127250072178, + "loss": 1.1685, + "step": 3966 + }, + { + "epoch": 0.18796493721866855, + "grad_norm": 0.6484375, + "learning_rate": 0.00018313299642919167, + "loss": 1.2903, + "step": 3967 + }, + { + "epoch": 0.18801231935560295, + "grad_norm": 0.6875, + "learning_rate": 0.00018312471851386026, + "loss": 0.9526, + "step": 3968 + }, + { + "epoch": 0.1880597014925373, + "grad_norm": 0.66796875, + "learning_rate": 0.00018311643875491113, + "loss": 1.6321, + "step": 3969 + }, + { + "epoch": 0.18810708362947168, + "grad_norm": 0.62890625, + "learning_rate": 0.00018310815715252788, + "loss": 1.1504, + "step": 3970 + }, + { + "epoch": 0.18815446576640607, + "grad_norm": 0.6171875, + "learning_rate": 0.00018309987370689428, + "loss": 0.6617, + "step": 3971 + }, + { + "epoch": 0.18820184790334044, + "grad_norm": 1.4296875, + "learning_rate": 0.00018309158841819397, + "loss": 0.7718, + "step": 3972 + }, + { + "epoch": 0.18824923004027483, + "grad_norm": 0.59765625, + "learning_rate": 0.00018308330128661074, + "loss": 0.8172, + "step": 3973 + }, + { + "epoch": 0.1882966121772092, + "grad_norm": 0.76171875, + "learning_rate": 0.0001830750123123284, + "loss": 0.7052, + "step": 3974 + }, + { + "epoch": 0.18834399431414356, + "grad_norm": 0.80078125, + "learning_rate": 0.00018306672149553076, + "loss": 1.2497, + "step": 3975 + }, + { + "epoch": 0.18839137645107795, + "grad_norm": 0.58203125, + "learning_rate": 0.0001830584288364017, + "loss": 0.7786, + "step": 3976 + }, + { + "epoch": 0.18843875858801232, + "grad_norm": 0.39453125, + "learning_rate": 0.0001830501343351252, + "loss": 0.1944, + "step": 3977 + }, + { + "epoch": 0.18848614072494668, + "grad_norm": 0.30859375, + "learning_rate": 0.00018304183799188514, + "loss": 0.043, + "step": 3978 + }, + { + "epoch": 0.18853352286188108, + "grad_norm": 0.0064697265625, + "learning_rate": 0.00018303353980686558, + "loss": 0.0004, + "step": 3979 + }, + { + "epoch": 0.18858090499881544, + "grad_norm": 0.490234375, + "learning_rate": 0.00018302523978025058, + "loss": 0.1154, + "step": 3980 + }, + { + "epoch": 0.18862828713574983, + "grad_norm": 0.138671875, + "learning_rate": 0.00018301693791222413, + "loss": 0.0054, + "step": 3981 + }, + { + "epoch": 0.1886756692726842, + "grad_norm": 0.609375, + "learning_rate": 0.0001830086342029705, + "loss": 0.6819, + "step": 3982 + }, + { + "epoch": 0.18872305140961856, + "grad_norm": 0.671875, + "learning_rate": 0.0001830003286526737, + "loss": 0.9113, + "step": 3983 + }, + { + "epoch": 0.18877043354655296, + "grad_norm": 0.625, + "learning_rate": 0.00018299202126151808, + "loss": 1.3575, + "step": 3984 + }, + { + "epoch": 0.18881781568348732, + "grad_norm": 0.419921875, + "learning_rate": 0.00018298371202968782, + "loss": 0.74, + "step": 3985 + }, + { + "epoch": 0.1888651978204217, + "grad_norm": 0.71484375, + "learning_rate": 0.0001829754009573672, + "loss": 1.3455, + "step": 3986 + }, + { + "epoch": 0.18891257995735608, + "grad_norm": 1.0625, + "learning_rate": 0.00018296708804474058, + "loss": 0.1818, + "step": 3987 + }, + { + "epoch": 0.18895996209429045, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001829587732919923, + "loss": 0.0035, + "step": 3988 + }, + { + "epoch": 0.18900734423122484, + "grad_norm": 0.8203125, + "learning_rate": 0.00018295045669930682, + "loss": 0.9317, + "step": 3989 + }, + { + "epoch": 0.1890547263681592, + "grad_norm": 0.75390625, + "learning_rate": 0.00018294213826686853, + "loss": 1.188, + "step": 3990 + }, + { + "epoch": 0.18910210850509357, + "grad_norm": 0.5703125, + "learning_rate": 0.000182933817994862, + "loss": 0.7484, + "step": 3991 + }, + { + "epoch": 0.18914949064202796, + "grad_norm": 0.56640625, + "learning_rate": 0.00018292549588347167, + "loss": 1.4723, + "step": 3992 + }, + { + "epoch": 0.18919687277896233, + "grad_norm": 0.6171875, + "learning_rate": 0.00018291717193288224, + "loss": 0.9236, + "step": 3993 + }, + { + "epoch": 0.18924425491589672, + "grad_norm": 0.640625, + "learning_rate": 0.00018290884614327818, + "loss": 0.9172, + "step": 3994 + }, + { + "epoch": 0.18929163705283109, + "grad_norm": 0.9609375, + "learning_rate": 0.0001829005185148443, + "loss": 0.6576, + "step": 3995 + }, + { + "epoch": 0.18933901918976545, + "grad_norm": 0.75390625, + "learning_rate": 0.00018289218904776516, + "loss": 0.8123, + "step": 3996 + }, + { + "epoch": 0.18938640132669984, + "grad_norm": 0.69921875, + "learning_rate": 0.00018288385774222558, + "loss": 1.4964, + "step": 3997 + }, + { + "epoch": 0.1894337834636342, + "grad_norm": 0.95703125, + "learning_rate": 0.00018287552459841033, + "loss": 0.0254, + "step": 3998 + }, + { + "epoch": 0.18948116560056857, + "grad_norm": 1.3984375, + "learning_rate": 0.00018286718961650422, + "loss": 1.1917, + "step": 3999 + }, + { + "epoch": 0.18952854773750297, + "grad_norm": 0.6328125, + "learning_rate": 0.00018285885279669209, + "loss": 0.7383, + "step": 4000 + }, + { + "epoch": 0.18957592987443733, + "grad_norm": 0.6328125, + "learning_rate": 0.00018285051413915885, + "loss": 1.2202, + "step": 4001 + }, + { + "epoch": 0.18962331201137173, + "grad_norm": 1.015625, + "learning_rate": 0.0001828421736440895, + "loss": 0.1662, + "step": 4002 + }, + { + "epoch": 0.1896706941483061, + "grad_norm": 0.671875, + "learning_rate": 0.00018283383131166895, + "loss": 0.8653, + "step": 4003 + }, + { + "epoch": 0.18971807628524046, + "grad_norm": 0.67578125, + "learning_rate": 0.00018282548714208227, + "loss": 1.4906, + "step": 4004 + }, + { + "epoch": 0.18976545842217485, + "grad_norm": 0.58984375, + "learning_rate": 0.00018281714113551448, + "loss": 0.043, + "step": 4005 + }, + { + "epoch": 0.1898128405591092, + "grad_norm": 0.51171875, + "learning_rate": 0.00018280879329215076, + "loss": 0.642, + "step": 4006 + }, + { + "epoch": 0.18986022269604358, + "grad_norm": 0.61328125, + "learning_rate": 0.00018280044361217615, + "loss": 0.9417, + "step": 4007 + }, + { + "epoch": 0.18990760483297797, + "grad_norm": 0.63671875, + "learning_rate": 0.00018279209209577592, + "loss": 1.0475, + "step": 4008 + }, + { + "epoch": 0.18995498696991234, + "grad_norm": 0.70703125, + "learning_rate": 0.0001827837387431353, + "loss": 1.0232, + "step": 4009 + }, + { + "epoch": 0.19000236910684673, + "grad_norm": 0.06005859375, + "learning_rate": 0.0001827753835544395, + "loss": 0.005, + "step": 4010 + }, + { + "epoch": 0.1900497512437811, + "grad_norm": 0.5546875, + "learning_rate": 0.00018276702652987389, + "loss": 1.1849, + "step": 4011 + }, + { + "epoch": 0.19009713338071546, + "grad_norm": 0.314453125, + "learning_rate": 0.00018275866766962376, + "loss": 0.1887, + "step": 4012 + }, + { + "epoch": 0.19014451551764985, + "grad_norm": 0.53515625, + "learning_rate": 0.0001827503069738745, + "loss": 1.0838, + "step": 4013 + }, + { + "epoch": 0.19019189765458422, + "grad_norm": 0.58984375, + "learning_rate": 0.00018274194444281165, + "loss": 1.0973, + "step": 4014 + }, + { + "epoch": 0.19023927979151858, + "grad_norm": 0.6328125, + "learning_rate": 0.00018273358007662055, + "loss": 0.9148, + "step": 4015 + }, + { + "epoch": 0.19028666192845298, + "grad_norm": 0.6875, + "learning_rate": 0.0001827252138754868, + "loss": 0.805, + "step": 4016 + }, + { + "epoch": 0.19033404406538734, + "grad_norm": 0.671875, + "learning_rate": 0.0001827168458395959, + "loss": 0.6369, + "step": 4017 + }, + { + "epoch": 0.19038142620232174, + "grad_norm": 0.76953125, + "learning_rate": 0.0001827084759691335, + "loss": 1.0181, + "step": 4018 + }, + { + "epoch": 0.1904288083392561, + "grad_norm": 0.55859375, + "learning_rate": 0.00018270010426428516, + "loss": 0.5452, + "step": 4019 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.48046875, + "learning_rate": 0.00018269173072523663, + "loss": 1.1883, + "step": 4020 + }, + { + "epoch": 0.19052357261312486, + "grad_norm": 0.828125, + "learning_rate": 0.00018268335535217355, + "loss": 1.6229, + "step": 4021 + }, + { + "epoch": 0.19057095475005922, + "grad_norm": 0.60546875, + "learning_rate": 0.00018267497814528175, + "loss": 0.7858, + "step": 4022 + }, + { + "epoch": 0.19061833688699362, + "grad_norm": 0.5625, + "learning_rate": 0.00018266659910474696, + "loss": 1.03, + "step": 4023 + }, + { + "epoch": 0.19066571902392798, + "grad_norm": 0.65234375, + "learning_rate": 0.00018265821823075512, + "loss": 0.8899, + "step": 4024 + }, + { + "epoch": 0.19071310116086235, + "grad_norm": 0.6171875, + "learning_rate": 0.00018264983552349198, + "loss": 0.2098, + "step": 4025 + }, + { + "epoch": 0.19076048329779674, + "grad_norm": 0.46484375, + "learning_rate": 0.00018264145098314356, + "loss": 0.0286, + "step": 4026 + }, + { + "epoch": 0.1908078654347311, + "grad_norm": 0.578125, + "learning_rate": 0.00018263306460989575, + "loss": 0.5669, + "step": 4027 + }, + { + "epoch": 0.19085524757166547, + "grad_norm": 0.08251953125, + "learning_rate": 0.00018262467640393462, + "loss": 0.008, + "step": 4028 + }, + { + "epoch": 0.19090262970859986, + "grad_norm": 0.470703125, + "learning_rate": 0.00018261628636544614, + "loss": 0.5928, + "step": 4029 + }, + { + "epoch": 0.19095001184553423, + "grad_norm": 0.5625, + "learning_rate": 0.00018260789449461645, + "loss": 0.8542, + "step": 4030 + }, + { + "epoch": 0.19099739398246862, + "grad_norm": 1.0859375, + "learning_rate": 0.00018259950079163164, + "loss": 0.9255, + "step": 4031 + }, + { + "epoch": 0.191044776119403, + "grad_norm": 0.71484375, + "learning_rate": 0.0001825911052566779, + "loss": 1.2408, + "step": 4032 + }, + { + "epoch": 0.19109215825633735, + "grad_norm": 0.6171875, + "learning_rate": 0.00018258270788994142, + "loss": 1.4613, + "step": 4033 + }, + { + "epoch": 0.19113954039327175, + "grad_norm": 0.796875, + "learning_rate": 0.00018257430869160843, + "loss": 1.1682, + "step": 4034 + }, + { + "epoch": 0.1911869225302061, + "grad_norm": 0.95703125, + "learning_rate": 0.00018256590766186522, + "loss": 1.0742, + "step": 4035 + }, + { + "epoch": 0.19123430466714048, + "grad_norm": 0.56640625, + "learning_rate": 0.00018255750480089812, + "loss": 0.4825, + "step": 4036 + }, + { + "epoch": 0.19128168680407487, + "grad_norm": 0.796875, + "learning_rate": 0.00018254910010889354, + "loss": 0.9041, + "step": 4037 + }, + { + "epoch": 0.19132906894100923, + "grad_norm": 0.73828125, + "learning_rate": 0.0001825406935860378, + "loss": 1.0246, + "step": 4038 + }, + { + "epoch": 0.19137645107794363, + "grad_norm": 0.6640625, + "learning_rate": 0.0001825322852325174, + "loss": 1.1757, + "step": 4039 + }, + { + "epoch": 0.191423833214878, + "grad_norm": 0.53515625, + "learning_rate": 0.00018252387504851883, + "loss": 1.2713, + "step": 4040 + }, + { + "epoch": 0.19147121535181236, + "grad_norm": 0.63671875, + "learning_rate": 0.00018251546303422865, + "loss": 1.3427, + "step": 4041 + }, + { + "epoch": 0.19151859748874675, + "grad_norm": 0.84375, + "learning_rate": 0.00018250704918983335, + "loss": 1.2905, + "step": 4042 + }, + { + "epoch": 0.19156597962568112, + "grad_norm": 0.578125, + "learning_rate": 0.00018249863351551957, + "loss": 0.9729, + "step": 4043 + }, + { + "epoch": 0.19161336176261548, + "grad_norm": 0.419921875, + "learning_rate": 0.000182490216011474, + "loss": 0.6328, + "step": 4044 + }, + { + "epoch": 0.19166074389954987, + "grad_norm": 0.408203125, + "learning_rate": 0.0001824817966778833, + "loss": 0.0854, + "step": 4045 + }, + { + "epoch": 0.19170812603648424, + "grad_norm": 0.048095703125, + "learning_rate": 0.0001824733755149342, + "loss": 0.0029, + "step": 4046 + }, + { + "epoch": 0.19175550817341863, + "grad_norm": 0.01312255859375, + "learning_rate": 0.0001824649525228135, + "loss": 0.0006, + "step": 4047 + }, + { + "epoch": 0.191802890310353, + "grad_norm": 0.56640625, + "learning_rate": 0.00018245652770170794, + "loss": 0.3941, + "step": 4048 + }, + { + "epoch": 0.19185027244728736, + "grad_norm": 0.609375, + "learning_rate": 0.00018244810105180444, + "loss": 1.1245, + "step": 4049 + }, + { + "epoch": 0.19189765458422176, + "grad_norm": 0.65234375, + "learning_rate": 0.0001824396725732899, + "loss": 0.1825, + "step": 4050 + }, + { + "epoch": 0.19194503672115612, + "grad_norm": 0.65625, + "learning_rate": 0.00018243124226635123, + "loss": 0.8779, + "step": 4051 + }, + { + "epoch": 0.1919924188580905, + "grad_norm": 0.55859375, + "learning_rate": 0.0001824228101311754, + "loss": 0.842, + "step": 4052 + }, + { + "epoch": 0.19203980099502488, + "grad_norm": 0.29296875, + "learning_rate": 0.00018241437616794946, + "loss": 0.1626, + "step": 4053 + }, + { + "epoch": 0.19208718313195924, + "grad_norm": 0.484375, + "learning_rate": 0.0001824059403768604, + "loss": 0.7362, + "step": 4054 + }, + { + "epoch": 0.19213456526889364, + "grad_norm": 0.6171875, + "learning_rate": 0.00018239750275809538, + "loss": 1.2624, + "step": 4055 + }, + { + "epoch": 0.192181947405828, + "grad_norm": 0.515625, + "learning_rate": 0.0001823890633118415, + "loss": 0.8972, + "step": 4056 + }, + { + "epoch": 0.19222932954276237, + "grad_norm": 0.4453125, + "learning_rate": 0.00018238062203828598, + "loss": 0.5408, + "step": 4057 + }, + { + "epoch": 0.19227671167969676, + "grad_norm": 0.65234375, + "learning_rate": 0.00018237217893761598, + "loss": 1.1592, + "step": 4058 + }, + { + "epoch": 0.19232409381663113, + "grad_norm": 0.58984375, + "learning_rate": 0.00018236373401001878, + "loss": 1.0925, + "step": 4059 + }, + { + "epoch": 0.19237147595356552, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00018235528725568174, + "loss": 0.0004, + "step": 4060 + }, + { + "epoch": 0.19241885809049988, + "grad_norm": 0.5859375, + "learning_rate": 0.0001823468386747921, + "loss": 1.0052, + "step": 4061 + }, + { + "epoch": 0.19246624022743425, + "grad_norm": 0.609375, + "learning_rate": 0.00018233838826753733, + "loss": 0.3306, + "step": 4062 + }, + { + "epoch": 0.19251362236436864, + "grad_norm": 0.671875, + "learning_rate": 0.00018232993603410478, + "loss": 0.1146, + "step": 4063 + }, + { + "epoch": 0.192561004501303, + "grad_norm": 0.61328125, + "learning_rate": 0.00018232148197468194, + "loss": 1.229, + "step": 4064 + }, + { + "epoch": 0.19260838663823737, + "grad_norm": 1.0390625, + "learning_rate": 0.00018231302608945636, + "loss": 0.1671, + "step": 4065 + }, + { + "epoch": 0.19265576877517177, + "grad_norm": 0.1005859375, + "learning_rate": 0.0001823045683786155, + "loss": 0.0051, + "step": 4066 + }, + { + "epoch": 0.19270315091210613, + "grad_norm": 0.51171875, + "learning_rate": 0.00018229610884234698, + "loss": 0.7844, + "step": 4067 + }, + { + "epoch": 0.19275053304904052, + "grad_norm": 1.2109375, + "learning_rate": 0.00018228764748083847, + "loss": 0.9091, + "step": 4068 + }, + { + "epoch": 0.1927979151859749, + "grad_norm": 0.57421875, + "learning_rate": 0.00018227918429427752, + "loss": 0.7842, + "step": 4069 + }, + { + "epoch": 0.19284529732290925, + "grad_norm": 0.51171875, + "learning_rate": 0.00018227071928285192, + "loss": 1.0486, + "step": 4070 + }, + { + "epoch": 0.19289267945984365, + "grad_norm": 0.259765625, + "learning_rate": 0.00018226225244674944, + "loss": 0.0681, + "step": 4071 + }, + { + "epoch": 0.192940061596778, + "grad_norm": 0.12158203125, + "learning_rate": 0.00018225378378615778, + "loss": 0.0134, + "step": 4072 + }, + { + "epoch": 0.19298744373371238, + "grad_norm": 0.498046875, + "learning_rate": 0.00018224531330126483, + "loss": 0.9821, + "step": 4073 + }, + { + "epoch": 0.19303482587064677, + "grad_norm": 0.828125, + "learning_rate": 0.00018223684099225843, + "loss": 0.246, + "step": 4074 + }, + { + "epoch": 0.19308220800758114, + "grad_norm": 0.59765625, + "learning_rate": 0.00018222836685932646, + "loss": 1.0779, + "step": 4075 + }, + { + "epoch": 0.19312959014451553, + "grad_norm": 0.4453125, + "learning_rate": 0.00018221989090265694, + "loss": 0.4981, + "step": 4076 + }, + { + "epoch": 0.1931769722814499, + "grad_norm": 0.6484375, + "learning_rate": 0.0001822114131224378, + "loss": 0.8945, + "step": 4077 + }, + { + "epoch": 0.19322435441838426, + "grad_norm": 0.435546875, + "learning_rate": 0.0001822029335188571, + "loss": 0.3565, + "step": 4078 + }, + { + "epoch": 0.19327173655531865, + "grad_norm": 0.703125, + "learning_rate": 0.00018219445209210287, + "loss": 1.3734, + "step": 4079 + }, + { + "epoch": 0.19331911869225302, + "grad_norm": 0.63671875, + "learning_rate": 0.00018218596884236326, + "loss": 1.1543, + "step": 4080 + }, + { + "epoch": 0.1933665008291874, + "grad_norm": 0.0107421875, + "learning_rate": 0.00018217748376982636, + "loss": 0.0005, + "step": 4081 + }, + { + "epoch": 0.19341388296612178, + "grad_norm": 0.7109375, + "learning_rate": 0.00018216899687468048, + "loss": 0.8825, + "step": 4082 + }, + { + "epoch": 0.19346126510305614, + "grad_norm": 0.51953125, + "learning_rate": 0.0001821605081571137, + "loss": 1.0132, + "step": 4083 + }, + { + "epoch": 0.19350864723999053, + "grad_norm": 0.4921875, + "learning_rate": 0.0001821520176173144, + "loss": 1.0012, + "step": 4084 + }, + { + "epoch": 0.1935560293769249, + "grad_norm": 0.57421875, + "learning_rate": 0.00018214352525547085, + "loss": 1.0989, + "step": 4085 + }, + { + "epoch": 0.19360341151385926, + "grad_norm": 0.69140625, + "learning_rate": 0.0001821350310717714, + "loss": 0.8541, + "step": 4086 + }, + { + "epoch": 0.19365079365079366, + "grad_norm": 0.73828125, + "learning_rate": 0.00018212653506640442, + "loss": 0.8196, + "step": 4087 + }, + { + "epoch": 0.19369817578772802, + "grad_norm": 0.2265625, + "learning_rate": 0.0001821180372395584, + "loss": 0.0037, + "step": 4088 + }, + { + "epoch": 0.19374555792466241, + "grad_norm": 0.5859375, + "learning_rate": 0.00018210953759142178, + "loss": 0.2064, + "step": 4089 + }, + { + "epoch": 0.19379294006159678, + "grad_norm": 0.466796875, + "learning_rate": 0.00018210103612218307, + "loss": 0.0975, + "step": 4090 + }, + { + "epoch": 0.19384032219853115, + "grad_norm": 0.578125, + "learning_rate": 0.00018209253283203082, + "loss": 0.7448, + "step": 4091 + }, + { + "epoch": 0.19388770433546554, + "grad_norm": 0.51171875, + "learning_rate": 0.00018208402772115366, + "loss": 0.0596, + "step": 4092 + }, + { + "epoch": 0.1939350864723999, + "grad_norm": 0.59375, + "learning_rate": 0.00018207552078974018, + "loss": 0.0836, + "step": 4093 + }, + { + "epoch": 0.19398246860933427, + "grad_norm": 0.6328125, + "learning_rate": 0.00018206701203797907, + "loss": 0.9699, + "step": 4094 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 0.6015625, + "learning_rate": 0.00018205850146605905, + "loss": 0.9955, + "step": 4095 + }, + { + "epoch": 0.19407723288320303, + "grad_norm": 0.56640625, + "learning_rate": 0.00018204998907416888, + "loss": 1.0723, + "step": 4096 + }, + { + "epoch": 0.19412461502013742, + "grad_norm": 0.006317138671875, + "learning_rate": 0.00018204147486249733, + "loss": 0.0005, + "step": 4097 + }, + { + "epoch": 0.19417199715707179, + "grad_norm": 0.6015625, + "learning_rate": 0.0001820329588312333, + "loss": 1.2173, + "step": 4098 + }, + { + "epoch": 0.19421937929400615, + "grad_norm": 0.298828125, + "learning_rate": 0.00018202444098056554, + "loss": 0.1614, + "step": 4099 + }, + { + "epoch": 0.19426676143094054, + "grad_norm": 0.39453125, + "learning_rate": 0.00018201592131068312, + "loss": 0.4065, + "step": 4100 + }, + { + "epoch": 0.1943141435678749, + "grad_norm": 0.451171875, + "learning_rate": 0.00018200739982177487, + "loss": 0.0372, + "step": 4101 + }, + { + "epoch": 0.19436152570480927, + "grad_norm": 0.439453125, + "learning_rate": 0.00018199887651402987, + "loss": 0.4186, + "step": 4102 + }, + { + "epoch": 0.19440890784174367, + "grad_norm": 0.51953125, + "learning_rate": 0.00018199035138763712, + "loss": 0.9015, + "step": 4103 + }, + { + "epoch": 0.19445628997867803, + "grad_norm": 0.5234375, + "learning_rate": 0.00018198182444278572, + "loss": 1.0228, + "step": 4104 + }, + { + "epoch": 0.19450367211561242, + "grad_norm": 0.48046875, + "learning_rate": 0.0001819732956796648, + "loss": 1.0538, + "step": 4105 + }, + { + "epoch": 0.1945510542525468, + "grad_norm": 0.765625, + "learning_rate": 0.00018196476509846346, + "loss": 1.1581, + "step": 4106 + }, + { + "epoch": 0.19459843638948116, + "grad_norm": 0.57421875, + "learning_rate": 0.00018195623269937094, + "loss": 0.9356, + "step": 4107 + }, + { + "epoch": 0.19464581852641555, + "grad_norm": 0.63671875, + "learning_rate": 0.00018194769848257648, + "loss": 0.9523, + "step": 4108 + }, + { + "epoch": 0.1946932006633499, + "grad_norm": 0.0194091796875, + "learning_rate": 0.00018193916244826938, + "loss": 0.0006, + "step": 4109 + }, + { + "epoch": 0.1947405828002843, + "grad_norm": 0.546875, + "learning_rate": 0.0001819306245966389, + "loss": 1.0516, + "step": 4110 + }, + { + "epoch": 0.19478796493721867, + "grad_norm": 0.4765625, + "learning_rate": 0.00018192208492787444, + "loss": 0.8879, + "step": 4111 + }, + { + "epoch": 0.19483534707415304, + "grad_norm": 0.54296875, + "learning_rate": 0.0001819135434421654, + "loss": 0.6391, + "step": 4112 + }, + { + "epoch": 0.19488272921108743, + "grad_norm": 0.408203125, + "learning_rate": 0.0001819050001397012, + "loss": 0.0754, + "step": 4113 + }, + { + "epoch": 0.1949301113480218, + "grad_norm": 0.546875, + "learning_rate": 0.0001818964550206714, + "loss": 1.0995, + "step": 4114 + }, + { + "epoch": 0.19497749348495616, + "grad_norm": 0.640625, + "learning_rate": 0.0001818879080852654, + "loss": 0.9117, + "step": 4115 + }, + { + "epoch": 0.19502487562189055, + "grad_norm": 0.53515625, + "learning_rate": 0.00018187935933367281, + "loss": 0.6531, + "step": 4116 + }, + { + "epoch": 0.19507225775882492, + "grad_norm": 0.193359375, + "learning_rate": 0.00018187080876608328, + "loss": 0.0202, + "step": 4117 + }, + { + "epoch": 0.1951196398957593, + "grad_norm": 0.55859375, + "learning_rate": 0.00018186225638268643, + "loss": 0.0461, + "step": 4118 + }, + { + "epoch": 0.19516702203269368, + "grad_norm": 0.671875, + "learning_rate": 0.00018185370218367188, + "loss": 0.8954, + "step": 4119 + }, + { + "epoch": 0.19521440416962804, + "grad_norm": 0.419921875, + "learning_rate": 0.00018184514616922945, + "loss": 0.0695, + "step": 4120 + }, + { + "epoch": 0.19526178630656243, + "grad_norm": 0.53515625, + "learning_rate": 0.00018183658833954884, + "loss": 0.4098, + "step": 4121 + }, + { + "epoch": 0.1953091684434968, + "grad_norm": 1.1796875, + "learning_rate": 0.00018182802869481986, + "loss": 1.5883, + "step": 4122 + }, + { + "epoch": 0.19535655058043117, + "grad_norm": 0.640625, + "learning_rate": 0.00018181946723523236, + "loss": 1.1525, + "step": 4123 + }, + { + "epoch": 0.19540393271736556, + "grad_norm": 0.53125, + "learning_rate": 0.00018181090396097625, + "loss": 1.054, + "step": 4124 + }, + { + "epoch": 0.19545131485429992, + "grad_norm": 0.578125, + "learning_rate": 0.00018180233887224142, + "loss": 0.6363, + "step": 4125 + }, + { + "epoch": 0.19549869699123432, + "grad_norm": 0.421875, + "learning_rate": 0.0001817937719692178, + "loss": 0.5821, + "step": 4126 + }, + { + "epoch": 0.19554607912816868, + "grad_norm": 0.470703125, + "learning_rate": 0.0001817852032520955, + "loss": 0.2442, + "step": 4127 + }, + { + "epoch": 0.19559346126510305, + "grad_norm": 0.69140625, + "learning_rate": 0.00018177663272106448, + "loss": 0.5642, + "step": 4128 + }, + { + "epoch": 0.19564084340203744, + "grad_norm": 0.51171875, + "learning_rate": 0.00018176806037631485, + "loss": 0.5534, + "step": 4129 + }, + { + "epoch": 0.1956882255389718, + "grad_norm": 0.59765625, + "learning_rate": 0.00018175948621803676, + "loss": 0.7327, + "step": 4130 + }, + { + "epoch": 0.19573560767590617, + "grad_norm": 0.32421875, + "learning_rate": 0.00018175091024642034, + "loss": 0.142, + "step": 4131 + }, + { + "epoch": 0.19578298981284056, + "grad_norm": 0.5234375, + "learning_rate": 0.0001817423324616558, + "loss": 0.7017, + "step": 4132 + }, + { + "epoch": 0.19583037194977493, + "grad_norm": 0.28125, + "learning_rate": 0.0001817337528639334, + "loss": 0.0416, + "step": 4133 + }, + { + "epoch": 0.19587775408670932, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018172517145344341, + "loss": 0.1415, + "step": 4134 + }, + { + "epoch": 0.1959251362236437, + "grad_norm": 0.56640625, + "learning_rate": 0.00018171658823037617, + "loss": 1.0319, + "step": 4135 + }, + { + "epoch": 0.19597251836057805, + "grad_norm": 0.64453125, + "learning_rate": 0.00018170800319492204, + "loss": 0.9732, + "step": 4136 + }, + { + "epoch": 0.19601990049751244, + "grad_norm": 0.671875, + "learning_rate": 0.00018169941634727142, + "loss": 1.0862, + "step": 4137 + }, + { + "epoch": 0.1960672826344468, + "grad_norm": 0.20703125, + "learning_rate": 0.0001816908276876148, + "loss": 0.0289, + "step": 4138 + }, + { + "epoch": 0.1961146647713812, + "grad_norm": 0.609375, + "learning_rate": 0.00018168223721614263, + "loss": 0.6891, + "step": 4139 + }, + { + "epoch": 0.19616204690831557, + "grad_norm": 0.609375, + "learning_rate": 0.00018167364493304545, + "loss": 1.2633, + "step": 4140 + }, + { + "epoch": 0.19620942904524993, + "grad_norm": 0.59375, + "learning_rate": 0.0001816650508385138, + "loss": 0.9559, + "step": 4141 + }, + { + "epoch": 0.19625681118218433, + "grad_norm": 0.609375, + "learning_rate": 0.0001816564549327383, + "loss": 1.3139, + "step": 4142 + }, + { + "epoch": 0.1963041933191187, + "grad_norm": 0.75390625, + "learning_rate": 0.00018164785721590961, + "loss": 1.0766, + "step": 4143 + }, + { + "epoch": 0.19635157545605306, + "grad_norm": 0.65625, + "learning_rate": 0.00018163925768821843, + "loss": 0.7621, + "step": 4144 + }, + { + "epoch": 0.19639895759298745, + "grad_norm": 0.859375, + "learning_rate": 0.00018163065634985547, + "loss": 1.3806, + "step": 4145 + }, + { + "epoch": 0.19644633972992182, + "grad_norm": 0.66796875, + "learning_rate": 0.0001816220532010115, + "loss": 1.0591, + "step": 4146 + }, + { + "epoch": 0.1964937218668562, + "grad_norm": 0.76953125, + "learning_rate": 0.00018161344824187733, + "loss": 1.3723, + "step": 4147 + }, + { + "epoch": 0.19654110400379057, + "grad_norm": 0.34765625, + "learning_rate": 0.0001816048414726438, + "loss": 0.2652, + "step": 4148 + }, + { + "epoch": 0.19658848614072494, + "grad_norm": 0.86328125, + "learning_rate": 0.00018159623289350183, + "loss": 1.0016, + "step": 4149 + }, + { + "epoch": 0.19663586827765933, + "grad_norm": 0.4375, + "learning_rate": 0.0001815876225046423, + "loss": 0.0529, + "step": 4150 + }, + { + "epoch": 0.1966832504145937, + "grad_norm": 0.482421875, + "learning_rate": 0.00018157901030625625, + "loss": 0.7826, + "step": 4151 + }, + { + "epoch": 0.19673063255152806, + "grad_norm": 0.60546875, + "learning_rate": 0.00018157039629853458, + "loss": 0.7337, + "step": 4152 + }, + { + "epoch": 0.19677801468846245, + "grad_norm": 0.78125, + "learning_rate": 0.00018156178048166842, + "loss": 1.1036, + "step": 4153 + }, + { + "epoch": 0.19682539682539682, + "grad_norm": 0.56640625, + "learning_rate": 0.00018155316285584886, + "loss": 0.751, + "step": 4154 + }, + { + "epoch": 0.1968727789623312, + "grad_norm": 0.59765625, + "learning_rate": 0.00018154454342126703, + "loss": 1.113, + "step": 4155 + }, + { + "epoch": 0.19692016109926558, + "grad_norm": 0.72265625, + "learning_rate": 0.00018153592217811407, + "loss": 1.2036, + "step": 4156 + }, + { + "epoch": 0.19696754323619994, + "grad_norm": 0.5078125, + "learning_rate": 0.00018152729912658118, + "loss": 0.1611, + "step": 4157 + }, + { + "epoch": 0.19701492537313434, + "grad_norm": 0.21875, + "learning_rate": 0.0001815186742668597, + "loss": 0.1472, + "step": 4158 + }, + { + "epoch": 0.1970623075100687, + "grad_norm": 0.70703125, + "learning_rate": 0.0001815100475991408, + "loss": 1.2335, + "step": 4159 + }, + { + "epoch": 0.19710968964700307, + "grad_norm": 0.66796875, + "learning_rate": 0.00018150141912361587, + "loss": 0.9885, + "step": 4160 + }, + { + "epoch": 0.19715707178393746, + "grad_norm": 0.40625, + "learning_rate": 0.0001814927888404763, + "loss": 0.9222, + "step": 4161 + }, + { + "epoch": 0.19720445392087183, + "grad_norm": 0.0185546875, + "learning_rate": 0.00018148415674991344, + "loss": 0.0008, + "step": 4162 + }, + { + "epoch": 0.19725183605780622, + "grad_norm": 0.447265625, + "learning_rate": 0.0001814755228521188, + "loss": 0.2763, + "step": 4163 + }, + { + "epoch": 0.19729921819474058, + "grad_norm": 0.7578125, + "learning_rate": 0.00018146688714728386, + "loss": 0.9484, + "step": 4164 + }, + { + "epoch": 0.19734660033167495, + "grad_norm": 0.65234375, + "learning_rate": 0.0001814582496356001, + "loss": 1.0825, + "step": 4165 + }, + { + "epoch": 0.19739398246860934, + "grad_norm": 0.64453125, + "learning_rate": 0.00018144961031725916, + "loss": 1.1818, + "step": 4166 + }, + { + "epoch": 0.1974413646055437, + "grad_norm": 0.671875, + "learning_rate": 0.0001814409691924526, + "loss": 1.0338, + "step": 4167 + }, + { + "epoch": 0.1974887467424781, + "grad_norm": 0.34375, + "learning_rate": 0.0001814323262613721, + "loss": 0.0489, + "step": 4168 + }, + { + "epoch": 0.19753612887941246, + "grad_norm": 0.55078125, + "learning_rate": 0.00018142368152420933, + "loss": 0.9375, + "step": 4169 + }, + { + "epoch": 0.19758351101634683, + "grad_norm": 0.66015625, + "learning_rate": 0.00018141503498115603, + "loss": 1.2822, + "step": 4170 + }, + { + "epoch": 0.19763089315328122, + "grad_norm": 0.62109375, + "learning_rate": 0.00018140638663240402, + "loss": 1.137, + "step": 4171 + }, + { + "epoch": 0.1976782752902156, + "grad_norm": 1.09375, + "learning_rate": 0.00018139773647814501, + "loss": 1.0599, + "step": 4172 + }, + { + "epoch": 0.19772565742714995, + "grad_norm": 0.5703125, + "learning_rate": 0.00018138908451857094, + "loss": 0.2821, + "step": 4173 + }, + { + "epoch": 0.19777303956408435, + "grad_norm": 0.81640625, + "learning_rate": 0.00018138043075387365, + "loss": 1.3287, + "step": 4174 + }, + { + "epoch": 0.1978204217010187, + "grad_norm": 0.55859375, + "learning_rate": 0.00018137177518424508, + "loss": 0.9248, + "step": 4175 + }, + { + "epoch": 0.1978678038379531, + "grad_norm": 0.7265625, + "learning_rate": 0.00018136311780987723, + "loss": 0.8726, + "step": 4176 + }, + { + "epoch": 0.19791518597488747, + "grad_norm": 0.54296875, + "learning_rate": 0.00018135445863096206, + "loss": 0.7506, + "step": 4177 + }, + { + "epoch": 0.19796256811182183, + "grad_norm": 0.67578125, + "learning_rate": 0.00018134579764769168, + "loss": 0.2422, + "step": 4178 + }, + { + "epoch": 0.19800995024875623, + "grad_norm": 0.77734375, + "learning_rate": 0.00018133713486025816, + "loss": 1.2308, + "step": 4179 + }, + { + "epoch": 0.1980573323856906, + "grad_norm": 0.54296875, + "learning_rate": 0.0001813284702688536, + "loss": 0.961, + "step": 4180 + }, + { + "epoch": 0.19810471452262496, + "grad_norm": 0.6328125, + "learning_rate": 0.00018131980387367023, + "loss": 1.2177, + "step": 4181 + }, + { + "epoch": 0.19815209665955935, + "grad_norm": 0.51171875, + "learning_rate": 0.00018131113567490021, + "loss": 0.0505, + "step": 4182 + }, + { + "epoch": 0.19819947879649372, + "grad_norm": 0.5078125, + "learning_rate": 0.0001813024656727358, + "loss": 0.7928, + "step": 4183 + }, + { + "epoch": 0.1982468609334281, + "grad_norm": 0.7109375, + "learning_rate": 0.00018129379386736932, + "loss": 0.4456, + "step": 4184 + }, + { + "epoch": 0.19829424307036247, + "grad_norm": 0.64453125, + "learning_rate": 0.00018128512025899305, + "loss": 0.2851, + "step": 4185 + }, + { + "epoch": 0.19834162520729684, + "grad_norm": 0.06787109375, + "learning_rate": 0.00018127644484779943, + "loss": 0.0022, + "step": 4186 + }, + { + "epoch": 0.19838900734423123, + "grad_norm": 0.6640625, + "learning_rate": 0.0001812677676339808, + "loss": 0.8009, + "step": 4187 + }, + { + "epoch": 0.1984363894811656, + "grad_norm": 0.6953125, + "learning_rate": 0.00018125908861772966, + "loss": 0.9012, + "step": 4188 + }, + { + "epoch": 0.19848377161809996, + "grad_norm": 0.67578125, + "learning_rate": 0.0001812504077992385, + "loss": 0.9618, + "step": 4189 + }, + { + "epoch": 0.19853115375503436, + "grad_norm": 0.30078125, + "learning_rate": 0.00018124172517869984, + "loss": 0.1837, + "step": 4190 + }, + { + "epoch": 0.19857853589196872, + "grad_norm": 0.671875, + "learning_rate": 0.00018123304075630625, + "loss": 1.0119, + "step": 4191 + }, + { + "epoch": 0.19862591802890311, + "grad_norm": 0.68359375, + "learning_rate": 0.00018122435453225033, + "loss": 1.0928, + "step": 4192 + }, + { + "epoch": 0.19867330016583748, + "grad_norm": 0.63671875, + "learning_rate": 0.00018121566650672474, + "loss": 0.8389, + "step": 4193 + }, + { + "epoch": 0.19872068230277184, + "grad_norm": 0.609375, + "learning_rate": 0.00018120697667992216, + "loss": 1.0344, + "step": 4194 + }, + { + "epoch": 0.19876806443970624, + "grad_norm": 0.65234375, + "learning_rate": 0.00018119828505203537, + "loss": 1.2736, + "step": 4195 + }, + { + "epoch": 0.1988154465766406, + "grad_norm": 0.7578125, + "learning_rate": 0.00018118959162325711, + "loss": 0.4803, + "step": 4196 + }, + { + "epoch": 0.19886282871357497, + "grad_norm": 0.26171875, + "learning_rate": 0.00018118089639378016, + "loss": 0.146, + "step": 4197 + }, + { + "epoch": 0.19891021085050936, + "grad_norm": 0.0037994384765625, + "learning_rate": 0.00018117219936379742, + "loss": 0.0002, + "step": 4198 + }, + { + "epoch": 0.19895759298744373, + "grad_norm": 0.6796875, + "learning_rate": 0.00018116350053350172, + "loss": 1.1075, + "step": 4199 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 0.5859375, + "learning_rate": 0.0001811547999030861, + "loss": 0.7409, + "step": 4200 + }, + { + "epoch": 0.19905235726131248, + "grad_norm": 1.2578125, + "learning_rate": 0.0001811460974727434, + "loss": 0.0631, + "step": 4201 + }, + { + "epoch": 0.19909973939824685, + "grad_norm": 0.25390625, + "learning_rate": 0.00018113739324266673, + "loss": 0.1505, + "step": 4202 + }, + { + "epoch": 0.19914712153518124, + "grad_norm": 0.66015625, + "learning_rate": 0.00018112868721304904, + "loss": 1.0933, + "step": 4203 + }, + { + "epoch": 0.1991945036721156, + "grad_norm": 0.515625, + "learning_rate": 0.00018111997938408353, + "loss": 0.8051, + "step": 4204 + }, + { + "epoch": 0.19924188580905, + "grad_norm": 0.177734375, + "learning_rate": 0.00018111126975596328, + "loss": 0.0196, + "step": 4205 + }, + { + "epoch": 0.19928926794598437, + "grad_norm": 0.3203125, + "learning_rate": 0.00018110255832888146, + "loss": 0.1584, + "step": 4206 + }, + { + "epoch": 0.19933665008291873, + "grad_norm": 0.8203125, + "learning_rate": 0.00018109384510303127, + "loss": 0.1847, + "step": 4207 + }, + { + "epoch": 0.19938403221985312, + "grad_norm": 0.75390625, + "learning_rate": 0.00018108513007860597, + "loss": 0.6955, + "step": 4208 + }, + { + "epoch": 0.1994314143567875, + "grad_norm": 0.53515625, + "learning_rate": 0.00018107641325579887, + "loss": 0.0291, + "step": 4209 + }, + { + "epoch": 0.19947879649372185, + "grad_norm": 0.69921875, + "learning_rate": 0.0001810676946348033, + "loss": 1.3563, + "step": 4210 + }, + { + "epoch": 0.19952617863065625, + "grad_norm": 0.251953125, + "learning_rate": 0.00018105897421581257, + "loss": 0.1529, + "step": 4211 + }, + { + "epoch": 0.1995735607675906, + "grad_norm": 0.330078125, + "learning_rate": 0.00018105025199902016, + "loss": 0.1252, + "step": 4212 + }, + { + "epoch": 0.199620942904525, + "grad_norm": 0.62890625, + "learning_rate": 0.00018104152798461946, + "loss": 0.1457, + "step": 4213 + }, + { + "epoch": 0.19966832504145937, + "grad_norm": 0.78515625, + "learning_rate": 0.00018103280217280406, + "loss": 1.43, + "step": 4214 + }, + { + "epoch": 0.19971570717839374, + "grad_norm": 0.80078125, + "learning_rate": 0.00018102407456376733, + "loss": 1.4493, + "step": 4215 + }, + { + "epoch": 0.19976308931532813, + "grad_norm": 0.91015625, + "learning_rate": 0.00018101534515770298, + "loss": 1.3808, + "step": 4216 + }, + { + "epoch": 0.1998104714522625, + "grad_norm": 0.6015625, + "learning_rate": 0.00018100661395480455, + "loss": 0.8198, + "step": 4217 + }, + { + "epoch": 0.19985785358919686, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018099788095526576, + "loss": 0.1655, + "step": 4218 + }, + { + "epoch": 0.19990523572613125, + "grad_norm": 0.447265625, + "learning_rate": 0.00018098914615928018, + "loss": 0.8965, + "step": 4219 + }, + { + "epoch": 0.19995261786306562, + "grad_norm": 0.390625, + "learning_rate": 0.00018098040956704162, + "loss": 0.0433, + "step": 4220 + }, + { + "epoch": 0.2, + "grad_norm": 0.69140625, + "learning_rate": 0.00018097167117874386, + "loss": 1.3795, + "step": 4221 + }, + { + "epoch": 0.20004738213693438, + "grad_norm": 0.54296875, + "learning_rate": 0.00018096293099458067, + "loss": 1.1495, + "step": 4222 + }, + { + "epoch": 0.20009476427386874, + "grad_norm": 0.345703125, + "learning_rate": 0.00018095418901474587, + "loss": 0.0622, + "step": 4223 + }, + { + "epoch": 0.20014214641080313, + "grad_norm": 0.59765625, + "learning_rate": 0.0001809454452394334, + "loss": 0.1261, + "step": 4224 + }, + { + "epoch": 0.2001895285477375, + "grad_norm": 0.90234375, + "learning_rate": 0.00018093669966883722, + "loss": 0.1317, + "step": 4225 + }, + { + "epoch": 0.20023691068467186, + "grad_norm": 0.65625, + "learning_rate": 0.0001809279523031512, + "loss": 1.4214, + "step": 4226 + }, + { + "epoch": 0.20028429282160626, + "grad_norm": 0.8125, + "learning_rate": 0.00018091920314256945, + "loss": 0.301, + "step": 4227 + }, + { + "epoch": 0.20033167495854062, + "grad_norm": 0.234375, + "learning_rate": 0.00018091045218728593, + "loss": 0.1512, + "step": 4228 + }, + { + "epoch": 0.20037905709547502, + "grad_norm": 0.28515625, + "learning_rate": 0.00018090169943749476, + "loss": 0.1496, + "step": 4229 + }, + { + "epoch": 0.20042643923240938, + "grad_norm": 0.6953125, + "learning_rate": 0.00018089294489339008, + "loss": 0.88, + "step": 4230 + }, + { + "epoch": 0.20047382136934375, + "grad_norm": 0.515625, + "learning_rate": 0.00018088418855516603, + "loss": 0.201, + "step": 4231 + }, + { + "epoch": 0.20052120350627814, + "grad_norm": 0.40234375, + "learning_rate": 0.00018087543042301685, + "loss": 0.675, + "step": 4232 + }, + { + "epoch": 0.2005685856432125, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018086667049713675, + "loss": 0.0207, + "step": 4233 + }, + { + "epoch": 0.2006159677801469, + "grad_norm": 0.64453125, + "learning_rate": 0.00018085790877772005, + "loss": 1.1899, + "step": 4234 + }, + { + "epoch": 0.20066334991708126, + "grad_norm": 0.6171875, + "learning_rate": 0.00018084914526496105, + "loss": 1.0218, + "step": 4235 + }, + { + "epoch": 0.20071073205401563, + "grad_norm": 0.314453125, + "learning_rate": 0.0001808403799590541, + "loss": 0.0254, + "step": 4236 + }, + { + "epoch": 0.20075811419095002, + "grad_norm": 0.494140625, + "learning_rate": 0.00018083161286019367, + "loss": 0.8982, + "step": 4237 + }, + { + "epoch": 0.2008054963278844, + "grad_norm": 0.625, + "learning_rate": 0.0001808228439685742, + "loss": 1.0802, + "step": 4238 + }, + { + "epoch": 0.20085287846481875, + "grad_norm": 0.73828125, + "learning_rate": 0.00018081407328439007, + "loss": 0.926, + "step": 4239 + }, + { + "epoch": 0.20090026060175314, + "grad_norm": 0.640625, + "learning_rate": 0.0001808053008078359, + "loss": 0.8248, + "step": 4240 + }, + { + "epoch": 0.2009476427386875, + "grad_norm": 0.59375, + "learning_rate": 0.0001807965265391062, + "loss": 0.9299, + "step": 4241 + }, + { + "epoch": 0.2009950248756219, + "grad_norm": 0.58203125, + "learning_rate": 0.00018078775047839564, + "loss": 0.2623, + "step": 4242 + }, + { + "epoch": 0.20104240701255627, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018077897262589883, + "loss": 0.1651, + "step": 4243 + }, + { + "epoch": 0.20108978914949063, + "grad_norm": 1.5, + "learning_rate": 0.00018077019298181043, + "loss": 0.0975, + "step": 4244 + }, + { + "epoch": 0.20113717128642503, + "grad_norm": 0.8359375, + "learning_rate": 0.00018076141154632516, + "loss": 0.8288, + "step": 4245 + }, + { + "epoch": 0.2011845534233594, + "grad_norm": 0.5703125, + "learning_rate": 0.0001807526283196378, + "loss": 0.7461, + "step": 4246 + }, + { + "epoch": 0.20123193556029376, + "grad_norm": 0.09228515625, + "learning_rate": 0.00018074384330194322, + "loss": 0.0066, + "step": 4247 + }, + { + "epoch": 0.20127931769722815, + "grad_norm": 0.734375, + "learning_rate": 0.00018073505649343616, + "loss": 0.1853, + "step": 4248 + }, + { + "epoch": 0.20132669983416251, + "grad_norm": 0.625, + "learning_rate": 0.00018072626789431156, + "loss": 0.8787, + "step": 4249 + }, + { + "epoch": 0.2013740819710969, + "grad_norm": 0.51171875, + "learning_rate": 0.0001807174775047643, + "loss": 0.4628, + "step": 4250 + }, + { + "epoch": 0.20142146410803127, + "grad_norm": 1.265625, + "learning_rate": 0.0001807086853249894, + "loss": 0.3134, + "step": 4251 + }, + { + "epoch": 0.20146884624496564, + "grad_norm": 0.66796875, + "learning_rate": 0.0001806998913551818, + "loss": 0.8921, + "step": 4252 + }, + { + "epoch": 0.20151622838190003, + "grad_norm": 0.69921875, + "learning_rate": 0.00018069109559553655, + "loss": 0.7831, + "step": 4253 + }, + { + "epoch": 0.2015636105188344, + "grad_norm": 0.796875, + "learning_rate": 0.00018068229804624878, + "loss": 0.0292, + "step": 4254 + }, + { + "epoch": 0.20161099265576876, + "grad_norm": 0.53125, + "learning_rate": 0.00018067349870751355, + "loss": 0.0452, + "step": 4255 + }, + { + "epoch": 0.20165837479270315, + "grad_norm": 0.59765625, + "learning_rate": 0.00018066469757952608, + "loss": 1.1553, + "step": 4256 + }, + { + "epoch": 0.20170575692963752, + "grad_norm": 0.53515625, + "learning_rate": 0.00018065589466248152, + "loss": 0.9697, + "step": 4257 + }, + { + "epoch": 0.2017531390665719, + "grad_norm": 0.5703125, + "learning_rate": 0.00018064708995657513, + "loss": 0.9858, + "step": 4258 + }, + { + "epoch": 0.20180052120350628, + "grad_norm": 0.734375, + "learning_rate": 0.0001806382834620022, + "loss": 0.994, + "step": 4259 + }, + { + "epoch": 0.20184790334044064, + "grad_norm": 0.6953125, + "learning_rate": 0.000180629475178958, + "loss": 0.1986, + "step": 4260 + }, + { + "epoch": 0.20189528547737504, + "grad_norm": 0.330078125, + "learning_rate": 0.00018062066510763795, + "loss": 0.2004, + "step": 4261 + }, + { + "epoch": 0.2019426676143094, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018061185324823743, + "loss": 0.0201, + "step": 4262 + }, + { + "epoch": 0.2019900497512438, + "grad_norm": 0.69921875, + "learning_rate": 0.00018060303960095185, + "loss": 1.223, + "step": 4263 + }, + { + "epoch": 0.20203743188817816, + "grad_norm": 0.6171875, + "learning_rate": 0.0001805942241659767, + "loss": 0.8158, + "step": 4264 + }, + { + "epoch": 0.20208481402511252, + "grad_norm": 0.11279296875, + "learning_rate": 0.00018058540694350752, + "loss": 0.0056, + "step": 4265 + }, + { + "epoch": 0.20213219616204692, + "grad_norm": 0.70703125, + "learning_rate": 0.00018057658793373983, + "loss": 1.104, + "step": 4266 + }, + { + "epoch": 0.20217957829898128, + "grad_norm": 0.87890625, + "learning_rate": 0.00018056776713686927, + "loss": 1.2196, + "step": 4267 + }, + { + "epoch": 0.20222696043591565, + "grad_norm": 0.443359375, + "learning_rate": 0.00018055894455309144, + "loss": 1.0654, + "step": 4268 + }, + { + "epoch": 0.20227434257285004, + "grad_norm": 0.55078125, + "learning_rate": 0.00018055012018260204, + "loss": 1.1841, + "step": 4269 + }, + { + "epoch": 0.2023217247097844, + "grad_norm": 0.6015625, + "learning_rate": 0.00018054129402559675, + "loss": 1.1881, + "step": 4270 + }, + { + "epoch": 0.2023691068467188, + "grad_norm": 0.79296875, + "learning_rate": 0.00018053246608227137, + "loss": 0.5166, + "step": 4271 + }, + { + "epoch": 0.20241648898365316, + "grad_norm": 0.51953125, + "learning_rate": 0.00018052363635282163, + "loss": 0.0744, + "step": 4272 + }, + { + "epoch": 0.20246387112058753, + "grad_norm": 0.55859375, + "learning_rate": 0.00018051480483744346, + "loss": 0.9402, + "step": 4273 + }, + { + "epoch": 0.20251125325752192, + "grad_norm": 0.02197265625, + "learning_rate": 0.00018050597153633263, + "loss": 0.0009, + "step": 4274 + }, + { + "epoch": 0.2025586353944563, + "grad_norm": 0.28515625, + "learning_rate": 0.00018049713644968516, + "loss": 0.1977, + "step": 4275 + }, + { + "epoch": 0.20260601753139065, + "grad_norm": 0.6484375, + "learning_rate": 0.0001804882995776969, + "loss": 1.4933, + "step": 4276 + }, + { + "epoch": 0.20265339966832505, + "grad_norm": 0.01129150390625, + "learning_rate": 0.00018047946092056391, + "loss": 0.0006, + "step": 4277 + }, + { + "epoch": 0.2027007818052594, + "grad_norm": 0.58984375, + "learning_rate": 0.0001804706204784822, + "loss": 1.4036, + "step": 4278 + }, + { + "epoch": 0.2027481639421938, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018046177825164784, + "loss": 0.015, + "step": 4279 + }, + { + "epoch": 0.20279554607912817, + "grad_norm": 0.65625, + "learning_rate": 0.0001804529342402569, + "loss": 0.6647, + "step": 4280 + }, + { + "epoch": 0.20284292821606253, + "grad_norm": 0.69140625, + "learning_rate": 0.00018044408844450563, + "loss": 0.3855, + "step": 4281 + }, + { + "epoch": 0.20289031035299693, + "grad_norm": 0.5546875, + "learning_rate": 0.00018043524086459013, + "loss": 0.9161, + "step": 4282 + }, + { + "epoch": 0.2029376924899313, + "grad_norm": 0.55078125, + "learning_rate": 0.0001804263915007067, + "loss": 1.0379, + "step": 4283 + }, + { + "epoch": 0.20298507462686566, + "grad_norm": 0.53125, + "learning_rate": 0.00018041754035305152, + "loss": 0.0514, + "step": 4284 + }, + { + "epoch": 0.20303245676380005, + "grad_norm": 1.09375, + "learning_rate": 0.00018040868742182098, + "loss": 0.0456, + "step": 4285 + }, + { + "epoch": 0.20307983890073442, + "grad_norm": 0.67578125, + "learning_rate": 0.0001803998327072114, + "loss": 1.2112, + "step": 4286 + }, + { + "epoch": 0.2031272210376688, + "grad_norm": 0.349609375, + "learning_rate": 0.00018039097620941915, + "loss": 0.0263, + "step": 4287 + }, + { + "epoch": 0.20317460317460317, + "grad_norm": 0.14453125, + "learning_rate": 0.0001803821179286407, + "loss": 0.0274, + "step": 4288 + }, + { + "epoch": 0.20322198531153754, + "grad_norm": 0.171875, + "learning_rate": 0.00018037325786507248, + "loss": 0.0196, + "step": 4289 + }, + { + "epoch": 0.20326936744847193, + "grad_norm": 0.5703125, + "learning_rate": 0.00018036439601891102, + "loss": 1.1825, + "step": 4290 + }, + { + "epoch": 0.2033167495854063, + "grad_norm": 0.79296875, + "learning_rate": 0.00018035553239035285, + "loss": 0.8397, + "step": 4291 + }, + { + "epoch": 0.2033641317223407, + "grad_norm": 0.66015625, + "learning_rate": 0.00018034666697959456, + "loss": 0.9871, + "step": 4292 + }, + { + "epoch": 0.20341151385927506, + "grad_norm": 0.609375, + "learning_rate": 0.0001803377997868328, + "loss": 0.7991, + "step": 4293 + }, + { + "epoch": 0.20345889599620942, + "grad_norm": 0.70703125, + "learning_rate": 0.0001803289308122642, + "loss": 0.9436, + "step": 4294 + }, + { + "epoch": 0.20350627813314381, + "grad_norm": 0.78125, + "learning_rate": 0.0001803200600560855, + "loss": 1.4093, + "step": 4295 + }, + { + "epoch": 0.20355366027007818, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018031118751849338, + "loss": 0.0173, + "step": 4296 + }, + { + "epoch": 0.20360104240701254, + "grad_norm": 0.326171875, + "learning_rate": 0.00018030231319968466, + "loss": 0.1785, + "step": 4297 + }, + { + "epoch": 0.20364842454394694, + "grad_norm": 0.5234375, + "learning_rate": 0.0001802934370998562, + "loss": 0.6897, + "step": 4298 + }, + { + "epoch": 0.2036958066808813, + "grad_norm": 0.158203125, + "learning_rate": 0.0001802845592192048, + "loss": 0.0282, + "step": 4299 + }, + { + "epoch": 0.2037431888178157, + "grad_norm": 0.5703125, + "learning_rate": 0.0001802756795579274, + "loss": 1.045, + "step": 4300 + }, + { + "epoch": 0.20379057095475006, + "grad_norm": 0.65625, + "learning_rate": 0.00018026679811622098, + "loss": 1.0125, + "step": 4301 + }, + { + "epoch": 0.20383795309168443, + "grad_norm": 0.984375, + "learning_rate": 0.0001802579148942824, + "loss": 1.3856, + "step": 4302 + }, + { + "epoch": 0.20388533522861882, + "grad_norm": 0.5859375, + "learning_rate": 0.00018024902989230882, + "loss": 0.8329, + "step": 4303 + }, + { + "epoch": 0.20393271736555318, + "grad_norm": 0.9453125, + "learning_rate": 0.0001802401431104972, + "loss": 1.3463, + "step": 4304 + }, + { + "epoch": 0.20398009950248755, + "grad_norm": 0.83984375, + "learning_rate": 0.00018023125454904467, + "loss": 0.2841, + "step": 4305 + }, + { + "epoch": 0.20402748163942194, + "grad_norm": 0.5390625, + "learning_rate": 0.00018022236420814838, + "loss": 0.7527, + "step": 4306 + }, + { + "epoch": 0.2040748637763563, + "grad_norm": 0.00122833251953125, + "learning_rate": 0.0001802134720880055, + "loss": 0.0001, + "step": 4307 + }, + { + "epoch": 0.2041222459132907, + "grad_norm": 0.67578125, + "learning_rate": 0.00018020457818881326, + "loss": 0.7237, + "step": 4308 + }, + { + "epoch": 0.20416962805022507, + "grad_norm": 0.90234375, + "learning_rate": 0.0001801956825107689, + "loss": 1.564, + "step": 4309 + }, + { + "epoch": 0.20421701018715943, + "grad_norm": 0.8984375, + "learning_rate": 0.00018018678505406972, + "loss": 1.4349, + "step": 4310 + }, + { + "epoch": 0.20426439232409382, + "grad_norm": 0.87109375, + "learning_rate": 0.00018017788581891307, + "loss": 1.4531, + "step": 4311 + }, + { + "epoch": 0.2043117744610282, + "grad_norm": 0.65625, + "learning_rate": 0.0001801689848054963, + "loss": 0.8812, + "step": 4312 + }, + { + "epoch": 0.20435915659796255, + "grad_norm": 0.58203125, + "learning_rate": 0.00018016008201401684, + "loss": 0.8732, + "step": 4313 + }, + { + "epoch": 0.20440653873489695, + "grad_norm": 0.345703125, + "learning_rate": 0.00018015117744467213, + "loss": 0.1634, + "step": 4314 + }, + { + "epoch": 0.2044539208718313, + "grad_norm": 0.099609375, + "learning_rate": 0.0001801422710976597, + "loss": 0.0031, + "step": 4315 + }, + { + "epoch": 0.2045013030087657, + "grad_norm": 0.88671875, + "learning_rate": 0.00018013336297317703, + "loss": 0.8116, + "step": 4316 + }, + { + "epoch": 0.20454868514570007, + "grad_norm": 0.671875, + "learning_rate": 0.00018012445307142175, + "loss": 1.1377, + "step": 4317 + }, + { + "epoch": 0.20459606728263444, + "grad_norm": 0.5625, + "learning_rate": 0.00018011554139259146, + "loss": 0.8103, + "step": 4318 + }, + { + "epoch": 0.20464344941956883, + "grad_norm": 0.609375, + "learning_rate": 0.00018010662793688378, + "loss": 0.8936, + "step": 4319 + }, + { + "epoch": 0.2046908315565032, + "grad_norm": 0.50390625, + "learning_rate": 0.0001800977127044964, + "loss": 0.1203, + "step": 4320 + }, + { + "epoch": 0.2047382136934376, + "grad_norm": 0.76171875, + "learning_rate": 0.0001800887956956271, + "loss": 0.9999, + "step": 4321 + }, + { + "epoch": 0.20478559583037195, + "grad_norm": 0.8125, + "learning_rate": 0.00018007987691047358, + "loss": 1.2671, + "step": 4322 + }, + { + "epoch": 0.20483297796730632, + "grad_norm": 0.640625, + "learning_rate": 0.00018007095634923372, + "loss": 1.2057, + "step": 4323 + }, + { + "epoch": 0.2048803601042407, + "grad_norm": 0.71875, + "learning_rate": 0.00018006203401210535, + "loss": 1.0501, + "step": 4324 + }, + { + "epoch": 0.20492774224117508, + "grad_norm": 0.67578125, + "learning_rate": 0.00018005310989928632, + "loss": 1.1723, + "step": 4325 + }, + { + "epoch": 0.20497512437810944, + "grad_norm": 0.6484375, + "learning_rate": 0.00018004418401097456, + "loss": 1.1301, + "step": 4326 + }, + { + "epoch": 0.20502250651504383, + "grad_norm": 0.60546875, + "learning_rate": 0.00018003525634736808, + "loss": 0.8912, + "step": 4327 + }, + { + "epoch": 0.2050698886519782, + "grad_norm": 0.6328125, + "learning_rate": 0.00018002632690866487, + "loss": 1.0652, + "step": 4328 + }, + { + "epoch": 0.2051172707889126, + "grad_norm": 0.8125, + "learning_rate": 0.000180017395695063, + "loss": 1.0792, + "step": 4329 + }, + { + "epoch": 0.20516465292584696, + "grad_norm": 0.79296875, + "learning_rate": 0.00018000846270676047, + "loss": 1.1601, + "step": 4330 + }, + { + "epoch": 0.20521203506278132, + "grad_norm": 0.31640625, + "learning_rate": 0.00017999952794395548, + "loss": 0.0512, + "step": 4331 + }, + { + "epoch": 0.20525941719971572, + "grad_norm": 0.703125, + "learning_rate": 0.00017999059140684615, + "loss": 0.7923, + "step": 4332 + }, + { + "epoch": 0.20530679933665008, + "grad_norm": 0.9609375, + "learning_rate": 0.00017998165309563073, + "loss": 0.7649, + "step": 4333 + }, + { + "epoch": 0.20535418147358445, + "grad_norm": 0.671875, + "learning_rate": 0.0001799727130105074, + "loss": 0.9117, + "step": 4334 + }, + { + "epoch": 0.20540156361051884, + "grad_norm": 0.76171875, + "learning_rate": 0.00017996377115167452, + "loss": 0.3355, + "step": 4335 + }, + { + "epoch": 0.2054489457474532, + "grad_norm": 0.75, + "learning_rate": 0.00017995482751933037, + "loss": 1.2005, + "step": 4336 + }, + { + "epoch": 0.2054963278843876, + "grad_norm": 0.65234375, + "learning_rate": 0.0001799458821136733, + "loss": 0.8636, + "step": 4337 + }, + { + "epoch": 0.20554371002132196, + "grad_norm": 0.8203125, + "learning_rate": 0.00017993693493490175, + "loss": 0.5847, + "step": 4338 + }, + { + "epoch": 0.20559109215825633, + "grad_norm": 0.6484375, + "learning_rate": 0.00017992798598321405, + "loss": 1.2174, + "step": 4339 + }, + { + "epoch": 0.20563847429519072, + "grad_norm": 0.640625, + "learning_rate": 0.00017991903525880882, + "loss": 1.2868, + "step": 4340 + }, + { + "epoch": 0.20568585643212509, + "grad_norm": 0.494140625, + "learning_rate": 0.00017991008276188448, + "loss": 0.0422, + "step": 4341 + }, + { + "epoch": 0.20573323856905945, + "grad_norm": 0.6796875, + "learning_rate": 0.00017990112849263965, + "loss": 0.8283, + "step": 4342 + }, + { + "epoch": 0.20578062070599384, + "grad_norm": 0.7109375, + "learning_rate": 0.00017989217245127287, + "loss": 1.3559, + "step": 4343 + }, + { + "epoch": 0.2058280028429282, + "grad_norm": 0.578125, + "learning_rate": 0.00017988321463798283, + "loss": 0.6048, + "step": 4344 + }, + { + "epoch": 0.2058753849798626, + "grad_norm": 0.51953125, + "learning_rate": 0.00017987425505296815, + "loss": 0.9089, + "step": 4345 + }, + { + "epoch": 0.20592276711679697, + "grad_norm": 0.75, + "learning_rate": 0.00017986529369642758, + "loss": 0.8273, + "step": 4346 + }, + { + "epoch": 0.20597014925373133, + "grad_norm": 0.60546875, + "learning_rate": 0.00017985633056855988, + "loss": 0.1078, + "step": 4347 + }, + { + "epoch": 0.20601753139066573, + "grad_norm": 0.6015625, + "learning_rate": 0.00017984736566956382, + "loss": 0.8183, + "step": 4348 + }, + { + "epoch": 0.2060649135276001, + "grad_norm": 0.00225830078125, + "learning_rate": 0.00017983839899963822, + "loss": 0.0002, + "step": 4349 + }, + { + "epoch": 0.20611229566453448, + "grad_norm": 0.330078125, + "learning_rate": 0.000179829430558982, + "loss": 0.1592, + "step": 4350 + }, + { + "epoch": 0.20615967780146885, + "grad_norm": 0.5859375, + "learning_rate": 0.000179820460347794, + "loss": 0.9371, + "step": 4351 + }, + { + "epoch": 0.20620705993840321, + "grad_norm": 0.68359375, + "learning_rate": 0.00017981148836627325, + "loss": 1.0387, + "step": 4352 + }, + { + "epoch": 0.2062544420753376, + "grad_norm": 0.984375, + "learning_rate": 0.00017980251461461867, + "loss": 0.4302, + "step": 4353 + }, + { + "epoch": 0.20630182421227197, + "grad_norm": 0.51953125, + "learning_rate": 0.00017979353909302934, + "loss": 0.5641, + "step": 4354 + }, + { + "epoch": 0.20634920634920634, + "grad_norm": 0.6953125, + "learning_rate": 0.0001797845618017043, + "loss": 0.986, + "step": 4355 + }, + { + "epoch": 0.20639658848614073, + "grad_norm": 0.6171875, + "learning_rate": 0.00017977558274084266, + "loss": 1.3083, + "step": 4356 + }, + { + "epoch": 0.2064439706230751, + "grad_norm": 0.56640625, + "learning_rate": 0.00017976660191064357, + "loss": 0.7942, + "step": 4357 + }, + { + "epoch": 0.2064913527600095, + "grad_norm": 0.703125, + "learning_rate": 0.0001797576193113062, + "loss": 0.26, + "step": 4358 + }, + { + "epoch": 0.20653873489694385, + "grad_norm": 0.6484375, + "learning_rate": 0.00017974863494302982, + "loss": 0.9274, + "step": 4359 + }, + { + "epoch": 0.20658611703387822, + "grad_norm": 0.5703125, + "learning_rate": 0.00017973964880601363, + "loss": 0.7014, + "step": 4360 + }, + { + "epoch": 0.2066334991708126, + "grad_norm": 0.462890625, + "learning_rate": 0.000179730660900457, + "loss": 0.059, + "step": 4361 + }, + { + "epoch": 0.20668088130774698, + "grad_norm": 0.396484375, + "learning_rate": 0.00017972167122655918, + "loss": 0.1828, + "step": 4362 + }, + { + "epoch": 0.20672826344468134, + "grad_norm": 0.8359375, + "learning_rate": 0.00017971267978451967, + "loss": 0.6896, + "step": 4363 + }, + { + "epoch": 0.20677564558161574, + "grad_norm": 0.515625, + "learning_rate": 0.00017970368657453778, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.2068230277185501, + "grad_norm": 0.26953125, + "learning_rate": 0.000179694691596813, + "loss": 0.1646, + "step": 4365 + }, + { + "epoch": 0.2068704098554845, + "grad_norm": 0.62890625, + "learning_rate": 0.00017968569485154489, + "loss": 1.4281, + "step": 4366 + }, + { + "epoch": 0.20691779199241886, + "grad_norm": 0.7421875, + "learning_rate": 0.00017967669633893297, + "loss": 0.9969, + "step": 4367 + }, + { + "epoch": 0.20696517412935322, + "grad_norm": 0.66015625, + "learning_rate": 0.00017966769605917674, + "loss": 0.8421, + "step": 4368 + }, + { + "epoch": 0.20701255626628762, + "grad_norm": 0.56640625, + "learning_rate": 0.0001796586940124759, + "loss": 0.9112, + "step": 4369 + }, + { + "epoch": 0.20705993840322198, + "grad_norm": 0.5390625, + "learning_rate": 0.00017964969019903005, + "loss": 0.5215, + "step": 4370 + }, + { + "epoch": 0.20710732054015635, + "grad_norm": 0.65234375, + "learning_rate": 0.00017964068461903896, + "loss": 0.8646, + "step": 4371 + }, + { + "epoch": 0.20715470267709074, + "grad_norm": 0.8984375, + "learning_rate": 0.00017963167727270225, + "loss": 0.1278, + "step": 4372 + }, + { + "epoch": 0.2072020848140251, + "grad_norm": 0.75, + "learning_rate": 0.0001796226681602198, + "loss": 1.0172, + "step": 4373 + }, + { + "epoch": 0.2072494669509595, + "grad_norm": 0.6328125, + "learning_rate": 0.00017961365728179138, + "loss": 0.7388, + "step": 4374 + }, + { + "epoch": 0.20729684908789386, + "grad_norm": 0.65625, + "learning_rate": 0.00017960464463761682, + "loss": 0.9097, + "step": 4375 + }, + { + "epoch": 0.20734423122482823, + "grad_norm": 1.9609375, + "learning_rate": 0.00017959563022789605, + "loss": 0.4247, + "step": 4376 + }, + { + "epoch": 0.20739161336176262, + "grad_norm": 0.6953125, + "learning_rate": 0.00017958661405282898, + "loss": 0.4378, + "step": 4377 + }, + { + "epoch": 0.207438995498697, + "grad_norm": 0.703125, + "learning_rate": 0.0001795775961126156, + "loss": 0.8121, + "step": 4378 + }, + { + "epoch": 0.20748637763563138, + "grad_norm": 0.57421875, + "learning_rate": 0.00017956857640745585, + "loss": 0.1325, + "step": 4379 + }, + { + "epoch": 0.20753375977256575, + "grad_norm": 0.26953125, + "learning_rate": 0.00017955955493754986, + "loss": 0.1541, + "step": 4380 + }, + { + "epoch": 0.2075811419095001, + "grad_norm": 0.53125, + "learning_rate": 0.0001795505317030977, + "loss": 1.0451, + "step": 4381 + }, + { + "epoch": 0.2076285240464345, + "grad_norm": 0.419921875, + "learning_rate": 0.00017954150670429944, + "loss": 0.6229, + "step": 4382 + }, + { + "epoch": 0.20767590618336887, + "grad_norm": 0.671875, + "learning_rate": 0.0001795324799413553, + "loss": 1.2359, + "step": 4383 + }, + { + "epoch": 0.20772328832030323, + "grad_norm": 0.4375, + "learning_rate": 0.00017952345141446552, + "loss": 0.1026, + "step": 4384 + }, + { + "epoch": 0.20777067045723763, + "grad_norm": 1.265625, + "learning_rate": 0.0001795144211238302, + "loss": 0.7769, + "step": 4385 + }, + { + "epoch": 0.207818052594172, + "grad_norm": 0.65625, + "learning_rate": 0.00017950538906964979, + "loss": 0.53, + "step": 4386 + }, + { + "epoch": 0.20786543473110639, + "grad_norm": 0.73828125, + "learning_rate": 0.0001794963552521245, + "loss": 1.4608, + "step": 4387 + }, + { + "epoch": 0.20791281686804075, + "grad_norm": 0.57421875, + "learning_rate": 0.00017948731967145474, + "loss": 1.4962, + "step": 4388 + }, + { + "epoch": 0.20796019900497512, + "grad_norm": 0.6171875, + "learning_rate": 0.0001794782823278409, + "loss": 1.2669, + "step": 4389 + }, + { + "epoch": 0.2080075811419095, + "grad_norm": 0.67578125, + "learning_rate": 0.00017946924322148343, + "loss": 1.1537, + "step": 4390 + }, + { + "epoch": 0.20805496327884387, + "grad_norm": 0.546875, + "learning_rate": 0.00017946020235258274, + "loss": 0.9543, + "step": 4391 + }, + { + "epoch": 0.20810234541577824, + "grad_norm": 0.67578125, + "learning_rate": 0.00017945115972133944, + "loss": 0.7522, + "step": 4392 + }, + { + "epoch": 0.20814972755271263, + "grad_norm": 0.61328125, + "learning_rate": 0.000179442115327954, + "loss": 0.1398, + "step": 4393 + }, + { + "epoch": 0.208197109689647, + "grad_norm": 0.66015625, + "learning_rate": 0.0001794330691726271, + "loss": 0.2923, + "step": 4394 + }, + { + "epoch": 0.2082444918265814, + "grad_norm": 0.244140625, + "learning_rate": 0.00017942402125555935, + "loss": 0.1591, + "step": 4395 + }, + { + "epoch": 0.20829187396351576, + "grad_norm": 0.3515625, + "learning_rate": 0.00017941497157695138, + "loss": 0.0288, + "step": 4396 + }, + { + "epoch": 0.20833925610045012, + "grad_norm": 0.68359375, + "learning_rate": 0.00017940592013700394, + "loss": 1.0994, + "step": 4397 + }, + { + "epoch": 0.2083866382373845, + "grad_norm": 0.6484375, + "learning_rate": 0.00017939686693591776, + "loss": 1.5042, + "step": 4398 + }, + { + "epoch": 0.20843402037431888, + "grad_norm": 0.671875, + "learning_rate": 0.00017938781197389365, + "loss": 0.4372, + "step": 4399 + }, + { + "epoch": 0.20848140251125324, + "grad_norm": 0.53515625, + "learning_rate": 0.00017937875525113242, + "loss": 1.1026, + "step": 4400 + }, + { + "epoch": 0.20852878464818764, + "grad_norm": 0.5390625, + "learning_rate": 0.00017936969676783494, + "loss": 0.1716, + "step": 4401 + }, + { + "epoch": 0.208576166785122, + "grad_norm": 0.0037689208984375, + "learning_rate": 0.00017936063652420215, + "loss": 0.0003, + "step": 4402 + }, + { + "epoch": 0.2086235489220564, + "grad_norm": 0.197265625, + "learning_rate": 0.00017935157452043495, + "loss": 0.0158, + "step": 4403 + }, + { + "epoch": 0.20867093105899076, + "grad_norm": 0.2099609375, + "learning_rate": 0.00017934251075673436, + "loss": 0.0206, + "step": 4404 + }, + { + "epoch": 0.20871831319592513, + "grad_norm": 0.66015625, + "learning_rate": 0.0001793334452333014, + "loss": 0.8304, + "step": 4405 + }, + { + "epoch": 0.20876569533285952, + "grad_norm": 0.376953125, + "learning_rate": 0.00017932437795033712, + "loss": 0.0437, + "step": 4406 + }, + { + "epoch": 0.20881307746979388, + "grad_norm": 0.7578125, + "learning_rate": 0.00017931530890804263, + "loss": 1.0193, + "step": 4407 + }, + { + "epoch": 0.20886045960672828, + "grad_norm": 0.65625, + "learning_rate": 0.0001793062381066191, + "loss": 0.4923, + "step": 4408 + }, + { + "epoch": 0.20890784174366264, + "grad_norm": 1.109375, + "learning_rate": 0.00017929716554626764, + "loss": 0.4406, + "step": 4409 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.578125, + "learning_rate": 0.00017928809122718953, + "loss": 0.8709, + "step": 4410 + }, + { + "epoch": 0.2090026060175314, + "grad_norm": 0.65625, + "learning_rate": 0.000179279015149586, + "loss": 0.6877, + "step": 4411 + }, + { + "epoch": 0.20904998815446577, + "grad_norm": 0.142578125, + "learning_rate": 0.00017926993731365838, + "loss": 0.0178, + "step": 4412 + }, + { + "epoch": 0.20909737029140013, + "grad_norm": 0.578125, + "learning_rate": 0.00017926085771960797, + "loss": 0.9612, + "step": 4413 + }, + { + "epoch": 0.20914475242833452, + "grad_norm": 0.55859375, + "learning_rate": 0.00017925177636763615, + "loss": 0.3026, + "step": 4414 + }, + { + "epoch": 0.2091921345652689, + "grad_norm": 0.52734375, + "learning_rate": 0.00017924269325794435, + "loss": 0.6007, + "step": 4415 + }, + { + "epoch": 0.20923951670220328, + "grad_norm": 0.6171875, + "learning_rate": 0.00017923360839073406, + "loss": 0.7906, + "step": 4416 + }, + { + "epoch": 0.20928689883913765, + "grad_norm": 0.5703125, + "learning_rate": 0.00017922452176620667, + "loss": 0.7725, + "step": 4417 + }, + { + "epoch": 0.209334280976072, + "grad_norm": 0.8671875, + "learning_rate": 0.0001792154333845638, + "loss": 1.5173, + "step": 4418 + }, + { + "epoch": 0.2093816631130064, + "grad_norm": 0.013671875, + "learning_rate": 0.000179206343246007, + "loss": 0.0007, + "step": 4419 + }, + { + "epoch": 0.20942904524994077, + "grad_norm": 0.65625, + "learning_rate": 0.00017919725135073788, + "loss": 0.9344, + "step": 4420 + }, + { + "epoch": 0.20947642738687514, + "grad_norm": 0.69921875, + "learning_rate": 0.00017918815769895808, + "loss": 0.0335, + "step": 4421 + }, + { + "epoch": 0.20952380952380953, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017917906229086925, + "loss": 0.0333, + "step": 4422 + }, + { + "epoch": 0.2095711916607439, + "grad_norm": 0.55078125, + "learning_rate": 0.0001791699651266732, + "loss": 1.1535, + "step": 4423 + }, + { + "epoch": 0.2096185737976783, + "grad_norm": 0.61328125, + "learning_rate": 0.00017916086620657165, + "loss": 1.2566, + "step": 4424 + }, + { + "epoch": 0.20966595593461265, + "grad_norm": 0.60546875, + "learning_rate": 0.0001791517655307664, + "loss": 0.9216, + "step": 4425 + }, + { + "epoch": 0.20971333807154702, + "grad_norm": 0.330078125, + "learning_rate": 0.00017914266309945927, + "loss": 0.033, + "step": 4426 + }, + { + "epoch": 0.2097607202084814, + "grad_norm": 0.423828125, + "learning_rate": 0.0001791335589128522, + "loss": 0.6292, + "step": 4427 + }, + { + "epoch": 0.20980810234541578, + "grad_norm": 0.51171875, + "learning_rate": 0.00017912445297114707, + "loss": 0.8958, + "step": 4428 + }, + { + "epoch": 0.20985548448235014, + "grad_norm": 0.6875, + "learning_rate": 0.00017911534527454586, + "loss": 0.7853, + "step": 4429 + }, + { + "epoch": 0.20990286661928453, + "grad_norm": 0.546875, + "learning_rate": 0.00017910623582325058, + "loss": 1.0217, + "step": 4430 + }, + { + "epoch": 0.2099502487562189, + "grad_norm": 0.640625, + "learning_rate": 0.0001790971246174632, + "loss": 0.9569, + "step": 4431 + }, + { + "epoch": 0.2099976308931533, + "grad_norm": 0.671875, + "learning_rate": 0.0001790880116573859, + "loss": 1.152, + "step": 4432 + }, + { + "epoch": 0.21004501303008766, + "grad_norm": 0.69921875, + "learning_rate": 0.0001790788969432207, + "loss": 0.9903, + "step": 4433 + }, + { + "epoch": 0.21009239516702202, + "grad_norm": 0.53515625, + "learning_rate": 0.00017906978047516983, + "loss": 0.6095, + "step": 4434 + }, + { + "epoch": 0.21013977730395642, + "grad_norm": 0.58203125, + "learning_rate": 0.00017906066225343542, + "loss": 0.7025, + "step": 4435 + }, + { + "epoch": 0.21018715944089078, + "grad_norm": 0.2041015625, + "learning_rate": 0.00017905154227821978, + "loss": 0.0122, + "step": 4436 + }, + { + "epoch": 0.21023454157782517, + "grad_norm": 0.68359375, + "learning_rate": 0.0001790424205497251, + "loss": 1.0213, + "step": 4437 + }, + { + "epoch": 0.21028192371475954, + "grad_norm": 0.76171875, + "learning_rate": 0.00017903329706815373, + "loss": 1.4509, + "step": 4438 + }, + { + "epoch": 0.2103293058516939, + "grad_norm": 0.58984375, + "learning_rate": 0.000179024171833708, + "loss": 0.7588, + "step": 4439 + }, + { + "epoch": 0.2103766879886283, + "grad_norm": 0.5234375, + "learning_rate": 0.00017901504484659034, + "loss": 1.0901, + "step": 4440 + }, + { + "epoch": 0.21042407012556266, + "grad_norm": 0.55859375, + "learning_rate": 0.00017900591610700312, + "loss": 1.1255, + "step": 4441 + }, + { + "epoch": 0.21047145226249703, + "grad_norm": 0.64453125, + "learning_rate": 0.00017899678561514884, + "loss": 0.5801, + "step": 4442 + }, + { + "epoch": 0.21051883439943142, + "grad_norm": 0.66015625, + "learning_rate": 0.00017898765337123003, + "loss": 0.9075, + "step": 4443 + }, + { + "epoch": 0.21056621653636579, + "grad_norm": 0.58203125, + "learning_rate": 0.00017897851937544916, + "loss": 0.7969, + "step": 4444 + }, + { + "epoch": 0.21061359867330018, + "grad_norm": 0.55859375, + "learning_rate": 0.00017896938362800885, + "loss": 0.7594, + "step": 4445 + }, + { + "epoch": 0.21066098081023454, + "grad_norm": 0.86328125, + "learning_rate": 0.00017896024612911178, + "loss": 0.959, + "step": 4446 + }, + { + "epoch": 0.2107083629471689, + "grad_norm": 0.60546875, + "learning_rate": 0.00017895110687896052, + "loss": 1.2508, + "step": 4447 + }, + { + "epoch": 0.2107557450841033, + "grad_norm": 0.60546875, + "learning_rate": 0.0001789419658777578, + "loss": 0.6613, + "step": 4448 + }, + { + "epoch": 0.21080312722103767, + "grad_norm": 0.71875, + "learning_rate": 0.00017893282312570635, + "loss": 1.3908, + "step": 4449 + }, + { + "epoch": 0.21085050935797203, + "grad_norm": 0.78125, + "learning_rate": 0.000178923678623009, + "loss": 1.1284, + "step": 4450 + }, + { + "epoch": 0.21089789149490643, + "grad_norm": 0.5625, + "learning_rate": 0.00017891453236986847, + "loss": 0.7242, + "step": 4451 + }, + { + "epoch": 0.2109452736318408, + "grad_norm": 0.5859375, + "learning_rate": 0.0001789053843664877, + "loss": 1.0444, + "step": 4452 + }, + { + "epoch": 0.21099265576877518, + "grad_norm": 0.6328125, + "learning_rate": 0.00017889623461306956, + "loss": 0.8223, + "step": 4453 + }, + { + "epoch": 0.21104003790570955, + "grad_norm": 0.2734375, + "learning_rate": 0.00017888708310981696, + "loss": 0.1471, + "step": 4454 + }, + { + "epoch": 0.21108742004264391, + "grad_norm": 0.68359375, + "learning_rate": 0.0001788779298569329, + "loss": 1.1383, + "step": 4455 + }, + { + "epoch": 0.2111348021795783, + "grad_norm": 0.62109375, + "learning_rate": 0.00017886877485462038, + "loss": 0.0846, + "step": 4456 + }, + { + "epoch": 0.21118218431651267, + "grad_norm": 0.86328125, + "learning_rate": 0.00017885961810308238, + "loss": 0.1903, + "step": 4457 + }, + { + "epoch": 0.21122956645344704, + "grad_norm": 0.6796875, + "learning_rate": 0.0001788504596025221, + "loss": 0.0582, + "step": 4458 + }, + { + "epoch": 0.21127694859038143, + "grad_norm": 0.57421875, + "learning_rate": 0.00017884129935314262, + "loss": 1.2312, + "step": 4459 + }, + { + "epoch": 0.2113243307273158, + "grad_norm": 0.62890625, + "learning_rate": 0.00017883213735514706, + "loss": 0.8433, + "step": 4460 + }, + { + "epoch": 0.2113717128642502, + "grad_norm": 0.71484375, + "learning_rate": 0.0001788229736087387, + "loss": 0.9221, + "step": 4461 + }, + { + "epoch": 0.21141909500118455, + "grad_norm": 0.486328125, + "learning_rate": 0.00017881380811412073, + "loss": 0.8076, + "step": 4462 + }, + { + "epoch": 0.21146647713811892, + "grad_norm": 0.5390625, + "learning_rate": 0.00017880464087149643, + "loss": 1.1539, + "step": 4463 + }, + { + "epoch": 0.2115138592750533, + "grad_norm": 0.5859375, + "learning_rate": 0.00017879547188106914, + "loss": 0.7532, + "step": 4464 + }, + { + "epoch": 0.21156124141198768, + "grad_norm": 0.58984375, + "learning_rate": 0.00017878630114304223, + "loss": 0.9691, + "step": 4465 + }, + { + "epoch": 0.21160862354892207, + "grad_norm": 0.4453125, + "learning_rate": 0.00017877712865761906, + "loss": 0.5623, + "step": 4466 + }, + { + "epoch": 0.21165600568585644, + "grad_norm": 0.56640625, + "learning_rate": 0.0001787679544250031, + "loss": 1.3412, + "step": 4467 + }, + { + "epoch": 0.2117033878227908, + "grad_norm": 0.0810546875, + "learning_rate": 0.0001787587784453978, + "loss": 0.0055, + "step": 4468 + }, + { + "epoch": 0.2117507699597252, + "grad_norm": 0.69921875, + "learning_rate": 0.00017874960071900668, + "loss": 1.0678, + "step": 4469 + }, + { + "epoch": 0.21179815209665956, + "grad_norm": 0.65234375, + "learning_rate": 0.0001787404212460333, + "loss": 1.2227, + "step": 4470 + }, + { + "epoch": 0.21184553423359392, + "grad_norm": 0.21484375, + "learning_rate": 0.00017873124002668126, + "loss": 0.1423, + "step": 4471 + }, + { + "epoch": 0.21189291637052832, + "grad_norm": 0.57421875, + "learning_rate": 0.00017872205706115417, + "loss": 0.7837, + "step": 4472 + }, + { + "epoch": 0.21194029850746268, + "grad_norm": 0.46875, + "learning_rate": 0.0001787128723496557, + "loss": 0.4995, + "step": 4473 + }, + { + "epoch": 0.21198768064439708, + "grad_norm": 0.6484375, + "learning_rate": 0.00017870368589238957, + "loss": 1.6933, + "step": 4474 + }, + { + "epoch": 0.21203506278133144, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017869449768955953, + "loss": 0.0053, + "step": 4475 + }, + { + "epoch": 0.2120824449182658, + "grad_norm": 0.58203125, + "learning_rate": 0.00017868530774136932, + "loss": 1.1833, + "step": 4476 + }, + { + "epoch": 0.2121298270552002, + "grad_norm": 0.75390625, + "learning_rate": 0.00017867611604802282, + "loss": 0.7067, + "step": 4477 + }, + { + "epoch": 0.21217720919213456, + "grad_norm": 1.5703125, + "learning_rate": 0.00017866692260972388, + "loss": 0.8348, + "step": 4478 + }, + { + "epoch": 0.21222459132906893, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00017865772742667636, + "loss": 0.0019, + "step": 4479 + }, + { + "epoch": 0.21227197346600332, + "grad_norm": 0.62890625, + "learning_rate": 0.00017864853049908422, + "loss": 0.9081, + "step": 4480 + }, + { + "epoch": 0.2123193556029377, + "grad_norm": 0.5546875, + "learning_rate": 0.00017863933182715147, + "loss": 0.5684, + "step": 4481 + }, + { + "epoch": 0.21236673773987208, + "grad_norm": 0.69921875, + "learning_rate": 0.0001786301314110821, + "loss": 0.9752, + "step": 4482 + }, + { + "epoch": 0.21241411987680645, + "grad_norm": 0.5234375, + "learning_rate": 0.00017862092925108016, + "loss": 0.0964, + "step": 4483 + }, + { + "epoch": 0.2124615020137408, + "grad_norm": 0.1884765625, + "learning_rate": 0.00017861172534734977, + "loss": 0.0359, + "step": 4484 + }, + { + "epoch": 0.2125088841506752, + "grad_norm": 0.671875, + "learning_rate": 0.00017860251970009503, + "loss": 1.3729, + "step": 4485 + }, + { + "epoch": 0.21255626628760957, + "grad_norm": 0.640625, + "learning_rate": 0.00017859331230952013, + "loss": 0.6791, + "step": 4486 + }, + { + "epoch": 0.21260364842454393, + "grad_norm": 0.6875, + "learning_rate": 0.00017858410317582929, + "loss": 0.2489, + "step": 4487 + }, + { + "epoch": 0.21265103056147833, + "grad_norm": 0.52734375, + "learning_rate": 0.0001785748922992267, + "loss": 0.7125, + "step": 4488 + }, + { + "epoch": 0.2126984126984127, + "grad_norm": 0.5234375, + "learning_rate": 0.00017856567967991675, + "loss": 0.8741, + "step": 4489 + }, + { + "epoch": 0.21274579483534709, + "grad_norm": 0.640625, + "learning_rate": 0.00017855646531810367, + "loss": 0.2095, + "step": 4490 + }, + { + "epoch": 0.21279317697228145, + "grad_norm": 0.890625, + "learning_rate": 0.0001785472492139919, + "loss": 0.9846, + "step": 4491 + }, + { + "epoch": 0.21284055910921582, + "grad_norm": 0.6640625, + "learning_rate": 0.0001785380313677858, + "loss": 0.8484, + "step": 4492 + }, + { + "epoch": 0.2128879412461502, + "grad_norm": 0.5234375, + "learning_rate": 0.00017852881177968978, + "loss": 0.8543, + "step": 4493 + }, + { + "epoch": 0.21293532338308457, + "grad_norm": 0.62890625, + "learning_rate": 0.0001785195904499084, + "loss": 1.2075, + "step": 4494 + }, + { + "epoch": 0.21298270552001897, + "grad_norm": 0.6171875, + "learning_rate": 0.00017851036737864614, + "loss": 0.9833, + "step": 4495 + }, + { + "epoch": 0.21303008765695333, + "grad_norm": 0.58203125, + "learning_rate": 0.00017850114256610754, + "loss": 1.2396, + "step": 4496 + }, + { + "epoch": 0.2130774697938877, + "grad_norm": 0.1953125, + "learning_rate": 0.0001784919160124972, + "loss": 0.0141, + "step": 4497 + }, + { + "epoch": 0.2131248519308221, + "grad_norm": 0.46875, + "learning_rate": 0.00017848268771801978, + "loss": 0.5485, + "step": 4498 + }, + { + "epoch": 0.21317223406775646, + "grad_norm": 0.625, + "learning_rate": 0.00017847345768287994, + "loss": 1.1611, + "step": 4499 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 0.671875, + "learning_rate": 0.00017846422590728241, + "loss": 1.4193, + "step": 4500 + }, + { + "epoch": 0.2132669983416252, + "grad_norm": 0.6640625, + "learning_rate": 0.0001784549923914319, + "loss": 1.0991, + "step": 4501 + }, + { + "epoch": 0.21331438047855958, + "grad_norm": 0.62109375, + "learning_rate": 0.00017844575713553325, + "loss": 1.0719, + "step": 4502 + }, + { + "epoch": 0.21336176261549397, + "grad_norm": 0.8984375, + "learning_rate": 0.00017843652013979123, + "loss": 1.1205, + "step": 4503 + }, + { + "epoch": 0.21340914475242834, + "grad_norm": 0.5234375, + "learning_rate": 0.00017842728140441078, + "loss": 0.7693, + "step": 4504 + }, + { + "epoch": 0.2134565268893627, + "grad_norm": 0.671875, + "learning_rate": 0.00017841804092959675, + "loss": 1.0125, + "step": 4505 + }, + { + "epoch": 0.2135039090262971, + "grad_norm": 0.5546875, + "learning_rate": 0.00017840879871555408, + "loss": 1.2968, + "step": 4506 + }, + { + "epoch": 0.21355129116323146, + "grad_norm": 0.62109375, + "learning_rate": 0.00017839955476248783, + "loss": 1.0322, + "step": 4507 + }, + { + "epoch": 0.21359867330016583, + "grad_norm": 0.55078125, + "learning_rate": 0.0001783903090706029, + "loss": 0.039, + "step": 4508 + }, + { + "epoch": 0.21364605543710022, + "grad_norm": 0.65234375, + "learning_rate": 0.00017838106164010445, + "loss": 1.449, + "step": 4509 + }, + { + "epoch": 0.21369343757403458, + "grad_norm": 0.359375, + "learning_rate": 0.00017837181247119753, + "loss": 0.2595, + "step": 4510 + }, + { + "epoch": 0.21374081971096898, + "grad_norm": 0.52734375, + "learning_rate": 0.00017836256156408728, + "loss": 0.5309, + "step": 4511 + }, + { + "epoch": 0.21378820184790334, + "grad_norm": 0.60546875, + "learning_rate": 0.0001783533089189789, + "loss": 1.3286, + "step": 4512 + }, + { + "epoch": 0.2138355839848377, + "grad_norm": 0.609375, + "learning_rate": 0.0001783440545360776, + "loss": 0.9724, + "step": 4513 + }, + { + "epoch": 0.2138829661217721, + "grad_norm": 0.5234375, + "learning_rate": 0.00017833479841558857, + "loss": 0.7573, + "step": 4514 + }, + { + "epoch": 0.21393034825870647, + "grad_norm": 0.80078125, + "learning_rate": 0.00017832554055771714, + "loss": 0.1716, + "step": 4515 + }, + { + "epoch": 0.21397773039564083, + "grad_norm": 0.5390625, + "learning_rate": 0.0001783162809626687, + "loss": 0.066, + "step": 4516 + }, + { + "epoch": 0.21402511253257522, + "grad_norm": 0.59765625, + "learning_rate": 0.00017830701963064856, + "loss": 0.9212, + "step": 4517 + }, + { + "epoch": 0.2140724946695096, + "grad_norm": 0.60546875, + "learning_rate": 0.00017829775656186213, + "loss": 1.08, + "step": 4518 + }, + { + "epoch": 0.21411987680644398, + "grad_norm": 0.67578125, + "learning_rate": 0.00017828849175651482, + "loss": 1.1641, + "step": 4519 + }, + { + "epoch": 0.21416725894337835, + "grad_norm": 0.78515625, + "learning_rate": 0.0001782792252148122, + "loss": 0.5539, + "step": 4520 + }, + { + "epoch": 0.2142146410803127, + "grad_norm": 0.4765625, + "learning_rate": 0.00017826995693695974, + "loss": 0.356, + "step": 4521 + }, + { + "epoch": 0.2142620232172471, + "grad_norm": 0.8671875, + "learning_rate": 0.00017826068692316298, + "loss": 1.1708, + "step": 4522 + }, + { + "epoch": 0.21430940535418147, + "grad_norm": 0.40625, + "learning_rate": 0.00017825141517362756, + "loss": 0.0662, + "step": 4523 + }, + { + "epoch": 0.21435678749111584, + "grad_norm": 0.46484375, + "learning_rate": 0.00017824214168855908, + "loss": 0.6175, + "step": 4524 + }, + { + "epoch": 0.21440416962805023, + "grad_norm": 0.91015625, + "learning_rate": 0.00017823286646816326, + "loss": 0.9617, + "step": 4525 + }, + { + "epoch": 0.2144515517649846, + "grad_norm": 0.78125, + "learning_rate": 0.0001782235895126458, + "loss": 1.2364, + "step": 4526 + }, + { + "epoch": 0.214498933901919, + "grad_norm": 0.62890625, + "learning_rate": 0.0001782143108222124, + "loss": 1.1671, + "step": 4527 + }, + { + "epoch": 0.21454631603885335, + "grad_norm": 0.69921875, + "learning_rate": 0.00017820503039706894, + "loss": 0.1485, + "step": 4528 + }, + { + "epoch": 0.21459369817578772, + "grad_norm": 0.5546875, + "learning_rate": 0.0001781957482374212, + "loss": 1.2461, + "step": 4529 + }, + { + "epoch": 0.2146410803127221, + "grad_norm": 0.80078125, + "learning_rate": 0.00017818646434347504, + "loss": 0.8072, + "step": 4530 + }, + { + "epoch": 0.21468846244965648, + "grad_norm": 0.75390625, + "learning_rate": 0.0001781771787154364, + "loss": 1.2577, + "step": 4531 + }, + { + "epoch": 0.21473584458659087, + "grad_norm": 0.73828125, + "learning_rate": 0.00017816789135351117, + "loss": 1.0809, + "step": 4532 + }, + { + "epoch": 0.21478322672352523, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001781586022579054, + "loss": 0.0052, + "step": 4533 + }, + { + "epoch": 0.2148306088604596, + "grad_norm": 0.65625, + "learning_rate": 0.00017814931142882506, + "loss": 0.7403, + "step": 4534 + }, + { + "epoch": 0.214877990997394, + "grad_norm": 0.76953125, + "learning_rate": 0.00017814001886647628, + "loss": 0.9979, + "step": 4535 + }, + { + "epoch": 0.21492537313432836, + "grad_norm": 0.302734375, + "learning_rate": 0.00017813072457106504, + "loss": 0.1139, + "step": 4536 + }, + { + "epoch": 0.21497275527126272, + "grad_norm": 0.69140625, + "learning_rate": 0.00017812142854279763, + "loss": 1.0816, + "step": 4537 + }, + { + "epoch": 0.21502013740819712, + "grad_norm": 0.6015625, + "learning_rate": 0.00017811213078188008, + "loss": 0.0649, + "step": 4538 + }, + { + "epoch": 0.21506751954513148, + "grad_norm": 0.625, + "learning_rate": 0.00017810283128851873, + "loss": 0.0291, + "step": 4539 + }, + { + "epoch": 0.21511490168206587, + "grad_norm": 0.08984375, + "learning_rate": 0.00017809353006291974, + "loss": 0.0044, + "step": 4540 + }, + { + "epoch": 0.21516228381900024, + "grad_norm": 0.70703125, + "learning_rate": 0.00017808422710528942, + "loss": 1.1405, + "step": 4541 + }, + { + "epoch": 0.2152096659559346, + "grad_norm": 0.57421875, + "learning_rate": 0.00017807492241583413, + "loss": 0.5905, + "step": 4542 + }, + { + "epoch": 0.215257048092869, + "grad_norm": 0.64453125, + "learning_rate": 0.00017806561599476023, + "loss": 0.9847, + "step": 4543 + }, + { + "epoch": 0.21530443022980336, + "grad_norm": 0.8125, + "learning_rate": 0.0001780563078422741, + "loss": 0.9864, + "step": 4544 + }, + { + "epoch": 0.21535181236673773, + "grad_norm": 0.275390625, + "learning_rate": 0.00017804699795858224, + "loss": 0.1668, + "step": 4545 + }, + { + "epoch": 0.21539919450367212, + "grad_norm": 0.87109375, + "learning_rate": 0.00017803768634389107, + "loss": 0.5681, + "step": 4546 + }, + { + "epoch": 0.21544657664060649, + "grad_norm": 0.66796875, + "learning_rate": 0.00017802837299840714, + "loss": 0.9761, + "step": 4547 + }, + { + "epoch": 0.21549395877754088, + "grad_norm": 0.78515625, + "learning_rate": 0.00017801905792233703, + "loss": 1.0556, + "step": 4548 + }, + { + "epoch": 0.21554134091447524, + "grad_norm": 0.6171875, + "learning_rate": 0.0001780097411158873, + "loss": 1.2234, + "step": 4549 + }, + { + "epoch": 0.2155887230514096, + "grad_norm": 0.640625, + "learning_rate": 0.0001780004225792646, + "loss": 1.0361, + "step": 4550 + }, + { + "epoch": 0.215636105188344, + "grad_norm": 0.75, + "learning_rate": 0.0001779911023126756, + "loss": 0.9324, + "step": 4551 + }, + { + "epoch": 0.21568348732527837, + "grad_norm": 0.6328125, + "learning_rate": 0.00017798178031632703, + "loss": 1.1086, + "step": 4552 + }, + { + "epoch": 0.21573086946221273, + "grad_norm": 0.373046875, + "learning_rate": 0.00017797245659042566, + "loss": 0.0465, + "step": 4553 + }, + { + "epoch": 0.21577825159914713, + "grad_norm": 0.546875, + "learning_rate": 0.00017796313113517824, + "loss": 0.846, + "step": 4554 + }, + { + "epoch": 0.2158256337360815, + "grad_norm": 0.4453125, + "learning_rate": 0.0001779538039507916, + "loss": 0.7183, + "step": 4555 + }, + { + "epoch": 0.21587301587301588, + "grad_norm": 0.53515625, + "learning_rate": 0.00017794447503747263, + "loss": 0.9674, + "step": 4556 + }, + { + "epoch": 0.21592039800995025, + "grad_norm": 0.5625, + "learning_rate": 0.00017793514439542825, + "loss": 0.6377, + "step": 4557 + }, + { + "epoch": 0.2159677801468846, + "grad_norm": 0.6171875, + "learning_rate": 0.00017792581202486535, + "loss": 0.7499, + "step": 4558 + }, + { + "epoch": 0.216015162283819, + "grad_norm": 0.59765625, + "learning_rate": 0.00017791647792599093, + "loss": 0.9097, + "step": 4559 + }, + { + "epoch": 0.21606254442075337, + "grad_norm": 1.0, + "learning_rate": 0.00017790714209901207, + "loss": 1.3563, + "step": 4560 + }, + { + "epoch": 0.21610992655768776, + "grad_norm": 0.6015625, + "learning_rate": 0.00017789780454413573, + "loss": 1.0608, + "step": 4561 + }, + { + "epoch": 0.21615730869462213, + "grad_norm": 0.63671875, + "learning_rate": 0.00017788846526156911, + "loss": 0.089, + "step": 4562 + }, + { + "epoch": 0.2162046908315565, + "grad_norm": 0.5078125, + "learning_rate": 0.00017787912425151924, + "loss": 0.5966, + "step": 4563 + }, + { + "epoch": 0.2162520729684909, + "grad_norm": 0.8125, + "learning_rate": 0.00017786978151419338, + "loss": 0.7353, + "step": 4564 + }, + { + "epoch": 0.21629945510542525, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001778604370497987, + "loss": 0.1329, + "step": 4565 + }, + { + "epoch": 0.21634683724235962, + "grad_norm": 0.61328125, + "learning_rate": 0.00017785109085854245, + "loss": 0.356, + "step": 4566 + }, + { + "epoch": 0.216394219379294, + "grad_norm": 0.2119140625, + "learning_rate": 0.00017784174294063195, + "loss": 0.0174, + "step": 4567 + }, + { + "epoch": 0.21644160151622838, + "grad_norm": 0.75, + "learning_rate": 0.00017783239329627454, + "loss": 1.3163, + "step": 4568 + }, + { + "epoch": 0.21648898365316277, + "grad_norm": 0.41015625, + "learning_rate": 0.00017782304192567747, + "loss": 0.1526, + "step": 4569 + }, + { + "epoch": 0.21653636579009714, + "grad_norm": 0.9453125, + "learning_rate": 0.00017781368882904824, + "loss": 0.0569, + "step": 4570 + }, + { + "epoch": 0.2165837479270315, + "grad_norm": 0.6640625, + "learning_rate": 0.00017780433400659432, + "loss": 1.3177, + "step": 4571 + }, + { + "epoch": 0.2166311300639659, + "grad_norm": 0.92578125, + "learning_rate": 0.0001777949774585231, + "loss": 0.0829, + "step": 4572 + }, + { + "epoch": 0.21667851220090026, + "grad_norm": 0.62109375, + "learning_rate": 0.00017778561918504218, + "loss": 0.0749, + "step": 4573 + }, + { + "epoch": 0.21672589433783462, + "grad_norm": 0.640625, + "learning_rate": 0.00017777625918635907, + "loss": 1.1553, + "step": 4574 + }, + { + "epoch": 0.21677327647476902, + "grad_norm": 0.59765625, + "learning_rate": 0.00017776689746268137, + "loss": 1.1068, + "step": 4575 + }, + { + "epoch": 0.21682065861170338, + "grad_norm": 0.53515625, + "learning_rate": 0.0001777575340142167, + "loss": 0.6795, + "step": 4576 + }, + { + "epoch": 0.21686804074863777, + "grad_norm": 0.734375, + "learning_rate": 0.00017774816884117277, + "loss": 1.1908, + "step": 4577 + }, + { + "epoch": 0.21691542288557214, + "grad_norm": 0.58203125, + "learning_rate": 0.00017773880194375726, + "loss": 1.3618, + "step": 4578 + }, + { + "epoch": 0.2169628050225065, + "grad_norm": 0.259765625, + "learning_rate": 0.00017772943332217792, + "loss": 0.0409, + "step": 4579 + }, + { + "epoch": 0.2170101871594409, + "grad_norm": 0.953125, + "learning_rate": 0.00017772006297664256, + "loss": 0.6163, + "step": 4580 + }, + { + "epoch": 0.21705756929637526, + "grad_norm": 0.6015625, + "learning_rate": 0.00017771069090735897, + "loss": 1.048, + "step": 4581 + }, + { + "epoch": 0.21710495143330963, + "grad_norm": 0.58984375, + "learning_rate": 0.00017770131711453507, + "loss": 1.1447, + "step": 4582 + }, + { + "epoch": 0.21715233357024402, + "grad_norm": 0.58984375, + "learning_rate": 0.00017769194159837867, + "loss": 1.2723, + "step": 4583 + }, + { + "epoch": 0.2171997157071784, + "grad_norm": 0.67578125, + "learning_rate": 0.00017768256435909778, + "loss": 1.3821, + "step": 4584 + }, + { + "epoch": 0.21724709784411278, + "grad_norm": 0.1865234375, + "learning_rate": 0.00017767318539690035, + "loss": 0.0339, + "step": 4585 + }, + { + "epoch": 0.21729447998104715, + "grad_norm": 0.578125, + "learning_rate": 0.0001776638047119944, + "loss": 0.6054, + "step": 4586 + }, + { + "epoch": 0.2173418621179815, + "grad_norm": 0.5703125, + "learning_rate": 0.000177654422304588, + "loss": 1.1137, + "step": 4587 + }, + { + "epoch": 0.2173892442549159, + "grad_norm": 0.6328125, + "learning_rate": 0.00017764503817488923, + "loss": 1.4631, + "step": 4588 + }, + { + "epoch": 0.21743662639185027, + "grad_norm": 1.1015625, + "learning_rate": 0.0001776356523231062, + "loss": 0.9437, + "step": 4589 + }, + { + "epoch": 0.21748400852878466, + "grad_norm": 0.4765625, + "learning_rate": 0.0001776262647494471, + "loss": 0.6411, + "step": 4590 + }, + { + "epoch": 0.21753139066571903, + "grad_norm": 0.53125, + "learning_rate": 0.00017761687545412013, + "loss": 0.7975, + "step": 4591 + }, + { + "epoch": 0.2175787728026534, + "grad_norm": 0.234375, + "learning_rate": 0.00017760748443733352, + "loss": 0.1838, + "step": 4592 + }, + { + "epoch": 0.21762615493958778, + "grad_norm": 0.578125, + "learning_rate": 0.0001775980916992956, + "loss": 1.1014, + "step": 4593 + }, + { + "epoch": 0.21767353707652215, + "grad_norm": 0.5234375, + "learning_rate": 0.00017758869724021463, + "loss": 0.7344, + "step": 4594 + }, + { + "epoch": 0.21772091921345652, + "grad_norm": 0.62109375, + "learning_rate": 0.00017757930106029903, + "loss": 0.9556, + "step": 4595 + }, + { + "epoch": 0.2177683013503909, + "grad_norm": 0.578125, + "learning_rate": 0.00017756990315975714, + "loss": 1.1179, + "step": 4596 + }, + { + "epoch": 0.21781568348732527, + "grad_norm": 0.474609375, + "learning_rate": 0.00017756050353879746, + "loss": 0.6082, + "step": 4597 + }, + { + "epoch": 0.21786306562425967, + "grad_norm": 0.48046875, + "learning_rate": 0.0001775511021976284, + "loss": 0.6329, + "step": 4598 + }, + { + "epoch": 0.21791044776119403, + "grad_norm": 0.75, + "learning_rate": 0.0001775416991364585, + "loss": 1.2488, + "step": 4599 + }, + { + "epoch": 0.2179578298981284, + "grad_norm": 1.859375, + "learning_rate": 0.00017753229435549627, + "loss": 1.1134, + "step": 4600 + }, + { + "epoch": 0.2180052120350628, + "grad_norm": 0.55859375, + "learning_rate": 0.00017752288785495037, + "loss": 0.9355, + "step": 4601 + }, + { + "epoch": 0.21805259417199715, + "grad_norm": 1.2734375, + "learning_rate": 0.0001775134796350294, + "loss": 0.064, + "step": 4602 + }, + { + "epoch": 0.21809997630893152, + "grad_norm": 1.234375, + "learning_rate": 0.000177504069695942, + "loss": 0.3083, + "step": 4603 + }, + { + "epoch": 0.2181473584458659, + "grad_norm": 0.25390625, + "learning_rate": 0.0001774946580378969, + "loss": 0.1614, + "step": 4604 + }, + { + "epoch": 0.21819474058280028, + "grad_norm": 0.46484375, + "learning_rate": 0.0001774852446611028, + "loss": 0.2464, + "step": 4605 + }, + { + "epoch": 0.21824212271973467, + "grad_norm": 0.6171875, + "learning_rate": 0.00017747582956576853, + "loss": 0.5314, + "step": 4606 + }, + { + "epoch": 0.21828950485666904, + "grad_norm": 0.248046875, + "learning_rate": 0.0001774664127521029, + "loss": 0.0323, + "step": 4607 + }, + { + "epoch": 0.2183368869936034, + "grad_norm": 0.53515625, + "learning_rate": 0.0001774569942203147, + "loss": 0.5314, + "step": 4608 + }, + { + "epoch": 0.2183842691305378, + "grad_norm": 0.57421875, + "learning_rate": 0.0001774475739706129, + "loss": 0.8986, + "step": 4609 + }, + { + "epoch": 0.21843165126747216, + "grad_norm": 0.71875, + "learning_rate": 0.0001774381520032064, + "loss": 1.1637, + "step": 4610 + }, + { + "epoch": 0.21847903340440653, + "grad_norm": 0.75, + "learning_rate": 0.00017742872831830422, + "loss": 0.8177, + "step": 4611 + }, + { + "epoch": 0.21852641554134092, + "grad_norm": 0.62890625, + "learning_rate": 0.00017741930291611526, + "loss": 1.1722, + "step": 4612 + }, + { + "epoch": 0.21857379767827528, + "grad_norm": 0.55859375, + "learning_rate": 0.00017740987579684863, + "loss": 0.8712, + "step": 4613 + }, + { + "epoch": 0.21862117981520968, + "grad_norm": 0.248046875, + "learning_rate": 0.00017740044696071342, + "loss": 0.062, + "step": 4614 + }, + { + "epoch": 0.21866856195214404, + "grad_norm": 0.671875, + "learning_rate": 0.00017739101640791875, + "loss": 1.4296, + "step": 4615 + }, + { + "epoch": 0.2187159440890784, + "grad_norm": 0.07568359375, + "learning_rate": 0.00017738158413867377, + "loss": 0.0048, + "step": 4616 + }, + { + "epoch": 0.2187633262260128, + "grad_norm": 0.53125, + "learning_rate": 0.00017737215015318765, + "loss": 0.1394, + "step": 4617 + }, + { + "epoch": 0.21881070836294716, + "grad_norm": 0.58203125, + "learning_rate": 0.00017736271445166965, + "loss": 1.2406, + "step": 4618 + }, + { + "epoch": 0.21885809049988156, + "grad_norm": 0.74609375, + "learning_rate": 0.00017735327703432906, + "loss": 1.0548, + "step": 4619 + }, + { + "epoch": 0.21890547263681592, + "grad_norm": 0.6640625, + "learning_rate": 0.0001773438379013752, + "loss": 1.4781, + "step": 4620 + }, + { + "epoch": 0.2189528547737503, + "grad_norm": 0.7265625, + "learning_rate": 0.00017733439705301738, + "loss": 1.6623, + "step": 4621 + }, + { + "epoch": 0.21900023691068468, + "grad_norm": 0.62109375, + "learning_rate": 0.000177324954489465, + "loss": 1.0556, + "step": 4622 + }, + { + "epoch": 0.21904761904761905, + "grad_norm": 0.26953125, + "learning_rate": 0.00017731551021092748, + "loss": 0.0364, + "step": 4623 + }, + { + "epoch": 0.2190950011845534, + "grad_norm": 0.380859375, + "learning_rate": 0.00017730606421761434, + "loss": 0.5981, + "step": 4624 + }, + { + "epoch": 0.2191423833214878, + "grad_norm": 0.7734375, + "learning_rate": 0.00017729661650973502, + "loss": 1.2069, + "step": 4625 + }, + { + "epoch": 0.21918976545842217, + "grad_norm": 0.5859375, + "learning_rate": 0.00017728716708749907, + "loss": 0.628, + "step": 4626 + }, + { + "epoch": 0.21923714759535656, + "grad_norm": 0.84765625, + "learning_rate": 0.0001772777159511161, + "loss": 0.9068, + "step": 4627 + }, + { + "epoch": 0.21928452973229093, + "grad_norm": 0.57421875, + "learning_rate": 0.00017726826310079566, + "loss": 0.7564, + "step": 4628 + }, + { + "epoch": 0.2193319118692253, + "grad_norm": 0.79296875, + "learning_rate": 0.00017725880853674744, + "loss": 0.256, + "step": 4629 + }, + { + "epoch": 0.2193792940061597, + "grad_norm": 0.59375, + "learning_rate": 0.0001772493522591812, + "loss": 0.9717, + "step": 4630 + }, + { + "epoch": 0.21942667614309405, + "grad_norm": 0.65625, + "learning_rate": 0.00017723989426830656, + "loss": 0.1078, + "step": 4631 + }, + { + "epoch": 0.21947405828002842, + "grad_norm": 0.7734375, + "learning_rate": 0.00017723043456433334, + "loss": 1.1237, + "step": 4632 + }, + { + "epoch": 0.2195214404169628, + "grad_norm": 0.5703125, + "learning_rate": 0.00017722097314747137, + "loss": 1.2465, + "step": 4633 + }, + { + "epoch": 0.21956882255389717, + "grad_norm": 0.6875, + "learning_rate": 0.00017721151001793046, + "loss": 1.145, + "step": 4634 + }, + { + "epoch": 0.21961620469083157, + "grad_norm": 0.546875, + "learning_rate": 0.0001772020451759205, + "loss": 0.5836, + "step": 4635 + }, + { + "epoch": 0.21966358682776593, + "grad_norm": 0.48046875, + "learning_rate": 0.00017719257862165139, + "loss": 0.0576, + "step": 4636 + }, + { + "epoch": 0.2197109689647003, + "grad_norm": 0.6640625, + "learning_rate": 0.00017718311035533314, + "loss": 1.0329, + "step": 4637 + }, + { + "epoch": 0.2197583511016347, + "grad_norm": 0.640625, + "learning_rate": 0.00017717364037717566, + "loss": 1.1943, + "step": 4638 + }, + { + "epoch": 0.21980573323856906, + "grad_norm": 0.77734375, + "learning_rate": 0.0001771641686873891, + "loss": 0.9518, + "step": 4639 + }, + { + "epoch": 0.21985311537550342, + "grad_norm": 0.70703125, + "learning_rate": 0.00017715469528618342, + "loss": 0.5623, + "step": 4640 + }, + { + "epoch": 0.21990049751243781, + "grad_norm": 0.31640625, + "learning_rate": 0.0001771452201737688, + "loss": 0.1291, + "step": 4641 + }, + { + "epoch": 0.21994787964937218, + "grad_norm": 0.8984375, + "learning_rate": 0.00017713574335035542, + "loss": 0.7221, + "step": 4642 + }, + { + "epoch": 0.21999526178630657, + "grad_norm": 0.57421875, + "learning_rate": 0.00017712626481615335, + "loss": 0.8539, + "step": 4643 + }, + { + "epoch": 0.22004264392324094, + "grad_norm": 0.67578125, + "learning_rate": 0.00017711678457137288, + "loss": 1.3783, + "step": 4644 + }, + { + "epoch": 0.2200900260601753, + "grad_norm": 0.228515625, + "learning_rate": 0.0001771073026162243, + "loss": 0.0489, + "step": 4645 + }, + { + "epoch": 0.2201374081971097, + "grad_norm": 0.55078125, + "learning_rate": 0.00017709781895091785, + "loss": 1.1835, + "step": 4646 + }, + { + "epoch": 0.22018479033404406, + "grad_norm": 0.53125, + "learning_rate": 0.0001770883335756639, + "loss": 0.906, + "step": 4647 + }, + { + "epoch": 0.22023217247097845, + "grad_norm": 0.56640625, + "learning_rate": 0.00017707884649067283, + "loss": 0.7656, + "step": 4648 + }, + { + "epoch": 0.22027955460791282, + "grad_norm": 0.224609375, + "learning_rate": 0.00017706935769615508, + "loss": 0.1213, + "step": 4649 + }, + { + "epoch": 0.22032693674484718, + "grad_norm": 0.74609375, + "learning_rate": 0.00017705986719232102, + "loss": 0.9539, + "step": 4650 + }, + { + "epoch": 0.22037431888178158, + "grad_norm": 0.6328125, + "learning_rate": 0.0001770503749793812, + "loss": 0.8432, + "step": 4651 + }, + { + "epoch": 0.22042170101871594, + "grad_norm": 0.71484375, + "learning_rate": 0.00017704088105754612, + "loss": 0.7951, + "step": 4652 + }, + { + "epoch": 0.2204690831556503, + "grad_norm": 0.734375, + "learning_rate": 0.0001770313854270264, + "loss": 0.9206, + "step": 4653 + }, + { + "epoch": 0.2205164652925847, + "grad_norm": 0.53515625, + "learning_rate": 0.00017702188808803255, + "loss": 0.8624, + "step": 4654 + }, + { + "epoch": 0.22056384742951907, + "grad_norm": 0.162109375, + "learning_rate": 0.00017701238904077528, + "loss": 0.0222, + "step": 4655 + }, + { + "epoch": 0.22061122956645346, + "grad_norm": 0.71484375, + "learning_rate": 0.00017700288828546525, + "loss": 0.0418, + "step": 4656 + }, + { + "epoch": 0.22065861170338782, + "grad_norm": 1.1171875, + "learning_rate": 0.00017699338582231316, + "loss": 0.0485, + "step": 4657 + }, + { + "epoch": 0.2207059938403222, + "grad_norm": 0.69921875, + "learning_rate": 0.00017698388165152983, + "loss": 1.0035, + "step": 4658 + }, + { + "epoch": 0.22075337597725658, + "grad_norm": 0.62109375, + "learning_rate": 0.000176974375773326, + "loss": 1.1605, + "step": 4659 + }, + { + "epoch": 0.22080075811419095, + "grad_norm": 0.609375, + "learning_rate": 0.00017696486818791248, + "loss": 1.1143, + "step": 4660 + }, + { + "epoch": 0.2208481402511253, + "grad_norm": 0.318359375, + "learning_rate": 0.00017695535889550014, + "loss": 0.046, + "step": 4661 + }, + { + "epoch": 0.2208955223880597, + "grad_norm": 0.5703125, + "learning_rate": 0.00017694584789629996, + "loss": 0.9403, + "step": 4662 + }, + { + "epoch": 0.22094290452499407, + "grad_norm": 0.5234375, + "learning_rate": 0.0001769363351905228, + "loss": 1.1337, + "step": 4663 + }, + { + "epoch": 0.22099028666192846, + "grad_norm": 0.5625, + "learning_rate": 0.00017692682077837966, + "loss": 1.2104, + "step": 4664 + }, + { + "epoch": 0.22103766879886283, + "grad_norm": 0.5078125, + "learning_rate": 0.00017691730466008163, + "loss": 0.6819, + "step": 4665 + }, + { + "epoch": 0.2210850509357972, + "grad_norm": 0.640625, + "learning_rate": 0.00017690778683583967, + "loss": 0.8609, + "step": 4666 + }, + { + "epoch": 0.2211324330727316, + "grad_norm": 0.67578125, + "learning_rate": 0.00017689826730586493, + "loss": 1.376, + "step": 4667 + }, + { + "epoch": 0.22117981520966595, + "grad_norm": 0.71875, + "learning_rate": 0.00017688874607036853, + "loss": 1.4283, + "step": 4668 + }, + { + "epoch": 0.22122719734660032, + "grad_norm": 0.98828125, + "learning_rate": 0.00017687922312956163, + "loss": 0.2257, + "step": 4669 + }, + { + "epoch": 0.2212745794835347, + "grad_norm": 0.69140625, + "learning_rate": 0.00017686969848365545, + "loss": 0.0948, + "step": 4670 + }, + { + "epoch": 0.22132196162046908, + "grad_norm": 0.51171875, + "learning_rate": 0.00017686017213286123, + "loss": 1.052, + "step": 4671 + }, + { + "epoch": 0.22136934375740347, + "grad_norm": 0.6875, + "learning_rate": 0.00017685064407739026, + "loss": 1.2214, + "step": 4672 + }, + { + "epoch": 0.22141672589433783, + "grad_norm": 0.6640625, + "learning_rate": 0.0001768411143174539, + "loss": 1.276, + "step": 4673 + }, + { + "epoch": 0.2214641080312722, + "grad_norm": 0.279296875, + "learning_rate": 0.00017683158285326344, + "loss": 0.1367, + "step": 4674 + }, + { + "epoch": 0.2215114901682066, + "grad_norm": 0.80078125, + "learning_rate": 0.00017682204968503032, + "loss": 1.3712, + "step": 4675 + }, + { + "epoch": 0.22155887230514096, + "grad_norm": 0.703125, + "learning_rate": 0.00017681251481296595, + "loss": 1.4865, + "step": 4676 + }, + { + "epoch": 0.22160625444207535, + "grad_norm": 0.8984375, + "learning_rate": 0.00017680297823728183, + "loss": 0.5422, + "step": 4677 + }, + { + "epoch": 0.22165363657900972, + "grad_norm": 0.6640625, + "learning_rate": 0.00017679343995818947, + "loss": 0.7689, + "step": 4678 + }, + { + "epoch": 0.22170101871594408, + "grad_norm": 0.2578125, + "learning_rate": 0.00017678389997590038, + "loss": 0.1221, + "step": 4679 + }, + { + "epoch": 0.22174840085287847, + "grad_norm": 0.59765625, + "learning_rate": 0.00017677435829062618, + "loss": 0.6552, + "step": 4680 + }, + { + "epoch": 0.22179578298981284, + "grad_norm": 0.26953125, + "learning_rate": 0.0001767648149025785, + "loss": 0.1533, + "step": 4681 + }, + { + "epoch": 0.2218431651267472, + "grad_norm": 0.64453125, + "learning_rate": 0.000176755269811969, + "loss": 1.3087, + "step": 4682 + }, + { + "epoch": 0.2218905472636816, + "grad_norm": 0.341796875, + "learning_rate": 0.00017674572301900934, + "loss": 0.4479, + "step": 4683 + }, + { + "epoch": 0.22193792940061596, + "grad_norm": 0.349609375, + "learning_rate": 0.00017673617452391134, + "loss": 0.0224, + "step": 4684 + }, + { + "epoch": 0.22198531153755036, + "grad_norm": 0.57421875, + "learning_rate": 0.0001767266243268867, + "loss": 0.4738, + "step": 4685 + }, + { + "epoch": 0.22203269367448472, + "grad_norm": 0.90234375, + "learning_rate": 0.00017671707242814723, + "loss": 0.8659, + "step": 4686 + }, + { + "epoch": 0.2220800758114191, + "grad_norm": 0.6640625, + "learning_rate": 0.00017670751882790486, + "loss": 1.4518, + "step": 4687 + }, + { + "epoch": 0.22212745794835348, + "grad_norm": 0.70703125, + "learning_rate": 0.00017669796352637137, + "loss": 1.1001, + "step": 4688 + }, + { + "epoch": 0.22217484008528784, + "grad_norm": 0.69140625, + "learning_rate": 0.0001766884065237588, + "loss": 0.6323, + "step": 4689 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.71875, + "learning_rate": 0.00017667884782027903, + "loss": 1.1724, + "step": 4690 + }, + { + "epoch": 0.2222696043591566, + "grad_norm": 0.49609375, + "learning_rate": 0.0001766692874161441, + "loss": 1.0944, + "step": 4691 + }, + { + "epoch": 0.22231698649609097, + "grad_norm": 0.78515625, + "learning_rate": 0.00017665972531156603, + "loss": 0.0107, + "step": 4692 + }, + { + "epoch": 0.22236436863302536, + "grad_norm": 0.71875, + "learning_rate": 0.0001766501615067569, + "loss": 0.9774, + "step": 4693 + }, + { + "epoch": 0.22241175076995973, + "grad_norm": 0.61328125, + "learning_rate": 0.00017664059600192884, + "loss": 1.0095, + "step": 4694 + }, + { + "epoch": 0.2224591329068941, + "grad_norm": 0.55859375, + "learning_rate": 0.00017663102879729401, + "loss": 1.158, + "step": 4695 + }, + { + "epoch": 0.22250651504382848, + "grad_norm": 0.65234375, + "learning_rate": 0.00017662145989306456, + "loss": 0.918, + "step": 4696 + }, + { + "epoch": 0.22255389718076285, + "grad_norm": 0.3984375, + "learning_rate": 0.00017661188928945275, + "loss": 0.0291, + "step": 4697 + }, + { + "epoch": 0.22260127931769721, + "grad_norm": 0.498046875, + "learning_rate": 0.00017660231698667084, + "loss": 0.0249, + "step": 4698 + }, + { + "epoch": 0.2226486614546316, + "grad_norm": 0.11328125, + "learning_rate": 0.00017659274298493114, + "loss": 0.0116, + "step": 4699 + }, + { + "epoch": 0.22269604359156597, + "grad_norm": 0.58203125, + "learning_rate": 0.00017658316728444597, + "loss": 0.6287, + "step": 4700 + }, + { + "epoch": 0.22274342572850037, + "grad_norm": 0.63671875, + "learning_rate": 0.00017657358988542774, + "loss": 1.0842, + "step": 4701 + }, + { + "epoch": 0.22279080786543473, + "grad_norm": 0.72265625, + "learning_rate": 0.00017656401078808883, + "loss": 0.6734, + "step": 4702 + }, + { + "epoch": 0.2228381900023691, + "grad_norm": 0.65234375, + "learning_rate": 0.00017655442999264174, + "loss": 0.7644, + "step": 4703 + }, + { + "epoch": 0.2228855721393035, + "grad_norm": 0.62109375, + "learning_rate": 0.00017654484749929893, + "loss": 0.8459, + "step": 4704 + }, + { + "epoch": 0.22293295427623785, + "grad_norm": 0.60546875, + "learning_rate": 0.0001765352633082729, + "loss": 0.315, + "step": 4705 + }, + { + "epoch": 0.22298033641317225, + "grad_norm": 0.5546875, + "learning_rate": 0.00017652567741977627, + "loss": 0.5902, + "step": 4706 + }, + { + "epoch": 0.2230277185501066, + "grad_norm": 0.7265625, + "learning_rate": 0.00017651608983402164, + "loss": 1.2075, + "step": 4707 + }, + { + "epoch": 0.22307510068704098, + "grad_norm": 0.5703125, + "learning_rate": 0.0001765065005512216, + "loss": 0.7806, + "step": 4708 + }, + { + "epoch": 0.22312248282397537, + "grad_norm": 1.109375, + "learning_rate": 0.00017649690957158892, + "loss": 0.3137, + "step": 4709 + }, + { + "epoch": 0.22316986496090974, + "grad_norm": 0.703125, + "learning_rate": 0.00017648731689533627, + "loss": 0.5101, + "step": 4710 + }, + { + "epoch": 0.2232172470978441, + "grad_norm": 0.6796875, + "learning_rate": 0.00017647772252267637, + "loss": 0.6607, + "step": 4711 + }, + { + "epoch": 0.2232646292347785, + "grad_norm": 0.7109375, + "learning_rate": 0.00017646812645382201, + "loss": 1.1553, + "step": 4712 + }, + { + "epoch": 0.22331201137171286, + "grad_norm": 0.69140625, + "learning_rate": 0.00017645852868898611, + "loss": 1.0347, + "step": 4713 + }, + { + "epoch": 0.22335939350864725, + "grad_norm": 0.609375, + "learning_rate": 0.00017644892922838147, + "loss": 0.8938, + "step": 4714 + }, + { + "epoch": 0.22340677564558162, + "grad_norm": 0.66796875, + "learning_rate": 0.00017643932807222102, + "loss": 1.466, + "step": 4715 + }, + { + "epoch": 0.22345415778251598, + "grad_norm": 0.58203125, + "learning_rate": 0.00017642972522071768, + "loss": 0.7273, + "step": 4716 + }, + { + "epoch": 0.22350153991945038, + "grad_norm": 0.6015625, + "learning_rate": 0.00017642012067408444, + "loss": 1.5163, + "step": 4717 + }, + { + "epoch": 0.22354892205638474, + "grad_norm": 0.7578125, + "learning_rate": 0.00017641051443253433, + "loss": 0.925, + "step": 4718 + }, + { + "epoch": 0.2235963041933191, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00017640090649628042, + "loss": 0.0005, + "step": 4719 + }, + { + "epoch": 0.2236436863302535, + "grad_norm": 0.69140625, + "learning_rate": 0.00017639129686553573, + "loss": 0.9881, + "step": 4720 + }, + { + "epoch": 0.22369106846718786, + "grad_norm": 0.46875, + "learning_rate": 0.00017638168554051352, + "loss": 0.5051, + "step": 4721 + }, + { + "epoch": 0.22373845060412226, + "grad_norm": 0.62890625, + "learning_rate": 0.0001763720725214268, + "loss": 0.9316, + "step": 4722 + }, + { + "epoch": 0.22378583274105662, + "grad_norm": 0.71484375, + "learning_rate": 0.0001763624578084889, + "loss": 1.0418, + "step": 4723 + }, + { + "epoch": 0.223833214877991, + "grad_norm": 0.52734375, + "learning_rate": 0.00017635284140191302, + "loss": 1.1204, + "step": 4724 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.58984375, + "learning_rate": 0.00017634322330191244, + "loss": 0.8188, + "step": 4725 + }, + { + "epoch": 0.22392797915185975, + "grad_norm": 0.56640625, + "learning_rate": 0.0001763336035087005, + "loss": 1.2027, + "step": 4726 + }, + { + "epoch": 0.2239753612887941, + "grad_norm": 0.71875, + "learning_rate": 0.00017632398202249055, + "loss": 1.5133, + "step": 4727 + }, + { + "epoch": 0.2240227434257285, + "grad_norm": 0.90234375, + "learning_rate": 0.00017631435884349594, + "loss": 0.3672, + "step": 4728 + }, + { + "epoch": 0.22407012556266287, + "grad_norm": 0.50390625, + "learning_rate": 0.00017630473397193015, + "loss": 0.3307, + "step": 4729 + }, + { + "epoch": 0.22411750769959726, + "grad_norm": 0.5390625, + "learning_rate": 0.00017629510740800662, + "loss": 0.0769, + "step": 4730 + }, + { + "epoch": 0.22416488983653163, + "grad_norm": 0.703125, + "learning_rate": 0.00017628547915193892, + "loss": 1.293, + "step": 4731 + }, + { + "epoch": 0.224212271973466, + "grad_norm": 0.6953125, + "learning_rate": 0.0001762758492039405, + "loss": 0.8374, + "step": 4732 + }, + { + "epoch": 0.22425965411040039, + "grad_norm": 1.1484375, + "learning_rate": 0.00017626621756422503, + "loss": 0.2328, + "step": 4733 + }, + { + "epoch": 0.22430703624733475, + "grad_norm": 0.69140625, + "learning_rate": 0.00017625658423300606, + "loss": 1.0732, + "step": 4734 + }, + { + "epoch": 0.22435441838426914, + "grad_norm": 0.57421875, + "learning_rate": 0.00017624694921049728, + "loss": 0.0418, + "step": 4735 + }, + { + "epoch": 0.2244018005212035, + "grad_norm": 1.65625, + "learning_rate": 0.0001762373124969124, + "loss": 0.1589, + "step": 4736 + }, + { + "epoch": 0.22444918265813787, + "grad_norm": 0.69140625, + "learning_rate": 0.0001762276740924651, + "loss": 1.3373, + "step": 4737 + }, + { + "epoch": 0.22449656479507227, + "grad_norm": 0.453125, + "learning_rate": 0.00017621803399736922, + "loss": 0.5409, + "step": 4738 + }, + { + "epoch": 0.22454394693200663, + "grad_norm": 0.44140625, + "learning_rate": 0.00017620839221183852, + "loss": 0.2755, + "step": 4739 + }, + { + "epoch": 0.224591329068941, + "grad_norm": 0.61328125, + "learning_rate": 0.00017619874873608685, + "loss": 0.9164, + "step": 4740 + }, + { + "epoch": 0.2246387112058754, + "grad_norm": 0.50390625, + "learning_rate": 0.00017618910357032808, + "loss": 0.3311, + "step": 4741 + }, + { + "epoch": 0.22468609334280976, + "grad_norm": 0.8828125, + "learning_rate": 0.00017617945671477618, + "loss": 1.7412, + "step": 4742 + }, + { + "epoch": 0.22473347547974415, + "grad_norm": 0.265625, + "learning_rate": 0.00017616980816964503, + "loss": 0.0396, + "step": 4743 + }, + { + "epoch": 0.22478085761667851, + "grad_norm": 0.515625, + "learning_rate": 0.0001761601579351487, + "loss": 0.7049, + "step": 4744 + }, + { + "epoch": 0.22482823975361288, + "grad_norm": 0.1748046875, + "learning_rate": 0.00017615050601150117, + "loss": 0.0126, + "step": 4745 + }, + { + "epoch": 0.22487562189054727, + "grad_norm": 0.73828125, + "learning_rate": 0.00017614085239891654, + "loss": 1.1157, + "step": 4746 + }, + { + "epoch": 0.22492300402748164, + "grad_norm": 0.671875, + "learning_rate": 0.0001761311970976089, + "loss": 0.8671, + "step": 4747 + }, + { + "epoch": 0.224970386164416, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017612154010779242, + "loss": 0.0453, + "step": 4748 + }, + { + "epoch": 0.2250177683013504, + "grad_norm": 0.77734375, + "learning_rate": 0.00017611188142968126, + "loss": 0.8946, + "step": 4749 + }, + { + "epoch": 0.22506515043828476, + "grad_norm": 0.21484375, + "learning_rate": 0.0001761022210634896, + "loss": 0.024, + "step": 4750 + }, + { + "epoch": 0.22511253257521915, + "grad_norm": 0.6484375, + "learning_rate": 0.00017609255900943177, + "loss": 1.0729, + "step": 4751 + }, + { + "epoch": 0.22515991471215352, + "grad_norm": 0.486328125, + "learning_rate": 0.000176082895267722, + "loss": 0.9089, + "step": 4752 + }, + { + "epoch": 0.22520729684908788, + "grad_norm": 0.66015625, + "learning_rate": 0.0001760732298385747, + "loss": 1.1987, + "step": 4753 + }, + { + "epoch": 0.22525467898602228, + "grad_norm": 0.5234375, + "learning_rate": 0.00017606356272220416, + "loss": 0.9727, + "step": 4754 + }, + { + "epoch": 0.22530206112295664, + "grad_norm": 0.36328125, + "learning_rate": 0.00017605389391882483, + "loss": 0.0066, + "step": 4755 + }, + { + "epoch": 0.225349443259891, + "grad_norm": 0.412109375, + "learning_rate": 0.00017604422342865113, + "loss": 0.1331, + "step": 4756 + }, + { + "epoch": 0.2253968253968254, + "grad_norm": 0.71484375, + "learning_rate": 0.00017603455125189758, + "loss": 0.9788, + "step": 4757 + }, + { + "epoch": 0.22544420753375977, + "grad_norm": 0.74609375, + "learning_rate": 0.00017602487738877862, + "loss": 1.3464, + "step": 4758 + }, + { + "epoch": 0.22549158967069416, + "grad_norm": 0.6875, + "learning_rate": 0.0001760152018395089, + "loss": 1.2365, + "step": 4759 + }, + { + "epoch": 0.22553897180762852, + "grad_norm": 0.59765625, + "learning_rate": 0.00017600552460430297, + "loss": 0.9028, + "step": 4760 + }, + { + "epoch": 0.2255863539445629, + "grad_norm": 0.60546875, + "learning_rate": 0.00017599584568337546, + "loss": 1.0353, + "step": 4761 + }, + { + "epoch": 0.22563373608149728, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598616507694104, + "loss": 1.1336, + "step": 4762 + }, + { + "epoch": 0.22568111821843165, + "grad_norm": 0.80078125, + "learning_rate": 0.0001759764827852144, + "loss": 0.012, + "step": 4763 + }, + { + "epoch": 0.22572850035536604, + "grad_norm": 0.6328125, + "learning_rate": 0.0001759667988084103, + "loss": 0.8005, + "step": 4764 + }, + { + "epoch": 0.2257758824923004, + "grad_norm": 0.62109375, + "learning_rate": 0.00017595711314674352, + "loss": 0.0862, + "step": 4765 + }, + { + "epoch": 0.22582326462923477, + "grad_norm": 0.7265625, + "learning_rate": 0.00017594742580042888, + "loss": 1.197, + "step": 4766 + }, + { + "epoch": 0.22587064676616916, + "grad_norm": 0.75, + "learning_rate": 0.00017593773676968124, + "loss": 0.0649, + "step": 4767 + }, + { + "epoch": 0.22591802890310353, + "grad_norm": 0.484375, + "learning_rate": 0.00017592804605471546, + "loss": 0.6548, + "step": 4768 + }, + { + "epoch": 0.2259654110400379, + "grad_norm": 0.33203125, + "learning_rate": 0.00017591835365574654, + "loss": 0.013, + "step": 4769 + }, + { + "epoch": 0.2260127931769723, + "grad_norm": 0.52734375, + "learning_rate": 0.00017590865957298934, + "loss": 1.1218, + "step": 4770 + }, + { + "epoch": 0.22606017531390665, + "grad_norm": 0.466796875, + "learning_rate": 0.00017589896380665896, + "loss": 0.7666, + "step": 4771 + }, + { + "epoch": 0.22610755745084105, + "grad_norm": 0.58984375, + "learning_rate": 0.0001758892663569704, + "loss": 0.8846, + "step": 4772 + }, + { + "epoch": 0.2261549395877754, + "grad_norm": 0.52734375, + "learning_rate": 0.00017587956722413877, + "loss": 0.6879, + "step": 4773 + }, + { + "epoch": 0.22620232172470978, + "grad_norm": 0.9296875, + "learning_rate": 0.00017586986640837914, + "loss": 0.0823, + "step": 4774 + }, + { + "epoch": 0.22624970386164417, + "grad_norm": 0.66796875, + "learning_rate": 0.00017586016390990668, + "loss": 1.276, + "step": 4775 + }, + { + "epoch": 0.22629708599857853, + "grad_norm": 0.41796875, + "learning_rate": 0.00017585045972893658, + "loss": 0.008, + "step": 4776 + }, + { + "epoch": 0.2263444681355129, + "grad_norm": 0.57421875, + "learning_rate": 0.0001758407538656841, + "loss": 0.9002, + "step": 4777 + }, + { + "epoch": 0.2263918502724473, + "grad_norm": 0.6484375, + "learning_rate": 0.00017583104632036447, + "loss": 0.7278, + "step": 4778 + }, + { + "epoch": 0.22643923240938166, + "grad_norm": 1.03125, + "learning_rate": 0.00017582133709319298, + "loss": 0.2357, + "step": 4779 + }, + { + "epoch": 0.22648661454631605, + "grad_norm": 0.033203125, + "learning_rate": 0.00017581162618438505, + "loss": 0.0013, + "step": 4780 + }, + { + "epoch": 0.22653399668325042, + "grad_norm": 0.2177734375, + "learning_rate": 0.00017580191359415595, + "loss": 0.0245, + "step": 4781 + }, + { + "epoch": 0.22658137882018478, + "grad_norm": 0.7109375, + "learning_rate": 0.00017579219932272117, + "loss": 1.1036, + "step": 4782 + }, + { + "epoch": 0.22662876095711917, + "grad_norm": 0.59765625, + "learning_rate": 0.00017578248337029613, + "loss": 0.8106, + "step": 4783 + }, + { + "epoch": 0.22667614309405354, + "grad_norm": 0.55859375, + "learning_rate": 0.00017577276573709635, + "loss": 0.772, + "step": 4784 + }, + { + "epoch": 0.2267235252309879, + "grad_norm": 0.75, + "learning_rate": 0.00017576304642333732, + "loss": 1.1472, + "step": 4785 + }, + { + "epoch": 0.2267709073679223, + "grad_norm": 0.3046875, + "learning_rate": 0.00017575332542923465, + "loss": 0.1988, + "step": 4786 + }, + { + "epoch": 0.22681828950485666, + "grad_norm": 0.70703125, + "learning_rate": 0.00017574360275500386, + "loss": 1.0495, + "step": 4787 + }, + { + "epoch": 0.22686567164179106, + "grad_norm": 0.447265625, + "learning_rate": 0.00017573387840086067, + "loss": 0.1959, + "step": 4788 + }, + { + "epoch": 0.22691305377872542, + "grad_norm": 0.6484375, + "learning_rate": 0.00017572415236702068, + "loss": 0.2046, + "step": 4789 + }, + { + "epoch": 0.2269604359156598, + "grad_norm": 0.7734375, + "learning_rate": 0.0001757144246536997, + "loss": 0.9627, + "step": 4790 + }, + { + "epoch": 0.22700781805259418, + "grad_norm": 0.42578125, + "learning_rate": 0.0001757046952611134, + "loss": 0.1513, + "step": 4791 + }, + { + "epoch": 0.22705520018952854, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001756949641894776, + "loss": 0.1538, + "step": 4792 + }, + { + "epoch": 0.22710258232646294, + "grad_norm": 0.5390625, + "learning_rate": 0.00017568523143900815, + "loss": 0.9176, + "step": 4793 + }, + { + "epoch": 0.2271499644633973, + "grad_norm": 0.7734375, + "learning_rate": 0.00017567549700992085, + "loss": 0.9416, + "step": 4794 + }, + { + "epoch": 0.22719734660033167, + "grad_norm": 0.79296875, + "learning_rate": 0.00017566576090243164, + "loss": 1.1508, + "step": 4795 + }, + { + "epoch": 0.22724472873726606, + "grad_norm": 0.6640625, + "learning_rate": 0.00017565602311675647, + "loss": 1.1279, + "step": 4796 + }, + { + "epoch": 0.22729211087420043, + "grad_norm": 0.51953125, + "learning_rate": 0.00017564628365311129, + "loss": 0.9059, + "step": 4797 + }, + { + "epoch": 0.2273394930111348, + "grad_norm": 0.765625, + "learning_rate": 0.00017563654251171208, + "loss": 0.0381, + "step": 4798 + }, + { + "epoch": 0.22738687514806918, + "grad_norm": 0.64453125, + "learning_rate": 0.00017562679969277496, + "loss": 0.2966, + "step": 4799 + }, + { + "epoch": 0.22743425728500355, + "grad_norm": 0.77734375, + "learning_rate": 0.00017561705519651593, + "loss": 1.2171, + "step": 4800 + }, + { + "epoch": 0.22748163942193794, + "grad_norm": 0.546875, + "learning_rate": 0.0001756073090231512, + "loss": 1.2832, + "step": 4801 + }, + { + "epoch": 0.2275290215588723, + "grad_norm": 0.462890625, + "learning_rate": 0.00017559756117289688, + "loss": 0.7064, + "step": 4802 + }, + { + "epoch": 0.22757640369580667, + "grad_norm": 0.66796875, + "learning_rate": 0.00017558781164596918, + "loss": 0.7603, + "step": 4803 + }, + { + "epoch": 0.22762378583274107, + "grad_norm": 0.78515625, + "learning_rate": 0.00017557806044258433, + "loss": 0.9623, + "step": 4804 + }, + { + "epoch": 0.22767116796967543, + "grad_norm": 0.69140625, + "learning_rate": 0.00017556830756295858, + "loss": 1.2106, + "step": 4805 + }, + { + "epoch": 0.2277185501066098, + "grad_norm": 0.57421875, + "learning_rate": 0.00017555855300730827, + "loss": 1.2195, + "step": 4806 + }, + { + "epoch": 0.2277659322435442, + "grad_norm": 0.703125, + "learning_rate": 0.00017554879677584973, + "loss": 0.9812, + "step": 4807 + }, + { + "epoch": 0.22781331438047855, + "grad_norm": 0.578125, + "learning_rate": 0.00017553903886879937, + "loss": 0.0847, + "step": 4808 + }, + { + "epoch": 0.22786069651741295, + "grad_norm": 0.66796875, + "learning_rate": 0.0001755292792863736, + "loss": 1.2341, + "step": 4809 + }, + { + "epoch": 0.2279080786543473, + "grad_norm": 0.58203125, + "learning_rate": 0.00017551951802878882, + "loss": 0.7845, + "step": 4810 + }, + { + "epoch": 0.22795546079128168, + "grad_norm": 0.2578125, + "learning_rate": 0.00017550975509626163, + "loss": 0.1396, + "step": 4811 + }, + { + "epoch": 0.22800284292821607, + "grad_norm": 0.67578125, + "learning_rate": 0.00017549999048900846, + "loss": 1.2626, + "step": 4812 + }, + { + "epoch": 0.22805022506515044, + "grad_norm": 0.6015625, + "learning_rate": 0.0001754902242072459, + "loss": 1.0554, + "step": 4813 + }, + { + "epoch": 0.2280976072020848, + "grad_norm": 0.474609375, + "learning_rate": 0.0001754804562511906, + "loss": 0.6975, + "step": 4814 + }, + { + "epoch": 0.2281449893390192, + "grad_norm": 0.79296875, + "learning_rate": 0.0001754706866210592, + "loss": 0.1331, + "step": 4815 + }, + { + "epoch": 0.22819237147595356, + "grad_norm": 0.6953125, + "learning_rate": 0.00017546091531706832, + "loss": 1.4851, + "step": 4816 + }, + { + "epoch": 0.22823975361288795, + "grad_norm": 0.5234375, + "learning_rate": 0.00017545114233943472, + "loss": 0.8983, + "step": 4817 + }, + { + "epoch": 0.22828713574982232, + "grad_norm": 0.72265625, + "learning_rate": 0.0001754413676883752, + "loss": 1.0767, + "step": 4818 + }, + { + "epoch": 0.22833451788675668, + "grad_norm": 0.75, + "learning_rate": 0.00017543159136410643, + "loss": 1.3045, + "step": 4819 + }, + { + "epoch": 0.22838190002369108, + "grad_norm": 0.73046875, + "learning_rate": 0.00017542181336684535, + "loss": 0.7192, + "step": 4820 + }, + { + "epoch": 0.22842928216062544, + "grad_norm": 0.59375, + "learning_rate": 0.00017541203369680875, + "loss": 0.5849, + "step": 4821 + }, + { + "epoch": 0.2284766642975598, + "grad_norm": 0.59765625, + "learning_rate": 0.0001754022523542136, + "loss": 0.9732, + "step": 4822 + }, + { + "epoch": 0.2285240464344942, + "grad_norm": 0.69140625, + "learning_rate": 0.00017539246933927682, + "loss": 1.1261, + "step": 4823 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.6171875, + "learning_rate": 0.00017538268465221534, + "loss": 0.9075, + "step": 4824 + }, + { + "epoch": 0.22861881070836296, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017537289829324624, + "loss": 0.134, + "step": 4825 + }, + { + "epoch": 0.22866619284529732, + "grad_norm": 0.671875, + "learning_rate": 0.00017536311026258652, + "loss": 0.6335, + "step": 4826 + }, + { + "epoch": 0.2287135749822317, + "grad_norm": 0.466796875, + "learning_rate": 0.00017535332056045332, + "loss": 0.5317, + "step": 4827 + }, + { + "epoch": 0.22876095711916608, + "grad_norm": 0.640625, + "learning_rate": 0.0001753435291870637, + "loss": 0.789, + "step": 4828 + }, + { + "epoch": 0.22880833925610045, + "grad_norm": 0.7734375, + "learning_rate": 0.00017533373614263487, + "loss": 1.2605, + "step": 4829 + }, + { + "epoch": 0.22885572139303484, + "grad_norm": 0.236328125, + "learning_rate": 0.000175323941427384, + "loss": 0.1808, + "step": 4830 + }, + { + "epoch": 0.2289031035299692, + "grad_norm": 0.11962890625, + "learning_rate": 0.00017531414504152833, + "loss": 0.01, + "step": 4831 + }, + { + "epoch": 0.22895048566690357, + "grad_norm": 0.55078125, + "learning_rate": 0.00017530434698528516, + "loss": 0.9347, + "step": 4832 + }, + { + "epoch": 0.22899786780383796, + "grad_norm": 0.6015625, + "learning_rate": 0.00017529454725887178, + "loss": 0.5792, + "step": 4833 + }, + { + "epoch": 0.22904524994077233, + "grad_norm": 0.76171875, + "learning_rate": 0.00017528474586250554, + "loss": 1.219, + "step": 4834 + }, + { + "epoch": 0.2290926320777067, + "grad_norm": 0.6328125, + "learning_rate": 0.00017527494279640383, + "loss": 0.7787, + "step": 4835 + }, + { + "epoch": 0.22914001421464109, + "grad_norm": 0.5078125, + "learning_rate": 0.00017526513806078407, + "loss": 0.8009, + "step": 4836 + }, + { + "epoch": 0.22918739635157545, + "grad_norm": 0.57421875, + "learning_rate": 0.00017525533165586374, + "loss": 1.0709, + "step": 4837 + }, + { + "epoch": 0.22923477848850984, + "grad_norm": 0.63671875, + "learning_rate": 0.00017524552358186027, + "loss": 0.8568, + "step": 4838 + }, + { + "epoch": 0.2292821606254442, + "grad_norm": 0.12890625, + "learning_rate": 0.00017523571383899127, + "loss": 0.012, + "step": 4839 + }, + { + "epoch": 0.22932954276237857, + "grad_norm": 0.76953125, + "learning_rate": 0.00017522590242747426, + "loss": 1.3154, + "step": 4840 + }, + { + "epoch": 0.22937692489931297, + "grad_norm": 0.609375, + "learning_rate": 0.00017521608934752684, + "loss": 0.8839, + "step": 4841 + }, + { + "epoch": 0.22942430703624733, + "grad_norm": 0.400390625, + "learning_rate": 0.00017520627459936673, + "loss": 0.1537, + "step": 4842 + }, + { + "epoch": 0.2294716891731817, + "grad_norm": 0.5625, + "learning_rate": 0.0001751964581832115, + "loss": 1.0254, + "step": 4843 + }, + { + "epoch": 0.2295190713101161, + "grad_norm": 0.63671875, + "learning_rate": 0.00017518664009927895, + "loss": 0.2249, + "step": 4844 + }, + { + "epoch": 0.22956645344705046, + "grad_norm": 0.5703125, + "learning_rate": 0.0001751768203477868, + "loss": 1.1852, + "step": 4845 + }, + { + "epoch": 0.22961383558398485, + "grad_norm": 0.56640625, + "learning_rate": 0.00017516699892895286, + "loss": 0.4592, + "step": 4846 + }, + { + "epoch": 0.22966121772091921, + "grad_norm": 0.53515625, + "learning_rate": 0.00017515717584299493, + "loss": 1.0775, + "step": 4847 + }, + { + "epoch": 0.22970859985785358, + "grad_norm": 0.57421875, + "learning_rate": 0.0001751473510901309, + "loss": 0.9325, + "step": 4848 + }, + { + "epoch": 0.22975598199478797, + "grad_norm": 0.30859375, + "learning_rate": 0.00017513752467057867, + "loss": 0.1666, + "step": 4849 + }, + { + "epoch": 0.22980336413172234, + "grad_norm": 0.67578125, + "learning_rate": 0.00017512769658455617, + "loss": 0.8059, + "step": 4850 + }, + { + "epoch": 0.2298507462686567, + "grad_norm": 0.28125, + "learning_rate": 0.0001751178668322814, + "loss": 0.1496, + "step": 4851 + }, + { + "epoch": 0.2298981284055911, + "grad_norm": 0.6875, + "learning_rate": 0.00017510803541397234, + "loss": 1.0282, + "step": 4852 + }, + { + "epoch": 0.22994551054252546, + "grad_norm": 0.59375, + "learning_rate": 0.00017509820232984705, + "loss": 0.0131, + "step": 4853 + }, + { + "epoch": 0.22999289267945985, + "grad_norm": 0.69921875, + "learning_rate": 0.00017508836758012365, + "loss": 1.2326, + "step": 4854 + }, + { + "epoch": 0.23004027481639422, + "grad_norm": 0.6875, + "learning_rate": 0.00017507853116502023, + "loss": 1.0969, + "step": 4855 + }, + { + "epoch": 0.23008765695332858, + "grad_norm": 0.65234375, + "learning_rate": 0.00017506869308475494, + "loss": 1.248, + "step": 4856 + }, + { + "epoch": 0.23013503909026298, + "grad_norm": 0.44921875, + "learning_rate": 0.000175058853339546, + "loss": 0.1875, + "step": 4857 + }, + { + "epoch": 0.23018242122719734, + "grad_norm": 0.47265625, + "learning_rate": 0.00017504901192961163, + "loss": 0.5449, + "step": 4858 + }, + { + "epoch": 0.23022980336413174, + "grad_norm": 0.828125, + "learning_rate": 0.00017503916885517015, + "loss": 1.0936, + "step": 4859 + }, + { + "epoch": 0.2302771855010661, + "grad_norm": 0.09619140625, + "learning_rate": 0.0001750293241164398, + "loss": 0.0034, + "step": 4860 + }, + { + "epoch": 0.23032456763800047, + "grad_norm": 0.8671875, + "learning_rate": 0.00017501947771363896, + "loss": 0.1314, + "step": 4861 + }, + { + "epoch": 0.23037194977493486, + "grad_norm": 0.6875, + "learning_rate": 0.00017500962964698603, + "loss": 1.2821, + "step": 4862 + }, + { + "epoch": 0.23041933191186922, + "grad_norm": 0.56640625, + "learning_rate": 0.0001749997799166994, + "loss": 0.0218, + "step": 4863 + }, + { + "epoch": 0.2304667140488036, + "grad_norm": 0.7890625, + "learning_rate": 0.00017498992852299757, + "loss": 0.8939, + "step": 4864 + }, + { + "epoch": 0.23051409618573798, + "grad_norm": 0.68359375, + "learning_rate": 0.00017498007546609895, + "loss": 1.309, + "step": 4865 + }, + { + "epoch": 0.23056147832267235, + "grad_norm": 0.6875, + "learning_rate": 0.00017497022074622215, + "loss": 0.9838, + "step": 4866 + }, + { + "epoch": 0.23060886045960674, + "grad_norm": 0.58203125, + "learning_rate": 0.0001749603643635857, + "loss": 1.042, + "step": 4867 + }, + { + "epoch": 0.2306562425965411, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0001749505063184082, + "loss": 0.0017, + "step": 4868 + }, + { + "epoch": 0.23070362473347547, + "grad_norm": 0.74609375, + "learning_rate": 0.00017494064661090835, + "loss": 0.8515, + "step": 4869 + }, + { + "epoch": 0.23075100687040986, + "grad_norm": 0.53515625, + "learning_rate": 0.00017493078524130474, + "loss": 1.0213, + "step": 4870 + }, + { + "epoch": 0.23079838900734423, + "grad_norm": 0.35546875, + "learning_rate": 0.00017492092220981612, + "loss": 0.1837, + "step": 4871 + }, + { + "epoch": 0.2308457711442786, + "grad_norm": 0.498046875, + "learning_rate": 0.0001749110575166613, + "loss": 0.3354, + "step": 4872 + }, + { + "epoch": 0.230893153281213, + "grad_norm": 0.010009765625, + "learning_rate": 0.00017490119116205896, + "loss": 0.0006, + "step": 4873 + }, + { + "epoch": 0.23094053541814735, + "grad_norm": 0.59375, + "learning_rate": 0.000174891323146228, + "loss": 0.6591, + "step": 4874 + }, + { + "epoch": 0.23098791755508175, + "grad_norm": 0.20703125, + "learning_rate": 0.00017488145346938728, + "loss": 0.0333, + "step": 4875 + }, + { + "epoch": 0.2310352996920161, + "grad_norm": 0.625, + "learning_rate": 0.00017487158213175564, + "loss": 1.029, + "step": 4876 + }, + { + "epoch": 0.23108268182895048, + "grad_norm": 0.546875, + "learning_rate": 0.00017486170913355212, + "loss": 0.6199, + "step": 4877 + }, + { + "epoch": 0.23113006396588487, + "grad_norm": 0.68359375, + "learning_rate": 0.0001748518344749956, + "loss": 0.8363, + "step": 4878 + }, + { + "epoch": 0.23117744610281923, + "grad_norm": 0.041748046875, + "learning_rate": 0.0001748419581563051, + "loss": 0.0018, + "step": 4879 + }, + { + "epoch": 0.2312248282397536, + "grad_norm": 0.57421875, + "learning_rate": 0.0001748320801776997, + "loss": 1.0243, + "step": 4880 + }, + { + "epoch": 0.231272210376688, + "grad_norm": 0.48828125, + "learning_rate": 0.0001748222005393985, + "loss": 1.0112, + "step": 4881 + }, + { + "epoch": 0.23131959251362236, + "grad_norm": 0.361328125, + "learning_rate": 0.00017481231924162054, + "loss": 0.035, + "step": 4882 + }, + { + "epoch": 0.23136697465055675, + "grad_norm": 0.2265625, + "learning_rate": 0.00017480243628458504, + "loss": 0.0418, + "step": 4883 + }, + { + "epoch": 0.23141435678749112, + "grad_norm": 0.546875, + "learning_rate": 0.0001747925516685112, + "loss": 0.6955, + "step": 4884 + }, + { + "epoch": 0.23146173892442548, + "grad_norm": 0.3203125, + "learning_rate": 0.0001747826653936182, + "loss": 0.1441, + "step": 4885 + }, + { + "epoch": 0.23150912106135987, + "grad_norm": 0.01904296875, + "learning_rate": 0.00017477277746012537, + "loss": 0.0011, + "step": 4886 + }, + { + "epoch": 0.23155650319829424, + "grad_norm": 0.451171875, + "learning_rate": 0.00017476288786825195, + "loss": 0.4778, + "step": 4887 + }, + { + "epoch": 0.23160388533522863, + "grad_norm": 0.58203125, + "learning_rate": 0.0001747529966182173, + "loss": 0.5974, + "step": 4888 + }, + { + "epoch": 0.231651267472163, + "grad_norm": 0.69140625, + "learning_rate": 0.00017474310371024085, + "loss": 1.2586, + "step": 4889 + }, + { + "epoch": 0.23169864960909736, + "grad_norm": 0.58984375, + "learning_rate": 0.00017473320914454193, + "loss": 0.9656, + "step": 4890 + }, + { + "epoch": 0.23174603174603176, + "grad_norm": 0.58203125, + "learning_rate": 0.0001747233129213401, + "loss": 1.1482, + "step": 4891 + }, + { + "epoch": 0.23179341388296612, + "grad_norm": 0.88671875, + "learning_rate": 0.00017471341504085472, + "loss": 0.1904, + "step": 4892 + }, + { + "epoch": 0.23184079601990049, + "grad_norm": 0.6875, + "learning_rate": 0.0001747035155033054, + "loss": 0.7755, + "step": 4893 + }, + { + "epoch": 0.23188817815683488, + "grad_norm": 0.458984375, + "learning_rate": 0.00017469361430891167, + "loss": 0.0128, + "step": 4894 + }, + { + "epoch": 0.23193556029376924, + "grad_norm": 0.5, + "learning_rate": 0.00017468371145789314, + "loss": 0.1607, + "step": 4895 + }, + { + "epoch": 0.23198294243070364, + "grad_norm": 0.70703125, + "learning_rate": 0.00017467380695046942, + "loss": 1.0165, + "step": 4896 + }, + { + "epoch": 0.232030324567638, + "grad_norm": 0.028076171875, + "learning_rate": 0.00017466390078686023, + "loss": 0.0017, + "step": 4897 + }, + { + "epoch": 0.23207770670457237, + "grad_norm": 0.55078125, + "learning_rate": 0.00017465399296728523, + "loss": 0.9253, + "step": 4898 + }, + { + "epoch": 0.23212508884150676, + "grad_norm": 0.55078125, + "learning_rate": 0.00017464408349196416, + "loss": 0.0854, + "step": 4899 + }, + { + "epoch": 0.23217247097844113, + "grad_norm": 0.59375, + "learning_rate": 0.00017463417236111684, + "loss": 1.1781, + "step": 4900 + }, + { + "epoch": 0.2322198531153755, + "grad_norm": 0.71875, + "learning_rate": 0.0001746242595749631, + "loss": 0.3372, + "step": 4901 + }, + { + "epoch": 0.23226723525230988, + "grad_norm": 0.6015625, + "learning_rate": 0.00017461434513372275, + "loss": 1.125, + "step": 4902 + }, + { + "epoch": 0.23231461738924425, + "grad_norm": 0.486328125, + "learning_rate": 0.00017460442903761573, + "loss": 0.0674, + "step": 4903 + }, + { + "epoch": 0.23236199952617864, + "grad_norm": 0.6796875, + "learning_rate": 0.00017459451128686192, + "loss": 1.2133, + "step": 4904 + }, + { + "epoch": 0.232409381663113, + "grad_norm": 0.494140625, + "learning_rate": 0.00017458459188168132, + "loss": 0.3097, + "step": 4905 + }, + { + "epoch": 0.23245676380004737, + "grad_norm": 0.166015625, + "learning_rate": 0.00017457467082229386, + "loss": 0.1143, + "step": 4906 + }, + { + "epoch": 0.23250414593698177, + "grad_norm": 0.48046875, + "learning_rate": 0.0001745647481089197, + "loss": 0.0897, + "step": 4907 + }, + { + "epoch": 0.23255152807391613, + "grad_norm": 0.8125, + "learning_rate": 0.0001745548237417788, + "loss": 0.6822, + "step": 4908 + }, + { + "epoch": 0.2325989102108505, + "grad_norm": 0.70703125, + "learning_rate": 0.00017454489772109134, + "loss": 1.2888, + "step": 4909 + }, + { + "epoch": 0.2326462923477849, + "grad_norm": 0.5390625, + "learning_rate": 0.00017453497004707747, + "loss": 0.613, + "step": 4910 + }, + { + "epoch": 0.23269367448471925, + "grad_norm": 1.03125, + "learning_rate": 0.00017452504071995733, + "loss": 0.8391, + "step": 4911 + }, + { + "epoch": 0.23274105662165365, + "grad_norm": 0.67578125, + "learning_rate": 0.00017451510973995115, + "loss": 1.3987, + "step": 4912 + }, + { + "epoch": 0.232788438758588, + "grad_norm": 0.640625, + "learning_rate": 0.00017450517710727924, + "loss": 0.8413, + "step": 4913 + }, + { + "epoch": 0.23283582089552238, + "grad_norm": 0.396484375, + "learning_rate": 0.00017449524282216186, + "loss": 0.1535, + "step": 4914 + }, + { + "epoch": 0.23288320303245677, + "grad_norm": 0.67578125, + "learning_rate": 0.00017448530688481934, + "loss": 0.9345, + "step": 4915 + }, + { + "epoch": 0.23293058516939114, + "grad_norm": 0.515625, + "learning_rate": 0.00017447536929547202, + "loss": 0.4542, + "step": 4916 + }, + { + "epoch": 0.23297796730632553, + "grad_norm": 0.52734375, + "learning_rate": 0.00017446543005434037, + "loss": 0.7109, + "step": 4917 + }, + { + "epoch": 0.2330253494432599, + "grad_norm": 0.37890625, + "learning_rate": 0.0001744554891616448, + "loss": 0.0557, + "step": 4918 + }, + { + "epoch": 0.23307273158019426, + "grad_norm": 0.74609375, + "learning_rate": 0.00017444554661760577, + "loss": 1.5583, + "step": 4919 + }, + { + "epoch": 0.23312011371712865, + "grad_norm": 0.7734375, + "learning_rate": 0.00017443560242244384, + "loss": 0.8715, + "step": 4920 + }, + { + "epoch": 0.23316749585406302, + "grad_norm": 0.65625, + "learning_rate": 0.0001744256565763795, + "loss": 0.6857, + "step": 4921 + }, + { + "epoch": 0.23321487799099738, + "grad_norm": 1.6796875, + "learning_rate": 0.0001744157090796334, + "loss": 1.169, + "step": 4922 + }, + { + "epoch": 0.23326226012793178, + "grad_norm": 0.61328125, + "learning_rate": 0.0001744057599324261, + "loss": 0.8634, + "step": 4923 + }, + { + "epoch": 0.23330964226486614, + "grad_norm": 0.61328125, + "learning_rate": 0.00017439580913497832, + "loss": 1.1702, + "step": 4924 + }, + { + "epoch": 0.23335702440180053, + "grad_norm": 0.73828125, + "learning_rate": 0.00017438585668751074, + "loss": 1.1863, + "step": 4925 + }, + { + "epoch": 0.2334044065387349, + "grad_norm": 0.53125, + "learning_rate": 0.0001743759025902441, + "loss": 1.2634, + "step": 4926 + }, + { + "epoch": 0.23345178867566926, + "grad_norm": 0.32421875, + "learning_rate": 0.00017436594684339912, + "loss": 0.0342, + "step": 4927 + }, + { + "epoch": 0.23349917081260366, + "grad_norm": 0.384765625, + "learning_rate": 0.0001743559894471967, + "loss": 0.1795, + "step": 4928 + }, + { + "epoch": 0.23354655294953802, + "grad_norm": 1.53125, + "learning_rate": 0.00017434603040185763, + "loss": 0.7102, + "step": 4929 + }, + { + "epoch": 0.2335939350864724, + "grad_norm": 0.57421875, + "learning_rate": 0.00017433606970760276, + "loss": 1.11, + "step": 4930 + }, + { + "epoch": 0.23364131722340678, + "grad_norm": 0.259765625, + "learning_rate": 0.00017432610736465307, + "loss": 0.1318, + "step": 4931 + }, + { + "epoch": 0.23368869936034115, + "grad_norm": 0.6015625, + "learning_rate": 0.00017431614337322948, + "loss": 1.1823, + "step": 4932 + }, + { + "epoch": 0.23373608149727554, + "grad_norm": 0.54296875, + "learning_rate": 0.00017430617773355297, + "loss": 0.7964, + "step": 4933 + }, + { + "epoch": 0.2337834636342099, + "grad_norm": 0.6953125, + "learning_rate": 0.00017429621044584464, + "loss": 0.956, + "step": 4934 + }, + { + "epoch": 0.23383084577114427, + "grad_norm": 0.54296875, + "learning_rate": 0.00017428624151032544, + "loss": 0.1908, + "step": 4935 + }, + { + "epoch": 0.23387822790807866, + "grad_norm": 0.67578125, + "learning_rate": 0.00017427627092721654, + "loss": 0.8539, + "step": 4936 + }, + { + "epoch": 0.23392561004501303, + "grad_norm": 0.498046875, + "learning_rate": 0.0001742662986967391, + "loss": 0.4029, + "step": 4937 + }, + { + "epoch": 0.2339729921819474, + "grad_norm": 0.052001953125, + "learning_rate": 0.00017425632481911423, + "loss": 0.0029, + "step": 4938 + }, + { + "epoch": 0.23402037431888179, + "grad_norm": 0.546875, + "learning_rate": 0.0001742463492945632, + "loss": 1.1076, + "step": 4939 + }, + { + "epoch": 0.23406775645581615, + "grad_norm": 0.91015625, + "learning_rate": 0.00017423637212330716, + "loss": 0.6563, + "step": 4940 + }, + { + "epoch": 0.23411513859275054, + "grad_norm": 0.498046875, + "learning_rate": 0.00017422639330556754, + "loss": 0.7361, + "step": 4941 + }, + { + "epoch": 0.2341625207296849, + "grad_norm": 0.84765625, + "learning_rate": 0.00017421641284156553, + "loss": 1.0227, + "step": 4942 + }, + { + "epoch": 0.23420990286661927, + "grad_norm": 0.71875, + "learning_rate": 0.00017420643073152254, + "loss": 0.5419, + "step": 4943 + }, + { + "epoch": 0.23425728500355367, + "grad_norm": 0.76171875, + "learning_rate": 0.00017419644697565996, + "loss": 0.9013, + "step": 4944 + }, + { + "epoch": 0.23430466714048803, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017418646157419922, + "loss": 0.0414, + "step": 4945 + }, + { + "epoch": 0.23435204927742243, + "grad_norm": 0.61328125, + "learning_rate": 0.00017417647452736178, + "loss": 0.0804, + "step": 4946 + }, + { + "epoch": 0.2343994314143568, + "grad_norm": 0.6953125, + "learning_rate": 0.00017416648583536915, + "loss": 0.9069, + "step": 4947 + }, + { + "epoch": 0.23444681355129116, + "grad_norm": 0.85546875, + "learning_rate": 0.00017415649549844286, + "loss": 1.0225, + "step": 4948 + }, + { + "epoch": 0.23449419568822555, + "grad_norm": 0.625, + "learning_rate": 0.00017414650351680447, + "loss": 1.2589, + "step": 4949 + }, + { + "epoch": 0.2345415778251599, + "grad_norm": 0.66015625, + "learning_rate": 0.00017413650989067564, + "loss": 0.8184, + "step": 4950 + }, + { + "epoch": 0.23458895996209428, + "grad_norm": 0.625, + "learning_rate": 0.00017412651462027798, + "loss": 1.5527, + "step": 4951 + }, + { + "epoch": 0.23463634209902867, + "grad_norm": 0.48046875, + "learning_rate": 0.00017411651770583318, + "loss": 0.1233, + "step": 4952 + }, + { + "epoch": 0.23468372423596304, + "grad_norm": 0.984375, + "learning_rate": 0.00017410651914756295, + "loss": 1.028, + "step": 4953 + }, + { + "epoch": 0.23473110637289743, + "grad_norm": 0.7265625, + "learning_rate": 0.00017409651894568907, + "loss": 0.8294, + "step": 4954 + }, + { + "epoch": 0.2347784885098318, + "grad_norm": 0.7421875, + "learning_rate": 0.00017408651710043333, + "loss": 0.0641, + "step": 4955 + }, + { + "epoch": 0.23482587064676616, + "grad_norm": 0.46875, + "learning_rate": 0.00017407651361201756, + "loss": 1.0256, + "step": 4956 + }, + { + "epoch": 0.23487325278370055, + "grad_norm": 0.95703125, + "learning_rate": 0.0001740665084806636, + "loss": 1.0128, + "step": 4957 + }, + { + "epoch": 0.23492063492063492, + "grad_norm": 0.7734375, + "learning_rate": 0.00017405650170659339, + "loss": 1.0555, + "step": 4958 + }, + { + "epoch": 0.23496801705756928, + "grad_norm": 0.66796875, + "learning_rate": 0.00017404649329002883, + "loss": 0.9352, + "step": 4959 + }, + { + "epoch": 0.23501539919450368, + "grad_norm": 0.224609375, + "learning_rate": 0.00017403648323119196, + "loss": 0.1448, + "step": 4960 + }, + { + "epoch": 0.23506278133143804, + "grad_norm": 0.64453125, + "learning_rate": 0.0001740264715303047, + "loss": 1.1987, + "step": 4961 + }, + { + "epoch": 0.23511016346837244, + "grad_norm": 0.28125, + "learning_rate": 0.00017401645818758917, + "loss": 0.0393, + "step": 4962 + }, + { + "epoch": 0.2351575456053068, + "grad_norm": 0.326171875, + "learning_rate": 0.00017400644320326745, + "loss": 0.1885, + "step": 4963 + }, + { + "epoch": 0.23520492774224117, + "grad_norm": 0.6015625, + "learning_rate": 0.00017399642657756162, + "loss": 0.9889, + "step": 4964 + }, + { + "epoch": 0.23525230987917556, + "grad_norm": 0.87109375, + "learning_rate": 0.0001739864083106939, + "loss": 0.9626, + "step": 4965 + }, + { + "epoch": 0.23529969201610992, + "grad_norm": 0.7265625, + "learning_rate": 0.00017397638840288643, + "loss": 0.9734, + "step": 4966 + }, + { + "epoch": 0.2353470741530443, + "grad_norm": 0.7265625, + "learning_rate": 0.00017396636685436149, + "loss": 0.3847, + "step": 4967 + }, + { + "epoch": 0.23539445628997868, + "grad_norm": 0.74609375, + "learning_rate": 0.00017395634366534131, + "loss": 1.0344, + "step": 4968 + }, + { + "epoch": 0.23544183842691305, + "grad_norm": 0.91796875, + "learning_rate": 0.00017394631883604818, + "loss": 1.1247, + "step": 4969 + }, + { + "epoch": 0.23548922056384744, + "grad_norm": 0.345703125, + "learning_rate": 0.00017393629236670446, + "loss": 0.0628, + "step": 4970 + }, + { + "epoch": 0.2355366027007818, + "grad_norm": 0.56640625, + "learning_rate": 0.00017392626425753255, + "loss": 0.7047, + "step": 4971 + }, + { + "epoch": 0.23558398483771617, + "grad_norm": 0.78515625, + "learning_rate": 0.00017391623450875482, + "loss": 1.1307, + "step": 4972 + }, + { + "epoch": 0.23563136697465056, + "grad_norm": 0.6484375, + "learning_rate": 0.00017390620312059376, + "loss": 1.0037, + "step": 4973 + }, + { + "epoch": 0.23567874911158493, + "grad_norm": 1.75, + "learning_rate": 0.00017389617009327184, + "loss": 1.0204, + "step": 4974 + }, + { + "epoch": 0.23572613124851932, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017388613542701156, + "loss": 0.1655, + "step": 4975 + }, + { + "epoch": 0.2357735133854537, + "grad_norm": 0.671875, + "learning_rate": 0.0001738760991220355, + "loss": 0.9813, + "step": 4976 + }, + { + "epoch": 0.23582089552238805, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017386606117856626, + "loss": 0.0226, + "step": 4977 + }, + { + "epoch": 0.23586827765932245, + "grad_norm": 0.494140625, + "learning_rate": 0.00017385602159682647, + "loss": 1.083, + "step": 4978 + }, + { + "epoch": 0.2359156597962568, + "grad_norm": 0.4765625, + "learning_rate": 0.00017384598037703877, + "loss": 1.045, + "step": 4979 + }, + { + "epoch": 0.23596304193319118, + "grad_norm": 0.65234375, + "learning_rate": 0.0001738359375194259, + "loss": 1.4046, + "step": 4980 + }, + { + "epoch": 0.23601042407012557, + "grad_norm": 0.6953125, + "learning_rate": 0.00017382589302421055, + "loss": 0.4487, + "step": 4981 + }, + { + "epoch": 0.23605780620705993, + "grad_norm": 0.7109375, + "learning_rate": 0.00017381584689161555, + "loss": 1.0711, + "step": 4982 + }, + { + "epoch": 0.23610518834399433, + "grad_norm": 0.5234375, + "learning_rate": 0.0001738057991218637, + "loss": 0.8805, + "step": 4983 + }, + { + "epoch": 0.2361525704809287, + "grad_norm": 0.4921875, + "learning_rate": 0.00017379574971517782, + "loss": 0.5025, + "step": 4984 + }, + { + "epoch": 0.23619995261786306, + "grad_norm": 0.62109375, + "learning_rate": 0.00017378569867178083, + "loss": 1.1161, + "step": 4985 + }, + { + "epoch": 0.23624733475479745, + "grad_norm": 0.26953125, + "learning_rate": 0.00017377564599189562, + "loss": 0.0181, + "step": 4986 + }, + { + "epoch": 0.23629471689173182, + "grad_norm": 0.2236328125, + "learning_rate": 0.00017376559167574517, + "loss": 0.1703, + "step": 4987 + }, + { + "epoch": 0.23634209902866618, + "grad_norm": 0.5546875, + "learning_rate": 0.00017375553572355248, + "loss": 0.5635, + "step": 4988 + }, + { + "epoch": 0.23638948116560057, + "grad_norm": 0.953125, + "learning_rate": 0.00017374547813554057, + "loss": 1.1397, + "step": 4989 + }, + { + "epoch": 0.23643686330253494, + "grad_norm": 0.80078125, + "learning_rate": 0.0001737354189119325, + "loss": 1.2917, + "step": 4990 + }, + { + "epoch": 0.23648424543946933, + "grad_norm": 1.171875, + "learning_rate": 0.00017372535805295136, + "loss": 0.1013, + "step": 4991 + }, + { + "epoch": 0.2365316275764037, + "grad_norm": 0.7890625, + "learning_rate": 0.00017371529555882032, + "loss": 0.8985, + "step": 4992 + }, + { + "epoch": 0.23657900971333806, + "grad_norm": 0.78125, + "learning_rate": 0.00017370523142976255, + "loss": 0.9875, + "step": 4993 + }, + { + "epoch": 0.23662639185027246, + "grad_norm": 0.703125, + "learning_rate": 0.00017369516566600126, + "loss": 0.7521, + "step": 4994 + }, + { + "epoch": 0.23667377398720682, + "grad_norm": 0.6640625, + "learning_rate": 0.00017368509826775968, + "loss": 1.1462, + "step": 4995 + }, + { + "epoch": 0.23672115612414119, + "grad_norm": 0.8671875, + "learning_rate": 0.00017367502923526108, + "loss": 1.8729, + "step": 4996 + }, + { + "epoch": 0.23676853826107558, + "grad_norm": 0.515625, + "learning_rate": 0.00017366495856872884, + "loss": 0.9345, + "step": 4997 + }, + { + "epoch": 0.23681592039800994, + "grad_norm": 0.78515625, + "learning_rate": 0.00017365488626838632, + "loss": 1.3382, + "step": 4998 + }, + { + "epoch": 0.23686330253494434, + "grad_norm": 0.64453125, + "learning_rate": 0.0001736448123344568, + "loss": 0.9335, + "step": 4999 + }, + { + "epoch": 0.2369106846718787, + "grad_norm": 0.56640625, + "learning_rate": 0.00017363473676716384, + "loss": 0.7158, + "step": 5000 + }, + { + "epoch": 0.23695806680881307, + "grad_norm": 0.609375, + "learning_rate": 0.00017362465956673078, + "loss": 1.3346, + "step": 5001 + }, + { + "epoch": 0.23700544894574746, + "grad_norm": 0.765625, + "learning_rate": 0.00017361458073338127, + "loss": 1.39, + "step": 5002 + }, + { + "epoch": 0.23705283108268183, + "grad_norm": 0.515625, + "learning_rate": 0.00017360450026733873, + "loss": 0.6038, + "step": 5003 + }, + { + "epoch": 0.23710021321961622, + "grad_norm": 0.65234375, + "learning_rate": 0.0001735944181688268, + "loss": 0.9036, + "step": 5004 + }, + { + "epoch": 0.23714759535655058, + "grad_norm": 0.5546875, + "learning_rate": 0.00017358433443806905, + "loss": 1.1041, + "step": 5005 + }, + { + "epoch": 0.23719497749348495, + "grad_norm": 0.56640625, + "learning_rate": 0.00017357424907528914, + "loss": 0.152, + "step": 5006 + }, + { + "epoch": 0.23724235963041934, + "grad_norm": 0.625, + "learning_rate": 0.00017356416208071074, + "loss": 0.0677, + "step": 5007 + }, + { + "epoch": 0.2372897417673537, + "grad_norm": 0.390625, + "learning_rate": 0.00017355407345455762, + "loss": 0.1848, + "step": 5008 + }, + { + "epoch": 0.23733712390428807, + "grad_norm": 0.7109375, + "learning_rate": 0.00017354398319705346, + "loss": 1.084, + "step": 5009 + }, + { + "epoch": 0.23738450604122247, + "grad_norm": 0.64453125, + "learning_rate": 0.0001735338913084221, + "loss": 1.2232, + "step": 5010 + }, + { + "epoch": 0.23743188817815683, + "grad_norm": 0.69140625, + "learning_rate": 0.00017352379778888736, + "loss": 0.6343, + "step": 5011 + }, + { + "epoch": 0.23747927031509122, + "grad_norm": 0.578125, + "learning_rate": 0.0001735137026386731, + "loss": 0.811, + "step": 5012 + }, + { + "epoch": 0.2375266524520256, + "grad_norm": 0.5, + "learning_rate": 0.0001735036058580032, + "loss": 0.4561, + "step": 5013 + }, + { + "epoch": 0.23757403458895995, + "grad_norm": 0.609375, + "learning_rate": 0.00017349350744710163, + "loss": 0.6335, + "step": 5014 + }, + { + "epoch": 0.23762141672589435, + "grad_norm": 0.3046875, + "learning_rate": 0.00017348340740619235, + "loss": 0.1175, + "step": 5015 + }, + { + "epoch": 0.2376687988628287, + "grad_norm": 0.65625, + "learning_rate": 0.00017347330573549936, + "loss": 1.1408, + "step": 5016 + }, + { + "epoch": 0.23771618099976308, + "grad_norm": 0.61328125, + "learning_rate": 0.0001734632024352467, + "loss": 0.99, + "step": 5017 + }, + { + "epoch": 0.23776356313669747, + "grad_norm": 0.609375, + "learning_rate": 0.00017345309750565848, + "loss": 0.1639, + "step": 5018 + }, + { + "epoch": 0.23781094527363184, + "grad_norm": 0.9609375, + "learning_rate": 0.0001734429909469588, + "loss": 0.7657, + "step": 5019 + }, + { + "epoch": 0.23785832741056623, + "grad_norm": 0.1962890625, + "learning_rate": 0.00017343288275937176, + "loss": 0.1353, + "step": 5020 + }, + { + "epoch": 0.2379057095475006, + "grad_norm": 0.3828125, + "learning_rate": 0.00017342277294312165, + "loss": 0.1467, + "step": 5021 + }, + { + "epoch": 0.23795309168443496, + "grad_norm": 0.8046875, + "learning_rate": 0.00017341266149843262, + "loss": 0.7441, + "step": 5022 + }, + { + "epoch": 0.23800047382136935, + "grad_norm": 0.234375, + "learning_rate": 0.00017340254842552897, + "loss": 0.1331, + "step": 5023 + }, + { + "epoch": 0.23804785595830372, + "grad_norm": 0.77734375, + "learning_rate": 0.00017339243372463495, + "loss": 1.0166, + "step": 5024 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.65625, + "learning_rate": 0.00017338231739597496, + "loss": 0.151, + "step": 5025 + }, + { + "epoch": 0.23814262023217247, + "grad_norm": 0.451171875, + "learning_rate": 0.00017337219943977332, + "loss": 0.0549, + "step": 5026 + }, + { + "epoch": 0.23819000236910684, + "grad_norm": 0.8046875, + "learning_rate": 0.00017336207985625443, + "loss": 0.4165, + "step": 5027 + }, + { + "epoch": 0.23823738450604123, + "grad_norm": 0.66796875, + "learning_rate": 0.00017335195864564277, + "loss": 1.1232, + "step": 5028 + }, + { + "epoch": 0.2382847666429756, + "grad_norm": 0.6171875, + "learning_rate": 0.00017334183580816279, + "loss": 1.1028, + "step": 5029 + }, + { + "epoch": 0.23833214877990996, + "grad_norm": 0.578125, + "learning_rate": 0.000173331711344039, + "loss": 0.9572, + "step": 5030 + }, + { + "epoch": 0.23837953091684436, + "grad_norm": 0.75, + "learning_rate": 0.000173321585253496, + "loss": 1.3829, + "step": 5031 + }, + { + "epoch": 0.23842691305377872, + "grad_norm": 0.042724609375, + "learning_rate": 0.0001733114575367583, + "loss": 0.0021, + "step": 5032 + }, + { + "epoch": 0.23847429519071311, + "grad_norm": 0.6015625, + "learning_rate": 0.00017330132819405058, + "loss": 0.3454, + "step": 5033 + }, + { + "epoch": 0.23852167732764748, + "grad_norm": 0.6796875, + "learning_rate": 0.00017329119722559749, + "loss": 0.557, + "step": 5034 + }, + { + "epoch": 0.23856905946458185, + "grad_norm": 0.017578125, + "learning_rate": 0.00017328106463162369, + "loss": 0.0011, + "step": 5035 + }, + { + "epoch": 0.23861644160151624, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001732709304123539, + "loss": 0.0309, + "step": 5036 + }, + { + "epoch": 0.2386638237384506, + "grad_norm": 0.55078125, + "learning_rate": 0.00017326079456801298, + "loss": 0.9946, + "step": 5037 + }, + { + "epoch": 0.23871120587538497, + "grad_norm": 0.5078125, + "learning_rate": 0.00017325065709882567, + "loss": 0.3698, + "step": 5038 + }, + { + "epoch": 0.23875858801231936, + "grad_norm": 0.474609375, + "learning_rate": 0.00017324051800501677, + "loss": 0.6622, + "step": 5039 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.90234375, + "learning_rate": 0.00017323037728681122, + "loss": 0.2755, + "step": 5040 + }, + { + "epoch": 0.23885335228618812, + "grad_norm": 0.9453125, + "learning_rate": 0.00017322023494443386, + "loss": 1.3305, + "step": 5041 + }, + { + "epoch": 0.23890073442312248, + "grad_norm": 0.89453125, + "learning_rate": 0.0001732100909781097, + "loss": 1.0599, + "step": 5042 + }, + { + "epoch": 0.23894811656005685, + "grad_norm": 0.365234375, + "learning_rate": 0.00017319994538806372, + "loss": 0.2098, + "step": 5043 + }, + { + "epoch": 0.23899549869699124, + "grad_norm": 0.01708984375, + "learning_rate": 0.00017318979817452091, + "loss": 0.0013, + "step": 5044 + }, + { + "epoch": 0.2390428808339256, + "grad_norm": 1.1953125, + "learning_rate": 0.00017317964933770633, + "loss": 0.483, + "step": 5045 + }, + { + "epoch": 0.23909026297085997, + "grad_norm": 0.2265625, + "learning_rate": 0.0001731694988778451, + "loss": 0.0344, + "step": 5046 + }, + { + "epoch": 0.23913764510779437, + "grad_norm": 0.67578125, + "learning_rate": 0.0001731593467951623, + "loss": 1.0277, + "step": 5047 + }, + { + "epoch": 0.23918502724472873, + "grad_norm": 0.76953125, + "learning_rate": 0.0001731491930898831, + "loss": 1.0551, + "step": 5048 + }, + { + "epoch": 0.23923240938166312, + "grad_norm": 0.498046875, + "learning_rate": 0.00017313903776223274, + "loss": 0.8756, + "step": 5049 + }, + { + "epoch": 0.2392797915185975, + "grad_norm": 0.263671875, + "learning_rate": 0.0001731288808124364, + "loss": 0.1851, + "step": 5050 + }, + { + "epoch": 0.23932717365553186, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017311872224071942, + "loss": 0.1446, + "step": 5051 + }, + { + "epoch": 0.23937455579246625, + "grad_norm": 0.486328125, + "learning_rate": 0.00017310856204730705, + "loss": 0.6632, + "step": 5052 + }, + { + "epoch": 0.2394219379294006, + "grad_norm": 0.142578125, + "learning_rate": 0.0001730984002324246, + "loss": 0.022, + "step": 5053 + }, + { + "epoch": 0.23946932006633498, + "grad_norm": 0.80078125, + "learning_rate": 0.00017308823679629756, + "loss": 1.0228, + "step": 5054 + }, + { + "epoch": 0.23951670220326937, + "grad_norm": 0.55078125, + "learning_rate": 0.00017307807173915123, + "loss": 0.9571, + "step": 5055 + }, + { + "epoch": 0.23956408434020374, + "grad_norm": 0.69140625, + "learning_rate": 0.00017306790506121114, + "loss": 1.0204, + "step": 5056 + }, + { + "epoch": 0.23961146647713813, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001730577367627027, + "loss": 0.0199, + "step": 5057 + }, + { + "epoch": 0.2396588486140725, + "grad_norm": 0.5234375, + "learning_rate": 0.00017304756684385152, + "loss": 1.0773, + "step": 5058 + }, + { + "epoch": 0.23970623075100686, + "grad_norm": 0.578125, + "learning_rate": 0.00017303739530488308, + "loss": 1.0701, + "step": 5059 + }, + { + "epoch": 0.23975361288794125, + "grad_norm": 0.59765625, + "learning_rate": 0.00017302722214602303, + "loss": 0.961, + "step": 5060 + }, + { + "epoch": 0.23980099502487562, + "grad_norm": 0.609375, + "learning_rate": 0.00017301704736749697, + "loss": 0.0564, + "step": 5061 + }, + { + "epoch": 0.23984837716181, + "grad_norm": 0.01483154296875, + "learning_rate": 0.0001730068709695306, + "loss": 0.001, + "step": 5062 + }, + { + "epoch": 0.23989575929874438, + "grad_norm": 0.671875, + "learning_rate": 0.00017299669295234955, + "loss": 1.3446, + "step": 5063 + }, + { + "epoch": 0.23994314143567874, + "grad_norm": 0.80859375, + "learning_rate": 0.0001729865133161796, + "loss": 1.3723, + "step": 5064 + }, + { + "epoch": 0.23999052357261313, + "grad_norm": 0.4140625, + "learning_rate": 0.00017297633206124656, + "loss": 0.1451, + "step": 5065 + }, + { + "epoch": 0.2400379057095475, + "grad_norm": 0.828125, + "learning_rate": 0.0001729661491877762, + "loss": 1.1708, + "step": 5066 + }, + { + "epoch": 0.24008528784648187, + "grad_norm": 0.64453125, + "learning_rate": 0.00017295596469599437, + "loss": 1.1451, + "step": 5067 + }, + { + "epoch": 0.24013266998341626, + "grad_norm": 0.5078125, + "learning_rate": 0.00017294577858612695, + "loss": 0.6699, + "step": 5068 + }, + { + "epoch": 0.24018005212035062, + "grad_norm": 0.7734375, + "learning_rate": 0.00017293559085839987, + "loss": 1.2246, + "step": 5069 + }, + { + "epoch": 0.24022743425728502, + "grad_norm": 0.462890625, + "learning_rate": 0.00017292540151303903, + "loss": 0.2145, + "step": 5070 + }, + { + "epoch": 0.24027481639421938, + "grad_norm": 0.37890625, + "learning_rate": 0.00017291521055027052, + "loss": 0.1655, + "step": 5071 + }, + { + "epoch": 0.24032219853115375, + "grad_norm": 0.64453125, + "learning_rate": 0.00017290501797032027, + "loss": 1.3479, + "step": 5072 + }, + { + "epoch": 0.24036958066808814, + "grad_norm": 0.19921875, + "learning_rate": 0.0001728948237734144, + "loss": 0.1186, + "step": 5073 + }, + { + "epoch": 0.2404169628050225, + "grad_norm": 0.58203125, + "learning_rate": 0.00017288462795977895, + "loss": 0.6707, + "step": 5074 + }, + { + "epoch": 0.24046434494195687, + "grad_norm": 0.87109375, + "learning_rate": 0.0001728744305296401, + "loss": 0.7204, + "step": 5075 + }, + { + "epoch": 0.24051172707889126, + "grad_norm": 0.62109375, + "learning_rate": 0.000172864231483224, + "loss": 0.9086, + "step": 5076 + }, + { + "epoch": 0.24055910921582563, + "grad_norm": 0.7734375, + "learning_rate": 0.00017285403082075687, + "loss": 1.2283, + "step": 5077 + }, + { + "epoch": 0.24060649135276002, + "grad_norm": 0.61328125, + "learning_rate": 0.00017284382854246494, + "loss": 0.8928, + "step": 5078 + }, + { + "epoch": 0.2406538734896944, + "grad_norm": 0.88671875, + "learning_rate": 0.00017283362464857446, + "loss": 0.3822, + "step": 5079 + }, + { + "epoch": 0.24070125562662875, + "grad_norm": 0.59375, + "learning_rate": 0.00017282341913931178, + "loss": 0.8967, + "step": 5080 + }, + { + "epoch": 0.24074863776356314, + "grad_norm": 0.7421875, + "learning_rate": 0.0001728132120149032, + "loss": 1.743, + "step": 5081 + }, + { + "epoch": 0.2407960199004975, + "grad_norm": 0.6875, + "learning_rate": 0.0001728030032755752, + "loss": 1.162, + "step": 5082 + }, + { + "epoch": 0.24084340203743188, + "grad_norm": 0.6875, + "learning_rate": 0.00017279279292155408, + "loss": 1.3576, + "step": 5083 + }, + { + "epoch": 0.24089078417436627, + "grad_norm": 0.1455078125, + "learning_rate": 0.00017278258095306637, + "loss": 0.0138, + "step": 5084 + }, + { + "epoch": 0.24093816631130063, + "grad_norm": 0.2578125, + "learning_rate": 0.00017277236737033854, + "loss": 0.1336, + "step": 5085 + }, + { + "epoch": 0.24098554844823503, + "grad_norm": 0.54296875, + "learning_rate": 0.0001727621521735971, + "loss": 0.7657, + "step": 5086 + }, + { + "epoch": 0.2410329305851694, + "grad_norm": 0.1826171875, + "learning_rate": 0.00017275193536306864, + "loss": 0.1299, + "step": 5087 + }, + { + "epoch": 0.24108031272210376, + "grad_norm": 0.6328125, + "learning_rate": 0.00017274171693897975, + "loss": 0.9155, + "step": 5088 + }, + { + "epoch": 0.24112769485903815, + "grad_norm": 0.484375, + "learning_rate": 0.00017273149690155703, + "loss": 0.6494, + "step": 5089 + }, + { + "epoch": 0.24117507699597251, + "grad_norm": 0.59765625, + "learning_rate": 0.00017272127525102721, + "loss": 0.7286, + "step": 5090 + }, + { + "epoch": 0.2412224591329069, + "grad_norm": 0.5625, + "learning_rate": 0.00017271105198761694, + "loss": 1.1988, + "step": 5091 + }, + { + "epoch": 0.24126984126984127, + "grad_norm": 0.234375, + "learning_rate": 0.00017270082711155302, + "loss": 0.1147, + "step": 5092 + }, + { + "epoch": 0.24131722340677564, + "grad_norm": 0.75390625, + "learning_rate": 0.00017269060062306214, + "loss": 1.0916, + "step": 5093 + }, + { + "epoch": 0.24136460554371003, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017268037252237122, + "loss": 0.0054, + "step": 5094 + }, + { + "epoch": 0.2414119876806444, + "grad_norm": 0.85546875, + "learning_rate": 0.00017267014280970702, + "loss": 0.4695, + "step": 5095 + }, + { + "epoch": 0.24145936981757876, + "grad_norm": 0.9375, + "learning_rate": 0.00017265991148529648, + "loss": 1.0174, + "step": 5096 + }, + { + "epoch": 0.24150675195451315, + "grad_norm": 0.287109375, + "learning_rate": 0.0001726496785493665, + "loss": 0.1565, + "step": 5097 + }, + { + "epoch": 0.24155413409144752, + "grad_norm": 0.54296875, + "learning_rate": 0.000172639444002144, + "loss": 1.2032, + "step": 5098 + }, + { + "epoch": 0.2416015162283819, + "grad_norm": 0.474609375, + "learning_rate": 0.00017262920784385602, + "loss": 1.0722, + "step": 5099 + }, + { + "epoch": 0.24164889836531628, + "grad_norm": 0.31640625, + "learning_rate": 0.00017261897007472956, + "loss": 0.0169, + "step": 5100 + }, + { + "epoch": 0.24169628050225064, + "grad_norm": 0.671875, + "learning_rate": 0.00017260873069499172, + "loss": 1.778, + "step": 5101 + }, + { + "epoch": 0.24174366263918504, + "grad_norm": 0.65625, + "learning_rate": 0.00017259848970486955, + "loss": 0.4495, + "step": 5102 + }, + { + "epoch": 0.2417910447761194, + "grad_norm": 0.640625, + "learning_rate": 0.00017258824710459023, + "loss": 0.9077, + "step": 5103 + }, + { + "epoch": 0.24183842691305377, + "grad_norm": 0.1796875, + "learning_rate": 0.0001725780028943809, + "loss": 0.0096, + "step": 5104 + }, + { + "epoch": 0.24188580904998816, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017256775707446875, + "loss": 0.0437, + "step": 5105 + }, + { + "epoch": 0.24193319118692252, + "grad_norm": 0.6171875, + "learning_rate": 0.00017255750964508107, + "loss": 1.0936, + "step": 5106 + }, + { + "epoch": 0.24198057332385692, + "grad_norm": 0.56640625, + "learning_rate": 0.00017254726060644512, + "loss": 0.0998, + "step": 5107 + }, + { + "epoch": 0.24202795546079128, + "grad_norm": 0.7265625, + "learning_rate": 0.00017253700995878814, + "loss": 1.4343, + "step": 5108 + }, + { + "epoch": 0.24207533759772565, + "grad_norm": 0.7578125, + "learning_rate": 0.00017252675770233758, + "loss": 1.1585, + "step": 5109 + }, + { + "epoch": 0.24212271973466004, + "grad_norm": 0.51171875, + "learning_rate": 0.0001725165038373208, + "loss": 0.9669, + "step": 5110 + }, + { + "epoch": 0.2421701018715944, + "grad_norm": 0.466796875, + "learning_rate": 0.0001725062483639652, + "loss": 0.1645, + "step": 5111 + }, + { + "epoch": 0.24221748400852877, + "grad_norm": 0.462890625, + "learning_rate": 0.00017249599128249825, + "loss": 0.8636, + "step": 5112 + }, + { + "epoch": 0.24226486614546316, + "grad_norm": 0.2177734375, + "learning_rate": 0.00017248573259314739, + "loss": 0.1532, + "step": 5113 + }, + { + "epoch": 0.24231224828239753, + "grad_norm": 0.72265625, + "learning_rate": 0.00017247547229614022, + "loss": 1.4756, + "step": 5114 + }, + { + "epoch": 0.24235963041933192, + "grad_norm": 0.5, + "learning_rate": 0.00017246521039170429, + "loss": 0.7939, + "step": 5115 + }, + { + "epoch": 0.2424070125562663, + "grad_norm": 0.59375, + "learning_rate": 0.00017245494688006716, + "loss": 0.8726, + "step": 5116 + }, + { + "epoch": 0.24245439469320065, + "grad_norm": 0.54296875, + "learning_rate": 0.00017244468176145648, + "loss": 1.7517, + "step": 5117 + }, + { + "epoch": 0.24250177683013505, + "grad_norm": 0.259765625, + "learning_rate": 0.00017243441503609993, + "loss": 0.0335, + "step": 5118 + }, + { + "epoch": 0.2425491589670694, + "grad_norm": 0.54296875, + "learning_rate": 0.00017242414670422523, + "loss": 0.7218, + "step": 5119 + }, + { + "epoch": 0.2425965411040038, + "grad_norm": 0.6953125, + "learning_rate": 0.00017241387676606004, + "loss": 0.6616, + "step": 5120 + }, + { + "epoch": 0.24264392324093817, + "grad_norm": 0.150390625, + "learning_rate": 0.00017240360522183224, + "loss": 0.0204, + "step": 5121 + }, + { + "epoch": 0.24269130537787253, + "grad_norm": 0.53515625, + "learning_rate": 0.0001723933320717696, + "loss": 0.7932, + "step": 5122 + }, + { + "epoch": 0.24273868751480693, + "grad_norm": 0.19140625, + "learning_rate": 0.00017238305731609997, + "loss": 0.1196, + "step": 5123 + }, + { + "epoch": 0.2427860696517413, + "grad_norm": 0.546875, + "learning_rate": 0.00017237278095505118, + "loss": 0.6768, + "step": 5124 + }, + { + "epoch": 0.24283345178867566, + "grad_norm": 0.6484375, + "learning_rate": 0.00017236250298885124, + "loss": 0.7022, + "step": 5125 + }, + { + "epoch": 0.24288083392561005, + "grad_norm": 0.71875, + "learning_rate": 0.00017235222341772802, + "loss": 1.0216, + "step": 5126 + }, + { + "epoch": 0.24292821606254442, + "grad_norm": 0.65234375, + "learning_rate": 0.00017234194224190961, + "loss": 0.9276, + "step": 5127 + }, + { + "epoch": 0.2429755981994788, + "grad_norm": 0.59375, + "learning_rate": 0.00017233165946162394, + "loss": 1.2332, + "step": 5128 + }, + { + "epoch": 0.24302298033641317, + "grad_norm": 0.1328125, + "learning_rate": 0.00017232137507709912, + "loss": 0.0201, + "step": 5129 + }, + { + "epoch": 0.24307036247334754, + "grad_norm": 0.6953125, + "learning_rate": 0.0001723110890885632, + "loss": 0.7718, + "step": 5130 + }, + { + "epoch": 0.24311774461028193, + "grad_norm": 0.71484375, + "learning_rate": 0.0001723008014962444, + "loss": 1.0057, + "step": 5131 + }, + { + "epoch": 0.2431651267472163, + "grad_norm": 0.515625, + "learning_rate": 0.00017229051230037082, + "loss": 0.9161, + "step": 5132 + }, + { + "epoch": 0.24321250888415066, + "grad_norm": 0.76953125, + "learning_rate": 0.00017228022150117065, + "loss": 1.0776, + "step": 5133 + }, + { + "epoch": 0.24325989102108506, + "grad_norm": 0.53125, + "learning_rate": 0.00017226992909887215, + "loss": 0.0449, + "step": 5134 + }, + { + "epoch": 0.24330727315801942, + "grad_norm": 0.859375, + "learning_rate": 0.0001722596350937036, + "loss": 1.0285, + "step": 5135 + }, + { + "epoch": 0.24335465529495381, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017224933948589336, + "loss": 0.0313, + "step": 5136 + }, + { + "epoch": 0.24340203743188818, + "grad_norm": 0.64453125, + "learning_rate": 0.0001722390422756697, + "loss": 1.1853, + "step": 5137 + }, + { + "epoch": 0.24344941956882254, + "grad_norm": 0.87890625, + "learning_rate": 0.00017222874346326103, + "loss": 0.7152, + "step": 5138 + }, + { + "epoch": 0.24349680170575694, + "grad_norm": 0.4453125, + "learning_rate": 0.00017221844304889577, + "loss": 0.1681, + "step": 5139 + }, + { + "epoch": 0.2435441838426913, + "grad_norm": 0.5390625, + "learning_rate": 0.00017220814103280233, + "loss": 0.7481, + "step": 5140 + }, + { + "epoch": 0.24359156597962567, + "grad_norm": 0.4140625, + "learning_rate": 0.0001721978374152093, + "loss": 0.026, + "step": 5141 + }, + { + "epoch": 0.24363894811656006, + "grad_norm": 0.68359375, + "learning_rate": 0.0001721875321963451, + "loss": 1.3398, + "step": 5142 + }, + { + "epoch": 0.24368633025349443, + "grad_norm": 1.6796875, + "learning_rate": 0.0001721772253764383, + "loss": 0.5379, + "step": 5143 + }, + { + "epoch": 0.24373371239042882, + "grad_norm": 0.89453125, + "learning_rate": 0.00017216691695571756, + "loss": 1.3066, + "step": 5144 + }, + { + "epoch": 0.24378109452736318, + "grad_norm": 0.494140625, + "learning_rate": 0.00017215660693441147, + "loss": 0.4534, + "step": 5145 + }, + { + "epoch": 0.24382847666429755, + "grad_norm": 0.392578125, + "learning_rate": 0.00017214629531274865, + "loss": 0.2493, + "step": 5146 + }, + { + "epoch": 0.24387585880123194, + "grad_norm": 0.447265625, + "learning_rate": 0.00017213598209095792, + "loss": 0.9154, + "step": 5147 + }, + { + "epoch": 0.2439232409381663, + "grad_norm": 0.71484375, + "learning_rate": 0.00017212566726926789, + "loss": 0.4233, + "step": 5148 + }, + { + "epoch": 0.24397062307510067, + "grad_norm": 0.11474609375, + "learning_rate": 0.0001721153508479074, + "loss": 0.0072, + "step": 5149 + }, + { + "epoch": 0.24401800521203507, + "grad_norm": 0.76171875, + "learning_rate": 0.00017210503282710527, + "loss": 0.8346, + "step": 5150 + }, + { + "epoch": 0.24406538734896943, + "grad_norm": 0.49609375, + "learning_rate": 0.00017209471320709025, + "loss": 0.2293, + "step": 5151 + }, + { + "epoch": 0.24411276948590382, + "grad_norm": 0.6796875, + "learning_rate": 0.00017208439198809132, + "loss": 0.8109, + "step": 5152 + }, + { + "epoch": 0.2441601516228382, + "grad_norm": 0.06982421875, + "learning_rate": 0.00017207406917033738, + "loss": 0.003, + "step": 5153 + }, + { + "epoch": 0.24420753375977255, + "grad_norm": 0.73828125, + "learning_rate": 0.0001720637447540573, + "loss": 1.1695, + "step": 5154 + }, + { + "epoch": 0.24425491589670695, + "grad_norm": 0.64453125, + "learning_rate": 0.00017205341873948018, + "loss": 0.9309, + "step": 5155 + }, + { + "epoch": 0.2443022980336413, + "grad_norm": 0.81640625, + "learning_rate": 0.00017204309112683493, + "loss": 0.9097, + "step": 5156 + }, + { + "epoch": 0.2443496801705757, + "grad_norm": 0.365234375, + "learning_rate": 0.0001720327619163507, + "loss": 0.0052, + "step": 5157 + }, + { + "epoch": 0.24439706230751007, + "grad_norm": 0.1923828125, + "learning_rate": 0.00017202243110825652, + "loss": 0.0227, + "step": 5158 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.49609375, + "learning_rate": 0.00017201209870278152, + "loss": 0.726, + "step": 5159 + }, + { + "epoch": 0.24449182658137883, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017200176470015486, + "loss": 0.0155, + "step": 5160 + }, + { + "epoch": 0.2445392087183132, + "grad_norm": 0.60546875, + "learning_rate": 0.0001719914291006058, + "loss": 1.3043, + "step": 5161 + }, + { + "epoch": 0.24458659085524756, + "grad_norm": 0.671875, + "learning_rate": 0.0001719810919043635, + "loss": 0.9903, + "step": 5162 + }, + { + "epoch": 0.24463397299218195, + "grad_norm": 1.453125, + "learning_rate": 0.00017197075311165723, + "loss": 0.0655, + "step": 5163 + }, + { + "epoch": 0.24468135512911632, + "grad_norm": 0.6328125, + "learning_rate": 0.00017196041272271635, + "loss": 0.936, + "step": 5164 + }, + { + "epoch": 0.2447287372660507, + "grad_norm": 0.470703125, + "learning_rate": 0.00017195007073777014, + "loss": 0.0852, + "step": 5165 + }, + { + "epoch": 0.24477611940298508, + "grad_norm": 0.53125, + "learning_rate": 0.000171939727157048, + "loss": 0.6782, + "step": 5166 + }, + { + "epoch": 0.24482350153991944, + "grad_norm": 0.75, + "learning_rate": 0.00017192938198077936, + "loss": 1.1449, + "step": 5167 + }, + { + "epoch": 0.24487088367685383, + "grad_norm": 0.7734375, + "learning_rate": 0.00017191903520919364, + "loss": 1.3613, + "step": 5168 + }, + { + "epoch": 0.2449182658137882, + "grad_norm": 0.6796875, + "learning_rate": 0.0001719086868425203, + "loss": 1.1635, + "step": 5169 + }, + { + "epoch": 0.24496564795072256, + "grad_norm": 0.6171875, + "learning_rate": 0.0001718983368809889, + "loss": 0.7599, + "step": 5170 + }, + { + "epoch": 0.24501303008765696, + "grad_norm": 0.33203125, + "learning_rate": 0.00017188798532482896, + "loss": 0.0246, + "step": 5171 + }, + { + "epoch": 0.24506041222459132, + "grad_norm": 0.6796875, + "learning_rate": 0.0001718776321742701, + "loss": 1.2764, + "step": 5172 + }, + { + "epoch": 0.24510779436152572, + "grad_norm": 0.75, + "learning_rate": 0.00017186727742954188, + "loss": 0.1184, + "step": 5173 + }, + { + "epoch": 0.24515517649846008, + "grad_norm": 0.6484375, + "learning_rate": 0.00017185692109087403, + "loss": 0.9347, + "step": 5174 + }, + { + "epoch": 0.24520255863539445, + "grad_norm": 0.453125, + "learning_rate": 0.00017184656315849618, + "loss": 0.4884, + "step": 5175 + }, + { + "epoch": 0.24524994077232884, + "grad_norm": 0.255859375, + "learning_rate": 0.0001718362036326381, + "loss": 0.1475, + "step": 5176 + }, + { + "epoch": 0.2452973229092632, + "grad_norm": 0.5, + "learning_rate": 0.00017182584251352955, + "loss": 1.0809, + "step": 5177 + }, + { + "epoch": 0.24534470504619757, + "grad_norm": 0.0037689208984375, + "learning_rate": 0.00017181547980140032, + "loss": 0.0003, + "step": 5178 + }, + { + "epoch": 0.24539208718313196, + "grad_norm": 0.73046875, + "learning_rate": 0.00017180511549648024, + "loss": 1.2188, + "step": 5179 + }, + { + "epoch": 0.24543946932006633, + "grad_norm": 0.341796875, + "learning_rate": 0.00017179474959899918, + "loss": 0.0038, + "step": 5180 + }, + { + "epoch": 0.24548685145700072, + "grad_norm": 0.310546875, + "learning_rate": 0.00017178438210918703, + "loss": 0.0334, + "step": 5181 + }, + { + "epoch": 0.2455342335939351, + "grad_norm": 0.765625, + "learning_rate": 0.00017177401302727376, + "loss": 0.0359, + "step": 5182 + }, + { + "epoch": 0.24558161573086945, + "grad_norm": 0.70703125, + "learning_rate": 0.00017176364235348932, + "loss": 1.4391, + "step": 5183 + }, + { + "epoch": 0.24562899786780384, + "grad_norm": 0.498046875, + "learning_rate": 0.00017175327008806375, + "loss": 0.6934, + "step": 5184 + }, + { + "epoch": 0.2456763800047382, + "grad_norm": 0.390625, + "learning_rate": 0.00017174289623122705, + "loss": 0.0347, + "step": 5185 + }, + { + "epoch": 0.2457237621416726, + "grad_norm": 0.64453125, + "learning_rate": 0.00017173252078320935, + "loss": 1.4223, + "step": 5186 + }, + { + "epoch": 0.24577114427860697, + "grad_norm": 0.5546875, + "learning_rate": 0.00017172214374424076, + "loss": 0.9328, + "step": 5187 + }, + { + "epoch": 0.24581852641554133, + "grad_norm": 0.26953125, + "learning_rate": 0.0001717117651145514, + "loss": 0.0053, + "step": 5188 + }, + { + "epoch": 0.24586590855247573, + "grad_norm": 0.498046875, + "learning_rate": 0.00017170138489437146, + "loss": 0.4809, + "step": 5189 + }, + { + "epoch": 0.2459132906894101, + "grad_norm": 0.70703125, + "learning_rate": 0.0001716910030839312, + "loss": 0.1269, + "step": 5190 + }, + { + "epoch": 0.24596067282634446, + "grad_norm": 0.546875, + "learning_rate": 0.00017168061968346083, + "loss": 0.6677, + "step": 5191 + }, + { + "epoch": 0.24600805496327885, + "grad_norm": 0.09912109375, + "learning_rate": 0.0001716702346931907, + "loss": 0.0111, + "step": 5192 + }, + { + "epoch": 0.24605543710021321, + "grad_norm": 0.78515625, + "learning_rate": 0.00017165984811335106, + "loss": 1.6476, + "step": 5193 + }, + { + "epoch": 0.2461028192371476, + "grad_norm": 0.7265625, + "learning_rate": 0.00017164945994417233, + "loss": 1.1426, + "step": 5194 + }, + { + "epoch": 0.24615020137408197, + "grad_norm": 0.83984375, + "learning_rate": 0.00017163907018588492, + "loss": 0.6152, + "step": 5195 + }, + { + "epoch": 0.24619758351101634, + "grad_norm": 0.65625, + "learning_rate": 0.00017162867883871924, + "loss": 0.9099, + "step": 5196 + }, + { + "epoch": 0.24624496564795073, + "grad_norm": 0.388671875, + "learning_rate": 0.00017161828590290572, + "loss": 0.0948, + "step": 5197 + }, + { + "epoch": 0.2462923477848851, + "grad_norm": 0.478515625, + "learning_rate": 0.00017160789137867495, + "loss": 0.5108, + "step": 5198 + }, + { + "epoch": 0.24633972992181946, + "grad_norm": 0.361328125, + "learning_rate": 0.0001715974952662574, + "loss": 0.1785, + "step": 5199 + }, + { + "epoch": 0.24638711205875385, + "grad_norm": 0.1083984375, + "learning_rate": 0.00017158709756588366, + "loss": 0.0049, + "step": 5200 + }, + { + "epoch": 0.24643449419568822, + "grad_norm": 0.76953125, + "learning_rate": 0.00017157669827778436, + "loss": 0.2206, + "step": 5201 + }, + { + "epoch": 0.2464818763326226, + "grad_norm": 0.01275634765625, + "learning_rate": 0.0001715662974021901, + "loss": 0.0009, + "step": 5202 + }, + { + "epoch": 0.24652925846955698, + "grad_norm": 0.98828125, + "learning_rate": 0.00017155589493933162, + "loss": 0.2739, + "step": 5203 + }, + { + "epoch": 0.24657664060649134, + "grad_norm": 0.56640625, + "learning_rate": 0.0001715454908894396, + "loss": 0.7662, + "step": 5204 + }, + { + "epoch": 0.24662402274342574, + "grad_norm": 0.69921875, + "learning_rate": 0.0001715350852527448, + "loss": 0.7587, + "step": 5205 + }, + { + "epoch": 0.2466714048803601, + "grad_norm": 0.50390625, + "learning_rate": 0.00017152467802947804, + "loss": 1.1468, + "step": 5206 + }, + { + "epoch": 0.24671878701729447, + "grad_norm": 0.27734375, + "learning_rate": 0.00017151426921987008, + "loss": 0.1373, + "step": 5207 + }, + { + "epoch": 0.24676616915422886, + "grad_norm": 0.0732421875, + "learning_rate": 0.0001715038588241518, + "loss": 0.0061, + "step": 5208 + }, + { + "epoch": 0.24681355129116322, + "grad_norm": 0.65625, + "learning_rate": 0.0001714934468425541, + "loss": 0.9072, + "step": 5209 + }, + { + "epoch": 0.24686093342809762, + "grad_norm": 0.54296875, + "learning_rate": 0.00017148303327530788, + "loss": 0.7494, + "step": 5210 + }, + { + "epoch": 0.24690831556503198, + "grad_norm": 0.515625, + "learning_rate": 0.00017147261812264412, + "loss": 0.5658, + "step": 5211 + }, + { + "epoch": 0.24695569770196635, + "grad_norm": 0.6875, + "learning_rate": 0.00017146220138479384, + "loss": 1.052, + "step": 5212 + }, + { + "epoch": 0.24700307983890074, + "grad_norm": 0.6640625, + "learning_rate": 0.00017145178306198806, + "loss": 0.1219, + "step": 5213 + }, + { + "epoch": 0.2470504619758351, + "grad_norm": 0.6953125, + "learning_rate": 0.00017144136315445783, + "loss": 0.7649, + "step": 5214 + }, + { + "epoch": 0.2470978441127695, + "grad_norm": 0.78125, + "learning_rate": 0.00017143094166243423, + "loss": 1.2744, + "step": 5215 + }, + { + "epoch": 0.24714522624970386, + "grad_norm": 0.298828125, + "learning_rate": 0.00017142051858614848, + "loss": 0.0192, + "step": 5216 + }, + { + "epoch": 0.24719260838663823, + "grad_norm": 0.60546875, + "learning_rate": 0.00017141009392583167, + "loss": 1.2459, + "step": 5217 + }, + { + "epoch": 0.24723999052357262, + "grad_norm": 0.59375, + "learning_rate": 0.00017139966768171504, + "loss": 0.5501, + "step": 5218 + }, + { + "epoch": 0.247287372660507, + "grad_norm": 0.359375, + "learning_rate": 0.00017138923985402985, + "loss": 0.2089, + "step": 5219 + }, + { + "epoch": 0.24733475479744135, + "grad_norm": 0.59375, + "learning_rate": 0.00017137881044300735, + "loss": 0.794, + "step": 5220 + }, + { + "epoch": 0.24738213693437575, + "grad_norm": 0.65234375, + "learning_rate": 0.00017136837944887887, + "loss": 0.9825, + "step": 5221 + }, + { + "epoch": 0.2474295190713101, + "grad_norm": 0.6171875, + "learning_rate": 0.00017135794687187574, + "loss": 0.9264, + "step": 5222 + }, + { + "epoch": 0.2474769012082445, + "grad_norm": 0.66015625, + "learning_rate": 0.00017134751271222936, + "loss": 1.1857, + "step": 5223 + }, + { + "epoch": 0.24752428334517887, + "grad_norm": 0.396484375, + "learning_rate": 0.00017133707697017115, + "loss": 0.0222, + "step": 5224 + }, + { + "epoch": 0.24757166548211323, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017132663964593254, + "loss": 0.0491, + "step": 5225 + }, + { + "epoch": 0.24761904761904763, + "grad_norm": 0.67578125, + "learning_rate": 0.00017131620073974503, + "loss": 1.0718, + "step": 5226 + }, + { + "epoch": 0.247666429755982, + "grad_norm": 0.412109375, + "learning_rate": 0.0001713057602518402, + "loss": 0.4596, + "step": 5227 + }, + { + "epoch": 0.24771381189291636, + "grad_norm": 0.609375, + "learning_rate": 0.00017129531818244954, + "loss": 0.0546, + "step": 5228 + }, + { + "epoch": 0.24776119402985075, + "grad_norm": 0.75390625, + "learning_rate": 0.00017128487453180462, + "loss": 0.8905, + "step": 5229 + }, + { + "epoch": 0.24780857616678512, + "grad_norm": 0.50390625, + "learning_rate": 0.00017127442930013715, + "loss": 0.538, + "step": 5230 + }, + { + "epoch": 0.2478559583037195, + "grad_norm": 0.46875, + "learning_rate": 0.00017126398248767875, + "loss": 0.144, + "step": 5231 + }, + { + "epoch": 0.24790334044065387, + "grad_norm": 0.7265625, + "learning_rate": 0.0001712535340946611, + "loss": 1.2883, + "step": 5232 + }, + { + "epoch": 0.24795072257758824, + "grad_norm": 0.671875, + "learning_rate": 0.000171243084121316, + "loss": 1.1133, + "step": 5233 + }, + { + "epoch": 0.24799810471452263, + "grad_norm": 0.21484375, + "learning_rate": 0.00017123263256787517, + "loss": 0.0404, + "step": 5234 + }, + { + "epoch": 0.248045486851457, + "grad_norm": 0.189453125, + "learning_rate": 0.0001712221794345704, + "loss": 0.1245, + "step": 5235 + }, + { + "epoch": 0.24809286898839136, + "grad_norm": 0.328125, + "learning_rate": 0.00017121172472163356, + "loss": 0.042, + "step": 5236 + }, + { + "epoch": 0.24814025112532576, + "grad_norm": 0.875, + "learning_rate": 0.00017120126842929656, + "loss": 0.4715, + "step": 5237 + }, + { + "epoch": 0.24818763326226012, + "grad_norm": 0.59765625, + "learning_rate": 0.0001711908105577912, + "loss": 1.1695, + "step": 5238 + }, + { + "epoch": 0.24823501539919451, + "grad_norm": 0.83984375, + "learning_rate": 0.00017118035110734954, + "loss": 0.9905, + "step": 5239 + }, + { + "epoch": 0.24828239753612888, + "grad_norm": 0.051025390625, + "learning_rate": 0.0001711698900782035, + "loss": 0.0019, + "step": 5240 + }, + { + "epoch": 0.24832977967306324, + "grad_norm": 0.55859375, + "learning_rate": 0.0001711594274705851, + "loss": 0.9222, + "step": 5241 + }, + { + "epoch": 0.24837716180999764, + "grad_norm": 0.703125, + "learning_rate": 0.00017114896328472638, + "loss": 0.2917, + "step": 5242 + }, + { + "epoch": 0.248424543946932, + "grad_norm": 0.439453125, + "learning_rate": 0.00017113849752085946, + "loss": 0.5447, + "step": 5243 + }, + { + "epoch": 0.2484719260838664, + "grad_norm": 0.58984375, + "learning_rate": 0.0001711280301792164, + "loss": 1.1684, + "step": 5244 + }, + { + "epoch": 0.24851930822080076, + "grad_norm": 1.046875, + "learning_rate": 0.00017111756126002945, + "loss": 1.2022, + "step": 5245 + }, + { + "epoch": 0.24856669035773513, + "grad_norm": 0.70703125, + "learning_rate": 0.00017110709076353068, + "loss": 0.9585, + "step": 5246 + }, + { + "epoch": 0.24861407249466952, + "grad_norm": 0.73828125, + "learning_rate": 0.00017109661868995244, + "loss": 0.932, + "step": 5247 + }, + { + "epoch": 0.24866145463160388, + "grad_norm": 0.70703125, + "learning_rate": 0.0001710861450395269, + "loss": 1.2129, + "step": 5248 + }, + { + "epoch": 0.24870883676853825, + "grad_norm": 0.3046875, + "learning_rate": 0.00017107566981248637, + "loss": 0.0886, + "step": 5249 + }, + { + "epoch": 0.24875621890547264, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001710651930090632, + "loss": 0.0481, + "step": 5250 + }, + { + "epoch": 0.248803601042407, + "grad_norm": 0.63671875, + "learning_rate": 0.00017105471462948975, + "loss": 0.1141, + "step": 5251 + }, + { + "epoch": 0.2488509831793414, + "grad_norm": 0.76953125, + "learning_rate": 0.00017104423467399838, + "loss": 0.9654, + "step": 5252 + }, + { + "epoch": 0.24889836531627577, + "grad_norm": 1.6328125, + "learning_rate": 0.0001710337531428216, + "loss": 0.7203, + "step": 5253 + }, + { + "epoch": 0.24894574745321013, + "grad_norm": 0.69921875, + "learning_rate": 0.00017102327003619183, + "loss": 1.1123, + "step": 5254 + }, + { + "epoch": 0.24899312959014452, + "grad_norm": 0.57421875, + "learning_rate": 0.00017101278535434155, + "loss": 0.857, + "step": 5255 + }, + { + "epoch": 0.2490405117270789, + "grad_norm": 0.76171875, + "learning_rate": 0.00017100229909750337, + "loss": 1.2906, + "step": 5256 + }, + { + "epoch": 0.24908789386401325, + "grad_norm": 0.515625, + "learning_rate": 0.0001709918112659098, + "loss": 0.7463, + "step": 5257 + }, + { + "epoch": 0.24913527600094765, + "grad_norm": 0.0341796875, + "learning_rate": 0.00017098132185979346, + "loss": 0.0013, + "step": 5258 + }, + { + "epoch": 0.249182658137882, + "grad_norm": 0.68359375, + "learning_rate": 0.00017097083087938705, + "loss": 1.2393, + "step": 5259 + }, + { + "epoch": 0.2492300402748164, + "grad_norm": 0.38671875, + "learning_rate": 0.00017096033832492317, + "loss": 0.1059, + "step": 5260 + }, + { + "epoch": 0.24927742241175077, + "grad_norm": 0.54296875, + "learning_rate": 0.00017094984419663457, + "loss": 1.0911, + "step": 5261 + }, + { + "epoch": 0.24932480454868514, + "grad_norm": 0.279296875, + "learning_rate": 0.00017093934849475405, + "loss": 0.1458, + "step": 5262 + }, + { + "epoch": 0.24937218668561953, + "grad_norm": 0.640625, + "learning_rate": 0.00017092885121951427, + "loss": 0.7728, + "step": 5263 + }, + { + "epoch": 0.2494195688225539, + "grad_norm": 0.7578125, + "learning_rate": 0.00017091835237114818, + "loss": 1.1178, + "step": 5264 + }, + { + "epoch": 0.24946695095948826, + "grad_norm": 0.58203125, + "learning_rate": 0.00017090785194988852, + "loss": 0.9226, + "step": 5265 + }, + { + "epoch": 0.24951433309642265, + "grad_norm": 0.546875, + "learning_rate": 0.00017089734995596824, + "loss": 1.1267, + "step": 5266 + }, + { + "epoch": 0.24956171523335702, + "grad_norm": 0.55859375, + "learning_rate": 0.00017088684638962029, + "loss": 0.9419, + "step": 5267 + }, + { + "epoch": 0.2496090973702914, + "grad_norm": 0.478515625, + "learning_rate": 0.00017087634125107756, + "loss": 0.5949, + "step": 5268 + }, + { + "epoch": 0.24965647950722578, + "grad_norm": 0.359375, + "learning_rate": 0.0001708658345405731, + "loss": 0.0654, + "step": 5269 + }, + { + "epoch": 0.24970386164416014, + "grad_norm": 0.640625, + "learning_rate": 0.0001708553262583399, + "loss": 1.1039, + "step": 5270 + }, + { + "epoch": 0.24975124378109453, + "grad_norm": 0.46484375, + "learning_rate": 0.00017084481640461104, + "loss": 0.5722, + "step": 5271 + }, + { + "epoch": 0.2497986259180289, + "grad_norm": 0.5546875, + "learning_rate": 0.0001708343049796196, + "loss": 1.0248, + "step": 5272 + }, + { + "epoch": 0.2498460080549633, + "grad_norm": 0.890625, + "learning_rate": 0.00017082379198359875, + "loss": 0.7751, + "step": 5273 + }, + { + "epoch": 0.24989339019189766, + "grad_norm": 0.828125, + "learning_rate": 0.00017081327741678162, + "loss": 0.1469, + "step": 5274 + }, + { + "epoch": 0.24994077232883202, + "grad_norm": 0.7265625, + "learning_rate": 0.0001708027612794014, + "loss": 0.9905, + "step": 5275 + }, + { + "epoch": 0.24998815446576642, + "grad_norm": 0.73828125, + "learning_rate": 0.00017079224357169137, + "loss": 0.7281, + "step": 5276 + }, + { + "epoch": 0.2500355366027008, + "grad_norm": 0.70703125, + "learning_rate": 0.00017078172429388477, + "loss": 0.9772, + "step": 5277 + }, + { + "epoch": 0.2500829187396352, + "grad_norm": 0.34375, + "learning_rate": 0.00017077120344621496, + "loss": 0.0234, + "step": 5278 + }, + { + "epoch": 0.2501303008765695, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001707606810289152, + "loss": 0.001, + "step": 5279 + }, + { + "epoch": 0.2501776830135039, + "grad_norm": 0.6796875, + "learning_rate": 0.0001707501570422189, + "loss": 1.0387, + "step": 5280 + }, + { + "epoch": 0.2502250651504383, + "grad_norm": 0.59375, + "learning_rate": 0.0001707396314863595, + "loss": 0.0456, + "step": 5281 + }, + { + "epoch": 0.25027244728737263, + "grad_norm": 0.5546875, + "learning_rate": 0.0001707291043615704, + "loss": 0.9844, + "step": 5282 + }, + { + "epoch": 0.250319829424307, + "grad_norm": 0.33984375, + "learning_rate": 0.00017071857566808513, + "loss": 0.1872, + "step": 5283 + }, + { + "epoch": 0.2503672115612414, + "grad_norm": 0.458984375, + "learning_rate": 0.00017070804540613718, + "loss": 0.1904, + "step": 5284 + }, + { + "epoch": 0.2504145936981758, + "grad_norm": 0.671875, + "learning_rate": 0.0001706975135759601, + "loss": 1.0268, + "step": 5285 + }, + { + "epoch": 0.25046197583511015, + "grad_norm": 0.58984375, + "learning_rate": 0.0001706869801777874, + "loss": 1.2751, + "step": 5286 + }, + { + "epoch": 0.25050935797204454, + "grad_norm": 0.78515625, + "learning_rate": 0.00017067644521185288, + "loss": 1.1897, + "step": 5287 + }, + { + "epoch": 0.25055674010897894, + "grad_norm": 0.6640625, + "learning_rate": 0.00017066590867838999, + "loss": 1.3925, + "step": 5288 + }, + { + "epoch": 0.2506041222459133, + "grad_norm": 0.4921875, + "learning_rate": 0.00017065537057763257, + "loss": 0.1312, + "step": 5289 + }, + { + "epoch": 0.25065150438284767, + "grad_norm": 0.1953125, + "learning_rate": 0.00017064483090981428, + "loss": 0.0314, + "step": 5290 + }, + { + "epoch": 0.25069888651978206, + "grad_norm": 0.6640625, + "learning_rate": 0.00017063428967516888, + "loss": 1.44, + "step": 5291 + }, + { + "epoch": 0.2507462686567164, + "grad_norm": 0.78125, + "learning_rate": 0.0001706237468739302, + "loss": 0.7273, + "step": 5292 + }, + { + "epoch": 0.2507936507936508, + "grad_norm": 0.298828125, + "learning_rate": 0.000170613202506332, + "loss": 0.0644, + "step": 5293 + }, + { + "epoch": 0.2508410329305852, + "grad_norm": 0.546875, + "learning_rate": 0.00017060265657260822, + "loss": 1.2131, + "step": 5294 + }, + { + "epoch": 0.2508884150675195, + "grad_norm": 0.86328125, + "learning_rate": 0.00017059210907299267, + "loss": 0.408, + "step": 5295 + }, + { + "epoch": 0.2509357972044539, + "grad_norm": 0.76171875, + "learning_rate": 0.00017058156000771937, + "loss": 1.3831, + "step": 5296 + }, + { + "epoch": 0.2509831793413883, + "grad_norm": 0.267578125, + "learning_rate": 0.00017057100937702222, + "loss": 0.134, + "step": 5297 + }, + { + "epoch": 0.25103056147832264, + "grad_norm": 0.61328125, + "learning_rate": 0.00017056045718113528, + "loss": 1.0522, + "step": 5298 + }, + { + "epoch": 0.25107794361525704, + "grad_norm": 0.69921875, + "learning_rate": 0.00017054990342029255, + "loss": 1.4229, + "step": 5299 + }, + { + "epoch": 0.25112532575219143, + "grad_norm": 0.006072998046875, + "learning_rate": 0.0001705393480947281, + "loss": 0.0003, + "step": 5300 + }, + { + "epoch": 0.2511727078891258, + "grad_norm": 0.18359375, + "learning_rate": 0.00017052879120467605, + "loss": 0.0204, + "step": 5301 + }, + { + "epoch": 0.25122009002606016, + "grad_norm": 0.5859375, + "learning_rate": 0.00017051823275037053, + "loss": 1.1679, + "step": 5302 + }, + { + "epoch": 0.25126747216299455, + "grad_norm": 0.84375, + "learning_rate": 0.00017050767273204574, + "loss": 0.9325, + "step": 5303 + }, + { + "epoch": 0.25131485429992895, + "grad_norm": 0.447265625, + "learning_rate": 0.00017049711114993588, + "loss": 0.5116, + "step": 5304 + }, + { + "epoch": 0.2513622364368633, + "grad_norm": 0.06689453125, + "learning_rate": 0.00017048654800427512, + "loss": 0.0021, + "step": 5305 + }, + { + "epoch": 0.2514096185737977, + "grad_norm": 0.11376953125, + "learning_rate": 0.00017047598329529787, + "loss": 0.0169, + "step": 5306 + }, + { + "epoch": 0.25145700071073207, + "grad_norm": 0.07177734375, + "learning_rate": 0.00017046541702323836, + "loss": 0.0057, + "step": 5307 + }, + { + "epoch": 0.2515043828476664, + "grad_norm": 0.640625, + "learning_rate": 0.00017045484918833093, + "loss": 1.2136, + "step": 5308 + }, + { + "epoch": 0.2515517649846008, + "grad_norm": 0.47265625, + "learning_rate": 0.00017044427979081002, + "loss": 0.206, + "step": 5309 + }, + { + "epoch": 0.2515991471215352, + "grad_norm": 0.78125, + "learning_rate": 0.00017043370883091, + "loss": 0.695, + "step": 5310 + }, + { + "epoch": 0.25164652925846953, + "grad_norm": 0.197265625, + "learning_rate": 0.00017042313630886535, + "loss": 0.1355, + "step": 5311 + }, + { + "epoch": 0.2516939113954039, + "grad_norm": 0.56640625, + "learning_rate": 0.00017041256222491056, + "loss": 0.4462, + "step": 5312 + }, + { + "epoch": 0.2517412935323383, + "grad_norm": 0.234375, + "learning_rate": 0.00017040198657928011, + "loss": 0.1635, + "step": 5313 + }, + { + "epoch": 0.2517886756692727, + "grad_norm": 0.75, + "learning_rate": 0.00017039140937220862, + "loss": 1.4779, + "step": 5314 + }, + { + "epoch": 0.25183605780620705, + "grad_norm": 0.640625, + "learning_rate": 0.0001703808306039306, + "loss": 1.2994, + "step": 5315 + }, + { + "epoch": 0.25188343994314144, + "grad_norm": 0.59375, + "learning_rate": 0.0001703702502746808, + "loss": 1.1394, + "step": 5316 + }, + { + "epoch": 0.25193082208007583, + "grad_norm": 0.78125, + "learning_rate": 0.00017035966838469376, + "loss": 0.9651, + "step": 5317 + }, + { + "epoch": 0.25197820421701017, + "grad_norm": 0.79296875, + "learning_rate": 0.0001703490849342042, + "loss": 1.0763, + "step": 5318 + }, + { + "epoch": 0.25202558635394456, + "grad_norm": 0.64453125, + "learning_rate": 0.00017033849992344687, + "loss": 0.9423, + "step": 5319 + }, + { + "epoch": 0.25207296849087896, + "grad_norm": 0.65234375, + "learning_rate": 0.00017032791335265657, + "loss": 0.0622, + "step": 5320 + }, + { + "epoch": 0.2521203506278133, + "grad_norm": 0.91796875, + "learning_rate": 0.00017031732522206804, + "loss": 0.1328, + "step": 5321 + }, + { + "epoch": 0.2521677327647477, + "grad_norm": 0.5546875, + "learning_rate": 0.00017030673553191611, + "loss": 0.5537, + "step": 5322 + }, + { + "epoch": 0.2522151149016821, + "grad_norm": 0.56640625, + "learning_rate": 0.0001702961442824357, + "loss": 1.0053, + "step": 5323 + }, + { + "epoch": 0.2522624970386164, + "grad_norm": 0.26171875, + "learning_rate": 0.00017028555147386172, + "loss": 0.1488, + "step": 5324 + }, + { + "epoch": 0.2523098791755508, + "grad_norm": 0.65234375, + "learning_rate": 0.000170274957106429, + "loss": 0.6858, + "step": 5325 + }, + { + "epoch": 0.2523572613124852, + "grad_norm": 0.68359375, + "learning_rate": 0.00017026436118037266, + "loss": 1.3075, + "step": 5326 + }, + { + "epoch": 0.25240464344941954, + "grad_norm": 0.80078125, + "learning_rate": 0.00017025376369592758, + "loss": 1.209, + "step": 5327 + }, + { + "epoch": 0.25245202558635393, + "grad_norm": 0.50390625, + "learning_rate": 0.00017024316465332886, + "loss": 0.5749, + "step": 5328 + }, + { + "epoch": 0.2524994077232883, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017023256405281157, + "loss": 0.128, + "step": 5329 + }, + { + "epoch": 0.2525467898602227, + "grad_norm": 0.26953125, + "learning_rate": 0.0001702219618946108, + "loss": 0.0648, + "step": 5330 + }, + { + "epoch": 0.25259417199715706, + "grad_norm": 0.3046875, + "learning_rate": 0.00017021135817896178, + "loss": 0.0228, + "step": 5331 + }, + { + "epoch": 0.25264155413409145, + "grad_norm": 0.58203125, + "learning_rate": 0.00017020075290609957, + "loss": 1.1479, + "step": 5332 + }, + { + "epoch": 0.25268893627102584, + "grad_norm": 0.74609375, + "learning_rate": 0.00017019014607625943, + "loss": 1.4094, + "step": 5333 + }, + { + "epoch": 0.2527363184079602, + "grad_norm": 0.53515625, + "learning_rate": 0.00017017953768967662, + "loss": 1.2996, + "step": 5334 + }, + { + "epoch": 0.2527837005448946, + "grad_norm": 0.69140625, + "learning_rate": 0.00017016892774658642, + "loss": 0.0563, + "step": 5335 + }, + { + "epoch": 0.25283108268182897, + "grad_norm": 0.1943359375, + "learning_rate": 0.00017015831624722413, + "loss": 0.0816, + "step": 5336 + }, + { + "epoch": 0.2528784648187633, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017014770319182513, + "loss": 0.0362, + "step": 5337 + }, + { + "epoch": 0.2529258469556977, + "grad_norm": 0.71875, + "learning_rate": 0.0001701370885806248, + "loss": 0.7215, + "step": 5338 + }, + { + "epoch": 0.2529732290926321, + "grad_norm": 0.435546875, + "learning_rate": 0.00017012647241385856, + "loss": 0.0291, + "step": 5339 + }, + { + "epoch": 0.25302061122956643, + "grad_norm": 0.58984375, + "learning_rate": 0.00017011585469176184, + "loss": 1.0139, + "step": 5340 + }, + { + "epoch": 0.2530679933665008, + "grad_norm": 0.7421875, + "learning_rate": 0.00017010523541457015, + "loss": 0.8763, + "step": 5341 + }, + { + "epoch": 0.2531153755034352, + "grad_norm": 0.6484375, + "learning_rate": 0.000170094614582519, + "loss": 1.1539, + "step": 5342 + }, + { + "epoch": 0.2531627576403696, + "grad_norm": 0.90625, + "learning_rate": 0.00017008399219584398, + "loss": 0.7403, + "step": 5343 + }, + { + "epoch": 0.25321013977730394, + "grad_norm": 0.66015625, + "learning_rate": 0.00017007336825478064, + "loss": 0.9521, + "step": 5344 + }, + { + "epoch": 0.25325752191423834, + "grad_norm": 0.26953125, + "learning_rate": 0.00017006274275956461, + "loss": 0.0444, + "step": 5345 + }, + { + "epoch": 0.25330490405117273, + "grad_norm": 0.66015625, + "learning_rate": 0.0001700521157104316, + "loss": 0.6661, + "step": 5346 + }, + { + "epoch": 0.25335228618810707, + "grad_norm": 0.60546875, + "learning_rate": 0.00017004148710761732, + "loss": 1.1629, + "step": 5347 + }, + { + "epoch": 0.25339966832504146, + "grad_norm": 0.640625, + "learning_rate": 0.00017003085695135742, + "loss": 1.4518, + "step": 5348 + }, + { + "epoch": 0.25344705046197585, + "grad_norm": 0.71875, + "learning_rate": 0.0001700202252418877, + "loss": 1.1735, + "step": 5349 + }, + { + "epoch": 0.2534944325989102, + "grad_norm": 0.69140625, + "learning_rate": 0.000170009591979444, + "loss": 1.2322, + "step": 5350 + }, + { + "epoch": 0.2535418147358446, + "grad_norm": 0.50390625, + "learning_rate": 0.00016999895716426208, + "loss": 0.4126, + "step": 5351 + }, + { + "epoch": 0.253589196872779, + "grad_norm": 0.51171875, + "learning_rate": 0.00016998832079657787, + "loss": 0.0597, + "step": 5352 + }, + { + "epoch": 0.2536365790097133, + "grad_norm": 1.125, + "learning_rate": 0.00016997768287662724, + "loss": 0.3836, + "step": 5353 + }, + { + "epoch": 0.2536839611466477, + "grad_norm": 0.7578125, + "learning_rate": 0.00016996704340464612, + "loss": 1.1632, + "step": 5354 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 0.55859375, + "learning_rate": 0.0001699564023808706, + "loss": 1.2416, + "step": 5355 + }, + { + "epoch": 0.25377872542051644, + "grad_norm": 0.6328125, + "learning_rate": 0.0001699457598055365, + "loss": 0.913, + "step": 5356 + }, + { + "epoch": 0.25382610755745083, + "grad_norm": 0.65625, + "learning_rate": 0.00016993511567887996, + "loss": 1.0357, + "step": 5357 + }, + { + "epoch": 0.2538734896943852, + "grad_norm": 0.171875, + "learning_rate": 0.00016992447000113706, + "loss": 0.0206, + "step": 5358 + }, + { + "epoch": 0.2539208718313196, + "grad_norm": 0.71875, + "learning_rate": 0.00016991382277254391, + "loss": 0.8985, + "step": 5359 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 0.8046875, + "learning_rate": 0.0001699031739933366, + "loss": 1.1099, + "step": 5360 + }, + { + "epoch": 0.25401563610518835, + "grad_norm": 0.8046875, + "learning_rate": 0.00016989252366375138, + "loss": 0.32, + "step": 5361 + }, + { + "epoch": 0.25406301824212274, + "grad_norm": 1.3125, + "learning_rate": 0.00016988187178402443, + "loss": 0.3791, + "step": 5362 + }, + { + "epoch": 0.2541104003790571, + "grad_norm": 0.72265625, + "learning_rate": 0.00016987121835439198, + "loss": 1.2845, + "step": 5363 + }, + { + "epoch": 0.25415778251599147, + "grad_norm": 0.62109375, + "learning_rate": 0.00016986056337509034, + "loss": 0.893, + "step": 5364 + }, + { + "epoch": 0.25420516465292586, + "grad_norm": 0.6171875, + "learning_rate": 0.00016984990684635584, + "loss": 0.888, + "step": 5365 + }, + { + "epoch": 0.2542525467898602, + "grad_norm": 0.63671875, + "learning_rate": 0.00016983924876842478, + "loss": 1.352, + "step": 5366 + }, + { + "epoch": 0.2542999289267946, + "grad_norm": 0.53515625, + "learning_rate": 0.00016982858914153356, + "loss": 0.3745, + "step": 5367 + }, + { + "epoch": 0.254347311063729, + "grad_norm": 0.6484375, + "learning_rate": 0.00016981792796591866, + "loss": 1.3169, + "step": 5368 + }, + { + "epoch": 0.2543946932006633, + "grad_norm": 0.58203125, + "learning_rate": 0.00016980726524181642, + "loss": 1.0266, + "step": 5369 + }, + { + "epoch": 0.2544420753375977, + "grad_norm": 0.64453125, + "learning_rate": 0.00016979660096946343, + "loss": 0.8766, + "step": 5370 + }, + { + "epoch": 0.2544894574745321, + "grad_norm": 0.640625, + "learning_rate": 0.0001697859351490962, + "loss": 1.0002, + "step": 5371 + }, + { + "epoch": 0.2545368396114665, + "grad_norm": 0.337890625, + "learning_rate": 0.00016977526778095121, + "loss": 0.0449, + "step": 5372 + }, + { + "epoch": 0.25458422174840084, + "grad_norm": 0.703125, + "learning_rate": 0.00016976459886526514, + "loss": 1.2232, + "step": 5373 + }, + { + "epoch": 0.25463160388533523, + "grad_norm": 0.5078125, + "learning_rate": 0.00016975392840227455, + "loss": 0.6692, + "step": 5374 + }, + { + "epoch": 0.2546789860222696, + "grad_norm": 1.1640625, + "learning_rate": 0.00016974325639221616, + "loss": 1.3018, + "step": 5375 + }, + { + "epoch": 0.25472636815920396, + "grad_norm": 1.640625, + "learning_rate": 0.00016973258283532657, + "loss": 0.1251, + "step": 5376 + }, + { + "epoch": 0.25477375029613836, + "grad_norm": 0.69921875, + "learning_rate": 0.0001697219077318426, + "loss": 0.9578, + "step": 5377 + }, + { + "epoch": 0.25482113243307275, + "grad_norm": 0.515625, + "learning_rate": 0.00016971123108200102, + "loss": 0.7342, + "step": 5378 + }, + { + "epoch": 0.2548685145700071, + "grad_norm": 0.58203125, + "learning_rate": 0.00016970055288603858, + "loss": 1.1069, + "step": 5379 + }, + { + "epoch": 0.2549158967069415, + "grad_norm": 0.380859375, + "learning_rate": 0.00016968987314419203, + "loss": 0.0442, + "step": 5380 + }, + { + "epoch": 0.2549632788438759, + "grad_norm": 0.61328125, + "learning_rate": 0.00016967919185669842, + "loss": 1.1472, + "step": 5381 + }, + { + "epoch": 0.2550106609808102, + "grad_norm": 0.80859375, + "learning_rate": 0.0001696685090237945, + "loss": 0.598, + "step": 5382 + }, + { + "epoch": 0.2550580431177446, + "grad_norm": 0.70703125, + "learning_rate": 0.00016965782464571728, + "loss": 0.3313, + "step": 5383 + }, + { + "epoch": 0.255105425254679, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001696471387227037, + "loss": 0.1466, + "step": 5384 + }, + { + "epoch": 0.25515280739161333, + "grad_norm": 0.6484375, + "learning_rate": 0.00016963645125499079, + "loss": 0.8998, + "step": 5385 + }, + { + "epoch": 0.2552001895285477, + "grad_norm": 0.73046875, + "learning_rate": 0.00016962576224281554, + "loss": 0.1095, + "step": 5386 + }, + { + "epoch": 0.2552475716654821, + "grad_norm": 0.3125, + "learning_rate": 0.00016961507168641507, + "loss": 0.0474, + "step": 5387 + }, + { + "epoch": 0.2552949538024165, + "grad_norm": 0.6796875, + "learning_rate": 0.00016960437958602644, + "loss": 0.7683, + "step": 5388 + }, + { + "epoch": 0.25534233593935085, + "grad_norm": 0.640625, + "learning_rate": 0.00016959368594188681, + "loss": 1.4062, + "step": 5389 + }, + { + "epoch": 0.25538971807628524, + "grad_norm": 0.68359375, + "learning_rate": 0.00016958299075423336, + "loss": 0.9186, + "step": 5390 + }, + { + "epoch": 0.25543710021321964, + "grad_norm": 0.26953125, + "learning_rate": 0.00016957229402330329, + "loss": 0.0289, + "step": 5391 + }, + { + "epoch": 0.255484482350154, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001695615957493338, + "loss": 0.0273, + "step": 5392 + }, + { + "epoch": 0.25553186448708837, + "grad_norm": 0.515625, + "learning_rate": 0.00016955089593256227, + "loss": 0.5225, + "step": 5393 + }, + { + "epoch": 0.25557924662402276, + "grad_norm": 0.64453125, + "learning_rate": 0.00016954019457322595, + "loss": 1.5962, + "step": 5394 + }, + { + "epoch": 0.2556266287609571, + "grad_norm": 0.5234375, + "learning_rate": 0.00016952949167156216, + "loss": 0.8025, + "step": 5395 + }, + { + "epoch": 0.2556740108978915, + "grad_norm": 0.73828125, + "learning_rate": 0.00016951878722780832, + "loss": 0.8912, + "step": 5396 + }, + { + "epoch": 0.2557213930348259, + "grad_norm": 0.46875, + "learning_rate": 0.0001695080812422018, + "loss": 0.4736, + "step": 5397 + }, + { + "epoch": 0.2557687751717602, + "grad_norm": 1.2421875, + "learning_rate": 0.00016949737371498008, + "loss": 0.803, + "step": 5398 + }, + { + "epoch": 0.2558161573086946, + "grad_norm": 0.640625, + "learning_rate": 0.00016948666464638068, + "loss": 1.0973, + "step": 5399 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 0.451171875, + "learning_rate": 0.00016947595403664101, + "loss": 0.0214, + "step": 5400 + }, + { + "epoch": 0.2559109215825634, + "grad_norm": 1.0390625, + "learning_rate": 0.00016946524188599872, + "loss": 1.081, + "step": 5401 + }, + { + "epoch": 0.25595830371949774, + "grad_norm": 1.0546875, + "learning_rate": 0.00016945452819469136, + "loss": 0.5477, + "step": 5402 + }, + { + "epoch": 0.25600568585643213, + "grad_norm": 0.3203125, + "learning_rate": 0.00016944381296295653, + "loss": 0.1643, + "step": 5403 + }, + { + "epoch": 0.2560530679933665, + "grad_norm": 0.443359375, + "learning_rate": 0.00016943309619103192, + "loss": 0.2134, + "step": 5404 + }, + { + "epoch": 0.25610045013030086, + "grad_norm": 1.046875, + "learning_rate": 0.00016942237787915515, + "loss": 0.1236, + "step": 5405 + }, + { + "epoch": 0.25614783226723525, + "grad_norm": 0.4609375, + "learning_rate": 0.00016941165802756403, + "loss": 0.0374, + "step": 5406 + }, + { + "epoch": 0.25619521440416965, + "grad_norm": 0.5859375, + "learning_rate": 0.00016940093663649622, + "loss": 0.8897, + "step": 5407 + }, + { + "epoch": 0.256242596541104, + "grad_norm": 0.53125, + "learning_rate": 0.0001693902137061896, + "loss": 1.0275, + "step": 5408 + }, + { + "epoch": 0.2562899786780384, + "grad_norm": 0.337890625, + "learning_rate": 0.00016937948923688193, + "loss": 0.089, + "step": 5409 + }, + { + "epoch": 0.25633736081497277, + "grad_norm": 0.8359375, + "learning_rate": 0.0001693687632288111, + "loss": 1.2504, + "step": 5410 + }, + { + "epoch": 0.2563847429519071, + "grad_norm": 0.1962890625, + "learning_rate": 0.000169358035682215, + "loss": 0.1348, + "step": 5411 + }, + { + "epoch": 0.2564321250888415, + "grad_norm": 0.2412109375, + "learning_rate": 0.00016934730659733154, + "loss": 0.1577, + "step": 5412 + }, + { + "epoch": 0.2564795072257759, + "grad_norm": 0.65234375, + "learning_rate": 0.00016933657597439865, + "loss": 1.201, + "step": 5413 + }, + { + "epoch": 0.25652688936271023, + "grad_norm": 0.5390625, + "learning_rate": 0.00016932584381365438, + "loss": 0.5596, + "step": 5414 + }, + { + "epoch": 0.2565742714996446, + "grad_norm": 0.48828125, + "learning_rate": 0.00016931511011533673, + "loss": 0.4878, + "step": 5415 + }, + { + "epoch": 0.256621653636579, + "grad_norm": 0.5546875, + "learning_rate": 0.00016930437487968378, + "loss": 0.0254, + "step": 5416 + }, + { + "epoch": 0.2566690357735134, + "grad_norm": 0.71484375, + "learning_rate": 0.0001692936381069336, + "loss": 0.7329, + "step": 5417 + }, + { + "epoch": 0.25671641791044775, + "grad_norm": 0.36328125, + "learning_rate": 0.00016928289979732436, + "loss": 0.0926, + "step": 5418 + }, + { + "epoch": 0.25676380004738214, + "grad_norm": 0.671875, + "learning_rate": 0.0001692721599510942, + "loss": 0.4945, + "step": 5419 + }, + { + "epoch": 0.25681118218431653, + "grad_norm": 0.42578125, + "learning_rate": 0.0001692614185684813, + "loss": 0.0455, + "step": 5420 + }, + { + "epoch": 0.25685856432125087, + "grad_norm": 0.70703125, + "learning_rate": 0.00016925067564972393, + "loss": 1.5666, + "step": 5421 + }, + { + "epoch": 0.25690594645818526, + "grad_norm": 0.1953125, + "learning_rate": 0.0001692399311950603, + "loss": 0.1474, + "step": 5422 + }, + { + "epoch": 0.25695332859511966, + "grad_norm": 0.63671875, + "learning_rate": 0.0001692291852047288, + "loss": 1.1678, + "step": 5423 + }, + { + "epoch": 0.257000710732054, + "grad_norm": 0.6328125, + "learning_rate": 0.00016921843767896765, + "loss": 0.8908, + "step": 5424 + }, + { + "epoch": 0.2570480928689884, + "grad_norm": 0.58984375, + "learning_rate": 0.00016920768861801533, + "loss": 0.6911, + "step": 5425 + }, + { + "epoch": 0.2570954750059228, + "grad_norm": 0.216796875, + "learning_rate": 0.0001691969380221102, + "loss": 0.1496, + "step": 5426 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 0.7890625, + "learning_rate": 0.00016918618589149064, + "loss": 0.8787, + "step": 5427 + }, + { + "epoch": 0.2571902392797915, + "grad_norm": 0.671875, + "learning_rate": 0.0001691754322263952, + "loss": 1.2739, + "step": 5428 + }, + { + "epoch": 0.2572376214167259, + "grad_norm": 0.6796875, + "learning_rate": 0.00016916467702706236, + "loss": 0.8478, + "step": 5429 + }, + { + "epoch": 0.2572850035536603, + "grad_norm": 0.55078125, + "learning_rate": 0.00016915392029373064, + "loss": 0.8881, + "step": 5430 + }, + { + "epoch": 0.25733238569059463, + "grad_norm": 0.703125, + "learning_rate": 0.00016914316202663862, + "loss": 1.5181, + "step": 5431 + }, + { + "epoch": 0.257379767827529, + "grad_norm": 0.578125, + "learning_rate": 0.00016913240222602493, + "loss": 0.5271, + "step": 5432 + }, + { + "epoch": 0.2574271499644634, + "grad_norm": 0.5, + "learning_rate": 0.0001691216408921282, + "loss": 0.6417, + "step": 5433 + }, + { + "epoch": 0.25747453210139776, + "grad_norm": 0.314453125, + "learning_rate": 0.0001691108780251871, + "loss": 0.1801, + "step": 5434 + }, + { + "epoch": 0.25752191423833215, + "grad_norm": 0.5859375, + "learning_rate": 0.00016910011362544035, + "loss": 0.7382, + "step": 5435 + }, + { + "epoch": 0.25756929637526654, + "grad_norm": 0.59765625, + "learning_rate": 0.00016908934769312666, + "loss": 1.7052, + "step": 5436 + }, + { + "epoch": 0.2576166785122009, + "grad_norm": 0.921875, + "learning_rate": 0.00016907858022848483, + "loss": 1.3022, + "step": 5437 + }, + { + "epoch": 0.2576640606491353, + "grad_norm": 0.56640625, + "learning_rate": 0.00016906781123175366, + "loss": 0.5217, + "step": 5438 + }, + { + "epoch": 0.25771144278606967, + "grad_norm": 0.7421875, + "learning_rate": 0.000169057040703172, + "loss": 0.6668, + "step": 5439 + }, + { + "epoch": 0.257758824923004, + "grad_norm": 0.6484375, + "learning_rate": 0.00016904626864297875, + "loss": 1.2352, + "step": 5440 + }, + { + "epoch": 0.2578062070599384, + "grad_norm": 0.8359375, + "learning_rate": 0.00016903549505141284, + "loss": 0.9384, + "step": 5441 + }, + { + "epoch": 0.2578535891968728, + "grad_norm": 0.62890625, + "learning_rate": 0.00016902471992871315, + "loss": 1.3833, + "step": 5442 + }, + { + "epoch": 0.2579009713338071, + "grad_norm": 0.55078125, + "learning_rate": 0.0001690139432751187, + "loss": 0.7121, + "step": 5443 + }, + { + "epoch": 0.2579483534707415, + "grad_norm": 0.625, + "learning_rate": 0.00016900316509086847, + "loss": 1.2244, + "step": 5444 + }, + { + "epoch": 0.2579957356076759, + "grad_norm": 0.10791015625, + "learning_rate": 0.00016899238537620154, + "loss": 0.0141, + "step": 5445 + }, + { + "epoch": 0.2580431177446103, + "grad_norm": 0.73828125, + "learning_rate": 0.00016898160413135701, + "loss": 1.0926, + "step": 5446 + }, + { + "epoch": 0.25809049988154464, + "grad_norm": 0.6328125, + "learning_rate": 0.00016897082135657399, + "loss": 1.0492, + "step": 5447 + }, + { + "epoch": 0.25813788201847904, + "grad_norm": 0.38671875, + "learning_rate": 0.00016896003705209157, + "loss": 0.1845, + "step": 5448 + }, + { + "epoch": 0.25818526415541343, + "grad_norm": 0.62890625, + "learning_rate": 0.00016894925121814906, + "loss": 1.3174, + "step": 5449 + }, + { + "epoch": 0.25823264629234777, + "grad_norm": 0.296875, + "learning_rate": 0.00016893846385498552, + "loss": 0.0335, + "step": 5450 + }, + { + "epoch": 0.25828002842928216, + "grad_norm": 0.67578125, + "learning_rate": 0.00016892767496284034, + "loss": 1.2171, + "step": 5451 + }, + { + "epoch": 0.25832741056621655, + "grad_norm": 0.70703125, + "learning_rate": 0.00016891688454195273, + "loss": 1.2409, + "step": 5452 + }, + { + "epoch": 0.2583747927031509, + "grad_norm": 0.48046875, + "learning_rate": 0.000168906092592562, + "loss": 0.5802, + "step": 5453 + }, + { + "epoch": 0.2584221748400853, + "grad_norm": 0.63671875, + "learning_rate": 0.00016889529911490753, + "loss": 0.9941, + "step": 5454 + }, + { + "epoch": 0.2584695569770197, + "grad_norm": 0.002838134765625, + "learning_rate": 0.00016888450410922876, + "loss": 0.0003, + "step": 5455 + }, + { + "epoch": 0.258516939113954, + "grad_norm": 0.7890625, + "learning_rate": 0.000168873707575765, + "loss": 0.2259, + "step": 5456 + }, + { + "epoch": 0.2585643212508884, + "grad_norm": 0.6015625, + "learning_rate": 0.00016886290951475584, + "loss": 0.9563, + "step": 5457 + }, + { + "epoch": 0.2586117033878228, + "grad_norm": 0.51171875, + "learning_rate": 0.00016885210992644066, + "loss": 0.0681, + "step": 5458 + }, + { + "epoch": 0.2586590855247572, + "grad_norm": 0.7734375, + "learning_rate": 0.000168841308811059, + "loss": 1.1659, + "step": 5459 + }, + { + "epoch": 0.25870646766169153, + "grad_norm": 0.640625, + "learning_rate": 0.00016883050616885043, + "loss": 1.6042, + "step": 5460 + }, + { + "epoch": 0.2587538497986259, + "grad_norm": 0.87109375, + "learning_rate": 0.0001688197020000546, + "loss": 0.6201, + "step": 5461 + }, + { + "epoch": 0.2588012319355603, + "grad_norm": 0.6328125, + "learning_rate": 0.00016880889630491104, + "loss": 1.339, + "step": 5462 + }, + { + "epoch": 0.25884861407249465, + "grad_norm": 0.48046875, + "learning_rate": 0.00016879808908365945, + "loss": 1.2221, + "step": 5463 + }, + { + "epoch": 0.25889599620942905, + "grad_norm": 0.72265625, + "learning_rate": 0.00016878728033653957, + "loss": 1.1975, + "step": 5464 + }, + { + "epoch": 0.25894337834636344, + "grad_norm": 0.66015625, + "learning_rate": 0.00016877647006379104, + "loss": 0.8915, + "step": 5465 + }, + { + "epoch": 0.2589907604832978, + "grad_norm": 0.498046875, + "learning_rate": 0.00016876565826565366, + "loss": 0.2214, + "step": 5466 + }, + { + "epoch": 0.25903814262023217, + "grad_norm": 0.53515625, + "learning_rate": 0.00016875484494236726, + "loss": 0.0409, + "step": 5467 + }, + { + "epoch": 0.25908552475716656, + "grad_norm": 0.74609375, + "learning_rate": 0.00016874403009417162, + "loss": 1.0784, + "step": 5468 + }, + { + "epoch": 0.2591329068941009, + "grad_norm": 0.1640625, + "learning_rate": 0.0001687332137213066, + "loss": 0.0318, + "step": 5469 + }, + { + "epoch": 0.2591802890310353, + "grad_norm": 0.6640625, + "learning_rate": 0.0001687223958240121, + "loss": 1.1415, + "step": 5470 + }, + { + "epoch": 0.2592276711679697, + "grad_norm": 0.60546875, + "learning_rate": 0.00016871157640252807, + "loss": 0.7716, + "step": 5471 + }, + { + "epoch": 0.259275053304904, + "grad_norm": 0.404296875, + "learning_rate": 0.00016870075545709449, + "loss": 0.0613, + "step": 5472 + }, + { + "epoch": 0.2593224354418384, + "grad_norm": 0.462890625, + "learning_rate": 0.0001686899329879513, + "loss": 0.0502, + "step": 5473 + }, + { + "epoch": 0.2593698175787728, + "grad_norm": 0.283203125, + "learning_rate": 0.00016867910899533858, + "loss": 0.0127, + "step": 5474 + }, + { + "epoch": 0.2594171997157072, + "grad_norm": 0.58203125, + "learning_rate": 0.00016866828347949638, + "loss": 0.7669, + "step": 5475 + }, + { + "epoch": 0.25946458185264154, + "grad_norm": 0.6875, + "learning_rate": 0.0001686574564406648, + "loss": 0.7168, + "step": 5476 + }, + { + "epoch": 0.25951196398957593, + "grad_norm": 0.6640625, + "learning_rate": 0.00016864662787908393, + "loss": 0.9622, + "step": 5477 + }, + { + "epoch": 0.2595593461265103, + "grad_norm": 0.7578125, + "learning_rate": 0.00016863579779499398, + "loss": 1.2405, + "step": 5478 + }, + { + "epoch": 0.25960672826344466, + "grad_norm": 0.71484375, + "learning_rate": 0.00016862496618863518, + "loss": 0.9796, + "step": 5479 + }, + { + "epoch": 0.25965411040037906, + "grad_norm": 0.546875, + "learning_rate": 0.0001686141330602477, + "loss": 1.1535, + "step": 5480 + }, + { + "epoch": 0.25970149253731345, + "grad_norm": 0.498046875, + "learning_rate": 0.00016860329841007182, + "loss": 1.0441, + "step": 5481 + }, + { + "epoch": 0.2597488746742478, + "grad_norm": 0.030029296875, + "learning_rate": 0.0001685924622383479, + "loss": 0.0022, + "step": 5482 + }, + { + "epoch": 0.2597962568111822, + "grad_norm": 0.7109375, + "learning_rate": 0.00016858162454531618, + "loss": 1.1805, + "step": 5483 + }, + { + "epoch": 0.2598436389481166, + "grad_norm": 0.103515625, + "learning_rate": 0.0001685707853312171, + "loss": 0.0136, + "step": 5484 + }, + { + "epoch": 0.2598910210850509, + "grad_norm": 0.6328125, + "learning_rate": 0.00016855994459629106, + "loss": 0.9291, + "step": 5485 + }, + { + "epoch": 0.2599384032219853, + "grad_norm": 0.80078125, + "learning_rate": 0.00016854910234077842, + "loss": 1.4879, + "step": 5486 + }, + { + "epoch": 0.2599857853589197, + "grad_norm": 0.8828125, + "learning_rate": 0.00016853825856491972, + "loss": 1.1442, + "step": 5487 + }, + { + "epoch": 0.2600331674958541, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001685274132689555, + "loss": 0.1475, + "step": 5488 + }, + { + "epoch": 0.2600805496327884, + "grad_norm": 0.84765625, + "learning_rate": 0.00016851656645312622, + "loss": 1.3088, + "step": 5489 + }, + { + "epoch": 0.2601279317697228, + "grad_norm": 0.66015625, + "learning_rate": 0.00016850571811767245, + "loss": 0.2319, + "step": 5490 + }, + { + "epoch": 0.2601753139066572, + "grad_norm": 0.6640625, + "learning_rate": 0.00016849486826283486, + "loss": 1.084, + "step": 5491 + }, + { + "epoch": 0.26022269604359155, + "grad_norm": 0.69921875, + "learning_rate": 0.000168484016888854, + "loss": 1.3469, + "step": 5492 + }, + { + "epoch": 0.26027007818052594, + "grad_norm": 0.5859375, + "learning_rate": 0.00016847316399597065, + "loss": 1.4763, + "step": 5493 + }, + { + "epoch": 0.26031746031746034, + "grad_norm": 0.431640625, + "learning_rate": 0.00016846230958442542, + "loss": 0.6448, + "step": 5494 + }, + { + "epoch": 0.2603648424543947, + "grad_norm": 0.75, + "learning_rate": 0.0001684514536544591, + "loss": 1.0415, + "step": 5495 + }, + { + "epoch": 0.26041222459132907, + "grad_norm": 0.85546875, + "learning_rate": 0.00016844059620631244, + "loss": 1.5532, + "step": 5496 + }, + { + "epoch": 0.26045960672826346, + "grad_norm": 0.74609375, + "learning_rate": 0.00016842973724022625, + "loss": 0.0721, + "step": 5497 + }, + { + "epoch": 0.2605069888651978, + "grad_norm": 0.70703125, + "learning_rate": 0.0001684188767564414, + "loss": 0.976, + "step": 5498 + }, + { + "epoch": 0.2605543710021322, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001684080147551987, + "loss": 0.0077, + "step": 5499 + }, + { + "epoch": 0.2606017531390666, + "grad_norm": 0.69921875, + "learning_rate": 0.00016839715123673915, + "loss": 1.5189, + "step": 5500 + }, + { + "epoch": 0.2606491352760009, + "grad_norm": 0.66015625, + "learning_rate": 0.00016838628620130362, + "loss": 0.9867, + "step": 5501 + }, + { + "epoch": 0.2606965174129353, + "grad_norm": 0.349609375, + "learning_rate": 0.00016837541964913306, + "loss": 0.4231, + "step": 5502 + }, + { + "epoch": 0.2607438995498697, + "grad_norm": 1.09375, + "learning_rate": 0.00016836455158046856, + "loss": 1.5931, + "step": 5503 + }, + { + "epoch": 0.2607912816868041, + "grad_norm": 0.5546875, + "learning_rate": 0.00016835368199555112, + "loss": 1.2388, + "step": 5504 + }, + { + "epoch": 0.26083866382373844, + "grad_norm": 0.29296875, + "learning_rate": 0.00016834281089462186, + "loss": 0.0145, + "step": 5505 + }, + { + "epoch": 0.26088604596067283, + "grad_norm": 0.74609375, + "learning_rate": 0.00016833193827792177, + "loss": 0.621, + "step": 5506 + }, + { + "epoch": 0.2609334280976072, + "grad_norm": 0.51171875, + "learning_rate": 0.00016832106414569213, + "loss": 0.7694, + "step": 5507 + }, + { + "epoch": 0.26098081023454156, + "grad_norm": 0.69921875, + "learning_rate": 0.00016831018849817407, + "loss": 0.7878, + "step": 5508 + }, + { + "epoch": 0.26102819237147595, + "grad_norm": 1.6015625, + "learning_rate": 0.00016829931133560875, + "loss": 0.0324, + "step": 5509 + }, + { + "epoch": 0.26107557450841035, + "grad_norm": 0.16015625, + "learning_rate": 0.00016828843265823747, + "loss": 0.0147, + "step": 5510 + }, + { + "epoch": 0.2611229566453447, + "grad_norm": 0.671875, + "learning_rate": 0.00016827755246630148, + "loss": 0.7277, + "step": 5511 + }, + { + "epoch": 0.2611703387822791, + "grad_norm": 0.5625, + "learning_rate": 0.00016826667076004213, + "loss": 0.5882, + "step": 5512 + }, + { + "epoch": 0.26121772091921347, + "grad_norm": 0.84765625, + "learning_rate": 0.0001682557875397007, + "loss": 0.8579, + "step": 5513 + }, + { + "epoch": 0.2612651030561478, + "grad_norm": 0.470703125, + "learning_rate": 0.00016824490280551864, + "loss": 0.2415, + "step": 5514 + }, + { + "epoch": 0.2613124851930822, + "grad_norm": 0.640625, + "learning_rate": 0.0001682340165577373, + "loss": 0.7873, + "step": 5515 + }, + { + "epoch": 0.2613598673300166, + "grad_norm": 0.67578125, + "learning_rate": 0.00016822312879659817, + "loss": 1.012, + "step": 5516 + }, + { + "epoch": 0.261407249466951, + "grad_norm": 0.50390625, + "learning_rate": 0.0001682122395223427, + "loss": 0.0454, + "step": 5517 + }, + { + "epoch": 0.2614546316038853, + "grad_norm": 0.8046875, + "learning_rate": 0.0001682013487352124, + "loss": 0.176, + "step": 5518 + }, + { + "epoch": 0.2615020137408197, + "grad_norm": 0.96875, + "learning_rate": 0.00016819045643544885, + "loss": 0.254, + "step": 5519 + }, + { + "epoch": 0.2615493958777541, + "grad_norm": 0.6171875, + "learning_rate": 0.00016817956262329362, + "loss": 0.8728, + "step": 5520 + }, + { + "epoch": 0.26159677801468845, + "grad_norm": 0.94921875, + "learning_rate": 0.0001681686672989883, + "loss": 1.3349, + "step": 5521 + }, + { + "epoch": 0.26164416015162284, + "grad_norm": 0.76171875, + "learning_rate": 0.00016815777046277455, + "loss": 0.5688, + "step": 5522 + }, + { + "epoch": 0.26169154228855723, + "grad_norm": 0.70703125, + "learning_rate": 0.00016814687211489404, + "loss": 1.051, + "step": 5523 + }, + { + "epoch": 0.26173892442549157, + "grad_norm": 0.57421875, + "learning_rate": 0.0001681359722555885, + "loss": 0.9618, + "step": 5524 + }, + { + "epoch": 0.26178630656242596, + "grad_norm": 0.408203125, + "learning_rate": 0.0001681250708850997, + "loss": 0.7285, + "step": 5525 + }, + { + "epoch": 0.26183368869936036, + "grad_norm": 0.52734375, + "learning_rate": 0.00016811416800366935, + "loss": 1.0602, + "step": 5526 + }, + { + "epoch": 0.2618810708362947, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001681032636115393, + "loss": 0.0395, + "step": 5527 + }, + { + "epoch": 0.2619284529732291, + "grad_norm": 0.296875, + "learning_rate": 0.00016809235770895144, + "loss": 0.0628, + "step": 5528 + }, + { + "epoch": 0.2619758351101635, + "grad_norm": 0.7578125, + "learning_rate": 0.00016808145029614758, + "loss": 0.8732, + "step": 5529 + }, + { + "epoch": 0.2620232172470978, + "grad_norm": 0.8359375, + "learning_rate": 0.00016807054137336973, + "loss": 1.06, + "step": 5530 + }, + { + "epoch": 0.2620705993840322, + "grad_norm": 0.048583984375, + "learning_rate": 0.0001680596309408597, + "loss": 0.0016, + "step": 5531 + }, + { + "epoch": 0.2621179815209666, + "grad_norm": 0.025634765625, + "learning_rate": 0.0001680487189988596, + "loss": 0.0016, + "step": 5532 + }, + { + "epoch": 0.262165363657901, + "grad_norm": 0.78125, + "learning_rate": 0.00016803780554761137, + "loss": 1.2372, + "step": 5533 + }, + { + "epoch": 0.26221274579483533, + "grad_norm": 0.6328125, + "learning_rate": 0.0001680268905873571, + "loss": 0.8734, + "step": 5534 + }, + { + "epoch": 0.2622601279317697, + "grad_norm": 0.0810546875, + "learning_rate": 0.00016801597411833885, + "loss": 0.0055, + "step": 5535 + }, + { + "epoch": 0.2623075100687041, + "grad_norm": 0.7109375, + "learning_rate": 0.00016800505614079876, + "loss": 1.2974, + "step": 5536 + }, + { + "epoch": 0.26235489220563846, + "grad_norm": 0.59375, + "learning_rate": 0.00016799413665497892, + "loss": 1.3831, + "step": 5537 + }, + { + "epoch": 0.26240227434257285, + "grad_norm": 0.59765625, + "learning_rate": 0.00016798321566112158, + "loss": 1.1895, + "step": 5538 + }, + { + "epoch": 0.26244965647950724, + "grad_norm": 0.65234375, + "learning_rate": 0.0001679722931594689, + "loss": 1.0775, + "step": 5539 + }, + { + "epoch": 0.2624970386164416, + "grad_norm": 0.640625, + "learning_rate": 0.0001679613691502632, + "loss": 0.8848, + "step": 5540 + }, + { + "epoch": 0.262544420753376, + "grad_norm": 0.6484375, + "learning_rate": 0.00016795044363374673, + "loss": 0.5862, + "step": 5541 + }, + { + "epoch": 0.26259180289031037, + "grad_norm": 0.59375, + "learning_rate": 0.00016793951661016175, + "loss": 0.9577, + "step": 5542 + }, + { + "epoch": 0.2626391850272447, + "grad_norm": 0.765625, + "learning_rate": 0.00016792858807975066, + "loss": 1.3822, + "step": 5543 + }, + { + "epoch": 0.2626865671641791, + "grad_norm": 0.640625, + "learning_rate": 0.0001679176580427559, + "loss": 0.962, + "step": 5544 + }, + { + "epoch": 0.2627339493011135, + "grad_norm": 0.69140625, + "learning_rate": 0.00016790672649941976, + "loss": 1.5334, + "step": 5545 + }, + { + "epoch": 0.2627813314380479, + "grad_norm": 0.6640625, + "learning_rate": 0.0001678957934499848, + "loss": 0.9279, + "step": 5546 + }, + { + "epoch": 0.2628287135749822, + "grad_norm": 0.466796875, + "learning_rate": 0.00016788485889469344, + "loss": 0.4792, + "step": 5547 + }, + { + "epoch": 0.2628760957119166, + "grad_norm": 0.7421875, + "learning_rate": 0.00016787392283378822, + "loss": 1.0632, + "step": 5548 + }, + { + "epoch": 0.262923477848851, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016786298526751166, + "loss": 0.0135, + "step": 5549 + }, + { + "epoch": 0.26297085998578534, + "grad_norm": 0.64453125, + "learning_rate": 0.00016785204619610642, + "loss": 0.8517, + "step": 5550 + }, + { + "epoch": 0.26301824212271974, + "grad_norm": 0.400390625, + "learning_rate": 0.00016784110561981507, + "loss": 0.0653, + "step": 5551 + }, + { + "epoch": 0.26306562425965413, + "grad_norm": 0.63671875, + "learning_rate": 0.00016783016353888024, + "loss": 1.2957, + "step": 5552 + }, + { + "epoch": 0.26311300639658847, + "grad_norm": 0.57421875, + "learning_rate": 0.00016781921995354462, + "loss": 0.7068, + "step": 5553 + }, + { + "epoch": 0.26316038853352286, + "grad_norm": 0.64453125, + "learning_rate": 0.00016780827486405096, + "loss": 1.0027, + "step": 5554 + }, + { + "epoch": 0.26320777067045725, + "grad_norm": 0.65625, + "learning_rate": 0.00016779732827064197, + "loss": 0.104, + "step": 5555 + }, + { + "epoch": 0.2632551528073916, + "grad_norm": 0.84765625, + "learning_rate": 0.00016778638017356045, + "loss": 0.1015, + "step": 5556 + }, + { + "epoch": 0.263302534944326, + "grad_norm": 0.78125, + "learning_rate": 0.00016777543057304922, + "loss": 0.1114, + "step": 5557 + }, + { + "epoch": 0.2633499170812604, + "grad_norm": 0.58984375, + "learning_rate": 0.00016776447946935115, + "loss": 0.0839, + "step": 5558 + }, + { + "epoch": 0.2633972992181947, + "grad_norm": 0.765625, + "learning_rate": 0.00016775352686270912, + "loss": 0.3187, + "step": 5559 + }, + { + "epoch": 0.2634446813551291, + "grad_norm": 0.4765625, + "learning_rate": 0.000167742572753366, + "loss": 0.5824, + "step": 5560 + }, + { + "epoch": 0.2634920634920635, + "grad_norm": 0.306640625, + "learning_rate": 0.00016773161714156478, + "loss": 0.1804, + "step": 5561 + }, + { + "epoch": 0.2635394456289979, + "grad_norm": 0.3984375, + "learning_rate": 0.00016772066002754846, + "loss": 0.0095, + "step": 5562 + }, + { + "epoch": 0.26358682776593223, + "grad_norm": 0.76171875, + "learning_rate": 0.00016770970141155998, + "loss": 1.1152, + "step": 5563 + }, + { + "epoch": 0.2636342099028666, + "grad_norm": 0.61328125, + "learning_rate": 0.00016769874129384248, + "loss": 1.1267, + "step": 5564 + }, + { + "epoch": 0.263681592039801, + "grad_norm": 0.65234375, + "learning_rate": 0.000167687779674639, + "loss": 1.0091, + "step": 5565 + }, + { + "epoch": 0.26372897417673535, + "grad_norm": 0.7109375, + "learning_rate": 0.00016767681655419268, + "loss": 0.7512, + "step": 5566 + }, + { + "epoch": 0.26377635631366975, + "grad_norm": 0.1064453125, + "learning_rate": 0.00016766585193274665, + "loss": 0.0189, + "step": 5567 + }, + { + "epoch": 0.26382373845060414, + "grad_norm": 0.484375, + "learning_rate": 0.0001676548858105441, + "loss": 0.4407, + "step": 5568 + }, + { + "epoch": 0.2638711205875385, + "grad_norm": 0.703125, + "learning_rate": 0.00016764391818782824, + "loss": 0.8798, + "step": 5569 + }, + { + "epoch": 0.26391850272447287, + "grad_norm": 0.70703125, + "learning_rate": 0.00016763294906484234, + "loss": 1.0353, + "step": 5570 + }, + { + "epoch": 0.26396588486140726, + "grad_norm": 0.6015625, + "learning_rate": 0.00016762197844182963, + "loss": 1.4533, + "step": 5571 + }, + { + "epoch": 0.2640132669983416, + "grad_norm": 0.6328125, + "learning_rate": 0.00016761100631903348, + "loss": 0.1956, + "step": 5572 + }, + { + "epoch": 0.264060649135276, + "grad_norm": 0.326171875, + "learning_rate": 0.00016760003269669725, + "loss": 0.0535, + "step": 5573 + }, + { + "epoch": 0.2641080312722104, + "grad_norm": 0.7109375, + "learning_rate": 0.00016758905757506426, + "loss": 1.2123, + "step": 5574 + }, + { + "epoch": 0.2641554134091448, + "grad_norm": 0.408203125, + "learning_rate": 0.000167578080954378, + "loss": 0.3087, + "step": 5575 + }, + { + "epoch": 0.2642027955460791, + "grad_norm": 0.470703125, + "learning_rate": 0.0001675671028348819, + "loss": 1.1576, + "step": 5576 + }, + { + "epoch": 0.2642501776830135, + "grad_norm": 0.076171875, + "learning_rate": 0.00016755612321681936, + "loss": 0.002, + "step": 5577 + }, + { + "epoch": 0.2642975598199479, + "grad_norm": 0.57421875, + "learning_rate": 0.00016754514210043402, + "loss": 0.1876, + "step": 5578 + }, + { + "epoch": 0.26434494195688224, + "grad_norm": 0.76953125, + "learning_rate": 0.00016753415948596935, + "loss": 1.0717, + "step": 5579 + }, + { + "epoch": 0.26439232409381663, + "grad_norm": 0.7421875, + "learning_rate": 0.00016752317537366897, + "loss": 1.1072, + "step": 5580 + }, + { + "epoch": 0.264439706230751, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001675121897637765, + "loss": 0.0101, + "step": 5581 + }, + { + "epoch": 0.26448708836768536, + "grad_norm": 0.671875, + "learning_rate": 0.00016750120265653554, + "loss": 1.4156, + "step": 5582 + }, + { + "epoch": 0.26453447050461976, + "grad_norm": 0.13671875, + "learning_rate": 0.0001674902140521898, + "loss": 0.0341, + "step": 5583 + }, + { + "epoch": 0.26458185264155415, + "grad_norm": 0.0108642578125, + "learning_rate": 0.00016747922395098303, + "loss": 0.0006, + "step": 5584 + }, + { + "epoch": 0.2646292347784885, + "grad_norm": 0.78125, + "learning_rate": 0.00016746823235315895, + "loss": 1.0785, + "step": 5585 + }, + { + "epoch": 0.2646766169154229, + "grad_norm": 0.17578125, + "learning_rate": 0.0001674572392589613, + "loss": 0.1082, + "step": 5586 + }, + { + "epoch": 0.2647239990523573, + "grad_norm": 0.62890625, + "learning_rate": 0.00016744624466863395, + "loss": 0.9585, + "step": 5587 + }, + { + "epoch": 0.2647713811892916, + "grad_norm": 0.79296875, + "learning_rate": 0.00016743524858242075, + "loss": 0.853, + "step": 5588 + }, + { + "epoch": 0.264818763326226, + "grad_norm": 0.1083984375, + "learning_rate": 0.00016742425100056555, + "loss": 0.0078, + "step": 5589 + }, + { + "epoch": 0.2648661454631604, + "grad_norm": 1.0625, + "learning_rate": 0.0001674132519233123, + "loss": 1.1826, + "step": 5590 + }, + { + "epoch": 0.2649135276000948, + "grad_norm": 0.2314453125, + "learning_rate": 0.00016740225135090491, + "loss": 0.1609, + "step": 5591 + }, + { + "epoch": 0.2649609097370291, + "grad_norm": 0.63671875, + "learning_rate": 0.0001673912492835874, + "loss": 0.9908, + "step": 5592 + }, + { + "epoch": 0.2650082918739635, + "grad_norm": 0.984375, + "learning_rate": 0.00016738024572160375, + "loss": 0.7295, + "step": 5593 + }, + { + "epoch": 0.2650556740108979, + "grad_norm": 0.65234375, + "learning_rate": 0.000167369240665198, + "loss": 0.8419, + "step": 5594 + }, + { + "epoch": 0.26510305614783225, + "grad_norm": 0.62890625, + "learning_rate": 0.0001673582341146143, + "loss": 0.5265, + "step": 5595 + }, + { + "epoch": 0.26515043828476664, + "grad_norm": 0.55078125, + "learning_rate": 0.00016734722607009665, + "loss": 0.8965, + "step": 5596 + }, + { + "epoch": 0.26519782042170104, + "grad_norm": 0.46484375, + "learning_rate": 0.00016733621653188931, + "loss": 0.2007, + "step": 5597 + }, + { + "epoch": 0.2652452025586354, + "grad_norm": 0.66015625, + "learning_rate": 0.00016732520550023642, + "loss": 0.685, + "step": 5598 + }, + { + "epoch": 0.26529258469556977, + "grad_norm": 0.5078125, + "learning_rate": 0.00016731419297538212, + "loss": 0.5059, + "step": 5599 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 0.88671875, + "learning_rate": 0.0001673031789575708, + "loss": 1.1605, + "step": 5600 + }, + { + "epoch": 0.2653873489694385, + "grad_norm": 0.54296875, + "learning_rate": 0.00016729216344704662, + "loss": 0.6541, + "step": 5601 + }, + { + "epoch": 0.2654347311063729, + "grad_norm": 0.5625, + "learning_rate": 0.00016728114644405388, + "loss": 1.5109, + "step": 5602 + }, + { + "epoch": 0.2654821132433073, + "grad_norm": 0.5625, + "learning_rate": 0.00016727012794883703, + "loss": 0.6519, + "step": 5603 + }, + { + "epoch": 0.2655294953802417, + "grad_norm": 0.55078125, + "learning_rate": 0.0001672591079616404, + "loss": 1.0167, + "step": 5604 + }, + { + "epoch": 0.265576877517176, + "grad_norm": 0.72265625, + "learning_rate": 0.0001672480864827084, + "loss": 0.6678, + "step": 5605 + }, + { + "epoch": 0.2656242596541104, + "grad_norm": 0.47265625, + "learning_rate": 0.00016723706351228545, + "loss": 0.6186, + "step": 5606 + }, + { + "epoch": 0.2656716417910448, + "grad_norm": 0.703125, + "learning_rate": 0.00016722603905061604, + "loss": 0.7193, + "step": 5607 + }, + { + "epoch": 0.26571902392797914, + "grad_norm": 0.7421875, + "learning_rate": 0.00016721501309794473, + "loss": 0.7566, + "step": 5608 + }, + { + "epoch": 0.26576640606491353, + "grad_norm": 0.478515625, + "learning_rate": 0.000167203985654516, + "loss": 0.7573, + "step": 5609 + }, + { + "epoch": 0.2658137882018479, + "grad_norm": 0.45703125, + "learning_rate": 0.00016719295672057443, + "loss": 0.5047, + "step": 5610 + }, + { + "epoch": 0.26586117033878226, + "grad_norm": 0.83203125, + "learning_rate": 0.00016718192629636467, + "loss": 0.5751, + "step": 5611 + }, + { + "epoch": 0.26590855247571665, + "grad_norm": 0.79296875, + "learning_rate": 0.00016717089438213133, + "loss": 0.9762, + "step": 5612 + }, + { + "epoch": 0.26595593461265105, + "grad_norm": 0.55859375, + "learning_rate": 0.00016715986097811912, + "loss": 1.1714, + "step": 5613 + }, + { + "epoch": 0.2660033167495854, + "grad_norm": 0.87109375, + "learning_rate": 0.00016714882608457273, + "loss": 1.4798, + "step": 5614 + }, + { + "epoch": 0.2660506988865198, + "grad_norm": 0.8984375, + "learning_rate": 0.0001671377897017369, + "loss": 1.0433, + "step": 5615 + }, + { + "epoch": 0.26609808102345417, + "grad_norm": 0.703125, + "learning_rate": 0.0001671267518298564, + "loss": 0.8132, + "step": 5616 + }, + { + "epoch": 0.2661454631603885, + "grad_norm": 0.7734375, + "learning_rate": 0.00016711571246917607, + "loss": 1.2701, + "step": 5617 + }, + { + "epoch": 0.2661928452973229, + "grad_norm": 0.52734375, + "learning_rate": 0.0001671046716199407, + "loss": 0.7974, + "step": 5618 + }, + { + "epoch": 0.2662402274342573, + "grad_norm": 0.95703125, + "learning_rate": 0.00016709362928239515, + "loss": 1.0338, + "step": 5619 + }, + { + "epoch": 0.2662876095711917, + "grad_norm": 0.7421875, + "learning_rate": 0.00016708258545678444, + "loss": 0.6452, + "step": 5620 + }, + { + "epoch": 0.266334991708126, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001670715401433534, + "loss": 0.0214, + "step": 5621 + }, + { + "epoch": 0.2663823738450604, + "grad_norm": 0.77734375, + "learning_rate": 0.00016706049334234705, + "loss": 1.0348, + "step": 5622 + }, + { + "epoch": 0.2664297559819948, + "grad_norm": 1.1484375, + "learning_rate": 0.0001670494450540104, + "loss": 0.6407, + "step": 5623 + }, + { + "epoch": 0.26647713811892915, + "grad_norm": 0.68359375, + "learning_rate": 0.00016703839527858848, + "loss": 1.1821, + "step": 5624 + }, + { + "epoch": 0.26652452025586354, + "grad_norm": 1.2890625, + "learning_rate": 0.00016702734401632632, + "loss": 0.3451, + "step": 5625 + }, + { + "epoch": 0.26657190239279793, + "grad_norm": 0.2578125, + "learning_rate": 0.00016701629126746908, + "loss": 0.1856, + "step": 5626 + }, + { + "epoch": 0.26661928452973227, + "grad_norm": 0.8515625, + "learning_rate": 0.0001670052370322619, + "loss": 0.0511, + "step": 5627 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.65625, + "learning_rate": 0.0001669941813109499, + "loss": 1.1102, + "step": 5628 + }, + { + "epoch": 0.26671404880360106, + "grad_norm": 0.2255859375, + "learning_rate": 0.00016698312410377833, + "loss": 0.1564, + "step": 5629 + }, + { + "epoch": 0.2667614309405354, + "grad_norm": 0.302734375, + "learning_rate": 0.00016697206541099245, + "loss": 0.1795, + "step": 5630 + }, + { + "epoch": 0.2668088130774698, + "grad_norm": 0.84765625, + "learning_rate": 0.00016696100523283744, + "loss": 1.0263, + "step": 5631 + }, + { + "epoch": 0.2668561952144042, + "grad_norm": 0.765625, + "learning_rate": 0.0001669499435695587, + "loss": 0.5158, + "step": 5632 + }, + { + "epoch": 0.2669035773513386, + "grad_norm": 0.51171875, + "learning_rate": 0.00016693888042140152, + "loss": 1.1575, + "step": 5633 + }, + { + "epoch": 0.2669509594882729, + "grad_norm": 0.83203125, + "learning_rate": 0.00016692781578861126, + "loss": 1.0503, + "step": 5634 + }, + { + "epoch": 0.2669983416252073, + "grad_norm": 0.5078125, + "learning_rate": 0.0001669167496714333, + "loss": 0.9357, + "step": 5635 + }, + { + "epoch": 0.2670457237621417, + "grad_norm": 0.6640625, + "learning_rate": 0.00016690568207011313, + "loss": 0.9347, + "step": 5636 + }, + { + "epoch": 0.26709310589907603, + "grad_norm": 0.765625, + "learning_rate": 0.00016689461298489625, + "loss": 1.0529, + "step": 5637 + }, + { + "epoch": 0.2671404880360104, + "grad_norm": 0.64453125, + "learning_rate": 0.00016688354241602805, + "loss": 0.8088, + "step": 5638 + }, + { + "epoch": 0.2671878701729448, + "grad_norm": 0.71484375, + "learning_rate": 0.00016687247036375414, + "loss": 1.321, + "step": 5639 + }, + { + "epoch": 0.26723525230987916, + "grad_norm": 0.81640625, + "learning_rate": 0.00016686139682832004, + "loss": 0.8933, + "step": 5640 + }, + { + "epoch": 0.26728263444681355, + "grad_norm": 0.546875, + "learning_rate": 0.00016685032180997143, + "loss": 0.8912, + "step": 5641 + }, + { + "epoch": 0.26733001658374794, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016683924530895385, + "loss": 0.0123, + "step": 5642 + }, + { + "epoch": 0.2673773987206823, + "grad_norm": 0.7734375, + "learning_rate": 0.000166828167325513, + "loss": 1.3312, + "step": 5643 + }, + { + "epoch": 0.2674247808576167, + "grad_norm": 0.166015625, + "learning_rate": 0.0001668170878598946, + "loss": 0.0041, + "step": 5644 + }, + { + "epoch": 0.26747216299455107, + "grad_norm": 0.69921875, + "learning_rate": 0.00016680600691234437, + "loss": 0.8034, + "step": 5645 + }, + { + "epoch": 0.2675195451314854, + "grad_norm": 0.875, + "learning_rate": 0.00016679492448310804, + "loss": 0.4828, + "step": 5646 + }, + { + "epoch": 0.2675669272684198, + "grad_norm": 0.27734375, + "learning_rate": 0.00016678384057243145, + "loss": 0.0189, + "step": 5647 + }, + { + "epoch": 0.2676143094053542, + "grad_norm": 0.69140625, + "learning_rate": 0.00016677275518056037, + "loss": 1.4275, + "step": 5648 + }, + { + "epoch": 0.2676616915422886, + "grad_norm": 0.57421875, + "learning_rate": 0.00016676166830774074, + "loss": 0.9209, + "step": 5649 + }, + { + "epoch": 0.2677090736792229, + "grad_norm": 0.59375, + "learning_rate": 0.00016675057995421843, + "loss": 0.7652, + "step": 5650 + }, + { + "epoch": 0.2677564558161573, + "grad_norm": 0.51953125, + "learning_rate": 0.00016673949012023932, + "loss": 0.5517, + "step": 5651 + }, + { + "epoch": 0.2678038379530917, + "grad_norm": 0.71875, + "learning_rate": 0.00016672839880604943, + "loss": 0.9392, + "step": 5652 + }, + { + "epoch": 0.26785122009002604, + "grad_norm": 0.75390625, + "learning_rate": 0.00016671730601189473, + "loss": 1.086, + "step": 5653 + }, + { + "epoch": 0.26789860222696044, + "grad_norm": 0.64453125, + "learning_rate": 0.00016670621173802124, + "loss": 1.0273, + "step": 5654 + }, + { + "epoch": 0.26794598436389483, + "grad_norm": 0.11767578125, + "learning_rate": 0.00016669511598467503, + "loss": 0.0101, + "step": 5655 + }, + { + "epoch": 0.26799336650082917, + "grad_norm": 0.69140625, + "learning_rate": 0.0001666840187521022, + "loss": 1.0378, + "step": 5656 + }, + { + "epoch": 0.26804074863776356, + "grad_norm": 0.73828125, + "learning_rate": 0.00016667292004054885, + "loss": 0.6257, + "step": 5657 + }, + { + "epoch": 0.26808813077469795, + "grad_norm": 0.60546875, + "learning_rate": 0.00016666181985026113, + "loss": 0.6897, + "step": 5658 + }, + { + "epoch": 0.2681355129116323, + "grad_norm": 0.6484375, + "learning_rate": 0.00016665071818148527, + "loss": 0.8365, + "step": 5659 + }, + { + "epoch": 0.2681828950485667, + "grad_norm": 0.68359375, + "learning_rate": 0.00016663961503446748, + "loss": 0.2519, + "step": 5660 + }, + { + "epoch": 0.2682302771855011, + "grad_norm": 1.3828125, + "learning_rate": 0.000166628510409454, + "loss": 0.4015, + "step": 5661 + }, + { + "epoch": 0.26827765932243547, + "grad_norm": 0.72265625, + "learning_rate": 0.00016661740430669116, + "loss": 0.8856, + "step": 5662 + }, + { + "epoch": 0.2683250414593698, + "grad_norm": 0.54296875, + "learning_rate": 0.0001666062967264252, + "loss": 1.2056, + "step": 5663 + }, + { + "epoch": 0.2683724235963042, + "grad_norm": 0.734375, + "learning_rate": 0.00016659518766890257, + "loss": 1.3711, + "step": 5664 + }, + { + "epoch": 0.2684198057332386, + "grad_norm": 0.671875, + "learning_rate": 0.00016658407713436956, + "loss": 0.8598, + "step": 5665 + }, + { + "epoch": 0.26846718787017293, + "grad_norm": 0.83203125, + "learning_rate": 0.00016657296512307266, + "loss": 1.4123, + "step": 5666 + }, + { + "epoch": 0.2685145700071073, + "grad_norm": 0.63671875, + "learning_rate": 0.00016656185163525832, + "loss": 0.8251, + "step": 5667 + }, + { + "epoch": 0.2685619521440417, + "grad_norm": 0.91015625, + "learning_rate": 0.000166550736671173, + "loss": 0.1422, + "step": 5668 + }, + { + "epoch": 0.26860933428097605, + "grad_norm": 0.67578125, + "learning_rate": 0.00016653962023106323, + "loss": 1.0764, + "step": 5669 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 1.0, + "learning_rate": 0.00016652850231517558, + "loss": 1.195, + "step": 5670 + }, + { + "epoch": 0.26870409855484484, + "grad_norm": 0.6328125, + "learning_rate": 0.0001665173829237566, + "loss": 0.7752, + "step": 5671 + }, + { + "epoch": 0.2687514806917792, + "grad_norm": 0.5625, + "learning_rate": 0.0001665062620570529, + "loss": 0.8655, + "step": 5672 + }, + { + "epoch": 0.26879886282871357, + "grad_norm": 0.81640625, + "learning_rate": 0.00016649513971531114, + "loss": 1.2131, + "step": 5673 + }, + { + "epoch": 0.26884624496564796, + "grad_norm": 0.80859375, + "learning_rate": 0.00016648401589877804, + "loss": 1.2834, + "step": 5674 + }, + { + "epoch": 0.2688936271025823, + "grad_norm": 0.2578125, + "learning_rate": 0.00016647289060770027, + "loss": 0.1421, + "step": 5675 + }, + { + "epoch": 0.2689410092395167, + "grad_norm": 0.66015625, + "learning_rate": 0.00016646176384232456, + "loss": 1.0725, + "step": 5676 + }, + { + "epoch": 0.2689883913764511, + "grad_norm": 0.671875, + "learning_rate": 0.00016645063560289777, + "loss": 1.1293, + "step": 5677 + }, + { + "epoch": 0.2690357735133855, + "grad_norm": 0.75, + "learning_rate": 0.00016643950588966663, + "loss": 1.3876, + "step": 5678 + }, + { + "epoch": 0.2690831556503198, + "grad_norm": 0.69921875, + "learning_rate": 0.00016642837470287803, + "loss": 0.9904, + "step": 5679 + }, + { + "epoch": 0.2691305377872542, + "grad_norm": 0.6328125, + "learning_rate": 0.0001664172420427788, + "loss": 0.8577, + "step": 5680 + }, + { + "epoch": 0.2691779199241886, + "grad_norm": 1.375, + "learning_rate": 0.00016640610790961591, + "loss": 0.2961, + "step": 5681 + }, + { + "epoch": 0.26922530206112294, + "grad_norm": 0.7109375, + "learning_rate": 0.0001663949723036363, + "loss": 1.6089, + "step": 5682 + }, + { + "epoch": 0.26927268419805733, + "grad_norm": 0.62890625, + "learning_rate": 0.0001663838352250869, + "loss": 0.948, + "step": 5683 + }, + { + "epoch": 0.2693200663349917, + "grad_norm": 0.6171875, + "learning_rate": 0.0001663726966742148, + "loss": 0.9098, + "step": 5684 + }, + { + "epoch": 0.26936744847192606, + "grad_norm": 0.5625, + "learning_rate": 0.00016636155665126693, + "loss": 0.6937, + "step": 5685 + }, + { + "epoch": 0.26941483060886046, + "grad_norm": 0.0673828125, + "learning_rate": 0.00016635041515649043, + "loss": 0.0027, + "step": 5686 + }, + { + "epoch": 0.26946221274579485, + "grad_norm": 0.68359375, + "learning_rate": 0.00016633927219013238, + "loss": 0.2111, + "step": 5687 + }, + { + "epoch": 0.2695095948827292, + "grad_norm": 1.09375, + "learning_rate": 0.00016632812775243996, + "loss": 0.6147, + "step": 5688 + }, + { + "epoch": 0.2695569770196636, + "grad_norm": 0.7578125, + "learning_rate": 0.0001663169818436603, + "loss": 0.7323, + "step": 5689 + }, + { + "epoch": 0.269604359156598, + "grad_norm": 0.55078125, + "learning_rate": 0.00016630583446404062, + "loss": 0.4665, + "step": 5690 + }, + { + "epoch": 0.26965174129353237, + "grad_norm": 0.91796875, + "learning_rate": 0.00016629468561382815, + "loss": 0.795, + "step": 5691 + }, + { + "epoch": 0.2696991234304667, + "grad_norm": 0.54296875, + "learning_rate": 0.00016628353529327022, + "loss": 0.5587, + "step": 5692 + }, + { + "epoch": 0.2697465055674011, + "grad_norm": 0.7109375, + "learning_rate": 0.00016627238350261402, + "loss": 1.3128, + "step": 5693 + }, + { + "epoch": 0.2697938877043355, + "grad_norm": 0.6953125, + "learning_rate": 0.00016626123024210697, + "loss": 1.0178, + "step": 5694 + }, + { + "epoch": 0.2698412698412698, + "grad_norm": 0.58203125, + "learning_rate": 0.0001662500755119964, + "loss": 0.2749, + "step": 5695 + }, + { + "epoch": 0.2698886519782042, + "grad_norm": 0.28515625, + "learning_rate": 0.00016623891931252972, + "loss": 0.1534, + "step": 5696 + }, + { + "epoch": 0.2699360341151386, + "grad_norm": 0.9765625, + "learning_rate": 0.00016622776164395436, + "loss": 1.0676, + "step": 5697 + }, + { + "epoch": 0.26998341625207295, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001662166025065178, + "loss": 0.0204, + "step": 5698 + }, + { + "epoch": 0.27003079838900734, + "grad_norm": 0.59375, + "learning_rate": 0.00016620544190046752, + "loss": 1.3422, + "step": 5699 + }, + { + "epoch": 0.27007818052594174, + "grad_norm": 0.83203125, + "learning_rate": 0.0001661942798260511, + "loss": 0.9876, + "step": 5700 + }, + { + "epoch": 0.2701255626628761, + "grad_norm": 0.1904296875, + "learning_rate": 0.000166183116283516, + "loss": 0.018, + "step": 5701 + }, + { + "epoch": 0.27017294479981047, + "grad_norm": 0.7109375, + "learning_rate": 0.0001661719512731099, + "loss": 1.3744, + "step": 5702 + }, + { + "epoch": 0.27022032693674486, + "grad_norm": 0.671875, + "learning_rate": 0.0001661607847950804, + "loss": 0.0785, + "step": 5703 + }, + { + "epoch": 0.2702677090736792, + "grad_norm": 0.0869140625, + "learning_rate": 0.0001661496168496752, + "loss": 0.0065, + "step": 5704 + }, + { + "epoch": 0.2703150912106136, + "grad_norm": 0.69140625, + "learning_rate": 0.0001661384474371419, + "loss": 1.119, + "step": 5705 + }, + { + "epoch": 0.270362473347548, + "grad_norm": 0.78125, + "learning_rate": 0.00016612727655772831, + "loss": 1.0204, + "step": 5706 + }, + { + "epoch": 0.2704098554844824, + "grad_norm": 0.640625, + "learning_rate": 0.00016611610421168215, + "loss": 0.8329, + "step": 5707 + }, + { + "epoch": 0.2704572376214167, + "grad_norm": 0.59375, + "learning_rate": 0.00016610493039925125, + "loss": 0.9431, + "step": 5708 + }, + { + "epoch": 0.2705046197583511, + "grad_norm": 0.0849609375, + "learning_rate": 0.00016609375512068337, + "loss": 0.0081, + "step": 5709 + }, + { + "epoch": 0.2705520018952855, + "grad_norm": 1.078125, + "learning_rate": 0.00016608257837622646, + "loss": 0.2527, + "step": 5710 + }, + { + "epoch": 0.27059938403221984, + "grad_norm": 0.59375, + "learning_rate": 0.00016607140016612826, + "loss": 1.0873, + "step": 5711 + }, + { + "epoch": 0.27064676616915423, + "grad_norm": 0.7265625, + "learning_rate": 0.00016606022049063686, + "loss": 1.42, + "step": 5712 + }, + { + "epoch": 0.2706941483060886, + "grad_norm": 0.5703125, + "learning_rate": 0.0001660490393500001, + "loss": 0.0884, + "step": 5713 + }, + { + "epoch": 0.27074153044302296, + "grad_norm": 0.6015625, + "learning_rate": 0.000166037856744466, + "loss": 1.1958, + "step": 5714 + }, + { + "epoch": 0.27078891257995735, + "grad_norm": 0.2001953125, + "learning_rate": 0.00016602667267428257, + "loss": 0.1684, + "step": 5715 + }, + { + "epoch": 0.27083629471689175, + "grad_norm": 0.578125, + "learning_rate": 0.0001660154871396979, + "loss": 0.5702, + "step": 5716 + }, + { + "epoch": 0.2708836768538261, + "grad_norm": 0.7109375, + "learning_rate": 0.00016600430014096, + "loss": 0.872, + "step": 5717 + }, + { + "epoch": 0.2709310589907605, + "grad_norm": 0.58984375, + "learning_rate": 0.00016599311167831706, + "loss": 1.018, + "step": 5718 + }, + { + "epoch": 0.27097844112769487, + "grad_norm": 0.484375, + "learning_rate": 0.00016598192175201718, + "loss": 0.3869, + "step": 5719 + }, + { + "epoch": 0.27102582326462926, + "grad_norm": 0.6484375, + "learning_rate": 0.00016597073036230854, + "loss": 0.7992, + "step": 5720 + }, + { + "epoch": 0.2710732054015636, + "grad_norm": 0.09033203125, + "learning_rate": 0.0001659595375094394, + "loss": 0.0172, + "step": 5721 + }, + { + "epoch": 0.271120587538498, + "grad_norm": 0.466796875, + "learning_rate": 0.00016594834319365797, + "loss": 0.5832, + "step": 5722 + }, + { + "epoch": 0.2711679696754324, + "grad_norm": 0.57421875, + "learning_rate": 0.00016593714741521253, + "loss": 0.6701, + "step": 5723 + }, + { + "epoch": 0.2712153518123667, + "grad_norm": 0.6015625, + "learning_rate": 0.0001659259501743514, + "loss": 0.8962, + "step": 5724 + }, + { + "epoch": 0.2712627339493011, + "grad_norm": 0.59765625, + "learning_rate": 0.0001659147514713229, + "loss": 0.8608, + "step": 5725 + }, + { + "epoch": 0.2713101160862355, + "grad_norm": 0.7265625, + "learning_rate": 0.00016590355130637546, + "loss": 0.9534, + "step": 5726 + }, + { + "epoch": 0.27135749822316985, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0001658923496797574, + "loss": 0.001, + "step": 5727 + }, + { + "epoch": 0.27140488036010424, + "grad_norm": 0.22265625, + "learning_rate": 0.00016588114659171722, + "loss": 0.0697, + "step": 5728 + }, + { + "epoch": 0.27145226249703863, + "grad_norm": 0.5703125, + "learning_rate": 0.00016586994204250338, + "loss": 1.0821, + "step": 5729 + }, + { + "epoch": 0.27149964463397297, + "grad_norm": 0.62890625, + "learning_rate": 0.0001658587360323644, + "loss": 0.9673, + "step": 5730 + }, + { + "epoch": 0.27154702677090736, + "grad_norm": 0.890625, + "learning_rate": 0.0001658475285615488, + "loss": 0.4256, + "step": 5731 + }, + { + "epoch": 0.27159440890784176, + "grad_norm": 0.53515625, + "learning_rate": 0.00016583631963030518, + "loss": 0.9804, + "step": 5732 + }, + { + "epoch": 0.2716417910447761, + "grad_norm": 0.65234375, + "learning_rate": 0.0001658251092388821, + "loss": 0.9756, + "step": 5733 + }, + { + "epoch": 0.2716891731817105, + "grad_norm": 1.1015625, + "learning_rate": 0.0001658138973875282, + "loss": 1.2051, + "step": 5734 + }, + { + "epoch": 0.2717365553186449, + "grad_norm": 0.59765625, + "learning_rate": 0.00016580268407649214, + "loss": 0.1696, + "step": 5735 + }, + { + "epoch": 0.27178393745557927, + "grad_norm": 0.60546875, + "learning_rate": 0.00016579146930602266, + "loss": 0.1459, + "step": 5736 + }, + { + "epoch": 0.2718313195925136, + "grad_norm": 0.6328125, + "learning_rate": 0.00016578025307636846, + "loss": 0.9619, + "step": 5737 + }, + { + "epoch": 0.271878701729448, + "grad_norm": 0.640625, + "learning_rate": 0.00016576903538777834, + "loss": 1.0552, + "step": 5738 + }, + { + "epoch": 0.2719260838663824, + "grad_norm": 0.6328125, + "learning_rate": 0.00016575781624050104, + "loss": 0.7923, + "step": 5739 + }, + { + "epoch": 0.27197346600331673, + "grad_norm": 0.416015625, + "learning_rate": 0.00016574659563478543, + "loss": 0.1956, + "step": 5740 + }, + { + "epoch": 0.2720208481402511, + "grad_norm": 0.8046875, + "learning_rate": 0.00016573537357088036, + "loss": 0.6846, + "step": 5741 + }, + { + "epoch": 0.2720682302771855, + "grad_norm": 0.70703125, + "learning_rate": 0.0001657241500490347, + "loss": 1.2539, + "step": 5742 + }, + { + "epoch": 0.27211561241411986, + "grad_norm": 0.443359375, + "learning_rate": 0.00016571292506949742, + "loss": 0.1037, + "step": 5743 + }, + { + "epoch": 0.27216299455105425, + "grad_norm": 0.5390625, + "learning_rate": 0.00016570169863251746, + "loss": 0.7723, + "step": 5744 + }, + { + "epoch": 0.27221037668798864, + "grad_norm": 0.8046875, + "learning_rate": 0.00016569047073834378, + "loss": 1.033, + "step": 5745 + }, + { + "epoch": 0.272257758824923, + "grad_norm": 0.54296875, + "learning_rate": 0.00016567924138722546, + "loss": 1.2771, + "step": 5746 + }, + { + "epoch": 0.2723051409618574, + "grad_norm": 0.390625, + "learning_rate": 0.0001656680105794115, + "loss": 0.0527, + "step": 5747 + }, + { + "epoch": 0.27235252309879177, + "grad_norm": 0.71875, + "learning_rate": 0.00016565677831515104, + "loss": 1.0547, + "step": 5748 + }, + { + "epoch": 0.27239990523572616, + "grad_norm": 0.7734375, + "learning_rate": 0.00016564554459469317, + "loss": 0.8906, + "step": 5749 + }, + { + "epoch": 0.2724472873726605, + "grad_norm": 0.67578125, + "learning_rate": 0.000165634309418287, + "loss": 1.0512, + "step": 5750 + }, + { + "epoch": 0.2724946695095949, + "grad_norm": 0.72265625, + "learning_rate": 0.00016562307278618178, + "loss": 1.1705, + "step": 5751 + }, + { + "epoch": 0.2725420516465293, + "grad_norm": 0.7734375, + "learning_rate": 0.00016561183469862673, + "loss": 1.1822, + "step": 5752 + }, + { + "epoch": 0.2725894337834636, + "grad_norm": 0.61328125, + "learning_rate": 0.00016560059515587105, + "loss": 0.7487, + "step": 5753 + }, + { + "epoch": 0.272636815920398, + "grad_norm": 0.51171875, + "learning_rate": 0.00016558935415816403, + "loss": 0.6775, + "step": 5754 + }, + { + "epoch": 0.2726841980573324, + "grad_norm": 0.84375, + "learning_rate": 0.00016557811170575504, + "loss": 0.9706, + "step": 5755 + }, + { + "epoch": 0.27273158019426674, + "grad_norm": 0.703125, + "learning_rate": 0.00016556686779889337, + "loss": 0.9756, + "step": 5756 + }, + { + "epoch": 0.27277896233120114, + "grad_norm": 0.056396484375, + "learning_rate": 0.0001655556224378284, + "loss": 0.0017, + "step": 5757 + }, + { + "epoch": 0.27282634446813553, + "grad_norm": 0.68359375, + "learning_rate": 0.00016554437562280952, + "loss": 1.2672, + "step": 5758 + }, + { + "epoch": 0.27287372660506987, + "grad_norm": 0.82421875, + "learning_rate": 0.00016553312735408625, + "loss": 0.1707, + "step": 5759 + }, + { + "epoch": 0.27292110874200426, + "grad_norm": 0.0133056640625, + "learning_rate": 0.00016552187763190797, + "loss": 0.0005, + "step": 5760 + }, + { + "epoch": 0.27296849087893865, + "grad_norm": 0.96875, + "learning_rate": 0.0001655106264565243, + "loss": 1.0667, + "step": 5761 + }, + { + "epoch": 0.273015873015873, + "grad_norm": 0.01031494140625, + "learning_rate": 0.00016549937382818466, + "loss": 0.0005, + "step": 5762 + }, + { + "epoch": 0.2730632551528074, + "grad_norm": 0.76171875, + "learning_rate": 0.0001654881197471387, + "loss": 0.9846, + "step": 5763 + }, + { + "epoch": 0.2731106372897418, + "grad_norm": 0.8125, + "learning_rate": 0.00016547686421363602, + "loss": 1.5557, + "step": 5764 + }, + { + "epoch": 0.27315801942667617, + "grad_norm": 1.2421875, + "learning_rate": 0.0001654656072279262, + "loss": 1.2228, + "step": 5765 + }, + { + "epoch": 0.2732054015636105, + "grad_norm": 0.62109375, + "learning_rate": 0.00016545434879025893, + "loss": 0.0552, + "step": 5766 + }, + { + "epoch": 0.2732527837005449, + "grad_norm": 0.283203125, + "learning_rate": 0.00016544308890088395, + "loss": 0.1073, + "step": 5767 + }, + { + "epoch": 0.2733001658374793, + "grad_norm": 1.015625, + "learning_rate": 0.00016543182756005096, + "loss": 1.0415, + "step": 5768 + }, + { + "epoch": 0.27334754797441363, + "grad_norm": 0.259765625, + "learning_rate": 0.00016542056476800973, + "loss": 0.0164, + "step": 5769 + }, + { + "epoch": 0.273394930111348, + "grad_norm": 0.74609375, + "learning_rate": 0.00016540930052501006, + "loss": 0.9602, + "step": 5770 + }, + { + "epoch": 0.2734423122482824, + "grad_norm": 0.609375, + "learning_rate": 0.00016539803483130177, + "loss": 1.3939, + "step": 5771 + }, + { + "epoch": 0.27348969438521675, + "grad_norm": 0.6328125, + "learning_rate": 0.00016538676768713476, + "loss": 1.0973, + "step": 5772 + }, + { + "epoch": 0.27353707652215115, + "grad_norm": 0.46484375, + "learning_rate": 0.00016537549909275881, + "loss": 0.0422, + "step": 5773 + }, + { + "epoch": 0.27358445865908554, + "grad_norm": 0.203125, + "learning_rate": 0.00016536422904842398, + "loss": 0.0074, + "step": 5774 + }, + { + "epoch": 0.2736318407960199, + "grad_norm": 0.53515625, + "learning_rate": 0.00016535295755438017, + "loss": 0.8604, + "step": 5775 + }, + { + "epoch": 0.27367922293295427, + "grad_norm": 0.53515625, + "learning_rate": 0.00016534168461087738, + "loss": 0.2881, + "step": 5776 + }, + { + "epoch": 0.27372660506988866, + "grad_norm": 0.61328125, + "learning_rate": 0.0001653304102181656, + "loss": 0.9498, + "step": 5777 + }, + { + "epoch": 0.27377398720682306, + "grad_norm": 0.734375, + "learning_rate": 0.0001653191343764949, + "loss": 1.4866, + "step": 5778 + }, + { + "epoch": 0.2738213693437574, + "grad_norm": 0.96484375, + "learning_rate": 0.0001653078570861154, + "loss": 1.2301, + "step": 5779 + }, + { + "epoch": 0.2738687514806918, + "grad_norm": 0.455078125, + "learning_rate": 0.0001652965783472772, + "loss": 0.0687, + "step": 5780 + }, + { + "epoch": 0.2739161336176262, + "grad_norm": 0.57421875, + "learning_rate": 0.0001652852981602304, + "loss": 1.2015, + "step": 5781 + }, + { + "epoch": 0.2739635157545605, + "grad_norm": 0.53515625, + "learning_rate": 0.00016527401652522528, + "loss": 0.6943, + "step": 5782 + }, + { + "epoch": 0.2740108978914949, + "grad_norm": 0.373046875, + "learning_rate": 0.00016526273344251197, + "loss": 0.0311, + "step": 5783 + }, + { + "epoch": 0.2740582800284293, + "grad_norm": 0.8359375, + "learning_rate": 0.00016525144891234078, + "loss": 1.2077, + "step": 5784 + }, + { + "epoch": 0.27410566216536364, + "grad_norm": 0.0498046875, + "learning_rate": 0.0001652401629349619, + "loss": 0.0035, + "step": 5785 + }, + { + "epoch": 0.27415304430229803, + "grad_norm": 0.7421875, + "learning_rate": 0.0001652288755106257, + "loss": 0.5082, + "step": 5786 + }, + { + "epoch": 0.2742004264392324, + "grad_norm": 1.2421875, + "learning_rate": 0.0001652175866395826, + "loss": 0.9112, + "step": 5787 + }, + { + "epoch": 0.27424780857616676, + "grad_norm": 0.62890625, + "learning_rate": 0.0001652062963220828, + "loss": 1.5539, + "step": 5788 + }, + { + "epoch": 0.27429519071310116, + "grad_norm": 0.7109375, + "learning_rate": 0.00016519500455837688, + "loss": 1.0714, + "step": 5789 + }, + { + "epoch": 0.27434257285003555, + "grad_norm": 0.5390625, + "learning_rate": 0.00016518371134871513, + "loss": 0.6494, + "step": 5790 + }, + { + "epoch": 0.2743899549869699, + "grad_norm": 0.2353515625, + "learning_rate": 0.00016517241669334812, + "loss": 0.151, + "step": 5791 + }, + { + "epoch": 0.2744373371239043, + "grad_norm": 0.50390625, + "learning_rate": 0.00016516112059252635, + "loss": 0.3159, + "step": 5792 + }, + { + "epoch": 0.27448471926083867, + "grad_norm": 0.85546875, + "learning_rate": 0.0001651498230465003, + "loss": 1.2159, + "step": 5793 + }, + { + "epoch": 0.27453210139777307, + "grad_norm": 0.458984375, + "learning_rate": 0.00016513852405552058, + "loss": 0.1881, + "step": 5794 + }, + { + "epoch": 0.2745794835347074, + "grad_norm": 0.7109375, + "learning_rate": 0.00016512722361983778, + "loss": 0.9019, + "step": 5795 + }, + { + "epoch": 0.2746268656716418, + "grad_norm": 0.7578125, + "learning_rate": 0.00016511592173970257, + "loss": 1.2556, + "step": 5796 + }, + { + "epoch": 0.2746742478085762, + "grad_norm": 0.609375, + "learning_rate": 0.0001651046184153655, + "loss": 1.1222, + "step": 5797 + }, + { + "epoch": 0.2747216299455105, + "grad_norm": 0.07421875, + "learning_rate": 0.00016509331364707739, + "loss": 0.0055, + "step": 5798 + }, + { + "epoch": 0.2747690120824449, + "grad_norm": 0.53515625, + "learning_rate": 0.0001650820074350889, + "loss": 0.8872, + "step": 5799 + }, + { + "epoch": 0.2748163942193793, + "grad_norm": 0.01165771484375, + "learning_rate": 0.00016507069977965083, + "loss": 0.0009, + "step": 5800 + }, + { + "epoch": 0.27486377635631365, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001650593906810139, + "loss": 0.1333, + "step": 5801 + }, + { + "epoch": 0.27491115849324804, + "grad_norm": 0.09521484375, + "learning_rate": 0.000165048080139429, + "loss": 0.0043, + "step": 5802 + }, + { + "epoch": 0.27495854063018244, + "grad_norm": 0.58984375, + "learning_rate": 0.00016503676815514702, + "loss": 1.1665, + "step": 5803 + }, + { + "epoch": 0.2750059227671168, + "grad_norm": 0.83984375, + "learning_rate": 0.00016502545472841875, + "loss": 1.2289, + "step": 5804 + }, + { + "epoch": 0.27505330490405117, + "grad_norm": 0.8125, + "learning_rate": 0.00016501413985949514, + "loss": 0.8153, + "step": 5805 + }, + { + "epoch": 0.27510068704098556, + "grad_norm": 0.51171875, + "learning_rate": 0.0001650028235486272, + "loss": 0.3612, + "step": 5806 + }, + { + "epoch": 0.27514806917791995, + "grad_norm": 0.322265625, + "learning_rate": 0.00016499150579606586, + "loss": 0.1876, + "step": 5807 + }, + { + "epoch": 0.2751954513148543, + "grad_norm": 0.52734375, + "learning_rate": 0.00016498018660206213, + "loss": 0.4883, + "step": 5808 + }, + { + "epoch": 0.2752428334517887, + "grad_norm": 0.048095703125, + "learning_rate": 0.00016496886596686707, + "loss": 0.0051, + "step": 5809 + }, + { + "epoch": 0.2752902155887231, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016495754389073182, + "loss": 0.0022, + "step": 5810 + }, + { + "epoch": 0.2753375977256574, + "grad_norm": 0.7265625, + "learning_rate": 0.0001649462203739074, + "loss": 0.7888, + "step": 5811 + }, + { + "epoch": 0.2753849798625918, + "grad_norm": 0.5625, + "learning_rate": 0.00016493489541664498, + "loss": 0.7594, + "step": 5812 + }, + { + "epoch": 0.2754323619995262, + "grad_norm": 0.63671875, + "learning_rate": 0.00016492356901919575, + "loss": 0.8629, + "step": 5813 + }, + { + "epoch": 0.27547974413646054, + "grad_norm": 0.9609375, + "learning_rate": 0.0001649122411818109, + "loss": 1.3489, + "step": 5814 + }, + { + "epoch": 0.27552712627339493, + "grad_norm": 0.55078125, + "learning_rate": 0.00016490091190474168, + "loss": 0.9438, + "step": 5815 + }, + { + "epoch": 0.2755745084103293, + "grad_norm": 0.56640625, + "learning_rate": 0.0001648895811882394, + "loss": 0.8693, + "step": 5816 + }, + { + "epoch": 0.27562189054726366, + "grad_norm": 0.7890625, + "learning_rate": 0.0001648782490325553, + "loss": 0.7939, + "step": 5817 + }, + { + "epoch": 0.27566927268419805, + "grad_norm": 0.314453125, + "learning_rate": 0.00016486691543794076, + "loss": 0.1437, + "step": 5818 + }, + { + "epoch": 0.27571665482113245, + "grad_norm": 0.5390625, + "learning_rate": 0.00016485558040464713, + "loss": 0.6232, + "step": 5819 + }, + { + "epoch": 0.2757640369580668, + "grad_norm": 0.53515625, + "learning_rate": 0.00016484424393292576, + "loss": 0.5876, + "step": 5820 + }, + { + "epoch": 0.2758114190950012, + "grad_norm": 0.390625, + "learning_rate": 0.00016483290602302818, + "loss": 0.0837, + "step": 5821 + }, + { + "epoch": 0.27585880123193557, + "grad_norm": 0.76171875, + "learning_rate": 0.00016482156667520576, + "loss": 0.8046, + "step": 5822 + }, + { + "epoch": 0.27590618336886996, + "grad_norm": 0.640625, + "learning_rate": 0.00016481022588971006, + "loss": 0.9897, + "step": 5823 + }, + { + "epoch": 0.2759535655058043, + "grad_norm": 0.80078125, + "learning_rate": 0.0001647988836667926, + "loss": 1.3133, + "step": 5824 + }, + { + "epoch": 0.2760009476427387, + "grad_norm": 0.82421875, + "learning_rate": 0.00016478754000670489, + "loss": 1.4105, + "step": 5825 + }, + { + "epoch": 0.2760483297796731, + "grad_norm": 0.5078125, + "learning_rate": 0.00016477619490969857, + "loss": 0.7777, + "step": 5826 + }, + { + "epoch": 0.2760957119166074, + "grad_norm": 0.7421875, + "learning_rate": 0.00016476484837602527, + "loss": 0.9432, + "step": 5827 + }, + { + "epoch": 0.2761430940535418, + "grad_norm": 0.69140625, + "learning_rate": 0.00016475350040593656, + "loss": 1.1919, + "step": 5828 + }, + { + "epoch": 0.2761904761904762, + "grad_norm": 1.03125, + "learning_rate": 0.00016474215099968422, + "loss": 0.1892, + "step": 5829 + }, + { + "epoch": 0.27623785832741055, + "grad_norm": 0.84375, + "learning_rate": 0.0001647308001575199, + "loss": 0.1605, + "step": 5830 + }, + { + "epoch": 0.27628524046434494, + "grad_norm": 0.341796875, + "learning_rate": 0.0001647194478796954, + "loss": 0.0698, + "step": 5831 + }, + { + "epoch": 0.27633262260127933, + "grad_norm": 0.6796875, + "learning_rate": 0.00016470809416646248, + "loss": 0.8024, + "step": 5832 + }, + { + "epoch": 0.27638000473821367, + "grad_norm": 0.6328125, + "learning_rate": 0.00016469673901807296, + "loss": 1.1148, + "step": 5833 + }, + { + "epoch": 0.27642738687514806, + "grad_norm": 0.0986328125, + "learning_rate": 0.0001646853824347787, + "loss": 0.0053, + "step": 5834 + }, + { + "epoch": 0.27647476901208246, + "grad_norm": 0.07275390625, + "learning_rate": 0.00016467402441683153, + "loss": 0.0064, + "step": 5835 + }, + { + "epoch": 0.27652215114901685, + "grad_norm": 1.140625, + "learning_rate": 0.0001646626649644834, + "loss": 1.199, + "step": 5836 + }, + { + "epoch": 0.2765695332859512, + "grad_norm": 0.34765625, + "learning_rate": 0.0001646513040779862, + "loss": 0.1704, + "step": 5837 + }, + { + "epoch": 0.2766169154228856, + "grad_norm": 0.1357421875, + "learning_rate": 0.00016463994175759197, + "loss": 0.0157, + "step": 5838 + }, + { + "epoch": 0.27666429755981997, + "grad_norm": 0.62890625, + "learning_rate": 0.00016462857800355268, + "loss": 0.0994, + "step": 5839 + }, + { + "epoch": 0.2767116796967543, + "grad_norm": 0.59765625, + "learning_rate": 0.00016461721281612037, + "loss": 0.7554, + "step": 5840 + }, + { + "epoch": 0.2767590618336887, + "grad_norm": 0.306640625, + "learning_rate": 0.0001646058461955471, + "loss": 0.0174, + "step": 5841 + }, + { + "epoch": 0.2768064439706231, + "grad_norm": 0.45703125, + "learning_rate": 0.00016459447814208498, + "loss": 0.5143, + "step": 5842 + }, + { + "epoch": 0.27685382610755743, + "grad_norm": 0.5546875, + "learning_rate": 0.00016458310865598617, + "loss": 0.8134, + "step": 5843 + }, + { + "epoch": 0.2769012082444918, + "grad_norm": 0.62890625, + "learning_rate": 0.00016457173773750275, + "loss": 0.7829, + "step": 5844 + }, + { + "epoch": 0.2769485903814262, + "grad_norm": 0.042236328125, + "learning_rate": 0.000164560365386887, + "loss": 0.0016, + "step": 5845 + }, + { + "epoch": 0.27699597251836056, + "grad_norm": 1.6015625, + "learning_rate": 0.0001645489916043911, + "loss": 0.7386, + "step": 5846 + }, + { + "epoch": 0.27704335465529495, + "grad_norm": 0.52734375, + "learning_rate": 0.0001645376163902673, + "loss": 1.1325, + "step": 5847 + }, + { + "epoch": 0.27709073679222934, + "grad_norm": 0.9375, + "learning_rate": 0.00016452623974476795, + "loss": 0.4153, + "step": 5848 + }, + { + "epoch": 0.2771381189291637, + "grad_norm": 0.5703125, + "learning_rate": 0.0001645148616681453, + "loss": 0.8578, + "step": 5849 + }, + { + "epoch": 0.2771855010660981, + "grad_norm": 0.77734375, + "learning_rate": 0.00016450348216065177, + "loss": 0.9116, + "step": 5850 + }, + { + "epoch": 0.27723288320303247, + "grad_norm": 0.6953125, + "learning_rate": 0.00016449210122253968, + "loss": 1.5575, + "step": 5851 + }, + { + "epoch": 0.27728026533996686, + "grad_norm": 0.203125, + "learning_rate": 0.00016448071885406148, + "loss": 0.1502, + "step": 5852 + }, + { + "epoch": 0.2773276474769012, + "grad_norm": 0.70703125, + "learning_rate": 0.00016446933505546963, + "loss": 1.5635, + "step": 5853 + }, + { + "epoch": 0.2773750296138356, + "grad_norm": 0.6640625, + "learning_rate": 0.00016445794982701661, + "loss": 0.912, + "step": 5854 + }, + { + "epoch": 0.27742241175077, + "grad_norm": 0.76953125, + "learning_rate": 0.00016444656316895491, + "loss": 1.082, + "step": 5855 + }, + { + "epoch": 0.2774697938877043, + "grad_norm": 0.58984375, + "learning_rate": 0.00016443517508153707, + "loss": 0.7714, + "step": 5856 + }, + { + "epoch": 0.2775171760246387, + "grad_norm": 0.68359375, + "learning_rate": 0.0001644237855650157, + "loss": 0.6867, + "step": 5857 + }, + { + "epoch": 0.2775645581615731, + "grad_norm": 0.65234375, + "learning_rate": 0.00016441239461964337, + "loss": 1.5592, + "step": 5858 + }, + { + "epoch": 0.27761194029850744, + "grad_norm": 0.62890625, + "learning_rate": 0.00016440100224567275, + "loss": 0.0292, + "step": 5859 + }, + { + "epoch": 0.27765932243544184, + "grad_norm": 0.072265625, + "learning_rate": 0.0001643896084433565, + "loss": 0.0025, + "step": 5860 + }, + { + "epoch": 0.27770670457237623, + "grad_norm": 0.69921875, + "learning_rate": 0.00016437821321294732, + "loss": 1.1657, + "step": 5861 + }, + { + "epoch": 0.27775408670931057, + "grad_norm": 0.60546875, + "learning_rate": 0.00016436681655469793, + "loss": 0.7367, + "step": 5862 + }, + { + "epoch": 0.27780146884624496, + "grad_norm": 0.828125, + "learning_rate": 0.0001643554184688611, + "loss": 1.1748, + "step": 5863 + }, + { + "epoch": 0.27784885098317935, + "grad_norm": 0.85546875, + "learning_rate": 0.00016434401895568966, + "loss": 1.0877, + "step": 5864 + }, + { + "epoch": 0.27789623312011374, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001643326180154364, + "loss": 0.0143, + "step": 5865 + }, + { + "epoch": 0.2779436152570481, + "grad_norm": 0.5390625, + "learning_rate": 0.00016432121564835422, + "loss": 0.7058, + "step": 5866 + }, + { + "epoch": 0.2779909973939825, + "grad_norm": 0.76953125, + "learning_rate": 0.00016430981185469595, + "loss": 1.2879, + "step": 5867 + }, + { + "epoch": 0.27803837953091687, + "grad_norm": 0.333984375, + "learning_rate": 0.0001642984066347146, + "loss": 0.1743, + "step": 5868 + }, + { + "epoch": 0.2780857616678512, + "grad_norm": 0.6015625, + "learning_rate": 0.00016428699998866303, + "loss": 0.734, + "step": 5869 + }, + { + "epoch": 0.2781331438047856, + "grad_norm": 0.53515625, + "learning_rate": 0.0001642755919167943, + "loss": 1.0882, + "step": 5870 + }, + { + "epoch": 0.27818052594172, + "grad_norm": 0.35546875, + "learning_rate": 0.00016426418241936142, + "loss": 0.0577, + "step": 5871 + }, + { + "epoch": 0.27822790807865433, + "grad_norm": 0.76171875, + "learning_rate": 0.00016425277149661736, + "loss": 1.1437, + "step": 5872 + }, + { + "epoch": 0.2782752902155887, + "grad_norm": 0.67578125, + "learning_rate": 0.00016424135914881534, + "loss": 1.2461, + "step": 5873 + }, + { + "epoch": 0.2783226723525231, + "grad_norm": 0.65234375, + "learning_rate": 0.0001642299453762084, + "loss": 1.051, + "step": 5874 + }, + { + "epoch": 0.27837005448945745, + "grad_norm": 0.5859375, + "learning_rate": 0.00016421853017904963, + "loss": 0.5847, + "step": 5875 + }, + { + "epoch": 0.27841743662639185, + "grad_norm": 0.181640625, + "learning_rate": 0.00016420711355759232, + "loss": 0.1426, + "step": 5876 + }, + { + "epoch": 0.27846481876332624, + "grad_norm": 0.53515625, + "learning_rate": 0.00016419569551208958, + "loss": 0.0234, + "step": 5877 + }, + { + "epoch": 0.2785122009002606, + "grad_norm": 0.0107421875, + "learning_rate": 0.00016418427604279471, + "loss": 0.0005, + "step": 5878 + }, + { + "epoch": 0.27855958303719497, + "grad_norm": 1.2890625, + "learning_rate": 0.00016417285514996096, + "loss": 1.1206, + "step": 5879 + }, + { + "epoch": 0.27860696517412936, + "grad_norm": 0.0498046875, + "learning_rate": 0.00016416143283384165, + "loss": 0.0053, + "step": 5880 + }, + { + "epoch": 0.27865434731106375, + "grad_norm": 0.59765625, + "learning_rate": 0.0001641500090946901, + "loss": 1.0242, + "step": 5881 + }, + { + "epoch": 0.2787017294479981, + "grad_norm": 0.6328125, + "learning_rate": 0.00016413858393275968, + "loss": 0.9941, + "step": 5882 + }, + { + "epoch": 0.2787491115849325, + "grad_norm": 0.74609375, + "learning_rate": 0.00016412715734830376, + "loss": 0.2387, + "step": 5883 + }, + { + "epoch": 0.2787964937218669, + "grad_norm": 0.310546875, + "learning_rate": 0.00016411572934157582, + "loss": 0.0203, + "step": 5884 + }, + { + "epoch": 0.2788438758588012, + "grad_norm": 0.76171875, + "learning_rate": 0.00016410429991282928, + "loss": 0.8312, + "step": 5885 + }, + { + "epoch": 0.2788912579957356, + "grad_norm": 0.63671875, + "learning_rate": 0.00016409286906231765, + "loss": 0.8291, + "step": 5886 + }, + { + "epoch": 0.27893864013267, + "grad_norm": 0.75, + "learning_rate": 0.00016408143679029445, + "loss": 1.2022, + "step": 5887 + }, + { + "epoch": 0.27898602226960434, + "grad_norm": 0.59765625, + "learning_rate": 0.00016407000309701323, + "loss": 0.826, + "step": 5888 + }, + { + "epoch": 0.27903340440653873, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001640585679827276, + "loss": 0.0015, + "step": 5889 + }, + { + "epoch": 0.2790807865434731, + "grad_norm": 0.80859375, + "learning_rate": 0.0001640471314476912, + "loss": 0.9617, + "step": 5890 + }, + { + "epoch": 0.27912816868040746, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001640356934921576, + "loss": 0.0169, + "step": 5891 + }, + { + "epoch": 0.27917555081734186, + "grad_norm": 0.79296875, + "learning_rate": 0.00016402425411638053, + "loss": 1.1381, + "step": 5892 + }, + { + "epoch": 0.27922293295427625, + "grad_norm": 0.78125, + "learning_rate": 0.0001640128133206137, + "loss": 1.4161, + "step": 5893 + }, + { + "epoch": 0.27927031509121064, + "grad_norm": 0.345703125, + "learning_rate": 0.00016400137110511085, + "loss": 0.0422, + "step": 5894 + }, + { + "epoch": 0.279317697228145, + "grad_norm": 0.57421875, + "learning_rate": 0.00016398992747012578, + "loss": 1.1908, + "step": 5895 + }, + { + "epoch": 0.27936507936507937, + "grad_norm": 0.298828125, + "learning_rate": 0.00016397848241591224, + "loss": 0.0074, + "step": 5896 + }, + { + "epoch": 0.27941246150201376, + "grad_norm": 0.703125, + "learning_rate": 0.00016396703594272416, + "loss": 1.2606, + "step": 5897 + }, + { + "epoch": 0.2794598436389481, + "grad_norm": 0.6328125, + "learning_rate": 0.00016395558805081532, + "loss": 0.5287, + "step": 5898 + }, + { + "epoch": 0.2795072257758825, + "grad_norm": 0.71875, + "learning_rate": 0.00016394413874043967, + "loss": 0.6665, + "step": 5899 + }, + { + "epoch": 0.2795546079128169, + "grad_norm": 1.4765625, + "learning_rate": 0.00016393268801185115, + "loss": 0.8474, + "step": 5900 + }, + { + "epoch": 0.2796019900497512, + "grad_norm": 0.6171875, + "learning_rate": 0.0001639212358653037, + "loss": 1.1306, + "step": 5901 + }, + { + "epoch": 0.2796493721866856, + "grad_norm": 0.322265625, + "learning_rate": 0.00016390978230105128, + "loss": 0.0694, + "step": 5902 + }, + { + "epoch": 0.27969675432362, + "grad_norm": 0.65625, + "learning_rate": 0.00016389832731934803, + "loss": 0.1758, + "step": 5903 + }, + { + "epoch": 0.27974413646055435, + "grad_norm": 0.51953125, + "learning_rate": 0.00016388687092044787, + "loss": 0.1082, + "step": 5904 + }, + { + "epoch": 0.27979151859748874, + "grad_norm": 0.5859375, + "learning_rate": 0.00016387541310460506, + "loss": 0.6653, + "step": 5905 + }, + { + "epoch": 0.27983890073442313, + "grad_norm": 0.5234375, + "learning_rate": 0.00016386395387207355, + "loss": 0.0071, + "step": 5906 + }, + { + "epoch": 0.2798862828713575, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001638524932231076, + "loss": 0.1529, + "step": 5907 + }, + { + "epoch": 0.27993366500829187, + "grad_norm": 0.34375, + "learning_rate": 0.00016384103115796132, + "loss": 0.0179, + "step": 5908 + }, + { + "epoch": 0.27998104714522626, + "grad_norm": 0.6484375, + "learning_rate": 0.000163829567676889, + "loss": 1.1626, + "step": 5909 + }, + { + "epoch": 0.28002842928216065, + "grad_norm": 0.66015625, + "learning_rate": 0.00016381810278014484, + "loss": 1.0607, + "step": 5910 + }, + { + "epoch": 0.280075811419095, + "grad_norm": 0.55078125, + "learning_rate": 0.00016380663646798315, + "loss": 1.2028, + "step": 5911 + }, + { + "epoch": 0.2801231935560294, + "grad_norm": 0.5078125, + "learning_rate": 0.00016379516874065825, + "loss": 0.6336, + "step": 5912 + }, + { + "epoch": 0.2801705756929638, + "grad_norm": 0.640625, + "learning_rate": 0.00016378369959842444, + "loss": 0.0943, + "step": 5913 + }, + { + "epoch": 0.2802179578298981, + "grad_norm": 0.890625, + "learning_rate": 0.00016377222904153608, + "loss": 1.0451, + "step": 5914 + }, + { + "epoch": 0.2802653399668325, + "grad_norm": 0.78125, + "learning_rate": 0.00016376075707024766, + "loss": 1.0668, + "step": 5915 + }, + { + "epoch": 0.2803127221037669, + "grad_norm": 0.84375, + "learning_rate": 0.00016374928368481352, + "loss": 1.0897, + "step": 5916 + }, + { + "epoch": 0.28036010424070124, + "grad_norm": 0.65234375, + "learning_rate": 0.00016373780888548817, + "loss": 0.1122, + "step": 5917 + }, + { + "epoch": 0.28040748637763563, + "grad_norm": 0.515625, + "learning_rate": 0.00016372633267252616, + "loss": 0.9279, + "step": 5918 + }, + { + "epoch": 0.28045486851457, + "grad_norm": 0.7109375, + "learning_rate": 0.00016371485504618191, + "loss": 1.1632, + "step": 5919 + }, + { + "epoch": 0.28050225065150436, + "grad_norm": 0.51953125, + "learning_rate": 0.00016370337600671008, + "loss": 0.6464, + "step": 5920 + }, + { + "epoch": 0.28054963278843875, + "grad_norm": 0.055419921875, + "learning_rate": 0.00016369189555436525, + "loss": 0.0027, + "step": 5921 + }, + { + "epoch": 0.28059701492537314, + "grad_norm": 0.73828125, + "learning_rate": 0.00016368041368940194, + "loss": 0.9308, + "step": 5922 + }, + { + "epoch": 0.2806443970623075, + "grad_norm": 0.609375, + "learning_rate": 0.00016366893041207492, + "loss": 1.0763, + "step": 5923 + }, + { + "epoch": 0.2806917791992419, + "grad_norm": 0.318359375, + "learning_rate": 0.00016365744572263886, + "loss": 0.135, + "step": 5924 + }, + { + "epoch": 0.28073916133617627, + "grad_norm": 0.62109375, + "learning_rate": 0.00016364595962134844, + "loss": 1.0708, + "step": 5925 + }, + { + "epoch": 0.28078654347311066, + "grad_norm": 1.109375, + "learning_rate": 0.00016363447210845843, + "loss": 0.6604, + "step": 5926 + }, + { + "epoch": 0.280833925610045, + "grad_norm": 0.66015625, + "learning_rate": 0.0001636229831842236, + "loss": 1.2479, + "step": 5927 + }, + { + "epoch": 0.2808813077469794, + "grad_norm": 0.6953125, + "learning_rate": 0.0001636114928488988, + "loss": 1.0728, + "step": 5928 + }, + { + "epoch": 0.2809286898839138, + "grad_norm": 0.6796875, + "learning_rate": 0.0001636000011027388, + "loss": 0.856, + "step": 5929 + }, + { + "epoch": 0.2809760720208481, + "grad_norm": 0.412109375, + "learning_rate": 0.00016358850794599857, + "loss": 0.0754, + "step": 5930 + }, + { + "epoch": 0.2810234541577825, + "grad_norm": 0.57421875, + "learning_rate": 0.00016357701337893295, + "loss": 0.7193, + "step": 5931 + }, + { + "epoch": 0.2810708362947169, + "grad_norm": 0.54296875, + "learning_rate": 0.00016356551740179687, + "loss": 0.6159, + "step": 5932 + }, + { + "epoch": 0.28111821843165125, + "grad_norm": 0.36328125, + "learning_rate": 0.00016355402001484533, + "loss": 0.0552, + "step": 5933 + }, + { + "epoch": 0.28116560056858564, + "grad_norm": 0.69140625, + "learning_rate": 0.00016354252121833333, + "loss": 1.3275, + "step": 5934 + }, + { + "epoch": 0.28121298270552003, + "grad_norm": 2.84375, + "learning_rate": 0.00016353102101251588, + "loss": 0.8097, + "step": 5935 + }, + { + "epoch": 0.28126036484245437, + "grad_norm": 0.51953125, + "learning_rate": 0.00016351951939764806, + "loss": 0.7637, + "step": 5936 + }, + { + "epoch": 0.28130774697938876, + "grad_norm": 0.59765625, + "learning_rate": 0.00016350801637398497, + "loss": 0.5522, + "step": 5937 + }, + { + "epoch": 0.28135512911632315, + "grad_norm": 0.61328125, + "learning_rate": 0.00016349651194178173, + "loss": 1.2141, + "step": 5938 + }, + { + "epoch": 0.28140251125325755, + "grad_norm": 0.6015625, + "learning_rate": 0.00016348500610129346, + "loss": 1.445, + "step": 5939 + }, + { + "epoch": 0.2814498933901919, + "grad_norm": 0.6640625, + "learning_rate": 0.00016347349885277538, + "loss": 1.3921, + "step": 5940 + }, + { + "epoch": 0.2814972755271263, + "grad_norm": 0.00628662109375, + "learning_rate": 0.00016346199019648272, + "loss": 0.0003, + "step": 5941 + }, + { + "epoch": 0.28154465766406067, + "grad_norm": 0.48046875, + "learning_rate": 0.00016345048013267067, + "loss": 0.0337, + "step": 5942 + }, + { + "epoch": 0.281592039800995, + "grad_norm": 0.03515625, + "learning_rate": 0.0001634389686615946, + "loss": 0.0011, + "step": 5943 + }, + { + "epoch": 0.2816394219379294, + "grad_norm": 0.73046875, + "learning_rate": 0.00016342745578350976, + "loss": 1.8758, + "step": 5944 + }, + { + "epoch": 0.2816868040748638, + "grad_norm": 0.69140625, + "learning_rate": 0.00016341594149867154, + "loss": 0.994, + "step": 5945 + }, + { + "epoch": 0.28173418621179813, + "grad_norm": 0.00262451171875, + "learning_rate": 0.00016340442580733527, + "loss": 0.0002, + "step": 5946 + }, + { + "epoch": 0.2817815683487325, + "grad_norm": 0.7265625, + "learning_rate": 0.00016339290870975636, + "loss": 1.3652, + "step": 5947 + }, + { + "epoch": 0.2818289504856669, + "grad_norm": 0.15625, + "learning_rate": 0.00016338139020619026, + "loss": 0.0128, + "step": 5948 + }, + { + "epoch": 0.28187633262260126, + "grad_norm": 0.77734375, + "learning_rate": 0.00016336987029689243, + "loss": 1.0002, + "step": 5949 + }, + { + "epoch": 0.28192371475953565, + "grad_norm": 1.453125, + "learning_rate": 0.00016335834898211838, + "loss": 1.2971, + "step": 5950 + }, + { + "epoch": 0.28197109689647004, + "grad_norm": 0.67578125, + "learning_rate": 0.00016334682626212364, + "loss": 1.3942, + "step": 5951 + }, + { + "epoch": 0.2820184790334044, + "grad_norm": 0.734375, + "learning_rate": 0.00016333530213716378, + "loss": 1.3696, + "step": 5952 + }, + { + "epoch": 0.28206586117033877, + "grad_norm": 0.8828125, + "learning_rate": 0.00016332377660749437, + "loss": 1.3783, + "step": 5953 + }, + { + "epoch": 0.28211324330727316, + "grad_norm": 0.71484375, + "learning_rate": 0.00016331224967337102, + "loss": 1.2957, + "step": 5954 + }, + { + "epoch": 0.28216062544420756, + "grad_norm": 0.341796875, + "learning_rate": 0.00016330072133504943, + "loss": 0.134, + "step": 5955 + }, + { + "epoch": 0.2822080075811419, + "grad_norm": 0.30078125, + "learning_rate": 0.00016328919159278526, + "loss": 0.0179, + "step": 5956 + }, + { + "epoch": 0.2822553897180763, + "grad_norm": 0.53125, + "learning_rate": 0.00016327766044683424, + "loss": 0.594, + "step": 5957 + }, + { + "epoch": 0.2823027718550107, + "grad_norm": 0.7265625, + "learning_rate": 0.00016326612789745212, + "loss": 0.7436, + "step": 5958 + }, + { + "epoch": 0.282350153991945, + "grad_norm": 0.6015625, + "learning_rate": 0.00016325459394489467, + "loss": 1.401, + "step": 5959 + }, + { + "epoch": 0.2823975361288794, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001632430585894177, + "loss": 0.1428, + "step": 5960 + }, + { + "epoch": 0.2824449182658138, + "grad_norm": 0.578125, + "learning_rate": 0.00016323152183127704, + "loss": 0.8121, + "step": 5961 + }, + { + "epoch": 0.28249230040274814, + "grad_norm": 0.734375, + "learning_rate": 0.00016321998367072857, + "loss": 1.1734, + "step": 5962 + }, + { + "epoch": 0.28253968253968254, + "grad_norm": 0.68359375, + "learning_rate": 0.0001632084441080282, + "loss": 1.1679, + "step": 5963 + }, + { + "epoch": 0.28258706467661693, + "grad_norm": 0.796875, + "learning_rate": 0.0001631969031434319, + "loss": 0.1729, + "step": 5964 + }, + { + "epoch": 0.28263444681355127, + "grad_norm": 0.53515625, + "learning_rate": 0.0001631853607771956, + "loss": 1.3251, + "step": 5965 + }, + { + "epoch": 0.28268182895048566, + "grad_norm": 0.90234375, + "learning_rate": 0.00016317381700957525, + "loss": 0.8001, + "step": 5966 + }, + { + "epoch": 0.28272921108742005, + "grad_norm": 0.62109375, + "learning_rate": 0.00016316227184082698, + "loss": 0.8081, + "step": 5967 + }, + { + "epoch": 0.28277659322435444, + "grad_norm": 0.71484375, + "learning_rate": 0.0001631507252712068, + "loss": 1.072, + "step": 5968 + }, + { + "epoch": 0.2828239753612888, + "grad_norm": 0.3828125, + "learning_rate": 0.00016313917730097078, + "loss": 0.3934, + "step": 5969 + }, + { + "epoch": 0.2828713574982232, + "grad_norm": 0.50390625, + "learning_rate": 0.00016312762793037507, + "loss": 1.1343, + "step": 5970 + }, + { + "epoch": 0.28291873963515757, + "grad_norm": 0.259765625, + "learning_rate": 0.00016311607715967577, + "loss": 0.0241, + "step": 5971 + }, + { + "epoch": 0.2829661217720919, + "grad_norm": 0.66796875, + "learning_rate": 0.00016310452498912917, + "loss": 1.0106, + "step": 5972 + }, + { + "epoch": 0.2830135039090263, + "grad_norm": 0.70703125, + "learning_rate": 0.00016309297141899135, + "loss": 1.1859, + "step": 5973 + }, + { + "epoch": 0.2830608860459607, + "grad_norm": 0.7890625, + "learning_rate": 0.00016308141644951867, + "loss": 1.3627, + "step": 5974 + }, + { + "epoch": 0.28310826818289503, + "grad_norm": 0.78125, + "learning_rate": 0.0001630698600809674, + "loss": 0.8214, + "step": 5975 + }, + { + "epoch": 0.2831556503198294, + "grad_norm": 0.55859375, + "learning_rate": 0.00016305830231359378, + "loss": 1.4405, + "step": 5976 + }, + { + "epoch": 0.2832030324567638, + "grad_norm": 0.6015625, + "learning_rate": 0.00016304674314765417, + "loss": 1.1925, + "step": 5977 + }, + { + "epoch": 0.28325041459369815, + "grad_norm": 0.01171875, + "learning_rate": 0.00016303518258340492, + "loss": 0.0006, + "step": 5978 + }, + { + "epoch": 0.28329779673063255, + "grad_norm": 0.65234375, + "learning_rate": 0.0001630236206211025, + "loss": 0.6789, + "step": 5979 + }, + { + "epoch": 0.28334517886756694, + "grad_norm": 0.7109375, + "learning_rate": 0.00016301205726100332, + "loss": 1.5498, + "step": 5980 + }, + { + "epoch": 0.2833925610045013, + "grad_norm": 0.6171875, + "learning_rate": 0.00016300049250336383, + "loss": 0.7349, + "step": 5981 + }, + { + "epoch": 0.28343994314143567, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001629889263484405, + "loss": 0.0111, + "step": 5982 + }, + { + "epoch": 0.28348732527837006, + "grad_norm": 0.9609375, + "learning_rate": 0.00016297735879648988, + "loss": 0.3184, + "step": 5983 + }, + { + "epoch": 0.28353470741530445, + "grad_norm": 0.703125, + "learning_rate": 0.0001629657898477685, + "loss": 0.8369, + "step": 5984 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 0.00897216796875, + "learning_rate": 0.00016295421950253302, + "loss": 0.0005, + "step": 5985 + }, + { + "epoch": 0.2836294716891732, + "grad_norm": 1.03125, + "learning_rate": 0.00016294264776103997, + "loss": 0.7341, + "step": 5986 + }, + { + "epoch": 0.2836768538261076, + "grad_norm": 0.8515625, + "learning_rate": 0.00016293107462354603, + "loss": 0.6819, + "step": 5987 + }, + { + "epoch": 0.2837242359630419, + "grad_norm": 0.5546875, + "learning_rate": 0.00016291950009030793, + "loss": 0.9467, + "step": 5988 + }, + { + "epoch": 0.2837716180999763, + "grad_norm": 0.474609375, + "learning_rate": 0.00016290792416158228, + "loss": 0.7839, + "step": 5989 + }, + { + "epoch": 0.2838190002369107, + "grad_norm": 0.58203125, + "learning_rate": 0.00016289634683762591, + "loss": 0.1468, + "step": 5990 + }, + { + "epoch": 0.28386638237384504, + "grad_norm": 0.546875, + "learning_rate": 0.00016288476811869557, + "loss": 0.7732, + "step": 5991 + }, + { + "epoch": 0.28391376451077943, + "grad_norm": 1.0, + "learning_rate": 0.00016287318800504804, + "loss": 0.0987, + "step": 5992 + }, + { + "epoch": 0.2839611466477138, + "grad_norm": 0.83984375, + "learning_rate": 0.0001628616064969402, + "loss": 0.8688, + "step": 5993 + }, + { + "epoch": 0.28400852878464816, + "grad_norm": 0.66015625, + "learning_rate": 0.00016285002359462883, + "loss": 1.3546, + "step": 5994 + }, + { + "epoch": 0.28405591092158256, + "grad_norm": 0.78125, + "learning_rate": 0.00016283843929837093, + "loss": 0.8868, + "step": 5995 + }, + { + "epoch": 0.28410329305851695, + "grad_norm": 0.58203125, + "learning_rate": 0.00016282685360842334, + "loss": 0.8705, + "step": 5996 + }, + { + "epoch": 0.28415067519545134, + "grad_norm": 0.6640625, + "learning_rate": 0.00016281526652504308, + "loss": 1.1959, + "step": 5997 + }, + { + "epoch": 0.2841980573323857, + "grad_norm": 0.640625, + "learning_rate": 0.00016280367804848711, + "loss": 1.3464, + "step": 5998 + }, + { + "epoch": 0.28424543946932007, + "grad_norm": 0.1533203125, + "learning_rate": 0.00016279208817901247, + "loss": 0.0082, + "step": 5999 + }, + { + "epoch": 0.28429282160625446, + "grad_norm": 0.67578125, + "learning_rate": 0.00016278049691687617, + "loss": 1.8539, + "step": 6000 + }, + { + "epoch": 0.2843402037431888, + "grad_norm": 0.99609375, + "learning_rate": 0.00016276890426233535, + "loss": 1.017, + "step": 6001 + }, + { + "epoch": 0.2843875858801232, + "grad_norm": 0.43359375, + "learning_rate": 0.00016275731021564706, + "loss": 0.1131, + "step": 6002 + }, + { + "epoch": 0.2844349680170576, + "grad_norm": 0.71875, + "learning_rate": 0.00016274571477706848, + "loss": 1.4311, + "step": 6003 + }, + { + "epoch": 0.2844823501539919, + "grad_norm": 0.4921875, + "learning_rate": 0.0001627341179468568, + "loss": 1.0981, + "step": 6004 + }, + { + "epoch": 0.2845297322909263, + "grad_norm": 0.427734375, + "learning_rate": 0.00016272251972526921, + "loss": 0.5771, + "step": 6005 + }, + { + "epoch": 0.2845771144278607, + "grad_norm": 0.75, + "learning_rate": 0.00016271092011256293, + "loss": 0.0699, + "step": 6006 + }, + { + "epoch": 0.28462449656479505, + "grad_norm": 0.38671875, + "learning_rate": 0.00016269931910899526, + "loss": 0.1502, + "step": 6007 + }, + { + "epoch": 0.28467187870172944, + "grad_norm": 0.57421875, + "learning_rate": 0.00016268771671482343, + "loss": 1.0168, + "step": 6008 + }, + { + "epoch": 0.28471926083866383, + "grad_norm": 0.78125, + "learning_rate": 0.00016267611293030484, + "loss": 0.2216, + "step": 6009 + }, + { + "epoch": 0.28476664297559817, + "grad_norm": 0.51171875, + "learning_rate": 0.00016266450775569684, + "loss": 0.639, + "step": 6010 + }, + { + "epoch": 0.28481402511253257, + "grad_norm": 0.9453125, + "learning_rate": 0.00016265290119125677, + "loss": 1.3192, + "step": 6011 + }, + { + "epoch": 0.28486140724946696, + "grad_norm": 0.216796875, + "learning_rate": 0.0001626412932372421, + "loss": 0.1411, + "step": 6012 + }, + { + "epoch": 0.28490878938640135, + "grad_norm": 0.6328125, + "learning_rate": 0.00016262968389391027, + "loss": 1.1672, + "step": 6013 + }, + { + "epoch": 0.2849561715233357, + "grad_norm": 0.5234375, + "learning_rate": 0.0001626180731615188, + "loss": 0.4654, + "step": 6014 + }, + { + "epoch": 0.2850035536602701, + "grad_norm": 0.53515625, + "learning_rate": 0.00016260646104032512, + "loss": 0.8847, + "step": 6015 + }, + { + "epoch": 0.2850509357972045, + "grad_norm": 0.26171875, + "learning_rate": 0.00016259484753058682, + "loss": 0.1109, + "step": 6016 + }, + { + "epoch": 0.2850983179341388, + "grad_norm": 0.68359375, + "learning_rate": 0.0001625832326325615, + "loss": 1.344, + "step": 6017 + }, + { + "epoch": 0.2851457000710732, + "grad_norm": 0.390625, + "learning_rate": 0.0001625716163465067, + "loss": 0.1425, + "step": 6018 + }, + { + "epoch": 0.2851930822080076, + "grad_norm": 0.162109375, + "learning_rate": 0.00016255999867268012, + "loss": 0.0183, + "step": 6019 + }, + { + "epoch": 0.28524046434494194, + "grad_norm": 0.66796875, + "learning_rate": 0.00016254837961133942, + "loss": 0.718, + "step": 6020 + }, + { + "epoch": 0.28528784648187633, + "grad_norm": 0.62109375, + "learning_rate": 0.00016253675916274226, + "loss": 1.0985, + "step": 6021 + }, + { + "epoch": 0.2853352286188107, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00016252513732714643, + "loss": 0.0006, + "step": 6022 + }, + { + "epoch": 0.28538261075574506, + "grad_norm": 0.004425048828125, + "learning_rate": 0.00016251351410480962, + "loss": 0.0002, + "step": 6023 + }, + { + "epoch": 0.28542999289267945, + "grad_norm": 0.65625, + "learning_rate": 0.00016250188949598965, + "loss": 1.2493, + "step": 6024 + }, + { + "epoch": 0.28547737502961384, + "grad_norm": 0.16796875, + "learning_rate": 0.00016249026350094437, + "loss": 0.1221, + "step": 6025 + }, + { + "epoch": 0.28552475716654824, + "grad_norm": 0.76953125, + "learning_rate": 0.00016247863611993158, + "loss": 0.8118, + "step": 6026 + }, + { + "epoch": 0.2855721393034826, + "grad_norm": 0.80078125, + "learning_rate": 0.0001624670073532092, + "loss": 1.1523, + "step": 6027 + }, + { + "epoch": 0.28561952144041697, + "grad_norm": 0.23046875, + "learning_rate": 0.00016245537720103515, + "loss": 0.0994, + "step": 6028 + }, + { + "epoch": 0.28566690357735136, + "grad_norm": 0.470703125, + "learning_rate": 0.00016244374566366733, + "loss": 0.9692, + "step": 6029 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.50390625, + "learning_rate": 0.00016243211274136378, + "loss": 0.5362, + "step": 6030 + }, + { + "epoch": 0.2857616678512201, + "grad_norm": 0.0086669921875, + "learning_rate": 0.00016242047843438245, + "loss": 0.0004, + "step": 6031 + }, + { + "epoch": 0.2858090499881545, + "grad_norm": 0.7265625, + "learning_rate": 0.0001624088427429814, + "loss": 1.7455, + "step": 6032 + }, + { + "epoch": 0.2858564321250888, + "grad_norm": 0.62890625, + "learning_rate": 0.00016239720566741867, + "loss": 0.9899, + "step": 6033 + }, + { + "epoch": 0.2859038142620232, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001623855672079524, + "loss": 0.0159, + "step": 6034 + }, + { + "epoch": 0.2859511963989576, + "grad_norm": 0.7890625, + "learning_rate": 0.0001623739273648407, + "loss": 0.4276, + "step": 6035 + }, + { + "epoch": 0.28599857853589195, + "grad_norm": 0.61328125, + "learning_rate": 0.00016236228613834171, + "loss": 0.9734, + "step": 6036 + }, + { + "epoch": 0.28604596067282634, + "grad_norm": 0.033203125, + "learning_rate": 0.00016235064352871365, + "loss": 0.0035, + "step": 6037 + }, + { + "epoch": 0.28609334280976073, + "grad_norm": 0.9296875, + "learning_rate": 0.00016233899953621475, + "loss": 1.3255, + "step": 6038 + }, + { + "epoch": 0.28614072494669507, + "grad_norm": 0.9140625, + "learning_rate": 0.00016232735416110323, + "loss": 2.0154, + "step": 6039 + }, + { + "epoch": 0.28618810708362946, + "grad_norm": 0.58984375, + "learning_rate": 0.00016231570740363738, + "loss": 1.0311, + "step": 6040 + }, + { + "epoch": 0.28623548922056385, + "grad_norm": 0.66015625, + "learning_rate": 0.00016230405926407552, + "loss": 0.7651, + "step": 6041 + }, + { + "epoch": 0.28628287135749825, + "grad_norm": 0.69921875, + "learning_rate": 0.00016229240974267596, + "loss": 0.9131, + "step": 6042 + }, + { + "epoch": 0.2863302534944326, + "grad_norm": 0.287109375, + "learning_rate": 0.00016228075883969713, + "loss": 0.0416, + "step": 6043 + }, + { + "epoch": 0.286377635631367, + "grad_norm": 0.75, + "learning_rate": 0.0001622691065553974, + "loss": 1.0857, + "step": 6044 + }, + { + "epoch": 0.28642501776830137, + "grad_norm": 1.1875, + "learning_rate": 0.00016225745289003522, + "loss": 0.7592, + "step": 6045 + }, + { + "epoch": 0.2864723999052357, + "grad_norm": 0.65234375, + "learning_rate": 0.00016224579784386903, + "loss": 1.2407, + "step": 6046 + }, + { + "epoch": 0.2865197820421701, + "grad_norm": 0.58203125, + "learning_rate": 0.00016223414141715737, + "loss": 1.2729, + "step": 6047 + }, + { + "epoch": 0.2865671641791045, + "grad_norm": 0.69921875, + "learning_rate": 0.00016222248361015873, + "loss": 1.2148, + "step": 6048 + }, + { + "epoch": 0.28661454631603883, + "grad_norm": 0.8515625, + "learning_rate": 0.00016221082442313168, + "loss": 0.9701, + "step": 6049 + }, + { + "epoch": 0.2866619284529732, + "grad_norm": 0.54296875, + "learning_rate": 0.00016219916385633483, + "loss": 0.9508, + "step": 6050 + }, + { + "epoch": 0.2867093105899076, + "grad_norm": 0.185546875, + "learning_rate": 0.00016218750191002675, + "loss": 0.0976, + "step": 6051 + }, + { + "epoch": 0.28675669272684196, + "grad_norm": 0.263671875, + "learning_rate": 0.0001621758385844661, + "loss": 0.0691, + "step": 6052 + }, + { + "epoch": 0.28680407486377635, + "grad_norm": 0.69140625, + "learning_rate": 0.0001621641738799116, + "loss": 1.346, + "step": 6053 + }, + { + "epoch": 0.28685145700071074, + "grad_norm": 0.96484375, + "learning_rate": 0.00016215250779662193, + "loss": 0.1656, + "step": 6054 + }, + { + "epoch": 0.28689883913764513, + "grad_norm": 0.55859375, + "learning_rate": 0.0001621408403348559, + "loss": 0.9364, + "step": 6055 + }, + { + "epoch": 0.28694622127457947, + "grad_norm": 0.0032501220703125, + "learning_rate": 0.00016212917149487216, + "loss": 0.0002, + "step": 6056 + }, + { + "epoch": 0.28699360341151386, + "grad_norm": 0.0036468505859375, + "learning_rate": 0.00016211750127692956, + "loss": 0.0003, + "step": 6057 + }, + { + "epoch": 0.28704098554844826, + "grad_norm": 0.57421875, + "learning_rate": 0.00016210582968128695, + "loss": 0.9902, + "step": 6058 + }, + { + "epoch": 0.2870883676853826, + "grad_norm": 0.7890625, + "learning_rate": 0.00016209415670820321, + "loss": 0.6357, + "step": 6059 + }, + { + "epoch": 0.287135749822317, + "grad_norm": 0.6796875, + "learning_rate": 0.00016208248235793723, + "loss": 1.3255, + "step": 6060 + }, + { + "epoch": 0.2871831319592514, + "grad_norm": 0.73828125, + "learning_rate": 0.00016207080663074787, + "loss": 0.289, + "step": 6061 + }, + { + "epoch": 0.2872305140961857, + "grad_norm": 0.65234375, + "learning_rate": 0.0001620591295268942, + "loss": 0.5338, + "step": 6062 + }, + { + "epoch": 0.2872778962331201, + "grad_norm": 0.68359375, + "learning_rate": 0.0001620474510466351, + "loss": 0.089, + "step": 6063 + }, + { + "epoch": 0.2873252783700545, + "grad_norm": 0.193359375, + "learning_rate": 0.0001620357711902296, + "loss": 0.1438, + "step": 6064 + }, + { + "epoch": 0.28737266050698884, + "grad_norm": 0.69140625, + "learning_rate": 0.0001620240899579368, + "loss": 1.2508, + "step": 6065 + }, + { + "epoch": 0.28742004264392323, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016201240735001576, + "loss": 0.0149, + "step": 6066 + }, + { + "epoch": 0.2874674247808576, + "grad_norm": 0.058837890625, + "learning_rate": 0.00016200072336672555, + "loss": 0.0066, + "step": 6067 + }, + { + "epoch": 0.28751480691779197, + "grad_norm": 0.68359375, + "learning_rate": 0.00016198903800832538, + "loss": 1.5147, + "step": 6068 + }, + { + "epoch": 0.28756218905472636, + "grad_norm": 0.640625, + "learning_rate": 0.00016197735127507435, + "loss": 0.776, + "step": 6069 + }, + { + "epoch": 0.28760957119166075, + "grad_norm": 0.953125, + "learning_rate": 0.00016196566316723169, + "loss": 0.5439, + "step": 6070 + }, + { + "epoch": 0.28765695332859514, + "grad_norm": 0.52734375, + "learning_rate": 0.0001619539736850566, + "loss": 0.7401, + "step": 6071 + }, + { + "epoch": 0.2877043354655295, + "grad_norm": 0.6953125, + "learning_rate": 0.00016194228282880838, + "loss": 1.199, + "step": 6072 + }, + { + "epoch": 0.2877517176024639, + "grad_norm": 0.671875, + "learning_rate": 0.0001619305905987463, + "loss": 1.0443, + "step": 6073 + }, + { + "epoch": 0.28779909973939827, + "grad_norm": 0.70703125, + "learning_rate": 0.0001619188969951297, + "loss": 1.0907, + "step": 6074 + }, + { + "epoch": 0.2878464818763326, + "grad_norm": 0.5625, + "learning_rate": 0.00016190720201821792, + "loss": 0.7844, + "step": 6075 + }, + { + "epoch": 0.287893864013267, + "grad_norm": 0.63671875, + "learning_rate": 0.0001618955056682703, + "loss": 1.1599, + "step": 6076 + }, + { + "epoch": 0.2879412461502014, + "grad_norm": 0.7109375, + "learning_rate": 0.00016188380794554633, + "loss": 1.07, + "step": 6077 + }, + { + "epoch": 0.28798862828713573, + "grad_norm": 0.6953125, + "learning_rate": 0.00016187210885030542, + "loss": 0.3129, + "step": 6078 + }, + { + "epoch": 0.2880360104240701, + "grad_norm": 0.46484375, + "learning_rate": 0.00016186040838280703, + "loss": 1.0747, + "step": 6079 + }, + { + "epoch": 0.2880833925610045, + "grad_norm": 0.65625, + "learning_rate": 0.00016184870654331065, + "loss": 0.9774, + "step": 6080 + }, + { + "epoch": 0.28813077469793885, + "grad_norm": 0.8125, + "learning_rate": 0.00016183700333207587, + "loss": 0.7071, + "step": 6081 + }, + { + "epoch": 0.28817815683487324, + "grad_norm": 0.462890625, + "learning_rate": 0.0001618252987493622, + "loss": 0.4575, + "step": 6082 + }, + { + "epoch": 0.28822553897180764, + "grad_norm": 0.46484375, + "learning_rate": 0.00016181359279542927, + "loss": 0.6988, + "step": 6083 + }, + { + "epoch": 0.28827292110874203, + "grad_norm": 0.703125, + "learning_rate": 0.0001618018854705367, + "loss": 1.4512, + "step": 6084 + }, + { + "epoch": 0.28832030324567637, + "grad_norm": 0.76953125, + "learning_rate": 0.00016179017677494414, + "loss": 0.4298, + "step": 6085 + }, + { + "epoch": 0.28836768538261076, + "grad_norm": 0.63671875, + "learning_rate": 0.00016177846670891125, + "loss": 0.8053, + "step": 6086 + }, + { + "epoch": 0.28841506751954515, + "grad_norm": 0.53515625, + "learning_rate": 0.00016176675527269777, + "loss": 1.2651, + "step": 6087 + }, + { + "epoch": 0.2884624496564795, + "grad_norm": 0.048828125, + "learning_rate": 0.00016175504246656347, + "loss": 0.004, + "step": 6088 + }, + { + "epoch": 0.2885098317934139, + "grad_norm": 0.62109375, + "learning_rate": 0.0001617433282907681, + "loss": 0.7184, + "step": 6089 + }, + { + "epoch": 0.2885572139303483, + "grad_norm": 0.287109375, + "learning_rate": 0.00016173161274557146, + "loss": 0.0291, + "step": 6090 + }, + { + "epoch": 0.2886045960672826, + "grad_norm": 0.81640625, + "learning_rate": 0.00016171989583123341, + "loss": 1.0737, + "step": 6091 + }, + { + "epoch": 0.288651978204217, + "grad_norm": 0.5390625, + "learning_rate": 0.00016170817754801383, + "loss": 0.7138, + "step": 6092 + }, + { + "epoch": 0.2886993603411514, + "grad_norm": 0.55859375, + "learning_rate": 0.0001616964578961726, + "loss": 0.7767, + "step": 6093 + }, + { + "epoch": 0.28874674247808574, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00016168473687596963, + "loss": 0.0022, + "step": 6094 + }, + { + "epoch": 0.28879412461502013, + "grad_norm": 0.6796875, + "learning_rate": 0.0001616730144876649, + "loss": 0.6575, + "step": 6095 + }, + { + "epoch": 0.2888415067519545, + "grad_norm": 0.546875, + "learning_rate": 0.00016166129073151843, + "loss": 0.1054, + "step": 6096 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.70703125, + "learning_rate": 0.0001616495656077902, + "loss": 0.9698, + "step": 6097 + }, + { + "epoch": 0.28893627102582325, + "grad_norm": 0.66796875, + "learning_rate": 0.00016163783911674021, + "loss": 1.0205, + "step": 6098 + }, + { + "epoch": 0.28898365316275765, + "grad_norm": 0.59375, + "learning_rate": 0.00016162611125862867, + "loss": 0.8748, + "step": 6099 + }, + { + "epoch": 0.28903103529969204, + "grad_norm": 0.78515625, + "learning_rate": 0.00016161438203371562, + "loss": 1.3839, + "step": 6100 + }, + { + "epoch": 0.2890784174366264, + "grad_norm": 0.8203125, + "learning_rate": 0.0001616026514422612, + "loss": 0.9761, + "step": 6101 + }, + { + "epoch": 0.28912579957356077, + "grad_norm": 0.61328125, + "learning_rate": 0.00016159091948452555, + "loss": 1.2139, + "step": 6102 + }, + { + "epoch": 0.28917318171049516, + "grad_norm": 0.94140625, + "learning_rate": 0.00016157918616076895, + "loss": 1.4222, + "step": 6103 + }, + { + "epoch": 0.2892205638474295, + "grad_norm": 0.28125, + "learning_rate": 0.00016156745147125157, + "loss": 0.0964, + "step": 6104 + }, + { + "epoch": 0.2892679459843639, + "grad_norm": 0.73046875, + "learning_rate": 0.0001615557154162337, + "loss": 1.0896, + "step": 6105 + }, + { + "epoch": 0.2893153281212983, + "grad_norm": 0.41796875, + "learning_rate": 0.00016154397799597562, + "loss": 0.0228, + "step": 6106 + }, + { + "epoch": 0.2893627102582326, + "grad_norm": 0.63671875, + "learning_rate": 0.00016153223921073768, + "loss": 1.5017, + "step": 6107 + }, + { + "epoch": 0.289410092395167, + "grad_norm": 0.67578125, + "learning_rate": 0.0001615204990607802, + "loss": 0.9594, + "step": 6108 + }, + { + "epoch": 0.2894574745321014, + "grad_norm": 0.51171875, + "learning_rate": 0.0001615087575463636, + "loss": 0.7924, + "step": 6109 + }, + { + "epoch": 0.28950485666903575, + "grad_norm": 0.55078125, + "learning_rate": 0.00016149701466774827, + "loss": 0.9055, + "step": 6110 + }, + { + "epoch": 0.28955223880597014, + "grad_norm": 0.5390625, + "learning_rate": 0.00016148527042519466, + "loss": 0.648, + "step": 6111 + }, + { + "epoch": 0.28959962094290453, + "grad_norm": 0.68359375, + "learning_rate": 0.00016147352481896322, + "loss": 1.648, + "step": 6112 + }, + { + "epoch": 0.2896470030798389, + "grad_norm": 0.7109375, + "learning_rate": 0.0001614617778493145, + "loss": 1.1806, + "step": 6113 + }, + { + "epoch": 0.28969438521677326, + "grad_norm": 0.52734375, + "learning_rate": 0.000161450029516509, + "loss": 0.5153, + "step": 6114 + }, + { + "epoch": 0.28974176735370766, + "grad_norm": 1.03125, + "learning_rate": 0.0001614382798208073, + "loss": 1.104, + "step": 6115 + }, + { + "epoch": 0.28978914949064205, + "grad_norm": 0.6328125, + "learning_rate": 0.00016142652876247, + "loss": 0.916, + "step": 6116 + }, + { + "epoch": 0.2898365316275764, + "grad_norm": 0.66015625, + "learning_rate": 0.00016141477634175772, + "loss": 1.2782, + "step": 6117 + }, + { + "epoch": 0.2898839137645108, + "grad_norm": 0.67578125, + "learning_rate": 0.00016140302255893114, + "loss": 1.3098, + "step": 6118 + }, + { + "epoch": 0.2899312959014452, + "grad_norm": 0.1142578125, + "learning_rate": 0.0001613912674142509, + "loss": 0.0161, + "step": 6119 + }, + { + "epoch": 0.2899786780383795, + "grad_norm": 0.6484375, + "learning_rate": 0.00016137951090797775, + "loss": 1.2315, + "step": 6120 + }, + { + "epoch": 0.2900260601753139, + "grad_norm": 0.671875, + "learning_rate": 0.0001613677530403724, + "loss": 0.7916, + "step": 6121 + }, + { + "epoch": 0.2900734423122483, + "grad_norm": 0.7109375, + "learning_rate": 0.00016135599381169566, + "loss": 1.5569, + "step": 6122 + }, + { + "epoch": 0.29012082444918263, + "grad_norm": 0.5546875, + "learning_rate": 0.00016134423322220834, + "loss": 0.2116, + "step": 6123 + }, + { + "epoch": 0.29016820658611703, + "grad_norm": 0.63671875, + "learning_rate": 0.00016133247127217125, + "loss": 0.8461, + "step": 6124 + }, + { + "epoch": 0.2902155887230514, + "grad_norm": 0.66796875, + "learning_rate": 0.00016132070796184532, + "loss": 0.9446, + "step": 6125 + }, + { + "epoch": 0.29026297085998576, + "grad_norm": 0.61328125, + "learning_rate": 0.00016130894329149137, + "loss": 1.0857, + "step": 6126 + }, + { + "epoch": 0.29031035299692015, + "grad_norm": 0.55078125, + "learning_rate": 0.00016129717726137037, + "loss": 0.6333, + "step": 6127 + }, + { + "epoch": 0.29035773513385454, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016128540987174326, + "loss": 0.0076, + "step": 6128 + }, + { + "epoch": 0.29040511727078894, + "grad_norm": 0.734375, + "learning_rate": 0.00016127364112287107, + "loss": 1.0762, + "step": 6129 + }, + { + "epoch": 0.2904524994077233, + "grad_norm": 0.58203125, + "learning_rate": 0.00016126187101501475, + "loss": 1.1474, + "step": 6130 + }, + { + "epoch": 0.29049988154465767, + "grad_norm": 0.61328125, + "learning_rate": 0.0001612500995484354, + "loss": 1.068, + "step": 6131 + }, + { + "epoch": 0.29054726368159206, + "grad_norm": 0.46875, + "learning_rate": 0.00016123832672339407, + "loss": 0.1338, + "step": 6132 + }, + { + "epoch": 0.2905946458185264, + "grad_norm": 0.201171875, + "learning_rate": 0.00016122655254015192, + "loss": 0.0309, + "step": 6133 + }, + { + "epoch": 0.2906420279554608, + "grad_norm": 0.23046875, + "learning_rate": 0.00016121477699896998, + "loss": 0.0521, + "step": 6134 + }, + { + "epoch": 0.2906894100923952, + "grad_norm": 0.65625, + "learning_rate": 0.00016120300010010954, + "loss": 1.0097, + "step": 6135 + }, + { + "epoch": 0.2907367922293295, + "grad_norm": 0.640625, + "learning_rate": 0.00016119122184383175, + "loss": 0.8926, + "step": 6136 + }, + { + "epoch": 0.2907841743662639, + "grad_norm": 0.61328125, + "learning_rate": 0.0001611794422303978, + "loss": 1.2017, + "step": 6137 + }, + { + "epoch": 0.2908315565031983, + "grad_norm": 0.58984375, + "learning_rate": 0.000161167661260069, + "loss": 1.012, + "step": 6138 + }, + { + "epoch": 0.29087893864013264, + "grad_norm": 0.345703125, + "learning_rate": 0.00016115587893310665, + "loss": 0.0874, + "step": 6139 + }, + { + "epoch": 0.29092632077706704, + "grad_norm": 0.9296875, + "learning_rate": 0.00016114409524977202, + "loss": 1.116, + "step": 6140 + }, + { + "epoch": 0.29097370291400143, + "grad_norm": 0.6875, + "learning_rate": 0.0001611323102103265, + "loss": 0.6605, + "step": 6141 + }, + { + "epoch": 0.2910210850509358, + "grad_norm": 0.6015625, + "learning_rate": 0.00016112052381503147, + "loss": 1.1295, + "step": 6142 + }, + { + "epoch": 0.29106846718787016, + "grad_norm": 0.77734375, + "learning_rate": 0.0001611087360641483, + "loss": 0.9839, + "step": 6143 + }, + { + "epoch": 0.29111584932480455, + "grad_norm": 0.6875, + "learning_rate": 0.00016109694695793847, + "loss": 0.8536, + "step": 6144 + }, + { + "epoch": 0.29116323146173895, + "grad_norm": 0.349609375, + "learning_rate": 0.0001610851564966634, + "loss": 0.048, + "step": 6145 + }, + { + "epoch": 0.2912106135986733, + "grad_norm": 0.10888671875, + "learning_rate": 0.00016107336468058466, + "loss": 0.015, + "step": 6146 + }, + { + "epoch": 0.2912579957356077, + "grad_norm": 0.609375, + "learning_rate": 0.00016106157150996375, + "loss": 0.9039, + "step": 6147 + }, + { + "epoch": 0.29130537787254207, + "grad_norm": 0.9375, + "learning_rate": 0.0001610497769850622, + "loss": 0.2915, + "step": 6148 + }, + { + "epoch": 0.2913527600094764, + "grad_norm": 0.5703125, + "learning_rate": 0.00016103798110614164, + "loss": 0.757, + "step": 6149 + }, + { + "epoch": 0.2914001421464108, + "grad_norm": 0.68359375, + "learning_rate": 0.00016102618387346367, + "loss": 1.1858, + "step": 6150 + }, + { + "epoch": 0.2914475242833452, + "grad_norm": 0.71484375, + "learning_rate": 0.00016101438528728993, + "loss": 0.0603, + "step": 6151 + }, + { + "epoch": 0.29149490642027953, + "grad_norm": 0.61328125, + "learning_rate": 0.00016100258534788213, + "loss": 0.8641, + "step": 6152 + }, + { + "epoch": 0.2915422885572139, + "grad_norm": 0.50390625, + "learning_rate": 0.00016099078405550194, + "loss": 0.7625, + "step": 6153 + }, + { + "epoch": 0.2915896706941483, + "grad_norm": 0.59375, + "learning_rate": 0.00016097898141041115, + "loss": 1.667, + "step": 6154 + }, + { + "epoch": 0.29163705283108265, + "grad_norm": 0.5859375, + "learning_rate": 0.00016096717741287148, + "loss": 0.0808, + "step": 6155 + }, + { + "epoch": 0.29168443496801705, + "grad_norm": 0.115234375, + "learning_rate": 0.00016095537206314479, + "loss": 0.0055, + "step": 6156 + }, + { + "epoch": 0.29173181710495144, + "grad_norm": 0.50390625, + "learning_rate": 0.00016094356536149284, + "loss": 0.2689, + "step": 6157 + }, + { + "epoch": 0.29177919924188583, + "grad_norm": 0.60546875, + "learning_rate": 0.00016093175730817752, + "loss": 0.8178, + "step": 6158 + }, + { + "epoch": 0.29182658137882017, + "grad_norm": 0.2294921875, + "learning_rate": 0.00016091994790346076, + "loss": 0.0233, + "step": 6159 + }, + { + "epoch": 0.29187396351575456, + "grad_norm": 0.53515625, + "learning_rate": 0.00016090813714760442, + "loss": 0.1928, + "step": 6160 + }, + { + "epoch": 0.29192134565268896, + "grad_norm": 0.69921875, + "learning_rate": 0.0001608963250408705, + "loss": 0.1937, + "step": 6161 + }, + { + "epoch": 0.2919687277896233, + "grad_norm": 1.125, + "learning_rate": 0.0001608845115835209, + "loss": 0.5616, + "step": 6162 + }, + { + "epoch": 0.2920161099265577, + "grad_norm": 0.095703125, + "learning_rate": 0.00016087269677581774, + "loss": 0.0031, + "step": 6163 + }, + { + "epoch": 0.2920634920634921, + "grad_norm": 0.53125, + "learning_rate": 0.000160860880618023, + "loss": 1.07, + "step": 6164 + }, + { + "epoch": 0.2921108742004264, + "grad_norm": 0.6953125, + "learning_rate": 0.00016084906311039873, + "loss": 1.5735, + "step": 6165 + }, + { + "epoch": 0.2921582563373608, + "grad_norm": 0.75, + "learning_rate": 0.00016083724425320706, + "loss": 1.0827, + "step": 6166 + }, + { + "epoch": 0.2922056384742952, + "grad_norm": 0.10302734375, + "learning_rate": 0.0001608254240467101, + "loss": 0.0123, + "step": 6167 + }, + { + "epoch": 0.29225302061122954, + "grad_norm": 0.7890625, + "learning_rate": 0.00016081360249117004, + "loss": 0.8328, + "step": 6168 + }, + { + "epoch": 0.29230040274816393, + "grad_norm": 0.74609375, + "learning_rate": 0.00016080177958684906, + "loss": 0.9292, + "step": 6169 + }, + { + "epoch": 0.2923477848850983, + "grad_norm": 0.30859375, + "learning_rate": 0.00016078995533400935, + "loss": 0.1389, + "step": 6170 + }, + { + "epoch": 0.2923951670220327, + "grad_norm": 0.88671875, + "learning_rate": 0.0001607781297329132, + "loss": 0.8424, + "step": 6171 + }, + { + "epoch": 0.29244254915896706, + "grad_norm": 0.8828125, + "learning_rate": 0.00016076630278382287, + "loss": 0.8404, + "step": 6172 + }, + { + "epoch": 0.29248993129590145, + "grad_norm": 0.6328125, + "learning_rate": 0.00016075447448700067, + "loss": 1.0984, + "step": 6173 + }, + { + "epoch": 0.29253731343283584, + "grad_norm": 0.400390625, + "learning_rate": 0.00016074264484270892, + "loss": 0.494, + "step": 6174 + }, + { + "epoch": 0.2925846955697702, + "grad_norm": 0.2578125, + "learning_rate": 0.00016073081385121002, + "loss": 0.0795, + "step": 6175 + }, + { + "epoch": 0.2926320777067046, + "grad_norm": 0.6875, + "learning_rate": 0.00016071898151276637, + "loss": 0.9714, + "step": 6176 + }, + { + "epoch": 0.29267945984363897, + "grad_norm": 0.46484375, + "learning_rate": 0.00016070714782764035, + "loss": 0.9679, + "step": 6177 + }, + { + "epoch": 0.2927268419805733, + "grad_norm": 0.66015625, + "learning_rate": 0.00016069531279609448, + "loss": 1.077, + "step": 6178 + }, + { + "epoch": 0.2927742241175077, + "grad_norm": 0.6796875, + "learning_rate": 0.0001606834764183912, + "loss": 0.9103, + "step": 6179 + }, + { + "epoch": 0.2928216062544421, + "grad_norm": 0.55078125, + "learning_rate": 0.00016067163869479308, + "loss": 1.4553, + "step": 6180 + }, + { + "epoch": 0.29286898839137643, + "grad_norm": 0.62109375, + "learning_rate": 0.00016065979962556263, + "loss": 1.0385, + "step": 6181 + }, + { + "epoch": 0.2929163705283108, + "grad_norm": 0.50390625, + "learning_rate": 0.00016064795921096243, + "loss": 0.9758, + "step": 6182 + }, + { + "epoch": 0.2929637526652452, + "grad_norm": 0.431640625, + "learning_rate": 0.00016063611745125507, + "loss": 0.4767, + "step": 6183 + }, + { + "epoch": 0.29301113480217955, + "grad_norm": 0.5859375, + "learning_rate": 0.00016062427434670329, + "loss": 1.1962, + "step": 6184 + }, + { + "epoch": 0.29305851693911394, + "grad_norm": 0.8359375, + "learning_rate": 0.0001606124298975696, + "loss": 1.4655, + "step": 6185 + }, + { + "epoch": 0.29310589907604834, + "grad_norm": 0.515625, + "learning_rate": 0.0001606005841041168, + "loss": 0.8719, + "step": 6186 + }, + { + "epoch": 0.29315328121298273, + "grad_norm": 0.142578125, + "learning_rate": 0.00016058873696660761, + "loss": 0.0059, + "step": 6187 + }, + { + "epoch": 0.29320066334991707, + "grad_norm": 0.65234375, + "learning_rate": 0.00016057688848530475, + "loss": 0.9908, + "step": 6188 + }, + { + "epoch": 0.29324804548685146, + "grad_norm": 0.5859375, + "learning_rate": 0.00016056503866047103, + "loss": 1.0236, + "step": 6189 + }, + { + "epoch": 0.29329542762378585, + "grad_norm": 0.185546875, + "learning_rate": 0.00016055318749236928, + "loss": 0.1252, + "step": 6190 + }, + { + "epoch": 0.2933428097607202, + "grad_norm": 0.76171875, + "learning_rate": 0.00016054133498126229, + "loss": 1.286, + "step": 6191 + }, + { + "epoch": 0.2933901918976546, + "grad_norm": 0.1904296875, + "learning_rate": 0.00016052948112741302, + "loss": 0.0102, + "step": 6192 + }, + { + "epoch": 0.293437574034589, + "grad_norm": 0.69921875, + "learning_rate": 0.0001605176259310843, + "loss": 0.135, + "step": 6193 + }, + { + "epoch": 0.2934849561715233, + "grad_norm": 0.78515625, + "learning_rate": 0.00016050576939253912, + "loss": 1.089, + "step": 6194 + }, + { + "epoch": 0.2935323383084577, + "grad_norm": 0.6875, + "learning_rate": 0.00016049391151204043, + "loss": 0.9853, + "step": 6195 + }, + { + "epoch": 0.2935797204453921, + "grad_norm": 0.90625, + "learning_rate": 0.00016048205228985117, + "loss": 1.1275, + "step": 6196 + }, + { + "epoch": 0.29362710258232644, + "grad_norm": 0.0732421875, + "learning_rate": 0.00016047019172623448, + "loss": 0.0027, + "step": 6197 + }, + { + "epoch": 0.29367448471926083, + "grad_norm": 0.71484375, + "learning_rate": 0.0001604583298214533, + "loss": 0.9025, + "step": 6198 + }, + { + "epoch": 0.2937218668561952, + "grad_norm": 0.193359375, + "learning_rate": 0.0001604464665757708, + "loss": 0.1409, + "step": 6199 + }, + { + "epoch": 0.2937692489931296, + "grad_norm": 0.53515625, + "learning_rate": 0.00016043460198945, + "loss": 0.8453, + "step": 6200 + }, + { + "epoch": 0.29381663113006395, + "grad_norm": 0.75390625, + "learning_rate": 0.00016042273606275414, + "loss": 1.5202, + "step": 6201 + }, + { + "epoch": 0.29386401326699835, + "grad_norm": 0.66015625, + "learning_rate": 0.00016041086879594634, + "loss": 1.264, + "step": 6202 + }, + { + "epoch": 0.29391139540393274, + "grad_norm": 1.25, + "learning_rate": 0.0001603990001892898, + "loss": 0.3838, + "step": 6203 + }, + { + "epoch": 0.2939587775408671, + "grad_norm": 0.5703125, + "learning_rate": 0.0001603871302430478, + "loss": 0.1965, + "step": 6204 + }, + { + "epoch": 0.29400615967780147, + "grad_norm": 0.2353515625, + "learning_rate": 0.00016037525895748358, + "loss": 0.051, + "step": 6205 + }, + { + "epoch": 0.29405354181473586, + "grad_norm": 0.51953125, + "learning_rate": 0.0001603633863328604, + "loss": 0.1407, + "step": 6206 + }, + { + "epoch": 0.2941009239516702, + "grad_norm": 2.21875, + "learning_rate": 0.00016035151236944162, + "loss": 0.6282, + "step": 6207 + }, + { + "epoch": 0.2941483060886046, + "grad_norm": 0.76953125, + "learning_rate": 0.00016033963706749057, + "loss": 0.3471, + "step": 6208 + }, + { + "epoch": 0.294195688225539, + "grad_norm": 0.70703125, + "learning_rate": 0.00016032776042727064, + "loss": 0.8517, + "step": 6209 + }, + { + "epoch": 0.2942430703624733, + "grad_norm": 0.2373046875, + "learning_rate": 0.00016031588244904525, + "loss": 0.0247, + "step": 6210 + }, + { + "epoch": 0.2942904524994077, + "grad_norm": 0.234375, + "learning_rate": 0.00016030400313307784, + "loss": 0.0238, + "step": 6211 + }, + { + "epoch": 0.2943378346363421, + "grad_norm": 0.72265625, + "learning_rate": 0.00016029212247963185, + "loss": 0.9779, + "step": 6212 + }, + { + "epoch": 0.29438521677327645, + "grad_norm": 0.5234375, + "learning_rate": 0.0001602802404889708, + "loss": 0.1089, + "step": 6213 + }, + { + "epoch": 0.29443259891021084, + "grad_norm": 0.625, + "learning_rate": 0.00016026835716135827, + "loss": 1.0606, + "step": 6214 + }, + { + "epoch": 0.29447998104714523, + "grad_norm": 0.63671875, + "learning_rate": 0.00016025647249705776, + "loss": 0.7342, + "step": 6215 + }, + { + "epoch": 0.2945273631840796, + "grad_norm": 0.60546875, + "learning_rate": 0.00016024458649633282, + "loss": 1.044, + "step": 6216 + }, + { + "epoch": 0.29457474532101396, + "grad_norm": 0.7265625, + "learning_rate": 0.00016023269915944717, + "loss": 1.1679, + "step": 6217 + }, + { + "epoch": 0.29462212745794836, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0001602208104866644, + "loss": 0.0018, + "step": 6218 + }, + { + "epoch": 0.29466950959488275, + "grad_norm": 0.37109375, + "learning_rate": 0.0001602089204782482, + "loss": 0.1495, + "step": 6219 + }, + { + "epoch": 0.2947168917318171, + "grad_norm": 0.640625, + "learning_rate": 0.00016019702913446226, + "loss": 1.0678, + "step": 6220 + }, + { + "epoch": 0.2947642738687515, + "grad_norm": 0.53125, + "learning_rate": 0.00016018513645557034, + "loss": 0.6447, + "step": 6221 + }, + { + "epoch": 0.2948116560056859, + "grad_norm": 0.79296875, + "learning_rate": 0.0001601732424418362, + "loss": 0.8159, + "step": 6222 + }, + { + "epoch": 0.2948590381426202, + "grad_norm": 0.64453125, + "learning_rate": 0.00016016134709352365, + "loss": 0.7239, + "step": 6223 + }, + { + "epoch": 0.2949064202795546, + "grad_norm": 0.6328125, + "learning_rate": 0.0001601494504108965, + "loss": 1.0982, + "step": 6224 + }, + { + "epoch": 0.294953802416489, + "grad_norm": 0.5859375, + "learning_rate": 0.00016013755239421856, + "loss": 1.1708, + "step": 6225 + }, + { + "epoch": 0.29500118455342333, + "grad_norm": 0.265625, + "learning_rate": 0.00016012565304375381, + "loss": 0.0258, + "step": 6226 + }, + { + "epoch": 0.2950485666903577, + "grad_norm": 0.30859375, + "learning_rate": 0.00016011375235976612, + "loss": 0.0109, + "step": 6227 + }, + { + "epoch": 0.2950959488272921, + "grad_norm": 0.58203125, + "learning_rate": 0.00016010185034251944, + "loss": 0.8148, + "step": 6228 + }, + { + "epoch": 0.2951433309642265, + "grad_norm": 0.5859375, + "learning_rate": 0.00016008994699227773, + "loss": 0.4518, + "step": 6229 + }, + { + "epoch": 0.29519071310116085, + "grad_norm": 0.625, + "learning_rate": 0.00016007804230930498, + "loss": 0.6498, + "step": 6230 + }, + { + "epoch": 0.29523809523809524, + "grad_norm": 0.671875, + "learning_rate": 0.00016006613629386527, + "loss": 0.9114, + "step": 6231 + }, + { + "epoch": 0.29528547737502964, + "grad_norm": 0.58984375, + "learning_rate": 0.00016005422894622263, + "loss": 0.6616, + "step": 6232 + }, + { + "epoch": 0.295332859511964, + "grad_norm": 0.70703125, + "learning_rate": 0.0001600423202666411, + "loss": 1.1665, + "step": 6233 + }, + { + "epoch": 0.29538024164889837, + "grad_norm": 0.400390625, + "learning_rate": 0.00016003041025538495, + "loss": 0.0348, + "step": 6234 + }, + { + "epoch": 0.29542762378583276, + "grad_norm": 0.51953125, + "learning_rate": 0.0001600184989127182, + "loss": 0.5651, + "step": 6235 + }, + { + "epoch": 0.2954750059227671, + "grad_norm": 0.19921875, + "learning_rate": 0.00016000658623890508, + "loss": 0.1426, + "step": 6236 + }, + { + "epoch": 0.2955223880597015, + "grad_norm": 0.6640625, + "learning_rate": 0.00015999467223420979, + "loss": 1.3802, + "step": 6237 + }, + { + "epoch": 0.2955697701966359, + "grad_norm": 0.66015625, + "learning_rate": 0.00015998275689889656, + "loss": 0.0291, + "step": 6238 + }, + { + "epoch": 0.2956171523335702, + "grad_norm": 0.640625, + "learning_rate": 0.0001599708402332297, + "loss": 0.9275, + "step": 6239 + }, + { + "epoch": 0.2956645344705046, + "grad_norm": 0.33203125, + "learning_rate": 0.00015995892223747346, + "loss": 0.1245, + "step": 6240 + }, + { + "epoch": 0.295711916607439, + "grad_norm": 0.31640625, + "learning_rate": 0.0001599470029118922, + "loss": 0.1849, + "step": 6241 + }, + { + "epoch": 0.29575929874437334, + "grad_norm": 0.55078125, + "learning_rate": 0.00015993508225675029, + "loss": 0.4715, + "step": 6242 + }, + { + "epoch": 0.29580668088130774, + "grad_norm": 0.55859375, + "learning_rate": 0.00015992316027231204, + "loss": 0.1159, + "step": 6243 + }, + { + "epoch": 0.29585406301824213, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015991123695884197, + "loss": 0.0317, + "step": 6244 + }, + { + "epoch": 0.2959014451551765, + "grad_norm": 0.63671875, + "learning_rate": 0.00015989931231660446, + "loss": 0.5017, + "step": 6245 + }, + { + "epoch": 0.29594882729211086, + "grad_norm": 0.61328125, + "learning_rate": 0.000159887386345864, + "loss": 0.939, + "step": 6246 + }, + { + "epoch": 0.29599620942904525, + "grad_norm": 1.46875, + "learning_rate": 0.00015987545904688514, + "loss": 1.6365, + "step": 6247 + }, + { + "epoch": 0.29604359156597965, + "grad_norm": 0.3046875, + "learning_rate": 0.0001598635304199323, + "loss": 0.0365, + "step": 6248 + }, + { + "epoch": 0.296090973702914, + "grad_norm": 0.193359375, + "learning_rate": 0.0001598516004652702, + "loss": 0.0459, + "step": 6249 + }, + { + "epoch": 0.2961383558398484, + "grad_norm": 0.283203125, + "learning_rate": 0.0001598396691831633, + "loss": 0.1533, + "step": 6250 + }, + { + "epoch": 0.29618573797678277, + "grad_norm": 0.6875, + "learning_rate": 0.00015982773657387627, + "loss": 0.6514, + "step": 6251 + }, + { + "epoch": 0.2962331201137171, + "grad_norm": 0.74609375, + "learning_rate": 0.00015981580263767383, + "loss": 1.3279, + "step": 6252 + }, + { + "epoch": 0.2962805022506515, + "grad_norm": 0.65625, + "learning_rate": 0.00015980386737482057, + "loss": 1.3035, + "step": 6253 + }, + { + "epoch": 0.2963278843875859, + "grad_norm": 0.56640625, + "learning_rate": 0.00015979193078558118, + "loss": 0.7709, + "step": 6254 + }, + { + "epoch": 0.29637526652452023, + "grad_norm": 0.68359375, + "learning_rate": 0.00015977999287022053, + "loss": 0.9822, + "step": 6255 + }, + { + "epoch": 0.2964226486614546, + "grad_norm": 0.42578125, + "learning_rate": 0.00015976805362900325, + "loss": 0.2037, + "step": 6256 + }, + { + "epoch": 0.296470030798389, + "grad_norm": 1.1328125, + "learning_rate": 0.00015975611306219423, + "loss": 1.2106, + "step": 6257 + }, + { + "epoch": 0.2965174129353234, + "grad_norm": 0.62890625, + "learning_rate": 0.0001597441711700583, + "loss": 0.7787, + "step": 6258 + }, + { + "epoch": 0.29656479507225775, + "grad_norm": 0.78515625, + "learning_rate": 0.00015973222795286025, + "loss": 1.0039, + "step": 6259 + }, + { + "epoch": 0.29661217720919214, + "grad_norm": 0.59765625, + "learning_rate": 0.000159720283410865, + "loss": 0.9096, + "step": 6260 + }, + { + "epoch": 0.29665955934612653, + "grad_norm": 0.267578125, + "learning_rate": 0.00015970833754433753, + "loss": 0.1478, + "step": 6261 + }, + { + "epoch": 0.29670694148306087, + "grad_norm": 0.291015625, + "learning_rate": 0.00015969639035354267, + "loss": 0.0838, + "step": 6262 + }, + { + "epoch": 0.29675432361999526, + "grad_norm": 0.76953125, + "learning_rate": 0.0001596844418387455, + "loss": 1.3852, + "step": 6263 + }, + { + "epoch": 0.29680170575692966, + "grad_norm": 0.5078125, + "learning_rate": 0.00015967249200021094, + "loss": 0.9298, + "step": 6264 + }, + { + "epoch": 0.296849087893864, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001596605408382041, + "loss": 0.022, + "step": 6265 + }, + { + "epoch": 0.2968964700307984, + "grad_norm": 0.4609375, + "learning_rate": 0.00015964858835299003, + "loss": 0.2375, + "step": 6266 + }, + { + "epoch": 0.2969438521677328, + "grad_norm": 0.8359375, + "learning_rate": 0.00015963663454483378, + "loss": 1.151, + "step": 6267 + }, + { + "epoch": 0.2969912343046671, + "grad_norm": 0.02978515625, + "learning_rate": 0.00015962467941400048, + "loss": 0.0012, + "step": 6268 + }, + { + "epoch": 0.2970386164416015, + "grad_norm": 0.65234375, + "learning_rate": 0.0001596127229607553, + "loss": 1.225, + "step": 6269 + }, + { + "epoch": 0.2970859985785359, + "grad_norm": 0.5390625, + "learning_rate": 0.00015960076518536345, + "loss": 0.9748, + "step": 6270 + }, + { + "epoch": 0.29713338071547024, + "grad_norm": 0.56640625, + "learning_rate": 0.0001595888060880901, + "loss": 1.0452, + "step": 6271 + }, + { + "epoch": 0.29718076285240463, + "grad_norm": 0.58203125, + "learning_rate": 0.00015957684566920052, + "loss": 0.8657, + "step": 6272 + }, + { + "epoch": 0.297228144989339, + "grad_norm": 0.6484375, + "learning_rate": 0.00015956488392895995, + "loss": 0.9778, + "step": 6273 + }, + { + "epoch": 0.2972755271262734, + "grad_norm": 0.71875, + "learning_rate": 0.00015955292086763373, + "loss": 1.4371, + "step": 6274 + }, + { + "epoch": 0.29732290926320776, + "grad_norm": 0.58203125, + "learning_rate": 0.0001595409564854871, + "loss": 0.6994, + "step": 6275 + }, + { + "epoch": 0.29737029140014215, + "grad_norm": 0.65625, + "learning_rate": 0.0001595289907827855, + "loss": 0.788, + "step": 6276 + }, + { + "epoch": 0.29741767353707654, + "grad_norm": 0.376953125, + "learning_rate": 0.00015951702375979426, + "loss": 0.1758, + "step": 6277 + }, + { + "epoch": 0.2974650556740109, + "grad_norm": 0.69140625, + "learning_rate": 0.00015950505541677886, + "loss": 0.1653, + "step": 6278 + }, + { + "epoch": 0.2975124378109453, + "grad_norm": 0.828125, + "learning_rate": 0.00015949308575400473, + "loss": 1.3668, + "step": 6279 + }, + { + "epoch": 0.29755981994787967, + "grad_norm": 0.0205078125, + "learning_rate": 0.0001594811147717373, + "loss": 0.0009, + "step": 6280 + }, + { + "epoch": 0.297607202084814, + "grad_norm": 0.546875, + "learning_rate": 0.00015946914247024212, + "loss": 0.826, + "step": 6281 + }, + { + "epoch": 0.2976545842217484, + "grad_norm": 0.2890625, + "learning_rate": 0.0001594571688497847, + "loss": 0.1798, + "step": 6282 + }, + { + "epoch": 0.2977019663586828, + "grad_norm": 0.60546875, + "learning_rate": 0.0001594451939106306, + "loss": 1.074, + "step": 6283 + }, + { + "epoch": 0.2977493484956171, + "grad_norm": 0.6640625, + "learning_rate": 0.00015943321765304538, + "loss": 0.8172, + "step": 6284 + }, + { + "epoch": 0.2977967306325515, + "grad_norm": 0.703125, + "learning_rate": 0.00015942124007729475, + "loss": 0.1754, + "step": 6285 + }, + { + "epoch": 0.2978441127694859, + "grad_norm": 0.7890625, + "learning_rate": 0.0001594092611836443, + "loss": 1.321, + "step": 6286 + }, + { + "epoch": 0.2978914949064203, + "grad_norm": 0.8984375, + "learning_rate": 0.0001593972809723597, + "loss": 1.0648, + "step": 6287 + }, + { + "epoch": 0.29793887704335464, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001593852994437067, + "loss": 0.0259, + "step": 6288 + }, + { + "epoch": 0.29798625918028904, + "grad_norm": 0.64453125, + "learning_rate": 0.00015937331659795101, + "loss": 1.3141, + "step": 6289 + }, + { + "epoch": 0.29803364131722343, + "grad_norm": 0.435546875, + "learning_rate": 0.00015936133243535838, + "loss": 0.2187, + "step": 6290 + }, + { + "epoch": 0.29808102345415777, + "grad_norm": 0.6796875, + "learning_rate": 0.00015934934695619463, + "loss": 0.1718, + "step": 6291 + }, + { + "epoch": 0.29812840559109216, + "grad_norm": 1.0078125, + "learning_rate": 0.00015933736016072558, + "loss": 1.254, + "step": 6292 + }, + { + "epoch": 0.29817578772802655, + "grad_norm": 1.1015625, + "learning_rate": 0.0001593253720492171, + "loss": 0.7333, + "step": 6293 + }, + { + "epoch": 0.2982231698649609, + "grad_norm": 0.9453125, + "learning_rate": 0.00015931338262193501, + "loss": 0.3758, + "step": 6294 + }, + { + "epoch": 0.2982705520018953, + "grad_norm": 0.7734375, + "learning_rate": 0.00015930139187914533, + "loss": 0.9309, + "step": 6295 + }, + { + "epoch": 0.2983179341388297, + "grad_norm": 0.578125, + "learning_rate": 0.0001592893998211139, + "loss": 1.0219, + "step": 6296 + }, + { + "epoch": 0.298365316275764, + "grad_norm": 0.609375, + "learning_rate": 0.00015927740644810677, + "loss": 1.0214, + "step": 6297 + }, + { + "epoch": 0.2984126984126984, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001592654117603899, + "loss": 0.1297, + "step": 6298 + }, + { + "epoch": 0.2984600805496328, + "grad_norm": 0.50390625, + "learning_rate": 0.00015925341575822932, + "loss": 0.0671, + "step": 6299 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.6328125, + "learning_rate": 0.00015924141844189107, + "loss": 0.7515, + "step": 6300 + }, + { + "epoch": 0.29855484482350153, + "grad_norm": 0.6875, + "learning_rate": 0.00015922941981164128, + "loss": 1.2515, + "step": 6301 + }, + { + "epoch": 0.2986022269604359, + "grad_norm": 0.60546875, + "learning_rate": 0.00015921741986774603, + "loss": 0.9278, + "step": 6302 + }, + { + "epoch": 0.2986496090973703, + "grad_norm": 0.54296875, + "learning_rate": 0.0001592054186104715, + "loss": 1.3154, + "step": 6303 + }, + { + "epoch": 0.29869699123430465, + "grad_norm": 0.640625, + "learning_rate": 0.00015919341604008383, + "loss": 0.8406, + "step": 6304 + }, + { + "epoch": 0.29874437337123905, + "grad_norm": 0.65625, + "learning_rate": 0.0001591814121568493, + "loss": 1.1043, + "step": 6305 + }, + { + "epoch": 0.29879175550817344, + "grad_norm": 0.1455078125, + "learning_rate": 0.000159169406961034, + "loss": 0.0178, + "step": 6306 + }, + { + "epoch": 0.2988391376451078, + "grad_norm": 0.99609375, + "learning_rate": 0.00015915740045290434, + "loss": 0.0587, + "step": 6307 + }, + { + "epoch": 0.29888651978204217, + "grad_norm": 0.625, + "learning_rate": 0.0001591453926327265, + "loss": 0.9912, + "step": 6308 + }, + { + "epoch": 0.29893390191897656, + "grad_norm": 0.1640625, + "learning_rate": 0.0001591333835007669, + "loss": 0.0063, + "step": 6309 + }, + { + "epoch": 0.2989812840559109, + "grad_norm": 0.6796875, + "learning_rate": 0.00015912137305729181, + "loss": 1.1696, + "step": 6310 + }, + { + "epoch": 0.2990286661928453, + "grad_norm": 0.65625, + "learning_rate": 0.00015910936130256764, + "loss": 0.1296, + "step": 6311 + }, + { + "epoch": 0.2990760483297797, + "grad_norm": 0.7109375, + "learning_rate": 0.00015909734823686081, + "loss": 1.0156, + "step": 6312 + }, + { + "epoch": 0.299123430466714, + "grad_norm": 0.24609375, + "learning_rate": 0.00015908533386043775, + "loss": 0.1437, + "step": 6313 + }, + { + "epoch": 0.2991708126036484, + "grad_norm": 0.62109375, + "learning_rate": 0.0001590733181735649, + "loss": 1.0903, + "step": 6314 + }, + { + "epoch": 0.2992181947405828, + "grad_norm": 0.68359375, + "learning_rate": 0.00015906130117650878, + "loss": 1.1543, + "step": 6315 + }, + { + "epoch": 0.2992655768775172, + "grad_norm": 0.06982421875, + "learning_rate": 0.00015904928286953593, + "loss": 0.0095, + "step": 6316 + }, + { + "epoch": 0.29931295901445154, + "grad_norm": 0.69140625, + "learning_rate": 0.00015903726325291285, + "loss": 0.9702, + "step": 6317 + }, + { + "epoch": 0.29936034115138593, + "grad_norm": 0.734375, + "learning_rate": 0.00015902524232690616, + "loss": 0.9361, + "step": 6318 + }, + { + "epoch": 0.2994077232883203, + "grad_norm": 0.90234375, + "learning_rate": 0.00015901322009178248, + "loss": 1.4427, + "step": 6319 + }, + { + "epoch": 0.29945510542525466, + "grad_norm": 0.5234375, + "learning_rate": 0.00015900119654780842, + "loss": 1.0989, + "step": 6320 + }, + { + "epoch": 0.29950248756218906, + "grad_norm": 0.220703125, + "learning_rate": 0.0001589891716952507, + "loss": 0.1506, + "step": 6321 + }, + { + "epoch": 0.29954986969912345, + "grad_norm": 0.435546875, + "learning_rate": 0.00015897714553437598, + "loss": 1.2756, + "step": 6322 + }, + { + "epoch": 0.2995972518360578, + "grad_norm": 0.93359375, + "learning_rate": 0.00015896511806545095, + "loss": 0.8838, + "step": 6323 + }, + { + "epoch": 0.2996446339729922, + "grad_norm": 0.212890625, + "learning_rate": 0.00015895308928874245, + "loss": 0.1703, + "step": 6324 + }, + { + "epoch": 0.2996920161099266, + "grad_norm": 0.6328125, + "learning_rate": 0.0001589410592045172, + "loss": 1.1891, + "step": 6325 + }, + { + "epoch": 0.2997393982468609, + "grad_norm": 0.1328125, + "learning_rate": 0.00015892902781304203, + "loss": 0.0167, + "step": 6326 + }, + { + "epoch": 0.2997867803837953, + "grad_norm": 0.54296875, + "learning_rate": 0.00015891699511458383, + "loss": 1.7041, + "step": 6327 + }, + { + "epoch": 0.2998341625207297, + "grad_norm": 0.7734375, + "learning_rate": 0.00015890496110940937, + "loss": 1.2175, + "step": 6328 + }, + { + "epoch": 0.29988154465766403, + "grad_norm": 0.8359375, + "learning_rate": 0.00015889292579778568, + "loss": 0.6999, + "step": 6329 + }, + { + "epoch": 0.2999289267945984, + "grad_norm": 0.65625, + "learning_rate": 0.00015888088917997962, + "loss": 0.7467, + "step": 6330 + }, + { + "epoch": 0.2999763089315328, + "grad_norm": 0.04931640625, + "learning_rate": 0.00015886885125625813, + "loss": 0.0019, + "step": 6331 + }, + { + "epoch": 0.3000236910684672, + "grad_norm": 0.71484375, + "learning_rate": 0.0001588568120268882, + "loss": 1.3148, + "step": 6332 + }, + { + "epoch": 0.30007107320540155, + "grad_norm": 0.57421875, + "learning_rate": 0.0001588447714921369, + "loss": 0.9488, + "step": 6333 + }, + { + "epoch": 0.30011845534233594, + "grad_norm": 0.47265625, + "learning_rate": 0.00015883272965227125, + "loss": 0.1925, + "step": 6334 + }, + { + "epoch": 0.30016583747927034, + "grad_norm": 0.73828125, + "learning_rate": 0.00015882068650755832, + "loss": 0.9735, + "step": 6335 + }, + { + "epoch": 0.3002132196162047, + "grad_norm": 0.4765625, + "learning_rate": 0.0001588086420582652, + "loss": 0.1459, + "step": 6336 + }, + { + "epoch": 0.30026060175313907, + "grad_norm": 0.55859375, + "learning_rate": 0.00015879659630465905, + "loss": 1.1262, + "step": 6337 + }, + { + "epoch": 0.30030798389007346, + "grad_norm": 0.91015625, + "learning_rate": 0.00015878454924700706, + "loss": 1.0842, + "step": 6338 + }, + { + "epoch": 0.3003553660270078, + "grad_norm": 1.03125, + "learning_rate": 0.00015877250088557635, + "loss": 0.9827, + "step": 6339 + }, + { + "epoch": 0.3004027481639422, + "grad_norm": 0.66015625, + "learning_rate": 0.00015876045122063416, + "loss": 1.1656, + "step": 6340 + }, + { + "epoch": 0.3004501303008766, + "grad_norm": 0.1298828125, + "learning_rate": 0.00015874840025244775, + "loss": 0.013, + "step": 6341 + }, + { + "epoch": 0.3004975124378109, + "grad_norm": 0.87109375, + "learning_rate": 0.00015873634798128442, + "loss": 0.7893, + "step": 6342 + }, + { + "epoch": 0.3005448945747453, + "grad_norm": 0.66796875, + "learning_rate": 0.00015872429440741143, + "loss": 0.7876, + "step": 6343 + }, + { + "epoch": 0.3005922767116797, + "grad_norm": 2.078125, + "learning_rate": 0.00015871223953109616, + "loss": 0.8104, + "step": 6344 + }, + { + "epoch": 0.3006396588486141, + "grad_norm": 1.6328125, + "learning_rate": 0.00015870018335260597, + "loss": 0.6066, + "step": 6345 + }, + { + "epoch": 0.30068704098554844, + "grad_norm": 0.57421875, + "learning_rate": 0.0001586881258722082, + "loss": 0.7675, + "step": 6346 + }, + { + "epoch": 0.30073442312248283, + "grad_norm": 0.58984375, + "learning_rate": 0.00015867606709017032, + "loss": 0.0975, + "step": 6347 + }, + { + "epoch": 0.3007818052594172, + "grad_norm": 0.353515625, + "learning_rate": 0.00015866400700675977, + "loss": 0.0579, + "step": 6348 + }, + { + "epoch": 0.30082918739635156, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015865194562224402, + "loss": 0.0256, + "step": 6349 + }, + { + "epoch": 0.30087656953328595, + "grad_norm": 0.61328125, + "learning_rate": 0.00015863988293689062, + "loss": 0.5278, + "step": 6350 + }, + { + "epoch": 0.30092395167022035, + "grad_norm": 0.60546875, + "learning_rate": 0.00015862781895096707, + "loss": 1.3326, + "step": 6351 + }, + { + "epoch": 0.3009713338071547, + "grad_norm": 0.53515625, + "learning_rate": 0.00015861575366474094, + "loss": 1.1228, + "step": 6352 + }, + { + "epoch": 0.3010187159440891, + "grad_norm": 0.51953125, + "learning_rate": 0.00015860368707847983, + "loss": 0.8816, + "step": 6353 + }, + { + "epoch": 0.30106609808102347, + "grad_norm": 0.55859375, + "learning_rate": 0.00015859161919245133, + "loss": 0.7351, + "step": 6354 + }, + { + "epoch": 0.3011134802179578, + "grad_norm": 0.064453125, + "learning_rate": 0.00015857955000692317, + "loss": 0.0034, + "step": 6355 + }, + { + "epoch": 0.3011608623548922, + "grad_norm": 0.58984375, + "learning_rate": 0.00015856747952216297, + "loss": 0.9956, + "step": 6356 + }, + { + "epoch": 0.3012082444918266, + "grad_norm": 0.61328125, + "learning_rate": 0.00015855540773843845, + "loss": 0.1596, + "step": 6357 + }, + { + "epoch": 0.30125562662876093, + "grad_norm": 0.228515625, + "learning_rate": 0.00015854333465601736, + "loss": 0.1357, + "step": 6358 + }, + { + "epoch": 0.3013030087656953, + "grad_norm": 0.484375, + "learning_rate": 0.0001585312602751675, + "loss": 0.7673, + "step": 6359 + }, + { + "epoch": 0.3013503909026297, + "grad_norm": 0.81640625, + "learning_rate": 0.0001585191845961566, + "loss": 1.2014, + "step": 6360 + }, + { + "epoch": 0.3013977730395641, + "grad_norm": 0.515625, + "learning_rate": 0.00015850710761925252, + "loss": 1.1184, + "step": 6361 + }, + { + "epoch": 0.30144515517649845, + "grad_norm": 0.62109375, + "learning_rate": 0.0001584950293447231, + "loss": 0.7689, + "step": 6362 + }, + { + "epoch": 0.30149253731343284, + "grad_norm": 0.96484375, + "learning_rate": 0.00015848294977283624, + "loss": 1.2459, + "step": 6363 + }, + { + "epoch": 0.30153991945036723, + "grad_norm": 0.58984375, + "learning_rate": 0.00015847086890385988, + "loss": 0.8124, + "step": 6364 + }, + { + "epoch": 0.30158730158730157, + "grad_norm": 0.1884765625, + "learning_rate": 0.00015845878673806187, + "loss": 0.1512, + "step": 6365 + }, + { + "epoch": 0.30163468372423596, + "grad_norm": 0.00567626953125, + "learning_rate": 0.00015844670327571029, + "loss": 0.0003, + "step": 6366 + }, + { + "epoch": 0.30168206586117036, + "grad_norm": 0.32421875, + "learning_rate": 0.00015843461851707303, + "loss": 0.0213, + "step": 6367 + }, + { + "epoch": 0.3017294479981047, + "grad_norm": 0.80859375, + "learning_rate": 0.00015842253246241823, + "loss": 1.3032, + "step": 6368 + }, + { + "epoch": 0.3017768301350391, + "grad_norm": 0.66796875, + "learning_rate": 0.00015841044511201387, + "loss": 1.1587, + "step": 6369 + }, + { + "epoch": 0.3018242122719735, + "grad_norm": 0.3203125, + "learning_rate": 0.00015839835646612804, + "loss": 0.1073, + "step": 6370 + }, + { + "epoch": 0.3018715944089078, + "grad_norm": 0.416015625, + "learning_rate": 0.00015838626652502888, + "loss": 0.0134, + "step": 6371 + }, + { + "epoch": 0.3019189765458422, + "grad_norm": 0.474609375, + "learning_rate": 0.0001583741752889845, + "loss": 0.6428, + "step": 6372 + }, + { + "epoch": 0.3019663586827766, + "grad_norm": 1.265625, + "learning_rate": 0.00015836208275826313, + "loss": 0.8704, + "step": 6373 + }, + { + "epoch": 0.302013740819711, + "grad_norm": 0.322265625, + "learning_rate": 0.00015834998893313288, + "loss": 0.1387, + "step": 6374 + }, + { + "epoch": 0.30206112295664533, + "grad_norm": 0.53515625, + "learning_rate": 0.00015833789381386205, + "loss": 0.673, + "step": 6375 + }, + { + "epoch": 0.3021085050935797, + "grad_norm": 0.73046875, + "learning_rate": 0.0001583257974007189, + "loss": 0.9997, + "step": 6376 + }, + { + "epoch": 0.3021558872305141, + "grad_norm": 0.734375, + "learning_rate": 0.00015831369969397164, + "loss": 1.2411, + "step": 6377 + }, + { + "epoch": 0.30220326936744846, + "grad_norm": 0.52734375, + "learning_rate": 0.00015830160069388866, + "loss": 0.7677, + "step": 6378 + }, + { + "epoch": 0.30225065150438285, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00015828950040073828, + "loss": 0.0027, + "step": 6379 + }, + { + "epoch": 0.30229803364131724, + "grad_norm": 0.43359375, + "learning_rate": 0.00015827739881478888, + "loss": 0.0299, + "step": 6380 + }, + { + "epoch": 0.3023454157782516, + "grad_norm": 0.8359375, + "learning_rate": 0.00015826529593630881, + "loss": 1.3019, + "step": 6381 + }, + { + "epoch": 0.302392797915186, + "grad_norm": 0.609375, + "learning_rate": 0.00015825319176556658, + "loss": 0.8761, + "step": 6382 + }, + { + "epoch": 0.30244018005212037, + "grad_norm": 0.609375, + "learning_rate": 0.00015824108630283057, + "loss": 1.1102, + "step": 6383 + }, + { + "epoch": 0.3024875621890547, + "grad_norm": 0.6015625, + "learning_rate": 0.00015822897954836931, + "loss": 1.1638, + "step": 6384 + }, + { + "epoch": 0.3025349443259891, + "grad_norm": 0.53125, + "learning_rate": 0.00015821687150245132, + "loss": 0.7149, + "step": 6385 + }, + { + "epoch": 0.3025823264629235, + "grad_norm": 0.61328125, + "learning_rate": 0.0001582047621653451, + "loss": 1.0962, + "step": 6386 + }, + { + "epoch": 0.3026297085998578, + "grad_norm": 0.5859375, + "learning_rate": 0.0001581926515373193, + "loss": 1.4139, + "step": 6387 + }, + { + "epoch": 0.3026770907367922, + "grad_norm": 0.068359375, + "learning_rate": 0.00015818053961864242, + "loss": 0.0104, + "step": 6388 + }, + { + "epoch": 0.3027244728737266, + "grad_norm": 1.3984375, + "learning_rate": 0.00015816842640958321, + "loss": 0.7952, + "step": 6389 + }, + { + "epoch": 0.302771855010661, + "grad_norm": 0.59375, + "learning_rate": 0.0001581563119104102, + "loss": 1.1018, + "step": 6390 + }, + { + "epoch": 0.30281923714759534, + "grad_norm": 0.240234375, + "learning_rate": 0.00015814419612139214, + "loss": 0.1604, + "step": 6391 + }, + { + "epoch": 0.30286661928452974, + "grad_norm": 0.70703125, + "learning_rate": 0.0001581320790427978, + "loss": 0.9539, + "step": 6392 + }, + { + "epoch": 0.30291400142146413, + "grad_norm": 0.66796875, + "learning_rate": 0.0001581199606748958, + "loss": 0.7522, + "step": 6393 + }, + { + "epoch": 0.30296138355839847, + "grad_norm": 0.6171875, + "learning_rate": 0.000158107841017955, + "loss": 0.8823, + "step": 6394 + }, + { + "epoch": 0.30300876569533286, + "grad_norm": 0.48046875, + "learning_rate": 0.0001580957200722442, + "loss": 0.034, + "step": 6395 + }, + { + "epoch": 0.30305614783226725, + "grad_norm": 0.2109375, + "learning_rate": 0.0001580835978380322, + "loss": 0.15, + "step": 6396 + }, + { + "epoch": 0.3031035299692016, + "grad_norm": 0.302734375, + "learning_rate": 0.00015807147431558786, + "loss": 0.1194, + "step": 6397 + }, + { + "epoch": 0.303150912106136, + "grad_norm": 0.625, + "learning_rate": 0.00015805934950518006, + "loss": 1.0244, + "step": 6398 + }, + { + "epoch": 0.3031982942430704, + "grad_norm": 0.12890625, + "learning_rate": 0.00015804722340707778, + "loss": 0.0147, + "step": 6399 + }, + { + "epoch": 0.3032456763800047, + "grad_norm": 0.7890625, + "learning_rate": 0.00015803509602154984, + "loss": 0.7737, + "step": 6400 + }, + { + "epoch": 0.3032930585169391, + "grad_norm": 0.478515625, + "learning_rate": 0.00015802296734886536, + "loss": 0.5766, + "step": 6401 + }, + { + "epoch": 0.3033404406538735, + "grad_norm": 0.56640625, + "learning_rate": 0.00015801083738929323, + "loss": 0.854, + "step": 6402 + }, + { + "epoch": 0.3033878227908079, + "grad_norm": 0.2041015625, + "learning_rate": 0.00015799870614310253, + "loss": 0.1264, + "step": 6403 + }, + { + "epoch": 0.30343520492774223, + "grad_norm": 0.6015625, + "learning_rate": 0.00015798657361056228, + "loss": 1.1678, + "step": 6404 + }, + { + "epoch": 0.3034825870646766, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001579744397919416, + "loss": 0.1216, + "step": 6405 + }, + { + "epoch": 0.303529969201611, + "grad_norm": 0.7734375, + "learning_rate": 0.00015796230468750961, + "loss": 1.2662, + "step": 6406 + }, + { + "epoch": 0.30357735133854535, + "grad_norm": 0.4140625, + "learning_rate": 0.00015795016829753546, + "loss": 0.1355, + "step": 6407 + }, + { + "epoch": 0.30362473347547975, + "grad_norm": 0.90234375, + "learning_rate": 0.00015793803062228828, + "loss": 0.0243, + "step": 6408 + }, + { + "epoch": 0.30367211561241414, + "grad_norm": 0.59765625, + "learning_rate": 0.00015792589166203729, + "loss": 1.1711, + "step": 6409 + }, + { + "epoch": 0.3037194977493485, + "grad_norm": 0.7578125, + "learning_rate": 0.0001579137514170517, + "loss": 1.1089, + "step": 6410 + }, + { + "epoch": 0.30376687988628287, + "grad_norm": 0.53515625, + "learning_rate": 0.00015790160988760082, + "loss": 0.8376, + "step": 6411 + }, + { + "epoch": 0.30381426202321726, + "grad_norm": 0.6640625, + "learning_rate": 0.0001578894670739539, + "loss": 1.0766, + "step": 6412 + }, + { + "epoch": 0.3038616441601516, + "grad_norm": 0.8984375, + "learning_rate": 0.00015787732297638027, + "loss": 1.198, + "step": 6413 + }, + { + "epoch": 0.303909026297086, + "grad_norm": 0.57421875, + "learning_rate": 0.00015786517759514926, + "loss": 0.8732, + "step": 6414 + }, + { + "epoch": 0.3039564084340204, + "grad_norm": 0.54296875, + "learning_rate": 0.00015785303093053025, + "loss": 0.0758, + "step": 6415 + }, + { + "epoch": 0.3040037905709547, + "grad_norm": 0.89453125, + "learning_rate": 0.00015784088298279264, + "loss": 0.0796, + "step": 6416 + }, + { + "epoch": 0.3040511727078891, + "grad_norm": 0.6953125, + "learning_rate": 0.00015782873375220583, + "loss": 1.0285, + "step": 6417 + }, + { + "epoch": 0.3040985548448235, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001578165832390393, + "loss": 0.0299, + "step": 6418 + }, + { + "epoch": 0.3041459369817579, + "grad_norm": 0.57421875, + "learning_rate": 0.0001578044314435626, + "loss": 1.039, + "step": 6419 + }, + { + "epoch": 0.30419331911869224, + "grad_norm": 0.58203125, + "learning_rate": 0.0001577922783660451, + "loss": 1.2203, + "step": 6420 + }, + { + "epoch": 0.30424070125562663, + "grad_norm": 0.69921875, + "learning_rate": 0.00015778012400675647, + "loss": 1.0667, + "step": 6421 + }, + { + "epoch": 0.304288083392561, + "grad_norm": 0.61328125, + "learning_rate": 0.0001577679683659662, + "loss": 0.1458, + "step": 6422 + }, + { + "epoch": 0.30433546552949536, + "grad_norm": 0.2578125, + "learning_rate": 0.00015775581144394395, + "loss": 0.0251, + "step": 6423 + }, + { + "epoch": 0.30438284766642976, + "grad_norm": 0.4609375, + "learning_rate": 0.00015774365324095936, + "loss": 1.1773, + "step": 6424 + }, + { + "epoch": 0.30443022980336415, + "grad_norm": 0.55859375, + "learning_rate": 0.00015773149375728198, + "loss": 0.9736, + "step": 6425 + }, + { + "epoch": 0.3044776119402985, + "grad_norm": 0.234375, + "learning_rate": 0.0001577193329931816, + "loss": 0.0459, + "step": 6426 + }, + { + "epoch": 0.3045249940772329, + "grad_norm": 0.8671875, + "learning_rate": 0.00015770717094892785, + "loss": 0.9453, + "step": 6427 + }, + { + "epoch": 0.3045723762141673, + "grad_norm": 0.703125, + "learning_rate": 0.00015769500762479054, + "loss": 0.7941, + "step": 6428 + }, + { + "epoch": 0.3046197583511016, + "grad_norm": 0.1943359375, + "learning_rate": 0.00015768284302103945, + "loss": 0.1327, + "step": 6429 + }, + { + "epoch": 0.304667140488036, + "grad_norm": 0.421875, + "learning_rate": 0.0001576706771379443, + "loss": 0.1174, + "step": 6430 + }, + { + "epoch": 0.3047145226249704, + "grad_norm": 0.6875, + "learning_rate": 0.000157658509975775, + "loss": 1.1515, + "step": 6431 + }, + { + "epoch": 0.3047619047619048, + "grad_norm": 0.05517578125, + "learning_rate": 0.00015764634153480134, + "loss": 0.0068, + "step": 6432 + }, + { + "epoch": 0.3048092868988391, + "grad_norm": 0.68359375, + "learning_rate": 0.0001576341718152932, + "loss": 0.9539, + "step": 6433 + }, + { + "epoch": 0.3048566690357735, + "grad_norm": 0.61328125, + "learning_rate": 0.00015762200081752053, + "loss": 0.8338, + "step": 6434 + }, + { + "epoch": 0.3049040511727079, + "grad_norm": 0.73828125, + "learning_rate": 0.0001576098285417533, + "loss": 0.8848, + "step": 6435 + }, + { + "epoch": 0.30495143330964225, + "grad_norm": 0.53515625, + "learning_rate": 0.00015759765498826137, + "loss": 1.0667, + "step": 6436 + }, + { + "epoch": 0.30499881544657664, + "grad_norm": 0.6875, + "learning_rate": 0.00015758548015731486, + "loss": 0.9937, + "step": 6437 + }, + { + "epoch": 0.30504619758351104, + "grad_norm": 0.5546875, + "learning_rate": 0.00015757330404918372, + "loss": 1.1364, + "step": 6438 + }, + { + "epoch": 0.3050935797204454, + "grad_norm": 0.7265625, + "learning_rate": 0.000157561126664138, + "loss": 1.0133, + "step": 6439 + }, + { + "epoch": 0.30514096185737977, + "grad_norm": 0.6171875, + "learning_rate": 0.0001575489480024478, + "loss": 0.562, + "step": 6440 + }, + { + "epoch": 0.30518834399431416, + "grad_norm": 0.46484375, + "learning_rate": 0.00015753676806438328, + "loss": 0.02, + "step": 6441 + }, + { + "epoch": 0.3052357261312485, + "grad_norm": 0.240234375, + "learning_rate": 0.00015752458685021448, + "loss": 0.138, + "step": 6442 + }, + { + "epoch": 0.3052831082681829, + "grad_norm": 0.578125, + "learning_rate": 0.00015751240436021163, + "loss": 0.7693, + "step": 6443 + }, + { + "epoch": 0.3053304904051173, + "grad_norm": 0.078125, + "learning_rate": 0.00015750022059464493, + "loss": 0.0042, + "step": 6444 + }, + { + "epoch": 0.3053778725420516, + "grad_norm": 0.05078125, + "learning_rate": 0.0001574880355537846, + "loss": 0.003, + "step": 6445 + }, + { + "epoch": 0.305425254678986, + "grad_norm": 0.515625, + "learning_rate": 0.0001574758492379008, + "loss": 0.8774, + "step": 6446 + }, + { + "epoch": 0.3054726368159204, + "grad_norm": 0.69921875, + "learning_rate": 0.00015746366164726396, + "loss": 1.1418, + "step": 6447 + }, + { + "epoch": 0.3055200189528548, + "grad_norm": 0.62890625, + "learning_rate": 0.00015745147278214427, + "loss": 1.227, + "step": 6448 + }, + { + "epoch": 0.30556740108978914, + "grad_norm": 0.1923828125, + "learning_rate": 0.00015743928264281212, + "loss": 0.0743, + "step": 6449 + }, + { + "epoch": 0.30561478322672353, + "grad_norm": 0.55859375, + "learning_rate": 0.00015742709122953786, + "loss": 0.9859, + "step": 6450 + }, + { + "epoch": 0.3056621653636579, + "grad_norm": 0.81640625, + "learning_rate": 0.00015741489854259187, + "loss": 1.0521, + "step": 6451 + }, + { + "epoch": 0.30570954750059226, + "grad_norm": 0.474609375, + "learning_rate": 0.0001574027045822446, + "loss": 0.7161, + "step": 6452 + }, + { + "epoch": 0.30575692963752665, + "grad_norm": 0.61328125, + "learning_rate": 0.00015739050934876652, + "loss": 0.6875, + "step": 6453 + }, + { + "epoch": 0.30580431177446105, + "grad_norm": 0.5234375, + "learning_rate": 0.00015737831284242803, + "loss": 0.7807, + "step": 6454 + }, + { + "epoch": 0.3058516939113954, + "grad_norm": 0.765625, + "learning_rate": 0.0001573661150634997, + "loss": 1.3163, + "step": 6455 + }, + { + "epoch": 0.3058990760483298, + "grad_norm": 0.609375, + "learning_rate": 0.00015735391601225202, + "loss": 0.319, + "step": 6456 + }, + { + "epoch": 0.30594645818526417, + "grad_norm": 0.034912109375, + "learning_rate": 0.00015734171568895558, + "loss": 0.0014, + "step": 6457 + }, + { + "epoch": 0.3059938403221985, + "grad_norm": 0.57421875, + "learning_rate": 0.00015732951409388096, + "loss": 1.143, + "step": 6458 + }, + { + "epoch": 0.3060412224591329, + "grad_norm": 0.59375, + "learning_rate": 0.00015731731122729881, + "loss": 0.2536, + "step": 6459 + }, + { + "epoch": 0.3060886045960673, + "grad_norm": 0.2470703125, + "learning_rate": 0.00015730510708947973, + "loss": 0.1477, + "step": 6460 + }, + { + "epoch": 0.3061359867330017, + "grad_norm": 0.7109375, + "learning_rate": 0.0001572929016806944, + "loss": 1.3176, + "step": 6461 + }, + { + "epoch": 0.306183368869936, + "grad_norm": 0.65234375, + "learning_rate": 0.00015728069500121357, + "loss": 0.9559, + "step": 6462 + }, + { + "epoch": 0.3062307510068704, + "grad_norm": 0.5390625, + "learning_rate": 0.00015726848705130788, + "loss": 0.9945, + "step": 6463 + }, + { + "epoch": 0.3062781331438048, + "grad_norm": 1.1640625, + "learning_rate": 0.0001572562778312482, + "loss": 0.1301, + "step": 6464 + }, + { + "epoch": 0.30632551528073915, + "grad_norm": 0.048095703125, + "learning_rate": 0.00015724406734130524, + "loss": 0.0058, + "step": 6465 + }, + { + "epoch": 0.30637289741767354, + "grad_norm": 0.244140625, + "learning_rate": 0.00015723185558174984, + "loss": 0.1685, + "step": 6466 + }, + { + "epoch": 0.30642027955460793, + "grad_norm": 0.796875, + "learning_rate": 0.00015721964255285283, + "loss": 0.7259, + "step": 6467 + }, + { + "epoch": 0.30646766169154227, + "grad_norm": 0.6328125, + "learning_rate": 0.00015720742825488511, + "loss": 1.2139, + "step": 6468 + }, + { + "epoch": 0.30651504382847666, + "grad_norm": 0.73828125, + "learning_rate": 0.00015719521268811758, + "loss": 0.6002, + "step": 6469 + }, + { + "epoch": 0.30656242596541106, + "grad_norm": 0.53125, + "learning_rate": 0.00015718299585282117, + "loss": 0.5995, + "step": 6470 + }, + { + "epoch": 0.3066098081023454, + "grad_norm": 0.08349609375, + "learning_rate": 0.0001571707777492668, + "loss": 0.0112, + "step": 6471 + }, + { + "epoch": 0.3066571902392798, + "grad_norm": 0.478515625, + "learning_rate": 0.00015715855837772547, + "loss": 0.9161, + "step": 6472 + }, + { + "epoch": 0.3067045723762142, + "grad_norm": 0.6640625, + "learning_rate": 0.00015714633773846818, + "loss": 0.8095, + "step": 6473 + }, + { + "epoch": 0.3067519545131485, + "grad_norm": 0.6953125, + "learning_rate": 0.00015713411583176602, + "loss": 0.1901, + "step": 6474 + }, + { + "epoch": 0.3067993366500829, + "grad_norm": 0.7890625, + "learning_rate": 0.00015712189265789002, + "loss": 1.2767, + "step": 6475 + }, + { + "epoch": 0.3068467187870173, + "grad_norm": 0.66796875, + "learning_rate": 0.00015710966821711129, + "loss": 1.4565, + "step": 6476 + }, + { + "epoch": 0.3068941009239517, + "grad_norm": 0.703125, + "learning_rate": 0.00015709744250970092, + "loss": 1.0918, + "step": 6477 + }, + { + "epoch": 0.30694148306088603, + "grad_norm": 0.69140625, + "learning_rate": 0.00015708521553593012, + "loss": 1.4035, + "step": 6478 + }, + { + "epoch": 0.3069888651978204, + "grad_norm": 0.6875, + "learning_rate": 0.00015707298729607003, + "loss": 0.253, + "step": 6479 + }, + { + "epoch": 0.3070362473347548, + "grad_norm": 0.55859375, + "learning_rate": 0.0001570607577903919, + "loss": 0.7883, + "step": 6480 + }, + { + "epoch": 0.30708362947168916, + "grad_norm": 0.5703125, + "learning_rate": 0.0001570485270191669, + "loss": 0.7736, + "step": 6481 + }, + { + "epoch": 0.30713101160862355, + "grad_norm": 0.07177734375, + "learning_rate": 0.00015703629498266635, + "loss": 0.0134, + "step": 6482 + }, + { + "epoch": 0.30717839374555794, + "grad_norm": 0.001922607421875, + "learning_rate": 0.00015702406168116153, + "loss": 0.0002, + "step": 6483 + }, + { + "epoch": 0.3072257758824923, + "grad_norm": 0.61328125, + "learning_rate": 0.00015701182711492378, + "loss": 1.0878, + "step": 6484 + }, + { + "epoch": 0.3072731580194267, + "grad_norm": 0.2255859375, + "learning_rate": 0.00015699959128422442, + "loss": 0.109, + "step": 6485 + }, + { + "epoch": 0.30732054015636107, + "grad_norm": 0.95703125, + "learning_rate": 0.00015698735418933482, + "loss": 1.3013, + "step": 6486 + }, + { + "epoch": 0.3073679222932954, + "grad_norm": 0.62109375, + "learning_rate": 0.0001569751158305264, + "loss": 1.0957, + "step": 6487 + }, + { + "epoch": 0.3074153044302298, + "grad_norm": 0.7265625, + "learning_rate": 0.00015696287620807064, + "loss": 1.2234, + "step": 6488 + }, + { + "epoch": 0.3074626865671642, + "grad_norm": 0.49609375, + "learning_rate": 0.00015695063532223896, + "loss": 0.9033, + "step": 6489 + }, + { + "epoch": 0.3075100687040986, + "grad_norm": 0.8125, + "learning_rate": 0.0001569383931733028, + "loss": 0.7429, + "step": 6490 + }, + { + "epoch": 0.3075574508410329, + "grad_norm": 0.625, + "learning_rate": 0.00015692614976153374, + "loss": 1.1825, + "step": 6491 + }, + { + "epoch": 0.3076048329779673, + "grad_norm": 0.65234375, + "learning_rate": 0.00015691390508720335, + "loss": 0.678, + "step": 6492 + }, + { + "epoch": 0.3076522151149017, + "grad_norm": 0.71484375, + "learning_rate": 0.00015690165915058314, + "loss": 1.0472, + "step": 6493 + }, + { + "epoch": 0.30769959725183604, + "grad_norm": 0.5234375, + "learning_rate": 0.00015688941195194476, + "loss": 0.2872, + "step": 6494 + }, + { + "epoch": 0.30774697938877044, + "grad_norm": 0.578125, + "learning_rate": 0.0001568771634915598, + "loss": 1.4657, + "step": 6495 + }, + { + "epoch": 0.30779436152570483, + "grad_norm": 0.46484375, + "learning_rate": 0.00015686491376969993, + "loss": 0.021, + "step": 6496 + }, + { + "epoch": 0.30784174366263917, + "grad_norm": 0.6640625, + "learning_rate": 0.00015685266278663685, + "loss": 1.0232, + "step": 6497 + }, + { + "epoch": 0.30788912579957356, + "grad_norm": 0.83984375, + "learning_rate": 0.0001568404105426423, + "loss": 1.1927, + "step": 6498 + }, + { + "epoch": 0.30793650793650795, + "grad_norm": 0.8828125, + "learning_rate": 0.00015682815703798792, + "loss": 0.3838, + "step": 6499 + }, + { + "epoch": 0.3079838900734423, + "grad_norm": 0.265625, + "learning_rate": 0.00015681590227294558, + "loss": 0.0158, + "step": 6500 + }, + { + "epoch": 0.3080312722103767, + "grad_norm": 0.56640625, + "learning_rate": 0.00015680364624778704, + "loss": 1.0193, + "step": 6501 + }, + { + "epoch": 0.3080786543473111, + "grad_norm": 0.6953125, + "learning_rate": 0.00015679138896278417, + "loss": 1.3357, + "step": 6502 + }, + { + "epoch": 0.3081260364842454, + "grad_norm": 0.65234375, + "learning_rate": 0.00015677913041820874, + "loss": 0.8807, + "step": 6503 + }, + { + "epoch": 0.3081734186211798, + "grad_norm": 0.6640625, + "learning_rate": 0.00015676687061433268, + "loss": 1.2676, + "step": 6504 + }, + { + "epoch": 0.3082208007581142, + "grad_norm": 0.54296875, + "learning_rate": 0.0001567546095514279, + "loss": 0.62, + "step": 6505 + }, + { + "epoch": 0.3082681828950486, + "grad_norm": 0.55859375, + "learning_rate": 0.00015674234722976634, + "loss": 1.2959, + "step": 6506 + }, + { + "epoch": 0.30831556503198293, + "grad_norm": 0.6328125, + "learning_rate": 0.00015673008364962, + "loss": 0.6548, + "step": 6507 + }, + { + "epoch": 0.3083629471689173, + "grad_norm": 1.0546875, + "learning_rate": 0.00015671781881126075, + "loss": 0.1654, + "step": 6508 + }, + { + "epoch": 0.3084103293058517, + "grad_norm": 0.18359375, + "learning_rate": 0.00015670555271496075, + "loss": 0.1376, + "step": 6509 + }, + { + "epoch": 0.30845771144278605, + "grad_norm": 0.765625, + "learning_rate": 0.00015669328536099198, + "loss": 0.6347, + "step": 6510 + }, + { + "epoch": 0.30850509357972045, + "grad_norm": 0.62109375, + "learning_rate": 0.00015668101674962647, + "loss": 0.9348, + "step": 6511 + }, + { + "epoch": 0.30855247571665484, + "grad_norm": 0.2412109375, + "learning_rate": 0.00015666874688113644, + "loss": 0.1464, + "step": 6512 + }, + { + "epoch": 0.3085998578535892, + "grad_norm": 0.5, + "learning_rate": 0.00015665647575579397, + "loss": 0.6497, + "step": 6513 + }, + { + "epoch": 0.30864723999052357, + "grad_norm": 0.609375, + "learning_rate": 0.00015664420337387118, + "loss": 0.9218, + "step": 6514 + }, + { + "epoch": 0.30869462212745796, + "grad_norm": 0.60546875, + "learning_rate": 0.00015663192973564032, + "loss": 0.7704, + "step": 6515 + }, + { + "epoch": 0.3087420042643923, + "grad_norm": 0.5546875, + "learning_rate": 0.00015661965484137355, + "loss": 0.7489, + "step": 6516 + }, + { + "epoch": 0.3087893864013267, + "grad_norm": 0.59375, + "learning_rate": 0.00015660737869134322, + "loss": 0.5298, + "step": 6517 + }, + { + "epoch": 0.3088367685382611, + "grad_norm": 0.8046875, + "learning_rate": 0.00015659510128582143, + "loss": 0.77, + "step": 6518 + }, + { + "epoch": 0.3088841506751954, + "grad_norm": 0.640625, + "learning_rate": 0.00015658282262508063, + "loss": 1.2064, + "step": 6519 + }, + { + "epoch": 0.3089315328121298, + "grad_norm": 0.59765625, + "learning_rate": 0.00015657054270939308, + "loss": 0.5916, + "step": 6520 + }, + { + "epoch": 0.3089789149490642, + "grad_norm": 0.036376953125, + "learning_rate": 0.00015655826153903118, + "loss": 0.002, + "step": 6521 + }, + { + "epoch": 0.3090262970859986, + "grad_norm": 0.61328125, + "learning_rate": 0.00015654597911426722, + "loss": 0.9681, + "step": 6522 + }, + { + "epoch": 0.30907367922293294, + "grad_norm": 0.5234375, + "learning_rate": 0.00015653369543537372, + "loss": 0.4623, + "step": 6523 + }, + { + "epoch": 0.30912106135986733, + "grad_norm": 0.66796875, + "learning_rate": 0.00015652141050262307, + "loss": 0.4973, + "step": 6524 + }, + { + "epoch": 0.3091684434968017, + "grad_norm": 1.3515625, + "learning_rate": 0.00015650912431628772, + "loss": 0.2051, + "step": 6525 + }, + { + "epoch": 0.30921582563373606, + "grad_norm": 0.60546875, + "learning_rate": 0.00015649683687664017, + "loss": 1.2065, + "step": 6526 + }, + { + "epoch": 0.30926320777067046, + "grad_norm": 0.380859375, + "learning_rate": 0.00015648454818395298, + "loss": 0.0663, + "step": 6527 + }, + { + "epoch": 0.30931058990760485, + "grad_norm": 0.486328125, + "learning_rate": 0.00015647225823849868, + "loss": 0.8263, + "step": 6528 + }, + { + "epoch": 0.3093579720445392, + "grad_norm": 0.59765625, + "learning_rate": 0.00015645996704054984, + "loss": 1.0052, + "step": 6529 + }, + { + "epoch": 0.3094053541814736, + "grad_norm": 0.431640625, + "learning_rate": 0.00015644767459037907, + "loss": 0.8184, + "step": 6530 + }, + { + "epoch": 0.309452736318408, + "grad_norm": 0.47265625, + "learning_rate": 0.000156435380888259, + "loss": 0.9012, + "step": 6531 + }, + { + "epoch": 0.3095001184553423, + "grad_norm": 0.4609375, + "learning_rate": 0.00015642308593446228, + "loss": 0.3063, + "step": 6532 + }, + { + "epoch": 0.3095475005922767, + "grad_norm": 0.95703125, + "learning_rate": 0.00015641078972926164, + "loss": 0.1363, + "step": 6533 + }, + { + "epoch": 0.3095948827292111, + "grad_norm": 0.99609375, + "learning_rate": 0.00015639849227292975, + "loss": 0.5949, + "step": 6534 + }, + { + "epoch": 0.3096422648661455, + "grad_norm": 0.51171875, + "learning_rate": 0.0001563861935657394, + "loss": 0.7603, + "step": 6535 + }, + { + "epoch": 0.3096896470030798, + "grad_norm": 0.63671875, + "learning_rate": 0.0001563738936079633, + "loss": 0.9269, + "step": 6536 + }, + { + "epoch": 0.3097370291400142, + "grad_norm": 0.6484375, + "learning_rate": 0.0001563615923998743, + "loss": 0.9251, + "step": 6537 + }, + { + "epoch": 0.3097844112769486, + "grad_norm": 0.140625, + "learning_rate": 0.00015634928994174522, + "loss": 0.0117, + "step": 6538 + }, + { + "epoch": 0.30983179341388295, + "grad_norm": 0.70703125, + "learning_rate": 0.0001563369862338489, + "loss": 0.9585, + "step": 6539 + }, + { + "epoch": 0.30987917555081734, + "grad_norm": 0.66796875, + "learning_rate": 0.00015632468127645826, + "loss": 1.1298, + "step": 6540 + }, + { + "epoch": 0.30992655768775174, + "grad_norm": 0.75, + "learning_rate": 0.00015631237506984617, + "loss": 1.4214, + "step": 6541 + }, + { + "epoch": 0.3099739398246861, + "grad_norm": 0.6015625, + "learning_rate": 0.00015630006761428557, + "loss": 1.0516, + "step": 6542 + }, + { + "epoch": 0.31002132196162047, + "grad_norm": 0.67578125, + "learning_rate": 0.00015628775891004946, + "loss": 0.86, + "step": 6543 + }, + { + "epoch": 0.31006870409855486, + "grad_norm": 0.55859375, + "learning_rate": 0.00015627544895741076, + "loss": 0.4261, + "step": 6544 + }, + { + "epoch": 0.3101160862354892, + "grad_norm": 0.65625, + "learning_rate": 0.0001562631377566426, + "loss": 1.2106, + "step": 6545 + }, + { + "epoch": 0.3101634683724236, + "grad_norm": 0.69140625, + "learning_rate": 0.00015625082530801795, + "loss": 1.0549, + "step": 6546 + }, + { + "epoch": 0.310210850509358, + "grad_norm": 0.84375, + "learning_rate": 0.00015623851161180991, + "loss": 0.257, + "step": 6547 + }, + { + "epoch": 0.3102582326462923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001562261966682916, + "loss": 0.5688, + "step": 6548 + }, + { + "epoch": 0.3103056147832267, + "grad_norm": 0.33984375, + "learning_rate": 0.00015621388047773612, + "loss": 0.0285, + "step": 6549 + }, + { + "epoch": 0.3103529969201611, + "grad_norm": 0.53125, + "learning_rate": 0.00015620156304041666, + "loss": 0.621, + "step": 6550 + }, + { + "epoch": 0.3104003790570955, + "grad_norm": 0.68359375, + "learning_rate": 0.0001561892443566064, + "loss": 0.998, + "step": 6551 + }, + { + "epoch": 0.31044776119402984, + "grad_norm": 0.267578125, + "learning_rate": 0.00015617692442657853, + "loss": 0.03, + "step": 6552 + }, + { + "epoch": 0.31049514333096423, + "grad_norm": 0.84765625, + "learning_rate": 0.00015616460325060635, + "loss": 0.7481, + "step": 6553 + }, + { + "epoch": 0.3105425254678986, + "grad_norm": 0.490234375, + "learning_rate": 0.00015615228082896306, + "loss": 0.8258, + "step": 6554 + }, + { + "epoch": 0.31058990760483296, + "grad_norm": 0.5546875, + "learning_rate": 0.00015613995716192198, + "loss": 0.974, + "step": 6555 + }, + { + "epoch": 0.31063728974176735, + "grad_norm": 0.54296875, + "learning_rate": 0.00015612763224975647, + "loss": 0.8936, + "step": 6556 + }, + { + "epoch": 0.31068467187870175, + "grad_norm": 0.031982421875, + "learning_rate": 0.00015611530609273985, + "loss": 0.0016, + "step": 6557 + }, + { + "epoch": 0.3107320540156361, + "grad_norm": 0.66796875, + "learning_rate": 0.00015610297869114552, + "loss": 0.7345, + "step": 6558 + }, + { + "epoch": 0.3107794361525705, + "grad_norm": 0.2890625, + "learning_rate": 0.0001560906500452469, + "loss": 0.0393, + "step": 6559 + }, + { + "epoch": 0.31082681828950487, + "grad_norm": 0.84375, + "learning_rate": 0.00015607832015531736, + "loss": 0.1445, + "step": 6560 + }, + { + "epoch": 0.3108742004264392, + "grad_norm": 1.3046875, + "learning_rate": 0.00015606598902163045, + "loss": 0.6681, + "step": 6561 + }, + { + "epoch": 0.3109215825633736, + "grad_norm": 0.546875, + "learning_rate": 0.0001560536566444596, + "loss": 0.5334, + "step": 6562 + }, + { + "epoch": 0.310968964700308, + "grad_norm": 0.75, + "learning_rate": 0.00015604132302407837, + "loss": 1.5135, + "step": 6563 + }, + { + "epoch": 0.3110163468372424, + "grad_norm": 0.470703125, + "learning_rate": 0.0001560289881607603, + "loss": 0.4288, + "step": 6564 + }, + { + "epoch": 0.3110637289741767, + "grad_norm": 0.546875, + "learning_rate": 0.0001560166520547789, + "loss": 0.4492, + "step": 6565 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 1.0, + "learning_rate": 0.00015600431470640786, + "loss": 1.3553, + "step": 6566 + }, + { + "epoch": 0.3111584932480455, + "grad_norm": 0.5234375, + "learning_rate": 0.0001559919761159208, + "loss": 0.6518, + "step": 6567 + }, + { + "epoch": 0.31120587538497985, + "grad_norm": 0.66015625, + "learning_rate": 0.0001559796362835913, + "loss": 1.0372, + "step": 6568 + }, + { + "epoch": 0.31125325752191424, + "grad_norm": 0.236328125, + "learning_rate": 0.0001559672952096931, + "loss": 0.0377, + "step": 6569 + }, + { + "epoch": 0.31130063965884863, + "grad_norm": 0.7578125, + "learning_rate": 0.00015595495289449994, + "loss": 0.8932, + "step": 6570 + }, + { + "epoch": 0.31134802179578297, + "grad_norm": 0.8359375, + "learning_rate": 0.0001559426093382855, + "loss": 1.586, + "step": 6571 + }, + { + "epoch": 0.31139540393271736, + "grad_norm": 0.71484375, + "learning_rate": 0.00015593026454132359, + "loss": 0.9798, + "step": 6572 + }, + { + "epoch": 0.31144278606965176, + "grad_norm": 0.6015625, + "learning_rate": 0.00015591791850388798, + "loss": 0.7761, + "step": 6573 + }, + { + "epoch": 0.3114901682065861, + "grad_norm": 0.61328125, + "learning_rate": 0.00015590557122625246, + "loss": 0.9733, + "step": 6574 + }, + { + "epoch": 0.3115375503435205, + "grad_norm": 0.640625, + "learning_rate": 0.00015589322270869095, + "loss": 0.6146, + "step": 6575 + }, + { + "epoch": 0.3115849324804549, + "grad_norm": 0.7578125, + "learning_rate": 0.0001558808729514773, + "loss": 1.2899, + "step": 6576 + }, + { + "epoch": 0.3116323146173892, + "grad_norm": 0.66796875, + "learning_rate": 0.0001558685219548854, + "loss": 1.2373, + "step": 6577 + }, + { + "epoch": 0.3116796967543236, + "grad_norm": 0.734375, + "learning_rate": 0.0001558561697191892, + "loss": 1.3932, + "step": 6578 + }, + { + "epoch": 0.311727078891258, + "grad_norm": 0.6484375, + "learning_rate": 0.00015584381624466265, + "loss": 0.5996, + "step": 6579 + }, + { + "epoch": 0.3117744610281924, + "grad_norm": 0.71484375, + "learning_rate": 0.00015583146153157974, + "loss": 0.6589, + "step": 6580 + }, + { + "epoch": 0.31182184316512673, + "grad_norm": 0.69140625, + "learning_rate": 0.00015581910558021446, + "loss": 1.0168, + "step": 6581 + }, + { + "epoch": 0.3118692253020611, + "grad_norm": 0.255859375, + "learning_rate": 0.0001558067483908409, + "loss": 0.1408, + "step": 6582 + }, + { + "epoch": 0.3119166074389955, + "grad_norm": 0.6171875, + "learning_rate": 0.0001557943899637331, + "loss": 0.093, + "step": 6583 + }, + { + "epoch": 0.31196398957592986, + "grad_norm": 0.5703125, + "learning_rate": 0.00015578203029916515, + "loss": 0.722, + "step": 6584 + }, + { + "epoch": 0.31201137171286425, + "grad_norm": 0.625, + "learning_rate": 0.0001557696693974112, + "loss": 0.7425, + "step": 6585 + }, + { + "epoch": 0.31205875384979864, + "grad_norm": 0.671875, + "learning_rate": 0.00015575730725874535, + "loss": 1.0831, + "step": 6586 + }, + { + "epoch": 0.312106135986733, + "grad_norm": 0.671875, + "learning_rate": 0.00015574494388344182, + "loss": 1.0582, + "step": 6587 + }, + { + "epoch": 0.3121535181236674, + "grad_norm": 0.5703125, + "learning_rate": 0.0001557325792717749, + "loss": 1.2879, + "step": 6588 + }, + { + "epoch": 0.31220090026060177, + "grad_norm": 0.25, + "learning_rate": 0.00015572021342401864, + "loss": 0.1396, + "step": 6589 + }, + { + "epoch": 0.3122482823975361, + "grad_norm": 0.875, + "learning_rate": 0.00015570784634044742, + "loss": 0.2146, + "step": 6590 + }, + { + "epoch": 0.3122956645344705, + "grad_norm": 0.126953125, + "learning_rate": 0.0001556954780213355, + "loss": 0.0209, + "step": 6591 + }, + { + "epoch": 0.3123430466714049, + "grad_norm": 1.0, + "learning_rate": 0.00015568310846695722, + "loss": 0.9349, + "step": 6592 + }, + { + "epoch": 0.3123904288083393, + "grad_norm": 0.65625, + "learning_rate": 0.0001556707376775869, + "loss": 1.2064, + "step": 6593 + }, + { + "epoch": 0.3124378109452736, + "grad_norm": 0.73046875, + "learning_rate": 0.00015565836565349889, + "loss": 0.8765, + "step": 6594 + }, + { + "epoch": 0.312485193082208, + "grad_norm": 0.73046875, + "learning_rate": 0.00015564599239496765, + "loss": 1.4647, + "step": 6595 + }, + { + "epoch": 0.3125325752191424, + "grad_norm": 0.6015625, + "learning_rate": 0.0001556336179022676, + "loss": 0.0322, + "step": 6596 + }, + { + "epoch": 0.31257995735607674, + "grad_norm": 0.6328125, + "learning_rate": 0.0001556212421756731, + "loss": 0.7906, + "step": 6597 + }, + { + "epoch": 0.31262733949301114, + "grad_norm": 0.6796875, + "learning_rate": 0.00015560886521545866, + "loss": 1.4391, + "step": 6598 + }, + { + "epoch": 0.31267472162994553, + "grad_norm": 0.5859375, + "learning_rate": 0.00015559648702189892, + "loss": 0.1864, + "step": 6599 + }, + { + "epoch": 0.31272210376687987, + "grad_norm": 0.212890625, + "learning_rate": 0.00015558410759526826, + "loss": 0.0391, + "step": 6600 + }, + { + "epoch": 0.31276948590381426, + "grad_norm": 0.64453125, + "learning_rate": 0.0001555717269358413, + "loss": 0.9505, + "step": 6601 + }, + { + "epoch": 0.31281686804074865, + "grad_norm": 0.4296875, + "learning_rate": 0.00015555934504389262, + "loss": 0.1565, + "step": 6602 + }, + { + "epoch": 0.312864250177683, + "grad_norm": 0.6875, + "learning_rate": 0.0001555469619196969, + "loss": 0.616, + "step": 6603 + }, + { + "epoch": 0.3129116323146174, + "grad_norm": 0.6015625, + "learning_rate": 0.0001555345775635287, + "loss": 1.1719, + "step": 6604 + }, + { + "epoch": 0.3129590144515518, + "grad_norm": 1.0625, + "learning_rate": 0.00015552219197566272, + "loss": 2.274, + "step": 6605 + }, + { + "epoch": 0.3130063965884861, + "grad_norm": 0.5703125, + "learning_rate": 0.00015550980515637367, + "loss": 1.172, + "step": 6606 + }, + { + "epoch": 0.3130537787254205, + "grad_norm": 0.10302734375, + "learning_rate": 0.00015549741710593624, + "loss": 0.0183, + "step": 6607 + }, + { + "epoch": 0.3131011608623549, + "grad_norm": 0.435546875, + "learning_rate": 0.00015548502782462522, + "loss": 0.0319, + "step": 6608 + }, + { + "epoch": 0.3131485429992893, + "grad_norm": 0.8515625, + "learning_rate": 0.0001554726373127154, + "loss": 1.1681, + "step": 6609 + }, + { + "epoch": 0.31319592513622363, + "grad_norm": 0.83984375, + "learning_rate": 0.00015546024557048154, + "loss": 1.3657, + "step": 6610 + }, + { + "epoch": 0.313243307273158, + "grad_norm": 0.6484375, + "learning_rate": 0.00015544785259819855, + "loss": 1.2749, + "step": 6611 + }, + { + "epoch": 0.3132906894100924, + "grad_norm": 0.59765625, + "learning_rate": 0.00015543545839614121, + "loss": 0.6629, + "step": 6612 + }, + { + "epoch": 0.31333807154702675, + "grad_norm": 0.63671875, + "learning_rate": 0.0001554230629645845, + "loss": 1.1962, + "step": 6613 + }, + { + "epoch": 0.31338545368396115, + "grad_norm": 0.240234375, + "learning_rate": 0.00015541066630380326, + "loss": 0.0403, + "step": 6614 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.6171875, + "learning_rate": 0.00015539826841407247, + "loss": 0.1714, + "step": 6615 + }, + { + "epoch": 0.3134802179578299, + "grad_norm": 0.63671875, + "learning_rate": 0.00015538586929566707, + "loss": 0.9104, + "step": 6616 + }, + { + "epoch": 0.31352760009476427, + "grad_norm": 0.6171875, + "learning_rate": 0.0001553734689488621, + "loss": 0.7639, + "step": 6617 + }, + { + "epoch": 0.31357498223169866, + "grad_norm": 0.083984375, + "learning_rate": 0.00015536106737393257, + "loss": 0.0042, + "step": 6618 + }, + { + "epoch": 0.313622364368633, + "grad_norm": 0.4921875, + "learning_rate": 0.00015534866457115354, + "loss": 0.4751, + "step": 6619 + }, + { + "epoch": 0.3136697465055674, + "grad_norm": 0.8984375, + "learning_rate": 0.00015533626054080007, + "loss": 0.7211, + "step": 6620 + }, + { + "epoch": 0.3137171286425018, + "grad_norm": 0.6171875, + "learning_rate": 0.00015532385528314727, + "loss": 1.0287, + "step": 6621 + }, + { + "epoch": 0.3137645107794362, + "grad_norm": 0.75390625, + "learning_rate": 0.00015531144879847033, + "loss": 1.6725, + "step": 6622 + }, + { + "epoch": 0.3138118929163705, + "grad_norm": 0.59765625, + "learning_rate": 0.00015529904108704435, + "loss": 1.1494, + "step": 6623 + }, + { + "epoch": 0.3138592750533049, + "grad_norm": 0.68359375, + "learning_rate": 0.00015528663214914453, + "loss": 0.7874, + "step": 6624 + }, + { + "epoch": 0.3139066571902393, + "grad_norm": 0.59375, + "learning_rate": 0.0001552742219850461, + "loss": 0.9382, + "step": 6625 + }, + { + "epoch": 0.31395403932717364, + "grad_norm": 0.53515625, + "learning_rate": 0.00015526181059502428, + "loss": 0.8744, + "step": 6626 + }, + { + "epoch": 0.31400142146410803, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015524939797935443, + "loss": 0.1217, + "step": 6627 + }, + { + "epoch": 0.3140488036010424, + "grad_norm": 0.1923828125, + "learning_rate": 0.00015523698413831173, + "loss": 0.1361, + "step": 6628 + }, + { + "epoch": 0.31409618573797676, + "grad_norm": 0.71484375, + "learning_rate": 0.00015522456907217155, + "loss": 0.8062, + "step": 6629 + }, + { + "epoch": 0.31414356787491116, + "grad_norm": 0.67578125, + "learning_rate": 0.0001552121527812093, + "loss": 0.9175, + "step": 6630 + }, + { + "epoch": 0.31419095001184555, + "grad_norm": 0.6328125, + "learning_rate": 0.00015519973526570023, + "loss": 0.813, + "step": 6631 + }, + { + "epoch": 0.3142383321487799, + "grad_norm": 0.6640625, + "learning_rate": 0.0001551873165259199, + "loss": 1.2004, + "step": 6632 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 0.412109375, + "learning_rate": 0.00015517489656214365, + "loss": 0.0825, + "step": 6633 + }, + { + "epoch": 0.3143330964226487, + "grad_norm": 0.2890625, + "learning_rate": 0.00015516247537464696, + "loss": 0.1309, + "step": 6634 + }, + { + "epoch": 0.314380478559583, + "grad_norm": 0.74609375, + "learning_rate": 0.00015515005296370534, + "loss": 1.126, + "step": 6635 + }, + { + "epoch": 0.3144278606965174, + "grad_norm": 0.66015625, + "learning_rate": 0.0001551376293295943, + "loss": 0.8408, + "step": 6636 + }, + { + "epoch": 0.3144752428334518, + "grad_norm": 0.6796875, + "learning_rate": 0.00015512520447258932, + "loss": 1.3584, + "step": 6637 + }, + { + "epoch": 0.3145226249703862, + "grad_norm": 1.0859375, + "learning_rate": 0.00015511277839296605, + "loss": 1.3213, + "step": 6638 + }, + { + "epoch": 0.3145700071073205, + "grad_norm": 0.6171875, + "learning_rate": 0.00015510035109100008, + "loss": 0.9437, + "step": 6639 + }, + { + "epoch": 0.3146173892442549, + "grad_norm": 0.310546875, + "learning_rate": 0.00015508792256696698, + "loss": 0.0295, + "step": 6640 + }, + { + "epoch": 0.3146647713811893, + "grad_norm": 0.47265625, + "learning_rate": 0.00015507549282114244, + "loss": 0.3888, + "step": 6641 + }, + { + "epoch": 0.31471215351812365, + "grad_norm": 0.8125, + "learning_rate": 0.00015506306185380213, + "loss": 1.0023, + "step": 6642 + }, + { + "epoch": 0.31475953565505804, + "grad_norm": 0.98828125, + "learning_rate": 0.0001550506296652218, + "loss": 0.4771, + "step": 6643 + }, + { + "epoch": 0.31480691779199244, + "grad_norm": 0.5859375, + "learning_rate": 0.0001550381962556771, + "loss": 0.7989, + "step": 6644 + }, + { + "epoch": 0.3148542999289268, + "grad_norm": 0.37109375, + "learning_rate": 0.00015502576162544385, + "loss": 0.0758, + "step": 6645 + }, + { + "epoch": 0.31490168206586117, + "grad_norm": 0.6953125, + "learning_rate": 0.00015501332577479778, + "loss": 1.0518, + "step": 6646 + }, + { + "epoch": 0.31494906420279556, + "grad_norm": 0.31640625, + "learning_rate": 0.00015500088870401476, + "loss": 0.0091, + "step": 6647 + }, + { + "epoch": 0.3149964463397299, + "grad_norm": 0.703125, + "learning_rate": 0.00015498845041337064, + "loss": 1.0809, + "step": 6648 + }, + { + "epoch": 0.3150438284766643, + "grad_norm": 0.3671875, + "learning_rate": 0.00015497601090314124, + "loss": 0.2188, + "step": 6649 + }, + { + "epoch": 0.3150912106135987, + "grad_norm": 0.765625, + "learning_rate": 0.0001549635701736025, + "loss": 1.1145, + "step": 6650 + }, + { + "epoch": 0.3151385927505331, + "grad_norm": 0.546875, + "learning_rate": 0.00015495112822503027, + "loss": 0.5646, + "step": 6651 + }, + { + "epoch": 0.3151859748874674, + "grad_norm": 0.66015625, + "learning_rate": 0.0001549386850577006, + "loss": 1.0729, + "step": 6652 + }, + { + "epoch": 0.3152333570244018, + "grad_norm": 0.6171875, + "learning_rate": 0.0001549262406718894, + "loss": 1.0544, + "step": 6653 + }, + { + "epoch": 0.3152807391613362, + "grad_norm": 0.546875, + "learning_rate": 0.00015491379506787264, + "loss": 0.781, + "step": 6654 + }, + { + "epoch": 0.31532812129827054, + "grad_norm": 0.5859375, + "learning_rate": 0.00015490134824592644, + "loss": 0.9315, + "step": 6655 + }, + { + "epoch": 0.31537550343520493, + "grad_norm": 0.68359375, + "learning_rate": 0.00015488890020632677, + "loss": 1.2667, + "step": 6656 + }, + { + "epoch": 0.3154228855721393, + "grad_norm": 0.11865234375, + "learning_rate": 0.00015487645094934983, + "loss": 0.0167, + "step": 6657 + }, + { + "epoch": 0.31547026770907366, + "grad_norm": 0.158203125, + "learning_rate": 0.00015486400047527162, + "loss": 0.0131, + "step": 6658 + }, + { + "epoch": 0.31551764984600805, + "grad_norm": 0.55078125, + "learning_rate": 0.00015485154878436833, + "loss": 0.6654, + "step": 6659 + }, + { + "epoch": 0.31556503198294245, + "grad_norm": 0.53515625, + "learning_rate": 0.0001548390958769161, + "loss": 0.4158, + "step": 6660 + }, + { + "epoch": 0.3156124141198768, + "grad_norm": 0.7109375, + "learning_rate": 0.00015482664175319111, + "loss": 1.1609, + "step": 6661 + }, + { + "epoch": 0.3156597962568112, + "grad_norm": 0.62109375, + "learning_rate": 0.00015481418641346965, + "loss": 1.0335, + "step": 6662 + }, + { + "epoch": 0.31570717839374557, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001548017298580279, + "loss": 0.0021, + "step": 6663 + }, + { + "epoch": 0.3157545605306799, + "grad_norm": 0.490234375, + "learning_rate": 0.0001547892720871422, + "loss": 0.7076, + "step": 6664 + }, + { + "epoch": 0.3158019426676143, + "grad_norm": 0.54296875, + "learning_rate": 0.00015477681310108873, + "loss": 0.7821, + "step": 6665 + }, + { + "epoch": 0.3158493248045487, + "grad_norm": 0.11865234375, + "learning_rate": 0.00015476435290014395, + "loss": 0.0123, + "step": 6666 + }, + { + "epoch": 0.3158967069414831, + "grad_norm": 0.146484375, + "learning_rate": 0.0001547518914845841, + "loss": 0.025, + "step": 6667 + }, + { + "epoch": 0.3159440890784174, + "grad_norm": 0.5703125, + "learning_rate": 0.0001547394288546857, + "loss": 1.0853, + "step": 6668 + }, + { + "epoch": 0.3159914712153518, + "grad_norm": 0.31640625, + "learning_rate": 0.000154726965010725, + "loss": 0.0582, + "step": 6669 + }, + { + "epoch": 0.3160388533522862, + "grad_norm": 0.54296875, + "learning_rate": 0.0001547144999529785, + "loss": 0.0365, + "step": 6670 + }, + { + "epoch": 0.31608623548922055, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001547020336817227, + "loss": 0.0178, + "step": 6671 + }, + { + "epoch": 0.31613361762615494, + "grad_norm": 0.75, + "learning_rate": 0.00015468956619723407, + "loss": 0.7368, + "step": 6672 + }, + { + "epoch": 0.31618099976308933, + "grad_norm": 0.201171875, + "learning_rate": 0.0001546770974997891, + "loss": 0.0375, + "step": 6673 + }, + { + "epoch": 0.31622838190002367, + "grad_norm": 0.7109375, + "learning_rate": 0.00015466462758966437, + "loss": 0.7151, + "step": 6674 + }, + { + "epoch": 0.31627576403695806, + "grad_norm": 1.359375, + "learning_rate": 0.0001546521564671364, + "loss": 0.5066, + "step": 6675 + }, + { + "epoch": 0.31632314617389246, + "grad_norm": 0.23046875, + "learning_rate": 0.0001546396841324818, + "loss": 0.011, + "step": 6676 + }, + { + "epoch": 0.3163705283108268, + "grad_norm": 1.5234375, + "learning_rate": 0.0001546272105859772, + "loss": 1.2534, + "step": 6677 + }, + { + "epoch": 0.3164179104477612, + "grad_norm": 0.58984375, + "learning_rate": 0.0001546147358278993, + "loss": 0.9658, + "step": 6678 + }, + { + "epoch": 0.3164652925846956, + "grad_norm": 0.00341796875, + "learning_rate": 0.0001546022598585247, + "loss": 0.0003, + "step": 6679 + }, + { + "epoch": 0.31651267472162997, + "grad_norm": 0.734375, + "learning_rate": 0.00015458978267813016, + "loss": 0.9939, + "step": 6680 + }, + { + "epoch": 0.3165600568585643, + "grad_norm": 0.671875, + "learning_rate": 0.00015457730428699237, + "loss": 0.5775, + "step": 6681 + }, + { + "epoch": 0.3166074389954987, + "grad_norm": 0.5625, + "learning_rate": 0.00015456482468538814, + "loss": 0.9927, + "step": 6682 + }, + { + "epoch": 0.3166548211324331, + "grad_norm": 0.58984375, + "learning_rate": 0.00015455234387359417, + "loss": 0.867, + "step": 6683 + }, + { + "epoch": 0.31670220326936743, + "grad_norm": 0.07470703125, + "learning_rate": 0.0001545398618518873, + "loss": 0.009, + "step": 6684 + }, + { + "epoch": 0.3167495854063018, + "grad_norm": 0.71484375, + "learning_rate": 0.00015452737862054446, + "loss": 1.0679, + "step": 6685 + }, + { + "epoch": 0.3167969675432362, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015451489417984238, + "loss": 0.0163, + "step": 6686 + }, + { + "epoch": 0.31684434968017056, + "grad_norm": 0.703125, + "learning_rate": 0.00015450240853005805, + "loss": 0.973, + "step": 6687 + }, + { + "epoch": 0.31689173181710495, + "grad_norm": 0.68359375, + "learning_rate": 0.00015448992167146834, + "loss": 0.3367, + "step": 6688 + }, + { + "epoch": 0.31693911395403934, + "grad_norm": 0.216796875, + "learning_rate": 0.0001544774336043502, + "loss": 0.1513, + "step": 6689 + }, + { + "epoch": 0.3169864960909737, + "grad_norm": 0.59765625, + "learning_rate": 0.00015446494432898064, + "loss": 1.4223, + "step": 6690 + }, + { + "epoch": 0.3170338782279081, + "grad_norm": 0.50390625, + "learning_rate": 0.0001544524538456366, + "loss": 1.0795, + "step": 6691 + }, + { + "epoch": 0.31708126036484247, + "grad_norm": 0.53515625, + "learning_rate": 0.00015443996215459515, + "loss": 0.4657, + "step": 6692 + }, + { + "epoch": 0.3171286425017768, + "grad_norm": 0.66015625, + "learning_rate": 0.0001544274692561333, + "loss": 0.9374, + "step": 6693 + }, + { + "epoch": 0.3171760246387112, + "grad_norm": 0.734375, + "learning_rate": 0.00015441497515052816, + "loss": 0.9042, + "step": 6694 + }, + { + "epoch": 0.3172234067756456, + "grad_norm": 0.1875, + "learning_rate": 0.00015440247983805686, + "loss": 0.0815, + "step": 6695 + }, + { + "epoch": 0.31727078891258, + "grad_norm": 0.65625, + "learning_rate": 0.0001543899833189965, + "loss": 1.362, + "step": 6696 + }, + { + "epoch": 0.3173181710495143, + "grad_norm": 0.56640625, + "learning_rate": 0.0001543774855936242, + "loss": 0.6509, + "step": 6697 + }, + { + "epoch": 0.3173655531864487, + "grad_norm": 0.30859375, + "learning_rate": 0.00015436498666221725, + "loss": 0.014, + "step": 6698 + }, + { + "epoch": 0.3174129353233831, + "grad_norm": 0.796875, + "learning_rate": 0.00015435248652505276, + "loss": 0.8763, + "step": 6699 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.6015625, + "learning_rate": 0.000154339985182408, + "loss": 0.7114, + "step": 6700 + }, + { + "epoch": 0.31750769959725184, + "grad_norm": 0.0159912109375, + "learning_rate": 0.0001543274826345603, + "loss": 0.001, + "step": 6701 + }, + { + "epoch": 0.31755508173418623, + "grad_norm": 0.228515625, + "learning_rate": 0.00015431497888178687, + "loss": 0.0264, + "step": 6702 + }, + { + "epoch": 0.31760246387112057, + "grad_norm": 0.8984375, + "learning_rate": 0.00015430247392436507, + "loss": 1.4535, + "step": 6703 + }, + { + "epoch": 0.31764984600805496, + "grad_norm": 0.94140625, + "learning_rate": 0.00015428996776257225, + "loss": 0.6312, + "step": 6704 + }, + { + "epoch": 0.31769722814498935, + "grad_norm": 0.2451171875, + "learning_rate": 0.00015427746039668575, + "loss": 0.1916, + "step": 6705 + }, + { + "epoch": 0.3177446102819237, + "grad_norm": 0.75, + "learning_rate": 0.00015426495182698302, + "loss": 0.7129, + "step": 6706 + }, + { + "epoch": 0.3177919924188581, + "grad_norm": 0.369140625, + "learning_rate": 0.00015425244205374144, + "loss": 0.1759, + "step": 6707 + }, + { + "epoch": 0.3178393745557925, + "grad_norm": 0.7890625, + "learning_rate": 0.00015423993107723847, + "loss": 1.465, + "step": 6708 + }, + { + "epoch": 0.31788675669272687, + "grad_norm": 0.65625, + "learning_rate": 0.0001542274188977516, + "loss": 0.9762, + "step": 6709 + }, + { + "epoch": 0.3179341388296612, + "grad_norm": 0.35546875, + "learning_rate": 0.00015421490551555838, + "loss": 0.1516, + "step": 6710 + }, + { + "epoch": 0.3179815209665956, + "grad_norm": 0.79296875, + "learning_rate": 0.00015420239093093623, + "loss": 1.2048, + "step": 6711 + }, + { + "epoch": 0.31802890310353, + "grad_norm": 0.6875, + "learning_rate": 0.00015418987514416286, + "loss": 1.065, + "step": 6712 + }, + { + "epoch": 0.31807628524046433, + "grad_norm": 1.2578125, + "learning_rate": 0.00015417735815551574, + "loss": 0.2488, + "step": 6713 + }, + { + "epoch": 0.3181236673773987, + "grad_norm": 1.1171875, + "learning_rate": 0.00015416483996527256, + "loss": 1.0395, + "step": 6714 + }, + { + "epoch": 0.3181710495143331, + "grad_norm": 0.55078125, + "learning_rate": 0.0001541523205737109, + "loss": 0.411, + "step": 6715 + }, + { + "epoch": 0.31821843165126745, + "grad_norm": 0.60546875, + "learning_rate": 0.00015413979998110845, + "loss": 0.7368, + "step": 6716 + }, + { + "epoch": 0.31826581378820185, + "grad_norm": 0.671875, + "learning_rate": 0.00015412727818774288, + "loss": 0.4258, + "step": 6717 + }, + { + "epoch": 0.31831319592513624, + "grad_norm": 0.58984375, + "learning_rate": 0.00015411475519389197, + "loss": 0.8731, + "step": 6718 + }, + { + "epoch": 0.3183605780620706, + "grad_norm": 0.83203125, + "learning_rate": 0.00015410223099983342, + "loss": 1.1324, + "step": 6719 + }, + { + "epoch": 0.31840796019900497, + "grad_norm": 1.0703125, + "learning_rate": 0.000154089705605845, + "loss": 0.0377, + "step": 6720 + }, + { + "epoch": 0.31845534233593936, + "grad_norm": 0.486328125, + "learning_rate": 0.00015407717901220452, + "loss": 0.0648, + "step": 6721 + }, + { + "epoch": 0.3185027244728737, + "grad_norm": 0.71484375, + "learning_rate": 0.00015406465121918985, + "loss": 1.4768, + "step": 6722 + }, + { + "epoch": 0.3185501066098081, + "grad_norm": 0.498046875, + "learning_rate": 0.00015405212222707875, + "loss": 0.0798, + "step": 6723 + }, + { + "epoch": 0.3185974887467425, + "grad_norm": 0.6640625, + "learning_rate": 0.00015403959203614918, + "loss": 0.9283, + "step": 6724 + }, + { + "epoch": 0.3186448708836769, + "grad_norm": 0.5703125, + "learning_rate": 0.00015402706064667902, + "loss": 1.3497, + "step": 6725 + }, + { + "epoch": 0.3186922530206112, + "grad_norm": 0.453125, + "learning_rate": 0.0001540145280589462, + "loss": 0.6482, + "step": 6726 + }, + { + "epoch": 0.3187396351575456, + "grad_norm": 0.671875, + "learning_rate": 0.00015400199427322868, + "loss": 1.0577, + "step": 6727 + }, + { + "epoch": 0.31878701729448, + "grad_norm": 0.6171875, + "learning_rate": 0.00015398945928980445, + "loss": 1.4132, + "step": 6728 + }, + { + "epoch": 0.31883439943141434, + "grad_norm": 0.515625, + "learning_rate": 0.0001539769231089515, + "loss": 0.7271, + "step": 6729 + }, + { + "epoch": 0.31888178156834873, + "grad_norm": 0.66796875, + "learning_rate": 0.00015396438573094792, + "loss": 0.0687, + "step": 6730 + }, + { + "epoch": 0.3189291637052831, + "grad_norm": 0.314453125, + "learning_rate": 0.00015395184715607174, + "loss": 0.0163, + "step": 6731 + }, + { + "epoch": 0.31897654584221746, + "grad_norm": 0.5390625, + "learning_rate": 0.00015393930738460103, + "loss": 0.6262, + "step": 6732 + }, + { + "epoch": 0.31902392797915186, + "grad_norm": 1.1796875, + "learning_rate": 0.00015392676641681394, + "loss": 0.6423, + "step": 6733 + }, + { + "epoch": 0.31907131011608625, + "grad_norm": 0.5, + "learning_rate": 0.00015391422425298866, + "loss": 0.9327, + "step": 6734 + }, + { + "epoch": 0.3191186922530206, + "grad_norm": 0.734375, + "learning_rate": 0.00015390168089340325, + "loss": 0.3877, + "step": 6735 + }, + { + "epoch": 0.319166074389955, + "grad_norm": 0.70703125, + "learning_rate": 0.00015388913633833598, + "loss": 0.8933, + "step": 6736 + }, + { + "epoch": 0.31921345652688937, + "grad_norm": 0.796875, + "learning_rate": 0.0001538765905880651, + "loss": 0.6484, + "step": 6737 + }, + { + "epoch": 0.31926083866382376, + "grad_norm": 0.59765625, + "learning_rate": 0.00015386404364286884, + "loss": 0.1907, + "step": 6738 + }, + { + "epoch": 0.3193082208007581, + "grad_norm": 0.6640625, + "learning_rate": 0.00015385149550302543, + "loss": 1.6144, + "step": 6739 + }, + { + "epoch": 0.3193556029376925, + "grad_norm": 0.4921875, + "learning_rate": 0.00015383894616881322, + "loss": 0.8437, + "step": 6740 + }, + { + "epoch": 0.3194029850746269, + "grad_norm": 0.70703125, + "learning_rate": 0.0001538263956405105, + "loss": 1.4197, + "step": 6741 + }, + { + "epoch": 0.3194503672115612, + "grad_norm": 0.7890625, + "learning_rate": 0.0001538138439183957, + "loss": 1.0557, + "step": 6742 + }, + { + "epoch": 0.3194977493484956, + "grad_norm": 0.51953125, + "learning_rate": 0.00015380129100274714, + "loss": 0.5517, + "step": 6743 + }, + { + "epoch": 0.31954513148543, + "grad_norm": 0.8046875, + "learning_rate": 0.00015378873689384328, + "loss": 1.4355, + "step": 6744 + }, + { + "epoch": 0.31959251362236435, + "grad_norm": 0.2265625, + "learning_rate": 0.0001537761815919625, + "loss": 0.1077, + "step": 6745 + }, + { + "epoch": 0.31963989575929874, + "grad_norm": 0.98828125, + "learning_rate": 0.00015376362509738332, + "loss": 0.5584, + "step": 6746 + }, + { + "epoch": 0.31968727789623314, + "grad_norm": 0.69140625, + "learning_rate": 0.0001537510674103842, + "loss": 0.9827, + "step": 6747 + }, + { + "epoch": 0.3197346600331675, + "grad_norm": 0.69140625, + "learning_rate": 0.00015373850853124362, + "loss": 0.7614, + "step": 6748 + }, + { + "epoch": 0.31978204217010187, + "grad_norm": 0.48046875, + "learning_rate": 0.00015372594846024018, + "loss": 0.6042, + "step": 6749 + }, + { + "epoch": 0.31982942430703626, + "grad_norm": 0.6953125, + "learning_rate": 0.00015371338719765242, + "loss": 0.6404, + "step": 6750 + }, + { + "epoch": 0.3198768064439706, + "grad_norm": 0.58203125, + "learning_rate": 0.00015370082474375896, + "loss": 0.7974, + "step": 6751 + }, + { + "epoch": 0.319924188580905, + "grad_norm": 0.119140625, + "learning_rate": 0.00015368826109883837, + "loss": 0.0118, + "step": 6752 + }, + { + "epoch": 0.3199715707178394, + "grad_norm": 0.6796875, + "learning_rate": 0.0001536756962631694, + "loss": 0.9106, + "step": 6753 + }, + { + "epoch": 0.3200189528547738, + "grad_norm": 0.376953125, + "learning_rate": 0.00015366313023703061, + "loss": 0.0626, + "step": 6754 + }, + { + "epoch": 0.3200663349917081, + "grad_norm": 0.76953125, + "learning_rate": 0.00015365056302070077, + "loss": 0.9871, + "step": 6755 + }, + { + "epoch": 0.3201137171286425, + "grad_norm": 0.62109375, + "learning_rate": 0.0001536379946144586, + "loss": 0.7532, + "step": 6756 + }, + { + "epoch": 0.3201610992655769, + "grad_norm": 0.056640625, + "learning_rate": 0.0001536254250185828, + "loss": 0.0051, + "step": 6757 + }, + { + "epoch": 0.32020848140251124, + "grad_norm": 0.9921875, + "learning_rate": 0.00015361285423335223, + "loss": 1.1125, + "step": 6758 + }, + { + "epoch": 0.32025586353944563, + "grad_norm": 0.78125, + "learning_rate": 0.00015360028225904567, + "loss": 0.8966, + "step": 6759 + }, + { + "epoch": 0.32030324567638, + "grad_norm": 0.62890625, + "learning_rate": 0.00015358770909594188, + "loss": 0.189, + "step": 6760 + }, + { + "epoch": 0.32035062781331436, + "grad_norm": 0.59765625, + "learning_rate": 0.00015357513474431985, + "loss": 0.7883, + "step": 6761 + }, + { + "epoch": 0.32039800995024875, + "grad_norm": 0.69921875, + "learning_rate": 0.0001535625592044584, + "loss": 1.0491, + "step": 6762 + }, + { + "epoch": 0.32044539208718315, + "grad_norm": 0.58203125, + "learning_rate": 0.0001535499824766364, + "loss": 0.8326, + "step": 6763 + }, + { + "epoch": 0.3204927742241175, + "grad_norm": 0.59375, + "learning_rate": 0.0001535374045611328, + "loss": 1.5672, + "step": 6764 + }, + { + "epoch": 0.3205401563610519, + "grad_norm": 0.6328125, + "learning_rate": 0.00015352482545822666, + "loss": 0.678, + "step": 6765 + }, + { + "epoch": 0.32058753849798627, + "grad_norm": 0.17578125, + "learning_rate": 0.00015351224516819688, + "loss": 0.1243, + "step": 6766 + }, + { + "epoch": 0.32063492063492066, + "grad_norm": 0.9921875, + "learning_rate": 0.00015349966369132247, + "loss": 1.1682, + "step": 6767 + }, + { + "epoch": 0.320682302771855, + "grad_norm": 0.7890625, + "learning_rate": 0.00015348708102788254, + "loss": 1.3193, + "step": 6768 + }, + { + "epoch": 0.3207296849087894, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001534744971781561, + "loss": 0.0012, + "step": 6769 + }, + { + "epoch": 0.3207770670457238, + "grad_norm": 0.64453125, + "learning_rate": 0.0001534619121424223, + "loss": 1.0788, + "step": 6770 + }, + { + "epoch": 0.3208244491826581, + "grad_norm": 0.30078125, + "learning_rate": 0.00015344932592096022, + "loss": 0.1021, + "step": 6771 + }, + { + "epoch": 0.3208718313195925, + "grad_norm": 0.78125, + "learning_rate": 0.00015343673851404903, + "loss": 1.2014, + "step": 6772 + }, + { + "epoch": 0.3209192134565269, + "grad_norm": 0.640625, + "learning_rate": 0.00015342414992196787, + "loss": 0.716, + "step": 6773 + }, + { + "epoch": 0.32096659559346125, + "grad_norm": 0.84375, + "learning_rate": 0.00015341156014499598, + "loss": 1.0394, + "step": 6774 + }, + { + "epoch": 0.32101397773039564, + "grad_norm": 0.59765625, + "learning_rate": 0.00015339896918341257, + "loss": 0.9286, + "step": 6775 + }, + { + "epoch": 0.32106135986733003, + "grad_norm": 0.8046875, + "learning_rate": 0.00015338637703749694, + "loss": 1.0344, + "step": 6776 + }, + { + "epoch": 0.32110874200426437, + "grad_norm": 0.609375, + "learning_rate": 0.00015337378370752831, + "loss": 0.751, + "step": 6777 + }, + { + "epoch": 0.32115612414119876, + "grad_norm": 0.61328125, + "learning_rate": 0.00015336118919378601, + "loss": 1.0953, + "step": 6778 + }, + { + "epoch": 0.32120350627813316, + "grad_norm": 0.6171875, + "learning_rate": 0.00015334859349654936, + "loss": 0.9351, + "step": 6779 + }, + { + "epoch": 0.3212508884150675, + "grad_norm": 0.2353515625, + "learning_rate": 0.00015333599661609775, + "loss": 0.0137, + "step": 6780 + }, + { + "epoch": 0.3212982705520019, + "grad_norm": 0.09912109375, + "learning_rate": 0.00015332339855271052, + "loss": 0.0023, + "step": 6781 + }, + { + "epoch": 0.3213456526889363, + "grad_norm": 0.54296875, + "learning_rate": 0.0001533107993066671, + "loss": 1.0246, + "step": 6782 + }, + { + "epoch": 0.32139303482587067, + "grad_norm": 0.703125, + "learning_rate": 0.00015329819887824697, + "loss": 0.8105, + "step": 6783 + }, + { + "epoch": 0.321440416962805, + "grad_norm": 0.162109375, + "learning_rate": 0.00015328559726772957, + "loss": 0.0319, + "step": 6784 + }, + { + "epoch": 0.3214877990997394, + "grad_norm": 0.193359375, + "learning_rate": 0.00015327299447539435, + "loss": 0.1445, + "step": 6785 + }, + { + "epoch": 0.3215351812366738, + "grad_norm": 0.5703125, + "learning_rate": 0.00015326039050152086, + "loss": 0.6866, + "step": 6786 + }, + { + "epoch": 0.32158256337360813, + "grad_norm": 0.79296875, + "learning_rate": 0.00015324778534638863, + "loss": 1.0162, + "step": 6787 + }, + { + "epoch": 0.3216299455105425, + "grad_norm": 0.91796875, + "learning_rate": 0.00015323517901027727, + "loss": 1.3541, + "step": 6788 + }, + { + "epoch": 0.3216773276474769, + "grad_norm": 0.7578125, + "learning_rate": 0.0001532225714934663, + "loss": 0.7535, + "step": 6789 + }, + { + "epoch": 0.32172470978441126, + "grad_norm": 0.248046875, + "learning_rate": 0.00015320996279623544, + "loss": 0.1705, + "step": 6790 + }, + { + "epoch": 0.32177209192134565, + "grad_norm": 0.765625, + "learning_rate": 0.00015319735291886423, + "loss": 1.1241, + "step": 6791 + }, + { + "epoch": 0.32181947405828004, + "grad_norm": 0.73046875, + "learning_rate": 0.00015318474186163243, + "loss": 1.0389, + "step": 6792 + }, + { + "epoch": 0.3218668561952144, + "grad_norm": 0.7734375, + "learning_rate": 0.00015317212962481967, + "loss": 1.0484, + "step": 6793 + }, + { + "epoch": 0.32191423833214877, + "grad_norm": 0.6328125, + "learning_rate": 0.00015315951620870573, + "loss": 1.4084, + "step": 6794 + }, + { + "epoch": 0.32196162046908317, + "grad_norm": 0.6328125, + "learning_rate": 0.00015314690161357037, + "loss": 0.9202, + "step": 6795 + }, + { + "epoch": 0.32200900260601756, + "grad_norm": 0.74609375, + "learning_rate": 0.0001531342858396933, + "loss": 0.7585, + "step": 6796 + }, + { + "epoch": 0.3220563847429519, + "grad_norm": 0.58984375, + "learning_rate": 0.00015312166888735437, + "loss": 1.2135, + "step": 6797 + }, + { + "epoch": 0.3221037668798863, + "grad_norm": 0.330078125, + "learning_rate": 0.00015310905075683343, + "loss": 0.1361, + "step": 6798 + }, + { + "epoch": 0.3221511490168207, + "grad_norm": 0.68359375, + "learning_rate": 0.00015309643144841034, + "loss": 1.1331, + "step": 6799 + }, + { + "epoch": 0.322198531153755, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001530838109623649, + "loss": 0.0115, + "step": 6800 + }, + { + "epoch": 0.3222459132906894, + "grad_norm": 0.5078125, + "learning_rate": 0.0001530711892989771, + "loss": 0.2296, + "step": 6801 + }, + { + "epoch": 0.3222932954276238, + "grad_norm": 0.56640625, + "learning_rate": 0.00015305856645852686, + "loss": 1.0299, + "step": 6802 + }, + { + "epoch": 0.32234067756455814, + "grad_norm": 0.4296875, + "learning_rate": 0.00015304594244129412, + "loss": 0.1335, + "step": 6803 + }, + { + "epoch": 0.32238805970149254, + "grad_norm": 0.004669189453125, + "learning_rate": 0.00015303331724755885, + "loss": 0.0002, + "step": 6804 + }, + { + "epoch": 0.32243544183842693, + "grad_norm": 0.625, + "learning_rate": 0.0001530206908776011, + "loss": 1.3244, + "step": 6805 + }, + { + "epoch": 0.32248282397536127, + "grad_norm": 0.33984375, + "learning_rate": 0.00015300806333170094, + "loss": 0.1052, + "step": 6806 + }, + { + "epoch": 0.32253020611229566, + "grad_norm": 0.703125, + "learning_rate": 0.00015299543461013836, + "loss": 0.8976, + "step": 6807 + }, + { + "epoch": 0.32257758824923005, + "grad_norm": 0.5703125, + "learning_rate": 0.00015298280471319353, + "loss": 0.1655, + "step": 6808 + }, + { + "epoch": 0.3226249703861644, + "grad_norm": 0.10107421875, + "learning_rate": 0.00015297017364114649, + "loss": 0.0132, + "step": 6809 + }, + { + "epoch": 0.3226723525230988, + "grad_norm": 0.7421875, + "learning_rate": 0.00015295754139427742, + "loss": 0.7851, + "step": 6810 + }, + { + "epoch": 0.3227197346600332, + "grad_norm": 0.6875, + "learning_rate": 0.0001529449079728665, + "loss": 1.1728, + "step": 6811 + }, + { + "epoch": 0.32276711679696757, + "grad_norm": 0.53515625, + "learning_rate": 0.00015293227337719395, + "loss": 0.7953, + "step": 6812 + }, + { + "epoch": 0.3228144989339019, + "grad_norm": 0.443359375, + "learning_rate": 0.00015291963760753988, + "loss": 0.4337, + "step": 6813 + }, + { + "epoch": 0.3228618810708363, + "grad_norm": 0.177734375, + "learning_rate": 0.0001529070006641847, + "loss": 0.0239, + "step": 6814 + }, + { + "epoch": 0.3229092632077707, + "grad_norm": 0.6015625, + "learning_rate": 0.00015289436254740853, + "loss": 1.0518, + "step": 6815 + }, + { + "epoch": 0.32295664534470503, + "grad_norm": 0.7109375, + "learning_rate": 0.00015288172325749177, + "loss": 0.6293, + "step": 6816 + }, + { + "epoch": 0.3230040274816394, + "grad_norm": 0.5390625, + "learning_rate": 0.00015286908279471472, + "loss": 0.7351, + "step": 6817 + }, + { + "epoch": 0.3230514096185738, + "grad_norm": 0.59375, + "learning_rate": 0.00015285644115935773, + "loss": 0.9188, + "step": 6818 + }, + { + "epoch": 0.32309879175550815, + "grad_norm": 0.3671875, + "learning_rate": 0.00015284379835170118, + "loss": 0.0971, + "step": 6819 + }, + { + "epoch": 0.32314617389244255, + "grad_norm": 1.0390625, + "learning_rate": 0.00015283115437202542, + "loss": 0.5676, + "step": 6820 + }, + { + "epoch": 0.32319355602937694, + "grad_norm": 0.57421875, + "learning_rate": 0.00015281850922061095, + "loss": 0.9162, + "step": 6821 + }, + { + "epoch": 0.3232409381663113, + "grad_norm": 0.7109375, + "learning_rate": 0.00015280586289773823, + "loss": 1.3716, + "step": 6822 + }, + { + "epoch": 0.32328832030324567, + "grad_norm": 0.62890625, + "learning_rate": 0.0001527932154036877, + "loss": 0.3771, + "step": 6823 + }, + { + "epoch": 0.32333570244018006, + "grad_norm": 0.62109375, + "learning_rate": 0.0001527805667387399, + "loss": 0.4614, + "step": 6824 + }, + { + "epoch": 0.32338308457711445, + "grad_norm": 0.75, + "learning_rate": 0.00015276791690317534, + "loss": 0.5965, + "step": 6825 + }, + { + "epoch": 0.3234304667140488, + "grad_norm": 0.78125, + "learning_rate": 0.00015275526589727463, + "loss": 1.106, + "step": 6826 + }, + { + "epoch": 0.3234778488509832, + "grad_norm": 0.59375, + "learning_rate": 0.00015274261372131824, + "loss": 1.1501, + "step": 6827 + }, + { + "epoch": 0.3235252309879176, + "grad_norm": 0.2197265625, + "learning_rate": 0.00015272996037558688, + "loss": 0.0227, + "step": 6828 + }, + { + "epoch": 0.3235726131248519, + "grad_norm": 0.53515625, + "learning_rate": 0.00015271730586036118, + "loss": 0.9243, + "step": 6829 + }, + { + "epoch": 0.3236199952617863, + "grad_norm": 0.73828125, + "learning_rate": 0.0001527046501759218, + "loss": 0.1782, + "step": 6830 + }, + { + "epoch": 0.3236673773987207, + "grad_norm": 0.57421875, + "learning_rate": 0.0001526919933225494, + "loss": 0.1422, + "step": 6831 + }, + { + "epoch": 0.32371475953565504, + "grad_norm": 0.72265625, + "learning_rate": 0.0001526793353005247, + "loss": 1.4339, + "step": 6832 + }, + { + "epoch": 0.32376214167258943, + "grad_norm": 0.50390625, + "learning_rate": 0.00015266667611012847, + "loss": 0.9213, + "step": 6833 + }, + { + "epoch": 0.3238095238095238, + "grad_norm": 0.5234375, + "learning_rate": 0.00015265401575164148, + "loss": 0.6695, + "step": 6834 + }, + { + "epoch": 0.32385690594645816, + "grad_norm": 0.51171875, + "learning_rate": 0.00015264135422534447, + "loss": 1.099, + "step": 6835 + }, + { + "epoch": 0.32390428808339256, + "grad_norm": 1.40625, + "learning_rate": 0.0001526286915315183, + "loss": 1.2299, + "step": 6836 + }, + { + "epoch": 0.32395167022032695, + "grad_norm": 0.609375, + "learning_rate": 0.00015261602767044382, + "loss": 0.6742, + "step": 6837 + }, + { + "epoch": 0.3239990523572613, + "grad_norm": 0.65625, + "learning_rate": 0.00015260336264240188, + "loss": 1.0091, + "step": 6838 + }, + { + "epoch": 0.3240464344941957, + "grad_norm": 0.21875, + "learning_rate": 0.0001525906964476734, + "loss": 0.1714, + "step": 6839 + }, + { + "epoch": 0.32409381663113007, + "grad_norm": 0.7578125, + "learning_rate": 0.00015257802908653928, + "loss": 0.1054, + "step": 6840 + }, + { + "epoch": 0.32414119876806446, + "grad_norm": 0.00323486328125, + "learning_rate": 0.0001525653605592805, + "loss": 0.0003, + "step": 6841 + }, + { + "epoch": 0.3241885809049988, + "grad_norm": 0.169921875, + "learning_rate": 0.000152552690866178, + "loss": 0.1259, + "step": 6842 + }, + { + "epoch": 0.3242359630419332, + "grad_norm": 0.0028228759765625, + "learning_rate": 0.0001525400200075128, + "loss": 0.0002, + "step": 6843 + }, + { + "epoch": 0.3242833451788676, + "grad_norm": 0.96484375, + "learning_rate": 0.00015252734798356588, + "loss": 1.4238, + "step": 6844 + }, + { + "epoch": 0.3243307273158019, + "grad_norm": 0.0400390625, + "learning_rate": 0.00015251467479461837, + "loss": 0.0015, + "step": 6845 + }, + { + "epoch": 0.3243781094527363, + "grad_norm": 0.7734375, + "learning_rate": 0.0001525020004409513, + "loss": 0.3023, + "step": 6846 + }, + { + "epoch": 0.3244254915896707, + "grad_norm": 0.66796875, + "learning_rate": 0.0001524893249228458, + "loss": 1.0292, + "step": 6847 + }, + { + "epoch": 0.32447287372660505, + "grad_norm": 0.71875, + "learning_rate": 0.00015247664824058295, + "loss": 0.2902, + "step": 6848 + }, + { + "epoch": 0.32452025586353944, + "grad_norm": 0.765625, + "learning_rate": 0.00015246397039444398, + "loss": 0.9553, + "step": 6849 + }, + { + "epoch": 0.32456763800047383, + "grad_norm": 0.66015625, + "learning_rate": 0.00015245129138471, + "loss": 0.8417, + "step": 6850 + }, + { + "epoch": 0.3246150201374082, + "grad_norm": 0.671875, + "learning_rate": 0.00015243861121166222, + "loss": 0.9548, + "step": 6851 + }, + { + "epoch": 0.32466240227434257, + "grad_norm": 0.5703125, + "learning_rate": 0.00015242592987558192, + "loss": 0.9847, + "step": 6852 + }, + { + "epoch": 0.32470978441127696, + "grad_norm": 0.7890625, + "learning_rate": 0.00015241324737675038, + "loss": 1.4285, + "step": 6853 + }, + { + "epoch": 0.32475716654821135, + "grad_norm": 0.59765625, + "learning_rate": 0.00015240056371544877, + "loss": 1.125, + "step": 6854 + }, + { + "epoch": 0.3248045486851457, + "grad_norm": 0.5546875, + "learning_rate": 0.0001523878788919585, + "loss": 0.7167, + "step": 6855 + }, + { + "epoch": 0.3248519308220801, + "grad_norm": 0.45703125, + "learning_rate": 0.0001523751929065609, + "loss": 0.7513, + "step": 6856 + }, + { + "epoch": 0.3248993129590145, + "grad_norm": 1.0, + "learning_rate": 0.0001523625057595373, + "loss": 1.1208, + "step": 6857 + }, + { + "epoch": 0.3249466950959488, + "grad_norm": 0.71875, + "learning_rate": 0.0001523498174511691, + "loss": 0.7778, + "step": 6858 + }, + { + "epoch": 0.3249940772328832, + "grad_norm": 0.0537109375, + "learning_rate": 0.0001523371279817377, + "loss": 0.0056, + "step": 6859 + }, + { + "epoch": 0.3250414593698176, + "grad_norm": 0.890625, + "learning_rate": 0.00015232443735152456, + "loss": 0.9183, + "step": 6860 + }, + { + "epoch": 0.32508884150675194, + "grad_norm": 0.625, + "learning_rate": 0.00015231174556081109, + "loss": 0.7987, + "step": 6861 + }, + { + "epoch": 0.32513622364368633, + "grad_norm": 0.482421875, + "learning_rate": 0.00015229905260987886, + "loss": 0.1747, + "step": 6862 + }, + { + "epoch": 0.3251836057806207, + "grad_norm": 0.640625, + "learning_rate": 0.00015228635849900935, + "loss": 1.0784, + "step": 6863 + }, + { + "epoch": 0.32523098791755506, + "grad_norm": 0.6015625, + "learning_rate": 0.00015227366322848407, + "loss": 0.0737, + "step": 6864 + }, + { + "epoch": 0.32527837005448945, + "grad_norm": 0.380859375, + "learning_rate": 0.00015226096679858467, + "loss": 0.0539, + "step": 6865 + }, + { + "epoch": 0.32532575219142384, + "grad_norm": 0.74609375, + "learning_rate": 0.00015224826920959264, + "loss": 1.331, + "step": 6866 + }, + { + "epoch": 0.3253731343283582, + "grad_norm": 0.53515625, + "learning_rate": 0.00015223557046178967, + "loss": 0.8021, + "step": 6867 + }, + { + "epoch": 0.3254205164652926, + "grad_norm": 0.6171875, + "learning_rate": 0.0001522228705554574, + "loss": 0.6568, + "step": 6868 + }, + { + "epoch": 0.32546789860222697, + "grad_norm": 0.92578125, + "learning_rate": 0.00015221016949087748, + "loss": 1.5958, + "step": 6869 + }, + { + "epoch": 0.32551528073916136, + "grad_norm": 0.88671875, + "learning_rate": 0.0001521974672683316, + "loss": 1.0054, + "step": 6870 + }, + { + "epoch": 0.3255626628760957, + "grad_norm": 0.19921875, + "learning_rate": 0.00015218476388810151, + "loss": 0.1415, + "step": 6871 + }, + { + "epoch": 0.3256100450130301, + "grad_norm": 0.65234375, + "learning_rate": 0.00015217205935046896, + "loss": 0.9692, + "step": 6872 + }, + { + "epoch": 0.3256574271499645, + "grad_norm": 0.515625, + "learning_rate": 0.00015215935365571568, + "loss": 0.6272, + "step": 6873 + }, + { + "epoch": 0.3257048092868988, + "grad_norm": 0.81640625, + "learning_rate": 0.0001521466468041235, + "loss": 1.0321, + "step": 6874 + }, + { + "epoch": 0.3257521914238332, + "grad_norm": 0.61328125, + "learning_rate": 0.0001521339387959742, + "loss": 1.1456, + "step": 6875 + }, + { + "epoch": 0.3257995735607676, + "grad_norm": 0.97265625, + "learning_rate": 0.00015212122963154974, + "loss": 0.2301, + "step": 6876 + }, + { + "epoch": 0.32584695569770195, + "grad_norm": 0.138671875, + "learning_rate": 0.00015210851931113186, + "loss": 0.0103, + "step": 6877 + }, + { + "epoch": 0.32589433783463634, + "grad_norm": 0.5859375, + "learning_rate": 0.00015209580783500255, + "loss": 1.1929, + "step": 6878 + }, + { + "epoch": 0.32594171997157073, + "grad_norm": 0.6796875, + "learning_rate": 0.00015208309520344372, + "loss": 1.2416, + "step": 6879 + }, + { + "epoch": 0.32598910210850507, + "grad_norm": 0.73828125, + "learning_rate": 0.0001520703814167373, + "loss": 1.2499, + "step": 6880 + }, + { + "epoch": 0.32603648424543946, + "grad_norm": 0.734375, + "learning_rate": 0.0001520576664751653, + "loss": 0.8114, + "step": 6881 + }, + { + "epoch": 0.32608386638237385, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001520449503790097, + "loss": 0.1669, + "step": 6882 + }, + { + "epoch": 0.32613124851930825, + "grad_norm": 0.51171875, + "learning_rate": 0.00015203223312855254, + "loss": 0.5363, + "step": 6883 + }, + { + "epoch": 0.3261786306562426, + "grad_norm": 0.7109375, + "learning_rate": 0.00015201951472407584, + "loss": 1.1362, + "step": 6884 + }, + { + "epoch": 0.326226012793177, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015200679516586173, + "loss": 0.0285, + "step": 6885 + }, + { + "epoch": 0.32627339493011137, + "grad_norm": 0.62109375, + "learning_rate": 0.00015199407445419235, + "loss": 1.0288, + "step": 6886 + }, + { + "epoch": 0.3263207770670457, + "grad_norm": 0.546875, + "learning_rate": 0.00015198135258934968, + "loss": 1.0881, + "step": 6887 + }, + { + "epoch": 0.3263681592039801, + "grad_norm": 0.455078125, + "learning_rate": 0.00015196862957161605, + "loss": 0.0375, + "step": 6888 + }, + { + "epoch": 0.3264155413409145, + "grad_norm": 0.0458984375, + "learning_rate": 0.00015195590540127357, + "loss": 0.0011, + "step": 6889 + }, + { + "epoch": 0.32646292347784883, + "grad_norm": 0.703125, + "learning_rate": 0.00015194318007860443, + "loss": 0.862, + "step": 6890 + }, + { + "epoch": 0.3265103056147832, + "grad_norm": 0.66015625, + "learning_rate": 0.00015193045360389088, + "loss": 1.0593, + "step": 6891 + }, + { + "epoch": 0.3265576877517176, + "grad_norm": 0.390625, + "learning_rate": 0.0001519177259774152, + "loss": 0.0673, + "step": 6892 + }, + { + "epoch": 0.32660506988865196, + "grad_norm": 0.474609375, + "learning_rate": 0.00015190499719945963, + "loss": 0.0542, + "step": 6893 + }, + { + "epoch": 0.32665245202558635, + "grad_norm": 0.64453125, + "learning_rate": 0.00015189226727030652, + "loss": 1.0707, + "step": 6894 + }, + { + "epoch": 0.32669983416252074, + "grad_norm": 0.59375, + "learning_rate": 0.00015187953619023823, + "loss": 1.0912, + "step": 6895 + }, + { + "epoch": 0.3267472162994551, + "grad_norm": 0.0966796875, + "learning_rate": 0.00015186680395953706, + "loss": 0.0081, + "step": 6896 + }, + { + "epoch": 0.32679459843638947, + "grad_norm": 1.2109375, + "learning_rate": 0.0001518540705784854, + "loss": 1.0111, + "step": 6897 + }, + { + "epoch": 0.32684198057332386, + "grad_norm": 0.56640625, + "learning_rate": 0.00015184133604736571, + "loss": 1.0445, + "step": 6898 + }, + { + "epoch": 0.32688936271025826, + "grad_norm": 0.6953125, + "learning_rate": 0.00015182860036646041, + "loss": 0.9122, + "step": 6899 + }, + { + "epoch": 0.3269367448471926, + "grad_norm": 0.7265625, + "learning_rate": 0.00015181586353605196, + "loss": 1.3648, + "step": 6900 + }, + { + "epoch": 0.326984126984127, + "grad_norm": 0.447265625, + "learning_rate": 0.0001518031255564228, + "loss": 0.7088, + "step": 6901 + }, + { + "epoch": 0.3270315091210614, + "grad_norm": 0.65234375, + "learning_rate": 0.00015179038642785557, + "loss": 1.4338, + "step": 6902 + }, + { + "epoch": 0.3270788912579957, + "grad_norm": 0.625, + "learning_rate": 0.0001517776461506327, + "loss": 0.8348, + "step": 6903 + }, + { + "epoch": 0.3271262733949301, + "grad_norm": 0.578125, + "learning_rate": 0.00015176490472503678, + "loss": 0.9255, + "step": 6904 + }, + { + "epoch": 0.3271736555318645, + "grad_norm": 0.349609375, + "learning_rate": 0.00015175216215135043, + "loss": 0.035, + "step": 6905 + }, + { + "epoch": 0.32722103766879884, + "grad_norm": 0.66015625, + "learning_rate": 0.00015173941842985627, + "loss": 1.0767, + "step": 6906 + }, + { + "epoch": 0.32726841980573323, + "grad_norm": 0.54296875, + "learning_rate": 0.00015172667356083686, + "loss": 0.8763, + "step": 6907 + }, + { + "epoch": 0.32731580194266763, + "grad_norm": 0.64453125, + "learning_rate": 0.00015171392754457494, + "loss": 0.6437, + "step": 6908 + }, + { + "epoch": 0.32736318407960197, + "grad_norm": 0.19921875, + "learning_rate": 0.00015170118038135318, + "loss": 0.1531, + "step": 6909 + }, + { + "epoch": 0.32741056621653636, + "grad_norm": 0.458984375, + "learning_rate": 0.00015168843207145435, + "loss": 0.0325, + "step": 6910 + }, + { + "epoch": 0.32745794835347075, + "grad_norm": 0.66015625, + "learning_rate": 0.00015167568261516116, + "loss": 1.0991, + "step": 6911 + }, + { + "epoch": 0.32750533049040514, + "grad_norm": 0.73828125, + "learning_rate": 0.00015166293201275633, + "loss": 0.8243, + "step": 6912 + }, + { + "epoch": 0.3275527126273395, + "grad_norm": 0.83203125, + "learning_rate": 0.0001516501802645227, + "loss": 1.4578, + "step": 6913 + }, + { + "epoch": 0.3276000947642739, + "grad_norm": 0.55859375, + "learning_rate": 0.00015163742737074314, + "loss": 0.8581, + "step": 6914 + }, + { + "epoch": 0.32764747690120827, + "grad_norm": 0.734375, + "learning_rate": 0.0001516246733317004, + "loss": 0.9694, + "step": 6915 + }, + { + "epoch": 0.3276948590381426, + "grad_norm": 0.7421875, + "learning_rate": 0.0001516119181476774, + "loss": 0.5541, + "step": 6916 + }, + { + "epoch": 0.327742241175077, + "grad_norm": 0.73828125, + "learning_rate": 0.000151599161818957, + "loss": 1.2393, + "step": 6917 + }, + { + "epoch": 0.3277896233120114, + "grad_norm": 0.66015625, + "learning_rate": 0.0001515864043458222, + "loss": 1.134, + "step": 6918 + }, + { + "epoch": 0.32783700544894573, + "grad_norm": 0.55078125, + "learning_rate": 0.0001515736457285559, + "loss": 0.6624, + "step": 6919 + }, + { + "epoch": 0.3278843875858801, + "grad_norm": 0.69921875, + "learning_rate": 0.00015156088596744103, + "loss": 0.6075, + "step": 6920 + }, + { + "epoch": 0.3279317697228145, + "grad_norm": 0.6640625, + "learning_rate": 0.00015154812506276065, + "loss": 1.0984, + "step": 6921 + }, + { + "epoch": 0.32797915185974885, + "grad_norm": 0.67578125, + "learning_rate": 0.00015153536301479775, + "loss": 0.9776, + "step": 6922 + }, + { + "epoch": 0.32802653399668324, + "grad_norm": 0.703125, + "learning_rate": 0.00015152259982383542, + "loss": 1.2544, + "step": 6923 + }, + { + "epoch": 0.32807391613361764, + "grad_norm": 0.1396484375, + "learning_rate": 0.00015150983549015666, + "loss": 0.0135, + "step": 6924 + }, + { + "epoch": 0.328121298270552, + "grad_norm": 0.5859375, + "learning_rate": 0.00015149707001404464, + "loss": 1.0113, + "step": 6925 + }, + { + "epoch": 0.32816868040748637, + "grad_norm": 1.125, + "learning_rate": 0.0001514843033957825, + "loss": 1.1702, + "step": 6926 + }, + { + "epoch": 0.32821606254442076, + "grad_norm": 0.11865234375, + "learning_rate": 0.00015147153563565332, + "loss": 0.0092, + "step": 6927 + }, + { + "epoch": 0.32826344468135515, + "grad_norm": 0.10009765625, + "learning_rate": 0.00015145876673394027, + "loss": 0.017, + "step": 6928 + }, + { + "epoch": 0.3283108268182895, + "grad_norm": 0.8828125, + "learning_rate": 0.00015144599669092662, + "loss": 1.0963, + "step": 6929 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.703125, + "learning_rate": 0.00015143322550689554, + "loss": 1.3804, + "step": 6930 + }, + { + "epoch": 0.3284055910921583, + "grad_norm": 0.57421875, + "learning_rate": 0.00015142045318213031, + "loss": 0.2335, + "step": 6931 + }, + { + "epoch": 0.3284529732290926, + "grad_norm": 0.55859375, + "learning_rate": 0.0001514076797169142, + "loss": 0.9878, + "step": 6932 + }, + { + "epoch": 0.328500355366027, + "grad_norm": 0.46484375, + "learning_rate": 0.00015139490511153055, + "loss": 0.0259, + "step": 6933 + }, + { + "epoch": 0.3285477375029614, + "grad_norm": 1.1171875, + "learning_rate": 0.00015138212936626258, + "loss": 0.8695, + "step": 6934 + }, + { + "epoch": 0.32859511963989574, + "grad_norm": 0.859375, + "learning_rate": 0.00015136935248139376, + "loss": 1.6176, + "step": 6935 + }, + { + "epoch": 0.32864250177683013, + "grad_norm": 0.201171875, + "learning_rate": 0.00015135657445720742, + "loss": 0.122, + "step": 6936 + }, + { + "epoch": 0.3286898839137645, + "grad_norm": 0.54296875, + "learning_rate": 0.00015134379529398693, + "loss": 0.8374, + "step": 6937 + }, + { + "epoch": 0.32873726605069886, + "grad_norm": 0.55859375, + "learning_rate": 0.00015133101499201576, + "loss": 0.9414, + "step": 6938 + }, + { + "epoch": 0.32878464818763325, + "grad_norm": 0.5859375, + "learning_rate": 0.00015131823355157736, + "loss": 0.7317, + "step": 6939 + }, + { + "epoch": 0.32883203032456765, + "grad_norm": 0.625, + "learning_rate": 0.00015130545097295518, + "loss": 1.0873, + "step": 6940 + }, + { + "epoch": 0.32887941246150204, + "grad_norm": 0.306640625, + "learning_rate": 0.0001512926672564328, + "loss": 0.0427, + "step": 6941 + }, + { + "epoch": 0.3289267945984364, + "grad_norm": 0.5546875, + "learning_rate": 0.00015127988240229364, + "loss": 0.9747, + "step": 6942 + }, + { + "epoch": 0.32897417673537077, + "grad_norm": 0.77734375, + "learning_rate": 0.00015126709641082132, + "loss": 1.2964, + "step": 6943 + }, + { + "epoch": 0.32902155887230516, + "grad_norm": 0.0018157958984375, + "learning_rate": 0.00015125430928229942, + "loss": 0.0002, + "step": 6944 + }, + { + "epoch": 0.3290689410092395, + "grad_norm": 0.64453125, + "learning_rate": 0.00015124152101701155, + "loss": 1.0356, + "step": 6945 + }, + { + "epoch": 0.3291163231461739, + "grad_norm": 0.73828125, + "learning_rate": 0.00015122873161524126, + "loss": 1.1814, + "step": 6946 + }, + { + "epoch": 0.3291637052831083, + "grad_norm": 0.486328125, + "learning_rate": 0.0001512159410772723, + "loss": 0.0803, + "step": 6947 + }, + { + "epoch": 0.3292110874200426, + "grad_norm": 0.76953125, + "learning_rate": 0.00015120314940338835, + "loss": 0.8302, + "step": 6948 + }, + { + "epoch": 0.329258469556977, + "grad_norm": 0.76953125, + "learning_rate": 0.00015119035659387308, + "loss": 1.1037, + "step": 6949 + }, + { + "epoch": 0.3293058516939114, + "grad_norm": 0.5, + "learning_rate": 0.0001511775626490102, + "loss": 0.5201, + "step": 6950 + }, + { + "epoch": 0.32935323383084575, + "grad_norm": 0.15234375, + "learning_rate": 0.00015116476756908353, + "loss": 0.1235, + "step": 6951 + }, + { + "epoch": 0.32940061596778014, + "grad_norm": 0.0849609375, + "learning_rate": 0.0001511519713543768, + "loss": 0.0051, + "step": 6952 + }, + { + "epoch": 0.32944799810471453, + "grad_norm": 0.08203125, + "learning_rate": 0.0001511391740051738, + "loss": 0.0045, + "step": 6953 + }, + { + "epoch": 0.32949538024164887, + "grad_norm": 0.59375, + "learning_rate": 0.0001511263755217584, + "loss": 0.3869, + "step": 6954 + }, + { + "epoch": 0.32954276237858326, + "grad_norm": 0.71484375, + "learning_rate": 0.00015111357590441444, + "loss": 1.262, + "step": 6955 + }, + { + "epoch": 0.32959014451551766, + "grad_norm": 0.25390625, + "learning_rate": 0.00015110077515342586, + "loss": 0.0431, + "step": 6956 + }, + { + "epoch": 0.32963752665245205, + "grad_norm": 0.7421875, + "learning_rate": 0.0001510879732690765, + "loss": 1.046, + "step": 6957 + }, + { + "epoch": 0.3296849087893864, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001510751702516503, + "loss": 0.0331, + "step": 6958 + }, + { + "epoch": 0.3297322909263208, + "grad_norm": 0.77734375, + "learning_rate": 0.00015106236610143122, + "loss": 0.7115, + "step": 6959 + }, + { + "epoch": 0.3297796730632552, + "grad_norm": 0.60546875, + "learning_rate": 0.00015104956081870324, + "loss": 0.5726, + "step": 6960 + }, + { + "epoch": 0.3298270552001895, + "grad_norm": 0.734375, + "learning_rate": 0.0001510367544037504, + "loss": 0.2715, + "step": 6961 + }, + { + "epoch": 0.3298744373371239, + "grad_norm": 0.62109375, + "learning_rate": 0.00015102394685685667, + "loss": 0.0544, + "step": 6962 + }, + { + "epoch": 0.3299218194740583, + "grad_norm": 0.25390625, + "learning_rate": 0.0001510111381783062, + "loss": 0.0501, + "step": 6963 + }, + { + "epoch": 0.32996920161099264, + "grad_norm": 0.98828125, + "learning_rate": 0.000150998328368383, + "loss": 0.9406, + "step": 6964 + }, + { + "epoch": 0.33001658374792703, + "grad_norm": 0.62109375, + "learning_rate": 0.0001509855174273712, + "loss": 0.8347, + "step": 6965 + }, + { + "epoch": 0.3300639658848614, + "grad_norm": 0.609375, + "learning_rate": 0.00015097270535555487, + "loss": 0.9301, + "step": 6966 + }, + { + "epoch": 0.33011134802179576, + "grad_norm": 0.46484375, + "learning_rate": 0.00015095989215321832, + "loss": 0.7732, + "step": 6967 + }, + { + "epoch": 0.33015873015873015, + "grad_norm": 0.6640625, + "learning_rate": 0.0001509470778206456, + "loss": 1.3104, + "step": 6968 + }, + { + "epoch": 0.33020611229566454, + "grad_norm": 0.26171875, + "learning_rate": 0.00015093426235812096, + "loss": 0.162, + "step": 6969 + }, + { + "epoch": 0.33025349443259894, + "grad_norm": 0.91796875, + "learning_rate": 0.0001509214457659286, + "loss": 1.7801, + "step": 6970 + }, + { + "epoch": 0.3303008765695333, + "grad_norm": 0.6484375, + "learning_rate": 0.00015090862804435288, + "loss": 1.7799, + "step": 6971 + }, + { + "epoch": 0.33034825870646767, + "grad_norm": 0.52734375, + "learning_rate": 0.00015089580919367798, + "loss": 0.642, + "step": 6972 + }, + { + "epoch": 0.33039564084340206, + "grad_norm": 0.80859375, + "learning_rate": 0.00015088298921418826, + "loss": 1.085, + "step": 6973 + }, + { + "epoch": 0.3304430229803364, + "grad_norm": 0.84765625, + "learning_rate": 0.00015087016810616803, + "loss": 1.5359, + "step": 6974 + }, + { + "epoch": 0.3304904051172708, + "grad_norm": 1.421875, + "learning_rate": 0.00015085734586990166, + "loss": 1.1087, + "step": 6975 + }, + { + "epoch": 0.3305377872542052, + "grad_norm": 0.10205078125, + "learning_rate": 0.00015084452250567352, + "loss": 0.0126, + "step": 6976 + }, + { + "epoch": 0.3305851693911395, + "grad_norm": 0.2216796875, + "learning_rate": 0.00015083169801376802, + "loss": 0.0923, + "step": 6977 + }, + { + "epoch": 0.3306325515280739, + "grad_norm": 0.6171875, + "learning_rate": 0.0001508188723944696, + "loss": 1.0601, + "step": 6978 + }, + { + "epoch": 0.3306799336650083, + "grad_norm": 0.578125, + "learning_rate": 0.00015080604564806274, + "loss": 1.2822, + "step": 6979 + }, + { + "epoch": 0.33072731580194265, + "grad_norm": 0.57421875, + "learning_rate": 0.0001507932177748319, + "loss": 0.8857, + "step": 6980 + }, + { + "epoch": 0.33077469793887704, + "grad_norm": 0.60546875, + "learning_rate": 0.00015078038877506159, + "loss": 0.0921, + "step": 6981 + }, + { + "epoch": 0.33082208007581143, + "grad_norm": 0.71484375, + "learning_rate": 0.00015076755864903632, + "loss": 1.1313, + "step": 6982 + }, + { + "epoch": 0.33086946221274577, + "grad_norm": 0.69140625, + "learning_rate": 0.0001507547273970407, + "loss": 1.3182, + "step": 6983 + }, + { + "epoch": 0.33091684434968016, + "grad_norm": 0.39453125, + "learning_rate": 0.0001507418950193593, + "loss": 0.1749, + "step": 6984 + }, + { + "epoch": 0.33096422648661455, + "grad_norm": 0.55078125, + "learning_rate": 0.0001507290615162767, + "loss": 0.5609, + "step": 6985 + }, + { + "epoch": 0.33101160862354895, + "grad_norm": 0.353515625, + "learning_rate": 0.00015071622688807757, + "loss": 0.0124, + "step": 6986 + }, + { + "epoch": 0.3310589907604833, + "grad_norm": 1.3359375, + "learning_rate": 0.0001507033911350465, + "loss": 0.259, + "step": 6987 + }, + { + "epoch": 0.3311063728974177, + "grad_norm": 0.625, + "learning_rate": 0.00015069055425746827, + "loss": 0.8591, + "step": 6988 + }, + { + "epoch": 0.33115375503435207, + "grad_norm": 0.765625, + "learning_rate": 0.00015067771625562753, + "loss": 1.0551, + "step": 6989 + }, + { + "epoch": 0.3312011371712864, + "grad_norm": 0.54296875, + "learning_rate": 0.00015066487712980898, + "loss": 0.8592, + "step": 6990 + }, + { + "epoch": 0.3312485193082208, + "grad_norm": 0.54296875, + "learning_rate": 0.0001506520368802975, + "loss": 1.0344, + "step": 6991 + }, + { + "epoch": 0.3312959014451552, + "grad_norm": 0.2177734375, + "learning_rate": 0.00015063919550737772, + "loss": 0.0456, + "step": 6992 + }, + { + "epoch": 0.33134328358208953, + "grad_norm": 0.60546875, + "learning_rate": 0.00015062635301133455, + "loss": 0.9304, + "step": 6993 + }, + { + "epoch": 0.3313906657190239, + "grad_norm": 0.365234375, + "learning_rate": 0.00015061350939245277, + "loss": 0.0861, + "step": 6994 + }, + { + "epoch": 0.3314380478559583, + "grad_norm": 0.435546875, + "learning_rate": 0.00015060066465101733, + "loss": 0.0486, + "step": 6995 + }, + { + "epoch": 0.33148542999289266, + "grad_norm": 0.5625, + "learning_rate": 0.000150587818787313, + "loss": 0.8601, + "step": 6996 + }, + { + "epoch": 0.33153281212982705, + "grad_norm": 0.00244140625, + "learning_rate": 0.0001505749718016247, + "loss": 0.0002, + "step": 6997 + }, + { + "epoch": 0.33158019426676144, + "grad_norm": 0.66796875, + "learning_rate": 0.00015056212369423747, + "loss": 1.1144, + "step": 6998 + }, + { + "epoch": 0.33162757640369583, + "grad_norm": 0.5078125, + "learning_rate": 0.00015054927446543615, + "loss": 0.8029, + "step": 6999 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 0.11376953125, + "learning_rate": 0.00015053642411550576, + "loss": 0.0089, + "step": 7000 + }, + { + "epoch": 0.33172234067756456, + "grad_norm": 0.96875, + "learning_rate": 0.0001505235726447313, + "loss": 0.497, + "step": 7001 + }, + { + "epoch": 0.33176972281449896, + "grad_norm": 0.953125, + "learning_rate": 0.00015051072005339787, + "loss": 1.1711, + "step": 7002 + }, + { + "epoch": 0.3318171049514333, + "grad_norm": 0.57421875, + "learning_rate": 0.0001504978663417904, + "loss": 0.8463, + "step": 7003 + }, + { + "epoch": 0.3318644870883677, + "grad_norm": 0.6484375, + "learning_rate": 0.00015048501151019412, + "loss": 0.9828, + "step": 7004 + }, + { + "epoch": 0.3319118692253021, + "grad_norm": 0.69140625, + "learning_rate": 0.000150472155558894, + "loss": 1.1101, + "step": 7005 + }, + { + "epoch": 0.3319592513622364, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015045929848817526, + "loss": 0.008, + "step": 7006 + }, + { + "epoch": 0.3320066334991708, + "grad_norm": 0.703125, + "learning_rate": 0.000150446440298323, + "loss": 1.2718, + "step": 7007 + }, + { + "epoch": 0.3320540156361052, + "grad_norm": 0.34765625, + "learning_rate": 0.00015043358098962246, + "loss": 0.179, + "step": 7008 + }, + { + "epoch": 0.33210139777303954, + "grad_norm": 0.61328125, + "learning_rate": 0.00015042072056235876, + "loss": 1.006, + "step": 7009 + }, + { + "epoch": 0.33214877990997393, + "grad_norm": 0.05517578125, + "learning_rate": 0.00015040785901681723, + "loss": 0.0034, + "step": 7010 + }, + { + "epoch": 0.3321961620469083, + "grad_norm": 0.486328125, + "learning_rate": 0.00015039499635328306, + "loss": 0.2602, + "step": 7011 + }, + { + "epoch": 0.33224354418384267, + "grad_norm": 0.01025390625, + "learning_rate": 0.0001503821325720416, + "loss": 0.0003, + "step": 7012 + }, + { + "epoch": 0.33229092632077706, + "grad_norm": 0.251953125, + "learning_rate": 0.00015036926767337803, + "loss": 0.096, + "step": 7013 + }, + { + "epoch": 0.33233830845771145, + "grad_norm": 0.63671875, + "learning_rate": 0.0001503564016575778, + "loss": 0.8911, + "step": 7014 + }, + { + "epoch": 0.33238569059464584, + "grad_norm": 0.62109375, + "learning_rate": 0.0001503435345249262, + "loss": 0.9765, + "step": 7015 + }, + { + "epoch": 0.3324330727315802, + "grad_norm": 0.7578125, + "learning_rate": 0.00015033066627570863, + "loss": 0.9499, + "step": 7016 + }, + { + "epoch": 0.3324804548685146, + "grad_norm": 0.61328125, + "learning_rate": 0.00015031779691021047, + "loss": 0.7292, + "step": 7017 + }, + { + "epoch": 0.33252783700544897, + "grad_norm": 0.66015625, + "learning_rate": 0.00015030492642871722, + "loss": 1.1517, + "step": 7018 + }, + { + "epoch": 0.3325752191423833, + "grad_norm": 0.9765625, + "learning_rate": 0.00015029205483151425, + "loss": 1.2147, + "step": 7019 + }, + { + "epoch": 0.3326226012793177, + "grad_norm": 0.58984375, + "learning_rate": 0.0001502791821188871, + "loss": 0.977, + "step": 7020 + }, + { + "epoch": 0.3326699834162521, + "grad_norm": 0.78125, + "learning_rate": 0.00015026630829112123, + "loss": 1.0967, + "step": 7021 + }, + { + "epoch": 0.33271736555318643, + "grad_norm": 0.5625, + "learning_rate": 0.00015025343334850217, + "loss": 0.9472, + "step": 7022 + }, + { + "epoch": 0.3327647476901208, + "grad_norm": 0.58203125, + "learning_rate": 0.00015024055729131547, + "loss": 0.5209, + "step": 7023 + }, + { + "epoch": 0.3328121298270552, + "grad_norm": 0.6171875, + "learning_rate": 0.00015022768011984676, + "loss": 1.2798, + "step": 7024 + }, + { + "epoch": 0.33285951196398955, + "grad_norm": 0.125, + "learning_rate": 0.0001502148018343816, + "loss": 0.0117, + "step": 7025 + }, + { + "epoch": 0.33290689410092394, + "grad_norm": 0.79296875, + "learning_rate": 0.00015020192243520562, + "loss": 0.8227, + "step": 7026 + }, + { + "epoch": 0.33295427623785834, + "grad_norm": 0.173828125, + "learning_rate": 0.00015018904192260445, + "loss": 0.1, + "step": 7027 + }, + { + "epoch": 0.33300165837479273, + "grad_norm": 0.890625, + "learning_rate": 0.00015017616029686383, + "loss": 0.7774, + "step": 7028 + }, + { + "epoch": 0.33304904051172707, + "grad_norm": 0.515625, + "learning_rate": 0.0001501632775582694, + "loss": 1.027, + "step": 7029 + }, + { + "epoch": 0.33309642264866146, + "grad_norm": 0.80078125, + "learning_rate": 0.00015015039370710692, + "loss": 1.0763, + "step": 7030 + }, + { + "epoch": 0.33314380478559585, + "grad_norm": 0.70703125, + "learning_rate": 0.00015013750874366208, + "loss": 0.3201, + "step": 7031 + }, + { + "epoch": 0.3331911869225302, + "grad_norm": 0.49609375, + "learning_rate": 0.00015012462266822074, + "loss": 0.0487, + "step": 7032 + }, + { + "epoch": 0.3332385690594646, + "grad_norm": 0.271484375, + "learning_rate": 0.00015011173548106865, + "loss": 0.0261, + "step": 7033 + }, + { + "epoch": 0.333285951196399, + "grad_norm": 0.484375, + "learning_rate": 0.00015009884718249162, + "loss": 0.4067, + "step": 7034 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.408203125, + "learning_rate": 0.00015008595777277556, + "loss": 0.0487, + "step": 7035 + }, + { + "epoch": 0.3333807154702677, + "grad_norm": 0.294921875, + "learning_rate": 0.0001500730672522063, + "loss": 0.0181, + "step": 7036 + }, + { + "epoch": 0.3334280976072021, + "grad_norm": 0.71484375, + "learning_rate": 0.00015006017562106973, + "loss": 0.772, + "step": 7037 + }, + { + "epoch": 0.33347547974413644, + "grad_norm": 0.5234375, + "learning_rate": 0.0001500472828796518, + "loss": 0.8465, + "step": 7038 + }, + { + "epoch": 0.33352286188107083, + "grad_norm": 0.7109375, + "learning_rate": 0.0001500343890282384, + "loss": 0.7087, + "step": 7039 + }, + { + "epoch": 0.3335702440180052, + "grad_norm": 0.2001953125, + "learning_rate": 0.00015002149406711558, + "loss": 0.0318, + "step": 7040 + }, + { + "epoch": 0.33361762615493956, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001500085979965693, + "loss": 0.1506, + "step": 7041 + }, + { + "epoch": 0.33366500829187395, + "grad_norm": 0.76171875, + "learning_rate": 0.00014999570081688558, + "loss": 1.1756, + "step": 7042 + }, + { + "epoch": 0.33371239042880835, + "grad_norm": 0.65234375, + "learning_rate": 0.0001499828025283505, + "loss": 1.231, + "step": 7043 + }, + { + "epoch": 0.33375977256574274, + "grad_norm": 0.396484375, + "learning_rate": 0.00014996990313125008, + "loss": 0.0667, + "step": 7044 + }, + { + "epoch": 0.3338071547026771, + "grad_norm": 0.6171875, + "learning_rate": 0.00014995700262587043, + "loss": 0.8659, + "step": 7045 + }, + { + "epoch": 0.33385453683961147, + "grad_norm": 0.71484375, + "learning_rate": 0.00014994410101249766, + "loss": 0.8055, + "step": 7046 + }, + { + "epoch": 0.33390191897654586, + "grad_norm": 0.62109375, + "learning_rate": 0.00014993119829141794, + "loss": 0.7001, + "step": 7047 + }, + { + "epoch": 0.3339493011134802, + "grad_norm": 0.5390625, + "learning_rate": 0.00014991829446291743, + "loss": 0.0552, + "step": 7048 + }, + { + "epoch": 0.3339966832504146, + "grad_norm": 0.8359375, + "learning_rate": 0.0001499053895272823, + "loss": 0.6531, + "step": 7049 + }, + { + "epoch": 0.334044065387349, + "grad_norm": 0.51171875, + "learning_rate": 0.0001498924834847988, + "loss": 0.1737, + "step": 7050 + }, + { + "epoch": 0.3340914475242833, + "grad_norm": 0.7109375, + "learning_rate": 0.00014987957633575316, + "loss": 0.5999, + "step": 7051 + }, + { + "epoch": 0.3341388296612177, + "grad_norm": 0.765625, + "learning_rate": 0.00014986666808043165, + "loss": 0.149, + "step": 7052 + }, + { + "epoch": 0.3341862117981521, + "grad_norm": 0.72265625, + "learning_rate": 0.00014985375871912055, + "loss": 0.7358, + "step": 7053 + }, + { + "epoch": 0.33423359393508645, + "grad_norm": 0.515625, + "learning_rate": 0.00014984084825210619, + "loss": 0.6134, + "step": 7054 + }, + { + "epoch": 0.33428097607202084, + "grad_norm": 0.71484375, + "learning_rate": 0.0001498279366796749, + "loss": 0.8201, + "step": 7055 + }, + { + "epoch": 0.33432835820895523, + "grad_norm": 0.1328125, + "learning_rate": 0.00014981502400211304, + "loss": 0.0103, + "step": 7056 + }, + { + "epoch": 0.3343757403458896, + "grad_norm": 0.69140625, + "learning_rate": 0.00014980211021970702, + "loss": 1.4085, + "step": 7057 + }, + { + "epoch": 0.33442312248282396, + "grad_norm": 0.734375, + "learning_rate": 0.00014978919533274322, + "loss": 1.0654, + "step": 7058 + }, + { + "epoch": 0.33447050461975836, + "grad_norm": 0.8203125, + "learning_rate": 0.00014977627934150816, + "loss": 0.9682, + "step": 7059 + }, + { + "epoch": 0.33451788675669275, + "grad_norm": 0.7109375, + "learning_rate": 0.00014976336224628822, + "loss": 0.8443, + "step": 7060 + }, + { + "epoch": 0.3345652688936271, + "grad_norm": 0.400390625, + "learning_rate": 0.00014975044404736987, + "loss": 0.4145, + "step": 7061 + }, + { + "epoch": 0.3346126510305615, + "grad_norm": 1.1640625, + "learning_rate": 0.00014973752474503968, + "loss": 1.3228, + "step": 7062 + }, + { + "epoch": 0.3346600331674959, + "grad_norm": 0.6640625, + "learning_rate": 0.00014972460433958419, + "loss": 0.8524, + "step": 7063 + }, + { + "epoch": 0.3347074153044302, + "grad_norm": 0.703125, + "learning_rate": 0.00014971168283128993, + "loss": 0.8595, + "step": 7064 + }, + { + "epoch": 0.3347547974413646, + "grad_norm": 0.7890625, + "learning_rate": 0.00014969876022044346, + "loss": 0.6905, + "step": 7065 + }, + { + "epoch": 0.334802179578299, + "grad_norm": 0.48828125, + "learning_rate": 0.0001496858365073315, + "loss": 0.9029, + "step": 7066 + }, + { + "epoch": 0.33484956171523333, + "grad_norm": 0.8046875, + "learning_rate": 0.00014967291169224058, + "loss": 0.9739, + "step": 7067 + }, + { + "epoch": 0.33489694385216773, + "grad_norm": 0.06689453125, + "learning_rate": 0.00014965998577545734, + "loss": 0.0078, + "step": 7068 + }, + { + "epoch": 0.3349443259891021, + "grad_norm": 0.443359375, + "learning_rate": 0.00014964705875726857, + "loss": 0.0475, + "step": 7069 + }, + { + "epoch": 0.33499170812603646, + "grad_norm": 0.06396484375, + "learning_rate": 0.00014963413063796087, + "loss": 0.0081, + "step": 7070 + }, + { + "epoch": 0.33503909026297085, + "grad_norm": 0.859375, + "learning_rate": 0.00014962120141782104, + "loss": 0.3946, + "step": 7071 + }, + { + "epoch": 0.33508647239990524, + "grad_norm": 0.54296875, + "learning_rate": 0.0001496082710971358, + "loss": 1.0272, + "step": 7072 + }, + { + "epoch": 0.33513385453683964, + "grad_norm": 0.08154296875, + "learning_rate": 0.000149595339676192, + "loss": 0.0063, + "step": 7073 + }, + { + "epoch": 0.335181236673774, + "grad_norm": 0.68359375, + "learning_rate": 0.00014958240715527636, + "loss": 1.2715, + "step": 7074 + }, + { + "epoch": 0.33522861881070837, + "grad_norm": 0.0517578125, + "learning_rate": 0.00014956947353467578, + "loss": 0.0058, + "step": 7075 + }, + { + "epoch": 0.33527600094764276, + "grad_norm": 0.79296875, + "learning_rate": 0.00014955653881467703, + "loss": 1.415, + "step": 7076 + }, + { + "epoch": 0.3353233830845771, + "grad_norm": 0.0634765625, + "learning_rate": 0.00014954360299556703, + "loss": 0.0029, + "step": 7077 + }, + { + "epoch": 0.3353707652215115, + "grad_norm": 0.455078125, + "learning_rate": 0.00014953066607763268, + "loss": 0.0616, + "step": 7078 + }, + { + "epoch": 0.3354181473584459, + "grad_norm": 0.86328125, + "learning_rate": 0.00014951772806116095, + "loss": 1.2487, + "step": 7079 + }, + { + "epoch": 0.3354655294953802, + "grad_norm": 0.5703125, + "learning_rate": 0.00014950478894643873, + "loss": 1.2876, + "step": 7080 + }, + { + "epoch": 0.3355129116323146, + "grad_norm": 0.6015625, + "learning_rate": 0.000149491848733753, + "loss": 0.9256, + "step": 7081 + }, + { + "epoch": 0.335560293769249, + "grad_norm": 0.5, + "learning_rate": 0.00014947890742339086, + "loss": 0.678, + "step": 7082 + }, + { + "epoch": 0.33560767590618334, + "grad_norm": 0.64453125, + "learning_rate": 0.0001494659650156392, + "loss": 1.1603, + "step": 7083 + }, + { + "epoch": 0.33565505804311774, + "grad_norm": 0.60546875, + "learning_rate": 0.00014945302151078512, + "loss": 0.7622, + "step": 7084 + }, + { + "epoch": 0.33570244018005213, + "grad_norm": 0.6953125, + "learning_rate": 0.00014944007690911572, + "loss": 0.8894, + "step": 7085 + }, + { + "epoch": 0.3357498223169865, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001494271312109181, + "loss": 0.001, + "step": 7086 + }, + { + "epoch": 0.33579720445392086, + "grad_norm": 0.859375, + "learning_rate": 0.0001494141844164793, + "loss": 1.2383, + "step": 7087 + }, + { + "epoch": 0.33584458659085525, + "grad_norm": 0.73046875, + "learning_rate": 0.00014940123652608652, + "loss": 0.1373, + "step": 7088 + }, + { + "epoch": 0.33589196872778965, + "grad_norm": 0.59375, + "learning_rate": 0.00014938828754002697, + "loss": 0.508, + "step": 7089 + }, + { + "epoch": 0.335939350864724, + "grad_norm": 0.68359375, + "learning_rate": 0.00014937533745858783, + "loss": 1.205, + "step": 7090 + }, + { + "epoch": 0.3359867330016584, + "grad_norm": 0.625, + "learning_rate": 0.00014936238628205625, + "loss": 1.1114, + "step": 7091 + }, + { + "epoch": 0.33603411513859277, + "grad_norm": 0.6640625, + "learning_rate": 0.00014934943401071954, + "loss": 0.8703, + "step": 7092 + }, + { + "epoch": 0.3360814972755271, + "grad_norm": 0.18359375, + "learning_rate": 0.00014933648064486494, + "loss": 0.1389, + "step": 7093 + }, + { + "epoch": 0.3361288794124615, + "grad_norm": 0.88671875, + "learning_rate": 0.00014932352618477976, + "loss": 1.1597, + "step": 7094 + }, + { + "epoch": 0.3361762615493959, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001493105706307513, + "loss": 0.1503, + "step": 7095 + }, + { + "epoch": 0.33622364368633023, + "grad_norm": 0.9296875, + "learning_rate": 0.0001492976139830669, + "loss": 0.5618, + "step": 7096 + }, + { + "epoch": 0.3362710258232646, + "grad_norm": 0.322265625, + "learning_rate": 0.00014928465624201395, + "loss": 0.1363, + "step": 7097 + }, + { + "epoch": 0.336318407960199, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014927169740787982, + "loss": 0.0229, + "step": 7098 + }, + { + "epoch": 0.33636579009713335, + "grad_norm": 0.6953125, + "learning_rate": 0.00014925873748095192, + "loss": 1.0774, + "step": 7099 + }, + { + "epoch": 0.33641317223406775, + "grad_norm": 1.1484375, + "learning_rate": 0.00014924577646151767, + "loss": 0.7773, + "step": 7100 + }, + { + "epoch": 0.33646055437100214, + "grad_norm": 0.578125, + "learning_rate": 0.00014923281434986457, + "loss": 0.811, + "step": 7101 + }, + { + "epoch": 0.33650793650793653, + "grad_norm": 0.047607421875, + "learning_rate": 0.0001492198511462801, + "loss": 0.0017, + "step": 7102 + }, + { + "epoch": 0.33655531864487087, + "grad_norm": 0.09619140625, + "learning_rate": 0.00014920688685105172, + "loss": 0.0042, + "step": 7103 + }, + { + "epoch": 0.33660270078180526, + "grad_norm": 0.5625, + "learning_rate": 0.00014919392146446703, + "loss": 1.039, + "step": 7104 + }, + { + "epoch": 0.33665008291873966, + "grad_norm": 0.65234375, + "learning_rate": 0.00014918095498681356, + "loss": 1.2575, + "step": 7105 + }, + { + "epoch": 0.336697465055674, + "grad_norm": 0.5703125, + "learning_rate": 0.0001491679874183789, + "loss": 0.6664, + "step": 7106 + }, + { + "epoch": 0.3367448471926084, + "grad_norm": 0.73828125, + "learning_rate": 0.00014915501875945064, + "loss": 0.9582, + "step": 7107 + }, + { + "epoch": 0.3367922293295428, + "grad_norm": 0.365234375, + "learning_rate": 0.0001491420490103164, + "loss": 0.6622, + "step": 7108 + }, + { + "epoch": 0.3368396114664771, + "grad_norm": 0.68359375, + "learning_rate": 0.0001491290781712639, + "loss": 1.2484, + "step": 7109 + }, + { + "epoch": 0.3368869936034115, + "grad_norm": 0.57421875, + "learning_rate": 0.00014911610624258076, + "loss": 1.2539, + "step": 7110 + }, + { + "epoch": 0.3369343757403459, + "grad_norm": 0.51171875, + "learning_rate": 0.00014910313322455466, + "loss": 0.5703, + "step": 7111 + }, + { + "epoch": 0.33698175787728024, + "grad_norm": 0.11376953125, + "learning_rate": 0.00014909015911747343, + "loss": 0.0237, + "step": 7112 + }, + { + "epoch": 0.33702914001421463, + "grad_norm": 0.6171875, + "learning_rate": 0.00014907718392162474, + "loss": 1.2957, + "step": 7113 + }, + { + "epoch": 0.337076522151149, + "grad_norm": 0.58203125, + "learning_rate": 0.00014906420763729638, + "loss": 0.5867, + "step": 7114 + }, + { + "epoch": 0.3371239042880834, + "grad_norm": 0.341796875, + "learning_rate": 0.00014905123026477614, + "loss": 0.0653, + "step": 7115 + }, + { + "epoch": 0.33717128642501776, + "grad_norm": 0.6640625, + "learning_rate": 0.00014903825180435186, + "loss": 1.0758, + "step": 7116 + }, + { + "epoch": 0.33721866856195215, + "grad_norm": 0.59765625, + "learning_rate": 0.0001490252722563114, + "loss": 0.4744, + "step": 7117 + }, + { + "epoch": 0.33726605069888654, + "grad_norm": 0.376953125, + "learning_rate": 0.00014901229162094263, + "loss": 0.1012, + "step": 7118 + }, + { + "epoch": 0.3373134328358209, + "grad_norm": 0.79296875, + "learning_rate": 0.00014899930989853343, + "loss": 0.8182, + "step": 7119 + }, + { + "epoch": 0.3373608149727553, + "grad_norm": 0.6953125, + "learning_rate": 0.00014898632708937171, + "loss": 1.108, + "step": 7120 + }, + { + "epoch": 0.33740819710968967, + "grad_norm": 0.5234375, + "learning_rate": 0.00014897334319374545, + "loss": 0.9846, + "step": 7121 + }, + { + "epoch": 0.337455579246624, + "grad_norm": 0.26171875, + "learning_rate": 0.00014896035821194262, + "loss": 0.0365, + "step": 7122 + }, + { + "epoch": 0.3375029613835584, + "grad_norm": 0.69921875, + "learning_rate": 0.00014894737214425117, + "loss": 0.9856, + "step": 7123 + }, + { + "epoch": 0.3375503435204928, + "grad_norm": 0.52734375, + "learning_rate": 0.00014893438499095915, + "loss": 0.0867, + "step": 7124 + }, + { + "epoch": 0.33759772565742713, + "grad_norm": 0.52734375, + "learning_rate": 0.00014892139675235462, + "loss": 0.9966, + "step": 7125 + }, + { + "epoch": 0.3376451077943615, + "grad_norm": 0.06982421875, + "learning_rate": 0.00014890840742872555, + "loss": 0.0097, + "step": 7126 + }, + { + "epoch": 0.3376924899312959, + "grad_norm": 0.69921875, + "learning_rate": 0.00014889541702036013, + "loss": 1.1499, + "step": 7127 + }, + { + "epoch": 0.33773987206823025, + "grad_norm": 0.62890625, + "learning_rate": 0.00014888242552754647, + "loss": 0.6494, + "step": 7128 + }, + { + "epoch": 0.33778725420516464, + "grad_norm": 0.04296875, + "learning_rate": 0.00014886943295057265, + "loss": 0.0051, + "step": 7129 + }, + { + "epoch": 0.33783463634209904, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001488564392897269, + "loss": 0.1142, + "step": 7130 + }, + { + "epoch": 0.33788201847903343, + "grad_norm": 0.671875, + "learning_rate": 0.00014884344454529734, + "loss": 0.6924, + "step": 7131 + }, + { + "epoch": 0.33792940061596777, + "grad_norm": 0.6640625, + "learning_rate": 0.00014883044871757218, + "loss": 0.8983, + "step": 7132 + }, + { + "epoch": 0.33797678275290216, + "grad_norm": 0.5703125, + "learning_rate": 0.0001488174518068397, + "loss": 0.622, + "step": 7133 + }, + { + "epoch": 0.33802416488983655, + "grad_norm": 0.68359375, + "learning_rate": 0.00014880445381338815, + "loss": 1.1508, + "step": 7134 + }, + { + "epoch": 0.3380715470267709, + "grad_norm": 0.259765625, + "learning_rate": 0.00014879145473750577, + "loss": 0.0232, + "step": 7135 + }, + { + "epoch": 0.3381189291637053, + "grad_norm": 0.6640625, + "learning_rate": 0.00014877845457948093, + "loss": 0.7931, + "step": 7136 + }, + { + "epoch": 0.3381663113006397, + "grad_norm": 0.62890625, + "learning_rate": 0.0001487654533396019, + "loss": 1.0485, + "step": 7137 + }, + { + "epoch": 0.338213693437574, + "grad_norm": 0.5859375, + "learning_rate": 0.00014875245101815708, + "loss": 0.4988, + "step": 7138 + }, + { + "epoch": 0.3382610755745084, + "grad_norm": 0.5703125, + "learning_rate": 0.0001487394476154348, + "loss": 0.9374, + "step": 7139 + }, + { + "epoch": 0.3383084577114428, + "grad_norm": 0.53515625, + "learning_rate": 0.0001487264431317235, + "loss": 1.1444, + "step": 7140 + }, + { + "epoch": 0.33835583984837714, + "grad_norm": 0.703125, + "learning_rate": 0.00014871343756731156, + "loss": 0.5691, + "step": 7141 + }, + { + "epoch": 0.33840322198531153, + "grad_norm": 0.6796875, + "learning_rate": 0.00014870043092248748, + "loss": 1.2617, + "step": 7142 + }, + { + "epoch": 0.3384506041222459, + "grad_norm": 0.5546875, + "learning_rate": 0.00014868742319753975, + "loss": 1.0233, + "step": 7143 + }, + { + "epoch": 0.33849798625918026, + "grad_norm": 0.61328125, + "learning_rate": 0.0001486744143927568, + "loss": 1.2125, + "step": 7144 + }, + { + "epoch": 0.33854536839611465, + "grad_norm": 0.62890625, + "learning_rate": 0.00014866140450842718, + "loss": 1.0453, + "step": 7145 + }, + { + "epoch": 0.33859275053304905, + "grad_norm": 0.5859375, + "learning_rate": 0.00014864839354483946, + "loss": 0.8343, + "step": 7146 + }, + { + "epoch": 0.33864013266998344, + "grad_norm": 0.41796875, + "learning_rate": 0.00014863538150228217, + "loss": 0.6355, + "step": 7147 + }, + { + "epoch": 0.3386875148069178, + "grad_norm": 0.62890625, + "learning_rate": 0.00014862236838104396, + "loss": 1.1156, + "step": 7148 + }, + { + "epoch": 0.33873489694385217, + "grad_norm": 0.59375, + "learning_rate": 0.00014860935418141338, + "loss": 1.0201, + "step": 7149 + }, + { + "epoch": 0.33878227908078656, + "grad_norm": 0.6328125, + "learning_rate": 0.00014859633890367907, + "loss": 0.5542, + "step": 7150 + }, + { + "epoch": 0.3388296612177209, + "grad_norm": 0.042236328125, + "learning_rate": 0.0001485833225481298, + "loss": 0.0045, + "step": 7151 + }, + { + "epoch": 0.3388770433546553, + "grad_norm": 0.60546875, + "learning_rate": 0.00014857030511505412, + "loss": 1.0131, + "step": 7152 + }, + { + "epoch": 0.3389244254915897, + "grad_norm": 0.220703125, + "learning_rate": 0.00014855728660474084, + "loss": 0.0299, + "step": 7153 + }, + { + "epoch": 0.338971807628524, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014854426701747865, + "loss": 0.1357, + "step": 7154 + }, + { + "epoch": 0.3390191897654584, + "grad_norm": 0.031494140625, + "learning_rate": 0.00014853124635355632, + "loss": 0.0032, + "step": 7155 + }, + { + "epoch": 0.3390665719023928, + "grad_norm": 0.470703125, + "learning_rate": 0.00014851822461326266, + "loss": 0.3549, + "step": 7156 + }, + { + "epoch": 0.33911395403932715, + "grad_norm": 0.388671875, + "learning_rate": 0.00014850520179688644, + "loss": 0.4451, + "step": 7157 + }, + { + "epoch": 0.33916133617626154, + "grad_norm": 0.44140625, + "learning_rate": 0.0001484921779047165, + "loss": 0.0741, + "step": 7158 + }, + { + "epoch": 0.33920871831319593, + "grad_norm": 0.54296875, + "learning_rate": 0.00014847915293704172, + "loss": 1.5142, + "step": 7159 + }, + { + "epoch": 0.3392561004501303, + "grad_norm": 0.039306640625, + "learning_rate": 0.000148466126894151, + "loss": 0.0014, + "step": 7160 + }, + { + "epoch": 0.33930348258706466, + "grad_norm": 0.67578125, + "learning_rate": 0.00014845309977633316, + "loss": 1.29, + "step": 7161 + }, + { + "epoch": 0.33935086472399906, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014844007158387718, + "loss": 0.0123, + "step": 7162 + }, + { + "epoch": 0.33939824686093345, + "grad_norm": 0.6328125, + "learning_rate": 0.00014842704231707204, + "loss": 0.9652, + "step": 7163 + }, + { + "epoch": 0.3394456289978678, + "grad_norm": 0.5234375, + "learning_rate": 0.00014841401197620665, + "loss": 1.042, + "step": 7164 + }, + { + "epoch": 0.3394930111348022, + "grad_norm": 0.494140625, + "learning_rate": 0.00014840098056157007, + "loss": 0.9504, + "step": 7165 + }, + { + "epoch": 0.3395403932717366, + "grad_norm": 0.7421875, + "learning_rate": 0.00014838794807345128, + "loss": 1.1345, + "step": 7166 + }, + { + "epoch": 0.3395877754086709, + "grad_norm": 0.400390625, + "learning_rate": 0.00014837491451213933, + "loss": 0.0718, + "step": 7167 + }, + { + "epoch": 0.3396351575456053, + "grad_norm": 0.58203125, + "learning_rate": 0.00014836187987792333, + "loss": 0.5195, + "step": 7168 + }, + { + "epoch": 0.3396825396825397, + "grad_norm": 0.58203125, + "learning_rate": 0.00014834884417109235, + "loss": 0.9032, + "step": 7169 + }, + { + "epoch": 0.33972992181947403, + "grad_norm": 0.6484375, + "learning_rate": 0.00014833580739193546, + "loss": 1.1558, + "step": 7170 + }, + { + "epoch": 0.3397773039564084, + "grad_norm": 0.004119873046875, + "learning_rate": 0.00014832276954074191, + "loss": 0.0002, + "step": 7171 + }, + { + "epoch": 0.3398246860933428, + "grad_norm": 0.671875, + "learning_rate": 0.00014830973061780075, + "loss": 0.5184, + "step": 7172 + }, + { + "epoch": 0.33987206823027716, + "grad_norm": 0.73828125, + "learning_rate": 0.00014829669062340123, + "loss": 0.9264, + "step": 7173 + }, + { + "epoch": 0.33991945036721155, + "grad_norm": 0.65625, + "learning_rate": 0.00014828364955783257, + "loss": 0.8504, + "step": 7174 + }, + { + "epoch": 0.33996683250414594, + "grad_norm": 0.64453125, + "learning_rate": 0.00014827060742138399, + "loss": 1.2405, + "step": 7175 + }, + { + "epoch": 0.34001421464108034, + "grad_norm": 0.05859375, + "learning_rate": 0.00014825756421434476, + "loss": 0.007, + "step": 7176 + }, + { + "epoch": 0.3400615967780147, + "grad_norm": 0.6640625, + "learning_rate": 0.00014824451993700416, + "loss": 0.8505, + "step": 7177 + }, + { + "epoch": 0.34010897891494907, + "grad_norm": 0.89453125, + "learning_rate": 0.0001482314745896515, + "loss": 0.6473, + "step": 7178 + }, + { + "epoch": 0.34015636105188346, + "grad_norm": 0.8203125, + "learning_rate": 0.00014821842817257607, + "loss": 1.3722, + "step": 7179 + }, + { + "epoch": 0.3402037431888178, + "grad_norm": 0.65234375, + "learning_rate": 0.00014820538068606727, + "loss": 0.9416, + "step": 7180 + }, + { + "epoch": 0.3402511253257522, + "grad_norm": 0.40234375, + "learning_rate": 0.00014819233213041451, + "loss": 0.0131, + "step": 7181 + }, + { + "epoch": 0.3402985074626866, + "grad_norm": 0.46875, + "learning_rate": 0.00014817928250590714, + "loss": 0.0351, + "step": 7182 + }, + { + "epoch": 0.3403458895996209, + "grad_norm": 0.58984375, + "learning_rate": 0.00014816623181283458, + "loss": 1.075, + "step": 7183 + }, + { + "epoch": 0.3403932717365553, + "grad_norm": 0.7421875, + "learning_rate": 0.0001481531800514863, + "loss": 1.0367, + "step": 7184 + }, + { + "epoch": 0.3404406538734897, + "grad_norm": 0.54296875, + "learning_rate": 0.00014814012722215181, + "loss": 0.8159, + "step": 7185 + }, + { + "epoch": 0.34048803601042404, + "grad_norm": 0.5703125, + "learning_rate": 0.0001481270733251206, + "loss": 0.8916, + "step": 7186 + }, + { + "epoch": 0.34053541814735844, + "grad_norm": 0.90234375, + "learning_rate": 0.00014811401836068211, + "loss": 0.7287, + "step": 7187 + }, + { + "epoch": 0.34058280028429283, + "grad_norm": 0.392578125, + "learning_rate": 0.00014810096232912594, + "loss": 0.1533, + "step": 7188 + }, + { + "epoch": 0.3406301824212272, + "grad_norm": 0.53125, + "learning_rate": 0.0001480879052307417, + "loss": 0.0744, + "step": 7189 + }, + { + "epoch": 0.34067756455816156, + "grad_norm": 0.66015625, + "learning_rate": 0.0001480748470658189, + "loss": 0.9296, + "step": 7190 + }, + { + "epoch": 0.34072494669509595, + "grad_norm": 0.51953125, + "learning_rate": 0.00014806178783464722, + "loss": 0.6243, + "step": 7191 + }, + { + "epoch": 0.34077232883203035, + "grad_norm": 0.470703125, + "learning_rate": 0.00014804872753751625, + "loss": 0.5635, + "step": 7192 + }, + { + "epoch": 0.3408197109689647, + "grad_norm": 0.2451171875, + "learning_rate": 0.00014803566617471573, + "loss": 0.1587, + "step": 7193 + }, + { + "epoch": 0.3408670931058991, + "grad_norm": 0.169921875, + "learning_rate": 0.0001480226037465353, + "loss": 0.0075, + "step": 7194 + }, + { + "epoch": 0.34091447524283347, + "grad_norm": 0.328125, + "learning_rate": 0.00014800954025326465, + "loss": 0.1212, + "step": 7195 + }, + { + "epoch": 0.3409618573797678, + "grad_norm": 0.7890625, + "learning_rate": 0.00014799647569519353, + "loss": 1.1877, + "step": 7196 + }, + { + "epoch": 0.3410092395167022, + "grad_norm": 0.71484375, + "learning_rate": 0.00014798341007261171, + "loss": 1.0534, + "step": 7197 + }, + { + "epoch": 0.3410566216536366, + "grad_norm": 0.67578125, + "learning_rate": 0.00014797034338580897, + "loss": 1.1359, + "step": 7198 + }, + { + "epoch": 0.34110400379057093, + "grad_norm": 1.2890625, + "learning_rate": 0.0001479572756350751, + "loss": 0.2054, + "step": 7199 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 0.453125, + "learning_rate": 0.00014794420682069995, + "loss": 0.3547, + "step": 7200 + }, + { + "epoch": 0.3411987680644397, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00014793113694297336, + "loss": 0.0009, + "step": 7201 + }, + { + "epoch": 0.34124615020137405, + "grad_norm": 0.66796875, + "learning_rate": 0.0001479180660021852, + "loss": 1.2018, + "step": 7202 + }, + { + "epoch": 0.34129353233830845, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014790499399862535, + "loss": 0.0145, + "step": 7203 + }, + { + "epoch": 0.34134091447524284, + "grad_norm": 0.765625, + "learning_rate": 0.00014789192093258378, + "loss": 0.9923, + "step": 7204 + }, + { + "epoch": 0.34138829661217723, + "grad_norm": 0.58203125, + "learning_rate": 0.00014787884680435043, + "loss": 0.5752, + "step": 7205 + }, + { + "epoch": 0.34143567874911157, + "grad_norm": 0.671875, + "learning_rate": 0.00014786577161421524, + "loss": 1.3934, + "step": 7206 + }, + { + "epoch": 0.34148306088604596, + "grad_norm": 0.640625, + "learning_rate": 0.00014785269536246823, + "loss": 1.1021, + "step": 7207 + }, + { + "epoch": 0.34153044302298036, + "grad_norm": 0.6875, + "learning_rate": 0.0001478396180493994, + "loss": 1.4513, + "step": 7208 + }, + { + "epoch": 0.3415778251599147, + "grad_norm": 0.65234375, + "learning_rate": 0.00014782653967529882, + "loss": 0.8498, + "step": 7209 + }, + { + "epoch": 0.3416252072968491, + "grad_norm": 0.859375, + "learning_rate": 0.0001478134602404565, + "loss": 0.5001, + "step": 7210 + }, + { + "epoch": 0.3416725894337835, + "grad_norm": 0.8359375, + "learning_rate": 0.00014780037974516258, + "loss": 0.8522, + "step": 7211 + }, + { + "epoch": 0.3417199715707178, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014778729818970714, + "loss": 0.0263, + "step": 7212 + }, + { + "epoch": 0.3417673537076522, + "grad_norm": 0.66015625, + "learning_rate": 0.00014777421557438033, + "loss": 1.4596, + "step": 7213 + }, + { + "epoch": 0.3418147358445866, + "grad_norm": 0.6484375, + "learning_rate": 0.0001477611318994723, + "loss": 0.9247, + "step": 7214 + }, + { + "epoch": 0.34186211798152094, + "grad_norm": 0.640625, + "learning_rate": 0.00014774804716527324, + "loss": 1.1011, + "step": 7215 + }, + { + "epoch": 0.34190950011845533, + "grad_norm": 0.72265625, + "learning_rate": 0.00014773496137207337, + "loss": 0.4848, + "step": 7216 + }, + { + "epoch": 0.3419568822553897, + "grad_norm": 0.19921875, + "learning_rate": 0.0001477218745201629, + "loss": 0.1409, + "step": 7217 + }, + { + "epoch": 0.3420042643923241, + "grad_norm": 0.6484375, + "learning_rate": 0.00014770878660983207, + "loss": 1.2408, + "step": 7218 + }, + { + "epoch": 0.34205164652925846, + "grad_norm": 0.26171875, + "learning_rate": 0.00014769569764137117, + "loss": 0.1452, + "step": 7219 + }, + { + "epoch": 0.34209902866619285, + "grad_norm": 0.5390625, + "learning_rate": 0.0001476826076150705, + "loss": 0.0404, + "step": 7220 + }, + { + "epoch": 0.34214641080312724, + "grad_norm": 0.87890625, + "learning_rate": 0.0001476695165312204, + "loss": 1.304, + "step": 7221 + }, + { + "epoch": 0.3421937929400616, + "grad_norm": 0.73828125, + "learning_rate": 0.00014765642439011116, + "loss": 1.162, + "step": 7222 + }, + { + "epoch": 0.342241175076996, + "grad_norm": 0.76953125, + "learning_rate": 0.0001476433311920332, + "loss": 1.0679, + "step": 7223 + }, + { + "epoch": 0.34228855721393037, + "grad_norm": 0.78515625, + "learning_rate": 0.00014763023693727695, + "loss": 0.1273, + "step": 7224 + }, + { + "epoch": 0.3423359393508647, + "grad_norm": 0.671875, + "learning_rate": 0.00014761714162613273, + "loss": 1.2374, + "step": 7225 + }, + { + "epoch": 0.3423833214877991, + "grad_norm": 0.6640625, + "learning_rate": 0.00014760404525889105, + "loss": 0.7444, + "step": 7226 + }, + { + "epoch": 0.3424307036247335, + "grad_norm": 0.59765625, + "learning_rate": 0.00014759094783584233, + "loss": 0.9537, + "step": 7227 + }, + { + "epoch": 0.3424780857616678, + "grad_norm": 0.55078125, + "learning_rate": 0.0001475778493572771, + "loss": 0.3954, + "step": 7228 + }, + { + "epoch": 0.3425254678986022, + "grad_norm": 1.046875, + "learning_rate": 0.00014756474982348584, + "loss": 0.3159, + "step": 7229 + }, + { + "epoch": 0.3425728500355366, + "grad_norm": 0.67578125, + "learning_rate": 0.00014755164923475908, + "loss": 1.084, + "step": 7230 + }, + { + "epoch": 0.34262023217247095, + "grad_norm": 0.62890625, + "learning_rate": 0.00014753854759138742, + "loss": 0.9867, + "step": 7231 + }, + { + "epoch": 0.34266761430940534, + "grad_norm": 0.41796875, + "learning_rate": 0.0001475254448936614, + "loss": 0.089, + "step": 7232 + }, + { + "epoch": 0.34271499644633974, + "grad_norm": 0.59375, + "learning_rate": 0.0001475123411418716, + "loss": 0.9966, + "step": 7233 + }, + { + "epoch": 0.34276237858327413, + "grad_norm": 0.169921875, + "learning_rate": 0.00014749923633630872, + "loss": 0.0149, + "step": 7234 + }, + { + "epoch": 0.34280976072020847, + "grad_norm": 1.0703125, + "learning_rate": 0.00014748613047726336, + "loss": 1.4929, + "step": 7235 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.75390625, + "learning_rate": 0.00014747302356502622, + "loss": 0.3428, + "step": 7236 + }, + { + "epoch": 0.34290452499407725, + "grad_norm": 0.0027923583984375, + "learning_rate": 0.00014745991559988794, + "loss": 0.0002, + "step": 7237 + }, + { + "epoch": 0.3429519071310116, + "grad_norm": 0.69140625, + "learning_rate": 0.00014744680658213932, + "loss": 1.5044, + "step": 7238 + }, + { + "epoch": 0.342999289267946, + "grad_norm": 0.62109375, + "learning_rate": 0.00014743369651207106, + "loss": 1.1535, + "step": 7239 + }, + { + "epoch": 0.3430466714048804, + "grad_norm": 0.68359375, + "learning_rate": 0.00014742058538997393, + "loss": 1.421, + "step": 7240 + }, + { + "epoch": 0.3430940535418147, + "grad_norm": 0.7265625, + "learning_rate": 0.00014740747321613875, + "loss": 1.2337, + "step": 7241 + }, + { + "epoch": 0.3431414356787491, + "grad_norm": 0.59375, + "learning_rate": 0.0001473943599908563, + "loss": 1.1612, + "step": 7242 + }, + { + "epoch": 0.3431888178156835, + "grad_norm": 0.06298828125, + "learning_rate": 0.00014738124571441743, + "loss": 0.0057, + "step": 7243 + }, + { + "epoch": 0.34323619995261784, + "grad_norm": 0.86328125, + "learning_rate": 0.00014736813038711296, + "loss": 0.3686, + "step": 7244 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014735501400923385, + "loss": 0.0485, + "step": 7245 + }, + { + "epoch": 0.3433309642264866, + "grad_norm": 0.7421875, + "learning_rate": 0.00014734189658107094, + "loss": 1.0313, + "step": 7246 + }, + { + "epoch": 0.343378346363421, + "grad_norm": 0.6171875, + "learning_rate": 0.00014732877810291523, + "loss": 0.9282, + "step": 7247 + }, + { + "epoch": 0.34342572850035535, + "grad_norm": 0.6328125, + "learning_rate": 0.00014731565857505763, + "loss": 0.6842, + "step": 7248 + }, + { + "epoch": 0.34347311063728975, + "grad_norm": 0.384765625, + "learning_rate": 0.0001473025379977891, + "loss": 0.2441, + "step": 7249 + }, + { + "epoch": 0.34352049277422414, + "grad_norm": 0.6796875, + "learning_rate": 0.00014728941637140065, + "loss": 0.7064, + "step": 7250 + }, + { + "epoch": 0.3435678749111585, + "grad_norm": 0.5390625, + "learning_rate": 0.00014727629369618334, + "loss": 0.5426, + "step": 7251 + }, + { + "epoch": 0.34361525704809287, + "grad_norm": 0.6875, + "learning_rate": 0.00014726316997242818, + "loss": 1.2039, + "step": 7252 + }, + { + "epoch": 0.34366263918502726, + "grad_norm": 0.01708984375, + "learning_rate": 0.00014725004520042628, + "loss": 0.0011, + "step": 7253 + }, + { + "epoch": 0.3437100213219616, + "grad_norm": 0.57421875, + "learning_rate": 0.00014723691938046867, + "loss": 1.3481, + "step": 7254 + }, + { + "epoch": 0.343757403458896, + "grad_norm": 0.73828125, + "learning_rate": 0.00014722379251284654, + "loss": 1.3633, + "step": 7255 + }, + { + "epoch": 0.3438047855958304, + "grad_norm": 0.5, + "learning_rate": 0.00014721066459785095, + "loss": 0.1503, + "step": 7256 + }, + { + "epoch": 0.3438521677327647, + "grad_norm": 0.69921875, + "learning_rate": 0.00014719753563577313, + "loss": 1.1006, + "step": 7257 + }, + { + "epoch": 0.3438995498696991, + "grad_norm": 0.66015625, + "learning_rate": 0.00014718440562690424, + "loss": 0.5793, + "step": 7258 + }, + { + "epoch": 0.3439469320066335, + "grad_norm": 0.87109375, + "learning_rate": 0.0001471712745715355, + "loss": 1.1158, + "step": 7259 + }, + { + "epoch": 0.34399431414356785, + "grad_norm": 0.47265625, + "learning_rate": 0.0001471581424699581, + "loss": 0.9373, + "step": 7260 + }, + { + "epoch": 0.34404169628050224, + "grad_norm": 0.251953125, + "learning_rate": 0.00014714500932246332, + "loss": 0.1639, + "step": 7261 + }, + { + "epoch": 0.34408907841743663, + "grad_norm": 0.201171875, + "learning_rate": 0.00014713187512934253, + "loss": 0.1281, + "step": 7262 + }, + { + "epoch": 0.344136460554371, + "grad_norm": 0.65625, + "learning_rate": 0.0001471187398908869, + "loss": 1.3557, + "step": 7263 + }, + { + "epoch": 0.34418384269130536, + "grad_norm": 0.734375, + "learning_rate": 0.0001471056036073878, + "loss": 1.4961, + "step": 7264 + }, + { + "epoch": 0.34423122482823976, + "grad_norm": 0.6171875, + "learning_rate": 0.0001470924662791366, + "loss": 0.7998, + "step": 7265 + }, + { + "epoch": 0.34427860696517415, + "grad_norm": 0.5390625, + "learning_rate": 0.0001470793279064247, + "loss": 0.7616, + "step": 7266 + }, + { + "epoch": 0.3443259891021085, + "grad_norm": 0.416015625, + "learning_rate": 0.0001470661884895434, + "loss": 0.5415, + "step": 7267 + }, + { + "epoch": 0.3443733712390429, + "grad_norm": 0.57421875, + "learning_rate": 0.00014705304802878417, + "loss": 0.7378, + "step": 7268 + }, + { + "epoch": 0.3444207533759773, + "grad_norm": 0.73828125, + "learning_rate": 0.00014703990652443845, + "loss": 0.9451, + "step": 7269 + }, + { + "epoch": 0.3444681355129116, + "grad_norm": 1.0703125, + "learning_rate": 0.00014702676397679776, + "loss": 0.3388, + "step": 7270 + }, + { + "epoch": 0.344515517649846, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00014701362038615348, + "loss": 0.0011, + "step": 7271 + }, + { + "epoch": 0.3445628997867804, + "grad_norm": 0.7890625, + "learning_rate": 0.0001470004757527972, + "loss": 1.2691, + "step": 7272 + }, + { + "epoch": 0.34461028192371473, + "grad_norm": 0.62890625, + "learning_rate": 0.00014698733007702044, + "loss": 0.8237, + "step": 7273 + }, + { + "epoch": 0.3446576640606491, + "grad_norm": 0.2197265625, + "learning_rate": 0.00014697418335911472, + "loss": 0.065, + "step": 7274 + }, + { + "epoch": 0.3447050461975835, + "grad_norm": 0.68359375, + "learning_rate": 0.00014696103559937165, + "loss": 1.1218, + "step": 7275 + }, + { + "epoch": 0.3447524283345179, + "grad_norm": 0.2109375, + "learning_rate": 0.00014694788679808286, + "loss": 0.1589, + "step": 7276 + }, + { + "epoch": 0.34479981047145225, + "grad_norm": 0.228515625, + "learning_rate": 0.00014693473695553995, + "loss": 0.1621, + "step": 7277 + }, + { + "epoch": 0.34484719260838664, + "grad_norm": 0.68359375, + "learning_rate": 0.00014692158607203454, + "loss": 0.9023, + "step": 7278 + }, + { + "epoch": 0.34489457474532104, + "grad_norm": 0.58203125, + "learning_rate": 0.00014690843414785835, + "loss": 0.8748, + "step": 7279 + }, + { + "epoch": 0.3449419568822554, + "grad_norm": 0.76171875, + "learning_rate": 0.00014689528118330304, + "loss": 0.1103, + "step": 7280 + }, + { + "epoch": 0.34498933901918977, + "grad_norm": 0.34765625, + "learning_rate": 0.00014688212717866038, + "loss": 0.0525, + "step": 7281 + }, + { + "epoch": 0.34503672115612416, + "grad_norm": 0.06494140625, + "learning_rate": 0.000146868972134222, + "loss": 0.008, + "step": 7282 + }, + { + "epoch": 0.3450841032930585, + "grad_norm": 0.65625, + "learning_rate": 0.00014685581605027978, + "loss": 0.9089, + "step": 7283 + }, + { + "epoch": 0.3451314854299929, + "grad_norm": 0.470703125, + "learning_rate": 0.00014684265892712548, + "loss": 0.2723, + "step": 7284 + }, + { + "epoch": 0.3451788675669273, + "grad_norm": 0.7734375, + "learning_rate": 0.0001468295007650509, + "loss": 1.4206, + "step": 7285 + }, + { + "epoch": 0.3452262497038616, + "grad_norm": 0.59375, + "learning_rate": 0.00014681634156434785, + "loss": 1.3776, + "step": 7286 + }, + { + "epoch": 0.345273631840796, + "grad_norm": 0.53125, + "learning_rate": 0.00014680318132530827, + "loss": 1.148, + "step": 7287 + }, + { + "epoch": 0.3453210139777304, + "grad_norm": 0.427734375, + "learning_rate": 0.0001467900200482239, + "loss": 0.6705, + "step": 7288 + }, + { + "epoch": 0.34536839611466474, + "grad_norm": 0.5234375, + "learning_rate": 0.00014677685773338678, + "loss": 0.72, + "step": 7289 + }, + { + "epoch": 0.34541577825159914, + "grad_norm": 0.25390625, + "learning_rate": 0.00014676369438108874, + "loss": 0.0097, + "step": 7290 + }, + { + "epoch": 0.34546316038853353, + "grad_norm": 0.6796875, + "learning_rate": 0.00014675052999162179, + "loss": 1.5282, + "step": 7291 + }, + { + "epoch": 0.3455105425254679, + "grad_norm": 0.87890625, + "learning_rate": 0.00014673736456527787, + "loss": 1.3205, + "step": 7292 + }, + { + "epoch": 0.34555792466240226, + "grad_norm": 0.62109375, + "learning_rate": 0.00014672419810234902, + "loss": 1.3132, + "step": 7293 + }, + { + "epoch": 0.34560530679933665, + "grad_norm": 0.490234375, + "learning_rate": 0.00014671103060312718, + "loss": 0.0965, + "step": 7294 + }, + { + "epoch": 0.34565268893627105, + "grad_norm": 0.255859375, + "learning_rate": 0.00014669786206790447, + "loss": 0.1393, + "step": 7295 + }, + { + "epoch": 0.3457000710732054, + "grad_norm": 0.6328125, + "learning_rate": 0.0001466846924969729, + "loss": 1.0978, + "step": 7296 + }, + { + "epoch": 0.3457474532101398, + "grad_norm": 0.9296875, + "learning_rate": 0.00014667152189062462, + "loss": 1.045, + "step": 7297 + }, + { + "epoch": 0.34579483534707417, + "grad_norm": 0.734375, + "learning_rate": 0.00014665835024915165, + "loss": 0.7312, + "step": 7298 + }, + { + "epoch": 0.3458422174840085, + "grad_norm": 0.55078125, + "learning_rate": 0.00014664517757284617, + "loss": 0.732, + "step": 7299 + }, + { + "epoch": 0.3458895996209429, + "grad_norm": 0.99609375, + "learning_rate": 0.00014663200386200035, + "loss": 0.8799, + "step": 7300 + }, + { + "epoch": 0.3459369817578773, + "grad_norm": 0.5703125, + "learning_rate": 0.00014661882911690634, + "loss": 0.5094, + "step": 7301 + }, + { + "epoch": 0.34598436389481163, + "grad_norm": 0.7421875, + "learning_rate": 0.00014660565333785637, + "loss": 0.969, + "step": 7302 + }, + { + "epoch": 0.346031746031746, + "grad_norm": 0.27734375, + "learning_rate": 0.00014659247652514266, + "loss": 0.02, + "step": 7303 + }, + { + "epoch": 0.3460791281686804, + "grad_norm": 0.466796875, + "learning_rate": 0.00014657929867905746, + "loss": 0.5696, + "step": 7304 + }, + { + "epoch": 0.3461265103056148, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014656611979989298, + "loss": 0.1315, + "step": 7305 + }, + { + "epoch": 0.34617389244254915, + "grad_norm": 0.0283203125, + "learning_rate": 0.00014655293988794158, + "loss": 0.0009, + "step": 7306 + }, + { + "epoch": 0.34622127457948354, + "grad_norm": 0.5546875, + "learning_rate": 0.0001465397589434956, + "loss": 0.9577, + "step": 7307 + }, + { + "epoch": 0.34626865671641793, + "grad_norm": 0.77734375, + "learning_rate": 0.0001465265769668473, + "loss": 1.1871, + "step": 7308 + }, + { + "epoch": 0.34631603885335227, + "grad_norm": 0.1875, + "learning_rate": 0.00014651339395828906, + "loss": 0.1278, + "step": 7309 + }, + { + "epoch": 0.34636342099028666, + "grad_norm": 0.82421875, + "learning_rate": 0.00014650020991811334, + "loss": 1.3712, + "step": 7310 + }, + { + "epoch": 0.34641080312722106, + "grad_norm": 0.5390625, + "learning_rate": 0.00014648702484661245, + "loss": 1.0471, + "step": 7311 + }, + { + "epoch": 0.3464581852641554, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001464738387440789, + "loss": 0.0317, + "step": 7312 + }, + { + "epoch": 0.3465055674010898, + "grad_norm": 0.291015625, + "learning_rate": 0.00014646065161080509, + "loss": 0.14, + "step": 7313 + }, + { + "epoch": 0.3465529495380242, + "grad_norm": 0.96484375, + "learning_rate": 0.00014644746344708351, + "loss": 0.913, + "step": 7314 + }, + { + "epoch": 0.3466003316749585, + "grad_norm": 0.51171875, + "learning_rate": 0.00014643427425320665, + "loss": 0.3591, + "step": 7315 + }, + { + "epoch": 0.3466477138118929, + "grad_norm": 0.76171875, + "learning_rate": 0.00014642108402946707, + "loss": 1.342, + "step": 7316 + }, + { + "epoch": 0.3466950959488273, + "grad_norm": 0.05859375, + "learning_rate": 0.0001464078927761573, + "loss": 0.0029, + "step": 7317 + }, + { + "epoch": 0.34674247808576164, + "grad_norm": 0.07861328125, + "learning_rate": 0.00014639470049356988, + "loss": 0.0056, + "step": 7318 + }, + { + "epoch": 0.34678986022269603, + "grad_norm": 0.52734375, + "learning_rate": 0.00014638150718199744, + "loss": 0.0129, + "step": 7319 + }, + { + "epoch": 0.3468372423596304, + "grad_norm": 0.52734375, + "learning_rate": 0.00014636831284173257, + "loss": 0.406, + "step": 7320 + }, + { + "epoch": 0.3468846244965648, + "grad_norm": 0.66796875, + "learning_rate": 0.0001463551174730679, + "loss": 0.7431, + "step": 7321 + }, + { + "epoch": 0.34693200663349916, + "grad_norm": 0.59375, + "learning_rate": 0.0001463419210762961, + "loss": 0.4843, + "step": 7322 + }, + { + "epoch": 0.34697938877043355, + "grad_norm": 0.69140625, + "learning_rate": 0.00014632872365170986, + "loss": 1.1616, + "step": 7323 + }, + { + "epoch": 0.34702677090736794, + "grad_norm": 0.71484375, + "learning_rate": 0.00014631552519960185, + "loss": 0.9166, + "step": 7324 + }, + { + "epoch": 0.3470741530443023, + "grad_norm": 0.81640625, + "learning_rate": 0.00014630232572026484, + "loss": 1.215, + "step": 7325 + }, + { + "epoch": 0.3471215351812367, + "grad_norm": 0.388671875, + "learning_rate": 0.0001462891252139916, + "loss": 0.1048, + "step": 7326 + }, + { + "epoch": 0.34716891731817107, + "grad_norm": 0.5703125, + "learning_rate": 0.00014627592368107484, + "loss": 0.485, + "step": 7327 + }, + { + "epoch": 0.3472162994551054, + "grad_norm": 0.91015625, + "learning_rate": 0.00014626272112180737, + "loss": 1.0416, + "step": 7328 + }, + { + "epoch": 0.3472636815920398, + "grad_norm": 0.58984375, + "learning_rate": 0.00014624951753648203, + "loss": 0.5534, + "step": 7329 + }, + { + "epoch": 0.3473110637289742, + "grad_norm": 0.0162353515625, + "learning_rate": 0.00014623631292539163, + "loss": 0.0008, + "step": 7330 + }, + { + "epoch": 0.3473584458659085, + "grad_norm": 1.0703125, + "learning_rate": 0.00014622310728882912, + "loss": 1.2545, + "step": 7331 + }, + { + "epoch": 0.3474058280028429, + "grad_norm": 0.423828125, + "learning_rate": 0.0001462099006270873, + "loss": 0.1492, + "step": 7332 + }, + { + "epoch": 0.3474532101397773, + "grad_norm": 0.91015625, + "learning_rate": 0.00014619669294045905, + "loss": 0.9416, + "step": 7333 + }, + { + "epoch": 0.3475005922767117, + "grad_norm": 0.51171875, + "learning_rate": 0.00014618348422923742, + "loss": 1.1713, + "step": 7334 + }, + { + "epoch": 0.34754797441364604, + "grad_norm": 0.279296875, + "learning_rate": 0.00014617027449371532, + "loss": 0.0498, + "step": 7335 + }, + { + "epoch": 0.34759535655058044, + "grad_norm": 0.640625, + "learning_rate": 0.00014615706373418566, + "loss": 0.1106, + "step": 7336 + }, + { + "epoch": 0.34764273868751483, + "grad_norm": 0.65625, + "learning_rate": 0.0001461438519509415, + "loss": 1.1389, + "step": 7337 + }, + { + "epoch": 0.34769012082444917, + "grad_norm": 0.58984375, + "learning_rate": 0.00014613063914427585, + "loss": 0.9827, + "step": 7338 + }, + { + "epoch": 0.34773750296138356, + "grad_norm": 0.58203125, + "learning_rate": 0.0001461174253144818, + "loss": 0.7366, + "step": 7339 + }, + { + "epoch": 0.34778488509831795, + "grad_norm": 0.470703125, + "learning_rate": 0.00014610421046185233, + "loss": 0.0852, + "step": 7340 + }, + { + "epoch": 0.3478322672352523, + "grad_norm": 0.9453125, + "learning_rate": 0.0001460909945866806, + "loss": 0.8307, + "step": 7341 + }, + { + "epoch": 0.3478796493721867, + "grad_norm": 0.7265625, + "learning_rate": 0.0001460777776892597, + "loss": 1.0301, + "step": 7342 + }, + { + "epoch": 0.3479270315091211, + "grad_norm": 0.68359375, + "learning_rate": 0.0001460645597698828, + "loss": 1.4619, + "step": 7343 + }, + { + "epoch": 0.3479744136460554, + "grad_norm": 0.7265625, + "learning_rate": 0.00014605134082884295, + "loss": 1.3222, + "step": 7344 + }, + { + "epoch": 0.3480217957829898, + "grad_norm": 0.83984375, + "learning_rate": 0.00014603812086643348, + "loss": 1.4361, + "step": 7345 + }, + { + "epoch": 0.3480691779199242, + "grad_norm": 0.56640625, + "learning_rate": 0.0001460248998829475, + "loss": 0.4261, + "step": 7346 + }, + { + "epoch": 0.34811656005685854, + "grad_norm": 0.61328125, + "learning_rate": 0.00014601167787867827, + "loss": 0.9054, + "step": 7347 + }, + { + "epoch": 0.34816394219379293, + "grad_norm": 0.7890625, + "learning_rate": 0.00014599845485391906, + "loss": 0.8762, + "step": 7348 + }, + { + "epoch": 0.3482113243307273, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014598523080896307, + "loss": 0.1223, + "step": 7349 + }, + { + "epoch": 0.3482587064676617, + "grad_norm": 0.53125, + "learning_rate": 0.0001459720057441037, + "loss": 0.5596, + "step": 7350 + }, + { + "epoch": 0.34830608860459605, + "grad_norm": 0.74609375, + "learning_rate": 0.00014595877965963418, + "loss": 1.2321, + "step": 7351 + }, + { + "epoch": 0.34835347074153045, + "grad_norm": 1.1328125, + "learning_rate": 0.00014594555255584786, + "loss": 1.2072, + "step": 7352 + }, + { + "epoch": 0.34840085287846484, + "grad_norm": 0.84375, + "learning_rate": 0.00014593232443303812, + "loss": 1.08, + "step": 7353 + }, + { + "epoch": 0.3484482350153992, + "grad_norm": 0.59375, + "learning_rate": 0.00014591909529149838, + "loss": 0.8205, + "step": 7354 + }, + { + "epoch": 0.34849561715233357, + "grad_norm": 0.703125, + "learning_rate": 0.00014590586513152202, + "loss": 1.0821, + "step": 7355 + }, + { + "epoch": 0.34854299928926796, + "grad_norm": 0.68359375, + "learning_rate": 0.00014589263395340245, + "loss": 0.8623, + "step": 7356 + }, + { + "epoch": 0.3485903814262023, + "grad_norm": 0.48828125, + "learning_rate": 0.00014587940175743317, + "loss": 0.7111, + "step": 7357 + }, + { + "epoch": 0.3486377635631367, + "grad_norm": 0.478515625, + "learning_rate": 0.0001458661685439076, + "loss": 0.1724, + "step": 7358 + }, + { + "epoch": 0.3486851457000711, + "grad_norm": 0.625, + "learning_rate": 0.00014585293431311925, + "loss": 0.7656, + "step": 7359 + }, + { + "epoch": 0.3487325278370054, + "grad_norm": 0.69921875, + "learning_rate": 0.00014583969906536168, + "loss": 0.4699, + "step": 7360 + }, + { + "epoch": 0.3487799099739398, + "grad_norm": 0.81640625, + "learning_rate": 0.0001458264628009284, + "loss": 1.0327, + "step": 7361 + }, + { + "epoch": 0.3488272921108742, + "grad_norm": 0.7265625, + "learning_rate": 0.000145813225520113, + "loss": 1.2035, + "step": 7362 + }, + { + "epoch": 0.3488746742478086, + "grad_norm": 0.82421875, + "learning_rate": 0.00014579998722320906, + "loss": 0.2131, + "step": 7363 + }, + { + "epoch": 0.34892205638474294, + "grad_norm": 0.70703125, + "learning_rate": 0.00014578674791051018, + "loss": 0.8787, + "step": 7364 + }, + { + "epoch": 0.34896943852167733, + "grad_norm": 0.58984375, + "learning_rate": 0.00014577350758231, + "loss": 0.5334, + "step": 7365 + }, + { + "epoch": 0.3490168206586117, + "grad_norm": 0.609375, + "learning_rate": 0.0001457602662389022, + "loss": 0.7806, + "step": 7366 + }, + { + "epoch": 0.34906420279554606, + "grad_norm": 0.62890625, + "learning_rate": 0.0001457470238805804, + "loss": 0.957, + "step": 7367 + }, + { + "epoch": 0.34911158493248046, + "grad_norm": 0.57421875, + "learning_rate": 0.00014573378050763836, + "loss": 0.6307, + "step": 7368 + }, + { + "epoch": 0.34915896706941485, + "grad_norm": 0.11572265625, + "learning_rate": 0.00014572053612036979, + "loss": 0.0118, + "step": 7369 + }, + { + "epoch": 0.3492063492063492, + "grad_norm": 0.8515625, + "learning_rate": 0.00014570729071906839, + "loss": 1.1567, + "step": 7370 + }, + { + "epoch": 0.3492537313432836, + "grad_norm": 0.6875, + "learning_rate": 0.00014569404430402798, + "loss": 0.8284, + "step": 7371 + }, + { + "epoch": 0.349301113480218, + "grad_norm": 0.60546875, + "learning_rate": 0.00014568079687554236, + "loss": 0.6115, + "step": 7372 + }, + { + "epoch": 0.3493484956171523, + "grad_norm": 0.65234375, + "learning_rate": 0.00014566754843390535, + "loss": 1.1986, + "step": 7373 + }, + { + "epoch": 0.3493958777540867, + "grad_norm": 0.734375, + "learning_rate": 0.0001456542989794107, + "loss": 0.9807, + "step": 7374 + }, + { + "epoch": 0.3494432598910211, + "grad_norm": 0.6796875, + "learning_rate": 0.00014564104851235232, + "loss": 1.0133, + "step": 7375 + }, + { + "epoch": 0.34949064202795543, + "grad_norm": 0.59765625, + "learning_rate": 0.00014562779703302411, + "loss": 1.1259, + "step": 7376 + }, + { + "epoch": 0.3495380241648898, + "grad_norm": 0.71875, + "learning_rate": 0.00014561454454172, + "loss": 1.0618, + "step": 7377 + }, + { + "epoch": 0.3495854063018242, + "grad_norm": 0.625, + "learning_rate": 0.00014560129103873386, + "loss": 0.7027, + "step": 7378 + }, + { + "epoch": 0.3496327884387586, + "grad_norm": 0.90625, + "learning_rate": 0.00014558803652435964, + "loss": 0.2671, + "step": 7379 + }, + { + "epoch": 0.34968017057569295, + "grad_norm": 0.53125, + "learning_rate": 0.00014557478099889136, + "loss": 0.7036, + "step": 7380 + }, + { + "epoch": 0.34972755271262734, + "grad_norm": 0.75390625, + "learning_rate": 0.00014556152446262296, + "loss": 0.6956, + "step": 7381 + }, + { + "epoch": 0.34977493484956174, + "grad_norm": 0.71484375, + "learning_rate": 0.00014554826691584846, + "loss": 1.3122, + "step": 7382 + }, + { + "epoch": 0.3498223169864961, + "grad_norm": 0.640625, + "learning_rate": 0.00014553500835886194, + "loss": 1.0056, + "step": 7383 + }, + { + "epoch": 0.34986969912343047, + "grad_norm": 0.64453125, + "learning_rate": 0.00014552174879195744, + "loss": 1.1146, + "step": 7384 + }, + { + "epoch": 0.34991708126036486, + "grad_norm": 0.84375, + "learning_rate": 0.00014550848821542905, + "loss": 0.7516, + "step": 7385 + }, + { + "epoch": 0.3499644633972992, + "grad_norm": 0.65234375, + "learning_rate": 0.0001454952266295708, + "loss": 1.2665, + "step": 7386 + }, + { + "epoch": 0.3500118455342336, + "grad_norm": 1.21875, + "learning_rate": 0.00014548196403467694, + "loss": 0.3745, + "step": 7387 + }, + { + "epoch": 0.350059227671168, + "grad_norm": 0.396484375, + "learning_rate": 0.00014546870043104156, + "loss": 0.5764, + "step": 7388 + }, + { + "epoch": 0.3501066098081023, + "grad_norm": 0.70703125, + "learning_rate": 0.00014545543581895884, + "loss": 0.6194, + "step": 7389 + }, + { + "epoch": 0.3501539919450367, + "grad_norm": 0.00154876708984375, + "learning_rate": 0.00014544217019872295, + "loss": 0.0001, + "step": 7390 + }, + { + "epoch": 0.3502013740819711, + "grad_norm": 0.85546875, + "learning_rate": 0.00014542890357062814, + "loss": 0.9624, + "step": 7391 + }, + { + "epoch": 0.3502487562189055, + "grad_norm": 0.6640625, + "learning_rate": 0.00014541563593496864, + "loss": 1.2012, + "step": 7392 + }, + { + "epoch": 0.35029613835583984, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014540236729203868, + "loss": 0.0248, + "step": 7393 + }, + { + "epoch": 0.35034352049277423, + "grad_norm": 0.7265625, + "learning_rate": 0.0001453890976421326, + "loss": 0.8003, + "step": 7394 + }, + { + "epoch": 0.3503909026297086, + "grad_norm": 1.4453125, + "learning_rate": 0.00014537582698554466, + "loss": 0.9812, + "step": 7395 + }, + { + "epoch": 0.35043828476664296, + "grad_norm": 0.625, + "learning_rate": 0.00014536255532256927, + "loss": 0.8983, + "step": 7396 + }, + { + "epoch": 0.35048566690357735, + "grad_norm": 0.154296875, + "learning_rate": 0.00014534928265350067, + "loss": 0.0213, + "step": 7397 + }, + { + "epoch": 0.35053304904051175, + "grad_norm": 0.5390625, + "learning_rate": 0.0001453360089786333, + "loss": 0.8947, + "step": 7398 + }, + { + "epoch": 0.3505804311774461, + "grad_norm": 0.625, + "learning_rate": 0.00014532273429826152, + "loss": 0.8732, + "step": 7399 + }, + { + "epoch": 0.3506278133143805, + "grad_norm": 0.185546875, + "learning_rate": 0.0001453094586126798, + "loss": 0.1235, + "step": 7400 + }, + { + "epoch": 0.35067519545131487, + "grad_norm": 0.04443359375, + "learning_rate": 0.0001452961819221825, + "loss": 0.0053, + "step": 7401 + }, + { + "epoch": 0.3507225775882492, + "grad_norm": 0.189453125, + "learning_rate": 0.00014528290422706418, + "loss": 0.132, + "step": 7402 + }, + { + "epoch": 0.3507699597251836, + "grad_norm": 0.64453125, + "learning_rate": 0.00014526962552761927, + "loss": 0.8855, + "step": 7403 + }, + { + "epoch": 0.350817341862118, + "grad_norm": 0.796875, + "learning_rate": 0.00014525634582414226, + "loss": 0.7257, + "step": 7404 + }, + { + "epoch": 0.35086472399905233, + "grad_norm": 0.5234375, + "learning_rate": 0.00014524306511692772, + "loss": 1.1252, + "step": 7405 + }, + { + "epoch": 0.3509121061359867, + "grad_norm": 0.6484375, + "learning_rate": 0.00014522978340627017, + "loss": 1.1197, + "step": 7406 + }, + { + "epoch": 0.3509594882729211, + "grad_norm": 0.671875, + "learning_rate": 0.00014521650069246423, + "loss": 1.0832, + "step": 7407 + }, + { + "epoch": 0.3510068704098555, + "grad_norm": 0.6640625, + "learning_rate": 0.00014520321697580446, + "loss": 1.3954, + "step": 7408 + }, + { + "epoch": 0.35105425254678985, + "grad_norm": 0.48828125, + "learning_rate": 0.00014518993225658548, + "loss": 0.1518, + "step": 7409 + }, + { + "epoch": 0.35110163468372424, + "grad_norm": 0.68359375, + "learning_rate": 0.0001451766465351019, + "loss": 0.9196, + "step": 7410 + }, + { + "epoch": 0.35114901682065863, + "grad_norm": 0.63671875, + "learning_rate": 0.0001451633598116485, + "loss": 1.2294, + "step": 7411 + }, + { + "epoch": 0.35119639895759297, + "grad_norm": 0.8515625, + "learning_rate": 0.00014515007208651984, + "loss": 0.9131, + "step": 7412 + }, + { + "epoch": 0.35124378109452736, + "grad_norm": 0.8046875, + "learning_rate": 0.00014513678336001068, + "loss": 1.2582, + "step": 7413 + }, + { + "epoch": 0.35129116323146176, + "grad_norm": 0.7265625, + "learning_rate": 0.00014512349363241572, + "loss": 0.7336, + "step": 7414 + }, + { + "epoch": 0.3513385453683961, + "grad_norm": 0.006500244140625, + "learning_rate": 0.00014511020290402976, + "loss": 0.0003, + "step": 7415 + }, + { + "epoch": 0.3513859275053305, + "grad_norm": 1.1015625, + "learning_rate": 0.00014509691117514753, + "loss": 0.5614, + "step": 7416 + }, + { + "epoch": 0.3514333096422649, + "grad_norm": 0.6953125, + "learning_rate": 0.00014508361844606387, + "loss": 0.8816, + "step": 7417 + }, + { + "epoch": 0.3514806917791992, + "grad_norm": 0.322265625, + "learning_rate": 0.00014507032471707353, + "loss": 0.1625, + "step": 7418 + }, + { + "epoch": 0.3515280739161336, + "grad_norm": 0.1865234375, + "learning_rate": 0.00014505702998847145, + "loss": 0.1259, + "step": 7419 + }, + { + "epoch": 0.351575456053068, + "grad_norm": 0.62109375, + "learning_rate": 0.00014504373426055243, + "loss": 0.9052, + "step": 7420 + }, + { + "epoch": 0.3516228381900024, + "grad_norm": 0.55078125, + "learning_rate": 0.00014503043753361135, + "loss": 1.4965, + "step": 7421 + }, + { + "epoch": 0.35167022032693673, + "grad_norm": 0.734375, + "learning_rate": 0.00014501713980794308, + "loss": 1.3937, + "step": 7422 + }, + { + "epoch": 0.3517176024638711, + "grad_norm": 0.1162109375, + "learning_rate": 0.00014500384108384268, + "loss": 0.0065, + "step": 7423 + }, + { + "epoch": 0.3517649846008055, + "grad_norm": 0.640625, + "learning_rate": 0.00014499054136160496, + "loss": 1.1131, + "step": 7424 + }, + { + "epoch": 0.35181236673773986, + "grad_norm": 0.03857421875, + "learning_rate": 0.00014497724064152498, + "loss": 0.0025, + "step": 7425 + }, + { + "epoch": 0.35185974887467425, + "grad_norm": 0.9765625, + "learning_rate": 0.0001449639389238977, + "loss": 1.2031, + "step": 7426 + }, + { + "epoch": 0.35190713101160864, + "grad_norm": 0.53125, + "learning_rate": 0.0001449506362090182, + "loss": 0.9041, + "step": 7427 + }, + { + "epoch": 0.351954513148543, + "grad_norm": 0.6875, + "learning_rate": 0.0001449373324971814, + "loss": 1.0974, + "step": 7428 + }, + { + "epoch": 0.3520018952854774, + "grad_norm": 0.0673828125, + "learning_rate": 0.00014492402778868246, + "loss": 0.0086, + "step": 7429 + }, + { + "epoch": 0.35204927742241177, + "grad_norm": 0.388671875, + "learning_rate": 0.00014491072208381643, + "loss": 0.0784, + "step": 7430 + }, + { + "epoch": 0.3520966595593461, + "grad_norm": 0.58203125, + "learning_rate": 0.0001448974153828784, + "loss": 1.2892, + "step": 7431 + }, + { + "epoch": 0.3521440416962805, + "grad_norm": 0.40625, + "learning_rate": 0.00014488410768616355, + "loss": 0.4134, + "step": 7432 + }, + { + "epoch": 0.3521914238332149, + "grad_norm": 0.54296875, + "learning_rate": 0.00014487079899396698, + "loss": 0.5826, + "step": 7433 + }, + { + "epoch": 0.3522388059701492, + "grad_norm": 0.6484375, + "learning_rate": 0.0001448574893065839, + "loss": 0.6569, + "step": 7434 + }, + { + "epoch": 0.3522861881070836, + "grad_norm": 0.59375, + "learning_rate": 0.00014484417862430947, + "loss": 1.0911, + "step": 7435 + }, + { + "epoch": 0.352333570244018, + "grad_norm": 0.65625, + "learning_rate": 0.00014483086694743898, + "loss": 0.7696, + "step": 7436 + }, + { + "epoch": 0.3523809523809524, + "grad_norm": 0.9609375, + "learning_rate": 0.00014481755427626754, + "loss": 1.152, + "step": 7437 + }, + { + "epoch": 0.35242833451788674, + "grad_norm": 0.765625, + "learning_rate": 0.00014480424061109052, + "loss": 1.2271, + "step": 7438 + }, + { + "epoch": 0.35247571665482114, + "grad_norm": 1.7265625, + "learning_rate": 0.00014479092595220315, + "loss": 0.4939, + "step": 7439 + }, + { + "epoch": 0.35252309879175553, + "grad_norm": 0.75390625, + "learning_rate": 0.00014477761029990074, + "loss": 0.8439, + "step": 7440 + }, + { + "epoch": 0.35257048092868987, + "grad_norm": 0.81640625, + "learning_rate": 0.00014476429365447866, + "loss": 1.2563, + "step": 7441 + }, + { + "epoch": 0.35261786306562426, + "grad_norm": 0.59765625, + "learning_rate": 0.0001447509760162322, + "loss": 1.0431, + "step": 7442 + }, + { + "epoch": 0.35266524520255865, + "grad_norm": 0.63671875, + "learning_rate": 0.00014473765738545676, + "loss": 1.1327, + "step": 7443 + }, + { + "epoch": 0.352712627339493, + "grad_norm": 0.59765625, + "learning_rate": 0.00014472433776244778, + "loss": 1.1597, + "step": 7444 + }, + { + "epoch": 0.3527600094764274, + "grad_norm": 0.78125, + "learning_rate": 0.00014471101714750057, + "loss": 1.1022, + "step": 7445 + }, + { + "epoch": 0.3528073916133618, + "grad_norm": 1.03125, + "learning_rate": 0.00014469769554091067, + "loss": 1.1637, + "step": 7446 + }, + { + "epoch": 0.3528547737502961, + "grad_norm": 0.5859375, + "learning_rate": 0.00014468437294297345, + "loss": 0.52, + "step": 7447 + }, + { + "epoch": 0.3529021558872305, + "grad_norm": 0.59765625, + "learning_rate": 0.00014467104935398447, + "loss": 0.8924, + "step": 7448 + }, + { + "epoch": 0.3529495380241649, + "grad_norm": 0.123046875, + "learning_rate": 0.00014465772477423918, + "loss": 0.0154, + "step": 7449 + }, + { + "epoch": 0.3529969201610993, + "grad_norm": 0.6015625, + "learning_rate": 0.00014464439920403312, + "loss": 0.5742, + "step": 7450 + }, + { + "epoch": 0.35304430229803363, + "grad_norm": 0.671875, + "learning_rate": 0.00014463107264366183, + "loss": 0.9505, + "step": 7451 + }, + { + "epoch": 0.353091684434968, + "grad_norm": 0.84765625, + "learning_rate": 0.0001446177450934209, + "loss": 0.8262, + "step": 7452 + }, + { + "epoch": 0.3531390665719024, + "grad_norm": 0.86328125, + "learning_rate": 0.00014460441655360587, + "loss": 1.1557, + "step": 7453 + }, + { + "epoch": 0.35318644870883675, + "grad_norm": 0.11865234375, + "learning_rate": 0.00014459108702451245, + "loss": 0.0148, + "step": 7454 + }, + { + "epoch": 0.35323383084577115, + "grad_norm": 0.62109375, + "learning_rate": 0.0001445777565064362, + "loss": 1.3203, + "step": 7455 + }, + { + "epoch": 0.35328121298270554, + "grad_norm": 0.60546875, + "learning_rate": 0.00014456442499967276, + "loss": 1.0322, + "step": 7456 + }, + { + "epoch": 0.3533285951196399, + "grad_norm": 0.53515625, + "learning_rate": 0.00014455109250451789, + "loss": 0.0646, + "step": 7457 + }, + { + "epoch": 0.35337597725657427, + "grad_norm": 0.6328125, + "learning_rate": 0.00014453775902126723, + "loss": 0.8735, + "step": 7458 + }, + { + "epoch": 0.35342335939350866, + "grad_norm": 0.5859375, + "learning_rate": 0.00014452442455021648, + "loss": 0.0772, + "step": 7459 + }, + { + "epoch": 0.353470741530443, + "grad_norm": 0.66015625, + "learning_rate": 0.00014451108909166146, + "loss": 1.2499, + "step": 7460 + }, + { + "epoch": 0.3535181236673774, + "grad_norm": 0.54296875, + "learning_rate": 0.00014449775264589789, + "loss": 0.7132, + "step": 7461 + }, + { + "epoch": 0.3535655058043118, + "grad_norm": 0.62890625, + "learning_rate": 0.00014448441521322153, + "loss": 1.0795, + "step": 7462 + }, + { + "epoch": 0.3536128879412461, + "grad_norm": 0.4375, + "learning_rate": 0.00014447107679392825, + "loss": 0.1977, + "step": 7463 + }, + { + "epoch": 0.3536602700781805, + "grad_norm": 0.6171875, + "learning_rate": 0.00014445773738831384, + "loss": 0.74, + "step": 7464 + }, + { + "epoch": 0.3537076522151149, + "grad_norm": 0.71875, + "learning_rate": 0.00014444439699667417, + "loss": 1.2651, + "step": 7465 + }, + { + "epoch": 0.3537550343520493, + "grad_norm": 0.73046875, + "learning_rate": 0.00014443105561930513, + "loss": 0.7995, + "step": 7466 + }, + { + "epoch": 0.35380241648898364, + "grad_norm": 0.33984375, + "learning_rate": 0.00014441771325650256, + "loss": 0.1529, + "step": 7467 + }, + { + "epoch": 0.35384979862591803, + "grad_norm": 0.80859375, + "learning_rate": 0.0001444043699085625, + "loss": 0.9945, + "step": 7468 + }, + { + "epoch": 0.3538971807628524, + "grad_norm": 0.11181640625, + "learning_rate": 0.00014439102557578076, + "loss": 0.0181, + "step": 7469 + }, + { + "epoch": 0.35394456289978676, + "grad_norm": 0.333984375, + "learning_rate": 0.00014437768025845338, + "loss": 0.0129, + "step": 7470 + }, + { + "epoch": 0.35399194503672116, + "grad_norm": 0.30859375, + "learning_rate": 0.00014436433395687627, + "loss": 0.0466, + "step": 7471 + }, + { + "epoch": 0.35403932717365555, + "grad_norm": 0.69140625, + "learning_rate": 0.00014435098667134555, + "loss": 1.275, + "step": 7472 + }, + { + "epoch": 0.3540867093105899, + "grad_norm": 0.294921875, + "learning_rate": 0.00014433763840215714, + "loss": 0.122, + "step": 7473 + }, + { + "epoch": 0.3541340914475243, + "grad_norm": 0.79296875, + "learning_rate": 0.00014432428914960715, + "loss": 1.2653, + "step": 7474 + }, + { + "epoch": 0.3541814735844587, + "grad_norm": 0.59375, + "learning_rate": 0.00014431093891399165, + "loss": 1.1488, + "step": 7475 + }, + { + "epoch": 0.354228855721393, + "grad_norm": 0.52734375, + "learning_rate": 0.00014429758769560672, + "loss": 1.43, + "step": 7476 + }, + { + "epoch": 0.3542762378583274, + "grad_norm": 0.60546875, + "learning_rate": 0.0001442842354947485, + "loss": 0.586, + "step": 7477 + }, + { + "epoch": 0.3543236199952618, + "grad_norm": 0.6328125, + "learning_rate": 0.00014427088231171306, + "loss": 1.0131, + "step": 7478 + }, + { + "epoch": 0.3543710021321962, + "grad_norm": 0.6171875, + "learning_rate": 0.00014425752814679663, + "loss": 0.8696, + "step": 7479 + }, + { + "epoch": 0.3544183842691305, + "grad_norm": 0.734375, + "learning_rate": 0.00014424417300029537, + "loss": 1.1594, + "step": 7480 + }, + { + "epoch": 0.3544657664060649, + "grad_norm": 0.83984375, + "learning_rate": 0.00014423081687250545, + "loss": 1.1847, + "step": 7481 + }, + { + "epoch": 0.3545131485429993, + "grad_norm": 0.515625, + "learning_rate": 0.00014421745976372316, + "loss": 0.8928, + "step": 7482 + }, + { + "epoch": 0.35456053067993365, + "grad_norm": 0.67578125, + "learning_rate": 0.00014420410167424472, + "loss": 0.9059, + "step": 7483 + }, + { + "epoch": 0.35460791281686804, + "grad_norm": 0.2109375, + "learning_rate": 0.00014419074260436636, + "loss": 0.1536, + "step": 7484 + }, + { + "epoch": 0.35465529495380244, + "grad_norm": 0.22265625, + "learning_rate": 0.0001441773825543844, + "loss": 0.0605, + "step": 7485 + }, + { + "epoch": 0.3547026770907368, + "grad_norm": 0.2578125, + "learning_rate": 0.00014416402152459516, + "loss": 0.0113, + "step": 7486 + }, + { + "epoch": 0.35475005922767117, + "grad_norm": 0.6875, + "learning_rate": 0.00014415065951529495, + "loss": 1.0058, + "step": 7487 + }, + { + "epoch": 0.35479744136460556, + "grad_norm": 0.6953125, + "learning_rate": 0.00014413729652678017, + "loss": 0.7031, + "step": 7488 + }, + { + "epoch": 0.3548448235015399, + "grad_norm": 0.17578125, + "learning_rate": 0.00014412393255934714, + "loss": 0.1247, + "step": 7489 + }, + { + "epoch": 0.3548922056384743, + "grad_norm": 0.181640625, + "learning_rate": 0.00014411056761329228, + "loss": 0.1318, + "step": 7490 + }, + { + "epoch": 0.3549395877754087, + "grad_norm": 0.57421875, + "learning_rate": 0.00014409720168891203, + "loss": 1.499, + "step": 7491 + }, + { + "epoch": 0.354986969912343, + "grad_norm": 0.83203125, + "learning_rate": 0.00014408383478650282, + "loss": 0.6738, + "step": 7492 + }, + { + "epoch": 0.3550343520492774, + "grad_norm": 0.5546875, + "learning_rate": 0.0001440704669063611, + "loss": 0.9576, + "step": 7493 + }, + { + "epoch": 0.3550817341862118, + "grad_norm": 0.71484375, + "learning_rate": 0.00014405709804878335, + "loss": 1.0804, + "step": 7494 + }, + { + "epoch": 0.3551291163231462, + "grad_norm": 0.05908203125, + "learning_rate": 0.00014404372821406612, + "loss": 0.0015, + "step": 7495 + }, + { + "epoch": 0.35517649846008054, + "grad_norm": 0.69140625, + "learning_rate": 0.00014403035740250593, + "loss": 0.7805, + "step": 7496 + }, + { + "epoch": 0.35522388059701493, + "grad_norm": 0.54296875, + "learning_rate": 0.00014401698561439927, + "loss": 0.833, + "step": 7497 + }, + { + "epoch": 0.3552712627339493, + "grad_norm": 0.71875, + "learning_rate": 0.00014400361285004276, + "loss": 1.1491, + "step": 7498 + }, + { + "epoch": 0.35531864487088366, + "grad_norm": 0.1591796875, + "learning_rate": 0.000143990239109733, + "loss": 0.1052, + "step": 7499 + }, + { + "epoch": 0.35536602700781805, + "grad_norm": 0.72265625, + "learning_rate": 0.00014397686439376658, + "loss": 1.2958, + "step": 7500 + }, + { + "epoch": 0.35541340914475245, + "grad_norm": 0.130859375, + "learning_rate": 0.00014396348870244016, + "loss": 0.0168, + "step": 7501 + }, + { + "epoch": 0.3554607912816868, + "grad_norm": 0.357421875, + "learning_rate": 0.0001439501120360504, + "loss": 0.2103, + "step": 7502 + }, + { + "epoch": 0.3555081734186212, + "grad_norm": 0.7265625, + "learning_rate": 0.00014393673439489394, + "loss": 0.854, + "step": 7503 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.65625, + "learning_rate": 0.00014392335577926755, + "loss": 1.0615, + "step": 7504 + }, + { + "epoch": 0.3556029376924899, + "grad_norm": 0.265625, + "learning_rate": 0.00014390997618946786, + "loss": 0.0217, + "step": 7505 + }, + { + "epoch": 0.3556503198294243, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001438965956257917, + "loss": 0.0417, + "step": 7506 + }, + { + "epoch": 0.3556977019663587, + "grad_norm": 0.60546875, + "learning_rate": 0.00014388321408853586, + "loss": 0.8922, + "step": 7507 + }, + { + "epoch": 0.3557450841032931, + "grad_norm": 0.59765625, + "learning_rate": 0.00014386983157799697, + "loss": 0.872, + "step": 7508 + }, + { + "epoch": 0.3557924662402274, + "grad_norm": 0.51953125, + "learning_rate": 0.000143856448094472, + "loss": 1.1858, + "step": 7509 + }, + { + "epoch": 0.3558398483771618, + "grad_norm": 0.5234375, + "learning_rate": 0.00014384306363825772, + "loss": 0.6, + "step": 7510 + }, + { + "epoch": 0.3558872305140962, + "grad_norm": 0.6796875, + "learning_rate": 0.00014382967820965098, + "loss": 0.9507, + "step": 7511 + }, + { + "epoch": 0.35593461265103055, + "grad_norm": 0.4453125, + "learning_rate": 0.00014381629180894867, + "loss": 0.6646, + "step": 7512 + }, + { + "epoch": 0.35598199478796494, + "grad_norm": 0.703125, + "learning_rate": 0.00014380290443644767, + "loss": 1.2201, + "step": 7513 + }, + { + "epoch": 0.35602937692489933, + "grad_norm": 0.765625, + "learning_rate": 0.00014378951609244492, + "loss": 0.9526, + "step": 7514 + }, + { + "epoch": 0.35607675906183367, + "grad_norm": 0.19921875, + "learning_rate": 0.00014377612677723736, + "loss": 0.1276, + "step": 7515 + }, + { + "epoch": 0.35612414119876806, + "grad_norm": 0.80078125, + "learning_rate": 0.0001437627364911219, + "loss": 0.7398, + "step": 7516 + }, + { + "epoch": 0.35617152333570246, + "grad_norm": 0.65234375, + "learning_rate": 0.00014374934523439555, + "loss": 0.8415, + "step": 7517 + }, + { + "epoch": 0.3562189054726368, + "grad_norm": 0.486328125, + "learning_rate": 0.00014373595300735538, + "loss": 0.7519, + "step": 7518 + }, + { + "epoch": 0.3562662876095712, + "grad_norm": 0.26953125, + "learning_rate": 0.0001437225598102983, + "loss": 0.139, + "step": 7519 + }, + { + "epoch": 0.3563136697465056, + "grad_norm": 0.73828125, + "learning_rate": 0.00014370916564352144, + "loss": 1.1972, + "step": 7520 + }, + { + "epoch": 0.3563610518834399, + "grad_norm": 0.82421875, + "learning_rate": 0.00014369577050732184, + "loss": 1.1084, + "step": 7521 + }, + { + "epoch": 0.3564084340203743, + "grad_norm": 0.5546875, + "learning_rate": 0.0001436823744019966, + "loss": 0.8126, + "step": 7522 + }, + { + "epoch": 0.3564558161573087, + "grad_norm": 0.69921875, + "learning_rate": 0.00014366897732784285, + "loss": 1.1367, + "step": 7523 + }, + { + "epoch": 0.3565031982942431, + "grad_norm": 0.2255859375, + "learning_rate": 0.00014365557928515762, + "loss": 0.1515, + "step": 7524 + }, + { + "epoch": 0.35655058043117743, + "grad_norm": 0.71875, + "learning_rate": 0.0001436421802742382, + "loss": 0.6548, + "step": 7525 + }, + { + "epoch": 0.3565979625681118, + "grad_norm": 0.578125, + "learning_rate": 0.0001436287802953817, + "loss": 0.6433, + "step": 7526 + }, + { + "epoch": 0.3566453447050462, + "grad_norm": 0.73046875, + "learning_rate": 0.00014361537934888533, + "loss": 0.1723, + "step": 7527 + }, + { + "epoch": 0.35669272684198056, + "grad_norm": 0.62890625, + "learning_rate": 0.00014360197743504627, + "loss": 0.9152, + "step": 7528 + }, + { + "epoch": 0.35674010897891495, + "grad_norm": 0.59765625, + "learning_rate": 0.00014358857455416178, + "loss": 1.0276, + "step": 7529 + }, + { + "epoch": 0.35678749111584934, + "grad_norm": 0.56640625, + "learning_rate": 0.00014357517070652921, + "loss": 0.7897, + "step": 7530 + }, + { + "epoch": 0.3568348732527837, + "grad_norm": 0.61328125, + "learning_rate": 0.0001435617658924457, + "loss": 0.995, + "step": 7531 + }, + { + "epoch": 0.3568822553897181, + "grad_norm": 0.5703125, + "learning_rate": 0.00014354836011220868, + "loss": 1.0152, + "step": 7532 + }, + { + "epoch": 0.35692963752665247, + "grad_norm": 0.6484375, + "learning_rate": 0.00014353495336611537, + "loss": 0.6246, + "step": 7533 + }, + { + "epoch": 0.3569770196635868, + "grad_norm": 0.052001953125, + "learning_rate": 0.00014352154565446318, + "loss": 0.0018, + "step": 7534 + }, + { + "epoch": 0.3570244018005212, + "grad_norm": 0.2021484375, + "learning_rate": 0.00014350813697754948, + "loss": 0.0143, + "step": 7535 + }, + { + "epoch": 0.3570717839374556, + "grad_norm": 0.040283203125, + "learning_rate": 0.00014349472733567162, + "loss": 0.0042, + "step": 7536 + }, + { + "epoch": 0.35711916607439, + "grad_norm": 0.6015625, + "learning_rate": 0.00014348131672912705, + "loss": 0.9628, + "step": 7537 + }, + { + "epoch": 0.3571665482113243, + "grad_norm": 0.60546875, + "learning_rate": 0.0001434679051582132, + "loss": 0.9001, + "step": 7538 + }, + { + "epoch": 0.3572139303482587, + "grad_norm": 0.83984375, + "learning_rate": 0.0001434544926232275, + "loss": 0.2132, + "step": 7539 + }, + { + "epoch": 0.3572613124851931, + "grad_norm": 0.49609375, + "learning_rate": 0.00014344107912446743, + "loss": 0.7011, + "step": 7540 + }, + { + "epoch": 0.35730869462212744, + "grad_norm": 0.78125, + "learning_rate": 0.0001434276646622305, + "loss": 1.2016, + "step": 7541 + }, + { + "epoch": 0.35735607675906184, + "grad_norm": 1.375, + "learning_rate": 0.00014341424923681423, + "loss": 0.2444, + "step": 7542 + }, + { + "epoch": 0.35740345889599623, + "grad_norm": 0.765625, + "learning_rate": 0.0001434008328485162, + "loss": 1.0004, + "step": 7543 + }, + { + "epoch": 0.35745084103293057, + "grad_norm": 0.73046875, + "learning_rate": 0.00014338741549763383, + "loss": 0.9323, + "step": 7544 + }, + { + "epoch": 0.35749822316986496, + "grad_norm": 0.79296875, + "learning_rate": 0.00014337399718446488, + "loss": 0.9443, + "step": 7545 + }, + { + "epoch": 0.35754560530679935, + "grad_norm": 0.6640625, + "learning_rate": 0.00014336057790930684, + "loss": 1.1554, + "step": 7546 + }, + { + "epoch": 0.3575929874437337, + "grad_norm": 0.19140625, + "learning_rate": 0.00014334715767245736, + "loss": 0.1453, + "step": 7547 + }, + { + "epoch": 0.3576403695806681, + "grad_norm": 0.65625, + "learning_rate": 0.0001433337364742141, + "loss": 1.6827, + "step": 7548 + }, + { + "epoch": 0.3576877517176025, + "grad_norm": 0.53125, + "learning_rate": 0.00014332031431487475, + "loss": 0.4558, + "step": 7549 + }, + { + "epoch": 0.3577351338545368, + "grad_norm": 0.349609375, + "learning_rate": 0.00014330689119473693, + "loss": 0.1129, + "step": 7550 + }, + { + "epoch": 0.3577825159914712, + "grad_norm": 0.9609375, + "learning_rate": 0.00014329346711409842, + "loss": 1.1667, + "step": 7551 + }, + { + "epoch": 0.3578298981284056, + "grad_norm": 0.08349609375, + "learning_rate": 0.0001432800420732569, + "loss": 0.0043, + "step": 7552 + }, + { + "epoch": 0.35787728026534, + "grad_norm": 0.64453125, + "learning_rate": 0.00014326661607251014, + "loss": 1.2576, + "step": 7553 + }, + { + "epoch": 0.35792466240227433, + "grad_norm": 0.62890625, + "learning_rate": 0.00014325318911215598, + "loss": 0.7217, + "step": 7554 + }, + { + "epoch": 0.3579720445392087, + "grad_norm": 0.4609375, + "learning_rate": 0.0001432397611924921, + "loss": 0.5296, + "step": 7555 + }, + { + "epoch": 0.3580194266761431, + "grad_norm": 0.004425048828125, + "learning_rate": 0.0001432263323138164, + "loss": 0.0003, + "step": 7556 + }, + { + "epoch": 0.35806680881307745, + "grad_norm": 0.212890625, + "learning_rate": 0.0001432129024764267, + "loss": 0.1384, + "step": 7557 + }, + { + "epoch": 0.35811419095001185, + "grad_norm": 0.68359375, + "learning_rate": 0.00014319947168062083, + "loss": 0.9144, + "step": 7558 + }, + { + "epoch": 0.35816157308694624, + "grad_norm": 0.25390625, + "learning_rate": 0.00014318603992669667, + "loss": 0.0499, + "step": 7559 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.5859375, + "learning_rate": 0.00014317260721495218, + "loss": 0.0449, + "step": 7560 + }, + { + "epoch": 0.35825633736081497, + "grad_norm": 0.388671875, + "learning_rate": 0.00014315917354568524, + "loss": 0.0961, + "step": 7561 + }, + { + "epoch": 0.35830371949774936, + "grad_norm": 0.703125, + "learning_rate": 0.0001431457389191938, + "loss": 0.6478, + "step": 7562 + }, + { + "epoch": 0.3583511016346837, + "grad_norm": 0.625, + "learning_rate": 0.00014313230333577582, + "loss": 1.1945, + "step": 7563 + }, + { + "epoch": 0.3583984837716181, + "grad_norm": 0.91015625, + "learning_rate": 0.0001431188667957293, + "loss": 1.1632, + "step": 7564 + }, + { + "epoch": 0.3584458659085525, + "grad_norm": 0.80078125, + "learning_rate": 0.00014310542929935226, + "loss": 1.02, + "step": 7565 + }, + { + "epoch": 0.3584932480454869, + "grad_norm": 0.6875, + "learning_rate": 0.0001430919908469427, + "loss": 1.3646, + "step": 7566 + }, + { + "epoch": 0.3585406301824212, + "grad_norm": 0.73046875, + "learning_rate": 0.00014307855143879866, + "loss": 0.8545, + "step": 7567 + }, + { + "epoch": 0.3585880123193556, + "grad_norm": 0.7734375, + "learning_rate": 0.00014306511107521828, + "loss": 0.9253, + "step": 7568 + }, + { + "epoch": 0.35863539445629, + "grad_norm": 0.8515625, + "learning_rate": 0.00014305166975649955, + "loss": 0.972, + "step": 7569 + }, + { + "epoch": 0.35868277659322434, + "grad_norm": 0.7265625, + "learning_rate": 0.0001430382274829407, + "loss": 0.142, + "step": 7570 + }, + { + "epoch": 0.35873015873015873, + "grad_norm": 0.65234375, + "learning_rate": 0.00014302478425483976, + "loss": 0.3351, + "step": 7571 + }, + { + "epoch": 0.3587775408670931, + "grad_norm": 0.67578125, + "learning_rate": 0.00014301134007249498, + "loss": 1.3903, + "step": 7572 + }, + { + "epoch": 0.35882492300402746, + "grad_norm": 0.150390625, + "learning_rate": 0.00014299789493620445, + "loss": 0.014, + "step": 7573 + }, + { + "epoch": 0.35887230514096186, + "grad_norm": 0.640625, + "learning_rate": 0.00014298444884626641, + "loss": 0.7912, + "step": 7574 + }, + { + "epoch": 0.35891968727789625, + "grad_norm": 0.64453125, + "learning_rate": 0.00014297100180297905, + "loss": 0.9977, + "step": 7575 + }, + { + "epoch": 0.3589670694148306, + "grad_norm": 1.03125, + "learning_rate": 0.00014295755380664074, + "loss": 1.0591, + "step": 7576 + }, + { + "epoch": 0.359014451551765, + "grad_norm": 0.9296875, + "learning_rate": 0.00014294410485754956, + "loss": 1.1827, + "step": 7577 + }, + { + "epoch": 0.35906183368869937, + "grad_norm": 0.54296875, + "learning_rate": 0.00014293065495600392, + "loss": 0.8982, + "step": 7578 + }, + { + "epoch": 0.3591092158256337, + "grad_norm": 0.7890625, + "learning_rate": 0.00014291720410230204, + "loss": 1.2666, + "step": 7579 + }, + { + "epoch": 0.3591565979625681, + "grad_norm": 0.796875, + "learning_rate": 0.00014290375229674234, + "loss": 0.9742, + "step": 7580 + }, + { + "epoch": 0.3592039800995025, + "grad_norm": 0.7109375, + "learning_rate": 0.00014289029953962307, + "loss": 1.266, + "step": 7581 + }, + { + "epoch": 0.3592513622364369, + "grad_norm": 0.009765625, + "learning_rate": 0.00014287684583124264, + "loss": 0.0004, + "step": 7582 + }, + { + "epoch": 0.3592987443733712, + "grad_norm": 0.4296875, + "learning_rate": 0.00014286339117189947, + "loss": 0.0734, + "step": 7583 + }, + { + "epoch": 0.3593461265103056, + "grad_norm": 0.173828125, + "learning_rate": 0.00014284993556189191, + "loss": 0.1343, + "step": 7584 + }, + { + "epoch": 0.35939350864724, + "grad_norm": 1.0, + "learning_rate": 0.00014283647900151843, + "loss": 0.0427, + "step": 7585 + }, + { + "epoch": 0.35944089078417435, + "grad_norm": 0.03173828125, + "learning_rate": 0.00014282302149107748, + "loss": 0.0012, + "step": 7586 + }, + { + "epoch": 0.35948827292110874, + "grad_norm": 0.53515625, + "learning_rate": 0.00014280956303086751, + "loss": 0.5595, + "step": 7587 + }, + { + "epoch": 0.35953565505804314, + "grad_norm": 0.546875, + "learning_rate": 0.00014279610362118706, + "loss": 0.6958, + "step": 7588 + }, + { + "epoch": 0.3595830371949775, + "grad_norm": 0.86328125, + "learning_rate": 0.00014278264326233458, + "loss": 0.9134, + "step": 7589 + }, + { + "epoch": 0.35963041933191187, + "grad_norm": 0.515625, + "learning_rate": 0.00014276918195460866, + "loss": 0.7247, + "step": 7590 + }, + { + "epoch": 0.35967780146884626, + "grad_norm": 0.609375, + "learning_rate": 0.00014275571969830786, + "loss": 0.7464, + "step": 7591 + }, + { + "epoch": 0.3597251836057806, + "grad_norm": 0.5859375, + "learning_rate": 0.00014274225649373072, + "loss": 0.4297, + "step": 7592 + }, + { + "epoch": 0.359772565742715, + "grad_norm": 0.1845703125, + "learning_rate": 0.00014272879234117586, + "loss": 0.0135, + "step": 7593 + }, + { + "epoch": 0.3598199478796494, + "grad_norm": 0.69140625, + "learning_rate": 0.0001427153272409419, + "loss": 0.9053, + "step": 7594 + }, + { + "epoch": 0.3598673300165838, + "grad_norm": 0.5703125, + "learning_rate": 0.0001427018611933275, + "loss": 0.8336, + "step": 7595 + }, + { + "epoch": 0.3599147121535181, + "grad_norm": 0.353515625, + "learning_rate": 0.00014268839419863126, + "loss": 0.0803, + "step": 7596 + }, + { + "epoch": 0.3599620942904525, + "grad_norm": 0.10888671875, + "learning_rate": 0.00014267492625715192, + "loss": 0.0176, + "step": 7597 + }, + { + "epoch": 0.3600094764273869, + "grad_norm": 0.5390625, + "learning_rate": 0.00014266145736918816, + "loss": 0.7939, + "step": 7598 + }, + { + "epoch": 0.36005685856432124, + "grad_norm": 0.75, + "learning_rate": 0.00014264798753503875, + "loss": 1.0958, + "step": 7599 + }, + { + "epoch": 0.36010424070125563, + "grad_norm": 1.859375, + "learning_rate": 0.00014263451675500236, + "loss": 0.0532, + "step": 7600 + }, + { + "epoch": 0.36015162283819, + "grad_norm": 0.640625, + "learning_rate": 0.00014262104502937785, + "loss": 0.9797, + "step": 7601 + }, + { + "epoch": 0.36019900497512436, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014260757235846393, + "loss": 0.1309, + "step": 7602 + }, + { + "epoch": 0.36024638711205875, + "grad_norm": 0.7734375, + "learning_rate": 0.00014259409874255947, + "loss": 1.236, + "step": 7603 + }, + { + "epoch": 0.36029376924899315, + "grad_norm": 0.63671875, + "learning_rate": 0.00014258062418196323, + "loss": 0.8716, + "step": 7604 + }, + { + "epoch": 0.3603411513859275, + "grad_norm": 1.1484375, + "learning_rate": 0.00014256714867697413, + "loss": 1.2937, + "step": 7605 + }, + { + "epoch": 0.3603885335228619, + "grad_norm": 0.62890625, + "learning_rate": 0.000142553672227891, + "loss": 1.005, + "step": 7606 + }, + { + "epoch": 0.36043591565979627, + "grad_norm": 0.6328125, + "learning_rate": 0.00014254019483501273, + "loss": 0.7123, + "step": 7607 + }, + { + "epoch": 0.3604832977967306, + "grad_norm": 0.2470703125, + "learning_rate": 0.00014252671649863825, + "loss": 0.059, + "step": 7608 + }, + { + "epoch": 0.360530679933665, + "grad_norm": 1.2578125, + "learning_rate": 0.0001425132372190665, + "loss": 1.1571, + "step": 7609 + }, + { + "epoch": 0.3605780620705994, + "grad_norm": 0.244140625, + "learning_rate": 0.00014249975699659646, + "loss": 0.0491, + "step": 7610 + }, + { + "epoch": 0.3606254442075338, + "grad_norm": 0.625, + "learning_rate": 0.00014248627583152707, + "loss": 1.0308, + "step": 7611 + }, + { + "epoch": 0.3606728263444681, + "grad_norm": 0.9453125, + "learning_rate": 0.00014247279372415732, + "loss": 0.5094, + "step": 7612 + }, + { + "epoch": 0.3607202084814025, + "grad_norm": 0.376953125, + "learning_rate": 0.00014245931067478624, + "loss": 0.179, + "step": 7613 + }, + { + "epoch": 0.3607675906183369, + "grad_norm": 0.7109375, + "learning_rate": 0.00014244582668371288, + "loss": 1.4096, + "step": 7614 + }, + { + "epoch": 0.36081497275527125, + "grad_norm": 0.83203125, + "learning_rate": 0.0001424323417512363, + "loss": 1.5485, + "step": 7615 + }, + { + "epoch": 0.36086235489220564, + "grad_norm": 0.65625, + "learning_rate": 0.0001424188558776556, + "loss": 1.1223, + "step": 7616 + }, + { + "epoch": 0.36090973702914003, + "grad_norm": 0.5078125, + "learning_rate": 0.00014240536906326982, + "loss": 0.6456, + "step": 7617 + }, + { + "epoch": 0.36095711916607437, + "grad_norm": 0.443359375, + "learning_rate": 0.00014239188130837818, + "loss": 0.3114, + "step": 7618 + }, + { + "epoch": 0.36100450130300876, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001423783926132797, + "loss": 0.0378, + "step": 7619 + }, + { + "epoch": 0.36105188343994316, + "grad_norm": 0.76953125, + "learning_rate": 0.00014236490297827364, + "loss": 1.0091, + "step": 7620 + }, + { + "epoch": 0.3610992655768775, + "grad_norm": 0.267578125, + "learning_rate": 0.00014235141240365913, + "loss": 0.1251, + "step": 7621 + }, + { + "epoch": 0.3611466477138119, + "grad_norm": 0.7890625, + "learning_rate": 0.00014233792088973543, + "loss": 0.9898, + "step": 7622 + }, + { + "epoch": 0.3611940298507463, + "grad_norm": 0.388671875, + "learning_rate": 0.00014232442843680176, + "loss": 0.0914, + "step": 7623 + }, + { + "epoch": 0.36124141198768067, + "grad_norm": 0.216796875, + "learning_rate": 0.0001423109350451573, + "loss": 0.0264, + "step": 7624 + }, + { + "epoch": 0.361288794124615, + "grad_norm": 0.6015625, + "learning_rate": 0.00014229744071510143, + "loss": 0.3978, + "step": 7625 + }, + { + "epoch": 0.3613361762615494, + "grad_norm": 0.71875, + "learning_rate": 0.00014228394544693335, + "loss": 0.9037, + "step": 7626 + }, + { + "epoch": 0.3613835583984838, + "grad_norm": 0.67578125, + "learning_rate": 0.00014227044924095239, + "loss": 0.744, + "step": 7627 + }, + { + "epoch": 0.36143094053541813, + "grad_norm": 0.478515625, + "learning_rate": 0.0001422569520974579, + "loss": 0.1844, + "step": 7628 + }, + { + "epoch": 0.3614783226723525, + "grad_norm": 0.7578125, + "learning_rate": 0.0001422434540167492, + "loss": 0.5736, + "step": 7629 + }, + { + "epoch": 0.3615257048092869, + "grad_norm": 0.72265625, + "learning_rate": 0.00014222995499912572, + "loss": 1.3629, + "step": 7630 + }, + { + "epoch": 0.36157308694622126, + "grad_norm": 0.333984375, + "learning_rate": 0.00014221645504488678, + "loss": 0.1722, + "step": 7631 + }, + { + "epoch": 0.36162046908315565, + "grad_norm": 0.5703125, + "learning_rate": 0.00014220295415433184, + "loss": 0.7315, + "step": 7632 + }, + { + "epoch": 0.36166785122009004, + "grad_norm": 0.7109375, + "learning_rate": 0.00014218945232776035, + "loss": 0.507, + "step": 7633 + }, + { + "epoch": 0.3617152333570244, + "grad_norm": 0.51953125, + "learning_rate": 0.00014217594956547174, + "loss": 0.3093, + "step": 7634 + }, + { + "epoch": 0.3617626154939588, + "grad_norm": 0.74609375, + "learning_rate": 0.0001421624458677655, + "loss": 1.0157, + "step": 7635 + }, + { + "epoch": 0.36180999763089317, + "grad_norm": 0.8046875, + "learning_rate": 0.00014214894123494112, + "loss": 0.9068, + "step": 7636 + }, + { + "epoch": 0.3618573797678275, + "grad_norm": 0.7265625, + "learning_rate": 0.0001421354356672981, + "loss": 1.2128, + "step": 7637 + }, + { + "epoch": 0.3619047619047619, + "grad_norm": 0.060791015625, + "learning_rate": 0.000142121929165136, + "loss": 0.0029, + "step": 7638 + }, + { + "epoch": 0.3619521440416963, + "grad_norm": 0.578125, + "learning_rate": 0.00014210842172875438, + "loss": 1.0106, + "step": 7639 + }, + { + "epoch": 0.3619995261786307, + "grad_norm": 0.75, + "learning_rate": 0.00014209491335845283, + "loss": 1.3891, + "step": 7640 + }, + { + "epoch": 0.362046908315565, + "grad_norm": 0.6640625, + "learning_rate": 0.00014208140405453097, + "loss": 1.4021, + "step": 7641 + }, + { + "epoch": 0.3620942904524994, + "grad_norm": 0.51953125, + "learning_rate": 0.00014206789381728834, + "loss": 0.6981, + "step": 7642 + }, + { + "epoch": 0.3621416725894338, + "grad_norm": 0.41015625, + "learning_rate": 0.00014205438264702465, + "loss": 0.121, + "step": 7643 + }, + { + "epoch": 0.36218905472636814, + "grad_norm": 0.72265625, + "learning_rate": 0.0001420408705440395, + "loss": 1.0905, + "step": 7644 + }, + { + "epoch": 0.36223643686330254, + "grad_norm": 0.8671875, + "learning_rate": 0.0001420273575086327, + "loss": 0.9875, + "step": 7645 + }, + { + "epoch": 0.36228381900023693, + "grad_norm": 0.73828125, + "learning_rate": 0.00014201384354110385, + "loss": 1.352, + "step": 7646 + }, + { + "epoch": 0.36233120113717127, + "grad_norm": 0.6015625, + "learning_rate": 0.00014200032864175268, + "loss": 0.1415, + "step": 7647 + }, + { + "epoch": 0.36237858327410566, + "grad_norm": 0.64453125, + "learning_rate": 0.00014198681281087897, + "loss": 0.8686, + "step": 7648 + }, + { + "epoch": 0.36242596541104005, + "grad_norm": 0.59765625, + "learning_rate": 0.00014197329604878248, + "loss": 1.5066, + "step": 7649 + }, + { + "epoch": 0.3624733475479744, + "grad_norm": 0.359375, + "learning_rate": 0.000141959778355763, + "loss": 0.0331, + "step": 7650 + }, + { + "epoch": 0.3625207296849088, + "grad_norm": 0.66015625, + "learning_rate": 0.0001419462597321203, + "loss": 1.1088, + "step": 7651 + }, + { + "epoch": 0.3625681118218432, + "grad_norm": 0.64453125, + "learning_rate": 0.00014193274017815428, + "loss": 0.6578, + "step": 7652 + }, + { + "epoch": 0.36261549395877757, + "grad_norm": 0.59375, + "learning_rate": 0.0001419192196941647, + "loss": 0.7129, + "step": 7653 + }, + { + "epoch": 0.3626628760957119, + "grad_norm": 0.50390625, + "learning_rate": 0.0001419056982804515, + "loss": 0.5904, + "step": 7654 + }, + { + "epoch": 0.3627102582326463, + "grad_norm": 0.2314453125, + "learning_rate": 0.00014189217593731454, + "loss": 0.1664, + "step": 7655 + }, + { + "epoch": 0.3627576403695807, + "grad_norm": 0.1240234375, + "learning_rate": 0.00014187865266505377, + "loss": 0.0093, + "step": 7656 + }, + { + "epoch": 0.36280502250651503, + "grad_norm": 0.5078125, + "learning_rate": 0.00014186512846396906, + "loss": 0.5595, + "step": 7657 + }, + { + "epoch": 0.3628524046434494, + "grad_norm": 0.6640625, + "learning_rate": 0.0001418516033343604, + "loss": 1.0324, + "step": 7658 + }, + { + "epoch": 0.3628997867803838, + "grad_norm": 0.0016326904296875, + "learning_rate": 0.00014183807727652777, + "loss": 0.0001, + "step": 7659 + }, + { + "epoch": 0.36294716891731815, + "grad_norm": 0.66015625, + "learning_rate": 0.00014182455029077112, + "loss": 1.1369, + "step": 7660 + }, + { + "epoch": 0.36299455105425255, + "grad_norm": 0.58203125, + "learning_rate": 0.0001418110223773905, + "loss": 0.402, + "step": 7661 + }, + { + "epoch": 0.36304193319118694, + "grad_norm": 0.44921875, + "learning_rate": 0.00014179749353668598, + "loss": 0.7577, + "step": 7662 + }, + { + "epoch": 0.3630893153281213, + "grad_norm": 0.65234375, + "learning_rate": 0.00014178396376895755, + "loss": 0.7793, + "step": 7663 + }, + { + "epoch": 0.36313669746505567, + "grad_norm": 0.84375, + "learning_rate": 0.0001417704330745053, + "loss": 0.7439, + "step": 7664 + }, + { + "epoch": 0.36318407960199006, + "grad_norm": 0.53125, + "learning_rate": 0.00014175690145362934, + "loss": 1.0099, + "step": 7665 + }, + { + "epoch": 0.3632314617389244, + "grad_norm": 0.64453125, + "learning_rate": 0.00014174336890662977, + "loss": 1.1489, + "step": 7666 + }, + { + "epoch": 0.3632788438758588, + "grad_norm": 0.185546875, + "learning_rate": 0.00014172983543380677, + "loss": 0.1427, + "step": 7667 + }, + { + "epoch": 0.3633262260127932, + "grad_norm": 0.78125, + "learning_rate": 0.00014171630103546046, + "loss": 1.0486, + "step": 7668 + }, + { + "epoch": 0.3633736081497276, + "grad_norm": 0.189453125, + "learning_rate": 0.000141702765711891, + "loss": 0.1382, + "step": 7669 + }, + { + "epoch": 0.3634209902866619, + "grad_norm": 0.220703125, + "learning_rate": 0.00014168922946339863, + "loss": 0.0904, + "step": 7670 + }, + { + "epoch": 0.3634683724235963, + "grad_norm": 0.5, + "learning_rate": 0.0001416756922902836, + "loss": 0.0271, + "step": 7671 + }, + { + "epoch": 0.3635157545605307, + "grad_norm": 0.5078125, + "learning_rate": 0.00014166215419284605, + "loss": 0.584, + "step": 7672 + }, + { + "epoch": 0.36356313669746504, + "grad_norm": 0.57421875, + "learning_rate": 0.00014164861517138632, + "loss": 0.5146, + "step": 7673 + }, + { + "epoch": 0.36361051883439943, + "grad_norm": 0.25390625, + "learning_rate": 0.00014163507522620464, + "loss": 0.0787, + "step": 7674 + }, + { + "epoch": 0.3636579009713338, + "grad_norm": 0.1435546875, + "learning_rate": 0.00014162153435760139, + "loss": 0.0122, + "step": 7675 + }, + { + "epoch": 0.36370528310826816, + "grad_norm": 0.3203125, + "learning_rate": 0.00014160799256587682, + "loss": 0.0143, + "step": 7676 + }, + { + "epoch": 0.36375266524520256, + "grad_norm": 0.0810546875, + "learning_rate": 0.00014159444985133128, + "loss": 0.002, + "step": 7677 + }, + { + "epoch": 0.36380004738213695, + "grad_norm": 0.56640625, + "learning_rate": 0.00014158090621426515, + "loss": 0.3689, + "step": 7678 + }, + { + "epoch": 0.3638474295190713, + "grad_norm": 0.0027923583984375, + "learning_rate": 0.00014156736165497883, + "loss": 0.0002, + "step": 7679 + }, + { + "epoch": 0.3638948116560057, + "grad_norm": 0.60546875, + "learning_rate": 0.00014155381617377268, + "loss": 0.8971, + "step": 7680 + }, + { + "epoch": 0.36394219379294007, + "grad_norm": 0.057373046875, + "learning_rate": 0.00014154026977094715, + "loss": 0.0047, + "step": 7681 + }, + { + "epoch": 0.36398957592987446, + "grad_norm": 0.734375, + "learning_rate": 0.00014152672244680268, + "loss": 1.345, + "step": 7682 + }, + { + "epoch": 0.3640369580668088, + "grad_norm": 0.75390625, + "learning_rate": 0.00014151317420163975, + "loss": 1.2625, + "step": 7683 + }, + { + "epoch": 0.3640843402037432, + "grad_norm": 0.4765625, + "learning_rate": 0.00014149962503575884, + "loss": 0.01, + "step": 7684 + }, + { + "epoch": 0.3641317223406776, + "grad_norm": 0.96875, + "learning_rate": 0.00014148607494946045, + "loss": 0.8278, + "step": 7685 + }, + { + "epoch": 0.3641791044776119, + "grad_norm": 0.48046875, + "learning_rate": 0.00014147252394304508, + "loss": 0.2542, + "step": 7686 + }, + { + "epoch": 0.3642264866145463, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014145897201681335, + "loss": 0.1399, + "step": 7687 + }, + { + "epoch": 0.3642738687514807, + "grad_norm": 0.0135498046875, + "learning_rate": 0.00014144541917106575, + "loss": 0.0009, + "step": 7688 + }, + { + "epoch": 0.36432125088841505, + "grad_norm": 0.78125, + "learning_rate": 0.0001414318654061029, + "loss": 1.0653, + "step": 7689 + }, + { + "epoch": 0.36436863302534944, + "grad_norm": 0.73046875, + "learning_rate": 0.0001414183107222254, + "loss": 1.1532, + "step": 7690 + }, + { + "epoch": 0.36441601516228384, + "grad_norm": 0.259765625, + "learning_rate": 0.0001414047551197339, + "loss": 0.0235, + "step": 7691 + }, + { + "epoch": 0.3644633972992182, + "grad_norm": 0.75390625, + "learning_rate": 0.000141391198598929, + "loss": 1.3046, + "step": 7692 + }, + { + "epoch": 0.36451077943615257, + "grad_norm": 0.189453125, + "learning_rate": 0.00014137764116011145, + "loss": 0.0336, + "step": 7693 + }, + { + "epoch": 0.36455816157308696, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014136408280358186, + "loss": 0.0187, + "step": 7694 + }, + { + "epoch": 0.3646055437100213, + "grad_norm": 0.6953125, + "learning_rate": 0.00014135052352964102, + "loss": 0.8419, + "step": 7695 + }, + { + "epoch": 0.3646529258469557, + "grad_norm": 0.05078125, + "learning_rate": 0.00014133696333858958, + "loss": 0.0062, + "step": 7696 + }, + { + "epoch": 0.3647003079838901, + "grad_norm": 0.59765625, + "learning_rate": 0.0001413234022307283, + "loss": 1.0701, + "step": 7697 + }, + { + "epoch": 0.3647476901208245, + "grad_norm": 0.65625, + "learning_rate": 0.000141309840206358, + "loss": 0.8737, + "step": 7698 + }, + { + "epoch": 0.3647950722577588, + "grad_norm": 0.7265625, + "learning_rate": 0.00014129627726577947, + "loss": 0.1132, + "step": 7699 + }, + { + "epoch": 0.3648424543946932, + "grad_norm": 0.53515625, + "learning_rate": 0.00014128271340929343, + "loss": 0.6267, + "step": 7700 + }, + { + "epoch": 0.3648898365316276, + "grad_norm": 0.83984375, + "learning_rate": 0.00014126914863720082, + "loss": 0.9094, + "step": 7701 + }, + { + "epoch": 0.36493721866856194, + "grad_norm": 0.79296875, + "learning_rate": 0.00014125558294980246, + "loss": 1.1937, + "step": 7702 + }, + { + "epoch": 0.36498460080549633, + "grad_norm": 0.51953125, + "learning_rate": 0.0001412420163473992, + "loss": 0.98, + "step": 7703 + }, + { + "epoch": 0.3650319829424307, + "grad_norm": 0.77734375, + "learning_rate": 0.00014122844883029197, + "loss": 1.0339, + "step": 7704 + }, + { + "epoch": 0.36507936507936506, + "grad_norm": 0.70703125, + "learning_rate": 0.00014121488039878162, + "loss": 1.2399, + "step": 7705 + }, + { + "epoch": 0.36512674721629945, + "grad_norm": 0.66796875, + "learning_rate": 0.00014120131105316915, + "loss": 0.5477, + "step": 7706 + }, + { + "epoch": 0.36517412935323385, + "grad_norm": 0.90625, + "learning_rate": 0.00014118774079375544, + "loss": 0.1137, + "step": 7707 + }, + { + "epoch": 0.3652215114901682, + "grad_norm": 0.62890625, + "learning_rate": 0.00014117416962084156, + "loss": 1.4176, + "step": 7708 + }, + { + "epoch": 0.3652688936271026, + "grad_norm": 0.5859375, + "learning_rate": 0.00014116059753472844, + "loss": 0.9718, + "step": 7709 + }, + { + "epoch": 0.36531627576403697, + "grad_norm": 0.486328125, + "learning_rate": 0.00014114702453571712, + "loss": 0.6634, + "step": 7710 + }, + { + "epoch": 0.36536365790097136, + "grad_norm": 0.8046875, + "learning_rate": 0.00014113345062410863, + "loss": 0.8321, + "step": 7711 + }, + { + "epoch": 0.3654110400379057, + "grad_norm": 0.1318359375, + "learning_rate": 0.000141119875800204, + "loss": 0.0025, + "step": 7712 + }, + { + "epoch": 0.3654584221748401, + "grad_norm": 0.310546875, + "learning_rate": 0.0001411063000643043, + "loss": 0.0578, + "step": 7713 + }, + { + "epoch": 0.3655058043117745, + "grad_norm": 0.57421875, + "learning_rate": 0.0001410927234167107, + "loss": 0.6011, + "step": 7714 + }, + { + "epoch": 0.3655531864487088, + "grad_norm": 0.56640625, + "learning_rate": 0.00014107914585772424, + "loss": 0.8508, + "step": 7715 + }, + { + "epoch": 0.3656005685856432, + "grad_norm": 0.54296875, + "learning_rate": 0.00014106556738764607, + "loss": 0.0356, + "step": 7716 + }, + { + "epoch": 0.3656479507225776, + "grad_norm": 0.6796875, + "learning_rate": 0.00014105198800677736, + "loss": 1.1596, + "step": 7717 + }, + { + "epoch": 0.36569533285951195, + "grad_norm": 0.375, + "learning_rate": 0.0001410384077154193, + "loss": 0.111, + "step": 7718 + }, + { + "epoch": 0.36574271499644634, + "grad_norm": 0.6484375, + "learning_rate": 0.00014102482651387309, + "loss": 0.9378, + "step": 7719 + }, + { + "epoch": 0.36579009713338073, + "grad_norm": 0.734375, + "learning_rate": 0.00014101124440243988, + "loss": 1.1642, + "step": 7720 + }, + { + "epoch": 0.36583747927031507, + "grad_norm": 0.60546875, + "learning_rate": 0.00014099766138142098, + "loss": 0.5318, + "step": 7721 + }, + { + "epoch": 0.36588486140724946, + "grad_norm": 0.60546875, + "learning_rate": 0.00014098407745111757, + "loss": 0.8979, + "step": 7722 + }, + { + "epoch": 0.36593224354418386, + "grad_norm": 0.70703125, + "learning_rate": 0.00014097049261183102, + "loss": 0.9741, + "step": 7723 + }, + { + "epoch": 0.3659796256811182, + "grad_norm": 0.7109375, + "learning_rate": 0.00014095690686386257, + "loss": 0.9232, + "step": 7724 + }, + { + "epoch": 0.3660270078180526, + "grad_norm": 0.6484375, + "learning_rate": 0.00014094332020751356, + "loss": 0.8781, + "step": 7725 + }, + { + "epoch": 0.366074389954987, + "grad_norm": 0.5625, + "learning_rate": 0.0001409297326430853, + "loss": 0.6715, + "step": 7726 + }, + { + "epoch": 0.36612177209192137, + "grad_norm": 0.59765625, + "learning_rate": 0.00014091614417087922, + "loss": 1.0083, + "step": 7727 + }, + { + "epoch": 0.3661691542288557, + "grad_norm": 0.024658203125, + "learning_rate": 0.00014090255479119659, + "loss": 0.0023, + "step": 7728 + }, + { + "epoch": 0.3662165363657901, + "grad_norm": 0.09912109375, + "learning_rate": 0.00014088896450433887, + "loss": 0.0067, + "step": 7729 + }, + { + "epoch": 0.3662639185027245, + "grad_norm": 0.5703125, + "learning_rate": 0.0001408753733106075, + "loss": 1.0664, + "step": 7730 + }, + { + "epoch": 0.36631130063965883, + "grad_norm": 0.51953125, + "learning_rate": 0.00014086178121030385, + "loss": 0.7424, + "step": 7731 + }, + { + "epoch": 0.3663586827765932, + "grad_norm": 0.59375, + "learning_rate": 0.00014084818820372943, + "loss": 0.5998, + "step": 7732 + }, + { + "epoch": 0.3664060649135276, + "grad_norm": 0.6875, + "learning_rate": 0.00014083459429118573, + "loss": 1.0081, + "step": 7733 + }, + { + "epoch": 0.36645344705046196, + "grad_norm": 0.52734375, + "learning_rate": 0.0001408209994729742, + "loss": 0.2045, + "step": 7734 + }, + { + "epoch": 0.36650082918739635, + "grad_norm": 0.140625, + "learning_rate": 0.0001408074037493964, + "loss": 0.0289, + "step": 7735 + }, + { + "epoch": 0.36654821132433074, + "grad_norm": 0.8515625, + "learning_rate": 0.0001407938071207538, + "loss": 0.5849, + "step": 7736 + }, + { + "epoch": 0.3665955934612651, + "grad_norm": 0.5546875, + "learning_rate": 0.0001407802095873481, + "loss": 0.0738, + "step": 7737 + }, + { + "epoch": 0.36664297559819947, + "grad_norm": 0.80078125, + "learning_rate": 0.0001407666111494807, + "loss": 1.0406, + "step": 7738 + }, + { + "epoch": 0.36669035773513387, + "grad_norm": 0.77734375, + "learning_rate": 0.00014075301180745332, + "loss": 1.2881, + "step": 7739 + }, + { + "epoch": 0.36673773987206826, + "grad_norm": 0.04541015625, + "learning_rate": 0.00014073941156156756, + "loss": 0.0032, + "step": 7740 + }, + { + "epoch": 0.3667851220090026, + "grad_norm": 0.98046875, + "learning_rate": 0.000140725810412125, + "loss": 1.5008, + "step": 7741 + }, + { + "epoch": 0.366832504145937, + "grad_norm": 1.765625, + "learning_rate": 0.0001407122083594274, + "loss": 0.498, + "step": 7742 + }, + { + "epoch": 0.3668798862828714, + "grad_norm": 0.5234375, + "learning_rate": 0.00014069860540377635, + "loss": 0.7711, + "step": 7743 + }, + { + "epoch": 0.3669272684198057, + "grad_norm": 0.625, + "learning_rate": 0.00014068500154547361, + "loss": 1.1254, + "step": 7744 + }, + { + "epoch": 0.3669746505567401, + "grad_norm": 0.65234375, + "learning_rate": 0.00014067139678482086, + "loss": 0.6638, + "step": 7745 + }, + { + "epoch": 0.3670220326936745, + "grad_norm": 0.60546875, + "learning_rate": 0.00014065779112211983, + "loss": 0.7345, + "step": 7746 + }, + { + "epoch": 0.36706941483060884, + "grad_norm": 1.171875, + "learning_rate": 0.0001406441845576723, + "loss": 0.6039, + "step": 7747 + }, + { + "epoch": 0.36711679696754324, + "grad_norm": 0.91015625, + "learning_rate": 0.00014063057709178008, + "loss": 0.9442, + "step": 7748 + }, + { + "epoch": 0.36716417910447763, + "grad_norm": 0.56640625, + "learning_rate": 0.00014061696872474494, + "loss": 0.4191, + "step": 7749 + }, + { + "epoch": 0.36721156124141197, + "grad_norm": 0.466796875, + "learning_rate": 0.00014060335945686867, + "loss": 0.5002, + "step": 7750 + }, + { + "epoch": 0.36725894337834636, + "grad_norm": 0.6875, + "learning_rate": 0.00014058974928845313, + "loss": 0.6164, + "step": 7751 + }, + { + "epoch": 0.36730632551528075, + "grad_norm": 0.34375, + "learning_rate": 0.00014057613821980022, + "loss": 0.1459, + "step": 7752 + }, + { + "epoch": 0.3673537076522151, + "grad_norm": 0.52734375, + "learning_rate": 0.00014056252625121177, + "loss": 1.2057, + "step": 7753 + }, + { + "epoch": 0.3674010897891495, + "grad_norm": 0.58984375, + "learning_rate": 0.00014054891338298968, + "loss": 0.0651, + "step": 7754 + }, + { + "epoch": 0.3674484719260839, + "grad_norm": 0.60546875, + "learning_rate": 0.00014053529961543588, + "loss": 1.1087, + "step": 7755 + }, + { + "epoch": 0.36749585406301827, + "grad_norm": 0.3671875, + "learning_rate": 0.0001405216849488523, + "loss": 0.1513, + "step": 7756 + }, + { + "epoch": 0.3675432361999526, + "grad_norm": 0.70703125, + "learning_rate": 0.00014050806938354092, + "loss": 1.0219, + "step": 7757 + }, + { + "epoch": 0.367590618336887, + "grad_norm": 0.63671875, + "learning_rate": 0.00014049445291980375, + "loss": 0.9232, + "step": 7758 + }, + { + "epoch": 0.3676380004738214, + "grad_norm": 0.046630859375, + "learning_rate": 0.0001404808355579427, + "loss": 0.0035, + "step": 7759 + }, + { + "epoch": 0.36768538261075573, + "grad_norm": 0.455078125, + "learning_rate": 0.00014046721729825987, + "loss": 0.513, + "step": 7760 + }, + { + "epoch": 0.3677327647476901, + "grad_norm": 0.87109375, + "learning_rate": 0.00014045359814105724, + "loss": 0.2758, + "step": 7761 + }, + { + "epoch": 0.3677801468846245, + "grad_norm": 0.8046875, + "learning_rate": 0.00014043997808663687, + "loss": 1.0791, + "step": 7762 + }, + { + "epoch": 0.36782752902155885, + "grad_norm": 0.74609375, + "learning_rate": 0.00014042635713530093, + "loss": 0.9252, + "step": 7763 + }, + { + "epoch": 0.36787491115849325, + "grad_norm": 0.60546875, + "learning_rate": 0.00014041273528735137, + "loss": 1.0741, + "step": 7764 + }, + { + "epoch": 0.36792229329542764, + "grad_norm": 0.46484375, + "learning_rate": 0.00014039911254309044, + "loss": 0.3521, + "step": 7765 + }, + { + "epoch": 0.367969675432362, + "grad_norm": 0.68359375, + "learning_rate": 0.00014038548890282022, + "loss": 0.8648, + "step": 7766 + }, + { + "epoch": 0.36801705756929637, + "grad_norm": 0.7421875, + "learning_rate": 0.00014037186436684286, + "loss": 0.751, + "step": 7767 + }, + { + "epoch": 0.36806443970623076, + "grad_norm": 0.0703125, + "learning_rate": 0.00014035823893546054, + "loss": 0.0032, + "step": 7768 + }, + { + "epoch": 0.3681118218431651, + "grad_norm": 0.8984375, + "learning_rate": 0.00014034461260897547, + "loss": 0.5844, + "step": 7769 + }, + { + "epoch": 0.3681592039800995, + "grad_norm": 0.0205078125, + "learning_rate": 0.00014033098538768988, + "loss": 0.0019, + "step": 7770 + }, + { + "epoch": 0.3682065861170339, + "grad_norm": 0.482421875, + "learning_rate": 0.000140317357271906, + "loss": 0.5582, + "step": 7771 + }, + { + "epoch": 0.3682539682539683, + "grad_norm": 0.224609375, + "learning_rate": 0.00014030372826192607, + "loss": 0.0237, + "step": 7772 + }, + { + "epoch": 0.3683013503909026, + "grad_norm": 0.59765625, + "learning_rate": 0.00014029009835805236, + "loss": 1.1245, + "step": 7773 + }, + { + "epoch": 0.368348732527837, + "grad_norm": 0.671875, + "learning_rate": 0.00014027646756058722, + "loss": 0.0236, + "step": 7774 + }, + { + "epoch": 0.3683961146647714, + "grad_norm": 0.53515625, + "learning_rate": 0.0001402628358698329, + "loss": 0.8308, + "step": 7775 + }, + { + "epoch": 0.36844349680170574, + "grad_norm": 0.59375, + "learning_rate": 0.00014024920328609177, + "loss": 0.5748, + "step": 7776 + }, + { + "epoch": 0.36849087893864013, + "grad_norm": 0.81640625, + "learning_rate": 0.00014023556980966617, + "loss": 1.1517, + "step": 7777 + }, + { + "epoch": 0.3685382610755745, + "grad_norm": 1.4609375, + "learning_rate": 0.0001402219354408585, + "loss": 0.2758, + "step": 7778 + }, + { + "epoch": 0.36858564321250886, + "grad_norm": 0.59375, + "learning_rate": 0.00014020830017997117, + "loss": 1.1275, + "step": 7779 + }, + { + "epoch": 0.36863302534944326, + "grad_norm": 0.68359375, + "learning_rate": 0.00014019466402730654, + "loss": 1.3826, + "step": 7780 + }, + { + "epoch": 0.36868040748637765, + "grad_norm": 0.49609375, + "learning_rate": 0.00014018102698316708, + "loss": 1.2337, + "step": 7781 + }, + { + "epoch": 0.368727789623312, + "grad_norm": 0.7734375, + "learning_rate": 0.00014016738904785525, + "loss": 1.1935, + "step": 7782 + }, + { + "epoch": 0.3687751717602464, + "grad_norm": 0.62890625, + "learning_rate": 0.0001401537502216735, + "loss": 1.1739, + "step": 7783 + }, + { + "epoch": 0.36882255389718077, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014014011050492438, + "loss": 0.1418, + "step": 7784 + }, + { + "epoch": 0.36886993603411516, + "grad_norm": 0.7265625, + "learning_rate": 0.00014012646989791032, + "loss": 1.3438, + "step": 7785 + }, + { + "epoch": 0.3689173181710495, + "grad_norm": 0.609375, + "learning_rate": 0.0001401128284009339, + "loss": 1.1215, + "step": 7786 + }, + { + "epoch": 0.3689647003079839, + "grad_norm": 0.640625, + "learning_rate": 0.0001400991860142977, + "loss": 0.2389, + "step": 7787 + }, + { + "epoch": 0.3690120824449183, + "grad_norm": 0.55859375, + "learning_rate": 0.00014008554273830422, + "loss": 1.1212, + "step": 7788 + }, + { + "epoch": 0.3690594645818526, + "grad_norm": 0.87109375, + "learning_rate": 0.00014007189857325616, + "loss": 1.3939, + "step": 7789 + }, + { + "epoch": 0.369106846718787, + "grad_norm": 0.6796875, + "learning_rate": 0.00014005825351945609, + "loss": 1.1574, + "step": 7790 + }, + { + "epoch": 0.3691542288557214, + "grad_norm": 0.447265625, + "learning_rate": 0.00014004460757720654, + "loss": 0.633, + "step": 7791 + }, + { + "epoch": 0.36920161099265575, + "grad_norm": 0.734375, + "learning_rate": 0.0001400309607468103, + "loss": 1.1132, + "step": 7792 + }, + { + "epoch": 0.36924899312959014, + "grad_norm": 0.396484375, + "learning_rate": 0.00014001731302856995, + "loss": 0.0308, + "step": 7793 + }, + { + "epoch": 0.36929637526652453, + "grad_norm": 0.62109375, + "learning_rate": 0.00014000366442278828, + "loss": 0.5809, + "step": 7794 + }, + { + "epoch": 0.36934375740345887, + "grad_norm": 0.60546875, + "learning_rate": 0.0001399900149297679, + "loss": 0.8216, + "step": 7795 + }, + { + "epoch": 0.36939113954039327, + "grad_norm": 0.82421875, + "learning_rate": 0.00013997636454981158, + "loss": 0.9698, + "step": 7796 + }, + { + "epoch": 0.36943852167732766, + "grad_norm": 0.64453125, + "learning_rate": 0.0001399627132832221, + "loss": 1.0882, + "step": 7797 + }, + { + "epoch": 0.369485903814262, + "grad_norm": 0.62109375, + "learning_rate": 0.0001399490611303022, + "loss": 0.7544, + "step": 7798 + }, + { + "epoch": 0.3695332859511964, + "grad_norm": 0.63671875, + "learning_rate": 0.00013993540809135468, + "loss": 0.711, + "step": 7799 + }, + { + "epoch": 0.3695806680881308, + "grad_norm": 0.53515625, + "learning_rate": 0.00013992175416668233, + "loss": 0.3963, + "step": 7800 + }, + { + "epoch": 0.3696280502250652, + "grad_norm": 0.5234375, + "learning_rate": 0.00013990809935658798, + "loss": 0.0312, + "step": 7801 + }, + { + "epoch": 0.3696754323619995, + "grad_norm": 0.162109375, + "learning_rate": 0.00013989444366137454, + "loss": 0.0184, + "step": 7802 + }, + { + "epoch": 0.3697228144989339, + "grad_norm": 0.228515625, + "learning_rate": 0.0001398807870813448, + "loss": 0.1373, + "step": 7803 + }, + { + "epoch": 0.3697701966358683, + "grad_norm": 0.92578125, + "learning_rate": 0.00013986712961680167, + "loss": 1.1736, + "step": 7804 + }, + { + "epoch": 0.36981757877280264, + "grad_norm": 0.796875, + "learning_rate": 0.0001398534712680481, + "loss": 0.973, + "step": 7805 + }, + { + "epoch": 0.36986496090973703, + "grad_norm": 0.6171875, + "learning_rate": 0.000139839812035387, + "loss": 0.8525, + "step": 7806 + }, + { + "epoch": 0.3699123430466714, + "grad_norm": 0.00148773193359375, + "learning_rate": 0.00013982615191912128, + "loss": 0.0001, + "step": 7807 + }, + { + "epoch": 0.36995972518360576, + "grad_norm": 1.28125, + "learning_rate": 0.00013981249091955393, + "loss": 0.2925, + "step": 7808 + }, + { + "epoch": 0.37000710732054015, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013979882903698796, + "loss": 0.021, + "step": 7809 + }, + { + "epoch": 0.37005448945747454, + "grad_norm": 1.7890625, + "learning_rate": 0.0001397851662717263, + "loss": 0.3092, + "step": 7810 + }, + { + "epoch": 0.3701018715944089, + "grad_norm": 0.6171875, + "learning_rate": 0.00013977150262407207, + "loss": 1.2958, + "step": 7811 + }, + { + "epoch": 0.3701492537313433, + "grad_norm": 0.26953125, + "learning_rate": 0.00013975783809432827, + "loss": 0.0816, + "step": 7812 + }, + { + "epoch": 0.37019663586827767, + "grad_norm": 0.71875, + "learning_rate": 0.00013974417268279802, + "loss": 0.9872, + "step": 7813 + }, + { + "epoch": 0.37024401800521206, + "grad_norm": 0.55859375, + "learning_rate": 0.0001397305063897843, + "loss": 0.4005, + "step": 7814 + }, + { + "epoch": 0.3702914001421464, + "grad_norm": 0.6640625, + "learning_rate": 0.00013971683921559024, + "loss": 1.3248, + "step": 7815 + }, + { + "epoch": 0.3703387822790808, + "grad_norm": 0.52734375, + "learning_rate": 0.000139703171160519, + "loss": 0.5001, + "step": 7816 + }, + { + "epoch": 0.3703861644160152, + "grad_norm": 0.039306640625, + "learning_rate": 0.00013968950222487375, + "loss": 0.0041, + "step": 7817 + }, + { + "epoch": 0.3704335465529495, + "grad_norm": 0.578125, + "learning_rate": 0.00013967583240895762, + "loss": 0.1899, + "step": 7818 + }, + { + "epoch": 0.3704809286898839, + "grad_norm": 0.56640625, + "learning_rate": 0.00013966216171307374, + "loss": 0.7107, + "step": 7819 + }, + { + "epoch": 0.3705283108268183, + "grad_norm": 0.6953125, + "learning_rate": 0.0001396484901375254, + "loss": 1.0735, + "step": 7820 + }, + { + "epoch": 0.37057569296375265, + "grad_norm": 0.5078125, + "learning_rate": 0.0001396348176826158, + "loss": 0.914, + "step": 7821 + }, + { + "epoch": 0.37062307510068704, + "grad_norm": 0.5859375, + "learning_rate": 0.0001396211443486481, + "loss": 0.687, + "step": 7822 + }, + { + "epoch": 0.37067045723762143, + "grad_norm": 0.62109375, + "learning_rate": 0.00013960747013592567, + "loss": 0.3023, + "step": 7823 + }, + { + "epoch": 0.37071783937455577, + "grad_norm": 0.54296875, + "learning_rate": 0.0001395937950447517, + "loss": 0.8713, + "step": 7824 + }, + { + "epoch": 0.37076522151149016, + "grad_norm": 0.6328125, + "learning_rate": 0.00013958011907542957, + "loss": 0.7141, + "step": 7825 + }, + { + "epoch": 0.37081260364842455, + "grad_norm": 0.498046875, + "learning_rate": 0.0001395664422282625, + "loss": 0.8826, + "step": 7826 + }, + { + "epoch": 0.3708599857853589, + "grad_norm": 0.79296875, + "learning_rate": 0.00013955276450355392, + "loss": 1.0949, + "step": 7827 + }, + { + "epoch": 0.3709073679222933, + "grad_norm": 0.69921875, + "learning_rate": 0.0001395390859016072, + "loss": 1.1836, + "step": 7828 + }, + { + "epoch": 0.3709547500592277, + "grad_norm": 0.9140625, + "learning_rate": 0.00013952540642272562, + "loss": 0.7109, + "step": 7829 + }, + { + "epoch": 0.37100213219616207, + "grad_norm": 0.37890625, + "learning_rate": 0.00013951172606721263, + "loss": 0.065, + "step": 7830 + }, + { + "epoch": 0.3710495143330964, + "grad_norm": 0.17578125, + "learning_rate": 0.00013949804483537163, + "loss": 0.1274, + "step": 7831 + }, + { + "epoch": 0.3710968964700308, + "grad_norm": 1.3984375, + "learning_rate": 0.0001394843627275061, + "loss": 0.7424, + "step": 7832 + }, + { + "epoch": 0.3711442786069652, + "grad_norm": 0.8046875, + "learning_rate": 0.00013947067974391943, + "loss": 0.9609, + "step": 7833 + }, + { + "epoch": 0.37119166074389953, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013945699588491513, + "loss": 0.026, + "step": 7834 + }, + { + "epoch": 0.3712390428808339, + "grad_norm": 0.22265625, + "learning_rate": 0.0001394433111507967, + "loss": 0.1169, + "step": 7835 + }, + { + "epoch": 0.3712864250177683, + "grad_norm": 0.69921875, + "learning_rate": 0.00013942962554186765, + "loss": 1.5699, + "step": 7836 + }, + { + "epoch": 0.37133380715470266, + "grad_norm": 0.79296875, + "learning_rate": 0.00013941593905843148, + "loss": 1.2091, + "step": 7837 + }, + { + "epoch": 0.37138118929163705, + "grad_norm": 0.7890625, + "learning_rate": 0.0001394022517007918, + "loss": 0.9868, + "step": 7838 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.423828125, + "learning_rate": 0.0001393885634692521, + "loss": 0.0908, + "step": 7839 + }, + { + "epoch": 0.3714759535655058, + "grad_norm": 0.6171875, + "learning_rate": 0.00013937487436411607, + "loss": 0.877, + "step": 7840 + }, + { + "epoch": 0.37152333570244017, + "grad_norm": 0.6875, + "learning_rate": 0.0001393611843856872, + "loss": 0.9356, + "step": 7841 + }, + { + "epoch": 0.37157071783937456, + "grad_norm": 0.671875, + "learning_rate": 0.00013934749353426923, + "loss": 1.2068, + "step": 7842 + }, + { + "epoch": 0.37161809997630896, + "grad_norm": 0.74609375, + "learning_rate": 0.00013933380181016576, + "loss": 1.2205, + "step": 7843 + }, + { + "epoch": 0.3716654821132433, + "grad_norm": 0.00173187255859375, + "learning_rate": 0.00013932010921368049, + "loss": 0.0001, + "step": 7844 + }, + { + "epoch": 0.3717128642501777, + "grad_norm": 0.875, + "learning_rate": 0.00013930641574511705, + "loss": 1.0164, + "step": 7845 + }, + { + "epoch": 0.3717602463871121, + "grad_norm": 0.033203125, + "learning_rate": 0.00013929272140477918, + "loss": 0.0033, + "step": 7846 + }, + { + "epoch": 0.3718076285240464, + "grad_norm": 0.5859375, + "learning_rate": 0.00013927902619297058, + "loss": 0.6085, + "step": 7847 + }, + { + "epoch": 0.3718550106609808, + "grad_norm": 0.7109375, + "learning_rate": 0.00013926533010999505, + "loss": 1.1653, + "step": 7848 + }, + { + "epoch": 0.3719023927979152, + "grad_norm": 0.640625, + "learning_rate": 0.00013925163315615631, + "loss": 1.0358, + "step": 7849 + }, + { + "epoch": 0.37194977493484954, + "grad_norm": 0.6328125, + "learning_rate": 0.00013923793533175815, + "loss": 1.3688, + "step": 7850 + }, + { + "epoch": 0.37199715707178393, + "grad_norm": 0.6640625, + "learning_rate": 0.00013922423663710438, + "loss": 0.641, + "step": 7851 + }, + { + "epoch": 0.37204453920871833, + "grad_norm": 0.55859375, + "learning_rate": 0.00013921053707249885, + "loss": 0.7746, + "step": 7852 + }, + { + "epoch": 0.37209192134565267, + "grad_norm": 0.50390625, + "learning_rate": 0.00013919683663824532, + "loss": 0.0762, + "step": 7853 + }, + { + "epoch": 0.37213930348258706, + "grad_norm": 0.71875, + "learning_rate": 0.00013918313533464773, + "loss": 0.0893, + "step": 7854 + }, + { + "epoch": 0.37218668561952145, + "grad_norm": 0.64453125, + "learning_rate": 0.00013916943316200995, + "loss": 0.939, + "step": 7855 + }, + { + "epoch": 0.3722340677564558, + "grad_norm": 0.60546875, + "learning_rate": 0.00013915573012063582, + "loss": 1.1455, + "step": 7856 + }, + { + "epoch": 0.3722814498933902, + "grad_norm": 0.5, + "learning_rate": 0.00013914202621082935, + "loss": 0.502, + "step": 7857 + }, + { + "epoch": 0.3723288320303246, + "grad_norm": 0.70703125, + "learning_rate": 0.0001391283214328944, + "loss": 1.1999, + "step": 7858 + }, + { + "epoch": 0.37237621416725897, + "grad_norm": 0.70703125, + "learning_rate": 0.00013911461578713498, + "loss": 0.9362, + "step": 7859 + }, + { + "epoch": 0.3724235963041933, + "grad_norm": 0.69140625, + "learning_rate": 0.00013910090927385507, + "loss": 1.2651, + "step": 7860 + }, + { + "epoch": 0.3724709784411277, + "grad_norm": 0.2421875, + "learning_rate": 0.00013908720189335856, + "loss": 0.0403, + "step": 7861 + }, + { + "epoch": 0.3725183605780621, + "grad_norm": 0.60546875, + "learning_rate": 0.00013907349364594959, + "loss": 1.3027, + "step": 7862 + }, + { + "epoch": 0.37256574271499643, + "grad_norm": 0.703125, + "learning_rate": 0.00013905978453193217, + "loss": 0.1854, + "step": 7863 + }, + { + "epoch": 0.3726131248519308, + "grad_norm": 0.7890625, + "learning_rate": 0.0001390460745516103, + "loss": 1.1814, + "step": 7864 + }, + { + "epoch": 0.3726605069888652, + "grad_norm": 0.8671875, + "learning_rate": 0.00013903236370528812, + "loss": 1.1386, + "step": 7865 + }, + { + "epoch": 0.37270788912579955, + "grad_norm": 0.76171875, + "learning_rate": 0.00013901865199326968, + "loss": 1.2676, + "step": 7866 + }, + { + "epoch": 0.37275527126273394, + "grad_norm": 1.1015625, + "learning_rate": 0.0001390049394158591, + "loss": 1.6905, + "step": 7867 + }, + { + "epoch": 0.37280265339966834, + "grad_norm": 0.69140625, + "learning_rate": 0.0001389912259733605, + "loss": 1.3225, + "step": 7868 + }, + { + "epoch": 0.3728500355366027, + "grad_norm": 0.77734375, + "learning_rate": 0.00013897751166607803, + "loss": 0.8471, + "step": 7869 + }, + { + "epoch": 0.37289741767353707, + "grad_norm": 0.68359375, + "learning_rate": 0.00013896379649431587, + "loss": 1.0288, + "step": 7870 + }, + { + "epoch": 0.37294479981047146, + "grad_norm": 0.6484375, + "learning_rate": 0.00013895008045837823, + "loss": 0.7012, + "step": 7871 + }, + { + "epoch": 0.37299218194740585, + "grad_norm": 0.5703125, + "learning_rate": 0.00013893636355856925, + "loss": 0.4255, + "step": 7872 + }, + { + "epoch": 0.3730395640843402, + "grad_norm": 0.2373046875, + "learning_rate": 0.00013892264579519324, + "loss": 0.1423, + "step": 7873 + }, + { + "epoch": 0.3730869462212746, + "grad_norm": 0.3359375, + "learning_rate": 0.0001389089271685544, + "loss": 0.1286, + "step": 7874 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.5859375, + "learning_rate": 0.00013889520767895698, + "loss": 0.8213, + "step": 7875 + }, + { + "epoch": 0.3731817104951433, + "grad_norm": 0.255859375, + "learning_rate": 0.0001388814873267053, + "loss": 0.0258, + "step": 7876 + }, + { + "epoch": 0.3732290926320777, + "grad_norm": 0.234375, + "learning_rate": 0.00013886776611210364, + "loss": 0.0293, + "step": 7877 + }, + { + "epoch": 0.3732764747690121, + "grad_norm": 0.53515625, + "learning_rate": 0.00013885404403545635, + "loss": 0.4202, + "step": 7878 + }, + { + "epoch": 0.37332385690594644, + "grad_norm": 0.64453125, + "learning_rate": 0.0001388403210970677, + "loss": 0.8329, + "step": 7879 + }, + { + "epoch": 0.37337123904288083, + "grad_norm": 0.6953125, + "learning_rate": 0.00013882659729724212, + "loss": 0.7784, + "step": 7880 + }, + { + "epoch": 0.3734186211798152, + "grad_norm": 0.2294921875, + "learning_rate": 0.00013881287263628396, + "loss": 0.1253, + "step": 7881 + }, + { + "epoch": 0.37346600331674956, + "grad_norm": 0.080078125, + "learning_rate": 0.00013879914711449766, + "loss": 0.0111, + "step": 7882 + }, + { + "epoch": 0.37351338545368395, + "grad_norm": 0.05908203125, + "learning_rate": 0.00013878542073218755, + "loss": 0.0033, + "step": 7883 + }, + { + "epoch": 0.37356076759061835, + "grad_norm": 0.69921875, + "learning_rate": 0.00013877169348965819, + "loss": 1.2487, + "step": 7884 + }, + { + "epoch": 0.3736081497275527, + "grad_norm": 0.64453125, + "learning_rate": 0.00013875796538721392, + "loss": 0.2762, + "step": 7885 + }, + { + "epoch": 0.3736555318644871, + "grad_norm": 0.51953125, + "learning_rate": 0.00013874423642515932, + "loss": 1.1925, + "step": 7886 + }, + { + "epoch": 0.37370291400142147, + "grad_norm": 0.62890625, + "learning_rate": 0.00013873050660379873, + "loss": 1.1001, + "step": 7887 + }, + { + "epoch": 0.37375029613835586, + "grad_norm": 0.7265625, + "learning_rate": 0.00013871677592343682, + "loss": 0.7461, + "step": 7888 + }, + { + "epoch": 0.3737976782752902, + "grad_norm": 1.1171875, + "learning_rate": 0.00013870304438437804, + "loss": 0.6083, + "step": 7889 + }, + { + "epoch": 0.3738450604122246, + "grad_norm": 0.51171875, + "learning_rate": 0.00013868931198692696, + "loss": 0.517, + "step": 7890 + }, + { + "epoch": 0.373892442549159, + "grad_norm": 0.5859375, + "learning_rate": 0.00013867557873138814, + "loss": 0.9896, + "step": 7891 + }, + { + "epoch": 0.3739398246860933, + "grad_norm": 0.466796875, + "learning_rate": 0.0001386618446180662, + "loss": 0.0736, + "step": 7892 + }, + { + "epoch": 0.3739872068230277, + "grad_norm": 0.5859375, + "learning_rate": 0.00013864810964726572, + "loss": 0.8012, + "step": 7893 + }, + { + "epoch": 0.3740345889599621, + "grad_norm": 0.494140625, + "learning_rate": 0.00013863437381929133, + "loss": 0.2274, + "step": 7894 + }, + { + "epoch": 0.37408197109689645, + "grad_norm": 1.2734375, + "learning_rate": 0.00013862063713444768, + "loss": 0.2137, + "step": 7895 + }, + { + "epoch": 0.37412935323383084, + "grad_norm": 0.6875, + "learning_rate": 0.00013860689959303946, + "loss": 1.0898, + "step": 7896 + }, + { + "epoch": 0.37417673537076523, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001385931611953713, + "loss": 0.0916, + "step": 7897 + }, + { + "epoch": 0.37422411750769957, + "grad_norm": 1.046875, + "learning_rate": 0.00013857942194174793, + "loss": 0.2023, + "step": 7898 + }, + { + "epoch": 0.37427149964463396, + "grad_norm": 0.154296875, + "learning_rate": 0.00013856568183247408, + "loss": 0.1192, + "step": 7899 + }, + { + "epoch": 0.37431888178156836, + "grad_norm": 0.474609375, + "learning_rate": 0.00013855194086785451, + "loss": 0.411, + "step": 7900 + }, + { + "epoch": 0.37436626391850275, + "grad_norm": 0.50390625, + "learning_rate": 0.00013853819904819395, + "loss": 0.5893, + "step": 7901 + }, + { + "epoch": 0.3744136460554371, + "grad_norm": 0.240234375, + "learning_rate": 0.00013852445637379716, + "loss": 0.1442, + "step": 7902 + }, + { + "epoch": 0.3744610281923715, + "grad_norm": 0.80859375, + "learning_rate": 0.00013851071284496898, + "loss": 0.8285, + "step": 7903 + }, + { + "epoch": 0.3745084103293059, + "grad_norm": 0.119140625, + "learning_rate": 0.00013849696846201417, + "loss": 0.0176, + "step": 7904 + }, + { + "epoch": 0.3745557924662402, + "grad_norm": 0.294921875, + "learning_rate": 0.00013848322322523765, + "loss": 0.112, + "step": 7905 + }, + { + "epoch": 0.3746031746031746, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001384694771349442, + "loss": 0.1176, + "step": 7906 + }, + { + "epoch": 0.374650556740109, + "grad_norm": 0.7421875, + "learning_rate": 0.00013845573019143876, + "loss": 0.793, + "step": 7907 + }, + { + "epoch": 0.37469793887704334, + "grad_norm": 0.66796875, + "learning_rate": 0.00013844198239502614, + "loss": 1.3484, + "step": 7908 + }, + { + "epoch": 0.37474532101397773, + "grad_norm": 0.796875, + "learning_rate": 0.00013842823374601135, + "loss": 1.2427, + "step": 7909 + }, + { + "epoch": 0.3747927031509121, + "grad_norm": 0.49609375, + "learning_rate": 0.00013841448424469922, + "loss": 1.1234, + "step": 7910 + }, + { + "epoch": 0.37484008528784646, + "grad_norm": 0.8671875, + "learning_rate": 0.00013840073389139476, + "loss": 0.0352, + "step": 7911 + }, + { + "epoch": 0.37488746742478085, + "grad_norm": 0.578125, + "learning_rate": 0.00013838698268640293, + "loss": 0.8207, + "step": 7912 + }, + { + "epoch": 0.37493484956171524, + "grad_norm": 0.54296875, + "learning_rate": 0.0001383732306300287, + "loss": 0.5687, + "step": 7913 + }, + { + "epoch": 0.3749822316986496, + "grad_norm": 0.0791015625, + "learning_rate": 0.00013835947772257708, + "loss": 0.0053, + "step": 7914 + }, + { + "epoch": 0.375029613835584, + "grad_norm": 0.23046875, + "learning_rate": 0.00013834572396435318, + "loss": 0.1277, + "step": 7915 + }, + { + "epoch": 0.37507699597251837, + "grad_norm": 0.83203125, + "learning_rate": 0.00013833196935566188, + "loss": 1.2075, + "step": 7916 + }, + { + "epoch": 0.37512437810945276, + "grad_norm": 0.7109375, + "learning_rate": 0.00013831821389680837, + "loss": 0.0653, + "step": 7917 + }, + { + "epoch": 0.3751717602463871, + "grad_norm": 0.671875, + "learning_rate": 0.00013830445758809766, + "loss": 1.2123, + "step": 7918 + }, + { + "epoch": 0.3752191423833215, + "grad_norm": 0.78125, + "learning_rate": 0.0001382907004298349, + "loss": 1.2025, + "step": 7919 + }, + { + "epoch": 0.3752665245202559, + "grad_norm": 0.48828125, + "learning_rate": 0.00013827694242232519, + "loss": 0.7279, + "step": 7920 + }, + { + "epoch": 0.3753139066571902, + "grad_norm": 0.9375, + "learning_rate": 0.00013826318356587367, + "loss": 0.2879, + "step": 7921 + }, + { + "epoch": 0.3753612887941246, + "grad_norm": 0.55859375, + "learning_rate": 0.00013824942386078548, + "loss": 0.8827, + "step": 7922 + }, + { + "epoch": 0.375408670931059, + "grad_norm": 0.6171875, + "learning_rate": 0.00013823566330736583, + "loss": 0.9701, + "step": 7923 + }, + { + "epoch": 0.37545605306799334, + "grad_norm": 0.82421875, + "learning_rate": 0.0001382219019059199, + "loss": 0.8685, + "step": 7924 + }, + { + "epoch": 0.37550343520492774, + "grad_norm": 1.4140625, + "learning_rate": 0.00013820813965675287, + "loss": 0.6137, + "step": 7925 + }, + { + "epoch": 0.37555081734186213, + "grad_norm": 0.65625, + "learning_rate": 0.00013819437656017, + "loss": 0.6669, + "step": 7926 + }, + { + "epoch": 0.37559819947879647, + "grad_norm": 0.578125, + "learning_rate": 0.00013818061261647654, + "loss": 1.0922, + "step": 7927 + }, + { + "epoch": 0.37564558161573086, + "grad_norm": 0.62109375, + "learning_rate": 0.0001381668478259778, + "loss": 0.8572, + "step": 7928 + }, + { + "epoch": 0.37569296375266525, + "grad_norm": 0.81640625, + "learning_rate": 0.000138153082188979, + "loss": 0.2273, + "step": 7929 + }, + { + "epoch": 0.37574034588959965, + "grad_norm": 0.8125, + "learning_rate": 0.00013813931570578548, + "loss": 1.2525, + "step": 7930 + }, + { + "epoch": 0.375787728026534, + "grad_norm": 0.283203125, + "learning_rate": 0.00013812554837670256, + "loss": 0.0903, + "step": 7931 + }, + { + "epoch": 0.3758351101634684, + "grad_norm": 0.51171875, + "learning_rate": 0.0001381117802020356, + "loss": 0.0948, + "step": 7932 + }, + { + "epoch": 0.37588249230040277, + "grad_norm": 0.6796875, + "learning_rate": 0.00013809801118208994, + "loss": 0.852, + "step": 7933 + }, + { + "epoch": 0.3759298744373371, + "grad_norm": 0.6875, + "learning_rate": 0.00013808424131717096, + "loss": 0.9516, + "step": 7934 + }, + { + "epoch": 0.3759772565742715, + "grad_norm": 0.60546875, + "learning_rate": 0.00013807047060758408, + "loss": 0.7198, + "step": 7935 + }, + { + "epoch": 0.3760246387112059, + "grad_norm": 0.107421875, + "learning_rate": 0.00013805669905363473, + "loss": 0.0043, + "step": 7936 + }, + { + "epoch": 0.37607202084814023, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001380429266556283, + "loss": 0.0083, + "step": 7937 + }, + { + "epoch": 0.3761194029850746, + "grad_norm": 0.4375, + "learning_rate": 0.00013802915341387027, + "loss": 0.4724, + "step": 7938 + }, + { + "epoch": 0.376166785122009, + "grad_norm": 0.6015625, + "learning_rate": 0.00013801537932866617, + "loss": 1.1248, + "step": 7939 + }, + { + "epoch": 0.37621416725894335, + "grad_norm": 0.197265625, + "learning_rate": 0.00013800160440032144, + "loss": 0.0161, + "step": 7940 + }, + { + "epoch": 0.37626154939587775, + "grad_norm": 0.94140625, + "learning_rate": 0.0001379878286291416, + "loss": 0.6025, + "step": 7941 + }, + { + "epoch": 0.37630893153281214, + "grad_norm": 0.55859375, + "learning_rate": 0.00013797405201543215, + "loss": 0.0961, + "step": 7942 + }, + { + "epoch": 0.3763563136697465, + "grad_norm": 0.1220703125, + "learning_rate": 0.0001379602745594987, + "loss": 0.0171, + "step": 7943 + }, + { + "epoch": 0.37640369580668087, + "grad_norm": 0.173828125, + "learning_rate": 0.0001379464962616468, + "loss": 0.1276, + "step": 7944 + }, + { + "epoch": 0.37645107794361526, + "grad_norm": 0.80859375, + "learning_rate": 0.000137932717122182, + "loss": 1.0654, + "step": 7945 + }, + { + "epoch": 0.37649846008054966, + "grad_norm": 0.54296875, + "learning_rate": 0.00013791893714140997, + "loss": 1.248, + "step": 7946 + }, + { + "epoch": 0.376545842217484, + "grad_norm": 0.19140625, + "learning_rate": 0.00013790515631963631, + "loss": 0.012, + "step": 7947 + }, + { + "epoch": 0.3765932243544184, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00013789137465716666, + "loss": 0.0009, + "step": 7948 + }, + { + "epoch": 0.3766406064913528, + "grad_norm": 0.61328125, + "learning_rate": 0.00013787759215430664, + "loss": 0.8535, + "step": 7949 + }, + { + "epoch": 0.3766879886282871, + "grad_norm": 0.197265625, + "learning_rate": 0.000137863808811362, + "loss": 0.1482, + "step": 7950 + }, + { + "epoch": 0.3767353707652215, + "grad_norm": 0.1953125, + "learning_rate": 0.00013785002462863842, + "loss": 0.0033, + "step": 7951 + }, + { + "epoch": 0.3767827529021559, + "grad_norm": 0.64453125, + "learning_rate": 0.00013783623960644155, + "loss": 1.3051, + "step": 7952 + }, + { + "epoch": 0.37683013503909024, + "grad_norm": 0.90625, + "learning_rate": 0.00013782245374507725, + "loss": 0.4477, + "step": 7953 + }, + { + "epoch": 0.37687751717602463, + "grad_norm": 0.22265625, + "learning_rate": 0.0001378086670448512, + "loss": 0.0992, + "step": 7954 + }, + { + "epoch": 0.376924899312959, + "grad_norm": 0.5859375, + "learning_rate": 0.00013779487950606919, + "loss": 0.7131, + "step": 7955 + }, + { + "epoch": 0.37697228144989336, + "grad_norm": 0.69921875, + "learning_rate": 0.000137781091129037, + "loss": 1.0882, + "step": 7956 + }, + { + "epoch": 0.37701966358682776, + "grad_norm": 0.71875, + "learning_rate": 0.00013776730191406044, + "loss": 0.9286, + "step": 7957 + }, + { + "epoch": 0.37706704572376215, + "grad_norm": 0.66796875, + "learning_rate": 0.00013775351186144538, + "loss": 1.1329, + "step": 7958 + }, + { + "epoch": 0.37711442786069654, + "grad_norm": 0.671875, + "learning_rate": 0.00013773972097149762, + "loss": 0.8817, + "step": 7959 + }, + { + "epoch": 0.3771618099976309, + "grad_norm": 0.63671875, + "learning_rate": 0.00013772592924452304, + "loss": 1.0865, + "step": 7960 + }, + { + "epoch": 0.3772091921345653, + "grad_norm": 0.062255859375, + "learning_rate": 0.00013771213668082753, + "loss": 0.0069, + "step": 7961 + }, + { + "epoch": 0.37725657427149967, + "grad_norm": 1.359375, + "learning_rate": 0.00013769834328071704, + "loss": 0.8624, + "step": 7962 + }, + { + "epoch": 0.377303956408434, + "grad_norm": 0.58203125, + "learning_rate": 0.00013768454904449743, + "loss": 0.7787, + "step": 7963 + }, + { + "epoch": 0.3773513385453684, + "grad_norm": 0.1943359375, + "learning_rate": 0.00013767075397247465, + "loss": 0.0185, + "step": 7964 + }, + { + "epoch": 0.3773987206823028, + "grad_norm": 0.91015625, + "learning_rate": 0.00013765695806495468, + "loss": 0.9429, + "step": 7965 + }, + { + "epoch": 0.37744610281923713, + "grad_norm": 0.7578125, + "learning_rate": 0.0001376431613222435, + "loss": 1.2558, + "step": 7966 + }, + { + "epoch": 0.3774934849561715, + "grad_norm": 0.87890625, + "learning_rate": 0.00013762936374464707, + "loss": 0.9674, + "step": 7967 + }, + { + "epoch": 0.3775408670931059, + "grad_norm": 0.74609375, + "learning_rate": 0.00013761556533247143, + "loss": 0.7816, + "step": 7968 + }, + { + "epoch": 0.37758824923004025, + "grad_norm": 1.0234375, + "learning_rate": 0.00013760176608602263, + "loss": 0.4469, + "step": 7969 + }, + { + "epoch": 0.37763563136697464, + "grad_norm": 0.47265625, + "learning_rate": 0.00013758796600560675, + "loss": 0.1748, + "step": 7970 + }, + { + "epoch": 0.37768301350390904, + "grad_norm": 0.64453125, + "learning_rate": 0.00013757416509152978, + "loss": 0.0476, + "step": 7971 + }, + { + "epoch": 0.3777303956408434, + "grad_norm": 0.2734375, + "learning_rate": 0.00013756036334409784, + "loss": 0.1644, + "step": 7972 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.60546875, + "learning_rate": 0.00013754656076361707, + "loss": 0.8887, + "step": 7973 + }, + { + "epoch": 0.37782515991471216, + "grad_norm": 0.21484375, + "learning_rate": 0.0001375327573503936, + "loss": 0.0274, + "step": 7974 + }, + { + "epoch": 0.37787254205164655, + "grad_norm": 0.76953125, + "learning_rate": 0.0001375189531047335, + "loss": 1.3528, + "step": 7975 + }, + { + "epoch": 0.3779199241885809, + "grad_norm": 0.06640625, + "learning_rate": 0.000137505148026943, + "loss": 0.0035, + "step": 7976 + }, + { + "epoch": 0.3779673063255153, + "grad_norm": 0.52734375, + "learning_rate": 0.00013749134211732828, + "loss": 0.9132, + "step": 7977 + }, + { + "epoch": 0.3780146884624497, + "grad_norm": 0.703125, + "learning_rate": 0.00013747753537619552, + "loss": 0.7628, + "step": 7978 + }, + { + "epoch": 0.378062070599384, + "grad_norm": 0.1796875, + "learning_rate": 0.00013746372780385095, + "loss": 0.0163, + "step": 7979 + }, + { + "epoch": 0.3781094527363184, + "grad_norm": 0.625, + "learning_rate": 0.00013744991940060078, + "loss": 1.0008, + "step": 7980 + }, + { + "epoch": 0.3781568348732528, + "grad_norm": 0.62109375, + "learning_rate": 0.00013743611016675132, + "loss": 0.8128, + "step": 7981 + }, + { + "epoch": 0.37820421701018714, + "grad_norm": 0.70703125, + "learning_rate": 0.00013742230010260878, + "loss": 0.9581, + "step": 7982 + }, + { + "epoch": 0.37825159914712153, + "grad_norm": 0.77734375, + "learning_rate": 0.0001374084892084795, + "loss": 0.9201, + "step": 7983 + }, + { + "epoch": 0.3782989812840559, + "grad_norm": 0.7109375, + "learning_rate": 0.00013739467748466973, + "loss": 1.4496, + "step": 7984 + }, + { + "epoch": 0.37834636342099026, + "grad_norm": 0.6484375, + "learning_rate": 0.0001373808649314859, + "loss": 1.2382, + "step": 7985 + }, + { + "epoch": 0.37839374555792465, + "grad_norm": 0.4609375, + "learning_rate": 0.0001373670515492343, + "loss": 0.1384, + "step": 7986 + }, + { + "epoch": 0.37844112769485905, + "grad_norm": 0.00262451171875, + "learning_rate": 0.00013735323733822129, + "loss": 0.0002, + "step": 7987 + }, + { + "epoch": 0.37848850983179344, + "grad_norm": 0.98828125, + "learning_rate": 0.00013733942229875323, + "loss": 1.2091, + "step": 7988 + }, + { + "epoch": 0.3785358919687278, + "grad_norm": 0.62890625, + "learning_rate": 0.0001373256064311366, + "loss": 0.8466, + "step": 7989 + }, + { + "epoch": 0.37858327410566217, + "grad_norm": 0.68359375, + "learning_rate": 0.00013731178973567775, + "loss": 0.9977, + "step": 7990 + }, + { + "epoch": 0.37863065624259656, + "grad_norm": 0.57421875, + "learning_rate": 0.00013729797221268317, + "loss": 0.1717, + "step": 7991 + }, + { + "epoch": 0.3786780383795309, + "grad_norm": 0.48828125, + "learning_rate": 0.00013728415386245928, + "loss": 0.5609, + "step": 7992 + }, + { + "epoch": 0.3787254205164653, + "grad_norm": 0.8359375, + "learning_rate": 0.00013727033468531255, + "loss": 0.844, + "step": 7993 + }, + { + "epoch": 0.3787728026533997, + "grad_norm": 0.91015625, + "learning_rate": 0.00013725651468154954, + "loss": 0.3045, + "step": 7994 + }, + { + "epoch": 0.378820184790334, + "grad_norm": 0.7578125, + "learning_rate": 0.0001372426938514767, + "loss": 0.8474, + "step": 7995 + }, + { + "epoch": 0.3788675669272684, + "grad_norm": 0.59765625, + "learning_rate": 0.00013722887219540057, + "loss": 0.6619, + "step": 7996 + }, + { + "epoch": 0.3789149490642028, + "grad_norm": 0.416015625, + "learning_rate": 0.0001372150497136277, + "loss": 0.0171, + "step": 7997 + }, + { + "epoch": 0.37896233120113715, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001372012264064647, + "loss": 0.0394, + "step": 7998 + }, + { + "epoch": 0.37900971333807154, + "grad_norm": 0.484375, + "learning_rate": 0.0001371874022742181, + "loss": 0.7911, + "step": 7999 + }, + { + "epoch": 0.37905709547500593, + "grad_norm": 0.734375, + "learning_rate": 0.00013717357731719455, + "loss": 1.3489, + "step": 8000 + }, + { + "epoch": 0.37910447761194027, + "grad_norm": 0.94921875, + "learning_rate": 0.0001371597515357006, + "loss": 0.2617, + "step": 8001 + }, + { + "epoch": 0.37915185974887466, + "grad_norm": 0.640625, + "learning_rate": 0.00013714592493004299, + "loss": 0.9439, + "step": 8002 + }, + { + "epoch": 0.37919924188580906, + "grad_norm": 0.6953125, + "learning_rate": 0.0001371320975005283, + "loss": 1.1759, + "step": 8003 + }, + { + "epoch": 0.37924662402274345, + "grad_norm": 0.62109375, + "learning_rate": 0.00013711826924746328, + "loss": 0.4806, + "step": 8004 + }, + { + "epoch": 0.3792940061596778, + "grad_norm": 0.859375, + "learning_rate": 0.00013710444017115452, + "loss": 0.7866, + "step": 8005 + }, + { + "epoch": 0.3793413882966122, + "grad_norm": 0.58984375, + "learning_rate": 0.00013709061027190886, + "loss": 0.8067, + "step": 8006 + }, + { + "epoch": 0.3793887704335466, + "grad_norm": 0.76953125, + "learning_rate": 0.00013707677955003292, + "loss": 1.244, + "step": 8007 + }, + { + "epoch": 0.3794361525704809, + "grad_norm": 0.78125, + "learning_rate": 0.00013706294800583355, + "loss": 1.1158, + "step": 8008 + }, + { + "epoch": 0.3794835347074153, + "grad_norm": 0.66015625, + "learning_rate": 0.00013704911563961747, + "loss": 1.2878, + "step": 8009 + }, + { + "epoch": 0.3795309168443497, + "grad_norm": 0.92578125, + "learning_rate": 0.00013703528245169144, + "loss": 0.9559, + "step": 8010 + }, + { + "epoch": 0.37957829898128403, + "grad_norm": 0.61328125, + "learning_rate": 0.0001370214484423623, + "loss": 0.9671, + "step": 8011 + }, + { + "epoch": 0.3796256811182184, + "grad_norm": 0.267578125, + "learning_rate": 0.00013700761361193687, + "loss": 0.1168, + "step": 8012 + }, + { + "epoch": 0.3796730632551528, + "grad_norm": 0.051025390625, + "learning_rate": 0.000136993777960722, + "loss": 0.0051, + "step": 8013 + }, + { + "epoch": 0.37972044539208716, + "grad_norm": 0.5234375, + "learning_rate": 0.0001369799414890245, + "loss": 0.9547, + "step": 8014 + }, + { + "epoch": 0.37976782752902155, + "grad_norm": 0.1884765625, + "learning_rate": 0.00013696610419715132, + "loss": 0.0117, + "step": 8015 + }, + { + "epoch": 0.37981520966595594, + "grad_norm": 0.84765625, + "learning_rate": 0.00013695226608540932, + "loss": 1.1277, + "step": 8016 + }, + { + "epoch": 0.37986259180289034, + "grad_norm": 0.65625, + "learning_rate": 0.00013693842715410543, + "loss": 1.2176, + "step": 8017 + }, + { + "epoch": 0.3799099739398247, + "grad_norm": 0.72265625, + "learning_rate": 0.00013692458740354656, + "loss": 1.3435, + "step": 8018 + }, + { + "epoch": 0.37995735607675907, + "grad_norm": 0.71875, + "learning_rate": 0.00013691074683403967, + "loss": 0.7874, + "step": 8019 + }, + { + "epoch": 0.38000473821369346, + "grad_norm": 0.474609375, + "learning_rate": 0.00013689690544589172, + "loss": 0.0587, + "step": 8020 + }, + { + "epoch": 0.3800521203506278, + "grad_norm": 0.6796875, + "learning_rate": 0.00013688306323940972, + "loss": 0.8233, + "step": 8021 + }, + { + "epoch": 0.3800995024875622, + "grad_norm": 0.70703125, + "learning_rate": 0.00013686922021490064, + "loss": 1.1287, + "step": 8022 + }, + { + "epoch": 0.3801468846244966, + "grad_norm": 0.76171875, + "learning_rate": 0.00013685537637267157, + "loss": 1.0793, + "step": 8023 + }, + { + "epoch": 0.3801942667614309, + "grad_norm": 0.57421875, + "learning_rate": 0.0001368415317130295, + "loss": 0.697, + "step": 8024 + }, + { + "epoch": 0.3802416488983653, + "grad_norm": 0.26953125, + "learning_rate": 0.0001368276862362815, + "loss": 0.0584, + "step": 8025 + }, + { + "epoch": 0.3802890310352997, + "grad_norm": 0.57421875, + "learning_rate": 0.0001368138399427346, + "loss": 0.984, + "step": 8026 + }, + { + "epoch": 0.38033641317223404, + "grad_norm": 0.8515625, + "learning_rate": 0.000136799992832696, + "loss": 0.554, + "step": 8027 + }, + { + "epoch": 0.38038379530916844, + "grad_norm": 0.5546875, + "learning_rate": 0.00013678614490647271, + "loss": 1.06, + "step": 8028 + }, + { + "epoch": 0.38043117744610283, + "grad_norm": 0.921875, + "learning_rate": 0.00013677229616437193, + "loss": 0.3189, + "step": 8029 + }, + { + "epoch": 0.38047855958303717, + "grad_norm": 0.7109375, + "learning_rate": 0.0001367584466067008, + "loss": 0.7537, + "step": 8030 + }, + { + "epoch": 0.38052594171997156, + "grad_norm": 0.609375, + "learning_rate": 0.00013674459623376646, + "loss": 0.9905, + "step": 8031 + }, + { + "epoch": 0.38057332385690595, + "grad_norm": 0.62890625, + "learning_rate": 0.0001367307450458761, + "loss": 0.979, + "step": 8032 + }, + { + "epoch": 0.38062070599384035, + "grad_norm": 0.294921875, + "learning_rate": 0.000136716893043337, + "loss": 0.0686, + "step": 8033 + }, + { + "epoch": 0.3806680881307747, + "grad_norm": 1.1484375, + "learning_rate": 0.00013670304022645626, + "loss": 0.0448, + "step": 8034 + }, + { + "epoch": 0.3807154702677091, + "grad_norm": 0.75, + "learning_rate": 0.00013668918659554122, + "loss": 0.9802, + "step": 8035 + }, + { + "epoch": 0.38076285240464347, + "grad_norm": 0.82421875, + "learning_rate": 0.00013667533215089907, + "loss": 0.9204, + "step": 8036 + }, + { + "epoch": 0.3808102345415778, + "grad_norm": 0.91015625, + "learning_rate": 0.00013666147689283712, + "loss": 1.2561, + "step": 8037 + }, + { + "epoch": 0.3808576166785122, + "grad_norm": 0.39453125, + "learning_rate": 0.00013664762082166268, + "loss": 0.0745, + "step": 8038 + }, + { + "epoch": 0.3809049988154466, + "grad_norm": 0.70703125, + "learning_rate": 0.00013663376393768304, + "loss": 1.1572, + "step": 8039 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.58203125, + "learning_rate": 0.00013661990624120552, + "loss": 0.8301, + "step": 8040 + }, + { + "epoch": 0.3809997630893153, + "grad_norm": 0.86328125, + "learning_rate": 0.0001366060477325375, + "loss": 1.0107, + "step": 8041 + }, + { + "epoch": 0.3810471452262497, + "grad_norm": 0.56640625, + "learning_rate": 0.00013659218841198634, + "loss": 0.5433, + "step": 8042 + }, + { + "epoch": 0.38109452736318405, + "grad_norm": 0.625, + "learning_rate": 0.00013657832827985942, + "loss": 1.0698, + "step": 8043 + }, + { + "epoch": 0.38114190950011845, + "grad_norm": 0.2421875, + "learning_rate": 0.00013656446733646414, + "loss": 0.0785, + "step": 8044 + }, + { + "epoch": 0.38118929163705284, + "grad_norm": 0.56640625, + "learning_rate": 0.00013655060558210788, + "loss": 1.1157, + "step": 8045 + }, + { + "epoch": 0.38123667377398723, + "grad_norm": 0.8125, + "learning_rate": 0.0001365367430170982, + "loss": 1.1293, + "step": 8046 + }, + { + "epoch": 0.38128405591092157, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001365228796417424, + "loss": 0.0096, + "step": 8047 + }, + { + "epoch": 0.38133143804785596, + "grad_norm": 0.63671875, + "learning_rate": 0.00013650901545634805, + "loss": 0.27, + "step": 8048 + }, + { + "epoch": 0.38137882018479036, + "grad_norm": 0.234375, + "learning_rate": 0.00013649515046122265, + "loss": 0.0726, + "step": 8049 + }, + { + "epoch": 0.3814262023217247, + "grad_norm": 0.055908203125, + "learning_rate": 0.0001364812846566737, + "loss": 0.0058, + "step": 8050 + }, + { + "epoch": 0.3814735844586591, + "grad_norm": 0.51953125, + "learning_rate": 0.00013646741804300866, + "loss": 1.2114, + "step": 8051 + }, + { + "epoch": 0.3815209665955935, + "grad_norm": 0.703125, + "learning_rate": 0.00013645355062053515, + "loss": 1.4249, + "step": 8052 + }, + { + "epoch": 0.3815683487325278, + "grad_norm": 0.44140625, + "learning_rate": 0.0001364396823895607, + "loss": 0.911, + "step": 8053 + }, + { + "epoch": 0.3816157308694622, + "grad_norm": 0.63671875, + "learning_rate": 0.00013642581335039295, + "loss": 0.7898, + "step": 8054 + }, + { + "epoch": 0.3816631130063966, + "grad_norm": 0.146484375, + "learning_rate": 0.00013641194350333944, + "loss": 0.0948, + "step": 8055 + }, + { + "epoch": 0.38171049514333094, + "grad_norm": 0.7890625, + "learning_rate": 0.0001363980728487078, + "loss": 1.5226, + "step": 8056 + }, + { + "epoch": 0.38175787728026533, + "grad_norm": 0.5546875, + "learning_rate": 0.0001363842013868057, + "loss": 0.9883, + "step": 8057 + }, + { + "epoch": 0.3818052594171997, + "grad_norm": 0.58984375, + "learning_rate": 0.00013637032911794074, + "loss": 0.7916, + "step": 8058 + }, + { + "epoch": 0.38185264155413406, + "grad_norm": 0.63671875, + "learning_rate": 0.00013635645604242064, + "loss": 1.3916, + "step": 8059 + }, + { + "epoch": 0.38190002369106846, + "grad_norm": 0.97265625, + "learning_rate": 0.00013634258216055304, + "loss": 0.1157, + "step": 8060 + }, + { + "epoch": 0.38194740582800285, + "grad_norm": 0.98828125, + "learning_rate": 0.0001363287074726457, + "loss": 0.939, + "step": 8061 + }, + { + "epoch": 0.38199478796493724, + "grad_norm": 0.61328125, + "learning_rate": 0.00013631483197900631, + "loss": 0.6919, + "step": 8062 + }, + { + "epoch": 0.3820421701018716, + "grad_norm": 0.69140625, + "learning_rate": 0.00013630095567994266, + "loss": 0.7033, + "step": 8063 + }, + { + "epoch": 0.382089552238806, + "grad_norm": 0.51171875, + "learning_rate": 0.00013628707857576245, + "loss": 0.672, + "step": 8064 + }, + { + "epoch": 0.38213693437574037, + "grad_norm": 0.69921875, + "learning_rate": 0.0001362732006667735, + "loss": 1.0353, + "step": 8065 + }, + { + "epoch": 0.3821843165126747, + "grad_norm": 0.74609375, + "learning_rate": 0.0001362593219532836, + "loss": 0.8868, + "step": 8066 + }, + { + "epoch": 0.3822316986496091, + "grad_norm": 0.65625, + "learning_rate": 0.00013624544243560056, + "loss": 0.1084, + "step": 8067 + }, + { + "epoch": 0.3822790807865435, + "grad_norm": 0.79296875, + "learning_rate": 0.0001362315621140322, + "loss": 0.9786, + "step": 8068 + }, + { + "epoch": 0.38232646292347783, + "grad_norm": 0.71875, + "learning_rate": 0.00013621768098888638, + "loss": 0.8478, + "step": 8069 + }, + { + "epoch": 0.3823738450604122, + "grad_norm": 0.80859375, + "learning_rate": 0.00013620379906047097, + "loss": 1.2938, + "step": 8070 + }, + { + "epoch": 0.3824212271973466, + "grad_norm": 0.25390625, + "learning_rate": 0.00013618991632909387, + "loss": 0.1151, + "step": 8071 + }, + { + "epoch": 0.38246860933428095, + "grad_norm": 0.73828125, + "learning_rate": 0.000136176032795063, + "loss": 0.6844, + "step": 8072 + }, + { + "epoch": 0.38251599147121534, + "grad_norm": 0.6171875, + "learning_rate": 0.00013616214845868624, + "loss": 0.9966, + "step": 8073 + }, + { + "epoch": 0.38256337360814974, + "grad_norm": 0.69140625, + "learning_rate": 0.00013614826332027154, + "loss": 1.0518, + "step": 8074 + }, + { + "epoch": 0.38261075574508413, + "grad_norm": 0.6328125, + "learning_rate": 0.00013613437738012684, + "loss": 1.0863, + "step": 8075 + }, + { + "epoch": 0.38265813788201847, + "grad_norm": 0.0037689208984375, + "learning_rate": 0.00013612049063856015, + "loss": 0.0003, + "step": 8076 + }, + { + "epoch": 0.38270552001895286, + "grad_norm": 0.67578125, + "learning_rate": 0.00013610660309587948, + "loss": 1.4448, + "step": 8077 + }, + { + "epoch": 0.38275290215588725, + "grad_norm": 0.79296875, + "learning_rate": 0.0001360927147523928, + "loss": 1.0429, + "step": 8078 + }, + { + "epoch": 0.3828002842928216, + "grad_norm": 0.59765625, + "learning_rate": 0.00013607882560840812, + "loss": 0.922, + "step": 8079 + }, + { + "epoch": 0.382847666429756, + "grad_norm": 0.16796875, + "learning_rate": 0.00013606493566423357, + "loss": 0.0116, + "step": 8080 + }, + { + "epoch": 0.3828950485666904, + "grad_norm": 0.6171875, + "learning_rate": 0.00013605104492017713, + "loss": 1.015, + "step": 8081 + }, + { + "epoch": 0.3829424307036247, + "grad_norm": 0.6015625, + "learning_rate": 0.00013603715337654694, + "loss": 0.0668, + "step": 8082 + }, + { + "epoch": 0.3829898128405591, + "grad_norm": 0.640625, + "learning_rate": 0.00013602326103365103, + "loss": 0.4468, + "step": 8083 + }, + { + "epoch": 0.3830371949774935, + "grad_norm": 0.58984375, + "learning_rate": 0.00013600936789179757, + "loss": 1.3271, + "step": 8084 + }, + { + "epoch": 0.38308457711442784, + "grad_norm": 0.72265625, + "learning_rate": 0.00013599547395129472, + "loss": 1.0323, + "step": 8085 + }, + { + "epoch": 0.38313195925136223, + "grad_norm": 0.9765625, + "learning_rate": 0.00013598157921245054, + "loss": 0.4701, + "step": 8086 + }, + { + "epoch": 0.3831793413882966, + "grad_norm": 0.671875, + "learning_rate": 0.0001359676836755733, + "loss": 0.132, + "step": 8087 + }, + { + "epoch": 0.38322672352523096, + "grad_norm": 0.859375, + "learning_rate": 0.00013595378734097114, + "loss": 1.1943, + "step": 8088 + }, + { + "epoch": 0.38327410566216535, + "grad_norm": 0.263671875, + "learning_rate": 0.00013593989020895228, + "loss": 0.0516, + "step": 8089 + }, + { + "epoch": 0.38332148779909975, + "grad_norm": 0.78125, + "learning_rate": 0.00013592599227982491, + "loss": 0.1347, + "step": 8090 + }, + { + "epoch": 0.38336886993603414, + "grad_norm": 0.67578125, + "learning_rate": 0.00013591209355389734, + "loss": 1.0963, + "step": 8091 + }, + { + "epoch": 0.3834162520729685, + "grad_norm": 0.8203125, + "learning_rate": 0.00013589819403147775, + "loss": 0.8227, + "step": 8092 + }, + { + "epoch": 0.38346363420990287, + "grad_norm": 0.609375, + "learning_rate": 0.00013588429371287449, + "loss": 1.1687, + "step": 8093 + }, + { + "epoch": 0.38351101634683726, + "grad_norm": 0.7734375, + "learning_rate": 0.00013587039259839578, + "loss": 1.2914, + "step": 8094 + }, + { + "epoch": 0.3835583984837716, + "grad_norm": 0.63671875, + "learning_rate": 0.00013585649068835, + "loss": 0.8823, + "step": 8095 + }, + { + "epoch": 0.383605780620706, + "grad_norm": 0.9921875, + "learning_rate": 0.00013584258798304542, + "loss": 0.7378, + "step": 8096 + }, + { + "epoch": 0.3836531627576404, + "grad_norm": 0.73046875, + "learning_rate": 0.00013582868448279044, + "loss": 1.3109, + "step": 8097 + }, + { + "epoch": 0.3837005448945747, + "grad_norm": 0.5859375, + "learning_rate": 0.00013581478018789337, + "loss": 0.4641, + "step": 8098 + }, + { + "epoch": 0.3837479270315091, + "grad_norm": 0.578125, + "learning_rate": 0.00013580087509866266, + "loss": 0.8215, + "step": 8099 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 0.035400390625, + "learning_rate": 0.00013578696921540665, + "loss": 0.0013, + "step": 8100 + }, + { + "epoch": 0.38384269130537785, + "grad_norm": 0.419921875, + "learning_rate": 0.0001357730625384338, + "loss": 0.022, + "step": 8101 + }, + { + "epoch": 0.38389007344231224, + "grad_norm": 0.33984375, + "learning_rate": 0.0001357591550680525, + "loss": 0.0354, + "step": 8102 + }, + { + "epoch": 0.38393745557924663, + "grad_norm": 0.62109375, + "learning_rate": 0.00013574524680457127, + "loss": 0.6458, + "step": 8103 + }, + { + "epoch": 0.383984837716181, + "grad_norm": 0.65234375, + "learning_rate": 0.00013573133774829853, + "loss": 1.2551, + "step": 8104 + }, + { + "epoch": 0.38403221985311536, + "grad_norm": 0.765625, + "learning_rate": 0.00013571742789954277, + "loss": 1.129, + "step": 8105 + }, + { + "epoch": 0.38407960199004976, + "grad_norm": 0.6484375, + "learning_rate": 0.00013570351725861247, + "loss": 1.4974, + "step": 8106 + }, + { + "epoch": 0.38412698412698415, + "grad_norm": 0.333984375, + "learning_rate": 0.00013568960582581623, + "loss": 0.1634, + "step": 8107 + }, + { + "epoch": 0.3841743662639185, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013567569360146254, + "loss": 0.0321, + "step": 8108 + }, + { + "epoch": 0.3842217484008529, + "grad_norm": 0.859375, + "learning_rate": 0.00013566178058585995, + "loss": 0.16, + "step": 8109 + }, + { + "epoch": 0.3842691305377873, + "grad_norm": 0.484375, + "learning_rate": 0.00013564786677931705, + "loss": 0.9219, + "step": 8110 + }, + { + "epoch": 0.3843165126747216, + "grad_norm": 0.609375, + "learning_rate": 0.00013563395218214245, + "loss": 0.9605, + "step": 8111 + }, + { + "epoch": 0.384363894811656, + "grad_norm": 0.08251953125, + "learning_rate": 0.00013562003679464476, + "loss": 0.0044, + "step": 8112 + }, + { + "epoch": 0.3844112769485904, + "grad_norm": 0.09326171875, + "learning_rate": 0.00013560612061713255, + "loss": 0.0093, + "step": 8113 + }, + { + "epoch": 0.38445865908552473, + "grad_norm": 0.310546875, + "learning_rate": 0.00013559220364991453, + "loss": 0.1572, + "step": 8114 + }, + { + "epoch": 0.3845060412224591, + "grad_norm": 0.1865234375, + "learning_rate": 0.00013557828589329937, + "loss": 0.1326, + "step": 8115 + }, + { + "epoch": 0.3845534233593935, + "grad_norm": 0.71484375, + "learning_rate": 0.0001355643673475957, + "loss": 0.883, + "step": 8116 + }, + { + "epoch": 0.38460080549632786, + "grad_norm": 0.78515625, + "learning_rate": 0.00013555044801311225, + "loss": 1.2444, + "step": 8117 + }, + { + "epoch": 0.38464818763326225, + "grad_norm": 0.75390625, + "learning_rate": 0.0001355365278901577, + "loss": 0.9022, + "step": 8118 + }, + { + "epoch": 0.38469556977019664, + "grad_norm": 0.859375, + "learning_rate": 0.00013552260697904088, + "loss": 0.7201, + "step": 8119 + }, + { + "epoch": 0.38474295190713104, + "grad_norm": 2.265625, + "learning_rate": 0.00013550868528007043, + "loss": 1.5274, + "step": 8120 + }, + { + "epoch": 0.3847903340440654, + "grad_norm": 0.8046875, + "learning_rate": 0.00013549476279355518, + "loss": 0.8623, + "step": 8121 + }, + { + "epoch": 0.38483771618099977, + "grad_norm": 0.67578125, + "learning_rate": 0.00013548083951980388, + "loss": 1.3328, + "step": 8122 + }, + { + "epoch": 0.38488509831793416, + "grad_norm": 0.1962890625, + "learning_rate": 0.00013546691545912538, + "loss": 0.1407, + "step": 8123 + }, + { + "epoch": 0.3849324804548685, + "grad_norm": 0.66015625, + "learning_rate": 0.00013545299061182842, + "loss": 1.1061, + "step": 8124 + }, + { + "epoch": 0.3849798625918029, + "grad_norm": 0.796875, + "learning_rate": 0.00013543906497822194, + "loss": 1.2707, + "step": 8125 + }, + { + "epoch": 0.3850272447287373, + "grad_norm": 0.2158203125, + "learning_rate": 0.00013542513855861475, + "loss": 0.1502, + "step": 8126 + }, + { + "epoch": 0.3850746268656716, + "grad_norm": 0.58203125, + "learning_rate": 0.0001354112113533157, + "loss": 0.1422, + "step": 8127 + }, + { + "epoch": 0.385122009002606, + "grad_norm": 0.6328125, + "learning_rate": 0.0001353972833626337, + "loss": 0.8849, + "step": 8128 + }, + { + "epoch": 0.3851693911395404, + "grad_norm": 0.431640625, + "learning_rate": 0.00013538335458687764, + "loss": 0.0291, + "step": 8129 + }, + { + "epoch": 0.38521677327647474, + "grad_norm": 0.7421875, + "learning_rate": 0.0001353694250263565, + "loss": 0.7615, + "step": 8130 + }, + { + "epoch": 0.38526415541340914, + "grad_norm": 0.6484375, + "learning_rate": 0.00013535549468137917, + "loss": 0.8799, + "step": 8131 + }, + { + "epoch": 0.38531153755034353, + "grad_norm": 0.70703125, + "learning_rate": 0.00013534156355225462, + "loss": 1.2742, + "step": 8132 + }, + { + "epoch": 0.3853589196872779, + "grad_norm": 0.287109375, + "learning_rate": 0.00013532763163929184, + "loss": 0.1134, + "step": 8133 + }, + { + "epoch": 0.38540630182421226, + "grad_norm": 0.53515625, + "learning_rate": 0.00013531369894279985, + "loss": 0.9775, + "step": 8134 + }, + { + "epoch": 0.38545368396114665, + "grad_norm": 0.6640625, + "learning_rate": 0.0001352997654630876, + "loss": 1.1179, + "step": 8135 + }, + { + "epoch": 0.38550106609808105, + "grad_norm": 0.54296875, + "learning_rate": 0.00013528583120046413, + "loss": 0.415, + "step": 8136 + }, + { + "epoch": 0.3855484482350154, + "grad_norm": 0.87890625, + "learning_rate": 0.00013527189615523854, + "loss": 1.18, + "step": 8137 + }, + { + "epoch": 0.3855958303719498, + "grad_norm": 0.54296875, + "learning_rate": 0.00013525796032771986, + "loss": 0.4005, + "step": 8138 + }, + { + "epoch": 0.38564321250888417, + "grad_norm": 0.5546875, + "learning_rate": 0.0001352440237182172, + "loss": 0.387, + "step": 8139 + }, + { + "epoch": 0.3856905946458185, + "grad_norm": 0.61328125, + "learning_rate": 0.00013523008632703958, + "loss": 0.237, + "step": 8140 + }, + { + "epoch": 0.3857379767827529, + "grad_norm": 0.31640625, + "learning_rate": 0.0001352161481544962, + "loss": 0.0283, + "step": 8141 + }, + { + "epoch": 0.3857853589196873, + "grad_norm": 0.1318359375, + "learning_rate": 0.00013520220920089617, + "loss": 0.0253, + "step": 8142 + }, + { + "epoch": 0.38583274105662163, + "grad_norm": 0.6484375, + "learning_rate": 0.00013518826946654864, + "loss": 1.2714, + "step": 8143 + }, + { + "epoch": 0.385880123193556, + "grad_norm": 0.205078125, + "learning_rate": 0.00013517432895176275, + "loss": 0.0174, + "step": 8144 + }, + { + "epoch": 0.3859275053304904, + "grad_norm": 0.56640625, + "learning_rate": 0.0001351603876568477, + "loss": 1.0646, + "step": 8145 + }, + { + "epoch": 0.38597488746742475, + "grad_norm": 0.8359375, + "learning_rate": 0.00013514644558211276, + "loss": 0.5291, + "step": 8146 + }, + { + "epoch": 0.38602226960435915, + "grad_norm": 0.62109375, + "learning_rate": 0.00013513250272786703, + "loss": 0.7843, + "step": 8147 + }, + { + "epoch": 0.38606965174129354, + "grad_norm": 0.8984375, + "learning_rate": 0.00013511855909441984, + "loss": 0.804, + "step": 8148 + }, + { + "epoch": 0.38611703387822793, + "grad_norm": 0.71484375, + "learning_rate": 0.00013510461468208042, + "loss": 1.0318, + "step": 8149 + }, + { + "epoch": 0.38616441601516227, + "grad_norm": 0.69140625, + "learning_rate": 0.00013509066949115802, + "loss": 1.0913, + "step": 8150 + }, + { + "epoch": 0.38621179815209666, + "grad_norm": 0.341796875, + "learning_rate": 0.00013507672352196197, + "loss": 0.0499, + "step": 8151 + }, + { + "epoch": 0.38625918028903106, + "grad_norm": 0.71875, + "learning_rate": 0.00013506277677480155, + "loss": 0.948, + "step": 8152 + }, + { + "epoch": 0.3863065624259654, + "grad_norm": 0.80859375, + "learning_rate": 0.0001350488292499861, + "loss": 0.6322, + "step": 8153 + }, + { + "epoch": 0.3863539445628998, + "grad_norm": 0.7265625, + "learning_rate": 0.00013503488094782494, + "loss": 0.7711, + "step": 8154 + }, + { + "epoch": 0.3864013266998342, + "grad_norm": 0.484375, + "learning_rate": 0.0001350209318686274, + "loss": 0.8854, + "step": 8155 + }, + { + "epoch": 0.3864487088367685, + "grad_norm": 0.59765625, + "learning_rate": 0.0001350069820127029, + "loss": 1.0564, + "step": 8156 + }, + { + "epoch": 0.3864960909737029, + "grad_norm": 0.671875, + "learning_rate": 0.00013499303138036087, + "loss": 0.8249, + "step": 8157 + }, + { + "epoch": 0.3865434731106373, + "grad_norm": 0.51171875, + "learning_rate": 0.00013497907997191065, + "loss": 0.7084, + "step": 8158 + }, + { + "epoch": 0.38659085524757164, + "grad_norm": 0.3046875, + "learning_rate": 0.00013496512778766174, + "loss": 0.0291, + "step": 8159 + }, + { + "epoch": 0.38663823738450603, + "grad_norm": 0.5703125, + "learning_rate": 0.00013495117482792348, + "loss": 0.7449, + "step": 8160 + }, + { + "epoch": 0.3866856195214404, + "grad_norm": 0.7109375, + "learning_rate": 0.0001349372210930054, + "loss": 1.0236, + "step": 8161 + }, + { + "epoch": 0.3867330016583748, + "grad_norm": 0.87109375, + "learning_rate": 0.000134923266583217, + "loss": 0.2611, + "step": 8162 + }, + { + "epoch": 0.38678038379530916, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001349093112988677, + "loss": 0.0238, + "step": 8163 + }, + { + "epoch": 0.38682776593224355, + "grad_norm": 0.703125, + "learning_rate": 0.0001348953552402671, + "loss": 0.9971, + "step": 8164 + }, + { + "epoch": 0.38687514806917794, + "grad_norm": 0.69921875, + "learning_rate": 0.0001348813984077247, + "loss": 1.1173, + "step": 8165 + }, + { + "epoch": 0.3869225302061123, + "grad_norm": 0.69140625, + "learning_rate": 0.00013486744080155, + "loss": 1.029, + "step": 8166 + }, + { + "epoch": 0.3869699123430467, + "grad_norm": 0.63671875, + "learning_rate": 0.00013485348242205263, + "loss": 1.1005, + "step": 8167 + }, + { + "epoch": 0.38701729447998107, + "grad_norm": 0.671875, + "learning_rate": 0.0001348395232695421, + "loss": 0.6973, + "step": 8168 + }, + { + "epoch": 0.3870646766169154, + "grad_norm": 0.578125, + "learning_rate": 0.0001348255633443281, + "loss": 1.0075, + "step": 8169 + }, + { + "epoch": 0.3871120587538498, + "grad_norm": 0.5390625, + "learning_rate": 0.00013481160264672016, + "loss": 0.0349, + "step": 8170 + }, + { + "epoch": 0.3871594408907842, + "grad_norm": 0.6328125, + "learning_rate": 0.00013479764117702798, + "loss": 1.034, + "step": 8171 + }, + { + "epoch": 0.3872068230277185, + "grad_norm": 0.62890625, + "learning_rate": 0.0001347836789355612, + "loss": 0.5948, + "step": 8172 + }, + { + "epoch": 0.3872542051646529, + "grad_norm": 0.0791015625, + "learning_rate": 0.00013476971592262944, + "loss": 0.006, + "step": 8173 + }, + { + "epoch": 0.3873015873015873, + "grad_norm": 0.57421875, + "learning_rate": 0.00013475575213854244, + "loss": 0.8148, + "step": 8174 + }, + { + "epoch": 0.38734896943852165, + "grad_norm": 0.69921875, + "learning_rate": 0.00013474178758360984, + "loss": 0.9542, + "step": 8175 + }, + { + "epoch": 0.38739635157545604, + "grad_norm": 0.62109375, + "learning_rate": 0.00013472782225814142, + "loss": 0.7074, + "step": 8176 + }, + { + "epoch": 0.38744373371239044, + "grad_norm": 0.5, + "learning_rate": 0.0001347138561624469, + "loss": 1.0238, + "step": 8177 + }, + { + "epoch": 0.38749111584932483, + "grad_norm": 0.62890625, + "learning_rate": 0.00013469988929683602, + "loss": 0.2586, + "step": 8178 + }, + { + "epoch": 0.38753849798625917, + "grad_norm": 0.64453125, + "learning_rate": 0.00013468592166161855, + "loss": 0.8672, + "step": 8179 + }, + { + "epoch": 0.38758588012319356, + "grad_norm": 0.69921875, + "learning_rate": 0.0001346719532571043, + "loss": 1.4888, + "step": 8180 + }, + { + "epoch": 0.38763326226012795, + "grad_norm": 0.765625, + "learning_rate": 0.00013465798408360304, + "loss": 0.8679, + "step": 8181 + }, + { + "epoch": 0.3876806443970623, + "grad_norm": 0.703125, + "learning_rate": 0.0001346440141414246, + "loss": 1.1073, + "step": 8182 + }, + { + "epoch": 0.3877280265339967, + "grad_norm": 0.5859375, + "learning_rate": 0.00013463004343087885, + "loss": 0.7844, + "step": 8183 + }, + { + "epoch": 0.3877754086709311, + "grad_norm": 0.6953125, + "learning_rate": 0.00013461607195227563, + "loss": 0.7868, + "step": 8184 + }, + { + "epoch": 0.3878227908078654, + "grad_norm": 0.21484375, + "learning_rate": 0.0001346020997059248, + "loss": 0.0389, + "step": 8185 + }, + { + "epoch": 0.3878701729447998, + "grad_norm": 0.275390625, + "learning_rate": 0.00013458812669213624, + "loss": 0.1952, + "step": 8186 + }, + { + "epoch": 0.3879175550817342, + "grad_norm": 0.6640625, + "learning_rate": 0.00013457415291121984, + "loss": 1.0734, + "step": 8187 + }, + { + "epoch": 0.38796493721866854, + "grad_norm": 0.63671875, + "learning_rate": 0.00013456017836348561, + "loss": 0.6853, + "step": 8188 + }, + { + "epoch": 0.38801231935560293, + "grad_norm": 0.201171875, + "learning_rate": 0.00013454620304924342, + "loss": 0.1069, + "step": 8189 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 0.65625, + "learning_rate": 0.00013453222696880324, + "loss": 0.9593, + "step": 8190 + }, + { + "epoch": 0.3881070836294717, + "grad_norm": 0.63671875, + "learning_rate": 0.00013451825012247506, + "loss": 1.3341, + "step": 8191 + }, + { + "epoch": 0.38815446576640605, + "grad_norm": 0.17578125, + "learning_rate": 0.00013450427251056885, + "loss": 0.0085, + "step": 8192 + }, + { + "epoch": 0.38820184790334045, + "grad_norm": 0.78515625, + "learning_rate": 0.0001344902941333946, + "loss": 0.3653, + "step": 8193 + }, + { + "epoch": 0.38824923004027484, + "grad_norm": 0.6640625, + "learning_rate": 0.00013447631499126242, + "loss": 1.182, + "step": 8194 + }, + { + "epoch": 0.3882966121772092, + "grad_norm": 0.41796875, + "learning_rate": 0.00013446233508448228, + "loss": 0.2139, + "step": 8195 + }, + { + "epoch": 0.38834399431414357, + "grad_norm": 0.84765625, + "learning_rate": 0.00013444835441336423, + "loss": 0.9673, + "step": 8196 + }, + { + "epoch": 0.38839137645107796, + "grad_norm": 0.66015625, + "learning_rate": 0.00013443437297821836, + "loss": 1.0825, + "step": 8197 + }, + { + "epoch": 0.3884387585880123, + "grad_norm": 0.84765625, + "learning_rate": 0.00013442039077935482, + "loss": 1.0597, + "step": 8198 + }, + { + "epoch": 0.3884861407249467, + "grad_norm": 0.71875, + "learning_rate": 0.00013440640781708365, + "loss": 0.9482, + "step": 8199 + }, + { + "epoch": 0.3885335228618811, + "grad_norm": 0.79296875, + "learning_rate": 0.00013439242409171503, + "loss": 0.6872, + "step": 8200 + }, + { + "epoch": 0.3885809049988154, + "grad_norm": 0.021484375, + "learning_rate": 0.00013437843960355903, + "loss": 0.0011, + "step": 8201 + }, + { + "epoch": 0.3886282871357498, + "grad_norm": 0.283203125, + "learning_rate": 0.00013436445435292588, + "loss": 0.1094, + "step": 8202 + }, + { + "epoch": 0.3886756692726842, + "grad_norm": 0.5, + "learning_rate": 0.00013435046834012575, + "loss": 0.1463, + "step": 8203 + }, + { + "epoch": 0.38872305140961855, + "grad_norm": 0.67578125, + "learning_rate": 0.0001343364815654688, + "loss": 0.9555, + "step": 8204 + }, + { + "epoch": 0.38877043354655294, + "grad_norm": 0.73828125, + "learning_rate": 0.00013432249402926526, + "loss": 0.7445, + "step": 8205 + }, + { + "epoch": 0.38881781568348733, + "grad_norm": 0.46484375, + "learning_rate": 0.0001343085057318254, + "loss": 0.3253, + "step": 8206 + }, + { + "epoch": 0.3888651978204217, + "grad_norm": 0.08935546875, + "learning_rate": 0.0001342945166734594, + "loss": 0.0077, + "step": 8207 + }, + { + "epoch": 0.38891257995735606, + "grad_norm": 0.302734375, + "learning_rate": 0.00013428052685447755, + "loss": 0.0575, + "step": 8208 + }, + { + "epoch": 0.38895996209429046, + "grad_norm": 0.9296875, + "learning_rate": 0.0001342665362751901, + "loss": 0.2605, + "step": 8209 + }, + { + "epoch": 0.38900734423122485, + "grad_norm": 0.5859375, + "learning_rate": 0.0001342525449359074, + "loss": 0.997, + "step": 8210 + }, + { + "epoch": 0.3890547263681592, + "grad_norm": 0.74609375, + "learning_rate": 0.00013423855283693973, + "loss": 1.1376, + "step": 8211 + }, + { + "epoch": 0.3891021085050936, + "grad_norm": 0.5234375, + "learning_rate": 0.00013422455997859744, + "loss": 0.828, + "step": 8212 + }, + { + "epoch": 0.389149490642028, + "grad_norm": 0.7578125, + "learning_rate": 0.00013421056636119086, + "loss": 1.1193, + "step": 8213 + }, + { + "epoch": 0.3891968727789623, + "grad_norm": 0.34375, + "learning_rate": 0.00013419657198503039, + "loss": 0.1448, + "step": 8214 + }, + { + "epoch": 0.3892442549158967, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00013418257685042634, + "loss": 0.0004, + "step": 8215 + }, + { + "epoch": 0.3892916370528311, + "grad_norm": 0.56640625, + "learning_rate": 0.00013416858095768915, + "loss": 0.1043, + "step": 8216 + }, + { + "epoch": 0.38933901918976543, + "grad_norm": 0.6640625, + "learning_rate": 0.0001341545843071292, + "loss": 1.0105, + "step": 8217 + }, + { + "epoch": 0.3893864013266998, + "grad_norm": 0.7421875, + "learning_rate": 0.000134140586899057, + "loss": 1.4491, + "step": 8218 + }, + { + "epoch": 0.3894337834636342, + "grad_norm": 0.58203125, + "learning_rate": 0.00013412658873378293, + "loss": 1.2056, + "step": 8219 + }, + { + "epoch": 0.3894811656005686, + "grad_norm": 0.57421875, + "learning_rate": 0.00013411258981161744, + "loss": 0.9109, + "step": 8220 + }, + { + "epoch": 0.38952854773750295, + "grad_norm": 0.52734375, + "learning_rate": 0.00013409859013287107, + "loss": 0.8034, + "step": 8221 + }, + { + "epoch": 0.38957592987443734, + "grad_norm": 0.71484375, + "learning_rate": 0.0001340845896978543, + "loss": 1.1814, + "step": 8222 + }, + { + "epoch": 0.38962331201137174, + "grad_norm": 0.8671875, + "learning_rate": 0.00013407058850687764, + "loss": 1.0716, + "step": 8223 + }, + { + "epoch": 0.3896706941483061, + "grad_norm": 0.56640625, + "learning_rate": 0.0001340565865602516, + "loss": 0.7023, + "step": 8224 + }, + { + "epoch": 0.38971807628524047, + "grad_norm": 0.2158203125, + "learning_rate": 0.00013404258385828674, + "loss": 0.1406, + "step": 8225 + }, + { + "epoch": 0.38976545842217486, + "grad_norm": 0.7421875, + "learning_rate": 0.00013402858040129363, + "loss": 1.024, + "step": 8226 + }, + { + "epoch": 0.3898128405591092, + "grad_norm": 0.6171875, + "learning_rate": 0.0001340145761895829, + "loss": 0.55, + "step": 8227 + }, + { + "epoch": 0.3898602226960436, + "grad_norm": 0.63671875, + "learning_rate": 0.00013400057122346505, + "loss": 0.8254, + "step": 8228 + }, + { + "epoch": 0.389907604832978, + "grad_norm": 0.6015625, + "learning_rate": 0.00013398656550325078, + "loss": 0.1839, + "step": 8229 + }, + { + "epoch": 0.3899549869699123, + "grad_norm": 0.78125, + "learning_rate": 0.00013397255902925065, + "loss": 0.7558, + "step": 8230 + }, + { + "epoch": 0.3900023691068467, + "grad_norm": 0.68359375, + "learning_rate": 0.00013395855180177535, + "loss": 0.9242, + "step": 8231 + }, + { + "epoch": 0.3900497512437811, + "grad_norm": 0.0245361328125, + "learning_rate": 0.00013394454382113557, + "loss": 0.0006, + "step": 8232 + }, + { + "epoch": 0.39009713338071544, + "grad_norm": 0.62890625, + "learning_rate": 0.00013393053508764196, + "loss": 0.9738, + "step": 8233 + }, + { + "epoch": 0.39014451551764984, + "grad_norm": 0.052734375, + "learning_rate": 0.00013391652560160522, + "loss": 0.0041, + "step": 8234 + }, + { + "epoch": 0.39019189765458423, + "grad_norm": 0.0034027099609375, + "learning_rate": 0.00013390251536333605, + "loss": 0.0002, + "step": 8235 + }, + { + "epoch": 0.3902392797915186, + "grad_norm": 0.6796875, + "learning_rate": 0.00013388850437314522, + "loss": 0.9698, + "step": 8236 + }, + { + "epoch": 0.39028666192845296, + "grad_norm": 0.87109375, + "learning_rate": 0.00013387449263134346, + "loss": 0.0449, + "step": 8237 + }, + { + "epoch": 0.39033404406538735, + "grad_norm": 0.66015625, + "learning_rate": 0.00013386048013824153, + "loss": 1.0048, + "step": 8238 + }, + { + "epoch": 0.39038142620232175, + "grad_norm": 0.65625, + "learning_rate": 0.0001338464668941502, + "loss": 1.4565, + "step": 8239 + }, + { + "epoch": 0.3904288083392561, + "grad_norm": 1.1015625, + "learning_rate": 0.00013383245289938031, + "loss": 0.6494, + "step": 8240 + }, + { + "epoch": 0.3904761904761905, + "grad_norm": 0.10546875, + "learning_rate": 0.00013381843815424264, + "loss": 0.0027, + "step": 8241 + }, + { + "epoch": 0.39052357261312487, + "grad_norm": 0.546875, + "learning_rate": 0.00013380442265904801, + "loss": 0.8223, + "step": 8242 + }, + { + "epoch": 0.3905709547500592, + "grad_norm": 0.78125, + "learning_rate": 0.00013379040641410734, + "loss": 0.9571, + "step": 8243 + }, + { + "epoch": 0.3906183368869936, + "grad_norm": 1.1015625, + "learning_rate": 0.0001337763894197314, + "loss": 1.1093, + "step": 8244 + }, + { + "epoch": 0.390665719023928, + "grad_norm": 0.6875, + "learning_rate": 0.00013376237167623116, + "loss": 1.0675, + "step": 8245 + }, + { + "epoch": 0.39071310116086233, + "grad_norm": 0.75, + "learning_rate": 0.00013374835318391746, + "loss": 0.8834, + "step": 8246 + }, + { + "epoch": 0.3907604832977967, + "grad_norm": 0.546875, + "learning_rate": 0.00013373433394310124, + "loss": 0.9922, + "step": 8247 + }, + { + "epoch": 0.3908078654347311, + "grad_norm": 1.0078125, + "learning_rate": 0.00013372031395409342, + "loss": 0.3326, + "step": 8248 + }, + { + "epoch": 0.3908552475716655, + "grad_norm": 0.671875, + "learning_rate": 0.00013370629321720498, + "loss": 0.9127, + "step": 8249 + }, + { + "epoch": 0.39090262970859985, + "grad_norm": 0.38671875, + "learning_rate": 0.00013369227173274682, + "loss": 0.0458, + "step": 8250 + }, + { + "epoch": 0.39095001184553424, + "grad_norm": 0.234375, + "learning_rate": 0.00013367824950102997, + "loss": 0.0229, + "step": 8251 + }, + { + "epoch": 0.39099739398246863, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001336642265223654, + "loss": 0.1301, + "step": 8252 + }, + { + "epoch": 0.39104477611940297, + "grad_norm": 0.5625, + "learning_rate": 0.0001336502027970642, + "loss": 0.134, + "step": 8253 + }, + { + "epoch": 0.39109215825633736, + "grad_norm": 0.7265625, + "learning_rate": 0.0001336361783254373, + "loss": 0.9979, + "step": 8254 + }, + { + "epoch": 0.39113954039327176, + "grad_norm": 0.55859375, + "learning_rate": 0.00013362215310779583, + "loss": 1.2613, + "step": 8255 + }, + { + "epoch": 0.3911869225302061, + "grad_norm": 0.125, + "learning_rate": 0.00013360812714445077, + "loss": 0.0034, + "step": 8256 + }, + { + "epoch": 0.3912343046671405, + "grad_norm": 0.271484375, + "learning_rate": 0.00013359410043571328, + "loss": 0.1307, + "step": 8257 + }, + { + "epoch": 0.3912816868040749, + "grad_norm": 0.56640625, + "learning_rate": 0.0001335800729818944, + "loss": 0.5249, + "step": 8258 + }, + { + "epoch": 0.3913290689410092, + "grad_norm": 0.11767578125, + "learning_rate": 0.00013356604478330527, + "loss": 0.0075, + "step": 8259 + }, + { + "epoch": 0.3913764510779436, + "grad_norm": 0.66796875, + "learning_rate": 0.00013355201584025706, + "loss": 1.3444, + "step": 8260 + }, + { + "epoch": 0.391423833214878, + "grad_norm": 0.023193359375, + "learning_rate": 0.00013353798615306086, + "loss": 0.0017, + "step": 8261 + }, + { + "epoch": 0.39147121535181234, + "grad_norm": 0.6484375, + "learning_rate": 0.00013352395572202783, + "loss": 1.1571, + "step": 8262 + }, + { + "epoch": 0.39151859748874673, + "grad_norm": 0.462890625, + "learning_rate": 0.00013350992454746918, + "loss": 0.082, + "step": 8263 + }, + { + "epoch": 0.3915659796256811, + "grad_norm": 0.78125, + "learning_rate": 0.0001334958926296961, + "loss": 1.4404, + "step": 8264 + }, + { + "epoch": 0.3916133617626155, + "grad_norm": 0.98046875, + "learning_rate": 0.0001334818599690198, + "loss": 0.0723, + "step": 8265 + }, + { + "epoch": 0.39166074389954986, + "grad_norm": 0.423828125, + "learning_rate": 0.0001334678265657515, + "loss": 0.2663, + "step": 8266 + }, + { + "epoch": 0.39170812603648425, + "grad_norm": 0.2001953125, + "learning_rate": 0.00013345379242020244, + "loss": 0.1553, + "step": 8267 + }, + { + "epoch": 0.39175550817341864, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013343975753268394, + "loss": 0.0766, + "step": 8268 + }, + { + "epoch": 0.391802890310353, + "grad_norm": 0.7265625, + "learning_rate": 0.00013342572190350722, + "loss": 0.6747, + "step": 8269 + }, + { + "epoch": 0.3918502724472874, + "grad_norm": 0.68359375, + "learning_rate": 0.00013341168553298357, + "loss": 0.9877, + "step": 8270 + }, + { + "epoch": 0.39189765458422177, + "grad_norm": 0.70703125, + "learning_rate": 0.00013339764842142433, + "loss": 1.0344, + "step": 8271 + }, + { + "epoch": 0.3919450367211561, + "grad_norm": 0.05712890625, + "learning_rate": 0.00013338361056914084, + "loss": 0.0048, + "step": 8272 + }, + { + "epoch": 0.3919924188580905, + "grad_norm": 0.6953125, + "learning_rate": 0.00013336957197644441, + "loss": 1.2426, + "step": 8273 + }, + { + "epoch": 0.3920398009950249, + "grad_norm": 0.62890625, + "learning_rate": 0.0001333555326436464, + "loss": 0.9186, + "step": 8274 + }, + { + "epoch": 0.3920871831319592, + "grad_norm": 0.58984375, + "learning_rate": 0.00013334149257105822, + "loss": 0.8823, + "step": 8275 + }, + { + "epoch": 0.3921345652688936, + "grad_norm": 0.9296875, + "learning_rate": 0.0001333274517589913, + "loss": 1.0725, + "step": 8276 + }, + { + "epoch": 0.392181947405828, + "grad_norm": 0.62890625, + "learning_rate": 0.00013331341020775695, + "loss": 0.9247, + "step": 8277 + }, + { + "epoch": 0.3922293295427624, + "grad_norm": 0.69921875, + "learning_rate": 0.00013329936791766665, + "loss": 0.9583, + "step": 8278 + }, + { + "epoch": 0.39227671167969674, + "grad_norm": 0.0037994384765625, + "learning_rate": 0.00013328532488903185, + "loss": 0.0002, + "step": 8279 + }, + { + "epoch": 0.39232409381663114, + "grad_norm": 0.5078125, + "learning_rate": 0.000133271281122164, + "loss": 0.6199, + "step": 8280 + }, + { + "epoch": 0.39237147595356553, + "grad_norm": 0.248046875, + "learning_rate": 0.00013325723661737455, + "loss": 0.0224, + "step": 8281 + }, + { + "epoch": 0.39241885809049987, + "grad_norm": 0.76171875, + "learning_rate": 0.00013324319137497504, + "loss": 1.1767, + "step": 8282 + }, + { + "epoch": 0.39246624022743426, + "grad_norm": 0.64453125, + "learning_rate": 0.00013322914539527694, + "loss": 0.775, + "step": 8283 + }, + { + "epoch": 0.39251362236436865, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013321509867859178, + "loss": 0.0219, + "step": 8284 + }, + { + "epoch": 0.392561004501303, + "grad_norm": 0.640625, + "learning_rate": 0.00013320105122523112, + "loss": 0.9422, + "step": 8285 + }, + { + "epoch": 0.3926083866382374, + "grad_norm": 0.62890625, + "learning_rate": 0.0001331870030355065, + "loss": 1.0467, + "step": 8286 + }, + { + "epoch": 0.3926557687751718, + "grad_norm": 1.0234375, + "learning_rate": 0.00013317295410972951, + "loss": 0.3509, + "step": 8287 + }, + { + "epoch": 0.3927031509121061, + "grad_norm": 0.765625, + "learning_rate": 0.00013315890444821175, + "loss": 1.0666, + "step": 8288 + }, + { + "epoch": 0.3927505330490405, + "grad_norm": 0.314453125, + "learning_rate": 0.00013314485405126477, + "loss": 0.1586, + "step": 8289 + }, + { + "epoch": 0.3927979151859749, + "grad_norm": 1.0703125, + "learning_rate": 0.00013313080291920025, + "loss": 0.3858, + "step": 8290 + }, + { + "epoch": 0.39284529732290924, + "grad_norm": 0.89453125, + "learning_rate": 0.00013311675105232983, + "loss": 0.2896, + "step": 8291 + }, + { + "epoch": 0.39289267945984363, + "grad_norm": 1.0625, + "learning_rate": 0.00013310269845096516, + "loss": 0.6466, + "step": 8292 + }, + { + "epoch": 0.392940061596778, + "grad_norm": 0.48046875, + "learning_rate": 0.00013308864511541786, + "loss": 0.6098, + "step": 8293 + }, + { + "epoch": 0.3929874437337124, + "grad_norm": 0.76171875, + "learning_rate": 0.00013307459104599964, + "loss": 1.1771, + "step": 8294 + }, + { + "epoch": 0.39303482587064675, + "grad_norm": 0.67578125, + "learning_rate": 0.00013306053624302227, + "loss": 0.7478, + "step": 8295 + }, + { + "epoch": 0.39308220800758115, + "grad_norm": 0.462890625, + "learning_rate": 0.00013304648070679737, + "loss": 0.0336, + "step": 8296 + }, + { + "epoch": 0.39312959014451554, + "grad_norm": 0.62890625, + "learning_rate": 0.00013303242443763675, + "loss": 0.872, + "step": 8297 + }, + { + "epoch": 0.3931769722814499, + "grad_norm": 0.60546875, + "learning_rate": 0.00013301836743585214, + "loss": 1.2149, + "step": 8298 + }, + { + "epoch": 0.39322435441838427, + "grad_norm": 0.9140625, + "learning_rate": 0.00013300430970175533, + "loss": 1.2844, + "step": 8299 + }, + { + "epoch": 0.39327173655531866, + "grad_norm": 0.74609375, + "learning_rate": 0.00013299025123565808, + "loss": 1.2886, + "step": 8300 + }, + { + "epoch": 0.393319118692253, + "grad_norm": 0.25, + "learning_rate": 0.00013297619203787216, + "loss": 0.0843, + "step": 8301 + }, + { + "epoch": 0.3933665008291874, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00013296213210870945, + "loss": 0.0025, + "step": 8302 + }, + { + "epoch": 0.3934138829661218, + "grad_norm": 0.8203125, + "learning_rate": 0.0001329480714484818, + "loss": 1.1182, + "step": 8303 + }, + { + "epoch": 0.3934612651030561, + "grad_norm": 0.73828125, + "learning_rate": 0.00013293401005750095, + "loss": 1.0311, + "step": 8304 + }, + { + "epoch": 0.3935086472399905, + "grad_norm": 0.1376953125, + "learning_rate": 0.00013291994793607884, + "loss": 0.0156, + "step": 8305 + }, + { + "epoch": 0.3935560293769249, + "grad_norm": 0.1923828125, + "learning_rate": 0.00013290588508452743, + "loss": 0.1274, + "step": 8306 + }, + { + "epoch": 0.3936034115138593, + "grad_norm": 0.59375, + "learning_rate": 0.00013289182150315844, + "loss": 0.3793, + "step": 8307 + }, + { + "epoch": 0.39365079365079364, + "grad_norm": 0.65234375, + "learning_rate": 0.00013287775719228394, + "loss": 0.9794, + "step": 8308 + }, + { + "epoch": 0.39369817578772803, + "grad_norm": 0.26171875, + "learning_rate": 0.0001328636921522158, + "loss": 0.1713, + "step": 8309 + }, + { + "epoch": 0.3937455579246624, + "grad_norm": 0.77734375, + "learning_rate": 0.00013284962638326596, + "loss": 1.7904, + "step": 8310 + }, + { + "epoch": 0.39379294006159676, + "grad_norm": 0.59375, + "learning_rate": 0.00013283555988574642, + "loss": 0.7938, + "step": 8311 + }, + { + "epoch": 0.39384032219853116, + "grad_norm": 0.8125, + "learning_rate": 0.00013282149265996912, + "loss": 0.818, + "step": 8312 + }, + { + "epoch": 0.39388770433546555, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013280742470624606, + "loss": 0.1211, + "step": 8313 + }, + { + "epoch": 0.3939350864723999, + "grad_norm": 0.361328125, + "learning_rate": 0.00013279335602488926, + "loss": 0.1617, + "step": 8314 + }, + { + "epoch": 0.3939824686093343, + "grad_norm": 0.06640625, + "learning_rate": 0.00013277928661621077, + "loss": 0.0098, + "step": 8315 + }, + { + "epoch": 0.3940298507462687, + "grad_norm": 0.578125, + "learning_rate": 0.00013276521648052266, + "loss": 0.802, + "step": 8316 + }, + { + "epoch": 0.394077232883203, + "grad_norm": 0.4921875, + "learning_rate": 0.0001327511456181369, + "loss": 0.1753, + "step": 8317 + }, + { + "epoch": 0.3941246150201374, + "grad_norm": 0.5390625, + "learning_rate": 0.0001327370740293656, + "loss": 1.001, + "step": 8318 + }, + { + "epoch": 0.3941719971570718, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001327230017145209, + "loss": 0.0278, + "step": 8319 + }, + { + "epoch": 0.39421937929400613, + "grad_norm": 0.78125, + "learning_rate": 0.00013270892867391486, + "loss": 1.0463, + "step": 8320 + }, + { + "epoch": 0.3942667614309405, + "grad_norm": 0.2314453125, + "learning_rate": 0.00013269485490785963, + "loss": 0.177, + "step": 8321 + }, + { + "epoch": 0.3943141435678749, + "grad_norm": 0.5625, + "learning_rate": 0.00013268078041666734, + "loss": 0.2885, + "step": 8322 + }, + { + "epoch": 0.3943615257048093, + "grad_norm": 0.70703125, + "learning_rate": 0.00013266670520065013, + "loss": 1.1419, + "step": 8323 + }, + { + "epoch": 0.39440890784174365, + "grad_norm": 0.70703125, + "learning_rate": 0.00013265262926012025, + "loss": 1.1623, + "step": 8324 + }, + { + "epoch": 0.39445628997867804, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001326385525953898, + "loss": 0.0795, + "step": 8325 + }, + { + "epoch": 0.39450367211561244, + "grad_norm": 0.734375, + "learning_rate": 0.000132624475206771, + "loss": 0.8408, + "step": 8326 + }, + { + "epoch": 0.3945510542525468, + "grad_norm": 0.625, + "learning_rate": 0.00013261039709457613, + "loss": 0.3733, + "step": 8327 + }, + { + "epoch": 0.39459843638948117, + "grad_norm": 0.244140625, + "learning_rate": 0.00013259631825911735, + "loss": 0.1416, + "step": 8328 + }, + { + "epoch": 0.39464581852641556, + "grad_norm": 0.65234375, + "learning_rate": 0.00013258223870070697, + "loss": 0.8125, + "step": 8329 + }, + { + "epoch": 0.3946932006633499, + "grad_norm": 0.47265625, + "learning_rate": 0.00013256815841965723, + "loss": 0.4684, + "step": 8330 + }, + { + "epoch": 0.3947405828002843, + "grad_norm": 0.59375, + "learning_rate": 0.00013255407741628045, + "loss": 1.0509, + "step": 8331 + }, + { + "epoch": 0.3947879649372187, + "grad_norm": 0.8671875, + "learning_rate": 0.00013253999569088888, + "loss": 0.8827, + "step": 8332 + }, + { + "epoch": 0.394835347074153, + "grad_norm": 0.984375, + "learning_rate": 0.00013252591324379486, + "loss": 0.098, + "step": 8333 + }, + { + "epoch": 0.3948827292110874, + "grad_norm": 0.154296875, + "learning_rate": 0.00013251183007531077, + "loss": 0.0248, + "step": 8334 + }, + { + "epoch": 0.3949301113480218, + "grad_norm": 0.7734375, + "learning_rate": 0.00013249774618574888, + "loss": 0.1278, + "step": 8335 + }, + { + "epoch": 0.3949774934849562, + "grad_norm": 0.48828125, + "learning_rate": 0.00013248366157542161, + "loss": 0.1062, + "step": 8336 + }, + { + "epoch": 0.39502487562189054, + "grad_norm": 0.78515625, + "learning_rate": 0.00013246957624464133, + "loss": 0.9185, + "step": 8337 + }, + { + "epoch": 0.39507225775882493, + "grad_norm": 0.6875, + "learning_rate": 0.00013245549019372043, + "loss": 1.1236, + "step": 8338 + }, + { + "epoch": 0.3951196398957593, + "grad_norm": 0.2490234375, + "learning_rate": 0.00013244140342297134, + "loss": 0.0376, + "step": 8339 + }, + { + "epoch": 0.39516702203269366, + "grad_norm": 0.66796875, + "learning_rate": 0.00013242731593270644, + "loss": 0.994, + "step": 8340 + }, + { + "epoch": 0.39521440416962805, + "grad_norm": 0.60546875, + "learning_rate": 0.00013241322772323825, + "loss": 0.9374, + "step": 8341 + }, + { + "epoch": 0.39526178630656245, + "grad_norm": 0.6875, + "learning_rate": 0.00013239913879487919, + "loss": 1.3045, + "step": 8342 + }, + { + "epoch": 0.3953091684434968, + "grad_norm": 1.0859375, + "learning_rate": 0.0001323850491479417, + "loss": 1.4344, + "step": 8343 + }, + { + "epoch": 0.3953565505804312, + "grad_norm": 0.578125, + "learning_rate": 0.00013237095878273835, + "loss": 0.5707, + "step": 8344 + }, + { + "epoch": 0.39540393271736557, + "grad_norm": 0.0595703125, + "learning_rate": 0.00013235686769958162, + "loss": 0.0062, + "step": 8345 + }, + { + "epoch": 0.3954513148542999, + "grad_norm": 0.349609375, + "learning_rate": 0.00013234277589878403, + "loss": 0.0154, + "step": 8346 + }, + { + "epoch": 0.3954986969912343, + "grad_norm": 0.6171875, + "learning_rate": 0.0001323286833806581, + "loss": 1.0657, + "step": 8347 + }, + { + "epoch": 0.3955460791281687, + "grad_norm": 0.5, + "learning_rate": 0.00013231459014551643, + "loss": 0.4864, + "step": 8348 + }, + { + "epoch": 0.39559346126510303, + "grad_norm": 0.5, + "learning_rate": 0.00013230049619367156, + "loss": 0.7659, + "step": 8349 + }, + { + "epoch": 0.3956408434020374, + "grad_norm": 0.70703125, + "learning_rate": 0.0001322864015254361, + "loss": 1.1576, + "step": 8350 + }, + { + "epoch": 0.3956882255389718, + "grad_norm": 0.70703125, + "learning_rate": 0.00013227230614112264, + "loss": 0.923, + "step": 8351 + }, + { + "epoch": 0.3957356076759062, + "grad_norm": 0.8828125, + "learning_rate": 0.00013225821004104378, + "loss": 1.0582, + "step": 8352 + }, + { + "epoch": 0.39578298981284055, + "grad_norm": 0.8359375, + "learning_rate": 0.0001322441132255122, + "loss": 0.5032, + "step": 8353 + }, + { + "epoch": 0.39583037194977494, + "grad_norm": 0.546875, + "learning_rate": 0.00013223001569484053, + "loss": 0.884, + "step": 8354 + }, + { + "epoch": 0.39587775408670933, + "grad_norm": 0.6953125, + "learning_rate": 0.00013221591744934144, + "loss": 0.9107, + "step": 8355 + }, + { + "epoch": 0.39592513622364367, + "grad_norm": 0.017822265625, + "learning_rate": 0.00013220181848932761, + "loss": 0.0011, + "step": 8356 + }, + { + "epoch": 0.39597251836057806, + "grad_norm": 0.251953125, + "learning_rate": 0.0001321877188151118, + "loss": 0.0106, + "step": 8357 + }, + { + "epoch": 0.39601990049751246, + "grad_norm": 0.61328125, + "learning_rate": 0.0001321736184270066, + "loss": 0.7856, + "step": 8358 + }, + { + "epoch": 0.3960672826344468, + "grad_norm": 0.12353515625, + "learning_rate": 0.00013215951732532482, + "loss": 0.0062, + "step": 8359 + }, + { + "epoch": 0.3961146647713812, + "grad_norm": 0.6875, + "learning_rate": 0.00013214541551037927, + "loss": 0.9263, + "step": 8360 + }, + { + "epoch": 0.3961620469083156, + "grad_norm": 0.5390625, + "learning_rate": 0.00013213131298248255, + "loss": 0.8682, + "step": 8361 + }, + { + "epoch": 0.3962094290452499, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001321172097419476, + "loss": 0.1484, + "step": 8362 + }, + { + "epoch": 0.3962568111821843, + "grad_norm": 0.8125, + "learning_rate": 0.00013210310578908713, + "loss": 1.3438, + "step": 8363 + }, + { + "epoch": 0.3963041933191187, + "grad_norm": 0.5, + "learning_rate": 0.00013208900112421395, + "loss": 1.0264, + "step": 8364 + }, + { + "epoch": 0.3963515754560531, + "grad_norm": 0.09326171875, + "learning_rate": 0.00013207489574764095, + "loss": 0.0142, + "step": 8365 + }, + { + "epoch": 0.39639895759298743, + "grad_norm": 0.63671875, + "learning_rate": 0.00013206078965968089, + "loss": 1.2929, + "step": 8366 + }, + { + "epoch": 0.3964463397299218, + "grad_norm": 0.8359375, + "learning_rate": 0.00013204668286064664, + "loss": 0.9476, + "step": 8367 + }, + { + "epoch": 0.3964937218668562, + "grad_norm": 0.380859375, + "learning_rate": 0.00013203257535085114, + "loss": 0.2027, + "step": 8368 + }, + { + "epoch": 0.39654110400379056, + "grad_norm": 0.6171875, + "learning_rate": 0.00013201846713060725, + "loss": 0.9504, + "step": 8369 + }, + { + "epoch": 0.39658848614072495, + "grad_norm": 0.68359375, + "learning_rate": 0.0001320043582002278, + "loss": 0.1219, + "step": 8370 + }, + { + "epoch": 0.39663586827765934, + "grad_norm": 0.58984375, + "learning_rate": 0.00013199024856002585, + "loss": 0.7931, + "step": 8371 + }, + { + "epoch": 0.3966832504145937, + "grad_norm": 0.625, + "learning_rate": 0.00013197613821031422, + "loss": 1.1369, + "step": 8372 + }, + { + "epoch": 0.3967306325515281, + "grad_norm": 0.57421875, + "learning_rate": 0.00013196202715140591, + "loss": 0.7452, + "step": 8373 + }, + { + "epoch": 0.39677801468846247, + "grad_norm": 0.7421875, + "learning_rate": 0.0001319479153836139, + "loss": 0.8202, + "step": 8374 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 0.71484375, + "learning_rate": 0.00013193380290725111, + "loss": 1.3312, + "step": 8375 + }, + { + "epoch": 0.3968727789623312, + "grad_norm": 0.6640625, + "learning_rate": 0.00013191968972263063, + "loss": 0.9073, + "step": 8376 + }, + { + "epoch": 0.3969201610992656, + "grad_norm": 0.6640625, + "learning_rate": 0.00013190557583006538, + "loss": 1.0335, + "step": 8377 + }, + { + "epoch": 0.3969675432361999, + "grad_norm": 0.7109375, + "learning_rate": 0.00013189146122986848, + "loss": 0.2046, + "step": 8378 + }, + { + "epoch": 0.3970149253731343, + "grad_norm": 0.4453125, + "learning_rate": 0.00013187734592235296, + "loss": 0.1444, + "step": 8379 + }, + { + "epoch": 0.3970623075100687, + "grad_norm": 1.1484375, + "learning_rate": 0.00013186322990783186, + "loss": 1.1527, + "step": 8380 + }, + { + "epoch": 0.3971096896470031, + "grad_norm": 0.78515625, + "learning_rate": 0.00013184911318661824, + "loss": 1.0669, + "step": 8381 + }, + { + "epoch": 0.39715707178393744, + "grad_norm": 0.00848388671875, + "learning_rate": 0.0001318349957590252, + "loss": 0.0004, + "step": 8382 + }, + { + "epoch": 0.39720445392087184, + "grad_norm": 0.4609375, + "learning_rate": 0.00013182087762536588, + "loss": 0.2528, + "step": 8383 + }, + { + "epoch": 0.39725183605780623, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013180675878595336, + "loss": 0.0269, + "step": 8384 + }, + { + "epoch": 0.39729921819474057, + "grad_norm": 0.7109375, + "learning_rate": 0.00013179263924110085, + "loss": 1.1373, + "step": 8385 + }, + { + "epoch": 0.39734660033167496, + "grad_norm": 0.66015625, + "learning_rate": 0.00013177851899112147, + "loss": 0.3504, + "step": 8386 + }, + { + "epoch": 0.39739398246860935, + "grad_norm": 0.8046875, + "learning_rate": 0.00013176439803632835, + "loss": 1.5232, + "step": 8387 + }, + { + "epoch": 0.3974413646055437, + "grad_norm": 0.65625, + "learning_rate": 0.00013175027637703473, + "loss": 0.8524, + "step": 8388 + }, + { + "epoch": 0.3974887467424781, + "grad_norm": 0.671875, + "learning_rate": 0.0001317361540135538, + "loss": 0.6525, + "step": 8389 + }, + { + "epoch": 0.3975361288794125, + "grad_norm": 0.67578125, + "learning_rate": 0.00013172203094619878, + "loss": 0.2698, + "step": 8390 + }, + { + "epoch": 0.3975835110163468, + "grad_norm": 0.55859375, + "learning_rate": 0.00013170790717528292, + "loss": 1.1265, + "step": 8391 + }, + { + "epoch": 0.3976308931532812, + "grad_norm": 0.171875, + "learning_rate": 0.0001316937827011194, + "loss": 0.0248, + "step": 8392 + }, + { + "epoch": 0.3976782752902156, + "grad_norm": 0.671875, + "learning_rate": 0.00013167965752402158, + "loss": 0.6908, + "step": 8393 + }, + { + "epoch": 0.39772565742714994, + "grad_norm": 0.63671875, + "learning_rate": 0.0001316655316443027, + "loss": 1.1508, + "step": 8394 + }, + { + "epoch": 0.39777303956408433, + "grad_norm": 0.76171875, + "learning_rate": 0.00013165140506227606, + "loss": 1.5209, + "step": 8395 + }, + { + "epoch": 0.3978204217010187, + "grad_norm": 0.62890625, + "learning_rate": 0.00013163727777825497, + "loss": 1.3025, + "step": 8396 + }, + { + "epoch": 0.3978678038379531, + "grad_norm": 0.69921875, + "learning_rate": 0.00013162314979255272, + "loss": 0.9712, + "step": 8397 + }, + { + "epoch": 0.39791518597488745, + "grad_norm": 0.95703125, + "learning_rate": 0.00013160902110548274, + "loss": 0.5788, + "step": 8398 + }, + { + "epoch": 0.39796256811182185, + "grad_norm": 0.12353515625, + "learning_rate": 0.0001315948917173583, + "loss": 0.0088, + "step": 8399 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 0.5546875, + "learning_rate": 0.00013158076162849281, + "loss": 1.0409, + "step": 8400 + }, + { + "epoch": 0.3980573323856906, + "grad_norm": 0.10546875, + "learning_rate": 0.0001315666308391997, + "loss": 0.0045, + "step": 8401 + }, + { + "epoch": 0.39810471452262497, + "grad_norm": 0.58203125, + "learning_rate": 0.00013155249934979234, + "loss": 0.9082, + "step": 8402 + }, + { + "epoch": 0.39815209665955936, + "grad_norm": 0.70703125, + "learning_rate": 0.00013153836716058414, + "loss": 1.0735, + "step": 8403 + }, + { + "epoch": 0.3981994787964937, + "grad_norm": 0.2470703125, + "learning_rate": 0.00013152423427188856, + "loss": 0.0275, + "step": 8404 + }, + { + "epoch": 0.3982468609334281, + "grad_norm": 0.5859375, + "learning_rate": 0.000131510100684019, + "loss": 0.9725, + "step": 8405 + }, + { + "epoch": 0.3982942430703625, + "grad_norm": 0.56640625, + "learning_rate": 0.00013149596639728904, + "loss": 0.4258, + "step": 8406 + }, + { + "epoch": 0.3983416252072968, + "grad_norm": 0.70703125, + "learning_rate": 0.00013148183141201204, + "loss": 1.2988, + "step": 8407 + }, + { + "epoch": 0.3983890073442312, + "grad_norm": 0.267578125, + "learning_rate": 0.00013146769572850158, + "loss": 0.0233, + "step": 8408 + }, + { + "epoch": 0.3984363894811656, + "grad_norm": 0.5078125, + "learning_rate": 0.00013145355934707112, + "loss": 0.9486, + "step": 8409 + }, + { + "epoch": 0.3984837716181, + "grad_norm": 0.515625, + "learning_rate": 0.00013143942226803427, + "loss": 0.1009, + "step": 8410 + }, + { + "epoch": 0.39853115375503434, + "grad_norm": 0.51171875, + "learning_rate": 0.00013142528449170446, + "loss": 0.6194, + "step": 8411 + }, + { + "epoch": 0.39857853589196873, + "grad_norm": 0.6640625, + "learning_rate": 0.00013141114601839532, + "loss": 1.2989, + "step": 8412 + }, + { + "epoch": 0.3986259180289031, + "grad_norm": 0.6171875, + "learning_rate": 0.00013139700684842043, + "loss": 0.5218, + "step": 8413 + }, + { + "epoch": 0.39867330016583746, + "grad_norm": 0.69140625, + "learning_rate": 0.0001313828669820934, + "loss": 1.2017, + "step": 8414 + }, + { + "epoch": 0.39872068230277186, + "grad_norm": 0.8046875, + "learning_rate": 0.00013136872641972776, + "loss": 0.1648, + "step": 8415 + }, + { + "epoch": 0.39876806443970625, + "grad_norm": 0.6484375, + "learning_rate": 0.00013135458516163722, + "loss": 0.2398, + "step": 8416 + }, + { + "epoch": 0.3988154465766406, + "grad_norm": 0.65234375, + "learning_rate": 0.00013134044320813537, + "loss": 1.25, + "step": 8417 + }, + { + "epoch": 0.398862828713575, + "grad_norm": 0.6640625, + "learning_rate": 0.0001313263005595359, + "loss": 0.9613, + "step": 8418 + }, + { + "epoch": 0.3989102108505094, + "grad_norm": 0.74609375, + "learning_rate": 0.0001313121572161524, + "loss": 1.1156, + "step": 8419 + }, + { + "epoch": 0.3989575929874437, + "grad_norm": 1.25, + "learning_rate": 0.00013129801317829858, + "loss": 0.5553, + "step": 8420 + }, + { + "epoch": 0.3990049751243781, + "grad_norm": 0.671875, + "learning_rate": 0.00013128386844628824, + "loss": 1.3128, + "step": 8421 + }, + { + "epoch": 0.3990523572613125, + "grad_norm": 0.84375, + "learning_rate": 0.00013126972302043497, + "loss": 1.3793, + "step": 8422 + }, + { + "epoch": 0.39909973939824683, + "grad_norm": 0.6796875, + "learning_rate": 0.00013125557690105258, + "loss": 1.0913, + "step": 8423 + }, + { + "epoch": 0.3991471215351812, + "grad_norm": 0.08984375, + "learning_rate": 0.00013124143008845474, + "loss": 0.0059, + "step": 8424 + }, + { + "epoch": 0.3991945036721156, + "grad_norm": 0.87890625, + "learning_rate": 0.00013122728258295528, + "loss": 1.249, + "step": 8425 + }, + { + "epoch": 0.39924188580905, + "grad_norm": 0.69921875, + "learning_rate": 0.00013121313438486799, + "loss": 1.2313, + "step": 8426 + }, + { + "epoch": 0.39928926794598435, + "grad_norm": 0.318359375, + "learning_rate": 0.0001311989854945066, + "loss": 0.0372, + "step": 8427 + }, + { + "epoch": 0.39933665008291874, + "grad_norm": 0.03076171875, + "learning_rate": 0.00013118483591218494, + "loss": 0.0015, + "step": 8428 + }, + { + "epoch": 0.39938403221985314, + "grad_norm": 0.671875, + "learning_rate": 0.00013117068563821684, + "loss": 1.2636, + "step": 8429 + }, + { + "epoch": 0.3994314143567875, + "grad_norm": 0.65625, + "learning_rate": 0.00013115653467291613, + "loss": 1.565, + "step": 8430 + }, + { + "epoch": 0.39947879649372187, + "grad_norm": 0.2265625, + "learning_rate": 0.00013114238301659668, + "loss": 0.0358, + "step": 8431 + }, + { + "epoch": 0.39952617863065626, + "grad_norm": 0.66015625, + "learning_rate": 0.0001311282306695723, + "loss": 0.8851, + "step": 8432 + }, + { + "epoch": 0.3995735607675906, + "grad_norm": 0.55078125, + "learning_rate": 0.00013111407763215696, + "loss": 0.1173, + "step": 8433 + }, + { + "epoch": 0.399620942904525, + "grad_norm": 0.54296875, + "learning_rate": 0.00013109992390466455, + "loss": 0.9858, + "step": 8434 + }, + { + "epoch": 0.3996683250414594, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013108576948740893, + "loss": 0.1175, + "step": 8435 + }, + { + "epoch": 0.3997157071783937, + "grad_norm": 0.81640625, + "learning_rate": 0.00013107161438070405, + "loss": 0.6959, + "step": 8436 + }, + { + "epoch": 0.3997630893153281, + "grad_norm": 0.61328125, + "learning_rate": 0.00013105745858486384, + "loss": 0.694, + "step": 8437 + }, + { + "epoch": 0.3998104714522625, + "grad_norm": 0.216796875, + "learning_rate": 0.0001310433021002023, + "loss": 0.0261, + "step": 8438 + }, + { + "epoch": 0.3998578535891969, + "grad_norm": 0.73046875, + "learning_rate": 0.00013102914492703335, + "loss": 0.6536, + "step": 8439 + }, + { + "epoch": 0.39990523572613124, + "grad_norm": 0.62890625, + "learning_rate": 0.00013101498706567105, + "loss": 1.1436, + "step": 8440 + }, + { + "epoch": 0.39995261786306563, + "grad_norm": 0.76171875, + "learning_rate": 0.00013100082851642939, + "loss": 1.562, + "step": 8441 + }, + { + "epoch": 0.4, + "grad_norm": 0.287109375, + "learning_rate": 0.00013098666927962235, + "loss": 0.1947, + "step": 8442 + }, + { + "epoch": 0.40004738213693436, + "grad_norm": 0.2041015625, + "learning_rate": 0.00013097250935556397, + "loss": 0.1247, + "step": 8443 + }, + { + "epoch": 0.40009476427386875, + "grad_norm": 0.53125, + "learning_rate": 0.00013095834874456835, + "loss": 0.6569, + "step": 8444 + }, + { + "epoch": 0.40014214641080315, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001309441874469495, + "loss": 0.1322, + "step": 8445 + }, + { + "epoch": 0.4001895285477375, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013093002546302158, + "loss": 0.1332, + "step": 8446 + }, + { + "epoch": 0.4002369106846719, + "grad_norm": 0.66796875, + "learning_rate": 0.0001309158627930986, + "loss": 1.1967, + "step": 8447 + }, + { + "epoch": 0.40028429282160627, + "grad_norm": 0.51953125, + "learning_rate": 0.00013090169943749476, + "loss": 0.4765, + "step": 8448 + }, + { + "epoch": 0.4003316749585406, + "grad_norm": 0.609375, + "learning_rate": 0.00013088753539652412, + "loss": 0.6844, + "step": 8449 + }, + { + "epoch": 0.400379057095475, + "grad_norm": 0.71484375, + "learning_rate": 0.00013087337067050082, + "loss": 0.9418, + "step": 8450 + }, + { + "epoch": 0.4004264392324094, + "grad_norm": 1.171875, + "learning_rate": 0.00013085920525973904, + "loss": 0.6466, + "step": 8451 + }, + { + "epoch": 0.40047382136934373, + "grad_norm": 0.61328125, + "learning_rate": 0.00013084503916455301, + "loss": 0.9358, + "step": 8452 + }, + { + "epoch": 0.4005212035062781, + "grad_norm": 0.71875, + "learning_rate": 0.00013083087238525685, + "loss": 1.5236, + "step": 8453 + }, + { + "epoch": 0.4005685856432125, + "grad_norm": 0.94140625, + "learning_rate": 0.00013081670492216474, + "loss": 0.8968, + "step": 8454 + }, + { + "epoch": 0.4006159677801469, + "grad_norm": 0.7109375, + "learning_rate": 0.00013080253677559095, + "loss": 0.7483, + "step": 8455 + }, + { + "epoch": 0.40066334991708125, + "grad_norm": 0.30078125, + "learning_rate": 0.00013078836794584971, + "loss": 0.0287, + "step": 8456 + }, + { + "epoch": 0.40071073205401564, + "grad_norm": 0.5859375, + "learning_rate": 0.0001307741984332553, + "loss": 0.7295, + "step": 8457 + }, + { + "epoch": 0.40075811419095003, + "grad_norm": 0.6328125, + "learning_rate": 0.0001307600282381219, + "loss": 0.9218, + "step": 8458 + }, + { + "epoch": 0.40080549632788437, + "grad_norm": 0.68359375, + "learning_rate": 0.00013074585736076386, + "loss": 0.6534, + "step": 8459 + }, + { + "epoch": 0.40085287846481876, + "grad_norm": 0.73828125, + "learning_rate": 0.00013073168580149546, + "loss": 1.2929, + "step": 8460 + }, + { + "epoch": 0.40090026060175316, + "grad_norm": 0.07470703125, + "learning_rate": 0.00013071751356063098, + "loss": 0.0068, + "step": 8461 + }, + { + "epoch": 0.4009476427386875, + "grad_norm": 0.8046875, + "learning_rate": 0.00013070334063848478, + "loss": 1.1136, + "step": 8462 + }, + { + "epoch": 0.4009950248756219, + "grad_norm": 0.62890625, + "learning_rate": 0.0001306891670353712, + "loss": 0.0706, + "step": 8463 + }, + { + "epoch": 0.4010424070125563, + "grad_norm": 0.16015625, + "learning_rate": 0.00013067499275160459, + "loss": 0.129, + "step": 8464 + }, + { + "epoch": 0.4010897891494906, + "grad_norm": 0.2578125, + "learning_rate": 0.00013066081778749928, + "loss": 0.1445, + "step": 8465 + }, + { + "epoch": 0.401137171286425, + "grad_norm": 0.55859375, + "learning_rate": 0.0001306466421433697, + "loss": 0.8519, + "step": 8466 + }, + { + "epoch": 0.4011845534233594, + "grad_norm": 0.5546875, + "learning_rate": 0.00013063246581953027, + "loss": 0.8091, + "step": 8467 + }, + { + "epoch": 0.4012319355602938, + "grad_norm": 0.279296875, + "learning_rate": 0.00013061828881629533, + "loss": 0.0299, + "step": 8468 + }, + { + "epoch": 0.40127931769722813, + "grad_norm": 0.55078125, + "learning_rate": 0.00013060411113397936, + "loss": 0.8453, + "step": 8469 + }, + { + "epoch": 0.4013266998341625, + "grad_norm": 0.73046875, + "learning_rate": 0.00013058993277289684, + "loss": 1.1901, + "step": 8470 + }, + { + "epoch": 0.4013740819710969, + "grad_norm": 1.1875, + "learning_rate": 0.0001305757537333622, + "loss": 1.4622, + "step": 8471 + }, + { + "epoch": 0.40142146410803126, + "grad_norm": 0.455078125, + "learning_rate": 0.00013056157401568984, + "loss": 0.3346, + "step": 8472 + }, + { + "epoch": 0.40146884624496565, + "grad_norm": 0.65625, + "learning_rate": 0.00013054739362019442, + "loss": 0.9678, + "step": 8473 + }, + { + "epoch": 0.40151622838190004, + "grad_norm": 0.56640625, + "learning_rate": 0.00013053321254719028, + "loss": 0.4233, + "step": 8474 + }, + { + "epoch": 0.4015636105188344, + "grad_norm": 0.52734375, + "learning_rate": 0.00013051903079699205, + "loss": 0.9604, + "step": 8475 + }, + { + "epoch": 0.4016109926557688, + "grad_norm": 0.625, + "learning_rate": 0.00013050484836991418, + "loss": 0.8562, + "step": 8476 + }, + { + "epoch": 0.40165837479270317, + "grad_norm": 0.5703125, + "learning_rate": 0.00013049066526627128, + "loss": 1.1919, + "step": 8477 + }, + { + "epoch": 0.4017057569296375, + "grad_norm": 0.56640625, + "learning_rate": 0.00013047648148637787, + "loss": 0.9606, + "step": 8478 + }, + { + "epoch": 0.4017531390665719, + "grad_norm": 0.7890625, + "learning_rate": 0.00013046229703054862, + "loss": 1.0702, + "step": 8479 + }, + { + "epoch": 0.4018005212035063, + "grad_norm": 0.6171875, + "learning_rate": 0.00013044811189909803, + "loss": 1.1904, + "step": 8480 + }, + { + "epoch": 0.4018479033404406, + "grad_norm": 1.0546875, + "learning_rate": 0.00013043392609234078, + "loss": 0.7271, + "step": 8481 + }, + { + "epoch": 0.401895285477375, + "grad_norm": 0.86328125, + "learning_rate": 0.00013041973961059147, + "loss": 1.1301, + "step": 8482 + }, + { + "epoch": 0.4019426676143094, + "grad_norm": 0.39453125, + "learning_rate": 0.0001304055524541647, + "loss": 0.0194, + "step": 8483 + }, + { + "epoch": 0.4019900497512438, + "grad_norm": 0.67578125, + "learning_rate": 0.0001303913646233752, + "loss": 0.6856, + "step": 8484 + }, + { + "epoch": 0.40203743188817814, + "grad_norm": 0.55859375, + "learning_rate": 0.00013037717611853758, + "loss": 1.0049, + "step": 8485 + }, + { + "epoch": 0.40208481402511254, + "grad_norm": 0.67578125, + "learning_rate": 0.00013036298693996657, + "loss": 0.7963, + "step": 8486 + }, + { + "epoch": 0.40213219616204693, + "grad_norm": 0.546875, + "learning_rate": 0.00013034879708797683, + "loss": 0.5421, + "step": 8487 + }, + { + "epoch": 0.40217957829898127, + "grad_norm": 0.65234375, + "learning_rate": 0.00013033460656288308, + "loss": 1.0176, + "step": 8488 + }, + { + "epoch": 0.40222696043591566, + "grad_norm": 0.60546875, + "learning_rate": 0.0001303204153650001, + "loss": 0.8604, + "step": 8489 + }, + { + "epoch": 0.40227434257285005, + "grad_norm": 0.71484375, + "learning_rate": 0.00013030622349464262, + "loss": 0.9674, + "step": 8490 + }, + { + "epoch": 0.4023217247097844, + "grad_norm": 0.6171875, + "learning_rate": 0.00013029203095212534, + "loss": 0.9742, + "step": 8491 + }, + { + "epoch": 0.4023691068467188, + "grad_norm": 0.22265625, + "learning_rate": 0.0001302778377377631, + "loss": 0.034, + "step": 8492 + }, + { + "epoch": 0.4024164889836532, + "grad_norm": 0.66796875, + "learning_rate": 0.00013026364385187065, + "loss": 1.0015, + "step": 8493 + }, + { + "epoch": 0.4024638711205875, + "grad_norm": 0.86328125, + "learning_rate": 0.00013024944929476283, + "loss": 1.2935, + "step": 8494 + }, + { + "epoch": 0.4025112532575219, + "grad_norm": 0.6875, + "learning_rate": 0.00013023525406675445, + "loss": 1.0523, + "step": 8495 + }, + { + "epoch": 0.4025586353944563, + "grad_norm": 0.59765625, + "learning_rate": 0.00013022105816816034, + "loss": 1.1254, + "step": 8496 + }, + { + "epoch": 0.4026060175313907, + "grad_norm": 0.6328125, + "learning_rate": 0.00013020686159929536, + "loss": 0.8375, + "step": 8497 + }, + { + "epoch": 0.40265339966832503, + "grad_norm": 0.71484375, + "learning_rate": 0.00013019266436047438, + "loss": 1.2827, + "step": 8498 + }, + { + "epoch": 0.4027007818052594, + "grad_norm": 0.47265625, + "learning_rate": 0.00013017846645201221, + "loss": 0.343, + "step": 8499 + }, + { + "epoch": 0.4027481639421938, + "grad_norm": 0.7421875, + "learning_rate": 0.00013016426787422383, + "loss": 1.2327, + "step": 8500 + }, + { + "epoch": 0.40279554607912815, + "grad_norm": 0.82421875, + "learning_rate": 0.0001301500686274241, + "loss": 0.943, + "step": 8501 + }, + { + "epoch": 0.40284292821606255, + "grad_norm": 0.6640625, + "learning_rate": 0.00013013586871192797, + "loss": 0.0642, + "step": 8502 + }, + { + "epoch": 0.40289031035299694, + "grad_norm": 0.1884765625, + "learning_rate": 0.00013012166812805039, + "loss": 0.1377, + "step": 8503 + }, + { + "epoch": 0.4029376924899313, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001301074668761063, + "loss": 0.0106, + "step": 8504 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.08056640625, + "learning_rate": 0.00013009326495641061, + "loss": 0.0096, + "step": 8505 + }, + { + "epoch": 0.40303245676380006, + "grad_norm": 0.7109375, + "learning_rate": 0.0001300790623692784, + "loss": 1.1677, + "step": 8506 + }, + { + "epoch": 0.4030798389007344, + "grad_norm": 0.61328125, + "learning_rate": 0.0001300648591150246, + "loss": 0.7628, + "step": 8507 + }, + { + "epoch": 0.4031272210376688, + "grad_norm": 0.1865234375, + "learning_rate": 0.00013005065519396429, + "loss": 0.0236, + "step": 8508 + }, + { + "epoch": 0.4031746031746032, + "grad_norm": 0.6328125, + "learning_rate": 0.00013003645060641244, + "loss": 1.0059, + "step": 8509 + }, + { + "epoch": 0.4032219853115375, + "grad_norm": 0.298828125, + "learning_rate": 0.0001300222453526841, + "loss": 0.0604, + "step": 8510 + }, + { + "epoch": 0.4032693674484719, + "grad_norm": 0.58984375, + "learning_rate": 0.00013000803943309433, + "loss": 0.9209, + "step": 8511 + }, + { + "epoch": 0.4033167495854063, + "grad_norm": 0.64453125, + "learning_rate": 0.0001299938328479582, + "loss": 1.0169, + "step": 8512 + }, + { + "epoch": 0.4033641317223407, + "grad_norm": 0.66796875, + "learning_rate": 0.00012997962559759086, + "loss": 1.153, + "step": 8513 + }, + { + "epoch": 0.40341151385927504, + "grad_norm": 0.15625, + "learning_rate": 0.0001299654176823073, + "loss": 0.0071, + "step": 8514 + }, + { + "epoch": 0.40345889599620943, + "grad_norm": 0.765625, + "learning_rate": 0.0001299512091024227, + "loss": 1.1245, + "step": 8515 + }, + { + "epoch": 0.4035062781331438, + "grad_norm": 0.72265625, + "learning_rate": 0.0001299369998582522, + "loss": 1.1065, + "step": 8516 + }, + { + "epoch": 0.40355366027007816, + "grad_norm": 0.69140625, + "learning_rate": 0.00012992278995011095, + "loss": 1.4431, + "step": 8517 + }, + { + "epoch": 0.40360104240701256, + "grad_norm": 0.609375, + "learning_rate": 0.00012990857937831405, + "loss": 0.6911, + "step": 8518 + }, + { + "epoch": 0.40364842454394695, + "grad_norm": 0.052734375, + "learning_rate": 0.0001298943681431767, + "loss": 0.0012, + "step": 8519 + }, + { + "epoch": 0.4036958066808813, + "grad_norm": 0.9453125, + "learning_rate": 0.00012988015624501418, + "loss": 0.0266, + "step": 8520 + }, + { + "epoch": 0.4037431888178157, + "grad_norm": 0.6796875, + "learning_rate": 0.00012986594368414158, + "loss": 1.0844, + "step": 8521 + }, + { + "epoch": 0.40379057095475007, + "grad_norm": 0.2021484375, + "learning_rate": 0.00012985173046087416, + "loss": 0.0193, + "step": 8522 + }, + { + "epoch": 0.4038379530916844, + "grad_norm": 0.11962890625, + "learning_rate": 0.00012983751657552714, + "loss": 0.0063, + "step": 8523 + }, + { + "epoch": 0.4038853352286188, + "grad_norm": 0.546875, + "learning_rate": 0.0001298233020284158, + "loss": 0.4996, + "step": 8524 + }, + { + "epoch": 0.4039327173655532, + "grad_norm": 0.19140625, + "learning_rate": 0.0001298090868198554, + "loss": 0.1321, + "step": 8525 + }, + { + "epoch": 0.4039800995024876, + "grad_norm": 0.53515625, + "learning_rate": 0.00012979487095016122, + "loss": 0.8057, + "step": 8526 + }, + { + "epoch": 0.4040274816394219, + "grad_norm": 0.83203125, + "learning_rate": 0.00012978065441964848, + "loss": 0.8067, + "step": 8527 + }, + { + "epoch": 0.4040748637763563, + "grad_norm": 0.12890625, + "learning_rate": 0.0001297664372286326, + "loss": 0.0177, + "step": 8528 + }, + { + "epoch": 0.4041222459132907, + "grad_norm": 0.6484375, + "learning_rate": 0.00012975221937742886, + "loss": 0.9545, + "step": 8529 + }, + { + "epoch": 0.40416962805022505, + "grad_norm": 0.1298828125, + "learning_rate": 0.00012973800086635252, + "loss": 0.016, + "step": 8530 + }, + { + "epoch": 0.40421701018715944, + "grad_norm": 0.353515625, + "learning_rate": 0.00012972378169571905, + "loss": 0.0937, + "step": 8531 + }, + { + "epoch": 0.40426439232409384, + "grad_norm": 0.328125, + "learning_rate": 0.00012970956186584374, + "loss": 0.0335, + "step": 8532 + }, + { + "epoch": 0.4043117744610282, + "grad_norm": 0.053466796875, + "learning_rate": 0.000129695341377042, + "loss": 0.0048, + "step": 8533 + }, + { + "epoch": 0.40435915659796257, + "grad_norm": 0.578125, + "learning_rate": 0.00012968112022962918, + "loss": 0.8922, + "step": 8534 + }, + { + "epoch": 0.40440653873489696, + "grad_norm": 0.162109375, + "learning_rate": 0.00012966689842392075, + "loss": 0.1109, + "step": 8535 + }, + { + "epoch": 0.4044539208718313, + "grad_norm": 0.22265625, + "learning_rate": 0.00012965267596023216, + "loss": 0.1531, + "step": 8536 + }, + { + "epoch": 0.4045013030087657, + "grad_norm": 0.8125, + "learning_rate": 0.00012963845283887876, + "loss": 1.0037, + "step": 8537 + }, + { + "epoch": 0.4045486851457001, + "grad_norm": 0.18359375, + "learning_rate": 0.00012962422906017605, + "loss": 0.0206, + "step": 8538 + }, + { + "epoch": 0.4045960672826344, + "grad_norm": 0.69140625, + "learning_rate": 0.00012961000462443948, + "loss": 1.3797, + "step": 8539 + }, + { + "epoch": 0.4046434494195688, + "grad_norm": 0.7890625, + "learning_rate": 0.00012959577953198457, + "loss": 0.9688, + "step": 8540 + }, + { + "epoch": 0.4046908315565032, + "grad_norm": 0.71484375, + "learning_rate": 0.00012958155378312678, + "loss": 1.0048, + "step": 8541 + }, + { + "epoch": 0.4047382136934376, + "grad_norm": 0.59375, + "learning_rate": 0.0001295673273781816, + "loss": 0.6154, + "step": 8542 + }, + { + "epoch": 0.40478559583037194, + "grad_norm": 0.53515625, + "learning_rate": 0.00012955310031746467, + "loss": 0.7247, + "step": 8543 + }, + { + "epoch": 0.40483297796730633, + "grad_norm": 0.5390625, + "learning_rate": 0.00012953887260129144, + "loss": 0.9636, + "step": 8544 + }, + { + "epoch": 0.4048803601042407, + "grad_norm": 0.6875, + "learning_rate": 0.00012952464422997744, + "loss": 0.9512, + "step": 8545 + }, + { + "epoch": 0.40492774224117506, + "grad_norm": 0.8046875, + "learning_rate": 0.0001295104152038383, + "loss": 1.5034, + "step": 8546 + }, + { + "epoch": 0.40497512437810945, + "grad_norm": 0.03857421875, + "learning_rate": 0.00012949618552318955, + "loss": 0.0033, + "step": 8547 + }, + { + "epoch": 0.40502250651504385, + "grad_norm": 0.73828125, + "learning_rate": 0.00012948195518834688, + "loss": 1.1164, + "step": 8548 + }, + { + "epoch": 0.4050698886519782, + "grad_norm": 0.57421875, + "learning_rate": 0.0001294677241996258, + "loss": 0.9889, + "step": 8549 + }, + { + "epoch": 0.4051172707889126, + "grad_norm": 1.0, + "learning_rate": 0.000129453492557342, + "loss": 1.658, + "step": 8550 + }, + { + "epoch": 0.40516465292584697, + "grad_norm": 0.76953125, + "learning_rate": 0.00012943926026181112, + "loss": 1.1179, + "step": 8551 + }, + { + "epoch": 0.4052120350627813, + "grad_norm": 0.76171875, + "learning_rate": 0.00012942502731334884, + "loss": 0.413, + "step": 8552 + }, + { + "epoch": 0.4052594171997157, + "grad_norm": 0.671875, + "learning_rate": 0.00012941079371227074, + "loss": 0.9343, + "step": 8553 + }, + { + "epoch": 0.4053067993366501, + "grad_norm": 0.64453125, + "learning_rate": 0.00012939655945889256, + "loss": 0.7487, + "step": 8554 + }, + { + "epoch": 0.4053541814735845, + "grad_norm": 0.296875, + "learning_rate": 0.00012938232455353004, + "loss": 0.0043, + "step": 8555 + }, + { + "epoch": 0.4054015636105188, + "grad_norm": 0.78125, + "learning_rate": 0.00012936808899649882, + "loss": 1.2496, + "step": 8556 + }, + { + "epoch": 0.4054489457474532, + "grad_norm": 0.8203125, + "learning_rate": 0.00012935385278811467, + "loss": 1.3019, + "step": 8557 + }, + { + "epoch": 0.4054963278843876, + "grad_norm": 0.6953125, + "learning_rate": 0.00012933961592869335, + "loss": 0.9618, + "step": 8558 + }, + { + "epoch": 0.40554371002132195, + "grad_norm": 0.7421875, + "learning_rate": 0.0001293253784185506, + "loss": 1.1096, + "step": 8559 + }, + { + "epoch": 0.40559109215825634, + "grad_norm": 0.84765625, + "learning_rate": 0.0001293111402580022, + "loss": 0.8938, + "step": 8560 + }, + { + "epoch": 0.40563847429519073, + "grad_norm": 0.62890625, + "learning_rate": 0.0001292969014473639, + "loss": 1.2005, + "step": 8561 + }, + { + "epoch": 0.40568585643212507, + "grad_norm": 0.8125, + "learning_rate": 0.00012928266198695154, + "loss": 1.3525, + "step": 8562 + }, + { + "epoch": 0.40573323856905946, + "grad_norm": 0.546875, + "learning_rate": 0.00012926842187708094, + "loss": 0.9494, + "step": 8563 + }, + { + "epoch": 0.40578062070599386, + "grad_norm": 0.6328125, + "learning_rate": 0.0001292541811180679, + "loss": 0.7537, + "step": 8564 + }, + { + "epoch": 0.4058280028429282, + "grad_norm": 0.734375, + "learning_rate": 0.00012923993971022832, + "loss": 1.1526, + "step": 8565 + }, + { + "epoch": 0.4058753849798626, + "grad_norm": 0.53125, + "learning_rate": 0.000129225697653878, + "loss": 0.9039, + "step": 8566 + }, + { + "epoch": 0.405922767116797, + "grad_norm": 0.71484375, + "learning_rate": 0.00012921145494933285, + "loss": 1.4967, + "step": 8567 + }, + { + "epoch": 0.4059701492537313, + "grad_norm": 0.75, + "learning_rate": 0.00012919721159690873, + "loss": 1.018, + "step": 8568 + }, + { + "epoch": 0.4060175313906657, + "grad_norm": 0.423828125, + "learning_rate": 0.00012918296759692154, + "loss": 0.1771, + "step": 8569 + }, + { + "epoch": 0.4060649135276001, + "grad_norm": 0.01397705078125, + "learning_rate": 0.00012916872294968724, + "loss": 0.0008, + "step": 8570 + }, + { + "epoch": 0.4061122956645345, + "grad_norm": 0.055908203125, + "learning_rate": 0.00012915447765552172, + "loss": 0.0083, + "step": 8571 + }, + { + "epoch": 0.40615967780146883, + "grad_norm": 1.6328125, + "learning_rate": 0.00012914023171474096, + "loss": 0.0438, + "step": 8572 + }, + { + "epoch": 0.4062070599384032, + "grad_norm": 0.56640625, + "learning_rate": 0.00012912598512766085, + "loss": 0.7081, + "step": 8573 + }, + { + "epoch": 0.4062544420753376, + "grad_norm": 0.287109375, + "learning_rate": 0.00012911173789459745, + "loss": 0.3742, + "step": 8574 + }, + { + "epoch": 0.40630182421227196, + "grad_norm": 0.58203125, + "learning_rate": 0.00012909749001586672, + "loss": 1.0059, + "step": 8575 + }, + { + "epoch": 0.40634920634920635, + "grad_norm": 0.197265625, + "learning_rate": 0.00012908324149178463, + "loss": 0.0195, + "step": 8576 + }, + { + "epoch": 0.40639658848614074, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012906899232266724, + "loss": 0.0254, + "step": 8577 + }, + { + "epoch": 0.4064439706230751, + "grad_norm": 0.294921875, + "learning_rate": 0.00012905474250883057, + "loss": 0.0615, + "step": 8578 + }, + { + "epoch": 0.4064913527600095, + "grad_norm": 0.58984375, + "learning_rate": 0.00012904049205059065, + "loss": 0.5947, + "step": 8579 + }, + { + "epoch": 0.40653873489694387, + "grad_norm": 0.68359375, + "learning_rate": 0.00012902624094826353, + "loss": 0.644, + "step": 8580 + }, + { + "epoch": 0.4065861170338782, + "grad_norm": 0.55859375, + "learning_rate": 0.00012901198920216528, + "loss": 1.1777, + "step": 8581 + }, + { + "epoch": 0.4066334991708126, + "grad_norm": 0.65234375, + "learning_rate": 0.00012899773681261206, + "loss": 0.5417, + "step": 8582 + }, + { + "epoch": 0.406680881307747, + "grad_norm": 0.5546875, + "learning_rate": 0.00012898348377991993, + "loss": 0.8528, + "step": 8583 + }, + { + "epoch": 0.4067282634446814, + "grad_norm": 0.115234375, + "learning_rate": 0.00012896923010440498, + "loss": 0.0175, + "step": 8584 + }, + { + "epoch": 0.4067756455816157, + "grad_norm": 0.6640625, + "learning_rate": 0.0001289549757863834, + "loss": 0.9764, + "step": 8585 + }, + { + "epoch": 0.4068230277185501, + "grad_norm": 0.53125, + "learning_rate": 0.00012894072082617126, + "loss": 0.6498, + "step": 8586 + }, + { + "epoch": 0.4068704098554845, + "grad_norm": 0.7109375, + "learning_rate": 0.00012892646522408475, + "loss": 0.3805, + "step": 8587 + }, + { + "epoch": 0.40691779199241884, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012891220898044006, + "loss": 0.1385, + "step": 8588 + }, + { + "epoch": 0.40696517412935324, + "grad_norm": 0.76953125, + "learning_rate": 0.0001288979520955534, + "loss": 0.8701, + "step": 8589 + }, + { + "epoch": 0.40701255626628763, + "grad_norm": 0.359375, + "learning_rate": 0.0001288836945697409, + "loss": 0.2025, + "step": 8590 + }, + { + "epoch": 0.40705993840322197, + "grad_norm": 0.57421875, + "learning_rate": 0.00012886943640331885, + "loss": 0.7638, + "step": 8591 + }, + { + "epoch": 0.40710732054015636, + "grad_norm": 0.62890625, + "learning_rate": 0.00012885517759660345, + "loss": 1.0234, + "step": 8592 + }, + { + "epoch": 0.40715470267709075, + "grad_norm": 0.65625, + "learning_rate": 0.00012884091814991094, + "loss": 1.3021, + "step": 8593 + }, + { + "epoch": 0.4072020848140251, + "grad_norm": 0.6953125, + "learning_rate": 0.00012882665806355757, + "loss": 0.7696, + "step": 8594 + }, + { + "epoch": 0.4072494669509595, + "grad_norm": 0.5078125, + "learning_rate": 0.00012881239733785965, + "loss": 0.7608, + "step": 8595 + }, + { + "epoch": 0.4072968490878939, + "grad_norm": 0.4921875, + "learning_rate": 0.00012879813597313345, + "loss": 0.7023, + "step": 8596 + }, + { + "epoch": 0.4073442312248282, + "grad_norm": 0.80078125, + "learning_rate": 0.00012878387396969524, + "loss": 0.8593, + "step": 8597 + }, + { + "epoch": 0.4073916133617626, + "grad_norm": 0.69921875, + "learning_rate": 0.0001287696113278614, + "loss": 1.1883, + "step": 8598 + }, + { + "epoch": 0.407438995498697, + "grad_norm": 0.515625, + "learning_rate": 0.0001287553480479482, + "loss": 0.5168, + "step": 8599 + }, + { + "epoch": 0.4074863776356314, + "grad_norm": 0.5078125, + "learning_rate": 0.000128741084130272, + "loss": 0.5142, + "step": 8600 + }, + { + "epoch": 0.40753375977256573, + "grad_norm": 0.67578125, + "learning_rate": 0.00012872681957514922, + "loss": 0.1935, + "step": 8601 + }, + { + "epoch": 0.4075811419095001, + "grad_norm": 0.357421875, + "learning_rate": 0.00012871255438289613, + "loss": 0.0267, + "step": 8602 + }, + { + "epoch": 0.4076285240464345, + "grad_norm": 0.58984375, + "learning_rate": 0.00012869828855382917, + "loss": 1.2798, + "step": 8603 + }, + { + "epoch": 0.40767590618336885, + "grad_norm": 0.59765625, + "learning_rate": 0.00012868402208826473, + "loss": 1.0788, + "step": 8604 + }, + { + "epoch": 0.40772328832030325, + "grad_norm": 0.41015625, + "learning_rate": 0.00012866975498651928, + "loss": 0.1732, + "step": 8605 + }, + { + "epoch": 0.40777067045723764, + "grad_norm": 1.0, + "learning_rate": 0.0001286554872489092, + "loss": 1.135, + "step": 8606 + }, + { + "epoch": 0.407818052594172, + "grad_norm": 0.5546875, + "learning_rate": 0.0001286412188757509, + "loss": 0.6741, + "step": 8607 + }, + { + "epoch": 0.40786543473110637, + "grad_norm": 0.63671875, + "learning_rate": 0.00012862694986736086, + "loss": 0.8214, + "step": 8608 + }, + { + "epoch": 0.40791281686804076, + "grad_norm": 0.54296875, + "learning_rate": 0.00012861268022405558, + "loss": 0.7118, + "step": 8609 + }, + { + "epoch": 0.4079601990049751, + "grad_norm": 0.7890625, + "learning_rate": 0.00012859840994615155, + "loss": 0.7582, + "step": 8610 + }, + { + "epoch": 0.4080075811419095, + "grad_norm": 0.703125, + "learning_rate": 0.00012858413903396522, + "loss": 0.7089, + "step": 8611 + }, + { + "epoch": 0.4080549632788439, + "grad_norm": 0.8046875, + "learning_rate": 0.00012856986748781317, + "loss": 0.9185, + "step": 8612 + }, + { + "epoch": 0.4081023454157783, + "grad_norm": 0.69921875, + "learning_rate": 0.00012855559530801185, + "loss": 0.6825, + "step": 8613 + }, + { + "epoch": 0.4081497275527126, + "grad_norm": 1.03125, + "learning_rate": 0.00012854132249487786, + "loss": 0.7458, + "step": 8614 + }, + { + "epoch": 0.408197109689647, + "grad_norm": 0.0084228515625, + "learning_rate": 0.00012852704904872775, + "loss": 0.0001, + "step": 8615 + }, + { + "epoch": 0.4082444918265814, + "grad_norm": 0.306640625, + "learning_rate": 0.00012851277496987807, + "loss": 0.033, + "step": 8616 + }, + { + "epoch": 0.40829187396351574, + "grad_norm": 0.5703125, + "learning_rate": 0.0001284985002586454, + "loss": 0.7717, + "step": 8617 + }, + { + "epoch": 0.40833925610045013, + "grad_norm": 0.52734375, + "learning_rate": 0.00012848422491534636, + "loss": 0.5768, + "step": 8618 + }, + { + "epoch": 0.4083866382373845, + "grad_norm": 0.73828125, + "learning_rate": 0.00012846994894029755, + "loss": 1.0762, + "step": 8619 + }, + { + "epoch": 0.40843402037431886, + "grad_norm": 0.9296875, + "learning_rate": 0.0001284556723338156, + "loss": 1.1826, + "step": 8620 + }, + { + "epoch": 0.40848140251125326, + "grad_norm": 0.38671875, + "learning_rate": 0.00012844139509621714, + "loss": 0.0071, + "step": 8621 + }, + { + "epoch": 0.40852878464818765, + "grad_norm": 0.71875, + "learning_rate": 0.00012842711722781884, + "loss": 1.2741, + "step": 8622 + }, + { + "epoch": 0.408576166785122, + "grad_norm": 0.447265625, + "learning_rate": 0.00012841283872893735, + "loss": 0.7631, + "step": 8623 + }, + { + "epoch": 0.4086235489220564, + "grad_norm": 0.64453125, + "learning_rate": 0.00012839855959988935, + "loss": 0.9233, + "step": 8624 + }, + { + "epoch": 0.40867093105899077, + "grad_norm": 0.7109375, + "learning_rate": 0.00012838427984099158, + "loss": 1.0251, + "step": 8625 + }, + { + "epoch": 0.4087183131959251, + "grad_norm": 0.51171875, + "learning_rate": 0.00012836999945256067, + "loss": 0.7825, + "step": 8626 + }, + { + "epoch": 0.4087656953328595, + "grad_norm": 0.54296875, + "learning_rate": 0.00012835571843491342, + "loss": 0.4097, + "step": 8627 + }, + { + "epoch": 0.4088130774697939, + "grad_norm": 0.55859375, + "learning_rate": 0.00012834143678836654, + "loss": 0.5478, + "step": 8628 + }, + { + "epoch": 0.4088604596067283, + "grad_norm": 0.72265625, + "learning_rate": 0.00012832715451323678, + "loss": 1.1211, + "step": 8629 + }, + { + "epoch": 0.4089078417436626, + "grad_norm": 0.67578125, + "learning_rate": 0.00012831287160984092, + "loss": 0.9444, + "step": 8630 + }, + { + "epoch": 0.408955223880597, + "grad_norm": 0.427734375, + "learning_rate": 0.00012829858807849567, + "loss": 0.1511, + "step": 8631 + }, + { + "epoch": 0.4090026060175314, + "grad_norm": 0.66015625, + "learning_rate": 0.00012828430391951794, + "loss": 1.0221, + "step": 8632 + }, + { + "epoch": 0.40904998815446575, + "grad_norm": 0.6796875, + "learning_rate": 0.00012827001913322445, + "loss": 1.001, + "step": 8633 + }, + { + "epoch": 0.40909737029140014, + "grad_norm": 0.75390625, + "learning_rate": 0.00012825573371993206, + "loss": 0.9136, + "step": 8634 + }, + { + "epoch": 0.40914475242833453, + "grad_norm": 0.25390625, + "learning_rate": 0.00012824144767995758, + "loss": 0.1502, + "step": 8635 + }, + { + "epoch": 0.4091921345652689, + "grad_norm": 0.16015625, + "learning_rate": 0.00012822716101361788, + "loss": 0.1261, + "step": 8636 + }, + { + "epoch": 0.40923951670220327, + "grad_norm": 0.5, + "learning_rate": 0.00012821287372122978, + "loss": 0.3275, + "step": 8637 + }, + { + "epoch": 0.40928689883913766, + "grad_norm": 0.6484375, + "learning_rate": 0.00012819858580311022, + "loss": 1.0782, + "step": 8638 + }, + { + "epoch": 0.409334280976072, + "grad_norm": 0.5625, + "learning_rate": 0.00012818429725957605, + "loss": 0.9956, + "step": 8639 + }, + { + "epoch": 0.4093816631130064, + "grad_norm": 0.482421875, + "learning_rate": 0.00012817000809094424, + "loss": 0.0998, + "step": 8640 + }, + { + "epoch": 0.4094290452499408, + "grad_norm": 0.67578125, + "learning_rate": 0.0001281557182975316, + "loss": 0.9675, + "step": 8641 + }, + { + "epoch": 0.4094764273868752, + "grad_norm": 0.85546875, + "learning_rate": 0.00012814142787965512, + "loss": 1.1673, + "step": 8642 + }, + { + "epoch": 0.4095238095238095, + "grad_norm": 0.16796875, + "learning_rate": 0.0001281271368376318, + "loss": 0.1159, + "step": 8643 + }, + { + "epoch": 0.4095711916607439, + "grad_norm": 0.8203125, + "learning_rate": 0.00012811284517177848, + "loss": 1.1577, + "step": 8644 + }, + { + "epoch": 0.4096185737976783, + "grad_norm": 0.62890625, + "learning_rate": 0.0001280985528824122, + "loss": 0.7769, + "step": 8645 + }, + { + "epoch": 0.40966595593461264, + "grad_norm": 0.1923828125, + "learning_rate": 0.00012808425996985, + "loss": 0.0316, + "step": 8646 + }, + { + "epoch": 0.40971333807154703, + "grad_norm": 0.416015625, + "learning_rate": 0.00012806996643440881, + "loss": 0.0188, + "step": 8647 + }, + { + "epoch": 0.4097607202084814, + "grad_norm": 0.66796875, + "learning_rate": 0.00012805567227640565, + "loss": 0.8488, + "step": 8648 + }, + { + "epoch": 0.40980810234541576, + "grad_norm": 0.1875, + "learning_rate": 0.00012804137749615753, + "loss": 0.0344, + "step": 8649 + }, + { + "epoch": 0.40985548448235015, + "grad_norm": 0.6640625, + "learning_rate": 0.00012802708209398155, + "loss": 0.7397, + "step": 8650 + }, + { + "epoch": 0.40990286661928454, + "grad_norm": 0.4921875, + "learning_rate": 0.00012801278607019477, + "loss": 0.8155, + "step": 8651 + }, + { + "epoch": 0.4099502487562189, + "grad_norm": 0.423828125, + "learning_rate": 0.00012799848942511422, + "loss": 0.1896, + "step": 8652 + }, + { + "epoch": 0.4099976308931533, + "grad_norm": 0.50390625, + "learning_rate": 0.000127984192159057, + "loss": 1.0832, + "step": 8653 + }, + { + "epoch": 0.41004501303008767, + "grad_norm": 0.671875, + "learning_rate": 0.00012796989427234017, + "loss": 1.272, + "step": 8654 + }, + { + "epoch": 0.410092395167022, + "grad_norm": 0.6171875, + "learning_rate": 0.00012795559576528093, + "loss": 0.0558, + "step": 8655 + }, + { + "epoch": 0.4101397773039564, + "grad_norm": 0.57421875, + "learning_rate": 0.0001279412966381963, + "loss": 1.2894, + "step": 8656 + }, + { + "epoch": 0.4101871594408908, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012792699689140351, + "loss": 0.0748, + "step": 8657 + }, + { + "epoch": 0.4102345415778252, + "grad_norm": 0.625, + "learning_rate": 0.00012791269652521965, + "loss": 0.7179, + "step": 8658 + }, + { + "epoch": 0.4102819237147595, + "grad_norm": 0.62109375, + "learning_rate": 0.00012789839553996194, + "loss": 1.427, + "step": 8659 + }, + { + "epoch": 0.4103293058516939, + "grad_norm": 0.52734375, + "learning_rate": 0.00012788409393594752, + "loss": 0.7396, + "step": 8660 + }, + { + "epoch": 0.4103766879886283, + "grad_norm": 0.578125, + "learning_rate": 0.0001278697917134936, + "loss": 0.7206, + "step": 8661 + }, + { + "epoch": 0.41042407012556265, + "grad_norm": 0.6796875, + "learning_rate": 0.00012785548887291737, + "loss": 0.8337, + "step": 8662 + }, + { + "epoch": 0.41047145226249704, + "grad_norm": 0.0908203125, + "learning_rate": 0.00012784118541453609, + "loss": 0.0057, + "step": 8663 + }, + { + "epoch": 0.41051883439943143, + "grad_norm": 0.671875, + "learning_rate": 0.00012782688133866697, + "loss": 1.4189, + "step": 8664 + }, + { + "epoch": 0.41056621653636577, + "grad_norm": 0.455078125, + "learning_rate": 0.00012781257664562726, + "loss": 0.252, + "step": 8665 + }, + { + "epoch": 0.41061359867330016, + "grad_norm": 0.5234375, + "learning_rate": 0.00012779827133573422, + "loss": 1.1228, + "step": 8666 + }, + { + "epoch": 0.41066098081023455, + "grad_norm": 0.5546875, + "learning_rate": 0.00012778396540930512, + "loss": 0.5086, + "step": 8667 + }, + { + "epoch": 0.4107083629471689, + "grad_norm": 1.0625, + "learning_rate": 0.00012776965886665727, + "loss": 0.7849, + "step": 8668 + }, + { + "epoch": 0.4107557450841033, + "grad_norm": 0.703125, + "learning_rate": 0.00012775535170810796, + "loss": 0.8292, + "step": 8669 + }, + { + "epoch": 0.4108031272210377, + "grad_norm": 0.0498046875, + "learning_rate": 0.00012774104393397452, + "loss": 0.0033, + "step": 8670 + }, + { + "epoch": 0.41085050935797207, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001277267355445743, + "loss": 0.0006, + "step": 8671 + }, + { + "epoch": 0.4108978914949064, + "grad_norm": 0.671875, + "learning_rate": 0.00012771242654022454, + "loss": 1.0628, + "step": 8672 + }, + { + "epoch": 0.4109452736318408, + "grad_norm": 2.046875, + "learning_rate": 0.00012769811692124274, + "loss": 0.4934, + "step": 8673 + }, + { + "epoch": 0.4109926557687752, + "grad_norm": 0.7265625, + "learning_rate": 0.00012768380668794622, + "loss": 1.1352, + "step": 8674 + }, + { + "epoch": 0.41104003790570953, + "grad_norm": 0.83984375, + "learning_rate": 0.0001276694958406523, + "loss": 0.6742, + "step": 8675 + }, + { + "epoch": 0.4110874200426439, + "grad_norm": 0.6640625, + "learning_rate": 0.00012765518437967847, + "loss": 1.153, + "step": 8676 + }, + { + "epoch": 0.4111348021795783, + "grad_norm": 0.70703125, + "learning_rate": 0.00012764087230534213, + "loss": 1.3431, + "step": 8677 + }, + { + "epoch": 0.41118218431651266, + "grad_norm": 0.154296875, + "learning_rate": 0.00012762655961796066, + "loss": 0.0087, + "step": 8678 + }, + { + "epoch": 0.41122956645344705, + "grad_norm": 0.50390625, + "learning_rate": 0.00012761224631785152, + "loss": 0.7768, + "step": 8679 + }, + { + "epoch": 0.41127694859038144, + "grad_norm": 0.65625, + "learning_rate": 0.00012759793240533216, + "loss": 1.138, + "step": 8680 + }, + { + "epoch": 0.4113243307273158, + "grad_norm": 0.609375, + "learning_rate": 0.00012758361788072007, + "loss": 1.1168, + "step": 8681 + }, + { + "epoch": 0.41137171286425017, + "grad_norm": 0.515625, + "learning_rate": 0.0001275693027443327, + "loss": 0.6417, + "step": 8682 + }, + { + "epoch": 0.41141909500118456, + "grad_norm": 0.49609375, + "learning_rate": 0.00012755498699648757, + "loss": 0.1818, + "step": 8683 + }, + { + "epoch": 0.4114664771381189, + "grad_norm": 0.828125, + "learning_rate": 0.0001275406706375022, + "loss": 1.044, + "step": 8684 + }, + { + "epoch": 0.4115138592750533, + "grad_norm": 0.703125, + "learning_rate": 0.00012752635366769405, + "loss": 1.1786, + "step": 8685 + }, + { + "epoch": 0.4115612414119877, + "grad_norm": 0.68359375, + "learning_rate": 0.00012751203608738073, + "loss": 0.9176, + "step": 8686 + }, + { + "epoch": 0.4116086235489221, + "grad_norm": 0.7734375, + "learning_rate": 0.00012749771789687974, + "loss": 0.9885, + "step": 8687 + }, + { + "epoch": 0.4116560056858564, + "grad_norm": 0.58203125, + "learning_rate": 0.00012748339909650865, + "loss": 0.6357, + "step": 8688 + }, + { + "epoch": 0.4117033878227908, + "grad_norm": 0.28515625, + "learning_rate": 0.00012746907968658508, + "loss": 0.0454, + "step": 8689 + }, + { + "epoch": 0.4117507699597252, + "grad_norm": 0.08154296875, + "learning_rate": 0.00012745475966742653, + "loss": 0.0118, + "step": 8690 + }, + { + "epoch": 0.41179815209665954, + "grad_norm": 0.003387451171875, + "learning_rate": 0.00012744043903935065, + "loss": 0.0001, + "step": 8691 + }, + { + "epoch": 0.41184553423359394, + "grad_norm": 0.72265625, + "learning_rate": 0.0001274261178026751, + "loss": 1.1407, + "step": 8692 + }, + { + "epoch": 0.41189291637052833, + "grad_norm": 0.62109375, + "learning_rate": 0.00012741179595771748, + "loss": 1.2763, + "step": 8693 + }, + { + "epoch": 0.41194029850746267, + "grad_norm": 0.83984375, + "learning_rate": 0.00012739747350479542, + "loss": 1.1881, + "step": 8694 + }, + { + "epoch": 0.41198768064439706, + "grad_norm": 1.0078125, + "learning_rate": 0.00012738315044422653, + "loss": 1.1318, + "step": 8695 + }, + { + "epoch": 0.41203506278133145, + "grad_norm": 0.6484375, + "learning_rate": 0.00012736882677632854, + "loss": 1.301, + "step": 8696 + }, + { + "epoch": 0.4120824449182658, + "grad_norm": 0.51171875, + "learning_rate": 0.00012735450250141915, + "loss": 0.0163, + "step": 8697 + }, + { + "epoch": 0.4121298270552002, + "grad_norm": 0.7734375, + "learning_rate": 0.00012734017761981603, + "loss": 1.1192, + "step": 8698 + }, + { + "epoch": 0.4121772091921346, + "grad_norm": 0.6796875, + "learning_rate": 0.00012732585213183687, + "loss": 0.7565, + "step": 8699 + }, + { + "epoch": 0.41222459132906897, + "grad_norm": 0.75390625, + "learning_rate": 0.00012731152603779946, + "loss": 0.9723, + "step": 8700 + }, + { + "epoch": 0.4122719734660033, + "grad_norm": 0.9765625, + "learning_rate": 0.00012729719933802147, + "loss": 1.3824, + "step": 8701 + }, + { + "epoch": 0.4123193556029377, + "grad_norm": 0.859375, + "learning_rate": 0.00012728287203282065, + "loss": 0.8237, + "step": 8702 + }, + { + "epoch": 0.4123667377398721, + "grad_norm": 0.87890625, + "learning_rate": 0.0001272685441225148, + "loss": 0.891, + "step": 8703 + }, + { + "epoch": 0.41241411987680643, + "grad_norm": 0.7578125, + "learning_rate": 0.0001272542156074217, + "loss": 0.9442, + "step": 8704 + }, + { + "epoch": 0.4124615020137408, + "grad_norm": 0.66796875, + "learning_rate": 0.00012723988648785913, + "loss": 0.7798, + "step": 8705 + }, + { + "epoch": 0.4125088841506752, + "grad_norm": 0.90625, + "learning_rate": 0.0001272255567641449, + "loss": 0.8677, + "step": 8706 + }, + { + "epoch": 0.41255626628760955, + "grad_norm": 0.7265625, + "learning_rate": 0.00012721122643659678, + "loss": 0.8635, + "step": 8707 + }, + { + "epoch": 0.41260364842454395, + "grad_norm": 0.29296875, + "learning_rate": 0.00012719689550553268, + "loss": 0.0828, + "step": 8708 + }, + { + "epoch": 0.41265103056147834, + "grad_norm": 0.6875, + "learning_rate": 0.00012718256397127042, + "loss": 1.1555, + "step": 8709 + }, + { + "epoch": 0.4126984126984127, + "grad_norm": 0.73828125, + "learning_rate": 0.0001271682318341278, + "loss": 1.2882, + "step": 8710 + }, + { + "epoch": 0.41274579483534707, + "grad_norm": 0.6015625, + "learning_rate": 0.00012715389909442274, + "loss": 1.1442, + "step": 8711 + }, + { + "epoch": 0.41279317697228146, + "grad_norm": 0.67578125, + "learning_rate": 0.00012713956575247318, + "loss": 1.3723, + "step": 8712 + }, + { + "epoch": 0.4128405591092158, + "grad_norm": 0.494140625, + "learning_rate": 0.00012712523180859688, + "loss": 0.5592, + "step": 8713 + }, + { + "epoch": 0.4128879412461502, + "grad_norm": 0.6640625, + "learning_rate": 0.00012711089726311187, + "loss": 0.7329, + "step": 8714 + }, + { + "epoch": 0.4129353233830846, + "grad_norm": 0.671875, + "learning_rate": 0.00012709656211633603, + "loss": 0.2984, + "step": 8715 + }, + { + "epoch": 0.412982705520019, + "grad_norm": 0.6484375, + "learning_rate": 0.00012708222636858733, + "loss": 1.2764, + "step": 8716 + }, + { + "epoch": 0.4130300876569533, + "grad_norm": 0.6015625, + "learning_rate": 0.00012706789002018366, + "loss": 0.9975, + "step": 8717 + }, + { + "epoch": 0.4130774697938877, + "grad_norm": 0.263671875, + "learning_rate": 0.00012705355307144305, + "loss": 0.0301, + "step": 8718 + }, + { + "epoch": 0.4131248519308221, + "grad_norm": 0.9296875, + "learning_rate": 0.0001270392155226834, + "loss": 1.0671, + "step": 8719 + }, + { + "epoch": 0.41317223406775644, + "grad_norm": 0.5390625, + "learning_rate": 0.0001270248773742228, + "loss": 0.6159, + "step": 8720 + }, + { + "epoch": 0.41321961620469083, + "grad_norm": 0.251953125, + "learning_rate": 0.00012701053862637917, + "loss": 0.0311, + "step": 8721 + }, + { + "epoch": 0.4132669983416252, + "grad_norm": 0.87890625, + "learning_rate": 0.0001269961992794706, + "loss": 0.9573, + "step": 8722 + }, + { + "epoch": 0.41331438047855956, + "grad_norm": 0.765625, + "learning_rate": 0.00012698185933381507, + "loss": 1.1541, + "step": 8723 + }, + { + "epoch": 0.41336176261549396, + "grad_norm": 0.73828125, + "learning_rate": 0.00012696751878973065, + "loss": 1.4427, + "step": 8724 + }, + { + "epoch": 0.41340914475242835, + "grad_norm": 0.61328125, + "learning_rate": 0.00012695317764753537, + "loss": 0.9246, + "step": 8725 + }, + { + "epoch": 0.4134565268893627, + "grad_norm": 0.72265625, + "learning_rate": 0.0001269388359075473, + "loss": 0.7327, + "step": 8726 + }, + { + "epoch": 0.4135039090262971, + "grad_norm": 0.8203125, + "learning_rate": 0.0001269244935700846, + "loss": 0.9051, + "step": 8727 + }, + { + "epoch": 0.41355129116323147, + "grad_norm": 0.453125, + "learning_rate": 0.00012691015063546525, + "loss": 0.1719, + "step": 8728 + }, + { + "epoch": 0.41359867330016586, + "grad_norm": 0.578125, + "learning_rate": 0.00012689580710400746, + "loss": 0.9895, + "step": 8729 + }, + { + "epoch": 0.4136460554371002, + "grad_norm": 0.361328125, + "learning_rate": 0.00012688146297602928, + "loss": 0.0378, + "step": 8730 + }, + { + "epoch": 0.4136934375740346, + "grad_norm": 0.7890625, + "learning_rate": 0.00012686711825184895, + "loss": 0.9236, + "step": 8731 + }, + { + "epoch": 0.413740819710969, + "grad_norm": 1.1875, + "learning_rate": 0.0001268527729317845, + "loss": 1.1369, + "step": 8732 + }, + { + "epoch": 0.4137882018479033, + "grad_norm": 0.2080078125, + "learning_rate": 0.00012683842701615417, + "loss": 0.0278, + "step": 8733 + }, + { + "epoch": 0.4138355839848377, + "grad_norm": 0.609375, + "learning_rate": 0.0001268240805052761, + "loss": 0.6815, + "step": 8734 + }, + { + "epoch": 0.4138829661217721, + "grad_norm": 0.12060546875, + "learning_rate": 0.00012680973339946854, + "loss": 0.0055, + "step": 8735 + }, + { + "epoch": 0.41393034825870645, + "grad_norm": 0.67578125, + "learning_rate": 0.0001267953856990496, + "loss": 1.0057, + "step": 8736 + }, + { + "epoch": 0.41397773039564084, + "grad_norm": 0.61328125, + "learning_rate": 0.00012678103740433754, + "loss": 0.7069, + "step": 8737 + }, + { + "epoch": 0.41402511253257523, + "grad_norm": 0.53515625, + "learning_rate": 0.0001267666885156506, + "loss": 0.0898, + "step": 8738 + }, + { + "epoch": 0.41407249466950957, + "grad_norm": 0.74609375, + "learning_rate": 0.00012675233903330707, + "loss": 0.9589, + "step": 8739 + }, + { + "epoch": 0.41411987680644397, + "grad_norm": 0.66796875, + "learning_rate": 0.00012673798895762513, + "loss": 1.1259, + "step": 8740 + }, + { + "epoch": 0.41416725894337836, + "grad_norm": 0.5390625, + "learning_rate": 0.00012672363828892307, + "loss": 0.5192, + "step": 8741 + }, + { + "epoch": 0.4142146410803127, + "grad_norm": 0.09228515625, + "learning_rate": 0.00012670928702751915, + "loss": 0.0111, + "step": 8742 + }, + { + "epoch": 0.4142620232172471, + "grad_norm": 0.68359375, + "learning_rate": 0.00012669493517373175, + "loss": 0.9285, + "step": 8743 + }, + { + "epoch": 0.4143094053541815, + "grad_norm": 0.1015625, + "learning_rate": 0.00012668058272787908, + "loss": 0.013, + "step": 8744 + }, + { + "epoch": 0.4143567874911159, + "grad_norm": 0.54296875, + "learning_rate": 0.00012666622969027952, + "loss": 0.6148, + "step": 8745 + }, + { + "epoch": 0.4144041696280502, + "grad_norm": 0.6875, + "learning_rate": 0.00012665187606125138, + "loss": 0.7689, + "step": 8746 + }, + { + "epoch": 0.4144515517649846, + "grad_norm": 0.62890625, + "learning_rate": 0.00012663752184111302, + "loss": 1.0613, + "step": 8747 + }, + { + "epoch": 0.414498933901919, + "grad_norm": 0.74609375, + "learning_rate": 0.00012662316703018279, + "loss": 1.1997, + "step": 8748 + }, + { + "epoch": 0.41454631603885334, + "grad_norm": 0.6796875, + "learning_rate": 0.00012660881162877908, + "loss": 0.5236, + "step": 8749 + }, + { + "epoch": 0.41459369817578773, + "grad_norm": 1.046875, + "learning_rate": 0.0001265944556372203, + "loss": 0.0557, + "step": 8750 + }, + { + "epoch": 0.4146410803127221, + "grad_norm": 0.435546875, + "learning_rate": 0.0001265800990558248, + "loss": 0.132, + "step": 8751 + }, + { + "epoch": 0.41468846244965646, + "grad_norm": 0.53515625, + "learning_rate": 0.00012656574188491099, + "loss": 0.4266, + "step": 8752 + }, + { + "epoch": 0.41473584458659085, + "grad_norm": 0.6171875, + "learning_rate": 0.00012655138412479732, + "loss": 1.1053, + "step": 8753 + }, + { + "epoch": 0.41478322672352524, + "grad_norm": 0.6796875, + "learning_rate": 0.00012653702577580228, + "loss": 1.6178, + "step": 8754 + }, + { + "epoch": 0.4148306088604596, + "grad_norm": 0.447265625, + "learning_rate": 0.00012652266683824424, + "loss": 0.2094, + "step": 8755 + }, + { + "epoch": 0.414877990997394, + "grad_norm": 0.94140625, + "learning_rate": 0.0001265083073124417, + "loss": 0.754, + "step": 8756 + }, + { + "epoch": 0.41492537313432837, + "grad_norm": 0.055419921875, + "learning_rate": 0.00012649394719871314, + "loss": 0.0055, + "step": 8757 + }, + { + "epoch": 0.41497275527126276, + "grad_norm": 0.59765625, + "learning_rate": 0.00012647958649737707, + "loss": 0.97, + "step": 8758 + }, + { + "epoch": 0.4150201374081971, + "grad_norm": 0.81640625, + "learning_rate": 0.00012646522520875196, + "loss": 0.0633, + "step": 8759 + }, + { + "epoch": 0.4150675195451315, + "grad_norm": 0.765625, + "learning_rate": 0.00012645086333315633, + "loss": 1.0843, + "step": 8760 + }, + { + "epoch": 0.4151149016820659, + "grad_norm": 0.5546875, + "learning_rate": 0.00012643650087090875, + "loss": 0.9516, + "step": 8761 + }, + { + "epoch": 0.4151622838190002, + "grad_norm": 0.5703125, + "learning_rate": 0.00012642213782232775, + "loss": 1.0484, + "step": 8762 + }, + { + "epoch": 0.4152096659559346, + "grad_norm": 0.74609375, + "learning_rate": 0.00012640777418773186, + "loss": 1.0012, + "step": 8763 + }, + { + "epoch": 0.415257048092869, + "grad_norm": 0.73828125, + "learning_rate": 0.00012639340996743967, + "loss": 0.5978, + "step": 8764 + }, + { + "epoch": 0.41530443022980335, + "grad_norm": 0.80078125, + "learning_rate": 0.00012637904516176975, + "loss": 0.4532, + "step": 8765 + }, + { + "epoch": 0.41535181236673774, + "grad_norm": 0.85546875, + "learning_rate": 0.0001263646797710407, + "loss": 0.25, + "step": 8766 + }, + { + "epoch": 0.41539919450367213, + "grad_norm": 0.55859375, + "learning_rate": 0.00012635031379557116, + "loss": 1.0736, + "step": 8767 + }, + { + "epoch": 0.41544657664060647, + "grad_norm": 0.625, + "learning_rate": 0.00012633594723567972, + "loss": 1.2801, + "step": 8768 + }, + { + "epoch": 0.41549395877754086, + "grad_norm": 0.39453125, + "learning_rate": 0.000126321580091685, + "loss": 0.3909, + "step": 8769 + }, + { + "epoch": 0.41554134091447525, + "grad_norm": 0.31640625, + "learning_rate": 0.0001263072123639057, + "loss": 0.0057, + "step": 8770 + }, + { + "epoch": 0.4155887230514096, + "grad_norm": 0.609375, + "learning_rate": 0.00012629284405266044, + "loss": 0.879, + "step": 8771 + }, + { + "epoch": 0.415636105188344, + "grad_norm": 0.06005859375, + "learning_rate": 0.0001262784751582679, + "loss": 0.0036, + "step": 8772 + }, + { + "epoch": 0.4156834873252784, + "grad_norm": 0.490234375, + "learning_rate": 0.0001262641056810468, + "loss": 0.0827, + "step": 8773 + }, + { + "epoch": 0.41573086946221277, + "grad_norm": 0.5390625, + "learning_rate": 0.00012624973562131578, + "loss": 0.5373, + "step": 8774 + }, + { + "epoch": 0.4157782515991471, + "grad_norm": 0.267578125, + "learning_rate": 0.00012623536497939356, + "loss": 0.0488, + "step": 8775 + }, + { + "epoch": 0.4158256337360815, + "grad_norm": 0.64453125, + "learning_rate": 0.00012622099375559894, + "loss": 0.0914, + "step": 8776 + }, + { + "epoch": 0.4158730158730159, + "grad_norm": 0.55078125, + "learning_rate": 0.00012620662195025057, + "loss": 0.032, + "step": 8777 + }, + { + "epoch": 0.41592039800995023, + "grad_norm": 0.62109375, + "learning_rate": 0.00012619224956366724, + "loss": 1.1273, + "step": 8778 + }, + { + "epoch": 0.4159677801468846, + "grad_norm": 0.609375, + "learning_rate": 0.00012617787659616775, + "loss": 0.6303, + "step": 8779 + }, + { + "epoch": 0.416015162283819, + "grad_norm": 0.84375, + "learning_rate": 0.00012616350304807083, + "loss": 1.3105, + "step": 8780 + }, + { + "epoch": 0.41606254442075336, + "grad_norm": 0.828125, + "learning_rate": 0.00012614912891969525, + "loss": 1.2002, + "step": 8781 + }, + { + "epoch": 0.41610992655768775, + "grad_norm": 0.62109375, + "learning_rate": 0.00012613475421135987, + "loss": 1.2593, + "step": 8782 + }, + { + "epoch": 0.41615730869462214, + "grad_norm": 1.0859375, + "learning_rate": 0.00012612037892338347, + "loss": 1.1892, + "step": 8783 + }, + { + "epoch": 0.4162046908315565, + "grad_norm": 0.53515625, + "learning_rate": 0.0001261060030560849, + "loss": 0.271, + "step": 8784 + }, + { + "epoch": 0.41625207296849087, + "grad_norm": 0.337890625, + "learning_rate": 0.00012609162660978296, + "loss": 0.1717, + "step": 8785 + }, + { + "epoch": 0.41629945510542526, + "grad_norm": 0.640625, + "learning_rate": 0.00012607724958479656, + "loss": 0.7417, + "step": 8786 + }, + { + "epoch": 0.41634683724235966, + "grad_norm": 0.6953125, + "learning_rate": 0.00012606287198144454, + "loss": 0.5609, + "step": 8787 + }, + { + "epoch": 0.416394219379294, + "grad_norm": 0.55078125, + "learning_rate": 0.00012604849380004577, + "loss": 0.7522, + "step": 8788 + }, + { + "epoch": 0.4164416015162284, + "grad_norm": 0.419921875, + "learning_rate": 0.00012603411504091917, + "loss": 0.8829, + "step": 8789 + }, + { + "epoch": 0.4164889836531628, + "grad_norm": 1.0078125, + "learning_rate": 0.00012601973570438362, + "loss": 0.49, + "step": 8790 + }, + { + "epoch": 0.4165363657900971, + "grad_norm": 0.59765625, + "learning_rate": 0.00012600535579075802, + "loss": 1.0319, + "step": 8791 + }, + { + "epoch": 0.4165837479270315, + "grad_norm": 0.6796875, + "learning_rate": 0.00012599097530036136, + "loss": 0.6672, + "step": 8792 + }, + { + "epoch": 0.4166311300639659, + "grad_norm": 0.7109375, + "learning_rate": 0.00012597659423351252, + "loss": 1.1843, + "step": 8793 + }, + { + "epoch": 0.41667851220090024, + "grad_norm": 0.55859375, + "learning_rate": 0.00012596221259053052, + "loss": 0.6169, + "step": 8794 + }, + { + "epoch": 0.41672589433783463, + "grad_norm": 0.7109375, + "learning_rate": 0.0001259478303717343, + "loss": 1.0632, + "step": 8795 + }, + { + "epoch": 0.416773276474769, + "grad_norm": 0.67578125, + "learning_rate": 0.00012593344757744286, + "loss": 0.8213, + "step": 8796 + }, + { + "epoch": 0.41682065861170337, + "grad_norm": 0.875, + "learning_rate": 0.00012591906420797512, + "loss": 1.3638, + "step": 8797 + }, + { + "epoch": 0.41686804074863776, + "grad_norm": 0.51953125, + "learning_rate": 0.00012590468026365016, + "loss": 0.0186, + "step": 8798 + }, + { + "epoch": 0.41691542288557215, + "grad_norm": 0.166015625, + "learning_rate": 0.00012589029574478698, + "loss": 0.0075, + "step": 8799 + }, + { + "epoch": 0.4169628050225065, + "grad_norm": 0.06396484375, + "learning_rate": 0.00012587591065170466, + "loss": 0.0046, + "step": 8800 + }, + { + "epoch": 0.4170101871594409, + "grad_norm": 1.015625, + "learning_rate": 0.00012586152498472218, + "loss": 1.0377, + "step": 8801 + }, + { + "epoch": 0.4170575692963753, + "grad_norm": 0.640625, + "learning_rate": 0.0001258471387441586, + "loss": 1.0316, + "step": 8802 + }, + { + "epoch": 0.41710495143330967, + "grad_norm": 0.005462646484375, + "learning_rate": 0.00012583275193033303, + "loss": 0.0002, + "step": 8803 + }, + { + "epoch": 0.417152333570244, + "grad_norm": 0.251953125, + "learning_rate": 0.00012581836454356456, + "loss": 0.0459, + "step": 8804 + }, + { + "epoch": 0.4171997157071784, + "grad_norm": 0.71484375, + "learning_rate": 0.00012580397658417224, + "loss": 1.2865, + "step": 8805 + }, + { + "epoch": 0.4172470978441128, + "grad_norm": 0.85546875, + "learning_rate": 0.00012578958805247522, + "loss": 0.7229, + "step": 8806 + }, + { + "epoch": 0.41729447998104713, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012577519894879256, + "loss": 0.0235, + "step": 8807 + }, + { + "epoch": 0.4173418621179815, + "grad_norm": 0.12158203125, + "learning_rate": 0.00012576080927344352, + "loss": 0.0176, + "step": 8808 + }, + { + "epoch": 0.4173892442549159, + "grad_norm": 0.65234375, + "learning_rate": 0.0001257464190267471, + "loss": 0.6251, + "step": 8809 + }, + { + "epoch": 0.41743662639185025, + "grad_norm": 0.625, + "learning_rate": 0.00012573202820902256, + "loss": 1.1001, + "step": 8810 + }, + { + "epoch": 0.41748400852878464, + "grad_norm": 0.451171875, + "learning_rate": 0.00012571763682058904, + "loss": 0.5405, + "step": 8811 + }, + { + "epoch": 0.41753139066571904, + "grad_norm": 0.796875, + "learning_rate": 0.00012570324486176575, + "loss": 0.988, + "step": 8812 + }, + { + "epoch": 0.4175787728026534, + "grad_norm": 0.4921875, + "learning_rate": 0.0001256888523328718, + "loss": 0.0729, + "step": 8813 + }, + { + "epoch": 0.41762615493958777, + "grad_norm": 0.50390625, + "learning_rate": 0.00012567445923422654, + "loss": 0.8022, + "step": 8814 + }, + { + "epoch": 0.41767353707652216, + "grad_norm": 0.390625, + "learning_rate": 0.00012566006556614912, + "loss": 0.0961, + "step": 8815 + }, + { + "epoch": 0.41772091921345655, + "grad_norm": 0.53515625, + "learning_rate": 0.00012564567132895873, + "loss": 0.932, + "step": 8816 + }, + { + "epoch": 0.4177683013503909, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012563127652297467, + "loss": 0.0159, + "step": 8817 + }, + { + "epoch": 0.4178156834873253, + "grad_norm": 0.52734375, + "learning_rate": 0.00012561688114851624, + "loss": 0.6744, + "step": 8818 + }, + { + "epoch": 0.4178630656242597, + "grad_norm": 0.498046875, + "learning_rate": 0.00012560248520590263, + "loss": 0.556, + "step": 8819 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.70703125, + "learning_rate": 0.0001255880886954532, + "loss": 1.4377, + "step": 8820 + }, + { + "epoch": 0.4179578298981284, + "grad_norm": 0.72265625, + "learning_rate": 0.0001255736916174872, + "loss": 1.3856, + "step": 8821 + }, + { + "epoch": 0.4180052120350628, + "grad_norm": 0.52734375, + "learning_rate": 0.00012555929397232396, + "loss": 0.9241, + "step": 8822 + }, + { + "epoch": 0.41805259417199714, + "grad_norm": 0.70703125, + "learning_rate": 0.00012554489576028282, + "loss": 0.9361, + "step": 8823 + }, + { + "epoch": 0.41809997630893153, + "grad_norm": 0.89453125, + "learning_rate": 0.0001255304969816831, + "loss": 0.0571, + "step": 8824 + }, + { + "epoch": 0.4181473584458659, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0001255160976368441, + "loss": 0.0014, + "step": 8825 + }, + { + "epoch": 0.41819474058280026, + "grad_norm": 0.267578125, + "learning_rate": 0.0001255016977260853, + "loss": 0.017, + "step": 8826 + }, + { + "epoch": 0.41824212271973465, + "grad_norm": 0.54296875, + "learning_rate": 0.000125487297249726, + "loss": 0.5335, + "step": 8827 + }, + { + "epoch": 0.41828950485666905, + "grad_norm": 0.8046875, + "learning_rate": 0.00012547289620808556, + "loss": 0.751, + "step": 8828 + }, + { + "epoch": 0.4183368869936034, + "grad_norm": 0.9765625, + "learning_rate": 0.00012545849460148342, + "loss": 0.5795, + "step": 8829 + }, + { + "epoch": 0.4183842691305378, + "grad_norm": 0.14453125, + "learning_rate": 0.000125444092430239, + "loss": 0.018, + "step": 8830 + }, + { + "epoch": 0.41843165126747217, + "grad_norm": 0.6484375, + "learning_rate": 0.00012542968969467172, + "loss": 1.1114, + "step": 8831 + }, + { + "epoch": 0.41847903340440656, + "grad_norm": 0.76171875, + "learning_rate": 0.00012541528639510097, + "loss": 1.3415, + "step": 8832 + }, + { + "epoch": 0.4185264155413409, + "grad_norm": 0.64453125, + "learning_rate": 0.00012540088253184628, + "loss": 0.1147, + "step": 8833 + }, + { + "epoch": 0.4185737976782753, + "grad_norm": 0.0196533203125, + "learning_rate": 0.00012538647810522708, + "loss": 0.0015, + "step": 8834 + }, + { + "epoch": 0.4186211798152097, + "grad_norm": 0.2392578125, + "learning_rate": 0.00012537207311556282, + "loss": 0.0165, + "step": 8835 + }, + { + "epoch": 0.418668561952144, + "grad_norm": 0.431640625, + "learning_rate": 0.000125357667563173, + "loss": 0.0302, + "step": 8836 + }, + { + "epoch": 0.4187159440890784, + "grad_norm": 0.625, + "learning_rate": 0.00012534326144837712, + "loss": 0.0651, + "step": 8837 + }, + { + "epoch": 0.4187633262260128, + "grad_norm": 0.8125, + "learning_rate": 0.0001253288547714947, + "loss": 1.0661, + "step": 8838 + }, + { + "epoch": 0.41881070836294715, + "grad_norm": 0.6796875, + "learning_rate": 0.00012531444753284524, + "loss": 0.9368, + "step": 8839 + }, + { + "epoch": 0.41885809049988154, + "grad_norm": 0.58203125, + "learning_rate": 0.0001253000397327483, + "loss": 1.0135, + "step": 8840 + }, + { + "epoch": 0.41890547263681593, + "grad_norm": 0.6484375, + "learning_rate": 0.00012528563137152344, + "loss": 0.989, + "step": 8841 + }, + { + "epoch": 0.41895285477375027, + "grad_norm": 0.55078125, + "learning_rate": 0.00012527122244949024, + "loss": 0.571, + "step": 8842 + }, + { + "epoch": 0.41900023691068466, + "grad_norm": 0.7890625, + "learning_rate": 0.0001252568129669682, + "loss": 0.7921, + "step": 8843 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 0.62109375, + "learning_rate": 0.00012524240292427697, + "loss": 0.8786, + "step": 8844 + }, + { + "epoch": 0.41909500118455345, + "grad_norm": 0.5625, + "learning_rate": 0.00012522799232173612, + "loss": 0.8598, + "step": 8845 + }, + { + "epoch": 0.4191423833214878, + "grad_norm": 0.609375, + "learning_rate": 0.00012521358115966528, + "loss": 0.6554, + "step": 8846 + }, + { + "epoch": 0.4191897654584222, + "grad_norm": 0.9453125, + "learning_rate": 0.00012519916943838405, + "loss": 0.698, + "step": 8847 + }, + { + "epoch": 0.4192371475953566, + "grad_norm": 0.375, + "learning_rate": 0.0001251847571582121, + "loss": 0.1573, + "step": 8848 + }, + { + "epoch": 0.4192845297322909, + "grad_norm": 0.796875, + "learning_rate": 0.0001251703443194691, + "loss": 1.1251, + "step": 8849 + }, + { + "epoch": 0.4193319118692253, + "grad_norm": 0.59765625, + "learning_rate": 0.0001251559309224746, + "loss": 1.0505, + "step": 8850 + }, + { + "epoch": 0.4193792940061597, + "grad_norm": 0.58984375, + "learning_rate": 0.0001251415169675484, + "loss": 0.6131, + "step": 8851 + }, + { + "epoch": 0.41942667614309403, + "grad_norm": 0.80078125, + "learning_rate": 0.0001251271024550101, + "loss": 0.8077, + "step": 8852 + }, + { + "epoch": 0.41947405828002843, + "grad_norm": 0.73046875, + "learning_rate": 0.00012511268738517943, + "loss": 1.0084, + "step": 8853 + }, + { + "epoch": 0.4195214404169628, + "grad_norm": 1.0078125, + "learning_rate": 0.00012509827175837614, + "loss": 0.3676, + "step": 8854 + }, + { + "epoch": 0.41956882255389716, + "grad_norm": 0.5859375, + "learning_rate": 0.00012508385557491987, + "loss": 0.7035, + "step": 8855 + }, + { + "epoch": 0.41961620469083155, + "grad_norm": 0.53125, + "learning_rate": 0.00012506943883513043, + "loss": 1.079, + "step": 8856 + }, + { + "epoch": 0.41966358682776594, + "grad_norm": 0.59765625, + "learning_rate": 0.00012505502153932753, + "loss": 1.1291, + "step": 8857 + }, + { + "epoch": 0.4197109689647003, + "grad_norm": 0.671875, + "learning_rate": 0.00012504060368783096, + "loss": 1.1054, + "step": 8858 + }, + { + "epoch": 0.4197583511016347, + "grad_norm": 0.6640625, + "learning_rate": 0.00012502618528096045, + "loss": 1.0571, + "step": 8859 + }, + { + "epoch": 0.41980573323856907, + "grad_norm": 0.81640625, + "learning_rate": 0.00012501176631903583, + "loss": 0.9061, + "step": 8860 + }, + { + "epoch": 0.41985311537550346, + "grad_norm": 0.6953125, + "learning_rate": 0.00012499734680237684, + "loss": 1.1701, + "step": 8861 + }, + { + "epoch": 0.4199004975124378, + "grad_norm": 0.62890625, + "learning_rate": 0.00012498292673130334, + "loss": 0.6589, + "step": 8862 + }, + { + "epoch": 0.4199478796493722, + "grad_norm": 0.7890625, + "learning_rate": 0.00012496850610613512, + "loss": 1.3303, + "step": 8863 + }, + { + "epoch": 0.4199952617863066, + "grad_norm": 0.55078125, + "learning_rate": 0.00012495408492719205, + "loss": 0.1845, + "step": 8864 + }, + { + "epoch": 0.4200426439232409, + "grad_norm": 0.25390625, + "learning_rate": 0.00012493966319479397, + "loss": 0.0253, + "step": 8865 + }, + { + "epoch": 0.4200900260601753, + "grad_norm": 1.109375, + "learning_rate": 0.0001249252409092607, + "loss": 1.3075, + "step": 8866 + }, + { + "epoch": 0.4201374081971097, + "grad_norm": 0.60546875, + "learning_rate": 0.00012491081807091214, + "loss": 0.6048, + "step": 8867 + }, + { + "epoch": 0.42018479033404404, + "grad_norm": 0.6171875, + "learning_rate": 0.0001248963946800682, + "loss": 0.9905, + "step": 8868 + }, + { + "epoch": 0.42023217247097844, + "grad_norm": 0.54296875, + "learning_rate": 0.0001248819707370487, + "loss": 1.0346, + "step": 8869 + }, + { + "epoch": 0.42027955460791283, + "grad_norm": 0.2138671875, + "learning_rate": 0.00012486754624217361, + "loss": 0.0315, + "step": 8870 + }, + { + "epoch": 0.42032693674484717, + "grad_norm": 0.63671875, + "learning_rate": 0.0001248531211957628, + "loss": 0.9402, + "step": 8871 + }, + { + "epoch": 0.42037431888178156, + "grad_norm": 0.65234375, + "learning_rate": 0.00012483869559813627, + "loss": 0.748, + "step": 8872 + }, + { + "epoch": 0.42042170101871595, + "grad_norm": 0.57421875, + "learning_rate": 0.00012482426944961396, + "loss": 1.1636, + "step": 8873 + }, + { + "epoch": 0.42046908315565035, + "grad_norm": 0.255859375, + "learning_rate": 0.00012480984275051575, + "loss": 0.0295, + "step": 8874 + }, + { + "epoch": 0.4205164652925847, + "grad_norm": 0.291015625, + "learning_rate": 0.00012479541550116166, + "loss": 0.026, + "step": 8875 + }, + { + "epoch": 0.4205638474295191, + "grad_norm": 0.640625, + "learning_rate": 0.00012478098770187166, + "loss": 0.3643, + "step": 8876 + }, + { + "epoch": 0.42061122956645347, + "grad_norm": 0.7734375, + "learning_rate": 0.00012476655935296575, + "loss": 1.3416, + "step": 8877 + }, + { + "epoch": 0.4206586117033878, + "grad_norm": 0.6640625, + "learning_rate": 0.00012475213045476394, + "loss": 1.367, + "step": 8878 + }, + { + "epoch": 0.4207059938403222, + "grad_norm": 0.345703125, + "learning_rate": 0.00012473770100758624, + "loss": 0.0348, + "step": 8879 + }, + { + "epoch": 0.4207533759772566, + "grad_norm": 0.53515625, + "learning_rate": 0.0001247232710117527, + "loss": 0.4657, + "step": 8880 + }, + { + "epoch": 0.42080075811419093, + "grad_norm": 0.56640625, + "learning_rate": 0.00012470884046758332, + "loss": 0.5723, + "step": 8881 + }, + { + "epoch": 0.4208481402511253, + "grad_norm": 0.671875, + "learning_rate": 0.00012469440937539817, + "loss": 0.9915, + "step": 8882 + }, + { + "epoch": 0.4208955223880597, + "grad_norm": 0.58203125, + "learning_rate": 0.00012467997773551735, + "loss": 1.0745, + "step": 8883 + }, + { + "epoch": 0.42094290452499405, + "grad_norm": 0.65234375, + "learning_rate": 0.00012466554554826088, + "loss": 1.0999, + "step": 8884 + }, + { + "epoch": 0.42099028666192845, + "grad_norm": 0.68359375, + "learning_rate": 0.0001246511128139489, + "loss": 1.0777, + "step": 8885 + }, + { + "epoch": 0.42103766879886284, + "grad_norm": 0.6640625, + "learning_rate": 0.0001246366795329015, + "loss": 1.0084, + "step": 8886 + }, + { + "epoch": 0.4210850509357972, + "grad_norm": 0.55859375, + "learning_rate": 0.00012462224570543876, + "loss": 0.646, + "step": 8887 + }, + { + "epoch": 0.42113243307273157, + "grad_norm": 0.1943359375, + "learning_rate": 0.00012460781133188088, + "loss": 0.1525, + "step": 8888 + }, + { + "epoch": 0.42117981520966596, + "grad_norm": 0.59765625, + "learning_rate": 0.00012459337641254796, + "loss": 0.1222, + "step": 8889 + }, + { + "epoch": 0.42122719734660036, + "grad_norm": 0.78515625, + "learning_rate": 0.0001245789409477601, + "loss": 0.7078, + "step": 8890 + }, + { + "epoch": 0.4212745794835347, + "grad_norm": 0.73046875, + "learning_rate": 0.00012456450493783752, + "loss": 0.5646, + "step": 8891 + }, + { + "epoch": 0.4213219616204691, + "grad_norm": 0.765625, + "learning_rate": 0.00012455006838310042, + "loss": 0.8092, + "step": 8892 + }, + { + "epoch": 0.4213693437574035, + "grad_norm": 0.8125, + "learning_rate": 0.00012453563128386893, + "loss": 0.9258, + "step": 8893 + }, + { + "epoch": 0.4214167258943378, + "grad_norm": 0.70703125, + "learning_rate": 0.00012452119364046325, + "loss": 1.4296, + "step": 8894 + }, + { + "epoch": 0.4214641080312722, + "grad_norm": 0.7578125, + "learning_rate": 0.00012450675545320366, + "loss": 1.3191, + "step": 8895 + }, + { + "epoch": 0.4215114901682066, + "grad_norm": 0.2177734375, + "learning_rate": 0.00012449231672241032, + "loss": 0.0116, + "step": 8896 + }, + { + "epoch": 0.42155887230514094, + "grad_norm": 0.002197265625, + "learning_rate": 0.0001244778774484035, + "loss": 0.0001, + "step": 8897 + }, + { + "epoch": 0.42160625444207533, + "grad_norm": 0.42578125, + "learning_rate": 0.0001244634376315034, + "loss": 0.0157, + "step": 8898 + }, + { + "epoch": 0.4216536365790097, + "grad_norm": 0.6796875, + "learning_rate": 0.0001244489972720303, + "loss": 1.2617, + "step": 8899 + }, + { + "epoch": 0.42170101871594406, + "grad_norm": 0.54296875, + "learning_rate": 0.00012443455637030452, + "loss": 1.3783, + "step": 8900 + }, + { + "epoch": 0.42174840085287846, + "grad_norm": 0.95703125, + "learning_rate": 0.00012442011492664628, + "loss": 1.0486, + "step": 8901 + }, + { + "epoch": 0.42179578298981285, + "grad_norm": 0.62890625, + "learning_rate": 0.0001244056729413759, + "loss": 1.0188, + "step": 8902 + }, + { + "epoch": 0.42184316512674724, + "grad_norm": 0.73828125, + "learning_rate": 0.0001243912304148137, + "loss": 1.1449, + "step": 8903 + }, + { + "epoch": 0.4218905472636816, + "grad_norm": 0.515625, + "learning_rate": 0.00012437678734728, + "loss": 0.5396, + "step": 8904 + }, + { + "epoch": 0.421937929400616, + "grad_norm": 0.91796875, + "learning_rate": 0.00012436234373909512, + "loss": 0.8482, + "step": 8905 + }, + { + "epoch": 0.42198531153755037, + "grad_norm": 0.140625, + "learning_rate": 0.0001243478995905794, + "loss": 0.0132, + "step": 8906 + }, + { + "epoch": 0.4220326936744847, + "grad_norm": 0.5859375, + "learning_rate": 0.00012433345490205322, + "loss": 0.5625, + "step": 8907 + }, + { + "epoch": 0.4220800758114191, + "grad_norm": 0.2490234375, + "learning_rate": 0.00012431900967383686, + "loss": 0.0742, + "step": 8908 + }, + { + "epoch": 0.4221274579483535, + "grad_norm": 0.56640625, + "learning_rate": 0.00012430456390625082, + "loss": 0.529, + "step": 8909 + }, + { + "epoch": 0.42217484008528783, + "grad_norm": 0.80078125, + "learning_rate": 0.00012429011759961545, + "loss": 0.6306, + "step": 8910 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.56640625, + "learning_rate": 0.00012427567075425113, + "loss": 0.1279, + "step": 8911 + }, + { + "epoch": 0.4222696043591566, + "grad_norm": 0.65625, + "learning_rate": 0.0001242612233704783, + "loss": 1.2095, + "step": 8912 + }, + { + "epoch": 0.42231698649609095, + "grad_norm": 0.78125, + "learning_rate": 0.00012424677544861738, + "loss": 0.959, + "step": 8913 + }, + { + "epoch": 0.42236436863302534, + "grad_norm": 0.79296875, + "learning_rate": 0.00012423232698898878, + "loss": 0.9641, + "step": 8914 + }, + { + "epoch": 0.42241175076995974, + "grad_norm": 0.2021484375, + "learning_rate": 0.000124217877991913, + "loss": 0.016, + "step": 8915 + }, + { + "epoch": 0.4224591329068941, + "grad_norm": 0.74609375, + "learning_rate": 0.00012420342845771048, + "loss": 0.5453, + "step": 8916 + }, + { + "epoch": 0.42250651504382847, + "grad_norm": 0.6484375, + "learning_rate": 0.00012418897838670166, + "loss": 0.8954, + "step": 8917 + }, + { + "epoch": 0.42255389718076286, + "grad_norm": 0.6796875, + "learning_rate": 0.00012417452777920712, + "loss": 1.2048, + "step": 8918 + }, + { + "epoch": 0.42260127931769725, + "grad_norm": 0.158203125, + "learning_rate": 0.0001241600766355473, + "loss": 0.01, + "step": 8919 + }, + { + "epoch": 0.4226486614546316, + "grad_norm": 0.65625, + "learning_rate": 0.00012414562495604268, + "loss": 1.2205, + "step": 8920 + }, + { + "epoch": 0.422696043591566, + "grad_norm": 0.94921875, + "learning_rate": 0.00012413117274101386, + "loss": 1.1835, + "step": 8921 + }, + { + "epoch": 0.4227434257285004, + "grad_norm": 0.0703125, + "learning_rate": 0.00012411671999078128, + "loss": 0.0057, + "step": 8922 + }, + { + "epoch": 0.4227908078654347, + "grad_norm": 0.67578125, + "learning_rate": 0.0001241022667056656, + "loss": 0.8422, + "step": 8923 + }, + { + "epoch": 0.4228381900023691, + "grad_norm": 0.4140625, + "learning_rate": 0.00012408781288598725, + "loss": 0.0834, + "step": 8924 + }, + { + "epoch": 0.4228855721393035, + "grad_norm": 0.63671875, + "learning_rate": 0.0001240733585320669, + "loss": 0.8948, + "step": 8925 + }, + { + "epoch": 0.42293295427623784, + "grad_norm": 0.61328125, + "learning_rate": 0.0001240589036442251, + "loss": 1.1781, + "step": 8926 + }, + { + "epoch": 0.42298033641317223, + "grad_norm": 0.67578125, + "learning_rate": 0.00012404444822278242, + "loss": 0.6403, + "step": 8927 + }, + { + "epoch": 0.4230277185501066, + "grad_norm": 0.703125, + "learning_rate": 0.00012402999226805955, + "loss": 0.6801, + "step": 8928 + }, + { + "epoch": 0.42307510068704096, + "grad_norm": 0.19921875, + "learning_rate": 0.00012401553578037698, + "loss": 0.0082, + "step": 8929 + }, + { + "epoch": 0.42312248282397535, + "grad_norm": 0.66796875, + "learning_rate": 0.00012400107876005544, + "loss": 0.8599, + "step": 8930 + }, + { + "epoch": 0.42316986496090975, + "grad_norm": 0.80859375, + "learning_rate": 0.00012398662120741553, + "loss": 0.2746, + "step": 8931 + }, + { + "epoch": 0.42321724709784414, + "grad_norm": 0.625, + "learning_rate": 0.0001239721631227779, + "loss": 1.0153, + "step": 8932 + }, + { + "epoch": 0.4232646292347785, + "grad_norm": 0.63671875, + "learning_rate": 0.00012395770450646324, + "loss": 0.7281, + "step": 8933 + }, + { + "epoch": 0.42331201137171287, + "grad_norm": 0.6015625, + "learning_rate": 0.00012394324535879223, + "loss": 0.9411, + "step": 8934 + }, + { + "epoch": 0.42335939350864726, + "grad_norm": 0.11767578125, + "learning_rate": 0.00012392878568008554, + "loss": 0.0147, + "step": 8935 + }, + { + "epoch": 0.4234067756455816, + "grad_norm": 0.62109375, + "learning_rate": 0.00012391432547066383, + "loss": 1.3564, + "step": 8936 + }, + { + "epoch": 0.423454157782516, + "grad_norm": 0.578125, + "learning_rate": 0.0001238998647308479, + "loss": 1.171, + "step": 8937 + }, + { + "epoch": 0.4235015399194504, + "grad_norm": 0.71484375, + "learning_rate": 0.0001238854034609584, + "loss": 1.0655, + "step": 8938 + }, + { + "epoch": 0.4235489220563847, + "grad_norm": 0.734375, + "learning_rate": 0.0001238709416613161, + "loss": 0.6354, + "step": 8939 + }, + { + "epoch": 0.4235963041933191, + "grad_norm": 0.8359375, + "learning_rate": 0.00012385647933224172, + "loss": 1.0352, + "step": 8940 + }, + { + "epoch": 0.4236436863302535, + "grad_norm": 0.55078125, + "learning_rate": 0.00012384201647405607, + "loss": 0.0452, + "step": 8941 + }, + { + "epoch": 0.42369106846718785, + "grad_norm": 0.8203125, + "learning_rate": 0.00012382755308707987, + "loss": 0.2959, + "step": 8942 + }, + { + "epoch": 0.42373845060412224, + "grad_norm": 0.6796875, + "learning_rate": 0.00012381308917163393, + "loss": 0.7554, + "step": 8943 + }, + { + "epoch": 0.42378583274105663, + "grad_norm": 0.55859375, + "learning_rate": 0.00012379862472803904, + "loss": 0.5578, + "step": 8944 + }, + { + "epoch": 0.42383321487799097, + "grad_norm": 0.57421875, + "learning_rate": 0.000123784159756616, + "loss": 0.9, + "step": 8945 + }, + { + "epoch": 0.42388059701492536, + "grad_norm": 0.6953125, + "learning_rate": 0.00012376969425768563, + "loss": 1.1052, + "step": 8946 + }, + { + "epoch": 0.42392797915185976, + "grad_norm": 0.7890625, + "learning_rate": 0.00012375522823156876, + "loss": 1.1587, + "step": 8947 + }, + { + "epoch": 0.42397536128879415, + "grad_norm": 0.6484375, + "learning_rate": 0.0001237407616785862, + "loss": 1.0681, + "step": 8948 + }, + { + "epoch": 0.4240227434257285, + "grad_norm": 0.1015625, + "learning_rate": 0.00012372629459905888, + "loss": 0.0124, + "step": 8949 + }, + { + "epoch": 0.4240701255626629, + "grad_norm": 0.484375, + "learning_rate": 0.0001237118269933076, + "loss": 0.7137, + "step": 8950 + }, + { + "epoch": 0.4241175076995973, + "grad_norm": 0.6015625, + "learning_rate": 0.00012369735886165326, + "loss": 0.7155, + "step": 8951 + }, + { + "epoch": 0.4241648898365316, + "grad_norm": 0.283203125, + "learning_rate": 0.00012368289020441676, + "loss": 0.0344, + "step": 8952 + }, + { + "epoch": 0.424212271973466, + "grad_norm": 0.6171875, + "learning_rate": 0.00012366842102191894, + "loss": 0.7535, + "step": 8953 + }, + { + "epoch": 0.4242596541104004, + "grad_norm": 0.7109375, + "learning_rate": 0.0001236539513144808, + "loss": 0.624, + "step": 8954 + }, + { + "epoch": 0.42430703624733473, + "grad_norm": 0.671875, + "learning_rate": 0.00012363948108242315, + "loss": 0.4886, + "step": 8955 + }, + { + "epoch": 0.4243544183842691, + "grad_norm": 0.6640625, + "learning_rate": 0.00012362501032606702, + "loss": 0.7854, + "step": 8956 + }, + { + "epoch": 0.4244018005212035, + "grad_norm": 0.69921875, + "learning_rate": 0.00012361053904573333, + "loss": 1.3908, + "step": 8957 + }, + { + "epoch": 0.42444918265813786, + "grad_norm": 0.71875, + "learning_rate": 0.00012359606724174303, + "loss": 0.9789, + "step": 8958 + }, + { + "epoch": 0.42449656479507225, + "grad_norm": 0.40625, + "learning_rate": 0.0001235815949144171, + "loss": 0.03, + "step": 8959 + }, + { + "epoch": 0.42454394693200664, + "grad_norm": 0.89453125, + "learning_rate": 0.00012356712206407653, + "loss": 0.806, + "step": 8960 + }, + { + "epoch": 0.42459132906894104, + "grad_norm": 0.58984375, + "learning_rate": 0.0001235526486910423, + "loss": 0.1725, + "step": 8961 + }, + { + "epoch": 0.4246387112058754, + "grad_norm": 0.921875, + "learning_rate": 0.00012353817479563535, + "loss": 0.9963, + "step": 8962 + }, + { + "epoch": 0.42468609334280977, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001235237003781768, + "loss": 0.0063, + "step": 8963 + }, + { + "epoch": 0.42473347547974416, + "grad_norm": 0.0986328125, + "learning_rate": 0.00012350922543898757, + "loss": 0.0165, + "step": 8964 + }, + { + "epoch": 0.4247808576166785, + "grad_norm": 0.74609375, + "learning_rate": 0.00012349474997838883, + "loss": 1.1218, + "step": 8965 + }, + { + "epoch": 0.4248282397536129, + "grad_norm": 0.62890625, + "learning_rate": 0.00012348027399670155, + "loss": 1.1432, + "step": 8966 + }, + { + "epoch": 0.4248756218905473, + "grad_norm": 0.06640625, + "learning_rate": 0.00012346579749424679, + "loss": 0.0073, + "step": 8967 + }, + { + "epoch": 0.4249230040274816, + "grad_norm": 0.6015625, + "learning_rate": 0.00012345132047134565, + "loss": 1.2107, + "step": 8968 + }, + { + "epoch": 0.424970386164416, + "grad_norm": 0.236328125, + "learning_rate": 0.00012343684292831917, + "loss": 0.0673, + "step": 8969 + }, + { + "epoch": 0.4250177683013504, + "grad_norm": 0.765625, + "learning_rate": 0.0001234223648654885, + "loss": 1.0352, + "step": 8970 + }, + { + "epoch": 0.42506515043828474, + "grad_norm": 0.65625, + "learning_rate": 0.00012340788628317472, + "loss": 0.9669, + "step": 8971 + }, + { + "epoch": 0.42511253257521914, + "grad_norm": 0.7265625, + "learning_rate": 0.000123393407181699, + "loss": 0.526, + "step": 8972 + }, + { + "epoch": 0.42515991471215353, + "grad_norm": 0.2080078125, + "learning_rate": 0.00012337892756138235, + "loss": 0.1482, + "step": 8973 + }, + { + "epoch": 0.42520729684908787, + "grad_norm": 0.8828125, + "learning_rate": 0.00012336444742254603, + "loss": 0.4275, + "step": 8974 + }, + { + "epoch": 0.42525467898602226, + "grad_norm": 0.66796875, + "learning_rate": 0.00012334996676551115, + "loss": 0.082, + "step": 8975 + }, + { + "epoch": 0.42530206112295665, + "grad_norm": 0.369140625, + "learning_rate": 0.0001233354855905989, + "loss": 0.0929, + "step": 8976 + }, + { + "epoch": 0.42534944325989105, + "grad_norm": 0.67578125, + "learning_rate": 0.0001233210038981304, + "loss": 0.9906, + "step": 8977 + }, + { + "epoch": 0.4253968253968254, + "grad_norm": 0.75390625, + "learning_rate": 0.00012330652168842686, + "loss": 0.8739, + "step": 8978 + }, + { + "epoch": 0.4254442075337598, + "grad_norm": 0.90625, + "learning_rate": 0.00012329203896180953, + "loss": 0.9353, + "step": 8979 + }, + { + "epoch": 0.42549158967069417, + "grad_norm": 0.5859375, + "learning_rate": 0.0001232775557185996, + "loss": 0.1222, + "step": 8980 + }, + { + "epoch": 0.4255389718076285, + "grad_norm": 0.8828125, + "learning_rate": 0.00012326307195911822, + "loss": 1.0315, + "step": 8981 + }, + { + "epoch": 0.4255863539445629, + "grad_norm": 0.60546875, + "learning_rate": 0.0001232485876836867, + "loss": 0.897, + "step": 8982 + }, + { + "epoch": 0.4256337360814973, + "grad_norm": 0.73828125, + "learning_rate": 0.0001232341028926263, + "loss": 1.1748, + "step": 8983 + }, + { + "epoch": 0.42568111821843163, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012321961758625824, + "loss": 0.0113, + "step": 8984 + }, + { + "epoch": 0.425728500355366, + "grad_norm": 0.6015625, + "learning_rate": 0.00012320513176490377, + "loss": 0.4843, + "step": 8985 + }, + { + "epoch": 0.4257758824923004, + "grad_norm": 0.68359375, + "learning_rate": 0.0001231906454288842, + "loss": 1.2123, + "step": 8986 + }, + { + "epoch": 0.42582326462923475, + "grad_norm": 0.84765625, + "learning_rate": 0.00012317615857852083, + "loss": 0.4092, + "step": 8987 + }, + { + "epoch": 0.42587064676616915, + "grad_norm": 1.265625, + "learning_rate": 0.00012316167121413497, + "loss": 1.2504, + "step": 8988 + }, + { + "epoch": 0.42591802890310354, + "grad_norm": 0.74609375, + "learning_rate": 0.00012314718333604786, + "loss": 0.5383, + "step": 8989 + }, + { + "epoch": 0.42596541104003793, + "grad_norm": 0.0031585693359375, + "learning_rate": 0.00012313269494458088, + "loss": 0.0002, + "step": 8990 + }, + { + "epoch": 0.42601279317697227, + "grad_norm": 0.2353515625, + "learning_rate": 0.00012311820604005543, + "loss": 0.0202, + "step": 8991 + }, + { + "epoch": 0.42606017531390666, + "grad_norm": 0.6953125, + "learning_rate": 0.00012310371662279277, + "loss": 1.3952, + "step": 8992 + }, + { + "epoch": 0.42610755745084106, + "grad_norm": 0.447265625, + "learning_rate": 0.00012308922669311426, + "loss": 0.7924, + "step": 8993 + }, + { + "epoch": 0.4261549395877754, + "grad_norm": 0.57421875, + "learning_rate": 0.0001230747362513413, + "loss": 1.1176, + "step": 8994 + }, + { + "epoch": 0.4262023217247098, + "grad_norm": 0.8515625, + "learning_rate": 0.00012306024529779526, + "loss": 1.4522, + "step": 8995 + }, + { + "epoch": 0.4262497038616442, + "grad_norm": 0.42578125, + "learning_rate": 0.00012304575383279755, + "loss": 0.1515, + "step": 8996 + }, + { + "epoch": 0.4262970859985785, + "grad_norm": 0.07470703125, + "learning_rate": 0.00012303126185666958, + "loss": 0.0047, + "step": 8997 + }, + { + "epoch": 0.4263444681355129, + "grad_norm": 0.466796875, + "learning_rate": 0.00012301676936973273, + "loss": 0.2999, + "step": 8998 + }, + { + "epoch": 0.4263918502724473, + "grad_norm": 0.5234375, + "learning_rate": 0.0001230022763723085, + "loss": 1.2043, + "step": 8999 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 0.76171875, + "learning_rate": 0.00012298778286471825, + "loss": 1.2351, + "step": 9000 + }, + { + "epoch": 0.42648661454631603, + "grad_norm": 0.193359375, + "learning_rate": 0.00012297328884728346, + "loss": 0.1498, + "step": 9001 + }, + { + "epoch": 0.4265339966832504, + "grad_norm": 0.15625, + "learning_rate": 0.00012295879432032558, + "loss": 0.043, + "step": 9002 + }, + { + "epoch": 0.42658137882018476, + "grad_norm": 0.77734375, + "learning_rate": 0.0001229442992841661, + "loss": 1.1026, + "step": 9003 + }, + { + "epoch": 0.42662876095711916, + "grad_norm": 0.78125, + "learning_rate": 0.00012292980373912652, + "loss": 0.0639, + "step": 9004 + }, + { + "epoch": 0.42667614309405355, + "grad_norm": 0.765625, + "learning_rate": 0.0001229153076855283, + "loss": 1.012, + "step": 9005 + }, + { + "epoch": 0.42672352523098794, + "grad_norm": 0.56640625, + "learning_rate": 0.00012290081112369298, + "loss": 1.2843, + "step": 9006 + }, + { + "epoch": 0.4267709073679223, + "grad_norm": 0.859375, + "learning_rate": 0.00012288631405394206, + "loss": 0.7494, + "step": 9007 + }, + { + "epoch": 0.4268182895048567, + "grad_norm": 0.69140625, + "learning_rate": 0.00012287181647659706, + "loss": 0.8909, + "step": 9008 + }, + { + "epoch": 0.42686567164179107, + "grad_norm": 0.6796875, + "learning_rate": 0.00012285731839197954, + "loss": 1.3663, + "step": 9009 + }, + { + "epoch": 0.4269130537787254, + "grad_norm": 0.75390625, + "learning_rate": 0.00012284281980041103, + "loss": 1.3305, + "step": 9010 + }, + { + "epoch": 0.4269604359156598, + "grad_norm": 0.59765625, + "learning_rate": 0.00012282832070221314, + "loss": 1.2073, + "step": 9011 + }, + { + "epoch": 0.4270078180525942, + "grad_norm": 0.73046875, + "learning_rate": 0.0001228138210977074, + "loss": 0.1763, + "step": 9012 + }, + { + "epoch": 0.4270552001895285, + "grad_norm": 0.62109375, + "learning_rate": 0.0001227993209872154, + "loss": 1.3054, + "step": 9013 + }, + { + "epoch": 0.4271025823264629, + "grad_norm": 0.67578125, + "learning_rate": 0.00012278482037105873, + "loss": 1.2087, + "step": 9014 + }, + { + "epoch": 0.4271499644633973, + "grad_norm": 0.51171875, + "learning_rate": 0.00012277031924955905, + "loss": 0.0055, + "step": 9015 + }, + { + "epoch": 0.42719734660033165, + "grad_norm": 0.8984375, + "learning_rate": 0.0001227558176230379, + "loss": 0.9413, + "step": 9016 + }, + { + "epoch": 0.42724472873726604, + "grad_norm": 0.2158203125, + "learning_rate": 0.00012274131549181697, + "loss": 0.0318, + "step": 9017 + }, + { + "epoch": 0.42729211087420044, + "grad_norm": 0.88671875, + "learning_rate": 0.0001227268128562179, + "loss": 0.8257, + "step": 9018 + }, + { + "epoch": 0.4273394930111348, + "grad_norm": 0.625, + "learning_rate": 0.00012271230971656235, + "loss": 1.3141, + "step": 9019 + }, + { + "epoch": 0.42738687514806917, + "grad_norm": 0.9765625, + "learning_rate": 0.0001226978060731719, + "loss": 1.0274, + "step": 9020 + }, + { + "epoch": 0.42743425728500356, + "grad_norm": 0.435546875, + "learning_rate": 0.00012268330192636832, + "loss": 0.0155, + "step": 9021 + }, + { + "epoch": 0.42748163942193795, + "grad_norm": 0.58984375, + "learning_rate": 0.00012266879727647326, + "loss": 0.8564, + "step": 9022 + }, + { + "epoch": 0.4275290215588723, + "grad_norm": 0.7734375, + "learning_rate": 0.00012265429212380847, + "loss": 1.0449, + "step": 9023 + }, + { + "epoch": 0.4275764036958067, + "grad_norm": 0.58984375, + "learning_rate": 0.00012263978646869555, + "loss": 1.0867, + "step": 9024 + }, + { + "epoch": 0.4276237858327411, + "grad_norm": 0.8984375, + "learning_rate": 0.00012262528031145627, + "loss": 1.1932, + "step": 9025 + }, + { + "epoch": 0.4276711679696754, + "grad_norm": 0.625, + "learning_rate": 0.00012261077365241242, + "loss": 0.8738, + "step": 9026 + }, + { + "epoch": 0.4277185501066098, + "grad_norm": 0.65625, + "learning_rate": 0.00012259626649188568, + "loss": 0.9239, + "step": 9027 + }, + { + "epoch": 0.4277659322435442, + "grad_norm": 0.65234375, + "learning_rate": 0.0001225817588301978, + "loss": 0.7741, + "step": 9028 + }, + { + "epoch": 0.42781331438047854, + "grad_norm": 0.73046875, + "learning_rate": 0.0001225672506676706, + "loss": 1.0425, + "step": 9029 + }, + { + "epoch": 0.42786069651741293, + "grad_norm": 0.55859375, + "learning_rate": 0.0001225527420046258, + "loss": 0.6123, + "step": 9030 + }, + { + "epoch": 0.4279080786543473, + "grad_norm": 0.021240234375, + "learning_rate": 0.0001225382328413852, + "loss": 0.0008, + "step": 9031 + }, + { + "epoch": 0.42795546079128166, + "grad_norm": 0.6328125, + "learning_rate": 0.00012252372317827056, + "loss": 0.8573, + "step": 9032 + }, + { + "epoch": 0.42800284292821605, + "grad_norm": 0.703125, + "learning_rate": 0.00012250921301560377, + "loss": 1.1215, + "step": 9033 + }, + { + "epoch": 0.42805022506515045, + "grad_norm": 0.17578125, + "learning_rate": 0.00012249470235370664, + "loss": 0.0241, + "step": 9034 + }, + { + "epoch": 0.42809760720208484, + "grad_norm": 0.63671875, + "learning_rate": 0.00012248019119290093, + "loss": 1.162, + "step": 9035 + }, + { + "epoch": 0.4281449893390192, + "grad_norm": 0.6328125, + "learning_rate": 0.00012246567953350852, + "loss": 1.4999, + "step": 9036 + }, + { + "epoch": 0.42819237147595357, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001224511673758513, + "loss": 0.0294, + "step": 9037 + }, + { + "epoch": 0.42823975361288796, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00012243665472025112, + "loss": 0.0009, + "step": 9038 + }, + { + "epoch": 0.4282871357498223, + "grad_norm": 0.71484375, + "learning_rate": 0.0001224221415670298, + "loss": 1.0573, + "step": 9039 + }, + { + "epoch": 0.4283345178867567, + "grad_norm": 0.4765625, + "learning_rate": 0.00012240762791650923, + "loss": 0.7919, + "step": 9040 + }, + { + "epoch": 0.4283819000236911, + "grad_norm": 0.28515625, + "learning_rate": 0.0001223931137690114, + "loss": 0.0177, + "step": 9041 + }, + { + "epoch": 0.4284292821606254, + "grad_norm": 0.7578125, + "learning_rate": 0.00012237859912485815, + "loss": 1.1608, + "step": 9042 + }, + { + "epoch": 0.4284766642975598, + "grad_norm": 0.6953125, + "learning_rate": 0.00012236408398437135, + "loss": 0.5714, + "step": 9043 + }, + { + "epoch": 0.4285240464344942, + "grad_norm": 0.416015625, + "learning_rate": 0.00012234956834787303, + "loss": 0.0568, + "step": 9044 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.8359375, + "learning_rate": 0.00012233505221568512, + "loss": 1.2182, + "step": 9045 + }, + { + "epoch": 0.42861881070836294, + "grad_norm": 0.67578125, + "learning_rate": 0.00012232053558812952, + "loss": 0.1509, + "step": 9046 + }, + { + "epoch": 0.42866619284529733, + "grad_norm": 0.76953125, + "learning_rate": 0.0001223060184655282, + "loss": 1.013, + "step": 9047 + }, + { + "epoch": 0.42871357498223167, + "grad_norm": 0.76171875, + "learning_rate": 0.00012229150084820315, + "loss": 1.0359, + "step": 9048 + }, + { + "epoch": 0.42876095711916606, + "grad_norm": 0.52734375, + "learning_rate": 0.0001222769827364764, + "loss": 1.0239, + "step": 9049 + }, + { + "epoch": 0.42880833925610046, + "grad_norm": 0.00162506103515625, + "learning_rate": 0.00012226246413066984, + "loss": 0.0001, + "step": 9050 + }, + { + "epoch": 0.42885572139303485, + "grad_norm": 0.6171875, + "learning_rate": 0.00012224794503110556, + "loss": 1.1826, + "step": 9051 + }, + { + "epoch": 0.4289031035299692, + "grad_norm": 0.462890625, + "learning_rate": 0.00012223342543810556, + "loss": 0.7507, + "step": 9052 + }, + { + "epoch": 0.4289504856669036, + "grad_norm": 0.546875, + "learning_rate": 0.00012221890535199186, + "loss": 0.4868, + "step": 9053 + }, + { + "epoch": 0.428997867803838, + "grad_norm": 0.5546875, + "learning_rate": 0.00012220438477308655, + "loss": 0.8132, + "step": 9054 + }, + { + "epoch": 0.4290452499407723, + "grad_norm": 0.62109375, + "learning_rate": 0.00012218986370171158, + "loss": 0.6508, + "step": 9055 + }, + { + "epoch": 0.4290926320777067, + "grad_norm": 0.1044921875, + "learning_rate": 0.00012217534213818907, + "loss": 0.0103, + "step": 9056 + }, + { + "epoch": 0.4291400142146411, + "grad_norm": 0.333984375, + "learning_rate": 0.00012216082008284114, + "loss": 0.0368, + "step": 9057 + }, + { + "epoch": 0.42918739635157543, + "grad_norm": 0.66015625, + "learning_rate": 0.00012214629753598978, + "loss": 0.1443, + "step": 9058 + }, + { + "epoch": 0.4292347784885098, + "grad_norm": 1.0546875, + "learning_rate": 0.00012213177449795713, + "loss": 0.7987, + "step": 9059 + }, + { + "epoch": 0.4292821606254442, + "grad_norm": 0.5546875, + "learning_rate": 0.0001221172509690653, + "loss": 0.7702, + "step": 9060 + }, + { + "epoch": 0.42932954276237856, + "grad_norm": 1.0078125, + "learning_rate": 0.00012210272694963644, + "loss": 0.7252, + "step": 9061 + }, + { + "epoch": 0.42937692489931295, + "grad_norm": 0.6328125, + "learning_rate": 0.00012208820243999263, + "loss": 0.817, + "step": 9062 + }, + { + "epoch": 0.42942430703624734, + "grad_norm": 0.859375, + "learning_rate": 0.00012207367744045597, + "loss": 0.2632, + "step": 9063 + }, + { + "epoch": 0.42947168917318174, + "grad_norm": 0.87109375, + "learning_rate": 0.0001220591519513487, + "loss": 1.2632, + "step": 9064 + }, + { + "epoch": 0.4295190713101161, + "grad_norm": 0.7109375, + "learning_rate": 0.00012204462597299291, + "loss": 1.1329, + "step": 9065 + }, + { + "epoch": 0.42956645344705047, + "grad_norm": 0.6484375, + "learning_rate": 0.00012203009950571078, + "loss": 1.0932, + "step": 9066 + }, + { + "epoch": 0.42961383558398486, + "grad_norm": 0.5, + "learning_rate": 0.00012201557254982454, + "loss": 0.6318, + "step": 9067 + }, + { + "epoch": 0.4296612177209192, + "grad_norm": 0.244140625, + "learning_rate": 0.00012200104510565635, + "loss": 0.05, + "step": 9068 + }, + { + "epoch": 0.4297085998578536, + "grad_norm": 0.68359375, + "learning_rate": 0.0001219865171735284, + "loss": 1.1741, + "step": 9069 + }, + { + "epoch": 0.429755981994788, + "grad_norm": 0.6953125, + "learning_rate": 0.00012197198875376295, + "loss": 1.007, + "step": 9070 + }, + { + "epoch": 0.4298033641317223, + "grad_norm": 0.734375, + "learning_rate": 0.00012195745984668216, + "loss": 0.9968, + "step": 9071 + }, + { + "epoch": 0.4298507462686567, + "grad_norm": 0.578125, + "learning_rate": 0.00012194293045260832, + "loss": 1.262, + "step": 9072 + }, + { + "epoch": 0.4298981284055911, + "grad_norm": 0.74609375, + "learning_rate": 0.00012192840057186363, + "loss": 1.5263, + "step": 9073 + }, + { + "epoch": 0.42994551054252544, + "grad_norm": 0.8984375, + "learning_rate": 0.00012191387020477038, + "loss": 0.0267, + "step": 9074 + }, + { + "epoch": 0.42999289267945984, + "grad_norm": 0.2119140625, + "learning_rate": 0.00012189933935165086, + "loss": 0.0294, + "step": 9075 + }, + { + "epoch": 0.43004027481639423, + "grad_norm": 0.19140625, + "learning_rate": 0.00012188480801282729, + "loss": 0.0222, + "step": 9076 + }, + { + "epoch": 0.43008765695332857, + "grad_norm": 0.54296875, + "learning_rate": 0.00012187027618862199, + "loss": 1.0799, + "step": 9077 + }, + { + "epoch": 0.43013503909026296, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012185574387935727, + "loss": 0.014, + "step": 9078 + }, + { + "epoch": 0.43018242122719735, + "grad_norm": 0.58984375, + "learning_rate": 0.00012184121108535543, + "loss": 0.6301, + "step": 9079 + }, + { + "epoch": 0.43022980336413175, + "grad_norm": 0.93359375, + "learning_rate": 0.0001218266778069388, + "loss": 0.2887, + "step": 9080 + }, + { + "epoch": 0.4302771855010661, + "grad_norm": 0.265625, + "learning_rate": 0.00012181214404442967, + "loss": 0.0134, + "step": 9081 + }, + { + "epoch": 0.4303245676380005, + "grad_norm": 0.640625, + "learning_rate": 0.00012179760979815045, + "loss": 0.9382, + "step": 9082 + }, + { + "epoch": 0.43037194977493487, + "grad_norm": 0.546875, + "learning_rate": 0.00012178307506842345, + "loss": 0.7161, + "step": 9083 + }, + { + "epoch": 0.4304193319118692, + "grad_norm": 0.1279296875, + "learning_rate": 0.00012176853985557104, + "loss": 0.0159, + "step": 9084 + }, + { + "epoch": 0.4304667140488036, + "grad_norm": 0.291015625, + "learning_rate": 0.00012175400415991563, + "loss": 0.1446, + "step": 9085 + }, + { + "epoch": 0.430514096185738, + "grad_norm": 0.8515625, + "learning_rate": 0.00012173946798177956, + "loss": 0.7958, + "step": 9086 + }, + { + "epoch": 0.43056147832267233, + "grad_norm": 0.09375, + "learning_rate": 0.00012172493132148525, + "loss": 0.0021, + "step": 9087 + }, + { + "epoch": 0.4306088604596067, + "grad_norm": 0.7890625, + "learning_rate": 0.0001217103941793551, + "loss": 0.4472, + "step": 9088 + }, + { + "epoch": 0.4306562425965411, + "grad_norm": 0.73828125, + "learning_rate": 0.00012169585655571154, + "loss": 0.5643, + "step": 9089 + }, + { + "epoch": 0.43070362473347545, + "grad_norm": 0.453125, + "learning_rate": 0.00012168131845087699, + "loss": 0.5358, + "step": 9090 + }, + { + "epoch": 0.43075100687040985, + "grad_norm": 0.578125, + "learning_rate": 0.00012166677986517387, + "loss": 0.8892, + "step": 9091 + }, + { + "epoch": 0.43079838900734424, + "grad_norm": 0.69921875, + "learning_rate": 0.00012165224079892467, + "loss": 1.2856, + "step": 9092 + }, + { + "epoch": 0.43084577114427863, + "grad_norm": 0.68359375, + "learning_rate": 0.00012163770125245185, + "loss": 1.1084, + "step": 9093 + }, + { + "epoch": 0.43089315328121297, + "grad_norm": 0.90625, + "learning_rate": 0.00012162316122607781, + "loss": 0.9945, + "step": 9094 + }, + { + "epoch": 0.43094053541814736, + "grad_norm": 0.8125, + "learning_rate": 0.00012160862072012515, + "loss": 1.2901, + "step": 9095 + }, + { + "epoch": 0.43098791755508176, + "grad_norm": 0.0400390625, + "learning_rate": 0.00012159407973491625, + "loss": 0.0036, + "step": 9096 + }, + { + "epoch": 0.4310352996920161, + "grad_norm": 0.6171875, + "learning_rate": 0.00012157953827077367, + "loss": 1.0957, + "step": 9097 + }, + { + "epoch": 0.4310826818289505, + "grad_norm": 0.6953125, + "learning_rate": 0.00012156499632801994, + "loss": 1.5313, + "step": 9098 + }, + { + "epoch": 0.4311300639658849, + "grad_norm": 0.002532958984375, + "learning_rate": 0.00012155045390697754, + "loss": 0.0002, + "step": 9099 + }, + { + "epoch": 0.4311774461028192, + "grad_norm": 0.6875, + "learning_rate": 0.00012153591100796903, + "loss": 1.1189, + "step": 9100 + }, + { + "epoch": 0.4312248282397536, + "grad_norm": 0.70703125, + "learning_rate": 0.00012152136763131697, + "loss": 0.5701, + "step": 9101 + }, + { + "epoch": 0.431272210376688, + "grad_norm": 0.0283203125, + "learning_rate": 0.00012150682377734388, + "loss": 0.0027, + "step": 9102 + }, + { + "epoch": 0.43131959251362234, + "grad_norm": 0.62109375, + "learning_rate": 0.00012149227944637235, + "loss": 0.9697, + "step": 9103 + }, + { + "epoch": 0.43136697465055673, + "grad_norm": 0.169921875, + "learning_rate": 0.00012147773463872496, + "loss": 0.0147, + "step": 9104 + }, + { + "epoch": 0.4314143567874911, + "grad_norm": 0.73828125, + "learning_rate": 0.0001214631893547243, + "loss": 1.3039, + "step": 9105 + }, + { + "epoch": 0.43146173892442546, + "grad_norm": 0.69140625, + "learning_rate": 0.00012144864359469294, + "loss": 0.8633, + "step": 9106 + }, + { + "epoch": 0.43150912106135986, + "grad_norm": 0.64453125, + "learning_rate": 0.00012143409735895353, + "loss": 0.8172, + "step": 9107 + }, + { + "epoch": 0.43155650319829425, + "grad_norm": 0.1259765625, + "learning_rate": 0.00012141955064782866, + "loss": 0.0044, + "step": 9108 + }, + { + "epoch": 0.43160388533522864, + "grad_norm": 0.69140625, + "learning_rate": 0.00012140500346164099, + "loss": 1.3199, + "step": 9109 + }, + { + "epoch": 0.431651267472163, + "grad_norm": 0.71484375, + "learning_rate": 0.00012139045580071313, + "loss": 1.3542, + "step": 9110 + }, + { + "epoch": 0.4316986496090974, + "grad_norm": 0.58984375, + "learning_rate": 0.00012137590766536775, + "loss": 0.7236, + "step": 9111 + }, + { + "epoch": 0.43174603174603177, + "grad_norm": 0.078125, + "learning_rate": 0.00012136135905592749, + "loss": 0.0086, + "step": 9112 + }, + { + "epoch": 0.4317934138829661, + "grad_norm": 0.65625, + "learning_rate": 0.00012134680997271504, + "loss": 1.334, + "step": 9113 + }, + { + "epoch": 0.4318407960199005, + "grad_norm": 0.515625, + "learning_rate": 0.0001213322604160531, + "loss": 0.8218, + "step": 9114 + }, + { + "epoch": 0.4318881781568349, + "grad_norm": 0.80859375, + "learning_rate": 0.00012131771038626434, + "loss": 0.8312, + "step": 9115 + }, + { + "epoch": 0.4319355602937692, + "grad_norm": 0.5859375, + "learning_rate": 0.00012130315988367145, + "loss": 0.7438, + "step": 9116 + }, + { + "epoch": 0.4319829424307036, + "grad_norm": 0.025634765625, + "learning_rate": 0.00012128860890859722, + "loss": 0.0011, + "step": 9117 + }, + { + "epoch": 0.432030324567638, + "grad_norm": 0.7890625, + "learning_rate": 0.00012127405746136427, + "loss": 1.1243, + "step": 9118 + }, + { + "epoch": 0.43207770670457235, + "grad_norm": 0.78125, + "learning_rate": 0.00012125950554229539, + "loss": 1.2505, + "step": 9119 + }, + { + "epoch": 0.43212508884150674, + "grad_norm": 0.6015625, + "learning_rate": 0.00012124495315171334, + "loss": 1.7117, + "step": 9120 + }, + { + "epoch": 0.43217247097844114, + "grad_norm": 0.1513671875, + "learning_rate": 0.00012123040028994086, + "loss": 0.0113, + "step": 9121 + }, + { + "epoch": 0.43221985311537553, + "grad_norm": 0.67578125, + "learning_rate": 0.00012121584695730068, + "loss": 1.0387, + "step": 9122 + }, + { + "epoch": 0.43226723525230987, + "grad_norm": 0.46875, + "learning_rate": 0.00012120129315411564, + "loss": 0.7958, + "step": 9123 + }, + { + "epoch": 0.43231461738924426, + "grad_norm": 0.1025390625, + "learning_rate": 0.00012118673888070848, + "loss": 0.0095, + "step": 9124 + }, + { + "epoch": 0.43236199952617865, + "grad_norm": 0.337890625, + "learning_rate": 0.00012117218413740206, + "loss": 0.1888, + "step": 9125 + }, + { + "epoch": 0.432409381663113, + "grad_norm": 0.1162109375, + "learning_rate": 0.00012115762892451915, + "loss": 0.0096, + "step": 9126 + }, + { + "epoch": 0.4324567638000474, + "grad_norm": 0.70703125, + "learning_rate": 0.00012114307324238253, + "loss": 1.0692, + "step": 9127 + }, + { + "epoch": 0.4325041459369818, + "grad_norm": 0.53125, + "learning_rate": 0.00012112851709131505, + "loss": 0.5198, + "step": 9128 + }, + { + "epoch": 0.4325515280739161, + "grad_norm": 0.703125, + "learning_rate": 0.0001211139604716396, + "loss": 0.9162, + "step": 9129 + }, + { + "epoch": 0.4325989102108505, + "grad_norm": 0.059814453125, + "learning_rate": 0.00012109940338367897, + "loss": 0.004, + "step": 9130 + }, + { + "epoch": 0.4326462923477849, + "grad_norm": 0.69921875, + "learning_rate": 0.00012108484582775608, + "loss": 1.0419, + "step": 9131 + }, + { + "epoch": 0.43269367448471924, + "grad_norm": 0.953125, + "learning_rate": 0.00012107028780419374, + "loss": 0.6108, + "step": 9132 + }, + { + "epoch": 0.43274105662165363, + "grad_norm": 0.419921875, + "learning_rate": 0.0001210557293133149, + "loss": 0.0552, + "step": 9133 + }, + { + "epoch": 0.432788438758588, + "grad_norm": 0.52734375, + "learning_rate": 0.00012104117035544238, + "loss": 1.0201, + "step": 9134 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 0.353515625, + "learning_rate": 0.00012102661093089911, + "loss": 0.4299, + "step": 9135 + }, + { + "epoch": 0.43288320303245675, + "grad_norm": 0.044921875, + "learning_rate": 0.00012101205104000801, + "loss": 0.0036, + "step": 9136 + }, + { + "epoch": 0.43293058516939115, + "grad_norm": 0.66015625, + "learning_rate": 0.00012099749068309201, + "loss": 0.5321, + "step": 9137 + }, + { + "epoch": 0.43297796730632554, + "grad_norm": 0.298828125, + "learning_rate": 0.00012098292986047402, + "loss": 0.017, + "step": 9138 + }, + { + "epoch": 0.4330253494432599, + "grad_norm": 0.251953125, + "learning_rate": 0.000120968368572477, + "loss": 0.0328, + "step": 9139 + }, + { + "epoch": 0.43307273158019427, + "grad_norm": 0.703125, + "learning_rate": 0.0001209538068194239, + "loss": 0.8218, + "step": 9140 + }, + { + "epoch": 0.43312011371712866, + "grad_norm": 0.62109375, + "learning_rate": 0.0001209392446016377, + "loss": 1.2307, + "step": 9141 + }, + { + "epoch": 0.433167495854063, + "grad_norm": 0.578125, + "learning_rate": 0.00012092468191944133, + "loss": 0.5895, + "step": 9142 + }, + { + "epoch": 0.4332148779909974, + "grad_norm": 0.73828125, + "learning_rate": 0.0001209101187731578, + "loss": 0.6921, + "step": 9143 + }, + { + "epoch": 0.4332622601279318, + "grad_norm": 0.64453125, + "learning_rate": 0.00012089555516311016, + "loss": 1.3407, + "step": 9144 + }, + { + "epoch": 0.4333096422648661, + "grad_norm": 0.56640625, + "learning_rate": 0.0001208809910896213, + "loss": 0.6376, + "step": 9145 + }, + { + "epoch": 0.4333570244018005, + "grad_norm": 0.09521484375, + "learning_rate": 0.00012086642655301432, + "loss": 0.0064, + "step": 9146 + }, + { + "epoch": 0.4334044065387349, + "grad_norm": 0.09912109375, + "learning_rate": 0.00012085186155361224, + "loss": 0.0053, + "step": 9147 + }, + { + "epoch": 0.43345178867566925, + "grad_norm": 0.703125, + "learning_rate": 0.00012083729609173806, + "loss": 1.034, + "step": 9148 + }, + { + "epoch": 0.43349917081260364, + "grad_norm": 0.51953125, + "learning_rate": 0.00012082273016771488, + "loss": 0.196, + "step": 9149 + }, + { + "epoch": 0.43354655294953803, + "grad_norm": 0.62109375, + "learning_rate": 0.0001208081637818657, + "loss": 1.3181, + "step": 9150 + }, + { + "epoch": 0.4335939350864724, + "grad_norm": 0.80078125, + "learning_rate": 0.0001207935969345136, + "loss": 1.2652, + "step": 9151 + }, + { + "epoch": 0.43364131722340676, + "grad_norm": 0.244140625, + "learning_rate": 0.00012077902962598171, + "loss": 0.0348, + "step": 9152 + }, + { + "epoch": 0.43368869936034116, + "grad_norm": 0.31640625, + "learning_rate": 0.00012076446185659302, + "loss": 0.0056, + "step": 9153 + }, + { + "epoch": 0.43373608149727555, + "grad_norm": 0.6875, + "learning_rate": 0.0001207498936266707, + "loss": 0.772, + "step": 9154 + }, + { + "epoch": 0.4337834636342099, + "grad_norm": 0.68359375, + "learning_rate": 0.00012073532493653786, + "loss": 1.2163, + "step": 9155 + }, + { + "epoch": 0.4338308457711443, + "grad_norm": 0.6953125, + "learning_rate": 0.00012072075578651762, + "loss": 1.0997, + "step": 9156 + }, + { + "epoch": 0.4338782279080787, + "grad_norm": 0.75390625, + "learning_rate": 0.00012070618617693306, + "loss": 1.0526, + "step": 9157 + }, + { + "epoch": 0.433925610045013, + "grad_norm": 0.55859375, + "learning_rate": 0.00012069161610810734, + "loss": 0.8277, + "step": 9158 + }, + { + "epoch": 0.4339729921819474, + "grad_norm": 0.6484375, + "learning_rate": 0.00012067704558036362, + "loss": 1.0736, + "step": 9159 + }, + { + "epoch": 0.4340203743188818, + "grad_norm": 0.70703125, + "learning_rate": 0.00012066247459402507, + "loss": 0.7963, + "step": 9160 + }, + { + "epoch": 0.43406775645581613, + "grad_norm": 0.67578125, + "learning_rate": 0.0001206479031494148, + "loss": 1.3514, + "step": 9161 + }, + { + "epoch": 0.4341151385927505, + "grad_norm": 0.2431640625, + "learning_rate": 0.00012063333124685606, + "loss": 0.0038, + "step": 9162 + }, + { + "epoch": 0.4341625207296849, + "grad_norm": 0.08935546875, + "learning_rate": 0.00012061875888667203, + "loss": 0.009, + "step": 9163 + }, + { + "epoch": 0.43420990286661926, + "grad_norm": 0.59375, + "learning_rate": 0.00012060418606918587, + "loss": 0.7726, + "step": 9164 + }, + { + "epoch": 0.43425728500355365, + "grad_norm": 0.6796875, + "learning_rate": 0.00012058961279472079, + "loss": 0.8138, + "step": 9165 + }, + { + "epoch": 0.43430466714048804, + "grad_norm": 0.4921875, + "learning_rate": 0.00012057503906360004, + "loss": 0.5046, + "step": 9166 + }, + { + "epoch": 0.43435204927742244, + "grad_norm": 0.58984375, + "learning_rate": 0.00012056046487614687, + "loss": 1.3172, + "step": 9167 + }, + { + "epoch": 0.4343994314143568, + "grad_norm": 0.1953125, + "learning_rate": 0.00012054589023268445, + "loss": 0.1465, + "step": 9168 + }, + { + "epoch": 0.43444681355129117, + "grad_norm": 0.466796875, + "learning_rate": 0.00012053131513353608, + "loss": 0.811, + "step": 9169 + }, + { + "epoch": 0.43449419568822556, + "grad_norm": 0.68359375, + "learning_rate": 0.00012051673957902501, + "loss": 1.2989, + "step": 9170 + }, + { + "epoch": 0.4345415778251599, + "grad_norm": 0.63671875, + "learning_rate": 0.00012050216356947453, + "loss": 1.5614, + "step": 9171 + }, + { + "epoch": 0.4345889599620943, + "grad_norm": 0.5859375, + "learning_rate": 0.00012048758710520789, + "loss": 0.9579, + "step": 9172 + }, + { + "epoch": 0.4346363420990287, + "grad_norm": 0.97265625, + "learning_rate": 0.00012047301018654838, + "loss": 1.2662, + "step": 9173 + }, + { + "epoch": 0.434683724235963, + "grad_norm": 0.69921875, + "learning_rate": 0.0001204584328138193, + "loss": 1.0081, + "step": 9174 + }, + { + "epoch": 0.4347311063728974, + "grad_norm": 0.6171875, + "learning_rate": 0.00012044385498734398, + "loss": 0.835, + "step": 9175 + }, + { + "epoch": 0.4347784885098318, + "grad_norm": 0.7734375, + "learning_rate": 0.00012042927670744574, + "loss": 1.023, + "step": 9176 + }, + { + "epoch": 0.43482587064676614, + "grad_norm": 0.65625, + "learning_rate": 0.00012041469797444788, + "loss": 0.7017, + "step": 9177 + }, + { + "epoch": 0.43487325278370054, + "grad_norm": 0.703125, + "learning_rate": 0.00012040011878867379, + "loss": 0.9192, + "step": 9178 + }, + { + "epoch": 0.43492063492063493, + "grad_norm": 0.59765625, + "learning_rate": 0.00012038553915044679, + "loss": 0.8073, + "step": 9179 + }, + { + "epoch": 0.4349680170575693, + "grad_norm": 0.1875, + "learning_rate": 0.00012037095906009026, + "loss": 0.0218, + "step": 9180 + }, + { + "epoch": 0.43501539919450366, + "grad_norm": 0.84765625, + "learning_rate": 0.00012035637851792754, + "loss": 0.7896, + "step": 9181 + }, + { + "epoch": 0.43506278133143805, + "grad_norm": 0.75390625, + "learning_rate": 0.00012034179752428203, + "loss": 0.7577, + "step": 9182 + }, + { + "epoch": 0.43511016346837245, + "grad_norm": 0.4140625, + "learning_rate": 0.00012032721607947712, + "loss": 0.0411, + "step": 9183 + }, + { + "epoch": 0.4351575456053068, + "grad_norm": 0.6484375, + "learning_rate": 0.00012031263418383618, + "loss": 1.0111, + "step": 9184 + }, + { + "epoch": 0.4352049277422412, + "grad_norm": 0.7578125, + "learning_rate": 0.00012029805183768268, + "loss": 1.3509, + "step": 9185 + }, + { + "epoch": 0.43525230987917557, + "grad_norm": 0.61328125, + "learning_rate": 0.00012028346904134003, + "loss": 1.2804, + "step": 9186 + }, + { + "epoch": 0.4352996920161099, + "grad_norm": 0.58203125, + "learning_rate": 0.0001202688857951316, + "loss": 0.8546, + "step": 9187 + }, + { + "epoch": 0.4353470741530443, + "grad_norm": 0.28125, + "learning_rate": 0.00012025430209938089, + "loss": 0.116, + "step": 9188 + }, + { + "epoch": 0.4353944562899787, + "grad_norm": 0.625, + "learning_rate": 0.00012023971795441133, + "loss": 0.5732, + "step": 9189 + }, + { + "epoch": 0.43544183842691303, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0001202251333605464, + "loss": 0.0006, + "step": 9190 + }, + { + "epoch": 0.4354892205638474, + "grad_norm": 0.185546875, + "learning_rate": 0.00012021054831810953, + "loss": 0.0279, + "step": 9191 + }, + { + "epoch": 0.4355366027007818, + "grad_norm": 0.072265625, + "learning_rate": 0.00012019596282742424, + "loss": 0.0075, + "step": 9192 + }, + { + "epoch": 0.43558398483771615, + "grad_norm": 0.7734375, + "learning_rate": 0.00012018137688881399, + "loss": 1.0802, + "step": 9193 + }, + { + "epoch": 0.43563136697465055, + "grad_norm": 0.6640625, + "learning_rate": 0.00012016679050260232, + "loss": 1.4969, + "step": 9194 + }, + { + "epoch": 0.43567874911158494, + "grad_norm": 0.84375, + "learning_rate": 0.00012015220366911273, + "loss": 0.711, + "step": 9195 + }, + { + "epoch": 0.43572613124851933, + "grad_norm": 0.0016021728515625, + "learning_rate": 0.0001201376163886687, + "loss": 0.0001, + "step": 9196 + }, + { + "epoch": 0.43577351338545367, + "grad_norm": 0.416015625, + "learning_rate": 0.00012012302866159377, + "loss": 0.0915, + "step": 9197 + }, + { + "epoch": 0.43582089552238806, + "grad_norm": 0.2353515625, + "learning_rate": 0.00012010844048821155, + "loss": 0.0159, + "step": 9198 + }, + { + "epoch": 0.43586827765932246, + "grad_norm": 0.57421875, + "learning_rate": 0.0001200938518688455, + "loss": 0.3029, + "step": 9199 + }, + { + "epoch": 0.4359156597962568, + "grad_norm": 0.6796875, + "learning_rate": 0.00012007926280381921, + "loss": 0.9484, + "step": 9200 + }, + { + "epoch": 0.4359630419331912, + "grad_norm": 0.62109375, + "learning_rate": 0.00012006467329345627, + "loss": 0.8423, + "step": 9201 + }, + { + "epoch": 0.4360104240701256, + "grad_norm": 0.0302734375, + "learning_rate": 0.00012005008333808028, + "loss": 0.0023, + "step": 9202 + }, + { + "epoch": 0.4360578062070599, + "grad_norm": 0.640625, + "learning_rate": 0.00012003549293801477, + "loss": 0.8052, + "step": 9203 + }, + { + "epoch": 0.4361051883439943, + "grad_norm": 0.1875, + "learning_rate": 0.00012002090209358336, + "loss": 0.1232, + "step": 9204 + }, + { + "epoch": 0.4361525704809287, + "grad_norm": 0.369140625, + "learning_rate": 0.00012000631080510969, + "loss": 0.3767, + "step": 9205 + }, + { + "epoch": 0.43619995261786304, + "grad_norm": 0.64453125, + "learning_rate": 0.00011999171907291735, + "loss": 0.7324, + "step": 9206 + }, + { + "epoch": 0.43624733475479743, + "grad_norm": 0.64453125, + "learning_rate": 0.00011997712689732996, + "loss": 1.2838, + "step": 9207 + }, + { + "epoch": 0.4362947168917318, + "grad_norm": 0.6328125, + "learning_rate": 0.00011996253427867116, + "loss": 0.8453, + "step": 9208 + }, + { + "epoch": 0.4363420990286662, + "grad_norm": 0.6328125, + "learning_rate": 0.00011994794121726467, + "loss": 1.2646, + "step": 9209 + }, + { + "epoch": 0.43638948116560056, + "grad_norm": 0.64453125, + "learning_rate": 0.00011993334771343405, + "loss": 0.8129, + "step": 9210 + }, + { + "epoch": 0.43643686330253495, + "grad_norm": 0.59765625, + "learning_rate": 0.00011991875376750303, + "loss": 0.8692, + "step": 9211 + }, + { + "epoch": 0.43648424543946934, + "grad_norm": 0.96484375, + "learning_rate": 0.00011990415937979525, + "loss": 1.3055, + "step": 9212 + }, + { + "epoch": 0.4365316275764037, + "grad_norm": 0.6328125, + "learning_rate": 0.00011988956455063442, + "loss": 1.3004, + "step": 9213 + }, + { + "epoch": 0.4365790097133381, + "grad_norm": 0.7421875, + "learning_rate": 0.00011987496928034423, + "loss": 0.9849, + "step": 9214 + }, + { + "epoch": 0.43662639185027247, + "grad_norm": 0.0791015625, + "learning_rate": 0.00011986037356924839, + "loss": 0.0081, + "step": 9215 + }, + { + "epoch": 0.4366737739872068, + "grad_norm": 0.74609375, + "learning_rate": 0.00011984577741767062, + "loss": 0.7798, + "step": 9216 + }, + { + "epoch": 0.4367211561241412, + "grad_norm": 0.5078125, + "learning_rate": 0.00011983118082593466, + "loss": 0.1623, + "step": 9217 + }, + { + "epoch": 0.4367685382610756, + "grad_norm": 0.73828125, + "learning_rate": 0.00011981658379436423, + "loss": 1.3366, + "step": 9218 + }, + { + "epoch": 0.4368159203980099, + "grad_norm": 0.69140625, + "learning_rate": 0.00011980198632328307, + "loss": 1.4171, + "step": 9219 + }, + { + "epoch": 0.4368633025349443, + "grad_norm": 0.061279296875, + "learning_rate": 0.00011978738841301494, + "loss": 0.0048, + "step": 9220 + }, + { + "epoch": 0.4369106846718787, + "grad_norm": 0.1875, + "learning_rate": 0.00011977279006388365, + "loss": 0.0173, + "step": 9221 + }, + { + "epoch": 0.43695806680881305, + "grad_norm": 0.5546875, + "learning_rate": 0.00011975819127621289, + "loss": 0.6574, + "step": 9222 + }, + { + "epoch": 0.43700544894574744, + "grad_norm": 0.6875, + "learning_rate": 0.00011974359205032648, + "loss": 1.1759, + "step": 9223 + }, + { + "epoch": 0.43705283108268184, + "grad_norm": 0.12451171875, + "learning_rate": 0.00011972899238654827, + "loss": 0.0073, + "step": 9224 + }, + { + "epoch": 0.43710021321961623, + "grad_norm": 0.49609375, + "learning_rate": 0.00011971439228520203, + "loss": 0.3249, + "step": 9225 + }, + { + "epoch": 0.43714759535655057, + "grad_norm": 0.8046875, + "learning_rate": 0.00011969979174661152, + "loss": 0.088, + "step": 9226 + }, + { + "epoch": 0.43719497749348496, + "grad_norm": 0.62109375, + "learning_rate": 0.00011968519077110065, + "loss": 1.2362, + "step": 9227 + }, + { + "epoch": 0.43724235963041935, + "grad_norm": 0.69921875, + "learning_rate": 0.0001196705893589932, + "loss": 1.108, + "step": 9228 + }, + { + "epoch": 0.4372897417673537, + "grad_norm": 0.16015625, + "learning_rate": 0.00011965598751061307, + "loss": 0.0033, + "step": 9229 + }, + { + "epoch": 0.4373371239042881, + "grad_norm": 0.2578125, + "learning_rate": 0.00011964138522628403, + "loss": 0.0473, + "step": 9230 + }, + { + "epoch": 0.4373845060412225, + "grad_norm": 0.59375, + "learning_rate": 0.00011962678250632999, + "loss": 1.0528, + "step": 9231 + }, + { + "epoch": 0.4374318881781568, + "grad_norm": 0.62109375, + "learning_rate": 0.00011961217935107482, + "loss": 0.8092, + "step": 9232 + }, + { + "epoch": 0.4374792703150912, + "grad_norm": 0.67578125, + "learning_rate": 0.00011959757576084243, + "loss": 1.3905, + "step": 9233 + }, + { + "epoch": 0.4375266524520256, + "grad_norm": 0.68359375, + "learning_rate": 0.00011958297173595666, + "loss": 1.2362, + "step": 9234 + }, + { + "epoch": 0.43757403458895994, + "grad_norm": 0.71875, + "learning_rate": 0.00011956836727674143, + "loss": 1.259, + "step": 9235 + }, + { + "epoch": 0.43762141672589433, + "grad_norm": 0.50390625, + "learning_rate": 0.00011955376238352069, + "loss": 0.779, + "step": 9236 + }, + { + "epoch": 0.4376687988628287, + "grad_norm": 0.046630859375, + "learning_rate": 0.00011953915705661829, + "loss": 0.0048, + "step": 9237 + }, + { + "epoch": 0.4377161809997631, + "grad_norm": 0.5625, + "learning_rate": 0.0001195245512963582, + "loss": 0.0321, + "step": 9238 + }, + { + "epoch": 0.43776356313669745, + "grad_norm": 0.66796875, + "learning_rate": 0.00011950994510306437, + "loss": 0.7033, + "step": 9239 + }, + { + "epoch": 0.43781094527363185, + "grad_norm": 0.9375, + "learning_rate": 0.00011949533847706076, + "loss": 1.2161, + "step": 9240 + }, + { + "epoch": 0.43785832741056624, + "grad_norm": 0.5859375, + "learning_rate": 0.00011948073141867126, + "loss": 0.9879, + "step": 9241 + }, + { + "epoch": 0.4379057095475006, + "grad_norm": 0.58984375, + "learning_rate": 0.00011946612392821992, + "loss": 0.4406, + "step": 9242 + }, + { + "epoch": 0.43795309168443497, + "grad_norm": 0.71875, + "learning_rate": 0.0001194515160060307, + "loss": 1.2926, + "step": 9243 + }, + { + "epoch": 0.43800047382136936, + "grad_norm": 0.6171875, + "learning_rate": 0.00011943690765242757, + "loss": 0.8098, + "step": 9244 + }, + { + "epoch": 0.4380478559583037, + "grad_norm": 0.7890625, + "learning_rate": 0.00011942229886773451, + "loss": 0.9731, + "step": 9245 + }, + { + "epoch": 0.4380952380952381, + "grad_norm": 0.95703125, + "learning_rate": 0.00011940768965227553, + "loss": 0.9692, + "step": 9246 + }, + { + "epoch": 0.4381426202321725, + "grad_norm": 0.5703125, + "learning_rate": 0.00011939308000637471, + "loss": 0.6649, + "step": 9247 + }, + { + "epoch": 0.4381900023691068, + "grad_norm": 0.65625, + "learning_rate": 0.00011937846993035602, + "loss": 1.4241, + "step": 9248 + }, + { + "epoch": 0.4382373845060412, + "grad_norm": 0.61328125, + "learning_rate": 0.00011936385942454348, + "loss": 0.6976, + "step": 9249 + }, + { + "epoch": 0.4382847666429756, + "grad_norm": 0.53515625, + "learning_rate": 0.00011934924848926122, + "loss": 0.0258, + "step": 9250 + }, + { + "epoch": 0.43833214877990995, + "grad_norm": 0.8125, + "learning_rate": 0.00011933463712483318, + "loss": 0.5183, + "step": 9251 + }, + { + "epoch": 0.43837953091684434, + "grad_norm": 0.345703125, + "learning_rate": 0.00011932002533158351, + "loss": 0.0113, + "step": 9252 + }, + { + "epoch": 0.43842691305377873, + "grad_norm": 0.6875, + "learning_rate": 0.00011930541310983625, + "loss": 0.9669, + "step": 9253 + }, + { + "epoch": 0.4384742951907131, + "grad_norm": 0.380859375, + "learning_rate": 0.00011929080045991549, + "loss": 0.2126, + "step": 9254 + }, + { + "epoch": 0.43852167732764746, + "grad_norm": 0.2099609375, + "learning_rate": 0.00011927618738214534, + "loss": 0.1515, + "step": 9255 + }, + { + "epoch": 0.43856905946458186, + "grad_norm": 0.6171875, + "learning_rate": 0.00011926157387684986, + "loss": 1.0796, + "step": 9256 + }, + { + "epoch": 0.43861644160151625, + "grad_norm": 0.67578125, + "learning_rate": 0.0001192469599443532, + "loss": 1.2017, + "step": 9257 + }, + { + "epoch": 0.4386638237384506, + "grad_norm": 0.5546875, + "learning_rate": 0.00011923234558497948, + "loss": 0.6035, + "step": 9258 + }, + { + "epoch": 0.438711205875385, + "grad_norm": 0.64453125, + "learning_rate": 0.00011921773079905283, + "loss": 0.9509, + "step": 9259 + }, + { + "epoch": 0.4387585880123194, + "grad_norm": 0.474609375, + "learning_rate": 0.00011920311558689734, + "loss": 0.2578, + "step": 9260 + }, + { + "epoch": 0.4388059701492537, + "grad_norm": 0.61328125, + "learning_rate": 0.00011918849994883721, + "loss": 0.9338, + "step": 9261 + }, + { + "epoch": 0.4388533522861881, + "grad_norm": 0.75, + "learning_rate": 0.00011917388388519661, + "loss": 1.1853, + "step": 9262 + }, + { + "epoch": 0.4389007344231225, + "grad_norm": 0.7578125, + "learning_rate": 0.00011915926739629968, + "loss": 0.7985, + "step": 9263 + }, + { + "epoch": 0.43894811656005683, + "grad_norm": 0.5078125, + "learning_rate": 0.0001191446504824706, + "loss": 0.7152, + "step": 9264 + }, + { + "epoch": 0.4389954986969912, + "grad_norm": 0.58203125, + "learning_rate": 0.00011913003314403357, + "loss": 0.4354, + "step": 9265 + }, + { + "epoch": 0.4390428808339256, + "grad_norm": 0.57421875, + "learning_rate": 0.00011911541538131279, + "loss": 0.8526, + "step": 9266 + }, + { + "epoch": 0.43909026297086, + "grad_norm": 0.474609375, + "learning_rate": 0.00011910079719463247, + "loss": 0.1765, + "step": 9267 + }, + { + "epoch": 0.43913764510779435, + "grad_norm": 0.482421875, + "learning_rate": 0.00011908617858431679, + "loss": 0.8484, + "step": 9268 + }, + { + "epoch": 0.43918502724472874, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011907155955068999, + "loss": 0.1277, + "step": 9269 + }, + { + "epoch": 0.43923240938166314, + "grad_norm": 0.404296875, + "learning_rate": 0.00011905694009407631, + "loss": 0.1151, + "step": 9270 + }, + { + "epoch": 0.4392797915185975, + "grad_norm": 0.67578125, + "learning_rate": 0.00011904232021480002, + "loss": 0.9624, + "step": 9271 + }, + { + "epoch": 0.43932717365553187, + "grad_norm": 1.9375, + "learning_rate": 0.00011902769991318534, + "loss": 0.7456, + "step": 9272 + }, + { + "epoch": 0.43937455579246626, + "grad_norm": 0.796875, + "learning_rate": 0.00011901307918955653, + "loss": 0.6132, + "step": 9273 + }, + { + "epoch": 0.4394219379294006, + "grad_norm": 0.040771484375, + "learning_rate": 0.0001189984580442379, + "loss": 0.003, + "step": 9274 + }, + { + "epoch": 0.439469320066335, + "grad_norm": 0.5, + "learning_rate": 0.00011898383647755374, + "loss": 0.9202, + "step": 9275 + }, + { + "epoch": 0.4395167022032694, + "grad_norm": 0.7109375, + "learning_rate": 0.00011896921448982825, + "loss": 1.2101, + "step": 9276 + }, + { + "epoch": 0.4395640843402037, + "grad_norm": 0.63671875, + "learning_rate": 0.00011895459208138579, + "loss": 0.7341, + "step": 9277 + }, + { + "epoch": 0.4396114664771381, + "grad_norm": 0.248046875, + "learning_rate": 0.00011893996925255069, + "loss": 0.0061, + "step": 9278 + }, + { + "epoch": 0.4396588486140725, + "grad_norm": 0.05615234375, + "learning_rate": 0.00011892534600364725, + "loss": 0.003, + "step": 9279 + }, + { + "epoch": 0.43970623075100684, + "grad_norm": 0.7734375, + "learning_rate": 0.00011891072233499976, + "loss": 0.8399, + "step": 9280 + }, + { + "epoch": 0.43975361288794124, + "grad_norm": 0.60546875, + "learning_rate": 0.0001188960982469326, + "loss": 0.874, + "step": 9281 + }, + { + "epoch": 0.43980099502487563, + "grad_norm": 0.43359375, + "learning_rate": 0.00011888147373977014, + "loss": 0.9002, + "step": 9282 + }, + { + "epoch": 0.43984837716181, + "grad_norm": 0.89453125, + "learning_rate": 0.00011886684881383669, + "loss": 1.1015, + "step": 9283 + }, + { + "epoch": 0.43989575929874436, + "grad_norm": 0.546875, + "learning_rate": 0.00011885222346945661, + "loss": 0.5635, + "step": 9284 + }, + { + "epoch": 0.43994314143567875, + "grad_norm": 0.66015625, + "learning_rate": 0.00011883759770695431, + "loss": 0.8537, + "step": 9285 + }, + { + "epoch": 0.43999052357261315, + "grad_norm": 0.5078125, + "learning_rate": 0.00011882297152665416, + "loss": 0.7777, + "step": 9286 + }, + { + "epoch": 0.4400379057095475, + "grad_norm": 0.6796875, + "learning_rate": 0.00011880834492888056, + "loss": 1.4362, + "step": 9287 + }, + { + "epoch": 0.4400852878464819, + "grad_norm": 1.1484375, + "learning_rate": 0.00011879371791395785, + "loss": 0.007, + "step": 9288 + }, + { + "epoch": 0.44013266998341627, + "grad_norm": 0.921875, + "learning_rate": 0.00011877909048221056, + "loss": 1.1929, + "step": 9289 + }, + { + "epoch": 0.4401800521203506, + "grad_norm": 0.1396484375, + "learning_rate": 0.00011876446263396303, + "loss": 0.0123, + "step": 9290 + }, + { + "epoch": 0.440227434257285, + "grad_norm": 0.26953125, + "learning_rate": 0.0001187498343695397, + "loss": 0.0907, + "step": 9291 + }, + { + "epoch": 0.4402748163942194, + "grad_norm": 0.73046875, + "learning_rate": 0.000118735205689265, + "loss": 1.025, + "step": 9292 + }, + { + "epoch": 0.44032219853115373, + "grad_norm": 0.1796875, + "learning_rate": 0.0001187205765934634, + "loss": 0.1393, + "step": 9293 + }, + { + "epoch": 0.4403695806680881, + "grad_norm": 0.53515625, + "learning_rate": 0.00011870594708245937, + "loss": 0.9188, + "step": 9294 + }, + { + "epoch": 0.4404169628050225, + "grad_norm": 0.7578125, + "learning_rate": 0.00011869131715657733, + "loss": 0.0155, + "step": 9295 + }, + { + "epoch": 0.4404643449419569, + "grad_norm": 0.62109375, + "learning_rate": 0.00011867668681614179, + "loss": 0.6337, + "step": 9296 + }, + { + "epoch": 0.44051172707889125, + "grad_norm": 0.84765625, + "learning_rate": 0.00011866205606147725, + "loss": 0.5353, + "step": 9297 + }, + { + "epoch": 0.44055910921582564, + "grad_norm": 0.90234375, + "learning_rate": 0.00011864742489290819, + "loss": 1.0244, + "step": 9298 + }, + { + "epoch": 0.44060649135276003, + "grad_norm": 0.052001953125, + "learning_rate": 0.0001186327933107591, + "loss": 0.0026, + "step": 9299 + }, + { + "epoch": 0.44065387348969437, + "grad_norm": 0.7109375, + "learning_rate": 0.00011861816131535447, + "loss": 1.4961, + "step": 9300 + }, + { + "epoch": 0.44070125562662876, + "grad_norm": 0.78125, + "learning_rate": 0.00011860352890701889, + "loss": 1.1084, + "step": 9301 + }, + { + "epoch": 0.44074863776356316, + "grad_norm": 0.27734375, + "learning_rate": 0.00011858889608607684, + "loss": 0.1376, + "step": 9302 + }, + { + "epoch": 0.4407960199004975, + "grad_norm": 0.64453125, + "learning_rate": 0.00011857426285285287, + "loss": 1.4545, + "step": 9303 + }, + { + "epoch": 0.4408434020374319, + "grad_norm": 0.201171875, + "learning_rate": 0.00011855962920767152, + "loss": 0.0084, + "step": 9304 + }, + { + "epoch": 0.4408907841743663, + "grad_norm": 0.427734375, + "learning_rate": 0.00011854499515085741, + "loss": 0.9402, + "step": 9305 + }, + { + "epoch": 0.4409381663113006, + "grad_norm": 0.828125, + "learning_rate": 0.00011853036068273504, + "loss": 0.8362, + "step": 9306 + }, + { + "epoch": 0.440985548448235, + "grad_norm": 0.1083984375, + "learning_rate": 0.000118515725803629, + "loss": 0.0034, + "step": 9307 + }, + { + "epoch": 0.4410329305851694, + "grad_norm": 0.7890625, + "learning_rate": 0.00011850109051386387, + "loss": 0.9941, + "step": 9308 + }, + { + "epoch": 0.44108031272210374, + "grad_norm": 0.7265625, + "learning_rate": 0.00011848645481376428, + "loss": 1.1298, + "step": 9309 + }, + { + "epoch": 0.44112769485903813, + "grad_norm": 0.7109375, + "learning_rate": 0.00011847181870365481, + "loss": 0.8825, + "step": 9310 + }, + { + "epoch": 0.4411750769959725, + "grad_norm": 0.62109375, + "learning_rate": 0.00011845718218386006, + "loss": 0.8857, + "step": 9311 + }, + { + "epoch": 0.4412224591329069, + "grad_norm": 0.48828125, + "learning_rate": 0.00011844254525470468, + "loss": 0.6584, + "step": 9312 + }, + { + "epoch": 0.44126984126984126, + "grad_norm": 0.92578125, + "learning_rate": 0.00011842790791651333, + "loss": 0.9399, + "step": 9313 + }, + { + "epoch": 0.44131722340677565, + "grad_norm": 0.5234375, + "learning_rate": 0.00011841327016961055, + "loss": 1.0528, + "step": 9314 + }, + { + "epoch": 0.44136460554371004, + "grad_norm": 0.263671875, + "learning_rate": 0.00011839863201432106, + "loss": 0.0161, + "step": 9315 + }, + { + "epoch": 0.4414119876806444, + "grad_norm": 0.70703125, + "learning_rate": 0.00011838399345096953, + "loss": 0.6023, + "step": 9316 + }, + { + "epoch": 0.4414593698175788, + "grad_norm": 0.048095703125, + "learning_rate": 0.00011836935447988061, + "loss": 0.005, + "step": 9317 + }, + { + "epoch": 0.44150675195451317, + "grad_norm": 0.1513671875, + "learning_rate": 0.00011835471510137895, + "loss": 0.0147, + "step": 9318 + }, + { + "epoch": 0.4415541340914475, + "grad_norm": 0.5703125, + "learning_rate": 0.00011834007531578927, + "loss": 0.6816, + "step": 9319 + }, + { + "epoch": 0.4416015162283819, + "grad_norm": 0.54296875, + "learning_rate": 0.00011832543512343628, + "loss": 0.6251, + "step": 9320 + }, + { + "epoch": 0.4416488983653163, + "grad_norm": 0.66796875, + "learning_rate": 0.00011831079452464464, + "loss": 1.2253, + "step": 9321 + }, + { + "epoch": 0.4416962805022506, + "grad_norm": 0.82421875, + "learning_rate": 0.00011829615351973906, + "loss": 1.0539, + "step": 9322 + }, + { + "epoch": 0.441743662639185, + "grad_norm": 0.62109375, + "learning_rate": 0.0001182815121090443, + "loss": 1.1218, + "step": 9323 + }, + { + "epoch": 0.4417910447761194, + "grad_norm": 0.58203125, + "learning_rate": 0.0001182668702928851, + "loss": 0.6269, + "step": 9324 + }, + { + "epoch": 0.4418384269130538, + "grad_norm": 0.9609375, + "learning_rate": 0.00011825222807158613, + "loss": 1.0764, + "step": 9325 + }, + { + "epoch": 0.44188580904998814, + "grad_norm": 0.17578125, + "learning_rate": 0.00011823758544547221, + "loss": 0.1038, + "step": 9326 + }, + { + "epoch": 0.44193319118692254, + "grad_norm": 0.451171875, + "learning_rate": 0.00011822294241486805, + "loss": 0.1845, + "step": 9327 + }, + { + "epoch": 0.44198057332385693, + "grad_norm": 0.02978515625, + "learning_rate": 0.00011820829898009847, + "loss": 0.0009, + "step": 9328 + }, + { + "epoch": 0.44202795546079127, + "grad_norm": 0.69140625, + "learning_rate": 0.00011819365514148819, + "loss": 1.1411, + "step": 9329 + }, + { + "epoch": 0.44207533759772566, + "grad_norm": 0.3125, + "learning_rate": 0.00011817901089936201, + "loss": 0.0144, + "step": 9330 + }, + { + "epoch": 0.44212271973466005, + "grad_norm": 0.57421875, + "learning_rate": 0.00011816436625404474, + "loss": 0.0671, + "step": 9331 + }, + { + "epoch": 0.4421701018715944, + "grad_norm": 0.69921875, + "learning_rate": 0.00011814972120586118, + "loss": 1.2879, + "step": 9332 + }, + { + "epoch": 0.4422174840085288, + "grad_norm": 0.70703125, + "learning_rate": 0.00011813507575513614, + "loss": 0.9976, + "step": 9333 + }, + { + "epoch": 0.4422648661454632, + "grad_norm": 0.65625, + "learning_rate": 0.00011812042990219441, + "loss": 0.8429, + "step": 9334 + }, + { + "epoch": 0.4423122482823975, + "grad_norm": 0.78125, + "learning_rate": 0.00011810578364736089, + "loss": 1.0488, + "step": 9335 + }, + { + "epoch": 0.4423596304193319, + "grad_norm": 0.63671875, + "learning_rate": 0.00011809113699096033, + "loss": 1.1678, + "step": 9336 + }, + { + "epoch": 0.4424070125562663, + "grad_norm": 0.70703125, + "learning_rate": 0.00011807648993331766, + "loss": 0.9482, + "step": 9337 + }, + { + "epoch": 0.44245439469320064, + "grad_norm": 0.69140625, + "learning_rate": 0.00011806184247475766, + "loss": 0.8572, + "step": 9338 + }, + { + "epoch": 0.44250177683013503, + "grad_norm": 0.65234375, + "learning_rate": 0.00011804719461560525, + "loss": 0.8175, + "step": 9339 + }, + { + "epoch": 0.4425491589670694, + "grad_norm": 0.56640625, + "learning_rate": 0.00011803254635618531, + "loss": 1.0617, + "step": 9340 + }, + { + "epoch": 0.4425965411040038, + "grad_norm": 0.6796875, + "learning_rate": 0.00011801789769682266, + "loss": 0.6566, + "step": 9341 + }, + { + "epoch": 0.44264392324093815, + "grad_norm": 0.60546875, + "learning_rate": 0.00011800324863784225, + "loss": 1.1249, + "step": 9342 + }, + { + "epoch": 0.44269130537787255, + "grad_norm": 0.63671875, + "learning_rate": 0.00011798859917956897, + "loss": 0.5442, + "step": 9343 + }, + { + "epoch": 0.44273868751480694, + "grad_norm": 0.55859375, + "learning_rate": 0.00011797394932232769, + "loss": 0.6662, + "step": 9344 + }, + { + "epoch": 0.4427860696517413, + "grad_norm": 0.83984375, + "learning_rate": 0.00011795929906644339, + "loss": 1.1757, + "step": 9345 + }, + { + "epoch": 0.44283345178867567, + "grad_norm": 0.85546875, + "learning_rate": 0.00011794464841224095, + "loss": 0.2145, + "step": 9346 + }, + { + "epoch": 0.44288083392561006, + "grad_norm": 0.53515625, + "learning_rate": 0.00011792999736004533, + "loss": 0.75, + "step": 9347 + }, + { + "epoch": 0.4429282160625444, + "grad_norm": 0.71875, + "learning_rate": 0.00011791534591018148, + "loss": 1.4978, + "step": 9348 + }, + { + "epoch": 0.4429755981994788, + "grad_norm": 0.6953125, + "learning_rate": 0.00011790069406297431, + "loss": 0.9863, + "step": 9349 + }, + { + "epoch": 0.4430229803364132, + "grad_norm": 0.63671875, + "learning_rate": 0.00011788604181874881, + "loss": 0.8307, + "step": 9350 + }, + { + "epoch": 0.4430703624733475, + "grad_norm": 0.765625, + "learning_rate": 0.00011787138917782999, + "loss": 1.3348, + "step": 9351 + }, + { + "epoch": 0.4431177446102819, + "grad_norm": 0.004852294921875, + "learning_rate": 0.00011785673614054277, + "loss": 0.0002, + "step": 9352 + }, + { + "epoch": 0.4431651267472163, + "grad_norm": 0.255859375, + "learning_rate": 0.00011784208270721216, + "loss": 0.1688, + "step": 9353 + }, + { + "epoch": 0.4432125088841507, + "grad_norm": 0.6328125, + "learning_rate": 0.00011782742887816316, + "loss": 1.3078, + "step": 9354 + }, + { + "epoch": 0.44325989102108504, + "grad_norm": 0.6328125, + "learning_rate": 0.0001178127746537208, + "loss": 1.1072, + "step": 9355 + }, + { + "epoch": 0.44330727315801943, + "grad_norm": 0.66015625, + "learning_rate": 0.00011779812003421004, + "loss": 0.9367, + "step": 9356 + }, + { + "epoch": 0.4433546552949538, + "grad_norm": 0.69921875, + "learning_rate": 0.00011778346501995594, + "loss": 0.7521, + "step": 9357 + }, + { + "epoch": 0.44340203743188816, + "grad_norm": 0.68359375, + "learning_rate": 0.00011776880961128356, + "loss": 0.9882, + "step": 9358 + }, + { + "epoch": 0.44344941956882256, + "grad_norm": 0.66796875, + "learning_rate": 0.00011775415380851787, + "loss": 1.1479, + "step": 9359 + }, + { + "epoch": 0.44349680170575695, + "grad_norm": 0.244140625, + "learning_rate": 0.000117739497611984, + "loss": 0.1426, + "step": 9360 + }, + { + "epoch": 0.4435441838426913, + "grad_norm": 0.66796875, + "learning_rate": 0.00011772484102200693, + "loss": 1.0151, + "step": 9361 + }, + { + "epoch": 0.4435915659796257, + "grad_norm": 0.765625, + "learning_rate": 0.00011771018403891176, + "loss": 1.0357, + "step": 9362 + }, + { + "epoch": 0.4436389481165601, + "grad_norm": 0.62890625, + "learning_rate": 0.00011769552666302361, + "loss": 1.2855, + "step": 9363 + }, + { + "epoch": 0.4436863302534944, + "grad_norm": 0.6796875, + "learning_rate": 0.00011768086889466752, + "loss": 1.3557, + "step": 9364 + }, + { + "epoch": 0.4437337123904288, + "grad_norm": 0.185546875, + "learning_rate": 0.00011766621073416857, + "loss": 0.1146, + "step": 9365 + }, + { + "epoch": 0.4437810945273632, + "grad_norm": 0.5390625, + "learning_rate": 0.00011765155218185188, + "loss": 0.685, + "step": 9366 + }, + { + "epoch": 0.44382847666429753, + "grad_norm": 0.640625, + "learning_rate": 0.00011763689323804258, + "loss": 0.8738, + "step": 9367 + }, + { + "epoch": 0.4438758588012319, + "grad_norm": 0.76171875, + "learning_rate": 0.00011762223390306579, + "loss": 0.9329, + "step": 9368 + }, + { + "epoch": 0.4439232409381663, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011760757417724659, + "loss": 0.0706, + "step": 9369 + }, + { + "epoch": 0.4439706230751007, + "grad_norm": 0.0693359375, + "learning_rate": 0.00011759291406091018, + "loss": 0.0074, + "step": 9370 + }, + { + "epoch": 0.44401800521203505, + "grad_norm": 0.73046875, + "learning_rate": 0.00011757825355438165, + "loss": 1.3688, + "step": 9371 + }, + { + "epoch": 0.44406538734896944, + "grad_norm": 0.44921875, + "learning_rate": 0.00011756359265798618, + "loss": 0.2759, + "step": 9372 + }, + { + "epoch": 0.44411276948590384, + "grad_norm": 0.032470703125, + "learning_rate": 0.00011754893137204895, + "loss": 0.0032, + "step": 9373 + }, + { + "epoch": 0.4441601516228382, + "grad_norm": 0.54296875, + "learning_rate": 0.0001175342696968951, + "loss": 0.7585, + "step": 9374 + }, + { + "epoch": 0.44420753375977257, + "grad_norm": 0.072265625, + "learning_rate": 0.00011751960763284981, + "loss": 0.0044, + "step": 9375 + }, + { + "epoch": 0.44425491589670696, + "grad_norm": 0.5390625, + "learning_rate": 0.00011750494518023834, + "loss": 0.7121, + "step": 9376 + }, + { + "epoch": 0.4443022980336413, + "grad_norm": 0.6484375, + "learning_rate": 0.00011749028233938577, + "loss": 0.8757, + "step": 9377 + }, + { + "epoch": 0.4443496801705757, + "grad_norm": 0.212890625, + "learning_rate": 0.00011747561911061742, + "loss": 0.1564, + "step": 9378 + }, + { + "epoch": 0.4443970623075101, + "grad_norm": 0.5, + "learning_rate": 0.00011746095549425841, + "loss": 1.078, + "step": 9379 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.6484375, + "learning_rate": 0.00011744629149063403, + "loss": 0.6784, + "step": 9380 + }, + { + "epoch": 0.4444918265813788, + "grad_norm": 0.625, + "learning_rate": 0.00011743162710006947, + "loss": 0.051, + "step": 9381 + }, + { + "epoch": 0.4445392087183132, + "grad_norm": 0.72265625, + "learning_rate": 0.00011741696232288998, + "loss": 0.983, + "step": 9382 + }, + { + "epoch": 0.4445865908552476, + "grad_norm": 0.625, + "learning_rate": 0.00011740229715942083, + "loss": 0.6347, + "step": 9383 + }, + { + "epoch": 0.44463397299218194, + "grad_norm": 0.5859375, + "learning_rate": 0.00011738763160998729, + "loss": 0.9181, + "step": 9384 + }, + { + "epoch": 0.44468135512911633, + "grad_norm": 0.0023345947265625, + "learning_rate": 0.00011737296567491459, + "loss": 0.0001, + "step": 9385 + }, + { + "epoch": 0.4447287372660507, + "grad_norm": 0.46875, + "learning_rate": 0.000117358299354528, + "loss": 0.0823, + "step": 9386 + }, + { + "epoch": 0.44477611940298506, + "grad_norm": 0.51171875, + "learning_rate": 0.00011734363264915282, + "loss": 0.7144, + "step": 9387 + }, + { + "epoch": 0.44482350153991945, + "grad_norm": 0.2314453125, + "learning_rate": 0.00011732896555911435, + "loss": 0.1086, + "step": 9388 + }, + { + "epoch": 0.44487088367685385, + "grad_norm": 0.185546875, + "learning_rate": 0.00011731429808473788, + "loss": 0.1366, + "step": 9389 + }, + { + "epoch": 0.4449182658137882, + "grad_norm": 0.73046875, + "learning_rate": 0.00011729963022634872, + "loss": 1.0817, + "step": 9390 + }, + { + "epoch": 0.4449656479507226, + "grad_norm": 0.435546875, + "learning_rate": 0.00011728496198427217, + "loss": 0.7142, + "step": 9391 + }, + { + "epoch": 0.44501303008765697, + "grad_norm": 0.6171875, + "learning_rate": 0.00011727029335883361, + "loss": 0.649, + "step": 9392 + }, + { + "epoch": 0.4450604122245913, + "grad_norm": 0.703125, + "learning_rate": 0.00011725562435035833, + "loss": 1.0864, + "step": 9393 + }, + { + "epoch": 0.4451077943615257, + "grad_norm": 0.4609375, + "learning_rate": 0.00011724095495917167, + "loss": 0.5156, + "step": 9394 + }, + { + "epoch": 0.4451551764984601, + "grad_norm": 1.703125, + "learning_rate": 0.00011722628518559899, + "loss": 0.0558, + "step": 9395 + }, + { + "epoch": 0.44520255863539443, + "grad_norm": 0.765625, + "learning_rate": 0.00011721161502996565, + "loss": 0.5618, + "step": 9396 + }, + { + "epoch": 0.4452499407723288, + "grad_norm": 0.48828125, + "learning_rate": 0.00011719694449259706, + "loss": 0.7703, + "step": 9397 + }, + { + "epoch": 0.4452973229092632, + "grad_norm": 0.87109375, + "learning_rate": 0.0001171822735738185, + "loss": 0.9965, + "step": 9398 + }, + { + "epoch": 0.4453447050461976, + "grad_norm": 0.64453125, + "learning_rate": 0.00011716760227395545, + "loss": 1.0266, + "step": 9399 + }, + { + "epoch": 0.44539208718313195, + "grad_norm": 0.06005859375, + "learning_rate": 0.00011715293059333327, + "loss": 0.0017, + "step": 9400 + }, + { + "epoch": 0.44543946932006634, + "grad_norm": 0.6328125, + "learning_rate": 0.00011713825853227738, + "loss": 1.2956, + "step": 9401 + }, + { + "epoch": 0.44548685145700073, + "grad_norm": 0.64453125, + "learning_rate": 0.00011712358609111311, + "loss": 1.0393, + "step": 9402 + }, + { + "epoch": 0.44553423359393507, + "grad_norm": 0.79296875, + "learning_rate": 0.00011710891327016597, + "loss": 1.3742, + "step": 9403 + }, + { + "epoch": 0.44558161573086946, + "grad_norm": 1.171875, + "learning_rate": 0.00011709424006976138, + "loss": 1.0638, + "step": 9404 + }, + { + "epoch": 0.44562899786780386, + "grad_norm": 0.62890625, + "learning_rate": 0.0001170795664902247, + "loss": 0.94, + "step": 9405 + }, + { + "epoch": 0.4456763800047382, + "grad_norm": 0.470703125, + "learning_rate": 0.00011706489253188147, + "loss": 0.5283, + "step": 9406 + }, + { + "epoch": 0.4457237621416726, + "grad_norm": 0.68359375, + "learning_rate": 0.0001170502181950571, + "loss": 1.1417, + "step": 9407 + }, + { + "epoch": 0.445771144278607, + "grad_norm": 0.8046875, + "learning_rate": 0.00011703554348007704, + "loss": 0.9421, + "step": 9408 + }, + { + "epoch": 0.4458185264155413, + "grad_norm": 0.6953125, + "learning_rate": 0.00011702086838726679, + "loss": 0.7954, + "step": 9409 + }, + { + "epoch": 0.4458659085524757, + "grad_norm": 0.64453125, + "learning_rate": 0.00011700619291695178, + "loss": 1.0548, + "step": 9410 + }, + { + "epoch": 0.4459132906894101, + "grad_norm": 0.447265625, + "learning_rate": 0.00011699151706945753, + "loss": 0.0486, + "step": 9411 + }, + { + "epoch": 0.4459606728263445, + "grad_norm": 0.4921875, + "learning_rate": 0.00011697684084510956, + "loss": 0.8602, + "step": 9412 + }, + { + "epoch": 0.44600805496327883, + "grad_norm": 1.203125, + "learning_rate": 0.0001169621642442333, + "loss": 0.7969, + "step": 9413 + }, + { + "epoch": 0.4460554371002132, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011694748726715432, + "loss": 0.1151, + "step": 9414 + }, + { + "epoch": 0.4461028192371476, + "grad_norm": 0.119140625, + "learning_rate": 0.00011693280991419815, + "loss": 0.0798, + "step": 9415 + }, + { + "epoch": 0.44615020137408196, + "grad_norm": 0.5625, + "learning_rate": 0.00011691813218569027, + "loss": 0.7561, + "step": 9416 + }, + { + "epoch": 0.44619758351101635, + "grad_norm": 0.703125, + "learning_rate": 0.00011690345408195624, + "loss": 1.4186, + "step": 9417 + }, + { + "epoch": 0.44624496564795074, + "grad_norm": 0.609375, + "learning_rate": 0.00011688877560332158, + "loss": 0.9857, + "step": 9418 + }, + { + "epoch": 0.4462923477848851, + "grad_norm": 0.625, + "learning_rate": 0.00011687409675011189, + "loss": 0.8451, + "step": 9419 + }, + { + "epoch": 0.4463397299218195, + "grad_norm": 0.55859375, + "learning_rate": 0.00011685941752265272, + "loss": 0.5844, + "step": 9420 + }, + { + "epoch": 0.44638711205875387, + "grad_norm": 0.69140625, + "learning_rate": 0.0001168447379212696, + "loss": 0.0114, + "step": 9421 + }, + { + "epoch": 0.4464344941956882, + "grad_norm": 0.13671875, + "learning_rate": 0.00011683005794628814, + "loss": 0.017, + "step": 9422 + }, + { + "epoch": 0.4464818763326226, + "grad_norm": 0.234375, + "learning_rate": 0.00011681537759803393, + "loss": 0.0175, + "step": 9423 + }, + { + "epoch": 0.446529258469557, + "grad_norm": 0.66015625, + "learning_rate": 0.0001168006968768326, + "loss": 0.9998, + "step": 9424 + }, + { + "epoch": 0.4465766406064913, + "grad_norm": 0.38671875, + "learning_rate": 0.00011678601578300965, + "loss": 0.4831, + "step": 9425 + }, + { + "epoch": 0.4466240227434257, + "grad_norm": 0.67578125, + "learning_rate": 0.00011677133431689075, + "loss": 0.8602, + "step": 9426 + }, + { + "epoch": 0.4466714048803601, + "grad_norm": 0.59765625, + "learning_rate": 0.00011675665247880154, + "loss": 0.6564, + "step": 9427 + }, + { + "epoch": 0.4467187870172945, + "grad_norm": 0.5078125, + "learning_rate": 0.00011674197026906762, + "loss": 0.7945, + "step": 9428 + }, + { + "epoch": 0.44676616915422884, + "grad_norm": 0.94140625, + "learning_rate": 0.00011672728768801461, + "loss": 0.4851, + "step": 9429 + }, + { + "epoch": 0.44681355129116324, + "grad_norm": 0.828125, + "learning_rate": 0.00011671260473596819, + "loss": 0.5367, + "step": 9430 + }, + { + "epoch": 0.44686093342809763, + "grad_norm": 0.69140625, + "learning_rate": 0.00011669792141325402, + "loss": 0.5629, + "step": 9431 + }, + { + "epoch": 0.44690831556503197, + "grad_norm": 0.69921875, + "learning_rate": 0.00011668323772019774, + "loss": 0.0414, + "step": 9432 + }, + { + "epoch": 0.44695569770196636, + "grad_norm": 0.68359375, + "learning_rate": 0.00011666855365712499, + "loss": 1.2052, + "step": 9433 + }, + { + "epoch": 0.44700307983890075, + "grad_norm": 0.69140625, + "learning_rate": 0.00011665386922436148, + "loss": 0.7529, + "step": 9434 + }, + { + "epoch": 0.4470504619758351, + "grad_norm": 0.54296875, + "learning_rate": 0.00011663918442223292, + "loss": 1.1898, + "step": 9435 + }, + { + "epoch": 0.4470978441127695, + "grad_norm": 0.2138671875, + "learning_rate": 0.00011662449925106493, + "loss": 0.0194, + "step": 9436 + }, + { + "epoch": 0.4471452262497039, + "grad_norm": 0.80859375, + "learning_rate": 0.00011660981371118327, + "loss": 0.4117, + "step": 9437 + }, + { + "epoch": 0.4471926083866382, + "grad_norm": 0.69140625, + "learning_rate": 0.00011659512780291364, + "loss": 1.0694, + "step": 9438 + }, + { + "epoch": 0.4472399905235726, + "grad_norm": 0.16015625, + "learning_rate": 0.00011658044152658179, + "loss": 0.128, + "step": 9439 + }, + { + "epoch": 0.447287372660507, + "grad_norm": 0.59765625, + "learning_rate": 0.00011656575488251333, + "loss": 0.9505, + "step": 9440 + }, + { + "epoch": 0.4473347547974414, + "grad_norm": 0.126953125, + "learning_rate": 0.00011655106787103411, + "loss": 0.016, + "step": 9441 + }, + { + "epoch": 0.44738213693437573, + "grad_norm": 0.6328125, + "learning_rate": 0.00011653638049246982, + "loss": 0.5005, + "step": 9442 + }, + { + "epoch": 0.4474295190713101, + "grad_norm": 0.6640625, + "learning_rate": 0.00011652169274714626, + "loss": 1.0562, + "step": 9443 + }, + { + "epoch": 0.4474769012082445, + "grad_norm": 0.54296875, + "learning_rate": 0.0001165070046353891, + "loss": 0.2267, + "step": 9444 + }, + { + "epoch": 0.44752428334517885, + "grad_norm": 0.75390625, + "learning_rate": 0.0001164923161575242, + "loss": 1.0425, + "step": 9445 + }, + { + "epoch": 0.44757166548211325, + "grad_norm": 0.58984375, + "learning_rate": 0.0001164776273138773, + "loss": 0.9655, + "step": 9446 + }, + { + "epoch": 0.44761904761904764, + "grad_norm": 0.60546875, + "learning_rate": 0.00011646293810477418, + "loss": 1.1615, + "step": 9447 + }, + { + "epoch": 0.447666429755982, + "grad_norm": 0.85546875, + "learning_rate": 0.0001164482485305406, + "loss": 1.0926, + "step": 9448 + }, + { + "epoch": 0.44771381189291637, + "grad_norm": 0.37109375, + "learning_rate": 0.00011643355859150237, + "loss": 0.1574, + "step": 9449 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.67578125, + "learning_rate": 0.00011641886828798536, + "loss": 0.8106, + "step": 9450 + }, + { + "epoch": 0.4478085761667851, + "grad_norm": 0.16796875, + "learning_rate": 0.00011640417762031533, + "loss": 0.1354, + "step": 9451 + }, + { + "epoch": 0.4478559583037195, + "grad_norm": 0.578125, + "learning_rate": 0.00011638948658881808, + "loss": 0.5431, + "step": 9452 + }, + { + "epoch": 0.4479033404406539, + "grad_norm": 0.66796875, + "learning_rate": 0.00011637479519381949, + "loss": 1.068, + "step": 9453 + }, + { + "epoch": 0.4479507225775882, + "grad_norm": 0.55859375, + "learning_rate": 0.0001163601034356454, + "loss": 1.0824, + "step": 9454 + }, + { + "epoch": 0.4479981047145226, + "grad_norm": 0.546875, + "learning_rate": 0.00011634541131462167, + "loss": 0.9222, + "step": 9455 + }, + { + "epoch": 0.448045486851457, + "grad_norm": 0.39453125, + "learning_rate": 0.00011633071883107407, + "loss": 0.1449, + "step": 9456 + }, + { + "epoch": 0.4480928689883914, + "grad_norm": 0.7109375, + "learning_rate": 0.00011631602598532854, + "loss": 0.99, + "step": 9457 + }, + { + "epoch": 0.44814025112532574, + "grad_norm": 0.4453125, + "learning_rate": 0.00011630133277771096, + "loss": 0.6691, + "step": 9458 + }, + { + "epoch": 0.44818763326226013, + "grad_norm": 0.5859375, + "learning_rate": 0.00011628663920854714, + "loss": 0.7797, + "step": 9459 + }, + { + "epoch": 0.4482350153991945, + "grad_norm": 0.93359375, + "learning_rate": 0.00011627194527816304, + "loss": 0.6866, + "step": 9460 + }, + { + "epoch": 0.44828239753612886, + "grad_norm": 0.6875, + "learning_rate": 0.0001162572509868845, + "loss": 1.1991, + "step": 9461 + }, + { + "epoch": 0.44832977967306326, + "grad_norm": 0.62109375, + "learning_rate": 0.00011624255633503749, + "loss": 1.1431, + "step": 9462 + }, + { + "epoch": 0.44837716180999765, + "grad_norm": 0.671875, + "learning_rate": 0.00011622786132294784, + "loss": 1.2445, + "step": 9463 + }, + { + "epoch": 0.448424543946932, + "grad_norm": 0.69140625, + "learning_rate": 0.00011621316595094151, + "loss": 0.8336, + "step": 9464 + }, + { + "epoch": 0.4484719260838664, + "grad_norm": 0.74609375, + "learning_rate": 0.00011619847021934445, + "loss": 1.1235, + "step": 9465 + }, + { + "epoch": 0.44851930822080077, + "grad_norm": 0.703125, + "learning_rate": 0.00011618377412848258, + "loss": 1.0955, + "step": 9466 + }, + { + "epoch": 0.4485666903577351, + "grad_norm": 0.9375, + "learning_rate": 0.00011616907767868181, + "loss": 0.2217, + "step": 9467 + }, + { + "epoch": 0.4486140724946695, + "grad_norm": 0.6171875, + "learning_rate": 0.00011615438087026816, + "loss": 0.8414, + "step": 9468 + }, + { + "epoch": 0.4486614546316039, + "grad_norm": 0.79296875, + "learning_rate": 0.00011613968370356754, + "loss": 1.3303, + "step": 9469 + }, + { + "epoch": 0.4487088367685383, + "grad_norm": 0.6484375, + "learning_rate": 0.0001161249861789059, + "loss": 0.9061, + "step": 9470 + }, + { + "epoch": 0.4487562189054726, + "grad_norm": 1.171875, + "learning_rate": 0.00011611028829660925, + "loss": 0.924, + "step": 9471 + }, + { + "epoch": 0.448803601042407, + "grad_norm": 0.62890625, + "learning_rate": 0.00011609559005700356, + "loss": 1.097, + "step": 9472 + }, + { + "epoch": 0.4488509831793414, + "grad_norm": 0.484375, + "learning_rate": 0.00011608089146041487, + "loss": 0.8251, + "step": 9473 + }, + { + "epoch": 0.44889836531627575, + "grad_norm": 0.13671875, + "learning_rate": 0.0001160661925071691, + "loss": 0.0071, + "step": 9474 + }, + { + "epoch": 0.44894574745321014, + "grad_norm": 0.828125, + "learning_rate": 0.00011605149319759228, + "loss": 1.0481, + "step": 9475 + }, + { + "epoch": 0.44899312959014454, + "grad_norm": 1.0625, + "learning_rate": 0.00011603679353201044, + "loss": 1.073, + "step": 9476 + }, + { + "epoch": 0.4490405117270789, + "grad_norm": 0.7421875, + "learning_rate": 0.00011602209351074963, + "loss": 0.9888, + "step": 9477 + }, + { + "epoch": 0.44908789386401327, + "grad_norm": 0.58984375, + "learning_rate": 0.00011600739313413587, + "loss": 0.9214, + "step": 9478 + }, + { + "epoch": 0.44913527600094766, + "grad_norm": 0.66796875, + "learning_rate": 0.00011599269240249513, + "loss": 0.9524, + "step": 9479 + }, + { + "epoch": 0.449182658137882, + "grad_norm": 0.314453125, + "learning_rate": 0.00011597799131615352, + "loss": 0.1399, + "step": 9480 + }, + { + "epoch": 0.4492300402748164, + "grad_norm": 0.04541015625, + "learning_rate": 0.0001159632898754371, + "loss": 0.0011, + "step": 9481 + }, + { + "epoch": 0.4492774224117508, + "grad_norm": 0.546875, + "learning_rate": 0.00011594858808067188, + "loss": 0.6639, + "step": 9482 + }, + { + "epoch": 0.4493248045486851, + "grad_norm": 0.03857421875, + "learning_rate": 0.000115933885932184, + "loss": 0.0029, + "step": 9483 + }, + { + "epoch": 0.4493721866856195, + "grad_norm": 0.63671875, + "learning_rate": 0.00011591918343029946, + "loss": 0.8705, + "step": 9484 + }, + { + "epoch": 0.4494195688225539, + "grad_norm": 0.08642578125, + "learning_rate": 0.00011590448057534445, + "loss": 0.007, + "step": 9485 + }, + { + "epoch": 0.4494669509594883, + "grad_norm": 0.1337890625, + "learning_rate": 0.00011588977736764497, + "loss": 0.0142, + "step": 9486 + }, + { + "epoch": 0.44951433309642264, + "grad_norm": 0.62109375, + "learning_rate": 0.00011587507380752713, + "loss": 0.7774, + "step": 9487 + }, + { + "epoch": 0.44956171523335703, + "grad_norm": 0.578125, + "learning_rate": 0.00011586036989531707, + "loss": 0.823, + "step": 9488 + }, + { + "epoch": 0.4496090973702914, + "grad_norm": 0.703125, + "learning_rate": 0.00011584566563134094, + "loss": 0.9035, + "step": 9489 + }, + { + "epoch": 0.44965647950722576, + "grad_norm": 0.6796875, + "learning_rate": 0.00011583096101592479, + "loss": 1.0848, + "step": 9490 + }, + { + "epoch": 0.44970386164416015, + "grad_norm": 0.65234375, + "learning_rate": 0.00011581625604939477, + "loss": 1.6401, + "step": 9491 + }, + { + "epoch": 0.44975124378109455, + "grad_norm": 0.59375, + "learning_rate": 0.00011580155073207707, + "loss": 1.2194, + "step": 9492 + }, + { + "epoch": 0.4497986259180289, + "grad_norm": 0.216796875, + "learning_rate": 0.0001157868450642978, + "loss": 0.0073, + "step": 9493 + }, + { + "epoch": 0.4498460080549633, + "grad_norm": 0.796875, + "learning_rate": 0.00011577213904638313, + "loss": 1.1233, + "step": 9494 + }, + { + "epoch": 0.44989339019189767, + "grad_norm": 0.2109375, + "learning_rate": 0.0001157574326786592, + "loss": 0.1325, + "step": 9495 + }, + { + "epoch": 0.449940772328832, + "grad_norm": 0.0274658203125, + "learning_rate": 0.00011574272596145221, + "loss": 0.0009, + "step": 9496 + }, + { + "epoch": 0.4499881544657664, + "grad_norm": 0.166015625, + "learning_rate": 0.00011572801889508831, + "loss": 0.1232, + "step": 9497 + }, + { + "epoch": 0.4500355366027008, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001157133114798937, + "loss": 0.1474, + "step": 9498 + }, + { + "epoch": 0.4500829187396352, + "grad_norm": 0.5546875, + "learning_rate": 0.00011569860371619459, + "loss": 0.1166, + "step": 9499 + }, + { + "epoch": 0.4501303008765695, + "grad_norm": 0.75390625, + "learning_rate": 0.00011568389560431719, + "loss": 0.7246, + "step": 9500 + }, + { + "epoch": 0.4501776830135039, + "grad_norm": 0.5703125, + "learning_rate": 0.00011566918714458765, + "loss": 0.4216, + "step": 9501 + }, + { + "epoch": 0.4502250651504383, + "grad_norm": 0.95703125, + "learning_rate": 0.00011565447833733227, + "loss": 0.0335, + "step": 9502 + }, + { + "epoch": 0.45027244728737265, + "grad_norm": 0.095703125, + "learning_rate": 0.0001156397691828772, + "loss": 0.0089, + "step": 9503 + }, + { + "epoch": 0.45031982942430704, + "grad_norm": 0.455078125, + "learning_rate": 0.00011562505968154876, + "loss": 0.4767, + "step": 9504 + }, + { + "epoch": 0.45036721156124143, + "grad_norm": 0.578125, + "learning_rate": 0.00011561034983367307, + "loss": 0.5419, + "step": 9505 + }, + { + "epoch": 0.45041459369817577, + "grad_norm": 0.435546875, + "learning_rate": 0.00011559563963957649, + "loss": 0.4571, + "step": 9506 + }, + { + "epoch": 0.45046197583511016, + "grad_norm": 0.7421875, + "learning_rate": 0.0001155809290995852, + "loss": 0.8394, + "step": 9507 + }, + { + "epoch": 0.45050935797204456, + "grad_norm": 0.1416015625, + "learning_rate": 0.00011556621821402556, + "loss": 0.008, + "step": 9508 + }, + { + "epoch": 0.4505567401089789, + "grad_norm": 0.7734375, + "learning_rate": 0.00011555150698322372, + "loss": 1.2913, + "step": 9509 + }, + { + "epoch": 0.4506041222459133, + "grad_norm": 0.10693359375, + "learning_rate": 0.00011553679540750605, + "loss": 0.0034, + "step": 9510 + }, + { + "epoch": 0.4506515043828477, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001155220834871988, + "loss": 0.0453, + "step": 9511 + }, + { + "epoch": 0.450698886519782, + "grad_norm": 0.162109375, + "learning_rate": 0.0001155073712226283, + "loss": 0.1167, + "step": 9512 + }, + { + "epoch": 0.4507462686567164, + "grad_norm": 0.62109375, + "learning_rate": 0.00011549265861412077, + "loss": 1.2963, + "step": 9513 + }, + { + "epoch": 0.4507936507936508, + "grad_norm": 0.63671875, + "learning_rate": 0.00011547794566200261, + "loss": 1.3112, + "step": 9514 + }, + { + "epoch": 0.4508410329305852, + "grad_norm": 0.263671875, + "learning_rate": 0.0001154632323666001, + "loss": 0.1733, + "step": 9515 + }, + { + "epoch": 0.45088841506751953, + "grad_norm": 0.5703125, + "learning_rate": 0.00011544851872823956, + "loss": 0.8656, + "step": 9516 + }, + { + "epoch": 0.4509357972044539, + "grad_norm": 0.8203125, + "learning_rate": 0.00011543380474724735, + "loss": 1.2624, + "step": 9517 + }, + { + "epoch": 0.4509831793413883, + "grad_norm": 0.57421875, + "learning_rate": 0.00011541909042394974, + "loss": 1.0939, + "step": 9518 + }, + { + "epoch": 0.45103056147832266, + "grad_norm": 0.75390625, + "learning_rate": 0.00011540437575867315, + "loss": 0.6567, + "step": 9519 + }, + { + "epoch": 0.45107794361525705, + "grad_norm": 0.47265625, + "learning_rate": 0.00011538966075174396, + "loss": 0.0171, + "step": 9520 + }, + { + "epoch": 0.45112532575219144, + "grad_norm": 0.59375, + "learning_rate": 0.00011537494540348843, + "loss": 0.7626, + "step": 9521 + }, + { + "epoch": 0.4511727078891258, + "grad_norm": 0.7109375, + "learning_rate": 0.000115360229714233, + "loss": 1.0279, + "step": 9522 + }, + { + "epoch": 0.45122009002606017, + "grad_norm": 0.78125, + "learning_rate": 0.00011534551368430404, + "loss": 0.9124, + "step": 9523 + }, + { + "epoch": 0.45126747216299457, + "grad_norm": 0.6328125, + "learning_rate": 0.00011533079731402793, + "loss": 0.5132, + "step": 9524 + }, + { + "epoch": 0.4513148542999289, + "grad_norm": 0.65234375, + "learning_rate": 0.00011531608060373109, + "loss": 1.246, + "step": 9525 + }, + { + "epoch": 0.4513622364368633, + "grad_norm": 0.63671875, + "learning_rate": 0.00011530136355373986, + "loss": 1.0366, + "step": 9526 + }, + { + "epoch": 0.4514096185737977, + "grad_norm": 0.01202392578125, + "learning_rate": 0.00011528664616438072, + "loss": 0.0007, + "step": 9527 + }, + { + "epoch": 0.4514570007107321, + "grad_norm": 0.8671875, + "learning_rate": 0.00011527192843598002, + "loss": 1.3601, + "step": 9528 + }, + { + "epoch": 0.4515043828476664, + "grad_norm": 0.416015625, + "learning_rate": 0.00011525721036886421, + "loss": 0.1206, + "step": 9529 + }, + { + "epoch": 0.4515517649846008, + "grad_norm": 0.28515625, + "learning_rate": 0.00011524249196335974, + "loss": 0.0013, + "step": 9530 + }, + { + "epoch": 0.4515991471215352, + "grad_norm": 0.65234375, + "learning_rate": 0.00011522777321979303, + "loss": 0.4378, + "step": 9531 + }, + { + "epoch": 0.45164652925846954, + "grad_norm": 0.80859375, + "learning_rate": 0.00011521305413849055, + "loss": 0.9599, + "step": 9532 + }, + { + "epoch": 0.45169391139540394, + "grad_norm": 0.51953125, + "learning_rate": 0.00011519833471977873, + "loss": 1.0324, + "step": 9533 + }, + { + "epoch": 0.45174129353233833, + "grad_norm": 0.296875, + "learning_rate": 0.000115183614963984, + "loss": 0.0441, + "step": 9534 + }, + { + "epoch": 0.45178867566927267, + "grad_norm": 0.79296875, + "learning_rate": 0.0001151688948714329, + "loss": 1.2256, + "step": 9535 + }, + { + "epoch": 0.45183605780620706, + "grad_norm": 0.484375, + "learning_rate": 0.00011515417444245185, + "loss": 1.162, + "step": 9536 + }, + { + "epoch": 0.45188343994314145, + "grad_norm": 0.55859375, + "learning_rate": 0.00011513945367736733, + "loss": 0.1426, + "step": 9537 + }, + { + "epoch": 0.4519308220800758, + "grad_norm": 0.038818359375, + "learning_rate": 0.00011512473257650592, + "loss": 0.0041, + "step": 9538 + }, + { + "epoch": 0.4519782042170102, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00011511001114019399, + "loss": 0.002, + "step": 9539 + }, + { + "epoch": 0.4520255863539446, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001150952893687581, + "loss": 0.0135, + "step": 9540 + }, + { + "epoch": 0.4520729684908789, + "grad_norm": 0.5703125, + "learning_rate": 0.00011508056726252482, + "loss": 1.2194, + "step": 9541 + }, + { + "epoch": 0.4521203506278133, + "grad_norm": 0.74609375, + "learning_rate": 0.00011506584482182061, + "loss": 0.9896, + "step": 9542 + }, + { + "epoch": 0.4521677327647477, + "grad_norm": 0.734375, + "learning_rate": 0.000115051122046972, + "loss": 0.9118, + "step": 9543 + }, + { + "epoch": 0.4522151149016821, + "grad_norm": 0.7265625, + "learning_rate": 0.00011503639893830553, + "loss": 1.2259, + "step": 9544 + }, + { + "epoch": 0.45226249703861643, + "grad_norm": 0.34765625, + "learning_rate": 0.00011502167549614773, + "loss": 0.0313, + "step": 9545 + }, + { + "epoch": 0.4523098791755508, + "grad_norm": 0.291015625, + "learning_rate": 0.00011500695172082521, + "loss": 0.1573, + "step": 9546 + }, + { + "epoch": 0.4523572613124852, + "grad_norm": 0.6640625, + "learning_rate": 0.00011499222761266446, + "loss": 0.8081, + "step": 9547 + }, + { + "epoch": 0.45240464344941955, + "grad_norm": 0.6640625, + "learning_rate": 0.00011497750317199208, + "loss": 0.8415, + "step": 9548 + }, + { + "epoch": 0.45245202558635395, + "grad_norm": 0.81640625, + "learning_rate": 0.00011496277839913463, + "loss": 0.1733, + "step": 9549 + }, + { + "epoch": 0.45249940772328834, + "grad_norm": 0.71484375, + "learning_rate": 0.00011494805329441871, + "loss": 1.1652, + "step": 9550 + }, + { + "epoch": 0.4525467898602227, + "grad_norm": 0.70703125, + "learning_rate": 0.00011493332785817086, + "loss": 1.0033, + "step": 9551 + }, + { + "epoch": 0.45259417199715707, + "grad_norm": 0.78515625, + "learning_rate": 0.0001149186020907177, + "loss": 1.1452, + "step": 9552 + }, + { + "epoch": 0.45264155413409146, + "grad_norm": 0.126953125, + "learning_rate": 0.00011490387599238586, + "loss": 0.017, + "step": 9553 + }, + { + "epoch": 0.4526889362710258, + "grad_norm": 0.7734375, + "learning_rate": 0.00011488914956350191, + "loss": 1.0636, + "step": 9554 + }, + { + "epoch": 0.4527363184079602, + "grad_norm": 0.63671875, + "learning_rate": 0.00011487442280439249, + "loss": 0.8769, + "step": 9555 + }, + { + "epoch": 0.4527837005448946, + "grad_norm": 0.75, + "learning_rate": 0.0001148596957153842, + "loss": 0.8713, + "step": 9556 + }, + { + "epoch": 0.452831082681829, + "grad_norm": 0.6328125, + "learning_rate": 0.00011484496829680371, + "loss": 1.2281, + "step": 9557 + }, + { + "epoch": 0.4528784648187633, + "grad_norm": 0.043701171875, + "learning_rate": 0.00011483024054897764, + "loss": 0.0025, + "step": 9558 + }, + { + "epoch": 0.4529258469556977, + "grad_norm": 0.345703125, + "learning_rate": 0.00011481551247223261, + "loss": 0.0514, + "step": 9559 + }, + { + "epoch": 0.4529732290926321, + "grad_norm": 0.625, + "learning_rate": 0.0001148007840668953, + "loss": 0.5776, + "step": 9560 + }, + { + "epoch": 0.45302061122956644, + "grad_norm": 0.251953125, + "learning_rate": 0.00011478605533329239, + "loss": 0.1592, + "step": 9561 + }, + { + "epoch": 0.45306799336650083, + "grad_norm": 0.9296875, + "learning_rate": 0.00011477132627175046, + "loss": 0.8947, + "step": 9562 + }, + { + "epoch": 0.4531153755034352, + "grad_norm": 0.21484375, + "learning_rate": 0.0001147565968825963, + "loss": 0.0262, + "step": 9563 + }, + { + "epoch": 0.45316275764036956, + "grad_norm": 0.703125, + "learning_rate": 0.00011474186716615651, + "loss": 0.9152, + "step": 9564 + }, + { + "epoch": 0.45321013977730396, + "grad_norm": 0.60546875, + "learning_rate": 0.00011472713712275784, + "loss": 0.8025, + "step": 9565 + }, + { + "epoch": 0.45325752191423835, + "grad_norm": 0.6328125, + "learning_rate": 0.00011471240675272694, + "loss": 0.4409, + "step": 9566 + }, + { + "epoch": 0.4533049040511727, + "grad_norm": 1.0546875, + "learning_rate": 0.00011469767605639052, + "loss": 1.0586, + "step": 9567 + }, + { + "epoch": 0.4533522861881071, + "grad_norm": 0.154296875, + "learning_rate": 0.00011468294503407531, + "loss": 0.1058, + "step": 9568 + }, + { + "epoch": 0.45339966832504147, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011466821368610803, + "loss": 0.0348, + "step": 9569 + }, + { + "epoch": 0.4534470504619758, + "grad_norm": 0.0303955078125, + "learning_rate": 0.00011465348201281538, + "loss": 0.0017, + "step": 9570 + }, + { + "epoch": 0.4534944325989102, + "grad_norm": 0.609375, + "learning_rate": 0.00011463875001452409, + "loss": 0.9173, + "step": 9571 + }, + { + "epoch": 0.4535418147358446, + "grad_norm": 0.31640625, + "learning_rate": 0.00011462401769156095, + "loss": 0.0327, + "step": 9572 + }, + { + "epoch": 0.453589196872779, + "grad_norm": 0.67578125, + "learning_rate": 0.00011460928504425267, + "loss": 1.0341, + "step": 9573 + }, + { + "epoch": 0.4536365790097133, + "grad_norm": 0.86328125, + "learning_rate": 0.000114594552072926, + "loss": 0.8655, + "step": 9574 + }, + { + "epoch": 0.4536839611466477, + "grad_norm": 0.51953125, + "learning_rate": 0.00011457981877790769, + "loss": 1.1325, + "step": 9575 + }, + { + "epoch": 0.4537313432835821, + "grad_norm": 0.66796875, + "learning_rate": 0.00011456508515952456, + "loss": 1.2329, + "step": 9576 + }, + { + "epoch": 0.45377872542051645, + "grad_norm": 0.478515625, + "learning_rate": 0.00011455035121810334, + "loss": 0.3296, + "step": 9577 + }, + { + "epoch": 0.45382610755745084, + "grad_norm": 0.443359375, + "learning_rate": 0.00011453561695397081, + "loss": 0.0984, + "step": 9578 + }, + { + "epoch": 0.45387348969438523, + "grad_norm": 0.734375, + "learning_rate": 0.00011452088236745382, + "loss": 1.1948, + "step": 9579 + }, + { + "epoch": 0.4539208718313196, + "grad_norm": 0.77734375, + "learning_rate": 0.00011450614745887911, + "loss": 1.0574, + "step": 9580 + }, + { + "epoch": 0.45396825396825397, + "grad_norm": 0.02001953125, + "learning_rate": 0.00011449141222857351, + "loss": 0.001, + "step": 9581 + }, + { + "epoch": 0.45401563610518836, + "grad_norm": 0.5703125, + "learning_rate": 0.00011447667667686379, + "loss": 0.9499, + "step": 9582 + }, + { + "epoch": 0.4540630182421227, + "grad_norm": 0.640625, + "learning_rate": 0.00011446194080407682, + "loss": 1.3364, + "step": 9583 + }, + { + "epoch": 0.4541104003790571, + "grad_norm": 0.52734375, + "learning_rate": 0.00011444720461053942, + "loss": 0.6499, + "step": 9584 + }, + { + "epoch": 0.4541577825159915, + "grad_norm": 0.671875, + "learning_rate": 0.00011443246809657839, + "loss": 0.2543, + "step": 9585 + }, + { + "epoch": 0.4542051646529259, + "grad_norm": 0.00191497802734375, + "learning_rate": 0.0001144177312625206, + "loss": 0.0001, + "step": 9586 + }, + { + "epoch": 0.4542525467898602, + "grad_norm": 0.2197265625, + "learning_rate": 0.00011440299410869286, + "loss": 0.147, + "step": 9587 + }, + { + "epoch": 0.4542999289267946, + "grad_norm": 0.494140625, + "learning_rate": 0.00011438825663542209, + "loss": 0.0698, + "step": 9588 + }, + { + "epoch": 0.454347311063729, + "grad_norm": 0.6484375, + "learning_rate": 0.00011437351884303513, + "loss": 0.9221, + "step": 9589 + }, + { + "epoch": 0.45439469320066334, + "grad_norm": 0.73828125, + "learning_rate": 0.00011435878073185879, + "loss": 0.6449, + "step": 9590 + }, + { + "epoch": 0.45444207533759773, + "grad_norm": 0.73828125, + "learning_rate": 0.00011434404230222, + "loss": 1.0036, + "step": 9591 + }, + { + "epoch": 0.4544894574745321, + "grad_norm": 0.87890625, + "learning_rate": 0.00011432930355444564, + "loss": 1.5581, + "step": 9592 + }, + { + "epoch": 0.45453683961146646, + "grad_norm": 0.1455078125, + "learning_rate": 0.00011431456448886257, + "loss": 0.0236, + "step": 9593 + }, + { + "epoch": 0.45458422174840085, + "grad_norm": 0.171875, + "learning_rate": 0.0001142998251057977, + "loss": 0.1308, + "step": 9594 + }, + { + "epoch": 0.45463160388533524, + "grad_norm": 0.71484375, + "learning_rate": 0.00011428508540557799, + "loss": 1.1786, + "step": 9595 + }, + { + "epoch": 0.4546789860222696, + "grad_norm": 0.7421875, + "learning_rate": 0.00011427034538853028, + "loss": 0.9578, + "step": 9596 + }, + { + "epoch": 0.454726368159204, + "grad_norm": 0.62109375, + "learning_rate": 0.0001142556050549815, + "loss": 0.8847, + "step": 9597 + }, + { + "epoch": 0.45477375029613837, + "grad_norm": 0.55859375, + "learning_rate": 0.00011424086440525856, + "loss": 1.0814, + "step": 9598 + }, + { + "epoch": 0.4548211324330727, + "grad_norm": 0.671875, + "learning_rate": 0.00011422612343968844, + "loss": 1.1423, + "step": 9599 + }, + { + "epoch": 0.4548685145700071, + "grad_norm": 0.197265625, + "learning_rate": 0.00011421138215859806, + "loss": 0.0239, + "step": 9600 + }, + { + "epoch": 0.4549158967069415, + "grad_norm": 0.578125, + "learning_rate": 0.00011419664056231436, + "loss": 0.4021, + "step": 9601 + }, + { + "epoch": 0.4549632788438759, + "grad_norm": 0.9765625, + "learning_rate": 0.00011418189865116429, + "loss": 1.1078, + "step": 9602 + }, + { + "epoch": 0.4550106609808102, + "grad_norm": 0.208984375, + "learning_rate": 0.00011416715642547482, + "loss": 0.1314, + "step": 9603 + }, + { + "epoch": 0.4550580431177446, + "grad_norm": 1.0078125, + "learning_rate": 0.00011415241388557292, + "loss": 0.8406, + "step": 9604 + }, + { + "epoch": 0.455105425254679, + "grad_norm": 0.7109375, + "learning_rate": 0.00011413767103178553, + "loss": 1.1282, + "step": 9605 + }, + { + "epoch": 0.45515280739161335, + "grad_norm": 0.63671875, + "learning_rate": 0.00011412292786443965, + "loss": 0.4674, + "step": 9606 + }, + { + "epoch": 0.45520018952854774, + "grad_norm": 0.765625, + "learning_rate": 0.0001141081843838623, + "loss": 1.5249, + "step": 9607 + }, + { + "epoch": 0.45524757166548213, + "grad_norm": 0.5625, + "learning_rate": 0.00011409344059038043, + "loss": 1.1895, + "step": 9608 + }, + { + "epoch": 0.45529495380241647, + "grad_norm": 0.6171875, + "learning_rate": 0.00011407869648432105, + "loss": 0.8841, + "step": 9609 + }, + { + "epoch": 0.45534233593935086, + "grad_norm": 0.71484375, + "learning_rate": 0.00011406395206601119, + "loss": 0.8649, + "step": 9610 + }, + { + "epoch": 0.45538971807628525, + "grad_norm": 0.421875, + "learning_rate": 0.00011404920733577786, + "loss": 0.3384, + "step": 9611 + }, + { + "epoch": 0.4554371002132196, + "grad_norm": 0.671875, + "learning_rate": 0.00011403446229394809, + "loss": 0.8103, + "step": 9612 + }, + { + "epoch": 0.455484482350154, + "grad_norm": 0.65234375, + "learning_rate": 0.00011401971694084887, + "loss": 1.1253, + "step": 9613 + }, + { + "epoch": 0.4555318644870884, + "grad_norm": 0.6796875, + "learning_rate": 0.00011400497127680727, + "loss": 0.8887, + "step": 9614 + }, + { + "epoch": 0.45557924662402277, + "grad_norm": 0.2578125, + "learning_rate": 0.00011399022530215033, + "loss": 0.0379, + "step": 9615 + }, + { + "epoch": 0.4556266287609571, + "grad_norm": 0.07763671875, + "learning_rate": 0.00011397547901720508, + "loss": 0.0031, + "step": 9616 + }, + { + "epoch": 0.4556740108978915, + "grad_norm": 0.369140625, + "learning_rate": 0.00011396073242229859, + "loss": 0.0252, + "step": 9617 + }, + { + "epoch": 0.4557213930348259, + "grad_norm": 0.671875, + "learning_rate": 0.00011394598551775794, + "loss": 0.8843, + "step": 9618 + }, + { + "epoch": 0.45576877517176023, + "grad_norm": 0.5, + "learning_rate": 0.00011393123830391019, + "loss": 0.6872, + "step": 9619 + }, + { + "epoch": 0.4558161573086946, + "grad_norm": 0.166015625, + "learning_rate": 0.0001139164907810824, + "loss": 0.0144, + "step": 9620 + }, + { + "epoch": 0.455863539445629, + "grad_norm": 0.66015625, + "learning_rate": 0.00011390174294960165, + "loss": 1.0954, + "step": 9621 + }, + { + "epoch": 0.45591092158256336, + "grad_norm": 0.02392578125, + "learning_rate": 0.00011388699480979507, + "loss": 0.0009, + "step": 9622 + }, + { + "epoch": 0.45595830371949775, + "grad_norm": 0.66796875, + "learning_rate": 0.00011387224636198977, + "loss": 0.9338, + "step": 9623 + }, + { + "epoch": 0.45600568585643214, + "grad_norm": 0.734375, + "learning_rate": 0.00011385749760651276, + "loss": 0.8699, + "step": 9624 + }, + { + "epoch": 0.4560530679933665, + "grad_norm": 0.515625, + "learning_rate": 0.00011384274854369124, + "loss": 1.0061, + "step": 9625 + }, + { + "epoch": 0.45610045013030087, + "grad_norm": 0.609375, + "learning_rate": 0.00011382799917385232, + "loss": 0.9334, + "step": 9626 + }, + { + "epoch": 0.45614783226723526, + "grad_norm": 0.7578125, + "learning_rate": 0.0001138132494973231, + "loss": 1.0398, + "step": 9627 + }, + { + "epoch": 0.4561952144041696, + "grad_norm": 0.4921875, + "learning_rate": 0.00011379849951443071, + "loss": 0.8752, + "step": 9628 + }, + { + "epoch": 0.456242596541104, + "grad_norm": 0.50390625, + "learning_rate": 0.00011378374922550228, + "loss": 0.0754, + "step": 9629 + }, + { + "epoch": 0.4562899786780384, + "grad_norm": 0.314453125, + "learning_rate": 0.00011376899863086501, + "loss": 0.0124, + "step": 9630 + }, + { + "epoch": 0.4563373608149728, + "grad_norm": 0.62109375, + "learning_rate": 0.00011375424773084599, + "loss": 0.8615, + "step": 9631 + }, + { + "epoch": 0.4563847429519071, + "grad_norm": 0.58203125, + "learning_rate": 0.0001137394965257724, + "loss": 0.9877, + "step": 9632 + }, + { + "epoch": 0.4564321250888415, + "grad_norm": 0.7578125, + "learning_rate": 0.00011372474501597144, + "loss": 0.7045, + "step": 9633 + }, + { + "epoch": 0.4564795072257759, + "grad_norm": 0.1572265625, + "learning_rate": 0.00011370999320177022, + "loss": 0.0186, + "step": 9634 + }, + { + "epoch": 0.45652688936271024, + "grad_norm": 0.2060546875, + "learning_rate": 0.00011369524108349601, + "loss": 0.1419, + "step": 9635 + }, + { + "epoch": 0.45657427149964463, + "grad_norm": 0.75, + "learning_rate": 0.00011368048866147589, + "loss": 0.9008, + "step": 9636 + }, + { + "epoch": 0.45662165363657903, + "grad_norm": 0.5703125, + "learning_rate": 0.0001136657359360371, + "loss": 0.913, + "step": 9637 + }, + { + "epoch": 0.45666903577351337, + "grad_norm": 0.61328125, + "learning_rate": 0.00011365098290750689, + "loss": 1.543, + "step": 9638 + }, + { + "epoch": 0.45671641791044776, + "grad_norm": 0.7109375, + "learning_rate": 0.00011363622957621238, + "loss": 0.655, + "step": 9639 + }, + { + "epoch": 0.45676380004738215, + "grad_norm": 0.46484375, + "learning_rate": 0.00011362147594248082, + "loss": 0.0694, + "step": 9640 + }, + { + "epoch": 0.4568111821843165, + "grad_norm": 0.57421875, + "learning_rate": 0.00011360672200663946, + "loss": 0.7387, + "step": 9641 + }, + { + "epoch": 0.4568585643212509, + "grad_norm": 0.50390625, + "learning_rate": 0.00011359196776901548, + "loss": 0.4316, + "step": 9642 + }, + { + "epoch": 0.4569059464581853, + "grad_norm": 0.40625, + "learning_rate": 0.00011357721322993615, + "loss": 0.5524, + "step": 9643 + }, + { + "epoch": 0.4569533285951196, + "grad_norm": 0.71484375, + "learning_rate": 0.00011356245838972868, + "loss": 1.012, + "step": 9644 + }, + { + "epoch": 0.457000710732054, + "grad_norm": 0.484375, + "learning_rate": 0.0001135477032487203, + "loss": 0.9261, + "step": 9645 + }, + { + "epoch": 0.4570480928689884, + "grad_norm": 0.5625, + "learning_rate": 0.00011353294780723836, + "loss": 0.5779, + "step": 9646 + }, + { + "epoch": 0.4570954750059228, + "grad_norm": 0.09375, + "learning_rate": 0.00011351819206561, + "loss": 0.0075, + "step": 9647 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.5546875, + "learning_rate": 0.00011350343602416254, + "loss": 0.8601, + "step": 9648 + }, + { + "epoch": 0.4571902392797915, + "grad_norm": 0.94921875, + "learning_rate": 0.00011348867968322327, + "loss": 0.982, + "step": 9649 + }, + { + "epoch": 0.4572376214167259, + "grad_norm": 0.48828125, + "learning_rate": 0.00011347392304311944, + "loss": 0.8338, + "step": 9650 + }, + { + "epoch": 0.45728500355366025, + "grad_norm": 0.169921875, + "learning_rate": 0.00011345916610417837, + "loss": 0.1226, + "step": 9651 + }, + { + "epoch": 0.45733238569059464, + "grad_norm": 1.125, + "learning_rate": 0.00011344440886672732, + "loss": 0.7333, + "step": 9652 + }, + { + "epoch": 0.45737976782752904, + "grad_norm": 0.279296875, + "learning_rate": 0.00011342965133109361, + "loss": 0.08, + "step": 9653 + }, + { + "epoch": 0.4574271499644634, + "grad_norm": 0.734375, + "learning_rate": 0.00011341489349760453, + "loss": 1.1604, + "step": 9654 + }, + { + "epoch": 0.45747453210139777, + "grad_norm": 0.53515625, + "learning_rate": 0.00011340013536658738, + "loss": 0.7503, + "step": 9655 + }, + { + "epoch": 0.45752191423833216, + "grad_norm": 0.734375, + "learning_rate": 0.0001133853769383695, + "loss": 0.3503, + "step": 9656 + }, + { + "epoch": 0.4575692963752665, + "grad_norm": 0.37109375, + "learning_rate": 0.00011337061821327825, + "loss": 0.1039, + "step": 9657 + }, + { + "epoch": 0.4576166785122009, + "grad_norm": 0.5078125, + "learning_rate": 0.00011335585919164092, + "loss": 0.5785, + "step": 9658 + }, + { + "epoch": 0.4576640606491353, + "grad_norm": 0.7265625, + "learning_rate": 0.00011334109987378485, + "loss": 1.3708, + "step": 9659 + }, + { + "epoch": 0.4577114427860697, + "grad_norm": 0.85546875, + "learning_rate": 0.00011332634026003741, + "loss": 1.051, + "step": 9660 + }, + { + "epoch": 0.457758824923004, + "grad_norm": 0.71875, + "learning_rate": 0.00011331158035072593, + "loss": 0.8012, + "step": 9661 + }, + { + "epoch": 0.4578062070599384, + "grad_norm": 0.76953125, + "learning_rate": 0.00011329682014617777, + "loss": 0.8043, + "step": 9662 + }, + { + "epoch": 0.4578535891968728, + "grad_norm": 0.1298828125, + "learning_rate": 0.00011328205964672029, + "loss": 0.0104, + "step": 9663 + }, + { + "epoch": 0.45790097133380714, + "grad_norm": 0.212890625, + "learning_rate": 0.00011326729885268088, + "loss": 0.0549, + "step": 9664 + }, + { + "epoch": 0.45794835347074153, + "grad_norm": 0.578125, + "learning_rate": 0.00011325253776438695, + "loss": 0.7109, + "step": 9665 + }, + { + "epoch": 0.4579957356076759, + "grad_norm": 0.13671875, + "learning_rate": 0.00011323777638216582, + "loss": 0.0102, + "step": 9666 + }, + { + "epoch": 0.45804311774461026, + "grad_norm": 0.267578125, + "learning_rate": 0.0001132230147063449, + "loss": 0.0063, + "step": 9667 + }, + { + "epoch": 0.45809049988154465, + "grad_norm": 0.37109375, + "learning_rate": 0.00011320825273725162, + "loss": 0.0473, + "step": 9668 + }, + { + "epoch": 0.45813788201847905, + "grad_norm": 0.64453125, + "learning_rate": 0.00011319349047521337, + "loss": 1.3497, + "step": 9669 + }, + { + "epoch": 0.4581852641554134, + "grad_norm": 0.66015625, + "learning_rate": 0.00011317872792055752, + "loss": 1.1318, + "step": 9670 + }, + { + "epoch": 0.4582326462923478, + "grad_norm": 0.578125, + "learning_rate": 0.00011316396507361153, + "loss": 1.4103, + "step": 9671 + }, + { + "epoch": 0.45828002842928217, + "grad_norm": 0.158203125, + "learning_rate": 0.00011314920193470284, + "loss": 0.015, + "step": 9672 + }, + { + "epoch": 0.4583274105662165, + "grad_norm": 0.70703125, + "learning_rate": 0.00011313443850415884, + "loss": 1.2142, + "step": 9673 + }, + { + "epoch": 0.4583747927031509, + "grad_norm": 0.96484375, + "learning_rate": 0.000113119674782307, + "loss": 0.119, + "step": 9674 + }, + { + "epoch": 0.4584221748400853, + "grad_norm": 0.703125, + "learning_rate": 0.00011310491076947474, + "loss": 0.6329, + "step": 9675 + }, + { + "epoch": 0.4584695569770197, + "grad_norm": 0.71484375, + "learning_rate": 0.00011309014646598953, + "loss": 0.9227, + "step": 9676 + }, + { + "epoch": 0.458516939113954, + "grad_norm": 0.001556396484375, + "learning_rate": 0.00011307538187217879, + "loss": 0.0001, + "step": 9677 + }, + { + "epoch": 0.4585643212508884, + "grad_norm": 0.5234375, + "learning_rate": 0.00011306061698837, + "loss": 1.0653, + "step": 9678 + }, + { + "epoch": 0.4586117033878228, + "grad_norm": 0.453125, + "learning_rate": 0.00011304585181489065, + "loss": 0.3294, + "step": 9679 + }, + { + "epoch": 0.45865908552475715, + "grad_norm": 0.5390625, + "learning_rate": 0.0001130310863520682, + "loss": 0.9124, + "step": 9680 + }, + { + "epoch": 0.45870646766169154, + "grad_norm": 0.83203125, + "learning_rate": 0.00011301632060023015, + "loss": 1.1995, + "step": 9681 + }, + { + "epoch": 0.45875384979862593, + "grad_norm": 0.66015625, + "learning_rate": 0.00011300155455970396, + "loss": 1.0626, + "step": 9682 + }, + { + "epoch": 0.45880123193556027, + "grad_norm": 0.9765625, + "learning_rate": 0.00011298678823081714, + "loss": 1.5004, + "step": 9683 + }, + { + "epoch": 0.45884861407249466, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001129720216138972, + "loss": 0.0103, + "step": 9684 + }, + { + "epoch": 0.45889599620942906, + "grad_norm": 0.578125, + "learning_rate": 0.00011295725470927163, + "loss": 0.5508, + "step": 9685 + }, + { + "epoch": 0.4589433783463634, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011294248751726795, + "loss": 0.1358, + "step": 9686 + }, + { + "epoch": 0.4589907604832978, + "grad_norm": 1.1015625, + "learning_rate": 0.00011292772003821366, + "loss": 0.8689, + "step": 9687 + }, + { + "epoch": 0.4590381426202322, + "grad_norm": 0.703125, + "learning_rate": 0.00011291295227243634, + "loss": 1.5178, + "step": 9688 + }, + { + "epoch": 0.4590855247571666, + "grad_norm": 0.796875, + "learning_rate": 0.00011289818422026348, + "loss": 0.5054, + "step": 9689 + }, + { + "epoch": 0.4591329068941009, + "grad_norm": 0.296875, + "learning_rate": 0.00011288341588202266, + "loss": 0.0367, + "step": 9690 + }, + { + "epoch": 0.4591802890310353, + "grad_norm": 0.5703125, + "learning_rate": 0.00011286864725804136, + "loss": 0.5464, + "step": 9691 + }, + { + "epoch": 0.4592276711679697, + "grad_norm": 0.5859375, + "learning_rate": 0.0001128538783486472, + "loss": 1.1001, + "step": 9692 + }, + { + "epoch": 0.45927505330490404, + "grad_norm": 0.1982421875, + "learning_rate": 0.00011283910915416771, + "loss": 0.1493, + "step": 9693 + }, + { + "epoch": 0.45932243544183843, + "grad_norm": 0.53515625, + "learning_rate": 0.00011282433967493042, + "loss": 0.6702, + "step": 9694 + }, + { + "epoch": 0.4593698175787728, + "grad_norm": 0.08251953125, + "learning_rate": 0.00011280956991126297, + "loss": 0.0083, + "step": 9695 + }, + { + "epoch": 0.45941719971570716, + "grad_norm": 0.2314453125, + "learning_rate": 0.00011279479986349288, + "loss": 0.1184, + "step": 9696 + }, + { + "epoch": 0.45946458185264155, + "grad_norm": 0.054931640625, + "learning_rate": 0.00011278002953194777, + "loss": 0.003, + "step": 9697 + }, + { + "epoch": 0.45951196398957594, + "grad_norm": 0.259765625, + "learning_rate": 0.00011276525891695521, + "loss": 0.1499, + "step": 9698 + }, + { + "epoch": 0.4595593461265103, + "grad_norm": 0.76171875, + "learning_rate": 0.00011275048801884284, + "loss": 0.8919, + "step": 9699 + }, + { + "epoch": 0.4596067282634447, + "grad_norm": 0.859375, + "learning_rate": 0.0001127357168379382, + "loss": 2.1475, + "step": 9700 + }, + { + "epoch": 0.45965411040037907, + "grad_norm": 0.1220703125, + "learning_rate": 0.0001127209453745689, + "loss": 0.0036, + "step": 9701 + }, + { + "epoch": 0.4597014925373134, + "grad_norm": 0.00628662109375, + "learning_rate": 0.0001127061736290626, + "loss": 0.0002, + "step": 9702 + }, + { + "epoch": 0.4597488746742478, + "grad_norm": 0.107421875, + "learning_rate": 0.00011269140160174694, + "loss": 0.0031, + "step": 9703 + }, + { + "epoch": 0.4597962568111822, + "grad_norm": 0.50390625, + "learning_rate": 0.00011267662929294946, + "loss": 0.5549, + "step": 9704 + }, + { + "epoch": 0.4598436389481166, + "grad_norm": 0.6171875, + "learning_rate": 0.00011266185670299785, + "loss": 0.8531, + "step": 9705 + }, + { + "epoch": 0.4598910210850509, + "grad_norm": 0.1279296875, + "learning_rate": 0.00011264708383221978, + "loss": 0.0104, + "step": 9706 + }, + { + "epoch": 0.4599384032219853, + "grad_norm": 0.734375, + "learning_rate": 0.00011263231068094285, + "loss": 1.5082, + "step": 9707 + }, + { + "epoch": 0.4599857853589197, + "grad_norm": 0.625, + "learning_rate": 0.00011261753724949471, + "loss": 0.7843, + "step": 9708 + }, + { + "epoch": 0.46003316749585405, + "grad_norm": 0.66796875, + "learning_rate": 0.00011260276353820303, + "loss": 0.9899, + "step": 9709 + }, + { + "epoch": 0.46008054963278844, + "grad_norm": 0.51953125, + "learning_rate": 0.00011258798954739547, + "loss": 0.4392, + "step": 9710 + }, + { + "epoch": 0.46012793176972283, + "grad_norm": 0.6171875, + "learning_rate": 0.00011257321527739974, + "loss": 0.7733, + "step": 9711 + }, + { + "epoch": 0.46017531390665717, + "grad_norm": 0.62890625, + "learning_rate": 0.00011255844072854347, + "loss": 0.9897, + "step": 9712 + }, + { + "epoch": 0.46022269604359156, + "grad_norm": 0.7890625, + "learning_rate": 0.00011254366590115435, + "loss": 0.3876, + "step": 9713 + }, + { + "epoch": 0.46027007818052595, + "grad_norm": 0.71484375, + "learning_rate": 0.00011252889079556011, + "loss": 1.0065, + "step": 9714 + }, + { + "epoch": 0.4603174603174603, + "grad_norm": 0.470703125, + "learning_rate": 0.00011251411541208843, + "loss": 0.8447, + "step": 9715 + }, + { + "epoch": 0.4603648424543947, + "grad_norm": 0.70703125, + "learning_rate": 0.00011249933975106697, + "loss": 0.7846, + "step": 9716 + }, + { + "epoch": 0.4604122245913291, + "grad_norm": 0.78125, + "learning_rate": 0.00011248456381282344, + "loss": 1.0592, + "step": 9717 + }, + { + "epoch": 0.46045960672826347, + "grad_norm": 0.625, + "learning_rate": 0.00011246978759768563, + "loss": 1.2718, + "step": 9718 + }, + { + "epoch": 0.4605069888651978, + "grad_norm": 0.796875, + "learning_rate": 0.0001124550111059812, + "loss": 0.0379, + "step": 9719 + }, + { + "epoch": 0.4605543710021322, + "grad_norm": 0.298828125, + "learning_rate": 0.00011244023433803788, + "loss": 0.1691, + "step": 9720 + }, + { + "epoch": 0.4606017531390666, + "grad_norm": 0.357421875, + "learning_rate": 0.00011242545729418342, + "loss": 0.0345, + "step": 9721 + }, + { + "epoch": 0.46064913527600093, + "grad_norm": 0.65234375, + "learning_rate": 0.00011241067997474557, + "loss": 1.0231, + "step": 9722 + }, + { + "epoch": 0.4606965174129353, + "grad_norm": 0.69140625, + "learning_rate": 0.00011239590238005204, + "loss": 1.3441, + "step": 9723 + }, + { + "epoch": 0.4607438995498697, + "grad_norm": 0.67578125, + "learning_rate": 0.00011238112451043059, + "loss": 1.337, + "step": 9724 + }, + { + "epoch": 0.46079128168680406, + "grad_norm": 0.033203125, + "learning_rate": 0.00011236634636620899, + "loss": 0.0008, + "step": 9725 + }, + { + "epoch": 0.46083866382373845, + "grad_norm": 0.515625, + "learning_rate": 0.00011235156794771502, + "loss": 0.7237, + "step": 9726 + }, + { + "epoch": 0.46088604596067284, + "grad_norm": 0.10107421875, + "learning_rate": 0.0001123367892552764, + "loss": 0.0114, + "step": 9727 + }, + { + "epoch": 0.4609334280976072, + "grad_norm": 0.98046875, + "learning_rate": 0.00011232201028922093, + "loss": 1.2869, + "step": 9728 + }, + { + "epoch": 0.46098081023454157, + "grad_norm": 0.1083984375, + "learning_rate": 0.00011230723104987644, + "loss": 0.0132, + "step": 9729 + }, + { + "epoch": 0.46102819237147596, + "grad_norm": 0.81640625, + "learning_rate": 0.00011229245153757067, + "loss": 0.8768, + "step": 9730 + }, + { + "epoch": 0.4610755745084103, + "grad_norm": 1.0546875, + "learning_rate": 0.00011227767175263138, + "loss": 2.0491, + "step": 9731 + }, + { + "epoch": 0.4611229566453447, + "grad_norm": 0.69921875, + "learning_rate": 0.00011226289169538642, + "loss": 0.728, + "step": 9732 + }, + { + "epoch": 0.4611703387822791, + "grad_norm": 0.033447265625, + "learning_rate": 0.00011224811136616358, + "loss": 0.0014, + "step": 9733 + }, + { + "epoch": 0.4612177209192135, + "grad_norm": 0.69921875, + "learning_rate": 0.00011223333076529071, + "loss": 1.6146, + "step": 9734 + }, + { + "epoch": 0.4612651030561478, + "grad_norm": 0.33984375, + "learning_rate": 0.00011221854989309555, + "loss": 0.0202, + "step": 9735 + }, + { + "epoch": 0.4613124851930822, + "grad_norm": 0.65234375, + "learning_rate": 0.00011220376874990599, + "loss": 0.8589, + "step": 9736 + }, + { + "epoch": 0.4613598673300166, + "grad_norm": 0.65234375, + "learning_rate": 0.00011218898733604985, + "loss": 1.0107, + "step": 9737 + }, + { + "epoch": 0.46140724946695094, + "grad_norm": 0.06005859375, + "learning_rate": 0.00011217420565185495, + "loss": 0.004, + "step": 9738 + }, + { + "epoch": 0.46145463160388533, + "grad_norm": 0.9140625, + "learning_rate": 0.00011215942369764912, + "loss": 0.7273, + "step": 9739 + }, + { + "epoch": 0.4615020137408197, + "grad_norm": 0.02587890625, + "learning_rate": 0.00011214464147376022, + "loss": 0.001, + "step": 9740 + }, + { + "epoch": 0.46154939587775407, + "grad_norm": 0.65234375, + "learning_rate": 0.00011212985898051613, + "loss": 0.9893, + "step": 9741 + }, + { + "epoch": 0.46159677801468846, + "grad_norm": 0.66015625, + "learning_rate": 0.00011211507621824467, + "loss": 0.6534, + "step": 9742 + }, + { + "epoch": 0.46164416015162285, + "grad_norm": 0.73046875, + "learning_rate": 0.00011210029318727374, + "loss": 1.0455, + "step": 9743 + }, + { + "epoch": 0.4616915422885572, + "grad_norm": 0.5859375, + "learning_rate": 0.00011208550988793116, + "loss": 1.008, + "step": 9744 + }, + { + "epoch": 0.4617389244254916, + "grad_norm": 0.05908203125, + "learning_rate": 0.00011207072632054489, + "loss": 0.0047, + "step": 9745 + }, + { + "epoch": 0.461786306562426, + "grad_norm": 0.248046875, + "learning_rate": 0.00011205594248544275, + "loss": 0.013, + "step": 9746 + }, + { + "epoch": 0.46183368869936037, + "grad_norm": 0.26171875, + "learning_rate": 0.00011204115838295263, + "loss": 0.021, + "step": 9747 + }, + { + "epoch": 0.4618810708362947, + "grad_norm": 0.6640625, + "learning_rate": 0.00011202637401340246, + "loss": 0.5427, + "step": 9748 + }, + { + "epoch": 0.4619284529732291, + "grad_norm": 0.6796875, + "learning_rate": 0.00011201158937712013, + "loss": 0.1054, + "step": 9749 + }, + { + "epoch": 0.4619758351101635, + "grad_norm": 0.2255859375, + "learning_rate": 0.00011199680447443352, + "loss": 0.0799, + "step": 9750 + }, + { + "epoch": 0.46202321724709783, + "grad_norm": 0.65234375, + "learning_rate": 0.00011198201930567059, + "loss": 0.6151, + "step": 9751 + }, + { + "epoch": 0.4620705993840322, + "grad_norm": 0.73828125, + "learning_rate": 0.00011196723387115922, + "loss": 1.362, + "step": 9752 + }, + { + "epoch": 0.4621179815209666, + "grad_norm": 0.87109375, + "learning_rate": 0.00011195244817122736, + "loss": 0.6715, + "step": 9753 + }, + { + "epoch": 0.46216536365790095, + "grad_norm": 0.69140625, + "learning_rate": 0.00011193766220620291, + "loss": 1.1283, + "step": 9754 + }, + { + "epoch": 0.46221274579483534, + "grad_norm": 0.83203125, + "learning_rate": 0.00011192287597641385, + "loss": 1.0834, + "step": 9755 + }, + { + "epoch": 0.46226012793176974, + "grad_norm": 0.3046875, + "learning_rate": 0.00011190808948218807, + "loss": 0.0404, + "step": 9756 + }, + { + "epoch": 0.4623075100687041, + "grad_norm": 0.84375, + "learning_rate": 0.00011189330272385359, + "loss": 0.5915, + "step": 9757 + }, + { + "epoch": 0.46235489220563847, + "grad_norm": 0.49609375, + "learning_rate": 0.00011187851570173831, + "loss": 0.1652, + "step": 9758 + }, + { + "epoch": 0.46240227434257286, + "grad_norm": 0.6953125, + "learning_rate": 0.00011186372841617019, + "loss": 0.0703, + "step": 9759 + }, + { + "epoch": 0.4624496564795072, + "grad_norm": 0.73046875, + "learning_rate": 0.00011184894086747722, + "loss": 1.0824, + "step": 9760 + }, + { + "epoch": 0.4624970386164416, + "grad_norm": 0.59765625, + "learning_rate": 0.00011183415305598737, + "loss": 0.9035, + "step": 9761 + }, + { + "epoch": 0.462544420753376, + "grad_norm": 0.6796875, + "learning_rate": 0.0001118193649820286, + "loss": 1.2383, + "step": 9762 + }, + { + "epoch": 0.4625918028903104, + "grad_norm": 1.0703125, + "learning_rate": 0.0001118045766459289, + "loss": 1.1803, + "step": 9763 + }, + { + "epoch": 0.4626391850272447, + "grad_norm": 0.71484375, + "learning_rate": 0.00011178978804801627, + "loss": 0.7214, + "step": 9764 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011177499918861869, + "loss": 0.0074, + "step": 9765 + }, + { + "epoch": 0.4627339493011135, + "grad_norm": 0.6171875, + "learning_rate": 0.00011176021006806418, + "loss": 0.9706, + "step": 9766 + }, + { + "epoch": 0.46278133143804784, + "grad_norm": 0.8203125, + "learning_rate": 0.00011174542068668073, + "loss": 1.4274, + "step": 9767 + }, + { + "epoch": 0.46282871357498223, + "grad_norm": 0.828125, + "learning_rate": 0.00011173063104479638, + "loss": 1.291, + "step": 9768 + }, + { + "epoch": 0.4628760957119166, + "grad_norm": 0.57421875, + "learning_rate": 0.00011171584114273912, + "loss": 0.6011, + "step": 9769 + }, + { + "epoch": 0.46292347784885096, + "grad_norm": 0.70703125, + "learning_rate": 0.00011170105098083696, + "loss": 1.3595, + "step": 9770 + }, + { + "epoch": 0.46297085998578535, + "grad_norm": 0.59765625, + "learning_rate": 0.00011168626055941798, + "loss": 0.6471, + "step": 9771 + }, + { + "epoch": 0.46301824212271975, + "grad_norm": 0.69140625, + "learning_rate": 0.00011167146987881019, + "loss": 0.9433, + "step": 9772 + }, + { + "epoch": 0.4630656242596541, + "grad_norm": 0.2265625, + "learning_rate": 0.0001116566789393416, + "loss": 0.0392, + "step": 9773 + }, + { + "epoch": 0.4631130063965885, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001116418877413403, + "loss": 0.1455, + "step": 9774 + }, + { + "epoch": 0.46316038853352287, + "grad_norm": 0.515625, + "learning_rate": 0.00011162709628513435, + "loss": 0.5217, + "step": 9775 + }, + { + "epoch": 0.46320777067045726, + "grad_norm": 0.89453125, + "learning_rate": 0.00011161230457105176, + "loss": 0.6512, + "step": 9776 + }, + { + "epoch": 0.4632551528073916, + "grad_norm": 0.64453125, + "learning_rate": 0.00011159751259942066, + "loss": 1.0447, + "step": 9777 + }, + { + "epoch": 0.463302534944326, + "grad_norm": 0.625, + "learning_rate": 0.00011158272037056905, + "loss": 0.8032, + "step": 9778 + }, + { + "epoch": 0.4633499170812604, + "grad_norm": 0.58203125, + "learning_rate": 0.00011156792788482504, + "loss": 1.1034, + "step": 9779 + }, + { + "epoch": 0.4633972992181947, + "grad_norm": 0.63671875, + "learning_rate": 0.00011155313514251673, + "loss": 1.1353, + "step": 9780 + }, + { + "epoch": 0.4634446813551291, + "grad_norm": 0.46875, + "learning_rate": 0.00011153834214397219, + "loss": 0.8526, + "step": 9781 + }, + { + "epoch": 0.4634920634920635, + "grad_norm": 0.62109375, + "learning_rate": 0.00011152354888951948, + "loss": 0.4919, + "step": 9782 + }, + { + "epoch": 0.46353944562899785, + "grad_norm": 0.7265625, + "learning_rate": 0.00011150875537948677, + "loss": 1.0076, + "step": 9783 + }, + { + "epoch": 0.46358682776593224, + "grad_norm": 0.68359375, + "learning_rate": 0.00011149396161420211, + "loss": 1.3346, + "step": 9784 + }, + { + "epoch": 0.46363420990286663, + "grad_norm": 0.734375, + "learning_rate": 0.00011147916759399362, + "loss": 1.3384, + "step": 9785 + }, + { + "epoch": 0.46368159203980097, + "grad_norm": 0.9296875, + "learning_rate": 0.00011146437331918939, + "loss": 1.0324, + "step": 9786 + }, + { + "epoch": 0.46372897417673536, + "grad_norm": 0.6953125, + "learning_rate": 0.00011144957879011764, + "loss": 0.759, + "step": 9787 + }, + { + "epoch": 0.46377635631366976, + "grad_norm": 0.7421875, + "learning_rate": 0.00011143478400710636, + "loss": 1.0396, + "step": 9788 + }, + { + "epoch": 0.4638237384506041, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011141998897048378, + "loss": 0.1246, + "step": 9789 + }, + { + "epoch": 0.4638711205875385, + "grad_norm": 0.6953125, + "learning_rate": 0.000111405193680578, + "loss": 0.8911, + "step": 9790 + }, + { + "epoch": 0.4639185027244729, + "grad_norm": 0.7109375, + "learning_rate": 0.00011139039813771719, + "loss": 1.1938, + "step": 9791 + }, + { + "epoch": 0.4639658848614073, + "grad_norm": 0.8828125, + "learning_rate": 0.0001113756023422295, + "loss": 0.6653, + "step": 9792 + }, + { + "epoch": 0.4640132669983416, + "grad_norm": 0.2353515625, + "learning_rate": 0.00011136080629444303, + "loss": 0.0119, + "step": 9793 + }, + { + "epoch": 0.464060649135276, + "grad_norm": 0.7421875, + "learning_rate": 0.00011134600999468596, + "loss": 0.8786, + "step": 9794 + }, + { + "epoch": 0.4641080312722104, + "grad_norm": 0.77734375, + "learning_rate": 0.00011133121344328652, + "loss": 0.7408, + "step": 9795 + }, + { + "epoch": 0.46415541340914473, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011131641664057282, + "loss": 0.0056, + "step": 9796 + }, + { + "epoch": 0.4642027955460791, + "grad_norm": 0.765625, + "learning_rate": 0.00011130161958687304, + "loss": 1.5517, + "step": 9797 + }, + { + "epoch": 0.4642501776830135, + "grad_norm": 0.671875, + "learning_rate": 0.00011128682228251538, + "loss": 1.2288, + "step": 9798 + }, + { + "epoch": 0.46429755981994786, + "grad_norm": 0.65625, + "learning_rate": 0.00011127202472782802, + "loss": 1.3046, + "step": 9799 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 0.828125, + "learning_rate": 0.00011125722692313918, + "loss": 0.8764, + "step": 9800 + }, + { + "epoch": 0.46439232409381664, + "grad_norm": 0.61328125, + "learning_rate": 0.00011124242886877703, + "loss": 0.7067, + "step": 9801 + }, + { + "epoch": 0.464439706230751, + "grad_norm": 0.208984375, + "learning_rate": 0.00011122763056506975, + "loss": 0.0462, + "step": 9802 + }, + { + "epoch": 0.4644870883676854, + "grad_norm": 0.83203125, + "learning_rate": 0.00011121283201234562, + "loss": 1.4981, + "step": 9803 + }, + { + "epoch": 0.46453447050461977, + "grad_norm": 0.6875, + "learning_rate": 0.00011119803321093281, + "loss": 1.1497, + "step": 9804 + }, + { + "epoch": 0.46458185264155416, + "grad_norm": 0.439453125, + "learning_rate": 0.00011118323416115955, + "loss": 0.4918, + "step": 9805 + }, + { + "epoch": 0.4646292347784885, + "grad_norm": 0.5703125, + "learning_rate": 0.00011116843486335407, + "loss": 0.8862, + "step": 9806 + }, + { + "epoch": 0.4646766169154229, + "grad_norm": 0.69140625, + "learning_rate": 0.0001111536353178446, + "loss": 1.2946, + "step": 9807 + }, + { + "epoch": 0.4647239990523573, + "grad_norm": 0.83203125, + "learning_rate": 0.00011113883552495938, + "loss": 1.6293, + "step": 9808 + }, + { + "epoch": 0.4647713811892916, + "grad_norm": 1.125, + "learning_rate": 0.00011112403548502664, + "loss": 0.7951, + "step": 9809 + }, + { + "epoch": 0.464818763326226, + "grad_norm": 0.625, + "learning_rate": 0.00011110923519837466, + "loss": 0.8851, + "step": 9810 + }, + { + "epoch": 0.4648661454631604, + "grad_norm": 0.09033203125, + "learning_rate": 0.00011109443466533167, + "loss": 0.0103, + "step": 9811 + }, + { + "epoch": 0.46491352760009474, + "grad_norm": 1.40625, + "learning_rate": 0.00011107963388622594, + "loss": 0.8671, + "step": 9812 + }, + { + "epoch": 0.46496090973702914, + "grad_norm": 0.6875, + "learning_rate": 0.0001110648328613857, + "loss": 0.6951, + "step": 9813 + }, + { + "epoch": 0.46500829187396353, + "grad_norm": 0.671875, + "learning_rate": 0.0001110500315911393, + "loss": 1.1671, + "step": 9814 + }, + { + "epoch": 0.46505567401089787, + "grad_norm": 0.65625, + "learning_rate": 0.00011103523007581494, + "loss": 1.1655, + "step": 9815 + }, + { + "epoch": 0.46510305614783226, + "grad_norm": 0.30078125, + "learning_rate": 0.00011102042831574095, + "loss": 0.0534, + "step": 9816 + }, + { + "epoch": 0.46515043828476665, + "grad_norm": 0.55078125, + "learning_rate": 0.00011100562631124558, + "loss": 1.0242, + "step": 9817 + }, + { + "epoch": 0.465197820421701, + "grad_norm": 0.60546875, + "learning_rate": 0.00011099082406265715, + "loss": 0.5828, + "step": 9818 + }, + { + "epoch": 0.4652452025586354, + "grad_norm": 0.7265625, + "learning_rate": 0.00011097602157030394, + "loss": 1.1612, + "step": 9819 + }, + { + "epoch": 0.4652925846955698, + "grad_norm": 0.65234375, + "learning_rate": 0.00011096121883451427, + "loss": 0.5682, + "step": 9820 + }, + { + "epoch": 0.46533996683250417, + "grad_norm": 0.64453125, + "learning_rate": 0.00011094641585561645, + "loss": 0.653, + "step": 9821 + }, + { + "epoch": 0.4653873489694385, + "grad_norm": 0.63671875, + "learning_rate": 0.00011093161263393876, + "loss": 1.2252, + "step": 9822 + }, + { + "epoch": 0.4654347311063729, + "grad_norm": 0.027099609375, + "learning_rate": 0.00011091680916980957, + "loss": 0.0018, + "step": 9823 + }, + { + "epoch": 0.4654821132433073, + "grad_norm": 0.62890625, + "learning_rate": 0.00011090200546355718, + "loss": 0.9737, + "step": 9824 + }, + { + "epoch": 0.46552949538024163, + "grad_norm": 0.95703125, + "learning_rate": 0.00011088720151550991, + "loss": 0.8436, + "step": 9825 + }, + { + "epoch": 0.465576877517176, + "grad_norm": 0.072265625, + "learning_rate": 0.00011087239732599612, + "loss": 0.0059, + "step": 9826 + }, + { + "epoch": 0.4656242596541104, + "grad_norm": 0.1328125, + "learning_rate": 0.00011085759289534412, + "loss": 0.0125, + "step": 9827 + }, + { + "epoch": 0.46567164179104475, + "grad_norm": 0.6875, + "learning_rate": 0.00011084278822388228, + "loss": 1.3405, + "step": 9828 + }, + { + "epoch": 0.46571902392797915, + "grad_norm": 0.7265625, + "learning_rate": 0.00011082798331193898, + "loss": 1.1513, + "step": 9829 + }, + { + "epoch": 0.46576640606491354, + "grad_norm": 0.7890625, + "learning_rate": 0.0001108131781598425, + "loss": 1.184, + "step": 9830 + }, + { + "epoch": 0.4658137882018479, + "grad_norm": 0.56640625, + "learning_rate": 0.00011079837276792125, + "loss": 0.7758, + "step": 9831 + }, + { + "epoch": 0.46586117033878227, + "grad_norm": 0.1201171875, + "learning_rate": 0.00011078356713650361, + "loss": 0.0137, + "step": 9832 + }, + { + "epoch": 0.46590855247571666, + "grad_norm": 0.7890625, + "learning_rate": 0.00011076876126591796, + "loss": 1.1446, + "step": 9833 + }, + { + "epoch": 0.46595593461265106, + "grad_norm": 0.240234375, + "learning_rate": 0.00011075395515649261, + "loss": 0.0185, + "step": 9834 + }, + { + "epoch": 0.4660033167495854, + "grad_norm": 0.7890625, + "learning_rate": 0.000110739148808556, + "loss": 0.614, + "step": 9835 + }, + { + "epoch": 0.4660506988865198, + "grad_norm": 0.69140625, + "learning_rate": 0.00011072434222243651, + "loss": 0.7432, + "step": 9836 + }, + { + "epoch": 0.4660980810234542, + "grad_norm": 0.63671875, + "learning_rate": 0.00011070953539846254, + "loss": 0.9163, + "step": 9837 + }, + { + "epoch": 0.4661454631603885, + "grad_norm": 0.66015625, + "learning_rate": 0.00011069472833696249, + "loss": 0.8119, + "step": 9838 + }, + { + "epoch": 0.4661928452973229, + "grad_norm": 0.2890625, + "learning_rate": 0.00011067992103826474, + "loss": 0.0054, + "step": 9839 + }, + { + "epoch": 0.4662402274342573, + "grad_norm": 0.53515625, + "learning_rate": 0.00011066511350269773, + "loss": 0.6951, + "step": 9840 + }, + { + "epoch": 0.46628760957119164, + "grad_norm": 0.609375, + "learning_rate": 0.00011065030573058987, + "loss": 0.88, + "step": 9841 + }, + { + "epoch": 0.46633499170812603, + "grad_norm": 1.03125, + "learning_rate": 0.00011063549772226955, + "loss": 1.2468, + "step": 9842 + }, + { + "epoch": 0.4663823738450604, + "grad_norm": 0.2216796875, + "learning_rate": 0.00011062068947806521, + "loss": 0.1288, + "step": 9843 + }, + { + "epoch": 0.46642975598199476, + "grad_norm": 0.6328125, + "learning_rate": 0.00011060588099830533, + "loss": 1.4737, + "step": 9844 + }, + { + "epoch": 0.46647713811892916, + "grad_norm": 0.67578125, + "learning_rate": 0.00011059107228331829, + "loss": 1.4271, + "step": 9845 + }, + { + "epoch": 0.46652452025586355, + "grad_norm": 0.5078125, + "learning_rate": 0.00011057626333343251, + "loss": 0.6395, + "step": 9846 + }, + { + "epoch": 0.4665719023927979, + "grad_norm": 0.58984375, + "learning_rate": 0.00011056145414897654, + "loss": 0.7846, + "step": 9847 + }, + { + "epoch": 0.4666192845297323, + "grad_norm": 0.76171875, + "learning_rate": 0.00011054664473027871, + "loss": 0.8299, + "step": 9848 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.578125, + "learning_rate": 0.00011053183507766758, + "loss": 0.9766, + "step": 9849 + }, + { + "epoch": 0.46671404880360107, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011051702519147153, + "loss": 0.1263, + "step": 9850 + }, + { + "epoch": 0.4667614309405354, + "grad_norm": 0.15234375, + "learning_rate": 0.00011050221507201908, + "loss": 0.0947, + "step": 9851 + }, + { + "epoch": 0.4668088130774698, + "grad_norm": 0.453125, + "learning_rate": 0.00011048740471963868, + "loss": 0.1709, + "step": 9852 + }, + { + "epoch": 0.4668561952144042, + "grad_norm": 0.56640625, + "learning_rate": 0.00011047259413465882, + "loss": 0.7236, + "step": 9853 + }, + { + "epoch": 0.46690357735133853, + "grad_norm": 0.6796875, + "learning_rate": 0.00011045778331740797, + "loss": 1.0616, + "step": 9854 + }, + { + "epoch": 0.4669509594882729, + "grad_norm": 0.63671875, + "learning_rate": 0.00011044297226821463, + "loss": 1.0608, + "step": 9855 + }, + { + "epoch": 0.4669983416252073, + "grad_norm": 0.65234375, + "learning_rate": 0.00011042816098740732, + "loss": 0.7149, + "step": 9856 + }, + { + "epoch": 0.46704572376214165, + "grad_norm": 0.69921875, + "learning_rate": 0.00011041334947531445, + "loss": 1.1244, + "step": 9857 + }, + { + "epoch": 0.46709310589907604, + "grad_norm": 0.25, + "learning_rate": 0.00011039853773226461, + "loss": 0.0236, + "step": 9858 + }, + { + "epoch": 0.46714048803601044, + "grad_norm": 0.6953125, + "learning_rate": 0.00011038372575858625, + "loss": 0.9527, + "step": 9859 + }, + { + "epoch": 0.4671878701729448, + "grad_norm": 0.80859375, + "learning_rate": 0.00011036891355460795, + "loss": 0.8516, + "step": 9860 + }, + { + "epoch": 0.46723525230987917, + "grad_norm": 0.6875, + "learning_rate": 0.00011035410112065819, + "loss": 0.7951, + "step": 9861 + }, + { + "epoch": 0.46728263444681356, + "grad_norm": 0.62109375, + "learning_rate": 0.00011033928845706545, + "loss": 0.8273, + "step": 9862 + }, + { + "epoch": 0.46733001658374795, + "grad_norm": 0.69921875, + "learning_rate": 0.00011032447556415838, + "loss": 1.5249, + "step": 9863 + }, + { + "epoch": 0.4673773987206823, + "grad_norm": 0.765625, + "learning_rate": 0.0001103096624422654, + "loss": 1.0421, + "step": 9864 + }, + { + "epoch": 0.4674247808576167, + "grad_norm": 0.59375, + "learning_rate": 0.00011029484909171508, + "loss": 1.0365, + "step": 9865 + }, + { + "epoch": 0.4674721629945511, + "grad_norm": 0.97265625, + "learning_rate": 0.00011028003551283597, + "loss": 1.1412, + "step": 9866 + }, + { + "epoch": 0.4675195451314854, + "grad_norm": 0.90625, + "learning_rate": 0.00011026522170595663, + "loss": 1.4166, + "step": 9867 + }, + { + "epoch": 0.4675669272684198, + "grad_norm": 0.61328125, + "learning_rate": 0.00011025040767140562, + "loss": 0.9339, + "step": 9868 + }, + { + "epoch": 0.4676143094053542, + "grad_norm": 0.2734375, + "learning_rate": 0.00011023559340951146, + "loss": 0.0557, + "step": 9869 + }, + { + "epoch": 0.46766169154228854, + "grad_norm": 0.703125, + "learning_rate": 0.00011022077892060274, + "loss": 0.7653, + "step": 9870 + }, + { + "epoch": 0.46770907367922293, + "grad_norm": 0.6015625, + "learning_rate": 0.00011020596420500807, + "loss": 0.6284, + "step": 9871 + }, + { + "epoch": 0.4677564558161573, + "grad_norm": 0.59765625, + "learning_rate": 0.00011019114926305597, + "loss": 0.6233, + "step": 9872 + }, + { + "epoch": 0.46780383795309166, + "grad_norm": 0.55859375, + "learning_rate": 0.00011017633409507502, + "loss": 0.7435, + "step": 9873 + }, + { + "epoch": 0.46785122009002605, + "grad_norm": 0.66015625, + "learning_rate": 0.0001101615187013938, + "loss": 1.2383, + "step": 9874 + }, + { + "epoch": 0.46789860222696045, + "grad_norm": 0.68359375, + "learning_rate": 0.00011014670308234096, + "loss": 1.1249, + "step": 9875 + }, + { + "epoch": 0.4679459843638948, + "grad_norm": 0.7109375, + "learning_rate": 0.00011013188723824504, + "loss": 0.1879, + "step": 9876 + }, + { + "epoch": 0.4679933665008292, + "grad_norm": 0.6875, + "learning_rate": 0.00011011707116943463, + "loss": 1.1596, + "step": 9877 + }, + { + "epoch": 0.46804074863776357, + "grad_norm": 0.5390625, + "learning_rate": 0.00011010225487623837, + "loss": 0.6584, + "step": 9878 + }, + { + "epoch": 0.46808813077469796, + "grad_norm": 0.69140625, + "learning_rate": 0.0001100874383589849, + "loss": 1.255, + "step": 9879 + }, + { + "epoch": 0.4681355129116323, + "grad_norm": 0.72265625, + "learning_rate": 0.00011007262161800276, + "loss": 0.9886, + "step": 9880 + }, + { + "epoch": 0.4681828950485667, + "grad_norm": 0.0458984375, + "learning_rate": 0.00011005780465362057, + "loss": 0.0037, + "step": 9881 + }, + { + "epoch": 0.4682302771855011, + "grad_norm": 0.6328125, + "learning_rate": 0.00011004298746616701, + "loss": 1.0849, + "step": 9882 + }, + { + "epoch": 0.4682776593224354, + "grad_norm": 0.61328125, + "learning_rate": 0.0001100281700559707, + "loss": 0.7927, + "step": 9883 + }, + { + "epoch": 0.4683250414593698, + "grad_norm": 0.51953125, + "learning_rate": 0.00011001335242336023, + "loss": 0.0405, + "step": 9884 + }, + { + "epoch": 0.4683724235963042, + "grad_norm": 0.62109375, + "learning_rate": 0.00010999853456866429, + "loss": 0.0701, + "step": 9885 + }, + { + "epoch": 0.46841980573323855, + "grad_norm": 0.625, + "learning_rate": 0.0001099837164922115, + "loss": 1.1635, + "step": 9886 + }, + { + "epoch": 0.46846718787017294, + "grad_norm": 0.57421875, + "learning_rate": 0.00010996889819433053, + "loss": 1.3223, + "step": 9887 + }, + { + "epoch": 0.46851457000710733, + "grad_norm": 0.53515625, + "learning_rate": 0.00010995407967535, + "loss": 0.6921, + "step": 9888 + }, + { + "epoch": 0.46856195214404167, + "grad_norm": 0.248046875, + "learning_rate": 0.00010993926093559859, + "loss": 0.0444, + "step": 9889 + }, + { + "epoch": 0.46860933428097606, + "grad_norm": 0.1455078125, + "learning_rate": 0.00010992444197540495, + "loss": 0.0119, + "step": 9890 + }, + { + "epoch": 0.46865671641791046, + "grad_norm": 0.9140625, + "learning_rate": 0.00010990962279509775, + "loss": 0.0671, + "step": 9891 + }, + { + "epoch": 0.46870409855484485, + "grad_norm": 0.65625, + "learning_rate": 0.00010989480339500569, + "loss": 1.021, + "step": 9892 + }, + { + "epoch": 0.4687514806917792, + "grad_norm": 0.57421875, + "learning_rate": 0.0001098799837754574, + "loss": 0.3846, + "step": 9893 + }, + { + "epoch": 0.4687988628287136, + "grad_norm": 0.56640625, + "learning_rate": 0.00010986516393678164, + "loss": 1.0384, + "step": 9894 + }, + { + "epoch": 0.468846244965648, + "grad_norm": 0.76171875, + "learning_rate": 0.00010985034387930704, + "loss": 0.3264, + "step": 9895 + }, + { + "epoch": 0.4688936271025823, + "grad_norm": 0.326171875, + "learning_rate": 0.0001098355236033623, + "loss": 0.0473, + "step": 9896 + }, + { + "epoch": 0.4689410092395167, + "grad_norm": 0.59765625, + "learning_rate": 0.0001098207031092761, + "loss": 1.0076, + "step": 9897 + }, + { + "epoch": 0.4689883913764511, + "grad_norm": 0.030517578125, + "learning_rate": 0.0001098058823973772, + "loss": 0.0017, + "step": 9898 + }, + { + "epoch": 0.46903577351338543, + "grad_norm": 0.7890625, + "learning_rate": 0.00010979106146799425, + "loss": 0.8646, + "step": 9899 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 0.71484375, + "learning_rate": 0.00010977624032145597, + "loss": 1.15, + "step": 9900 + }, + { + "epoch": 0.4691305377872542, + "grad_norm": 0.59375, + "learning_rate": 0.00010976141895809111, + "loss": 0.8969, + "step": 9901 + }, + { + "epoch": 0.46917791992418856, + "grad_norm": 1.375, + "learning_rate": 0.00010974659737822842, + "loss": 0.3929, + "step": 9902 + }, + { + "epoch": 0.46922530206112295, + "grad_norm": 0.7578125, + "learning_rate": 0.00010973177558219651, + "loss": 1.0758, + "step": 9903 + }, + { + "epoch": 0.46927268419805734, + "grad_norm": 0.70703125, + "learning_rate": 0.00010971695357032423, + "loss": 0.696, + "step": 9904 + }, + { + "epoch": 0.4693200663349917, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010970213134294023, + "loss": 0.1151, + "step": 9905 + }, + { + "epoch": 0.4693674484719261, + "grad_norm": 0.71484375, + "learning_rate": 0.00010968730890037333, + "loss": 0.7491, + "step": 9906 + }, + { + "epoch": 0.46941483060886047, + "grad_norm": 0.55859375, + "learning_rate": 0.00010967248624295221, + "loss": 0.4829, + "step": 9907 + }, + { + "epoch": 0.46946221274579486, + "grad_norm": 0.71484375, + "learning_rate": 0.00010965766337100567, + "loss": 1.2206, + "step": 9908 + }, + { + "epoch": 0.4695095948827292, + "grad_norm": 0.5546875, + "learning_rate": 0.00010964284028486245, + "loss": 0.8439, + "step": 9909 + }, + { + "epoch": 0.4695569770196636, + "grad_norm": 1.5625, + "learning_rate": 0.00010962801698485128, + "loss": 0.3366, + "step": 9910 + }, + { + "epoch": 0.469604359156598, + "grad_norm": 0.00140380859375, + "learning_rate": 0.00010961319347130095, + "loss": 0.0001, + "step": 9911 + }, + { + "epoch": 0.4696517412935323, + "grad_norm": 0.51171875, + "learning_rate": 0.00010959836974454023, + "loss": 0.5039, + "step": 9912 + }, + { + "epoch": 0.4696991234304667, + "grad_norm": 0.62890625, + "learning_rate": 0.00010958354580489791, + "loss": 0.8668, + "step": 9913 + }, + { + "epoch": 0.4697465055674011, + "grad_norm": 0.0888671875, + "learning_rate": 0.00010956872165270273, + "loss": 0.0019, + "step": 9914 + }, + { + "epoch": 0.46979388770433544, + "grad_norm": 0.004180908203125, + "learning_rate": 0.0001095538972882835, + "loss": 0.0002, + "step": 9915 + }, + { + "epoch": 0.46984126984126984, + "grad_norm": 0.6640625, + "learning_rate": 0.000109539072711969, + "loss": 0.6317, + "step": 9916 + }, + { + "epoch": 0.46988865197820423, + "grad_norm": 0.5703125, + "learning_rate": 0.00010952424792408804, + "loss": 0.9663, + "step": 9917 + }, + { + "epoch": 0.46993603411513857, + "grad_norm": 0.1318359375, + "learning_rate": 0.00010950942292496942, + "loss": 0.026, + "step": 9918 + }, + { + "epoch": 0.46998341625207296, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0001094945977149419, + "loss": 0.0009, + "step": 9919 + }, + { + "epoch": 0.47003079838900735, + "grad_norm": 0.6484375, + "learning_rate": 0.00010947977229433433, + "loss": 1.3223, + "step": 9920 + }, + { + "epoch": 0.47007818052594175, + "grad_norm": 0.7265625, + "learning_rate": 0.00010946494666347551, + "loss": 0.8865, + "step": 9921 + }, + { + "epoch": 0.4701255626628761, + "grad_norm": 0.75390625, + "learning_rate": 0.00010945012082269423, + "loss": 1.0039, + "step": 9922 + }, + { + "epoch": 0.4701729447998105, + "grad_norm": 0.71484375, + "learning_rate": 0.00010943529477231936, + "loss": 1.3037, + "step": 9923 + }, + { + "epoch": 0.47022032693674487, + "grad_norm": 0.1484375, + "learning_rate": 0.00010942046851267968, + "loss": 0.0193, + "step": 9924 + }, + { + "epoch": 0.4702677090736792, + "grad_norm": 0.60546875, + "learning_rate": 0.00010940564204410408, + "loss": 0.7356, + "step": 9925 + }, + { + "epoch": 0.4703150912106136, + "grad_norm": 0.625, + "learning_rate": 0.00010939081536692135, + "loss": 0.8879, + "step": 9926 + }, + { + "epoch": 0.470362473347548, + "grad_norm": 0.341796875, + "learning_rate": 0.00010937598848146032, + "loss": 0.0262, + "step": 9927 + }, + { + "epoch": 0.47040985548448233, + "grad_norm": 0.640625, + "learning_rate": 0.00010936116138804985, + "loss": 0.7974, + "step": 9928 + }, + { + "epoch": 0.4704572376214167, + "grad_norm": 0.5234375, + "learning_rate": 0.00010934633408701883, + "loss": 0.715, + "step": 9929 + }, + { + "epoch": 0.4705046197583511, + "grad_norm": 0.6796875, + "learning_rate": 0.00010933150657869602, + "loss": 1.0631, + "step": 9930 + }, + { + "epoch": 0.47055200189528545, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010931667886341035, + "loss": 0.0301, + "step": 9931 + }, + { + "epoch": 0.47059938403221985, + "grad_norm": 0.69140625, + "learning_rate": 0.00010930185094149068, + "loss": 0.9386, + "step": 9932 + }, + { + "epoch": 0.47064676616915424, + "grad_norm": 0.71875, + "learning_rate": 0.00010928702281326586, + "loss": 1.4621, + "step": 9933 + }, + { + "epoch": 0.4706941483060886, + "grad_norm": 0.71875, + "learning_rate": 0.00010927219447906478, + "loss": 0.9693, + "step": 9934 + }, + { + "epoch": 0.47074153044302297, + "grad_norm": 0.173828125, + "learning_rate": 0.00010925736593921627, + "loss": 0.1278, + "step": 9935 + }, + { + "epoch": 0.47078891257995736, + "grad_norm": 0.30859375, + "learning_rate": 0.00010924253719404929, + "loss": 0.1093, + "step": 9936 + }, + { + "epoch": 0.47083629471689176, + "grad_norm": 0.8671875, + "learning_rate": 0.00010922770824389264, + "loss": 0.8521, + "step": 9937 + }, + { + "epoch": 0.4708836768538261, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010921287908907525, + "loss": 0.1137, + "step": 9938 + }, + { + "epoch": 0.4709310589907605, + "grad_norm": 0.1357421875, + "learning_rate": 0.000109198049729926, + "loss": 0.0876, + "step": 9939 + }, + { + "epoch": 0.4709784411276949, + "grad_norm": 0.62109375, + "learning_rate": 0.00010918322016677385, + "loss": 1.2775, + "step": 9940 + }, + { + "epoch": 0.4710258232646292, + "grad_norm": 0.46875, + "learning_rate": 0.00010916839039994766, + "loss": 0.374, + "step": 9941 + }, + { + "epoch": 0.4710732054015636, + "grad_norm": 0.279296875, + "learning_rate": 0.00010915356042977632, + "loss": 0.096, + "step": 9942 + }, + { + "epoch": 0.471120587538498, + "grad_norm": 0.494140625, + "learning_rate": 0.00010913873025658874, + "loss": 1.0659, + "step": 9943 + }, + { + "epoch": 0.47116796967543234, + "grad_norm": 0.66796875, + "learning_rate": 0.00010912389988071388, + "loss": 0.206, + "step": 9944 + }, + { + "epoch": 0.47121535181236673, + "grad_norm": 0.55078125, + "learning_rate": 0.00010910906930248061, + "loss": 0.7639, + "step": 9945 + }, + { + "epoch": 0.4712627339493011, + "grad_norm": 0.75, + "learning_rate": 0.00010909423852221792, + "loss": 0.8366, + "step": 9946 + }, + { + "epoch": 0.47131011608623546, + "grad_norm": 0.1357421875, + "learning_rate": 0.00010907940754025468, + "loss": 0.0766, + "step": 9947 + }, + { + "epoch": 0.47135749822316986, + "grad_norm": 0.6484375, + "learning_rate": 0.00010906457635691987, + "loss": 0.1732, + "step": 9948 + }, + { + "epoch": 0.47140488036010425, + "grad_norm": 0.21484375, + "learning_rate": 0.00010904974497254241, + "loss": 0.0058, + "step": 9949 + }, + { + "epoch": 0.47145226249703864, + "grad_norm": 0.0019683837890625, + "learning_rate": 0.00010903491338745124, + "loss": 0.0001, + "step": 9950 + }, + { + "epoch": 0.471499644633973, + "grad_norm": 0.4453125, + "learning_rate": 0.0001090200816019753, + "loss": 0.5352, + "step": 9951 + }, + { + "epoch": 0.4715470267709074, + "grad_norm": 0.94921875, + "learning_rate": 0.00010900524961644361, + "loss": 0.7032, + "step": 9952 + }, + { + "epoch": 0.47159440890784177, + "grad_norm": 0.6875, + "learning_rate": 0.00010899041743118501, + "loss": 0.8182, + "step": 9953 + }, + { + "epoch": 0.4716417910447761, + "grad_norm": 0.8359375, + "learning_rate": 0.00010897558504652856, + "loss": 1.6718, + "step": 9954 + }, + { + "epoch": 0.4716891731817105, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001089607524628032, + "loss": 0.0083, + "step": 9955 + }, + { + "epoch": 0.4717365553186449, + "grad_norm": 0.142578125, + "learning_rate": 0.00010894591968033787, + "loss": 0.019, + "step": 9956 + }, + { + "epoch": 0.4717839374555792, + "grad_norm": 0.451171875, + "learning_rate": 0.00010893108669946162, + "loss": 0.0577, + "step": 9957 + }, + { + "epoch": 0.4718313195925136, + "grad_norm": 0.68359375, + "learning_rate": 0.00010891625352050332, + "loss": 0.9744, + "step": 9958 + }, + { + "epoch": 0.471878701729448, + "grad_norm": 0.8359375, + "learning_rate": 0.00010890142014379205, + "loss": 1.0241, + "step": 9959 + }, + { + "epoch": 0.47192608386638235, + "grad_norm": 0.69140625, + "learning_rate": 0.00010888658656965675, + "loss": 0.6482, + "step": 9960 + }, + { + "epoch": 0.47197346600331674, + "grad_norm": 0.3828125, + "learning_rate": 0.00010887175279842643, + "loss": 0.1113, + "step": 9961 + }, + { + "epoch": 0.47202084814025114, + "grad_norm": 0.7421875, + "learning_rate": 0.00010885691883043008, + "loss": 1.0869, + "step": 9962 + }, + { + "epoch": 0.4720682302771855, + "grad_norm": 0.7265625, + "learning_rate": 0.0001088420846659967, + "loss": 0.9253, + "step": 9963 + }, + { + "epoch": 0.47211561241411987, + "grad_norm": 0.4921875, + "learning_rate": 0.00010882725030545531, + "loss": 0.3006, + "step": 9964 + }, + { + "epoch": 0.47216299455105426, + "grad_norm": 0.56640625, + "learning_rate": 0.00010881241574913492, + "loss": 0.8257, + "step": 9965 + }, + { + "epoch": 0.47221037668798865, + "grad_norm": 0.000728607177734375, + "learning_rate": 0.00010879758099736453, + "loss": 0.0001, + "step": 9966 + }, + { + "epoch": 0.472257758824923, + "grad_norm": 0.4609375, + "learning_rate": 0.00010878274605047317, + "loss": 0.0298, + "step": 9967 + }, + { + "epoch": 0.4723051409618574, + "grad_norm": 0.07177734375, + "learning_rate": 0.00010876791090878981, + "loss": 0.008, + "step": 9968 + }, + { + "epoch": 0.4723525230987918, + "grad_norm": 0.46484375, + "learning_rate": 0.00010875307557264356, + "loss": 0.4073, + "step": 9969 + }, + { + "epoch": 0.4723999052357261, + "grad_norm": 0.79296875, + "learning_rate": 0.00010873824004236342, + "loss": 1.0676, + "step": 9970 + }, + { + "epoch": 0.4724472873726605, + "grad_norm": 0.70703125, + "learning_rate": 0.00010872340431827841, + "loss": 1.1719, + "step": 9971 + }, + { + "epoch": 0.4724946695095949, + "grad_norm": 0.55859375, + "learning_rate": 0.0001087085684007176, + "loss": 1.0475, + "step": 9972 + }, + { + "epoch": 0.47254205164652924, + "grad_norm": 0.765625, + "learning_rate": 0.00010869373229001001, + "loss": 1.3784, + "step": 9973 + }, + { + "epoch": 0.47258943378346363, + "grad_norm": 0.462890625, + "learning_rate": 0.0001086788959864847, + "loss": 0.148, + "step": 9974 + }, + { + "epoch": 0.472636815920398, + "grad_norm": 0.6328125, + "learning_rate": 0.00010866405949047074, + "loss": 1.0741, + "step": 9975 + }, + { + "epoch": 0.47268419805733236, + "grad_norm": 0.90625, + "learning_rate": 0.00010864922280229714, + "loss": 0.9483, + "step": 9976 + }, + { + "epoch": 0.47273158019426675, + "grad_norm": 0.65625, + "learning_rate": 0.00010863438592229299, + "loss": 1.1607, + "step": 9977 + }, + { + "epoch": 0.47277896233120115, + "grad_norm": 0.8984375, + "learning_rate": 0.00010861954885078738, + "loss": 1.3892, + "step": 9978 + }, + { + "epoch": 0.47282634446813554, + "grad_norm": 0.82421875, + "learning_rate": 0.00010860471158810934, + "loss": 1.0734, + "step": 9979 + }, + { + "epoch": 0.4728737266050699, + "grad_norm": 0.68359375, + "learning_rate": 0.00010858987413458797, + "loss": 1.1007, + "step": 9980 + }, + { + "epoch": 0.47292110874200427, + "grad_norm": 0.74609375, + "learning_rate": 0.00010857503649055234, + "loss": 1.2357, + "step": 9981 + }, + { + "epoch": 0.47296849087893866, + "grad_norm": 0.19921875, + "learning_rate": 0.00010856019865633156, + "loss": 0.1384, + "step": 9982 + }, + { + "epoch": 0.473015873015873, + "grad_norm": 0.609375, + "learning_rate": 0.00010854536063225465, + "loss": 0.9593, + "step": 9983 + }, + { + "epoch": 0.4730632551528074, + "grad_norm": 0.68359375, + "learning_rate": 0.00010853052241865076, + "loss": 1.2983, + "step": 9984 + }, + { + "epoch": 0.4731106372897418, + "grad_norm": 0.96484375, + "learning_rate": 0.00010851568401584895, + "loss": 0.8646, + "step": 9985 + }, + { + "epoch": 0.4731580194266761, + "grad_norm": 0.5625, + "learning_rate": 0.00010850084542417837, + "loss": 1.2175, + "step": 9986 + }, + { + "epoch": 0.4732054015636105, + "grad_norm": 0.71875, + "learning_rate": 0.00010848600664396807, + "loss": 1.2886, + "step": 9987 + }, + { + "epoch": 0.4732527837005449, + "grad_norm": 0.58203125, + "learning_rate": 0.00010847116767554719, + "loss": 0.8181, + "step": 9988 + }, + { + "epoch": 0.47330016583747925, + "grad_norm": 0.73046875, + "learning_rate": 0.00010845632851924485, + "loss": 1.0204, + "step": 9989 + }, + { + "epoch": 0.47334754797441364, + "grad_norm": 0.01513671875, + "learning_rate": 0.00010844148917539014, + "loss": 0.0006, + "step": 9990 + }, + { + "epoch": 0.47339493011134803, + "grad_norm": 0.193359375, + "learning_rate": 0.00010842664964431219, + "loss": 0.1346, + "step": 9991 + }, + { + "epoch": 0.47344231224828237, + "grad_norm": 0.59765625, + "learning_rate": 0.0001084118099263401, + "loss": 0.9228, + "step": 9992 + }, + { + "epoch": 0.47348969438521676, + "grad_norm": 0.64453125, + "learning_rate": 0.00010839697002180305, + "loss": 0.9449, + "step": 9993 + }, + { + "epoch": 0.47353707652215116, + "grad_norm": 0.81640625, + "learning_rate": 0.00010838212993103016, + "loss": 0.7914, + "step": 9994 + }, + { + "epoch": 0.47358445865908555, + "grad_norm": 0.1318359375, + "learning_rate": 0.00010836728965435054, + "loss": 0.0168, + "step": 9995 + }, + { + "epoch": 0.4736318407960199, + "grad_norm": 0.58984375, + "learning_rate": 0.00010835244919209337, + "loss": 0.7113, + "step": 9996 + }, + { + "epoch": 0.4736792229329543, + "grad_norm": 0.578125, + "learning_rate": 0.00010833760854458774, + "loss": 0.6393, + "step": 9997 + }, + { + "epoch": 0.4737266050698887, + "grad_norm": 0.47265625, + "learning_rate": 0.00010832276771216288, + "loss": 0.2326, + "step": 9998 + }, + { + "epoch": 0.473773987206823, + "grad_norm": 0.74609375, + "learning_rate": 0.00010830792669514784, + "loss": 1.2877, + "step": 9999 + }, + { + "epoch": 0.4738213693437574, + "grad_norm": 0.5703125, + "learning_rate": 0.00010829308549387187, + "loss": 0.5168, + "step": 10000 + }, + { + "epoch": 0.4738687514806918, + "grad_norm": 0.71875, + "learning_rate": 0.00010827824410866409, + "loss": 0.8626, + "step": 10001 + }, + { + "epoch": 0.47391613361762613, + "grad_norm": 0.57421875, + "learning_rate": 0.00010826340253985368, + "loss": 0.9617, + "step": 10002 + }, + { + "epoch": 0.4739635157545605, + "grad_norm": 0.578125, + "learning_rate": 0.00010824856078776979, + "loss": 0.7941, + "step": 10003 + }, + { + "epoch": 0.4740108978914949, + "grad_norm": 0.734375, + "learning_rate": 0.00010823371885274163, + "loss": 1.194, + "step": 10004 + }, + { + "epoch": 0.47405828002842926, + "grad_norm": 0.007568359375, + "learning_rate": 0.00010821887673509834, + "loss": 0.0004, + "step": 10005 + }, + { + "epoch": 0.47410566216536365, + "grad_norm": 0.7265625, + "learning_rate": 0.00010820403443516911, + "loss": 1.1485, + "step": 10006 + }, + { + "epoch": 0.47415304430229804, + "grad_norm": 0.18359375, + "learning_rate": 0.00010818919195328315, + "loss": 0.0852, + "step": 10007 + }, + { + "epoch": 0.47420042643923244, + "grad_norm": 0.69140625, + "learning_rate": 0.00010817434928976962, + "loss": 0.7768, + "step": 10008 + }, + { + "epoch": 0.4742478085761668, + "grad_norm": 0.5234375, + "learning_rate": 0.00010815950644495774, + "loss": 0.8561, + "step": 10009 + }, + { + "epoch": 0.47429519071310117, + "grad_norm": 0.189453125, + "learning_rate": 0.00010814466341917669, + "loss": 0.0363, + "step": 10010 + }, + { + "epoch": 0.47434257285003556, + "grad_norm": 0.76953125, + "learning_rate": 0.00010812982021275569, + "loss": 0.5881, + "step": 10011 + }, + { + "epoch": 0.4743899549869699, + "grad_norm": 1.34375, + "learning_rate": 0.00010811497682602393, + "loss": 0.626, + "step": 10012 + }, + { + "epoch": 0.4744373371239043, + "grad_norm": 0.57421875, + "learning_rate": 0.00010810013325931065, + "loss": 0.7399, + "step": 10013 + }, + { + "epoch": 0.4744847192608387, + "grad_norm": 0.59765625, + "learning_rate": 0.00010808528951294504, + "loss": 0.7295, + "step": 10014 + }, + { + "epoch": 0.474532101397773, + "grad_norm": 0.515625, + "learning_rate": 0.00010807044558725627, + "loss": 0.5209, + "step": 10015 + }, + { + "epoch": 0.4745794835347074, + "grad_norm": 0.60546875, + "learning_rate": 0.00010805560148257365, + "loss": 0.6022, + "step": 10016 + }, + { + "epoch": 0.4746268656716418, + "grad_norm": 0.71875, + "learning_rate": 0.00010804075719922638, + "loss": 1.0504, + "step": 10017 + }, + { + "epoch": 0.47467424780857614, + "grad_norm": 0.54296875, + "learning_rate": 0.00010802591273754365, + "loss": 0.1208, + "step": 10018 + }, + { + "epoch": 0.47472162994551054, + "grad_norm": 0.59375, + "learning_rate": 0.00010801106809785475, + "loss": 0.6259, + "step": 10019 + }, + { + "epoch": 0.47476901208244493, + "grad_norm": 0.6015625, + "learning_rate": 0.00010799622328048888, + "loss": 1.0677, + "step": 10020 + }, + { + "epoch": 0.47481639421937927, + "grad_norm": 0.95703125, + "learning_rate": 0.00010798137828577533, + "loss": 1.0154, + "step": 10021 + }, + { + "epoch": 0.47486377635631366, + "grad_norm": 1.1015625, + "learning_rate": 0.00010796653311404326, + "loss": 0.2195, + "step": 10022 + }, + { + "epoch": 0.47491115849324805, + "grad_norm": 0.68359375, + "learning_rate": 0.00010795168776562198, + "loss": 0.2982, + "step": 10023 + }, + { + "epoch": 0.47495854063018245, + "grad_norm": 0.69140625, + "learning_rate": 0.00010793684224084077, + "loss": 1.2982, + "step": 10024 + }, + { + "epoch": 0.4750059227671168, + "grad_norm": 0.796875, + "learning_rate": 0.00010792199654002881, + "loss": 1.3599, + "step": 10025 + }, + { + "epoch": 0.4750533049040512, + "grad_norm": 0.59375, + "learning_rate": 0.00010790715066351542, + "loss": 0.8199, + "step": 10026 + }, + { + "epoch": 0.47510068704098557, + "grad_norm": 0.703125, + "learning_rate": 0.00010789230461162984, + "loss": 0.9086, + "step": 10027 + }, + { + "epoch": 0.4751480691779199, + "grad_norm": 0.78515625, + "learning_rate": 0.0001078774583847014, + "loss": 1.1019, + "step": 10028 + }, + { + "epoch": 0.4751954513148543, + "grad_norm": 0.62890625, + "learning_rate": 0.00010786261198305929, + "loss": 0.8015, + "step": 10029 + }, + { + "epoch": 0.4752428334517887, + "grad_norm": 0.65625, + "learning_rate": 0.00010784776540703281, + "loss": 1.0677, + "step": 10030 + }, + { + "epoch": 0.47529021558872303, + "grad_norm": 0.6796875, + "learning_rate": 0.00010783291865695127, + "loss": 1.1803, + "step": 10031 + }, + { + "epoch": 0.4753375977256574, + "grad_norm": 0.68359375, + "learning_rate": 0.00010781807173314394, + "loss": 0.953, + "step": 10032 + }, + { + "epoch": 0.4753849798625918, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010780322463594008, + "loss": 0.0117, + "step": 10033 + }, + { + "epoch": 0.47543236199952615, + "grad_norm": 0.67578125, + "learning_rate": 0.00010778837736566902, + "loss": 1.2231, + "step": 10034 + }, + { + "epoch": 0.47547974413646055, + "grad_norm": 0.154296875, + "learning_rate": 0.00010777352992266006, + "loss": 0.1227, + "step": 10035 + }, + { + "epoch": 0.47552712627339494, + "grad_norm": 0.640625, + "learning_rate": 0.00010775868230724249, + "loss": 0.9482, + "step": 10036 + }, + { + "epoch": 0.47557450841032933, + "grad_norm": 0.67578125, + "learning_rate": 0.00010774383451974559, + "loss": 1.0058, + "step": 10037 + }, + { + "epoch": 0.47562189054726367, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001077289865604987, + "loss": 0.145, + "step": 10038 + }, + { + "epoch": 0.47566927268419806, + "grad_norm": 0.95703125, + "learning_rate": 0.00010771413842983111, + "loss": 1.3951, + "step": 10039 + }, + { + "epoch": 0.47571665482113246, + "grad_norm": 0.765625, + "learning_rate": 0.00010769929012807216, + "loss": 0.9192, + "step": 10040 + }, + { + "epoch": 0.4757640369580668, + "grad_norm": 0.703125, + "learning_rate": 0.00010768444165555114, + "loss": 0.8916, + "step": 10041 + }, + { + "epoch": 0.4758114190950012, + "grad_norm": 0.90625, + "learning_rate": 0.0001076695930125974, + "loss": 1.0961, + "step": 10042 + }, + { + "epoch": 0.4758588012319356, + "grad_norm": 0.671875, + "learning_rate": 0.00010765474419954027, + "loss": 0.5015, + "step": 10043 + }, + { + "epoch": 0.4759061833688699, + "grad_norm": 0.91796875, + "learning_rate": 0.00010763989521670905, + "loss": 1.1383, + "step": 10044 + }, + { + "epoch": 0.4759535655058043, + "grad_norm": 0.71875, + "learning_rate": 0.00010762504606443308, + "loss": 1.0595, + "step": 10045 + }, + { + "epoch": 0.4760009476427387, + "grad_norm": 0.76953125, + "learning_rate": 0.00010761019674304173, + "loss": 1.2456, + "step": 10046 + }, + { + "epoch": 0.47604832977967304, + "grad_norm": 1.0703125, + "learning_rate": 0.00010759534725286434, + "loss": 0.9614, + "step": 10047 + }, + { + "epoch": 0.47609571191660743, + "grad_norm": 0.21875, + "learning_rate": 0.0001075804975942302, + "loss": 0.1372, + "step": 10048 + }, + { + "epoch": 0.4761430940535418, + "grad_norm": 0.76953125, + "learning_rate": 0.0001075656477674687, + "loss": 1.1255, + "step": 10049 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.609375, + "learning_rate": 0.0001075507977729092, + "loss": 0.3984, + "step": 10050 + }, + { + "epoch": 0.47623785832741056, + "grad_norm": 0.66796875, + "learning_rate": 0.00010753594761088109, + "loss": 0.7424, + "step": 10051 + }, + { + "epoch": 0.47628524046434495, + "grad_norm": 0.7890625, + "learning_rate": 0.00010752109728171363, + "loss": 0.8824, + "step": 10052 + }, + { + "epoch": 0.47633262260127934, + "grad_norm": 0.87890625, + "learning_rate": 0.00010750624678573627, + "loss": 0.1074, + "step": 10053 + }, + { + "epoch": 0.4763800047382137, + "grad_norm": 0.75390625, + "learning_rate": 0.00010749139612327832, + "loss": 0.9001, + "step": 10054 + }, + { + "epoch": 0.4764273868751481, + "grad_norm": 0.8359375, + "learning_rate": 0.00010747654529466921, + "loss": 0.4796, + "step": 10055 + }, + { + "epoch": 0.47647476901208247, + "grad_norm": 0.9296875, + "learning_rate": 0.0001074616943002383, + "loss": 0.7125, + "step": 10056 + }, + { + "epoch": 0.4765221511490168, + "grad_norm": 0.73046875, + "learning_rate": 0.00010744684314031492, + "loss": 1.3499, + "step": 10057 + }, + { + "epoch": 0.4765695332859512, + "grad_norm": 0.65625, + "learning_rate": 0.00010743199181522853, + "loss": 0.8206, + "step": 10058 + }, + { + "epoch": 0.4766169154228856, + "grad_norm": 0.228515625, + "learning_rate": 0.0001074171403253085, + "loss": 0.0325, + "step": 10059 + }, + { + "epoch": 0.4766642975598199, + "grad_norm": 0.81640625, + "learning_rate": 0.00010740228867088415, + "loss": 0.8872, + "step": 10060 + }, + { + "epoch": 0.4767116796967543, + "grad_norm": 0.5234375, + "learning_rate": 0.00010738743685228492, + "loss": 0.4424, + "step": 10061 + }, + { + "epoch": 0.4767590618336887, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010737258486984024, + "loss": 0.1372, + "step": 10062 + }, + { + "epoch": 0.47680644397062305, + "grad_norm": 0.396484375, + "learning_rate": 0.00010735773272387945, + "loss": 0.0906, + "step": 10063 + }, + { + "epoch": 0.47685382610755744, + "grad_norm": 0.63671875, + "learning_rate": 0.000107342880414732, + "loss": 0.7328, + "step": 10064 + }, + { + "epoch": 0.47690120824449184, + "grad_norm": 0.26953125, + "learning_rate": 0.00010732802794272727, + "loss": 0.0246, + "step": 10065 + }, + { + "epoch": 0.47694859038142623, + "grad_norm": 0.2158203125, + "learning_rate": 0.00010731317530819471, + "loss": 0.0452, + "step": 10066 + }, + { + "epoch": 0.47699597251836057, + "grad_norm": 0.19921875, + "learning_rate": 0.00010729832251146372, + "loss": 0.0289, + "step": 10067 + }, + { + "epoch": 0.47704335465529496, + "grad_norm": 0.6171875, + "learning_rate": 0.00010728346955286368, + "loss": 0.9414, + "step": 10068 + }, + { + "epoch": 0.47709073679222935, + "grad_norm": 0.05322265625, + "learning_rate": 0.00010726861643272406, + "loss": 0.0023, + "step": 10069 + }, + { + "epoch": 0.4771381189291637, + "grad_norm": 0.62109375, + "learning_rate": 0.00010725376315137427, + "loss": 1.0377, + "step": 10070 + }, + { + "epoch": 0.4771855010660981, + "grad_norm": 0.5078125, + "learning_rate": 0.00010723890970914371, + "loss": 0.5354, + "step": 10071 + }, + { + "epoch": 0.4772328832030325, + "grad_norm": 0.54296875, + "learning_rate": 0.00010722405610636188, + "loss": 0.5258, + "step": 10072 + }, + { + "epoch": 0.4772802653399668, + "grad_norm": 0.578125, + "learning_rate": 0.00010720920234335816, + "loss": 0.9203, + "step": 10073 + }, + { + "epoch": 0.4773276474769012, + "grad_norm": 0.72265625, + "learning_rate": 0.00010719434842046203, + "loss": 0.1152, + "step": 10074 + }, + { + "epoch": 0.4773750296138356, + "grad_norm": 0.97265625, + "learning_rate": 0.00010717949433800292, + "loss": 1.1106, + "step": 10075 + }, + { + "epoch": 0.47742241175076994, + "grad_norm": 0.734375, + "learning_rate": 0.00010716464009631024, + "loss": 1.0645, + "step": 10076 + }, + { + "epoch": 0.47746979388770433, + "grad_norm": 0.5625, + "learning_rate": 0.00010714978569571347, + "loss": 0.8776, + "step": 10077 + }, + { + "epoch": 0.4775171760246387, + "grad_norm": 0.61328125, + "learning_rate": 0.00010713493113654212, + "loss": 0.5713, + "step": 10078 + }, + { + "epoch": 0.47756455816157306, + "grad_norm": 0.54296875, + "learning_rate": 0.00010712007641912556, + "loss": 1.1019, + "step": 10079 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.66015625, + "learning_rate": 0.00010710522154379328, + "loss": 1.1372, + "step": 10080 + }, + { + "epoch": 0.47765932243544185, + "grad_norm": 0.46875, + "learning_rate": 0.00010709036651087478, + "loss": 0.7831, + "step": 10081 + }, + { + "epoch": 0.47770670457237624, + "grad_norm": 0.7578125, + "learning_rate": 0.00010707551132069949, + "loss": 0.8754, + "step": 10082 + }, + { + "epoch": 0.4777540867093106, + "grad_norm": 0.53515625, + "learning_rate": 0.00010706065597359692, + "loss": 1.0916, + "step": 10083 + }, + { + "epoch": 0.47780146884624497, + "grad_norm": 0.73046875, + "learning_rate": 0.00010704580046989648, + "loss": 1.004, + "step": 10084 + }, + { + "epoch": 0.47784885098317936, + "grad_norm": 0.455078125, + "learning_rate": 0.0001070309448099277, + "loss": 0.3348, + "step": 10085 + }, + { + "epoch": 0.4778962331201137, + "grad_norm": 0.484375, + "learning_rate": 0.00010701608899402009, + "loss": 0.8956, + "step": 10086 + }, + { + "epoch": 0.4779436152570481, + "grad_norm": 0.427734375, + "learning_rate": 0.00010700123302250307, + "loss": 0.1105, + "step": 10087 + }, + { + "epoch": 0.4779909973939825, + "grad_norm": 0.56640625, + "learning_rate": 0.00010698637689570614, + "loss": 0.8867, + "step": 10088 + }, + { + "epoch": 0.4780383795309168, + "grad_norm": 0.40625, + "learning_rate": 0.00010697152061395885, + "loss": 0.0139, + "step": 10089 + }, + { + "epoch": 0.4780857616678512, + "grad_norm": 0.61328125, + "learning_rate": 0.00010695666417759062, + "loss": 1.2305, + "step": 10090 + }, + { + "epoch": 0.4781331438047856, + "grad_norm": 0.181640625, + "learning_rate": 0.00010694180758693104, + "loss": 0.1164, + "step": 10091 + }, + { + "epoch": 0.47818052594171995, + "grad_norm": 0.54296875, + "learning_rate": 0.00010692695084230952, + "loss": 0.5308, + "step": 10092 + }, + { + "epoch": 0.47822790807865434, + "grad_norm": 0.71875, + "learning_rate": 0.00010691209394405564, + "loss": 1.223, + "step": 10093 + }, + { + "epoch": 0.47827529021558873, + "grad_norm": 0.66015625, + "learning_rate": 0.00010689723689249884, + "loss": 1.0544, + "step": 10094 + }, + { + "epoch": 0.4783226723525231, + "grad_norm": 0.64453125, + "learning_rate": 0.00010688237968796869, + "loss": 1.0804, + "step": 10095 + }, + { + "epoch": 0.47837005448945746, + "grad_norm": 0.6484375, + "learning_rate": 0.00010686752233079467, + "loss": 1.0493, + "step": 10096 + }, + { + "epoch": 0.47841743662639186, + "grad_norm": 0.6796875, + "learning_rate": 0.00010685266482130636, + "loss": 0.8879, + "step": 10097 + }, + { + "epoch": 0.47846481876332625, + "grad_norm": 0.2138671875, + "learning_rate": 0.00010683780715983323, + "loss": 0.1609, + "step": 10098 + }, + { + "epoch": 0.4785122009002606, + "grad_norm": 0.609375, + "learning_rate": 0.00010682294934670482, + "loss": 0.3973, + "step": 10099 + }, + { + "epoch": 0.478559583037195, + "grad_norm": 0.7109375, + "learning_rate": 0.00010680809138225065, + "loss": 1.0785, + "step": 10100 + }, + { + "epoch": 0.4786069651741294, + "grad_norm": 0.7265625, + "learning_rate": 0.00010679323326680029, + "loss": 0.8985, + "step": 10101 + }, + { + "epoch": 0.4786543473110637, + "grad_norm": 0.609375, + "learning_rate": 0.00010677837500068322, + "loss": 0.8989, + "step": 10102 + }, + { + "epoch": 0.4787017294479981, + "grad_norm": 0.3515625, + "learning_rate": 0.000106763516584229, + "loss": 0.0173, + "step": 10103 + }, + { + "epoch": 0.4787491115849325, + "grad_norm": 0.11328125, + "learning_rate": 0.00010674865801776723, + "loss": 0.0018, + "step": 10104 + }, + { + "epoch": 0.47879649372186683, + "grad_norm": 0.55859375, + "learning_rate": 0.00010673379930162739, + "loss": 1.0238, + "step": 10105 + }, + { + "epoch": 0.4788438758588012, + "grad_norm": 0.73828125, + "learning_rate": 0.00010671894043613908, + "loss": 0.7204, + "step": 10106 + }, + { + "epoch": 0.4788912579957356, + "grad_norm": 0.5078125, + "learning_rate": 0.0001067040814216318, + "loss": 0.3304, + "step": 10107 + }, + { + "epoch": 0.47893864013266996, + "grad_norm": 0.2177734375, + "learning_rate": 0.00010668922225843512, + "loss": 0.129, + "step": 10108 + }, + { + "epoch": 0.47898602226960435, + "grad_norm": 0.6484375, + "learning_rate": 0.00010667436294687865, + "loss": 1.1233, + "step": 10109 + }, + { + "epoch": 0.47903340440653874, + "grad_norm": 0.49609375, + "learning_rate": 0.0001066595034872919, + "loss": 1.0682, + "step": 10110 + }, + { + "epoch": 0.47908078654347314, + "grad_norm": 0.80078125, + "learning_rate": 0.00010664464388000445, + "loss": 1.2712, + "step": 10111 + }, + { + "epoch": 0.4791281686804075, + "grad_norm": 0.609375, + "learning_rate": 0.0001066297841253459, + "loss": 0.521, + "step": 10112 + }, + { + "epoch": 0.47917555081734187, + "grad_norm": 0.72265625, + "learning_rate": 0.00010661492422364578, + "loss": 0.5304, + "step": 10113 + }, + { + "epoch": 0.47922293295427626, + "grad_norm": 0.51953125, + "learning_rate": 0.00010660006417523372, + "loss": 0.8213, + "step": 10114 + }, + { + "epoch": 0.4792703150912106, + "grad_norm": 0.61328125, + "learning_rate": 0.00010658520398043923, + "loss": 0.638, + "step": 10115 + }, + { + "epoch": 0.479317697228145, + "grad_norm": 0.64453125, + "learning_rate": 0.00010657034363959195, + "loss": 0.9732, + "step": 10116 + }, + { + "epoch": 0.4793650793650794, + "grad_norm": 0.625, + "learning_rate": 0.00010655548315302144, + "loss": 0.8203, + "step": 10117 + }, + { + "epoch": 0.4794124615020137, + "grad_norm": 0.1103515625, + "learning_rate": 0.00010654062252105728, + "loss": 0.0159, + "step": 10118 + }, + { + "epoch": 0.4794598436389481, + "grad_norm": 0.69140625, + "learning_rate": 0.00010652576174402909, + "loss": 1.1486, + "step": 10119 + }, + { + "epoch": 0.4795072257758825, + "grad_norm": 0.26171875, + "learning_rate": 0.00010651090082226647, + "loss": 0.0382, + "step": 10120 + }, + { + "epoch": 0.47955460791281684, + "grad_norm": 0.62890625, + "learning_rate": 0.00010649603975609898, + "loss": 1.292, + "step": 10121 + }, + { + "epoch": 0.47960199004975124, + "grad_norm": 0.48046875, + "learning_rate": 0.00010648117854585627, + "loss": 0.0801, + "step": 10122 + }, + { + "epoch": 0.47964937218668563, + "grad_norm": 0.75390625, + "learning_rate": 0.0001064663171918679, + "loss": 0.7903, + "step": 10123 + }, + { + "epoch": 0.47969675432362, + "grad_norm": 0.52734375, + "learning_rate": 0.00010645145569446353, + "loss": 0.3721, + "step": 10124 + }, + { + "epoch": 0.47974413646055436, + "grad_norm": 0.7421875, + "learning_rate": 0.00010643659405397273, + "loss": 0.8507, + "step": 10125 + }, + { + "epoch": 0.47979151859748875, + "grad_norm": 0.58984375, + "learning_rate": 0.00010642173227072511, + "loss": 0.1726, + "step": 10126 + }, + { + "epoch": 0.47983890073442315, + "grad_norm": 0.68359375, + "learning_rate": 0.00010640687034505034, + "loss": 0.3973, + "step": 10127 + }, + { + "epoch": 0.4798862828713575, + "grad_norm": 0.7578125, + "learning_rate": 0.000106392008277278, + "loss": 1.1665, + "step": 10128 + }, + { + "epoch": 0.4799336650082919, + "grad_norm": 0.08740234375, + "learning_rate": 0.00010637714606773773, + "loss": 0.0156, + "step": 10129 + }, + { + "epoch": 0.47998104714522627, + "grad_norm": 0.07421875, + "learning_rate": 0.00010636228371675916, + "loss": 0.0057, + "step": 10130 + }, + { + "epoch": 0.4800284292821606, + "grad_norm": 0.765625, + "learning_rate": 0.0001063474212246719, + "loss": 0.8653, + "step": 10131 + }, + { + "epoch": 0.480075811419095, + "grad_norm": 0.240234375, + "learning_rate": 0.0001063325585918056, + "loss": 0.1549, + "step": 10132 + }, + { + "epoch": 0.4801231935560294, + "grad_norm": 0.78125, + "learning_rate": 0.0001063176958184899, + "loss": 1.2178, + "step": 10133 + }, + { + "epoch": 0.48017057569296373, + "grad_norm": 0.640625, + "learning_rate": 0.0001063028329050544, + "loss": 0.6577, + "step": 10134 + }, + { + "epoch": 0.4802179578298981, + "grad_norm": 0.62109375, + "learning_rate": 0.00010628796985182883, + "loss": 0.7462, + "step": 10135 + }, + { + "epoch": 0.4802653399668325, + "grad_norm": 0.8046875, + "learning_rate": 0.00010627310665914276, + "loss": 0.9332, + "step": 10136 + }, + { + "epoch": 0.48031272210376685, + "grad_norm": 0.51171875, + "learning_rate": 0.00010625824332732586, + "loss": 0.5907, + "step": 10137 + }, + { + "epoch": 0.48036010424070125, + "grad_norm": 0.89453125, + "learning_rate": 0.00010624337985670782, + "loss": 0.2605, + "step": 10138 + }, + { + "epoch": 0.48040748637763564, + "grad_norm": 0.51953125, + "learning_rate": 0.00010622851624761828, + "loss": 1.0526, + "step": 10139 + }, + { + "epoch": 0.48045486851457003, + "grad_norm": 0.75, + "learning_rate": 0.00010621365250038682, + "loss": 1.29, + "step": 10140 + }, + { + "epoch": 0.48050225065150437, + "grad_norm": 0.703125, + "learning_rate": 0.00010619878861534318, + "loss": 1.2478, + "step": 10141 + }, + { + "epoch": 0.48054963278843876, + "grad_norm": 0.08203125, + "learning_rate": 0.00010618392459281703, + "loss": 0.0014, + "step": 10142 + }, + { + "epoch": 0.48059701492537316, + "grad_norm": 0.71875, + "learning_rate": 0.00010616906043313803, + "loss": 0.7464, + "step": 10143 + }, + { + "epoch": 0.4806443970623075, + "grad_norm": 0.81640625, + "learning_rate": 0.00010615419613663582, + "loss": 0.859, + "step": 10144 + }, + { + "epoch": 0.4806917791992419, + "grad_norm": 0.6171875, + "learning_rate": 0.0001061393317036401, + "loss": 1.2051, + "step": 10145 + }, + { + "epoch": 0.4807391613361763, + "grad_norm": 0.74609375, + "learning_rate": 0.00010612446713448054, + "loss": 0.9745, + "step": 10146 + }, + { + "epoch": 0.4807865434731106, + "grad_norm": 0.59765625, + "learning_rate": 0.00010610960242948687, + "loss": 1.1956, + "step": 10147 + }, + { + "epoch": 0.480833925610045, + "grad_norm": 0.65625, + "learning_rate": 0.00010609473758898867, + "loss": 1.363, + "step": 10148 + }, + { + "epoch": 0.4808813077469794, + "grad_norm": 0.6796875, + "learning_rate": 0.00010607987261331567, + "loss": 1.1725, + "step": 10149 + }, + { + "epoch": 0.48092868988391374, + "grad_norm": 0.7265625, + "learning_rate": 0.00010606500750279761, + "loss": 1.0699, + "step": 10150 + }, + { + "epoch": 0.48097607202084813, + "grad_norm": 0.578125, + "learning_rate": 0.00010605014225776412, + "loss": 0.0789, + "step": 10151 + }, + { + "epoch": 0.4810234541577825, + "grad_norm": 0.76171875, + "learning_rate": 0.00010603527687854494, + "loss": 1.2713, + "step": 10152 + }, + { + "epoch": 0.4810708362947169, + "grad_norm": 0.71875, + "learning_rate": 0.00010602041136546971, + "loss": 1.409, + "step": 10153 + }, + { + "epoch": 0.48111821843165126, + "grad_norm": 0.78125, + "learning_rate": 0.00010600554571886823, + "loss": 1.0618, + "step": 10154 + }, + { + "epoch": 0.48116560056858565, + "grad_norm": 0.72265625, + "learning_rate": 0.0001059906799390701, + "loss": 1.3923, + "step": 10155 + }, + { + "epoch": 0.48121298270552004, + "grad_norm": 0.2041015625, + "learning_rate": 0.00010597581402640508, + "loss": 0.1376, + "step": 10156 + }, + { + "epoch": 0.4812603648424544, + "grad_norm": 0.62109375, + "learning_rate": 0.00010596094798120286, + "loss": 0.9505, + "step": 10157 + }, + { + "epoch": 0.4813077469793888, + "grad_norm": 0.41015625, + "learning_rate": 0.00010594608180379317, + "loss": 0.0368, + "step": 10158 + }, + { + "epoch": 0.48135512911632317, + "grad_norm": 0.59765625, + "learning_rate": 0.00010593121549450573, + "loss": 0.9188, + "step": 10159 + }, + { + "epoch": 0.4814025112532575, + "grad_norm": 0.859375, + "learning_rate": 0.00010591634905367023, + "loss": 1.0409, + "step": 10160 + }, + { + "epoch": 0.4814498933901919, + "grad_norm": 0.61328125, + "learning_rate": 0.00010590148248161641, + "loss": 1.4114, + "step": 10161 + }, + { + "epoch": 0.4814972755271263, + "grad_norm": 0.50390625, + "learning_rate": 0.00010588661577867403, + "loss": 0.8748, + "step": 10162 + }, + { + "epoch": 0.4815446576640606, + "grad_norm": 0.2041015625, + "learning_rate": 0.00010587174894517276, + "loss": 0.0263, + "step": 10163 + }, + { + "epoch": 0.481592039800995, + "grad_norm": 0.7265625, + "learning_rate": 0.00010585688198144232, + "loss": 1.2197, + "step": 10164 + }, + { + "epoch": 0.4816394219379294, + "grad_norm": 0.6328125, + "learning_rate": 0.00010584201488781251, + "loss": 1.2011, + "step": 10165 + }, + { + "epoch": 0.48168680407486375, + "grad_norm": 0.5078125, + "learning_rate": 0.00010582714766461304, + "loss": 0.7421, + "step": 10166 + }, + { + "epoch": 0.48173418621179814, + "grad_norm": 0.08203125, + "learning_rate": 0.0001058122803121736, + "loss": 0.0014, + "step": 10167 + }, + { + "epoch": 0.48178156834873254, + "grad_norm": 0.625, + "learning_rate": 0.000105797412830824, + "loss": 0.9324, + "step": 10168 + }, + { + "epoch": 0.48182895048566693, + "grad_norm": 0.049072265625, + "learning_rate": 0.00010578254522089397, + "loss": 0.0013, + "step": 10169 + }, + { + "epoch": 0.48187633262260127, + "grad_norm": 0.609375, + "learning_rate": 0.00010576767748271326, + "loss": 1.0638, + "step": 10170 + }, + { + "epoch": 0.48192371475953566, + "grad_norm": 1.0234375, + "learning_rate": 0.00010575280961661159, + "loss": 1.0468, + "step": 10171 + }, + { + "epoch": 0.48197109689647005, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001057379416229187, + "loss": 0.0099, + "step": 10172 + }, + { + "epoch": 0.4820184790334044, + "grad_norm": 0.78125, + "learning_rate": 0.00010572307350196439, + "loss": 1.5398, + "step": 10173 + }, + { + "epoch": 0.4820658611703388, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010570820525407843, + "loss": 0.0073, + "step": 10174 + }, + { + "epoch": 0.4821132433072732, + "grad_norm": 0.61328125, + "learning_rate": 0.00010569333687959053, + "loss": 1.2509, + "step": 10175 + }, + { + "epoch": 0.4821606254442075, + "grad_norm": 0.71484375, + "learning_rate": 0.00010567846837883049, + "loss": 1.298, + "step": 10176 + }, + { + "epoch": 0.4822080075811419, + "grad_norm": 0.60546875, + "learning_rate": 0.0001056635997521281, + "loss": 0.7765, + "step": 10177 + }, + { + "epoch": 0.4822553897180763, + "grad_norm": 0.6640625, + "learning_rate": 0.00010564873099981308, + "loss": 1.2411, + "step": 10178 + }, + { + "epoch": 0.48230277185501064, + "grad_norm": 0.8125, + "learning_rate": 0.0001056338621222152, + "loss": 0.0974, + "step": 10179 + }, + { + "epoch": 0.48235015399194503, + "grad_norm": 0.69921875, + "learning_rate": 0.00010561899311966428, + "loss": 0.0739, + "step": 10180 + }, + { + "epoch": 0.4823975361288794, + "grad_norm": 0.042724609375, + "learning_rate": 0.00010560412399249009, + "loss": 0.0032, + "step": 10181 + }, + { + "epoch": 0.4824449182658138, + "grad_norm": 0.65234375, + "learning_rate": 0.00010558925474102237, + "loss": 0.6145, + "step": 10182 + }, + { + "epoch": 0.48249230040274815, + "grad_norm": 0.019775390625, + "learning_rate": 0.00010557438536559095, + "loss": 0.0009, + "step": 10183 + }, + { + "epoch": 0.48253968253968255, + "grad_norm": 0.6015625, + "learning_rate": 0.00010555951586652557, + "loss": 1.0975, + "step": 10184 + }, + { + "epoch": 0.48258706467661694, + "grad_norm": 0.09423828125, + "learning_rate": 0.00010554464624415612, + "loss": 0.0134, + "step": 10185 + }, + { + "epoch": 0.4826344468135513, + "grad_norm": 0.65625, + "learning_rate": 0.00010552977649881226, + "loss": 0.8918, + "step": 10186 + }, + { + "epoch": 0.48268182895048567, + "grad_norm": 0.5546875, + "learning_rate": 0.00010551490663082386, + "loss": 1.1871, + "step": 10187 + }, + { + "epoch": 0.48272921108742006, + "grad_norm": 0.71875, + "learning_rate": 0.0001055000366405207, + "loss": 0.7529, + "step": 10188 + }, + { + "epoch": 0.4827765932243544, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010548516652823262, + "loss": 0.1328, + "step": 10189 + }, + { + "epoch": 0.4828239753612888, + "grad_norm": 0.75390625, + "learning_rate": 0.00010547029629428937, + "loss": 1.2239, + "step": 10190 + }, + { + "epoch": 0.4828713574982232, + "grad_norm": 0.54296875, + "learning_rate": 0.00010545542593902076, + "loss": 0.712, + "step": 10191 + }, + { + "epoch": 0.4829187396351575, + "grad_norm": 0.65234375, + "learning_rate": 0.00010544055546275664, + "loss": 0.5598, + "step": 10192 + }, + { + "epoch": 0.4829661217720919, + "grad_norm": 0.734375, + "learning_rate": 0.00010542568486582678, + "loss": 0.9865, + "step": 10193 + }, + { + "epoch": 0.4830135039090263, + "grad_norm": 0.478515625, + "learning_rate": 0.000105410814148561, + "loss": 0.9094, + "step": 10194 + }, + { + "epoch": 0.48306088604596065, + "grad_norm": 0.703125, + "learning_rate": 0.00010539594331128912, + "loss": 1.296, + "step": 10195 + }, + { + "epoch": 0.48310826818289504, + "grad_norm": 0.119140625, + "learning_rate": 0.00010538107235434101, + "loss": 0.005, + "step": 10196 + }, + { + "epoch": 0.48315565031982943, + "grad_norm": 0.10986328125, + "learning_rate": 0.00010536620127804639, + "loss": 0.0074, + "step": 10197 + }, + { + "epoch": 0.4832030324567638, + "grad_norm": 0.5625, + "learning_rate": 0.00010535133008273517, + "loss": 0.886, + "step": 10198 + }, + { + "epoch": 0.48325041459369816, + "grad_norm": 0.78125, + "learning_rate": 0.00010533645876873715, + "loss": 1.3604, + "step": 10199 + }, + { + "epoch": 0.48329779673063256, + "grad_norm": 0.5390625, + "learning_rate": 0.00010532158733638216, + "loss": 0.2285, + "step": 10200 + }, + { + "epoch": 0.48334517886756695, + "grad_norm": 0.22265625, + "learning_rate": 0.00010530671578600005, + "loss": 0.1313, + "step": 10201 + }, + { + "epoch": 0.4833925610045013, + "grad_norm": 0.64453125, + "learning_rate": 0.00010529184411792059, + "loss": 1.0717, + "step": 10202 + }, + { + "epoch": 0.4834399431414357, + "grad_norm": 0.44921875, + "learning_rate": 0.00010527697233247369, + "loss": 0.6953, + "step": 10203 + }, + { + "epoch": 0.4834873252783701, + "grad_norm": 0.57421875, + "learning_rate": 0.00010526210042998916, + "loss": 0.8257, + "step": 10204 + }, + { + "epoch": 0.4835347074153044, + "grad_norm": 0.625, + "learning_rate": 0.00010524722841079684, + "loss": 0.245, + "step": 10205 + }, + { + "epoch": 0.4835820895522388, + "grad_norm": 0.65625, + "learning_rate": 0.00010523235627522659, + "loss": 0.9412, + "step": 10206 + }, + { + "epoch": 0.4836294716891732, + "grad_norm": 0.59765625, + "learning_rate": 0.00010521748402360825, + "loss": 0.9764, + "step": 10207 + }, + { + "epoch": 0.48367685382610753, + "grad_norm": 0.43359375, + "learning_rate": 0.00010520261165627168, + "loss": 0.1322, + "step": 10208 + }, + { + "epoch": 0.4837242359630419, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010518773917354673, + "loss": 0.034, + "step": 10209 + }, + { + "epoch": 0.4837716180999763, + "grad_norm": 0.6171875, + "learning_rate": 0.00010517286657576324, + "loss": 0.8706, + "step": 10210 + }, + { + "epoch": 0.4838190002369107, + "grad_norm": 0.57421875, + "learning_rate": 0.00010515799386325107, + "loss": 0.4869, + "step": 10211 + }, + { + "epoch": 0.48386638237384505, + "grad_norm": 0.609375, + "learning_rate": 0.00010514312103634012, + "loss": 0.9291, + "step": 10212 + }, + { + "epoch": 0.48391376451077944, + "grad_norm": 0.7421875, + "learning_rate": 0.00010512824809536019, + "loss": 1.3326, + "step": 10213 + }, + { + "epoch": 0.48396114664771384, + "grad_norm": 0.63671875, + "learning_rate": 0.00010511337504064118, + "loss": 0.8072, + "step": 10214 + }, + { + "epoch": 0.4840085287846482, + "grad_norm": 1.2578125, + "learning_rate": 0.00010509850187251298, + "loss": 0.2092, + "step": 10215 + }, + { + "epoch": 0.48405591092158257, + "grad_norm": 0.78125, + "learning_rate": 0.00010508362859130546, + "loss": 0.8456, + "step": 10216 + }, + { + "epoch": 0.48410329305851696, + "grad_norm": 0.6015625, + "learning_rate": 0.00010506875519734843, + "loss": 0.2392, + "step": 10217 + }, + { + "epoch": 0.4841506751954513, + "grad_norm": 0.1953125, + "learning_rate": 0.00010505388169097182, + "loss": 0.1449, + "step": 10218 + }, + { + "epoch": 0.4841980573323857, + "grad_norm": 0.341796875, + "learning_rate": 0.00010503900807250548, + "loss": 0.128, + "step": 10219 + }, + { + "epoch": 0.4842454394693201, + "grad_norm": 0.8515625, + "learning_rate": 0.00010502413434227933, + "loss": 1.0062, + "step": 10220 + }, + { + "epoch": 0.4842928216062544, + "grad_norm": 0.58984375, + "learning_rate": 0.00010500926050062323, + "loss": 0.8574, + "step": 10221 + }, + { + "epoch": 0.4843402037431888, + "grad_norm": 0.58984375, + "learning_rate": 0.00010499438654786706, + "loss": 0.9706, + "step": 10222 + }, + { + "epoch": 0.4843875858801232, + "grad_norm": 0.478515625, + "learning_rate": 0.00010497951248434073, + "loss": 0.1635, + "step": 10223 + }, + { + "epoch": 0.48443496801705754, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010496463831037413, + "loss": 0.1218, + "step": 10224 + }, + { + "epoch": 0.48448235015399194, + "grad_norm": 0.330078125, + "learning_rate": 0.0001049497640262971, + "loss": 0.0917, + "step": 10225 + }, + { + "epoch": 0.48452973229092633, + "grad_norm": 0.9296875, + "learning_rate": 0.00010493488963243958, + "loss": 0.0567, + "step": 10226 + }, + { + "epoch": 0.4845771144278607, + "grad_norm": 0.640625, + "learning_rate": 0.0001049200151291315, + "loss": 1.2936, + "step": 10227 + }, + { + "epoch": 0.48462449656479506, + "grad_norm": 0.1884765625, + "learning_rate": 0.00010490514051670271, + "loss": 0.1276, + "step": 10228 + }, + { + "epoch": 0.48467187870172945, + "grad_norm": 0.765625, + "learning_rate": 0.00010489026579548311, + "loss": 1.432, + "step": 10229 + }, + { + "epoch": 0.48471926083866385, + "grad_norm": 0.212890625, + "learning_rate": 0.00010487539096580263, + "loss": 0.1523, + "step": 10230 + }, + { + "epoch": 0.4847666429755982, + "grad_norm": 0.546875, + "learning_rate": 0.00010486051602799118, + "loss": 1.1239, + "step": 10231 + }, + { + "epoch": 0.4848140251125326, + "grad_norm": 0.34765625, + "learning_rate": 0.00010484564098237868, + "loss": 0.0408, + "step": 10232 + }, + { + "epoch": 0.48486140724946697, + "grad_norm": 0.17578125, + "learning_rate": 0.00010483076582929501, + "loss": 0.1257, + "step": 10233 + }, + { + "epoch": 0.4849087893864013, + "grad_norm": 0.54296875, + "learning_rate": 0.00010481589056907006, + "loss": 0.8312, + "step": 10234 + }, + { + "epoch": 0.4849561715233357, + "grad_norm": 0.02490234375, + "learning_rate": 0.00010480101520203385, + "loss": 0.0009, + "step": 10235 + }, + { + "epoch": 0.4850035536602701, + "grad_norm": 0.93359375, + "learning_rate": 0.00010478613972851619, + "loss": 0.7291, + "step": 10236 + }, + { + "epoch": 0.48505093579720443, + "grad_norm": 0.8125, + "learning_rate": 0.00010477126414884706, + "loss": 1.1389, + "step": 10237 + }, + { + "epoch": 0.4850983179341388, + "grad_norm": 0.54296875, + "learning_rate": 0.0001047563884633564, + "loss": 0.4756, + "step": 10238 + }, + { + "epoch": 0.4851457000710732, + "grad_norm": 0.65234375, + "learning_rate": 0.00010474151267237408, + "loss": 0.8407, + "step": 10239 + }, + { + "epoch": 0.4851930822080076, + "grad_norm": 0.76953125, + "learning_rate": 0.00010472663677623009, + "loss": 0.8124, + "step": 10240 + }, + { + "epoch": 0.48524046434494195, + "grad_norm": 0.220703125, + "learning_rate": 0.00010471176077525428, + "loss": 0.1401, + "step": 10241 + }, + { + "epoch": 0.48528784648187634, + "grad_norm": 0.69140625, + "learning_rate": 0.00010469688466977667, + "loss": 0.8301, + "step": 10242 + }, + { + "epoch": 0.48533522861881073, + "grad_norm": 0.62109375, + "learning_rate": 0.00010468200846012717, + "loss": 0.8308, + "step": 10243 + }, + { + "epoch": 0.48538261075574507, + "grad_norm": 0.6484375, + "learning_rate": 0.00010466713214663568, + "loss": 1.3149, + "step": 10244 + }, + { + "epoch": 0.48542999289267946, + "grad_norm": 0.5859375, + "learning_rate": 0.00010465225572963217, + "loss": 0.7159, + "step": 10245 + }, + { + "epoch": 0.48547737502961386, + "grad_norm": 0.91015625, + "learning_rate": 0.00010463737920944663, + "loss": 1.0653, + "step": 10246 + }, + { + "epoch": 0.4855247571665482, + "grad_norm": 0.8671875, + "learning_rate": 0.00010462250258640891, + "loss": 0.9131, + "step": 10247 + }, + { + "epoch": 0.4855721393034826, + "grad_norm": 0.78125, + "learning_rate": 0.00010460762586084905, + "loss": 0.8464, + "step": 10248 + }, + { + "epoch": 0.485619521440417, + "grad_norm": 0.466796875, + "learning_rate": 0.0001045927490330969, + "loss": 0.634, + "step": 10249 + }, + { + "epoch": 0.4856669035773513, + "grad_norm": 0.953125, + "learning_rate": 0.00010457787210348251, + "loss": 1.1345, + "step": 10250 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.55078125, + "learning_rate": 0.00010456299507233577, + "loss": 0.3389, + "step": 10251 + }, + { + "epoch": 0.4857616678512201, + "grad_norm": 0.640625, + "learning_rate": 0.00010454811793998666, + "loss": 0.445, + "step": 10252 + }, + { + "epoch": 0.48580904998815444, + "grad_norm": 0.6875, + "learning_rate": 0.00010453324070676516, + "loss": 0.9121, + "step": 10253 + }, + { + "epoch": 0.48585643212508883, + "grad_norm": 0.5625, + "learning_rate": 0.00010451836337300121, + "loss": 0.838, + "step": 10254 + }, + { + "epoch": 0.4859038142620232, + "grad_norm": 0.82421875, + "learning_rate": 0.00010450348593902475, + "loss": 0.995, + "step": 10255 + }, + { + "epoch": 0.4859511963989576, + "grad_norm": 0.5390625, + "learning_rate": 0.00010448860840516579, + "loss": 0.5999, + "step": 10256 + }, + { + "epoch": 0.48599857853589196, + "grad_norm": 1.1796875, + "learning_rate": 0.00010447373077175427, + "loss": 0.3551, + "step": 10257 + }, + { + "epoch": 0.48604596067282635, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00010445885303912017, + "loss": 0.0014, + "step": 10258 + }, + { + "epoch": 0.48609334280976074, + "grad_norm": 0.51171875, + "learning_rate": 0.00010444397520759343, + "loss": 0.9227, + "step": 10259 + }, + { + "epoch": 0.4861407249466951, + "grad_norm": 0.52734375, + "learning_rate": 0.00010442909727750407, + "loss": 1.0303, + "step": 10260 + }, + { + "epoch": 0.4861881070836295, + "grad_norm": 0.08056640625, + "learning_rate": 0.00010441421924918209, + "loss": 0.0103, + "step": 10261 + }, + { + "epoch": 0.48623548922056387, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010439934112295737, + "loss": 0.0176, + "step": 10262 + }, + { + "epoch": 0.4862828713574982, + "grad_norm": 0.020263671875, + "learning_rate": 0.00010438446289916, + "loss": 0.0008, + "step": 10263 + }, + { + "epoch": 0.4863302534944326, + "grad_norm": 0.57421875, + "learning_rate": 0.00010436958457811987, + "loss": 0.5984, + "step": 10264 + }, + { + "epoch": 0.486377635631367, + "grad_norm": 0.80078125, + "learning_rate": 0.00010435470616016703, + "loss": 1.3061, + "step": 10265 + }, + { + "epoch": 0.4864250177683013, + "grad_norm": 0.59375, + "learning_rate": 0.00010433982764563146, + "loss": 0.8373, + "step": 10266 + }, + { + "epoch": 0.4864723999052357, + "grad_norm": 0.67578125, + "learning_rate": 0.00010432494903484312, + "loss": 0.8359, + "step": 10267 + }, + { + "epoch": 0.4865197820421701, + "grad_norm": 0.4140625, + "learning_rate": 0.00010431007032813199, + "loss": 0.0201, + "step": 10268 + }, + { + "epoch": 0.48656716417910445, + "grad_norm": 0.5390625, + "learning_rate": 0.00010429519152582812, + "loss": 0.7844, + "step": 10269 + }, + { + "epoch": 0.48661454631603884, + "grad_norm": 0.5859375, + "learning_rate": 0.00010428031262826148, + "loss": 1.022, + "step": 10270 + }, + { + "epoch": 0.48666192845297324, + "grad_norm": 0.65234375, + "learning_rate": 0.00010426543363576207, + "loss": 0.8896, + "step": 10271 + }, + { + "epoch": 0.48670931058990763, + "grad_norm": 0.53125, + "learning_rate": 0.00010425055454865987, + "loss": 1.1148, + "step": 10272 + }, + { + "epoch": 0.48675669272684197, + "grad_norm": 0.86328125, + "learning_rate": 0.00010423567536728489, + "loss": 0.8805, + "step": 10273 + }, + { + "epoch": 0.48680407486377636, + "grad_norm": 0.55859375, + "learning_rate": 0.00010422079609196716, + "loss": 0.0896, + "step": 10274 + }, + { + "epoch": 0.48685145700071075, + "grad_norm": 0.68359375, + "learning_rate": 0.00010420591672303666, + "loss": 1.0757, + "step": 10275 + }, + { + "epoch": 0.4868988391376451, + "grad_norm": 0.78125, + "learning_rate": 0.00010419103726082339, + "loss": 1.2818, + "step": 10276 + }, + { + "epoch": 0.4869462212745795, + "grad_norm": 0.546875, + "learning_rate": 0.00010417615770565741, + "loss": 0.677, + "step": 10277 + }, + { + "epoch": 0.4869936034115139, + "grad_norm": 0.322265625, + "learning_rate": 0.00010416127805786869, + "loss": 0.0657, + "step": 10278 + }, + { + "epoch": 0.4870409855484482, + "grad_norm": 0.515625, + "learning_rate": 0.00010414639831778726, + "loss": 0.7278, + "step": 10279 + }, + { + "epoch": 0.4870883676853826, + "grad_norm": 0.57421875, + "learning_rate": 0.00010413151848574309, + "loss": 0.5555, + "step": 10280 + }, + { + "epoch": 0.487135749822317, + "grad_norm": 0.74609375, + "learning_rate": 0.0001041166385620663, + "loss": 1.2327, + "step": 10281 + }, + { + "epoch": 0.48718313195925134, + "grad_norm": 0.1279296875, + "learning_rate": 0.00010410175854708681, + "loss": 0.0819, + "step": 10282 + }, + { + "epoch": 0.48723051409618573, + "grad_norm": 0.85546875, + "learning_rate": 0.0001040868784411347, + "loss": 0.9043, + "step": 10283 + }, + { + "epoch": 0.4872778962331201, + "grad_norm": 0.55078125, + "learning_rate": 0.00010407199824454, + "loss": 0.2155, + "step": 10284 + }, + { + "epoch": 0.4873252783700545, + "grad_norm": 0.0986328125, + "learning_rate": 0.00010405711795763269, + "loss": 0.0108, + "step": 10285 + }, + { + "epoch": 0.48737266050698885, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001040422375807428, + "loss": 0.0133, + "step": 10286 + }, + { + "epoch": 0.48742004264392325, + "grad_norm": 0.67578125, + "learning_rate": 0.00010402735711420045, + "loss": 0.7902, + "step": 10287 + }, + { + "epoch": 0.48746742478085764, + "grad_norm": 0.68359375, + "learning_rate": 0.00010401247655833555, + "loss": 1.0957, + "step": 10288 + }, + { + "epoch": 0.487514806917792, + "grad_norm": 0.05810546875, + "learning_rate": 0.00010399759591347825, + "loss": 0.0049, + "step": 10289 + }, + { + "epoch": 0.48756218905472637, + "grad_norm": 0.376953125, + "learning_rate": 0.00010398271517995848, + "loss": 0.0964, + "step": 10290 + }, + { + "epoch": 0.48760957119166076, + "grad_norm": 0.77734375, + "learning_rate": 0.00010396783435810636, + "loss": 0.9047, + "step": 10291 + }, + { + "epoch": 0.4876569533285951, + "grad_norm": 0.173828125, + "learning_rate": 0.00010395295344825189, + "loss": 0.0149, + "step": 10292 + }, + { + "epoch": 0.4877043354655295, + "grad_norm": 0.62109375, + "learning_rate": 0.00010393807245072511, + "loss": 1.1522, + "step": 10293 + }, + { + "epoch": 0.4877517176024639, + "grad_norm": 0.09619140625, + "learning_rate": 0.00010392319136585609, + "loss": 0.0133, + "step": 10294 + }, + { + "epoch": 0.4877990997393982, + "grad_norm": 0.1474609375, + "learning_rate": 0.00010390831019397487, + "loss": 0.0323, + "step": 10295 + }, + { + "epoch": 0.4878464818763326, + "grad_norm": 0.63671875, + "learning_rate": 0.0001038934289354115, + "loss": 0.9189, + "step": 10296 + }, + { + "epoch": 0.487893864013267, + "grad_norm": 0.70703125, + "learning_rate": 0.000103878547590496, + "loss": 1.0949, + "step": 10297 + }, + { + "epoch": 0.48794124615020135, + "grad_norm": 0.609375, + "learning_rate": 0.00010386366615955844, + "loss": 1.1003, + "step": 10298 + }, + { + "epoch": 0.48798862828713574, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001038487846429289, + "loss": 0.0741, + "step": 10299 + }, + { + "epoch": 0.48803601042407013, + "grad_norm": 0.546875, + "learning_rate": 0.00010383390304093743, + "loss": 1.0395, + "step": 10300 + }, + { + "epoch": 0.4880833925610045, + "grad_norm": 0.2392578125, + "learning_rate": 0.00010381902135391404, + "loss": 0.0056, + "step": 10301 + }, + { + "epoch": 0.48813077469793886, + "grad_norm": 0.54296875, + "learning_rate": 0.00010380413958218884, + "loss": 1.0296, + "step": 10302 + }, + { + "epoch": 0.48817815683487326, + "grad_norm": 0.53515625, + "learning_rate": 0.0001037892577260919, + "loss": 0.4365, + "step": 10303 + }, + { + "epoch": 0.48822553897180765, + "grad_norm": 0.6953125, + "learning_rate": 0.00010377437578595323, + "loss": 1.3547, + "step": 10304 + }, + { + "epoch": 0.488272921108742, + "grad_norm": 0.6875, + "learning_rate": 0.00010375949376210295, + "loss": 0.6985, + "step": 10305 + }, + { + "epoch": 0.4883203032456764, + "grad_norm": 0.95703125, + "learning_rate": 0.00010374461165487105, + "loss": 0.8957, + "step": 10306 + }, + { + "epoch": 0.48836768538261077, + "grad_norm": 0.7421875, + "learning_rate": 0.00010372972946458772, + "loss": 1.1189, + "step": 10307 + }, + { + "epoch": 0.4884150675195451, + "grad_norm": 0.166015625, + "learning_rate": 0.00010371484719158291, + "loss": 0.007, + "step": 10308 + }, + { + "epoch": 0.4884624496564795, + "grad_norm": 0.60546875, + "learning_rate": 0.00010369996483618676, + "loss": 0.8763, + "step": 10309 + }, + { + "epoch": 0.4885098317934139, + "grad_norm": 0.58203125, + "learning_rate": 0.00010368508239872934, + "loss": 0.9605, + "step": 10310 + }, + { + "epoch": 0.48855721393034823, + "grad_norm": 0.000507354736328125, + "learning_rate": 0.00010367019987954072, + "loss": 0.0, + "step": 10311 + }, + { + "epoch": 0.4886045960672826, + "grad_norm": 0.703125, + "learning_rate": 0.00010365531727895099, + "loss": 1.1541, + "step": 10312 + }, + { + "epoch": 0.488651978204217, + "grad_norm": 0.60546875, + "learning_rate": 0.00010364043459729018, + "loss": 0.5847, + "step": 10313 + }, + { + "epoch": 0.4886993603411514, + "grad_norm": 0.87109375, + "learning_rate": 0.00010362555183488842, + "loss": 0.7224, + "step": 10314 + }, + { + "epoch": 0.48874674247808575, + "grad_norm": 0.56640625, + "learning_rate": 0.00010361066899207581, + "loss": 0.724, + "step": 10315 + }, + { + "epoch": 0.48879412461502014, + "grad_norm": 0.046142578125, + "learning_rate": 0.00010359578606918236, + "loss": 0.0045, + "step": 10316 + }, + { + "epoch": 0.48884150675195454, + "grad_norm": 0.154296875, + "learning_rate": 0.00010358090306653822, + "loss": 0.1146, + "step": 10317 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.625, + "learning_rate": 0.00010356601998447351, + "loss": 1.2059, + "step": 10318 + }, + { + "epoch": 0.48893627102582327, + "grad_norm": 0.4296875, + "learning_rate": 0.00010355113682331824, + "loss": 0.433, + "step": 10319 + }, + { + "epoch": 0.48898365316275766, + "grad_norm": 0.6484375, + "learning_rate": 0.00010353625358340253, + "loss": 0.9884, + "step": 10320 + }, + { + "epoch": 0.489031035299692, + "grad_norm": 0.61328125, + "learning_rate": 0.00010352137026505649, + "loss": 0.9016, + "step": 10321 + }, + { + "epoch": 0.4890784174366264, + "grad_norm": 0.052978515625, + "learning_rate": 0.0001035064868686102, + "loss": 0.0052, + "step": 10322 + }, + { + "epoch": 0.4891257995735608, + "grad_norm": 0.8203125, + "learning_rate": 0.00010349160339439379, + "loss": 1.0137, + "step": 10323 + }, + { + "epoch": 0.4891731817104951, + "grad_norm": 0.59765625, + "learning_rate": 0.00010347671984273731, + "loss": 1.1278, + "step": 10324 + }, + { + "epoch": 0.4892205638474295, + "grad_norm": 0.734375, + "learning_rate": 0.00010346183621397087, + "loss": 0.0816, + "step": 10325 + }, + { + "epoch": 0.4892679459843639, + "grad_norm": 0.53515625, + "learning_rate": 0.00010344695250842464, + "loss": 0.5045, + "step": 10326 + }, + { + "epoch": 0.48931532812129824, + "grad_norm": 0.01312255859375, + "learning_rate": 0.00010343206872642865, + "loss": 0.0006, + "step": 10327 + }, + { + "epoch": 0.48936271025823264, + "grad_norm": 0.458984375, + "learning_rate": 0.00010341718486831304, + "loss": 0.6119, + "step": 10328 + }, + { + "epoch": 0.48941009239516703, + "grad_norm": 0.61328125, + "learning_rate": 0.0001034023009344079, + "loss": 1.2241, + "step": 10329 + }, + { + "epoch": 0.4894574745321014, + "grad_norm": 0.65625, + "learning_rate": 0.00010338741692504336, + "loss": 0.8122, + "step": 10330 + }, + { + "epoch": 0.48950485666903576, + "grad_norm": 0.5703125, + "learning_rate": 0.00010337253284054951, + "loss": 1.0274, + "step": 10331 + }, + { + "epoch": 0.48955223880597015, + "grad_norm": 0.41796875, + "learning_rate": 0.00010335764868125646, + "loss": 0.1195, + "step": 10332 + }, + { + "epoch": 0.48959962094290455, + "grad_norm": 0.640625, + "learning_rate": 0.00010334276444749436, + "loss": 1.2966, + "step": 10333 + }, + { + "epoch": 0.4896470030798389, + "grad_norm": 0.7265625, + "learning_rate": 0.00010332788013959331, + "loss": 0.8606, + "step": 10334 + }, + { + "epoch": 0.4896943852167733, + "grad_norm": 0.1416015625, + "learning_rate": 0.00010331299575788342, + "loss": 0.0228, + "step": 10335 + }, + { + "epoch": 0.48974176735370767, + "grad_norm": 0.41796875, + "learning_rate": 0.0001032981113026948, + "loss": 0.0618, + "step": 10336 + }, + { + "epoch": 0.489789149490642, + "grad_norm": 0.19921875, + "learning_rate": 0.00010328322677435757, + "loss": 0.024, + "step": 10337 + }, + { + "epoch": 0.4898365316275764, + "grad_norm": 0.6328125, + "learning_rate": 0.00010326834217320191, + "loss": 1.2858, + "step": 10338 + }, + { + "epoch": 0.4898839137645108, + "grad_norm": 0.173828125, + "learning_rate": 0.00010325345749955789, + "loss": 0.1277, + "step": 10339 + }, + { + "epoch": 0.48993129590144513, + "grad_norm": 0.1796875, + "learning_rate": 0.00010323857275375561, + "loss": 0.1292, + "step": 10340 + }, + { + "epoch": 0.4899786780383795, + "grad_norm": 0.57421875, + "learning_rate": 0.00010322368793612529, + "loss": 0.6946, + "step": 10341 + }, + { + "epoch": 0.4900260601753139, + "grad_norm": 0.4765625, + "learning_rate": 0.00010320880304699699, + "loss": 0.8295, + "step": 10342 + }, + { + "epoch": 0.4900734423122483, + "grad_norm": 0.75390625, + "learning_rate": 0.00010319391808670082, + "loss": 0.8974, + "step": 10343 + }, + { + "epoch": 0.49012082444918265, + "grad_norm": 0.224609375, + "learning_rate": 0.00010317903305556697, + "loss": 0.0449, + "step": 10344 + }, + { + "epoch": 0.49016820658611704, + "grad_norm": 0.515625, + "learning_rate": 0.00010316414795392555, + "loss": 0.8658, + "step": 10345 + }, + { + "epoch": 0.49021558872305143, + "grad_norm": 0.00701904296875, + "learning_rate": 0.00010314926278210673, + "loss": 0.0005, + "step": 10346 + }, + { + "epoch": 0.49026297085998577, + "grad_norm": 0.7265625, + "learning_rate": 0.00010313437754044058, + "loss": 0.6892, + "step": 10347 + }, + { + "epoch": 0.49031035299692016, + "grad_norm": 0.68359375, + "learning_rate": 0.00010311949222925727, + "loss": 0.3972, + "step": 10348 + }, + { + "epoch": 0.49035773513385456, + "grad_norm": 0.66796875, + "learning_rate": 0.00010310460684888699, + "loss": 1.0022, + "step": 10349 + }, + { + "epoch": 0.4904051172707889, + "grad_norm": 0.66015625, + "learning_rate": 0.00010308972139965982, + "loss": 1.0487, + "step": 10350 + }, + { + "epoch": 0.4904524994077233, + "grad_norm": 0.71875, + "learning_rate": 0.0001030748358819059, + "loss": 0.9667, + "step": 10351 + }, + { + "epoch": 0.4904998815446577, + "grad_norm": 0.1103515625, + "learning_rate": 0.00010305995029595539, + "loss": 0.0107, + "step": 10352 + }, + { + "epoch": 0.490547263681592, + "grad_norm": 0.68359375, + "learning_rate": 0.00010304506464213848, + "loss": 1.1267, + "step": 10353 + }, + { + "epoch": 0.4905946458185264, + "grad_norm": 0.2060546875, + "learning_rate": 0.00010303017892078523, + "loss": 0.0282, + "step": 10354 + }, + { + "epoch": 0.4906420279554608, + "grad_norm": 0.6328125, + "learning_rate": 0.00010301529313222587, + "loss": 0.6471, + "step": 10355 + }, + { + "epoch": 0.49068941009239514, + "grad_norm": 0.5625, + "learning_rate": 0.0001030004072767905, + "loss": 0.9969, + "step": 10356 + }, + { + "epoch": 0.49073679222932953, + "grad_norm": 0.087890625, + "learning_rate": 0.00010298552135480932, + "loss": 0.0024, + "step": 10357 + }, + { + "epoch": 0.4907841743662639, + "grad_norm": 0.498046875, + "learning_rate": 0.00010297063536661247, + "loss": 0.7482, + "step": 10358 + }, + { + "epoch": 0.4908315565031983, + "grad_norm": 1.015625, + "learning_rate": 0.00010295574931253005, + "loss": 0.8796, + "step": 10359 + }, + { + "epoch": 0.49087893864013266, + "grad_norm": 0.439453125, + "learning_rate": 0.00010294086319289227, + "loss": 0.4574, + "step": 10360 + }, + { + "epoch": 0.49092632077706705, + "grad_norm": 0.57421875, + "learning_rate": 0.00010292597700802928, + "loss": 1.0798, + "step": 10361 + }, + { + "epoch": 0.49097370291400144, + "grad_norm": 0.875, + "learning_rate": 0.00010291109075827124, + "loss": 1.0982, + "step": 10362 + }, + { + "epoch": 0.4910210850509358, + "grad_norm": 0.490234375, + "learning_rate": 0.0001028962044439483, + "loss": 0.7837, + "step": 10363 + }, + { + "epoch": 0.4910684671878702, + "grad_norm": 0.80078125, + "learning_rate": 0.00010288131806539063, + "loss": 0.7936, + "step": 10364 + }, + { + "epoch": 0.49111584932480457, + "grad_norm": 0.1572265625, + "learning_rate": 0.00010286643162292841, + "loss": 0.0107, + "step": 10365 + }, + { + "epoch": 0.4911632314617389, + "grad_norm": 0.52734375, + "learning_rate": 0.00010285154511689179, + "loss": 0.0806, + "step": 10366 + }, + { + "epoch": 0.4912106135986733, + "grad_norm": 0.75390625, + "learning_rate": 0.00010283665854761091, + "loss": 1.3351, + "step": 10367 + }, + { + "epoch": 0.4912579957356077, + "grad_norm": 0.796875, + "learning_rate": 0.00010282177191541597, + "loss": 0.9226, + "step": 10368 + }, + { + "epoch": 0.491305377872542, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00010280688522063717, + "loss": 0.001, + "step": 10369 + }, + { + "epoch": 0.4913527600094764, + "grad_norm": 0.60546875, + "learning_rate": 0.00010279199846360461, + "loss": 1.0816, + "step": 10370 + }, + { + "epoch": 0.4914001421464108, + "grad_norm": 0.62890625, + "learning_rate": 0.00010277711164464852, + "loss": 1.2541, + "step": 10371 + }, + { + "epoch": 0.4914475242833452, + "grad_norm": 0.859375, + "learning_rate": 0.00010276222476409905, + "loss": 0.6041, + "step": 10372 + }, + { + "epoch": 0.49149490642027954, + "grad_norm": 0.75390625, + "learning_rate": 0.00010274733782228638, + "loss": 1.2356, + "step": 10373 + }, + { + "epoch": 0.49154228855721394, + "grad_norm": 0.734375, + "learning_rate": 0.00010273245081954067, + "loss": 1.1031, + "step": 10374 + }, + { + "epoch": 0.49158967069414833, + "grad_norm": 0.50390625, + "learning_rate": 0.00010271756375619208, + "loss": 0.7845, + "step": 10375 + }, + { + "epoch": 0.49163705283108267, + "grad_norm": 0.51953125, + "learning_rate": 0.00010270267663257088, + "loss": 0.7275, + "step": 10376 + }, + { + "epoch": 0.49168443496801706, + "grad_norm": 0.73828125, + "learning_rate": 0.00010268778944900714, + "loss": 0.8636, + "step": 10377 + }, + { + "epoch": 0.49173181710495145, + "grad_norm": 0.625, + "learning_rate": 0.00010267290220583112, + "loss": 0.8971, + "step": 10378 + }, + { + "epoch": 0.4917791992418858, + "grad_norm": 0.052490234375, + "learning_rate": 0.00010265801490337296, + "loss": 0.0027, + "step": 10379 + }, + { + "epoch": 0.4918265813788202, + "grad_norm": 0.318359375, + "learning_rate": 0.0001026431275419629, + "loss": 0.0894, + "step": 10380 + }, + { + "epoch": 0.4918739635157546, + "grad_norm": 0.57421875, + "learning_rate": 0.00010262824012193107, + "loss": 0.9239, + "step": 10381 + }, + { + "epoch": 0.4919213456526889, + "grad_norm": 0.91796875, + "learning_rate": 0.00010261335264360765, + "loss": 1.1283, + "step": 10382 + }, + { + "epoch": 0.4919687277896233, + "grad_norm": 0.75, + "learning_rate": 0.00010259846510732288, + "loss": 1.5949, + "step": 10383 + }, + { + "epoch": 0.4920161099265577, + "grad_norm": 0.65234375, + "learning_rate": 0.0001025835775134069, + "loss": 1.0749, + "step": 10384 + }, + { + "epoch": 0.49206349206349204, + "grad_norm": 0.318359375, + "learning_rate": 0.00010256868986218993, + "loss": 0.1287, + "step": 10385 + }, + { + "epoch": 0.49211087420042643, + "grad_norm": 0.81640625, + "learning_rate": 0.00010255380215400215, + "loss": 0.9982, + "step": 10386 + }, + { + "epoch": 0.4921582563373608, + "grad_norm": 0.68359375, + "learning_rate": 0.00010253891438917375, + "loss": 1.1091, + "step": 10387 + }, + { + "epoch": 0.4922056384742952, + "grad_norm": 0.77734375, + "learning_rate": 0.00010252402656803497, + "loss": 0.9044, + "step": 10388 + }, + { + "epoch": 0.49225302061122955, + "grad_norm": 0.70703125, + "learning_rate": 0.00010250913869091593, + "loss": 1.4099, + "step": 10389 + }, + { + "epoch": 0.49230040274816395, + "grad_norm": 0.6953125, + "learning_rate": 0.0001024942507581469, + "loss": 1.1162, + "step": 10390 + }, + { + "epoch": 0.49234778488509834, + "grad_norm": 0.6328125, + "learning_rate": 0.000102479362770058, + "loss": 0.086, + "step": 10391 + }, + { + "epoch": 0.4923951670220327, + "grad_norm": 0.05859375, + "learning_rate": 0.00010246447472697953, + "loss": 0.0024, + "step": 10392 + }, + { + "epoch": 0.49244254915896707, + "grad_norm": 0.5625, + "learning_rate": 0.0001024495866292416, + "loss": 1.1615, + "step": 10393 + }, + { + "epoch": 0.49248993129590146, + "grad_norm": 0.69140625, + "learning_rate": 0.00010243469847717444, + "loss": 0.6211, + "step": 10394 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.65234375, + "learning_rate": 0.0001024198102711083, + "loss": 1.0741, + "step": 10395 + }, + { + "epoch": 0.4925846955697702, + "grad_norm": 0.72265625, + "learning_rate": 0.0001024049220113733, + "loss": 1.2978, + "step": 10396 + }, + { + "epoch": 0.4926320777067046, + "grad_norm": 0.047607421875, + "learning_rate": 0.00010239003369829974, + "loss": 0.0049, + "step": 10397 + }, + { + "epoch": 0.4926794598436389, + "grad_norm": 0.6640625, + "learning_rate": 0.00010237514533221774, + "loss": 0.6888, + "step": 10398 + }, + { + "epoch": 0.4927268419805733, + "grad_norm": 0.265625, + "learning_rate": 0.00010236025691345757, + "loss": 0.1765, + "step": 10399 + }, + { + "epoch": 0.4927742241175077, + "grad_norm": 0.67578125, + "learning_rate": 0.00010234536844234941, + "loss": 1.2865, + "step": 10400 + }, + { + "epoch": 0.4928216062544421, + "grad_norm": 0.7109375, + "learning_rate": 0.00010233047991922344, + "loss": 0.7992, + "step": 10401 + }, + { + "epoch": 0.49286898839137644, + "grad_norm": 0.671875, + "learning_rate": 0.00010231559134440993, + "loss": 1.3444, + "step": 10402 + }, + { + "epoch": 0.49291637052831083, + "grad_norm": 0.84765625, + "learning_rate": 0.0001023007027182391, + "loss": 1.0896, + "step": 10403 + }, + { + "epoch": 0.4929637526652452, + "grad_norm": 0.296875, + "learning_rate": 0.00010228581404104112, + "loss": 0.0345, + "step": 10404 + }, + { + "epoch": 0.49301113480217956, + "grad_norm": 0.251953125, + "learning_rate": 0.00010227092531314621, + "loss": 0.0355, + "step": 10405 + }, + { + "epoch": 0.49305851693911396, + "grad_norm": 0.6015625, + "learning_rate": 0.0001022560365348846, + "loss": 1.055, + "step": 10406 + }, + { + "epoch": 0.49310589907604835, + "grad_norm": 0.8046875, + "learning_rate": 0.0001022411477065865, + "loss": 1.2288, + "step": 10407 + }, + { + "epoch": 0.4931532812129827, + "grad_norm": 0.7265625, + "learning_rate": 0.00010222625882858212, + "loss": 0.9305, + "step": 10408 + }, + { + "epoch": 0.4932006633499171, + "grad_norm": 0.00274658203125, + "learning_rate": 0.0001022113699012017, + "loss": 0.0002, + "step": 10409 + }, + { + "epoch": 0.49324804548685147, + "grad_norm": 0.59765625, + "learning_rate": 0.00010219648092477545, + "loss": 0.8266, + "step": 10410 + }, + { + "epoch": 0.4932954276237858, + "grad_norm": 0.56640625, + "learning_rate": 0.00010218159189963361, + "loss": 0.9723, + "step": 10411 + }, + { + "epoch": 0.4933428097607202, + "grad_norm": 0.4765625, + "learning_rate": 0.00010216670282610637, + "loss": 0.2313, + "step": 10412 + }, + { + "epoch": 0.4933901918976546, + "grad_norm": 0.546875, + "learning_rate": 0.00010215181370452399, + "loss": 0.5898, + "step": 10413 + }, + { + "epoch": 0.49343757403458893, + "grad_norm": 0.57421875, + "learning_rate": 0.00010213692453521661, + "loss": 0.8182, + "step": 10414 + }, + { + "epoch": 0.4934849561715233, + "grad_norm": 0.435546875, + "learning_rate": 0.00010212203531851459, + "loss": 0.0812, + "step": 10415 + }, + { + "epoch": 0.4935323383084577, + "grad_norm": 0.76953125, + "learning_rate": 0.00010210714605474805, + "loss": 0.2392, + "step": 10416 + }, + { + "epoch": 0.4935797204453921, + "grad_norm": 1.03125, + "learning_rate": 0.00010209225674424727, + "loss": 0.1132, + "step": 10417 + }, + { + "epoch": 0.49362710258232645, + "grad_norm": 0.6953125, + "learning_rate": 0.00010207736738734246, + "loss": 0.7996, + "step": 10418 + }, + { + "epoch": 0.49367448471926084, + "grad_norm": 0.17578125, + "learning_rate": 0.00010206247798436385, + "loss": 0.0155, + "step": 10419 + }, + { + "epoch": 0.49372186685619524, + "grad_norm": 1.203125, + "learning_rate": 0.00010204758853564167, + "loss": 0.3139, + "step": 10420 + }, + { + "epoch": 0.4937692489931296, + "grad_norm": 0.71875, + "learning_rate": 0.00010203269904150619, + "loss": 0.546, + "step": 10421 + }, + { + "epoch": 0.49381663113006397, + "grad_norm": 2.359375, + "learning_rate": 0.00010201780950228759, + "loss": 1.3112, + "step": 10422 + }, + { + "epoch": 0.49386401326699836, + "grad_norm": 0.55078125, + "learning_rate": 0.00010200291991831611, + "loss": 0.8465, + "step": 10423 + }, + { + "epoch": 0.4939113954039327, + "grad_norm": 0.462890625, + "learning_rate": 0.000101988030289922, + "loss": 0.4812, + "step": 10424 + }, + { + "epoch": 0.4939587775408671, + "grad_norm": 0.76953125, + "learning_rate": 0.00010197314061743551, + "loss": 1.0264, + "step": 10425 + }, + { + "epoch": 0.4940061596778015, + "grad_norm": 0.83203125, + "learning_rate": 0.00010195825090118687, + "loss": 0.8086, + "step": 10426 + }, + { + "epoch": 0.4940535418147358, + "grad_norm": 2.34375, + "learning_rate": 0.00010194336114150629, + "loss": 2.0573, + "step": 10427 + }, + { + "epoch": 0.4941009239516702, + "grad_norm": 0.0830078125, + "learning_rate": 0.00010192847133872405, + "loss": 0.0087, + "step": 10428 + }, + { + "epoch": 0.4941483060886046, + "grad_norm": 0.61328125, + "learning_rate": 0.00010191358149317036, + "loss": 0.6567, + "step": 10429 + }, + { + "epoch": 0.494195688225539, + "grad_norm": 0.578125, + "learning_rate": 0.00010189869160517549, + "loss": 0.5986, + "step": 10430 + }, + { + "epoch": 0.49424307036247334, + "grad_norm": 0.216796875, + "learning_rate": 0.00010188380167506963, + "loss": 0.0313, + "step": 10431 + }, + { + "epoch": 0.49429045249940773, + "grad_norm": 0.5078125, + "learning_rate": 0.00010186891170318306, + "loss": 0.4554, + "step": 10432 + }, + { + "epoch": 0.4943378346363421, + "grad_norm": 0.037353515625, + "learning_rate": 0.00010185402168984601, + "loss": 0.0037, + "step": 10433 + }, + { + "epoch": 0.49438521677327646, + "grad_norm": 0.76171875, + "learning_rate": 0.00010183913163538876, + "loss": 0.8794, + "step": 10434 + }, + { + "epoch": 0.49443259891021085, + "grad_norm": 0.8046875, + "learning_rate": 0.00010182424154014152, + "loss": 1.1408, + "step": 10435 + }, + { + "epoch": 0.49447998104714525, + "grad_norm": 0.380859375, + "learning_rate": 0.00010180935140443458, + "loss": 0.1155, + "step": 10436 + }, + { + "epoch": 0.4945273631840796, + "grad_norm": 0.65625, + "learning_rate": 0.0001017944612285981, + "loss": 1.2385, + "step": 10437 + }, + { + "epoch": 0.494574745321014, + "grad_norm": 0.58203125, + "learning_rate": 0.00010177957101296241, + "loss": 0.7973, + "step": 10438 + }, + { + "epoch": 0.49462212745794837, + "grad_norm": 0.59375, + "learning_rate": 0.00010176468075785773, + "loss": 1.2734, + "step": 10439 + }, + { + "epoch": 0.4946695095948827, + "grad_norm": 0.77734375, + "learning_rate": 0.00010174979046361429, + "loss": 1.0335, + "step": 10440 + }, + { + "epoch": 0.4947168917318171, + "grad_norm": 0.6640625, + "learning_rate": 0.00010173490013056237, + "loss": 0.7154, + "step": 10441 + }, + { + "epoch": 0.4947642738687515, + "grad_norm": 0.828125, + "learning_rate": 0.00010172000975903221, + "loss": 1.1064, + "step": 10442 + }, + { + "epoch": 0.49481165600568583, + "grad_norm": 0.1455078125, + "learning_rate": 0.00010170511934935408, + "loss": 0.0372, + "step": 10443 + }, + { + "epoch": 0.4948590381426202, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001016902289018582, + "loss": 0.1204, + "step": 10444 + }, + { + "epoch": 0.4949064202795546, + "grad_norm": 0.75390625, + "learning_rate": 0.00010167533841687486, + "loss": 0.984, + "step": 10445 + }, + { + "epoch": 0.494953802416489, + "grad_norm": 0.09619140625, + "learning_rate": 0.00010166044789473431, + "loss": 0.0056, + "step": 10446 + }, + { + "epoch": 0.49500118455342335, + "grad_norm": 0.62890625, + "learning_rate": 0.00010164555733576675, + "loss": 0.6282, + "step": 10447 + }, + { + "epoch": 0.49504856669035774, + "grad_norm": 0.60546875, + "learning_rate": 0.00010163066674030251, + "loss": 1.0109, + "step": 10448 + }, + { + "epoch": 0.49509594882729213, + "grad_norm": 0.63671875, + "learning_rate": 0.00010161577610867183, + "loss": 1.0961, + "step": 10449 + }, + { + "epoch": 0.49514333096422647, + "grad_norm": 0.7734375, + "learning_rate": 0.00010160088544120494, + "loss": 0.9837, + "step": 10450 + }, + { + "epoch": 0.49519071310116086, + "grad_norm": 0.71875, + "learning_rate": 0.00010158599473823209, + "loss": 1.0436, + "step": 10451 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 0.4921875, + "learning_rate": 0.00010157110400008363, + "loss": 0.9952, + "step": 10452 + }, + { + "epoch": 0.4952854773750296, + "grad_norm": 0.62890625, + "learning_rate": 0.00010155621322708973, + "loss": 1.1686, + "step": 10453 + }, + { + "epoch": 0.495332859511964, + "grad_norm": 0.64453125, + "learning_rate": 0.00010154132241958065, + "loss": 1.1098, + "step": 10454 + }, + { + "epoch": 0.4953802416488984, + "grad_norm": 0.6328125, + "learning_rate": 0.0001015264315778867, + "loss": 0.4043, + "step": 10455 + }, + { + "epoch": 0.4954276237858327, + "grad_norm": 1.171875, + "learning_rate": 0.00010151154070233813, + "loss": 1.1451, + "step": 10456 + }, + { + "epoch": 0.4954750059227671, + "grad_norm": 0.65625, + "learning_rate": 0.00010149664979326518, + "loss": 0.969, + "step": 10457 + }, + { + "epoch": 0.4955223880597015, + "grad_norm": 0.69140625, + "learning_rate": 0.00010148175885099815, + "loss": 0.9108, + "step": 10458 + }, + { + "epoch": 0.4955697701966359, + "grad_norm": 0.734375, + "learning_rate": 0.00010146686787586728, + "loss": 0.5957, + "step": 10459 + }, + { + "epoch": 0.49561715233357023, + "grad_norm": 0.625, + "learning_rate": 0.00010145197686820285, + "loss": 1.1212, + "step": 10460 + }, + { + "epoch": 0.4956645344705046, + "grad_norm": 0.64453125, + "learning_rate": 0.00010143708582833513, + "loss": 0.6518, + "step": 10461 + }, + { + "epoch": 0.495711916607439, + "grad_norm": 0.515625, + "learning_rate": 0.00010142219475659436, + "loss": 0.4941, + "step": 10462 + }, + { + "epoch": 0.49575929874437336, + "grad_norm": 1.625, + "learning_rate": 0.00010140730365331082, + "loss": 1.5096, + "step": 10463 + }, + { + "epoch": 0.49580668088130775, + "grad_norm": 0.28515625, + "learning_rate": 0.0001013924125188148, + "loss": 0.0465, + "step": 10464 + }, + { + "epoch": 0.49585406301824214, + "grad_norm": 0.169921875, + "learning_rate": 0.00010137752135343653, + "loss": 0.0188, + "step": 10465 + }, + { + "epoch": 0.4959014451551765, + "grad_norm": 0.609375, + "learning_rate": 0.0001013626301575063, + "loss": 1.0475, + "step": 10466 + }, + { + "epoch": 0.49594882729211087, + "grad_norm": 0.6015625, + "learning_rate": 0.00010134773893135438, + "loss": 0.5929, + "step": 10467 + }, + { + "epoch": 0.49599620942904526, + "grad_norm": 0.54296875, + "learning_rate": 0.0001013328476753111, + "loss": 0.4285, + "step": 10468 + }, + { + "epoch": 0.4960435915659796, + "grad_norm": 0.333984375, + "learning_rate": 0.00010131795638970665, + "loss": 0.1665, + "step": 10469 + }, + { + "epoch": 0.496090973702914, + "grad_norm": 0.68359375, + "learning_rate": 0.00010130306507487133, + "loss": 0.8403, + "step": 10470 + }, + { + "epoch": 0.4961383558398484, + "grad_norm": 0.61328125, + "learning_rate": 0.00010128817373113539, + "loss": 0.9496, + "step": 10471 + }, + { + "epoch": 0.4961857379767827, + "grad_norm": 0.50390625, + "learning_rate": 0.00010127328235882915, + "loss": 0.7994, + "step": 10472 + }, + { + "epoch": 0.4962331201137171, + "grad_norm": 0.21484375, + "learning_rate": 0.00010125839095828286, + "loss": 0.0093, + "step": 10473 + }, + { + "epoch": 0.4962805022506515, + "grad_norm": 0.53515625, + "learning_rate": 0.00010124349952982681, + "loss": 0.5962, + "step": 10474 + }, + { + "epoch": 0.4963278843875859, + "grad_norm": 0.0986328125, + "learning_rate": 0.00010122860807379127, + "loss": 0.0048, + "step": 10475 + }, + { + "epoch": 0.49637526652452024, + "grad_norm": 0.2021484375, + "learning_rate": 0.00010121371659050652, + "loss": 0.0295, + "step": 10476 + }, + { + "epoch": 0.49642264866145464, + "grad_norm": 0.72265625, + "learning_rate": 0.00010119882508030279, + "loss": 0.7112, + "step": 10477 + }, + { + "epoch": 0.49647003079838903, + "grad_norm": 0.6328125, + "learning_rate": 0.0001011839335435104, + "loss": 0.5813, + "step": 10478 + }, + { + "epoch": 0.49651741293532337, + "grad_norm": 0.259765625, + "learning_rate": 0.00010116904198045963, + "loss": 0.0494, + "step": 10479 + }, + { + "epoch": 0.49656479507225776, + "grad_norm": 0.044677734375, + "learning_rate": 0.00010115415039148078, + "loss": 0.0034, + "step": 10480 + }, + { + "epoch": 0.49661217720919215, + "grad_norm": 0.5390625, + "learning_rate": 0.00010113925877690406, + "loss": 0.2992, + "step": 10481 + }, + { + "epoch": 0.4966595593461265, + "grad_norm": 0.51953125, + "learning_rate": 0.00010112436713705984, + "loss": 0.7335, + "step": 10482 + }, + { + "epoch": 0.4967069414830609, + "grad_norm": 0.8828125, + "learning_rate": 0.00010110947547227834, + "loss": 1.1745, + "step": 10483 + }, + { + "epoch": 0.4967543236199953, + "grad_norm": 0.54296875, + "learning_rate": 0.00010109458378288987, + "loss": 0.8676, + "step": 10484 + }, + { + "epoch": 0.4968017057569296, + "grad_norm": 0.1376953125, + "learning_rate": 0.00010107969206922465, + "loss": 0.0138, + "step": 10485 + }, + { + "epoch": 0.496849087893864, + "grad_norm": 0.07958984375, + "learning_rate": 0.00010106480033161306, + "loss": 0.0081, + "step": 10486 + }, + { + "epoch": 0.4968964700307984, + "grad_norm": 0.68359375, + "learning_rate": 0.0001010499085703853, + "loss": 1.1629, + "step": 10487 + }, + { + "epoch": 0.4969438521677328, + "grad_norm": 0.5546875, + "learning_rate": 0.00010103501678587171, + "loss": 0.7232, + "step": 10488 + }, + { + "epoch": 0.49699123430466713, + "grad_norm": 0.74609375, + "learning_rate": 0.00010102012497840252, + "loss": 1.1733, + "step": 10489 + }, + { + "epoch": 0.4970386164416015, + "grad_norm": 0.578125, + "learning_rate": 0.00010100523314830806, + "loss": 0.9117, + "step": 10490 + }, + { + "epoch": 0.4970859985785359, + "grad_norm": 0.6328125, + "learning_rate": 0.00010099034129591862, + "loss": 0.9889, + "step": 10491 + }, + { + "epoch": 0.49713338071547025, + "grad_norm": 0.6328125, + "learning_rate": 0.00010097544942156445, + "loss": 1.1874, + "step": 10492 + }, + { + "epoch": 0.49718076285240465, + "grad_norm": 0.6484375, + "learning_rate": 0.00010096055752557583, + "loss": 0.947, + "step": 10493 + }, + { + "epoch": 0.49722814498933904, + "grad_norm": 0.328125, + "learning_rate": 0.0001009456656082831, + "loss": 0.0447, + "step": 10494 + }, + { + "epoch": 0.4972755271262734, + "grad_norm": 0.28515625, + "learning_rate": 0.00010093077367001649, + "loss": 0.0465, + "step": 10495 + }, + { + "epoch": 0.49732290926320777, + "grad_norm": 0.70703125, + "learning_rate": 0.00010091588171110631, + "loss": 1.3243, + "step": 10496 + }, + { + "epoch": 0.49737029140014216, + "grad_norm": 0.193359375, + "learning_rate": 0.00010090098973188286, + "loss": 0.1363, + "step": 10497 + }, + { + "epoch": 0.4974176735370765, + "grad_norm": 0.765625, + "learning_rate": 0.00010088609773267643, + "loss": 0.1587, + "step": 10498 + }, + { + "epoch": 0.4974650556740109, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001008712057138173, + "loss": 0.0312, + "step": 10499 + }, + { + "epoch": 0.4975124378109453, + "grad_norm": 1.0859375, + "learning_rate": 0.00010085631367563573, + "loss": 0.3897, + "step": 10500 + }, + { + "epoch": 0.4975598199478796, + "grad_norm": 0.1044921875, + "learning_rate": 0.00010084142161846202, + "loss": 0.0112, + "step": 10501 + }, + { + "epoch": 0.497607202084814, + "grad_norm": 0.423828125, + "learning_rate": 0.0001008265295426265, + "loss": 0.7306, + "step": 10502 + }, + { + "epoch": 0.4976545842217484, + "grad_norm": 0.65234375, + "learning_rate": 0.00010081163744845945, + "loss": 0.805, + "step": 10503 + }, + { + "epoch": 0.4977019663586828, + "grad_norm": 0.58984375, + "learning_rate": 0.00010079674533629111, + "loss": 0.584, + "step": 10504 + }, + { + "epoch": 0.49774934849561714, + "grad_norm": 0.81640625, + "learning_rate": 0.00010078185320645182, + "loss": 0.773, + "step": 10505 + }, + { + "epoch": 0.49779673063255153, + "grad_norm": 0.032470703125, + "learning_rate": 0.00010076696105927188, + "loss": 0.0028, + "step": 10506 + }, + { + "epoch": 0.4978441127694859, + "grad_norm": 0.10009765625, + "learning_rate": 0.00010075206889508155, + "loss": 0.0104, + "step": 10507 + }, + { + "epoch": 0.49789149490642026, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010073717671421113, + "loss": 0.1281, + "step": 10508 + }, + { + "epoch": 0.49793887704335466, + "grad_norm": 1.125, + "learning_rate": 0.0001007222845169909, + "loss": 0.1277, + "step": 10509 + }, + { + "epoch": 0.49798625918028905, + "grad_norm": 0.70703125, + "learning_rate": 0.0001007073923037512, + "loss": 1.2249, + "step": 10510 + }, + { + "epoch": 0.4980336413172234, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00010069250007482227, + "loss": 0.002, + "step": 10511 + }, + { + "epoch": 0.4980810234541578, + "grad_norm": 0.625, + "learning_rate": 0.00010067760783053441, + "loss": 1.2454, + "step": 10512 + }, + { + "epoch": 0.49812840559109217, + "grad_norm": 0.6015625, + "learning_rate": 0.00010066271557121795, + "loss": 1.1108, + "step": 10513 + }, + { + "epoch": 0.4981757877280265, + "grad_norm": 0.76953125, + "learning_rate": 0.00010064782329720317, + "loss": 1.4221, + "step": 10514 + }, + { + "epoch": 0.4982231698649609, + "grad_norm": 0.435546875, + "learning_rate": 0.00010063293100882038, + "loss": 0.5729, + "step": 10515 + }, + { + "epoch": 0.4982705520018953, + "grad_norm": 0.21875, + "learning_rate": 0.00010061803870639981, + "loss": 0.0268, + "step": 10516 + }, + { + "epoch": 0.4983179341388297, + "grad_norm": 0.392578125, + "learning_rate": 0.00010060314639027182, + "loss": 0.2148, + "step": 10517 + }, + { + "epoch": 0.498365316275764, + "grad_norm": 0.349609375, + "learning_rate": 0.00010058825406076671, + "loss": 0.1574, + "step": 10518 + }, + { + "epoch": 0.4984126984126984, + "grad_norm": 0.72265625, + "learning_rate": 0.0001005733617182147, + "loss": 0.1309, + "step": 10519 + }, + { + "epoch": 0.4984600805496328, + "grad_norm": 0.5, + "learning_rate": 0.00010055846936294616, + "loss": 0.2592, + "step": 10520 + }, + { + "epoch": 0.49850746268656715, + "grad_norm": 0.306640625, + "learning_rate": 0.00010054357699529138, + "loss": 0.1505, + "step": 10521 + }, + { + "epoch": 0.49855484482350154, + "grad_norm": 0.77734375, + "learning_rate": 0.00010052868461558063, + "loss": 0.7928, + "step": 10522 + }, + { + "epoch": 0.49860222696043593, + "grad_norm": 0.23828125, + "learning_rate": 0.00010051379222414423, + "loss": 0.0357, + "step": 10523 + }, + { + "epoch": 0.49864960909737027, + "grad_norm": 0.6796875, + "learning_rate": 0.00010049889982131244, + "loss": 1.3586, + "step": 10524 + }, + { + "epoch": 0.49869699123430467, + "grad_norm": 0.095703125, + "learning_rate": 0.00010048400740741559, + "loss": 0.0079, + "step": 10525 + }, + { + "epoch": 0.49874437337123906, + "grad_norm": 0.734375, + "learning_rate": 0.000100469114982784, + "loss": 0.4986, + "step": 10526 + }, + { + "epoch": 0.4987917555081734, + "grad_norm": 0.640625, + "learning_rate": 0.0001004542225477479, + "loss": 0.6466, + "step": 10527 + }, + { + "epoch": 0.4988391376451078, + "grad_norm": 0.09033203125, + "learning_rate": 0.00010043933010263764, + "loss": 0.0078, + "step": 10528 + }, + { + "epoch": 0.4988865197820422, + "grad_norm": 0.75390625, + "learning_rate": 0.0001004244376477835, + "loss": 1.187, + "step": 10529 + }, + { + "epoch": 0.4989339019189765, + "grad_norm": 0.03515625, + "learning_rate": 0.0001004095451835158, + "loss": 0.0023, + "step": 10530 + }, + { + "epoch": 0.4989812840559109, + "grad_norm": 0.48046875, + "learning_rate": 0.00010039465271016484, + "loss": 0.1877, + "step": 10531 + }, + { + "epoch": 0.4990286661928453, + "grad_norm": 0.67578125, + "learning_rate": 0.00010037976022806088, + "loss": 1.3169, + "step": 10532 + }, + { + "epoch": 0.4990760483297797, + "grad_norm": 0.12890625, + "learning_rate": 0.00010036486773753427, + "loss": 0.0134, + "step": 10533 + }, + { + "epoch": 0.49912343046671404, + "grad_norm": 0.49609375, + "learning_rate": 0.00010034997523891523, + "loss": 1.1287, + "step": 10534 + }, + { + "epoch": 0.49917081260364843, + "grad_norm": 0.83203125, + "learning_rate": 0.00010033508273253414, + "loss": 1.1797, + "step": 10535 + }, + { + "epoch": 0.4992181947405828, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010032019021872126, + "loss": 0.028, + "step": 10536 + }, + { + "epoch": 0.49926557687751716, + "grad_norm": 0.53125, + "learning_rate": 0.00010030529769780693, + "loss": 0.7847, + "step": 10537 + }, + { + "epoch": 0.49931295901445155, + "grad_norm": 0.166015625, + "learning_rate": 0.00010029040517012143, + "loss": 0.0235, + "step": 10538 + }, + { + "epoch": 0.49936034115138594, + "grad_norm": 0.022705078125, + "learning_rate": 0.00010027551263599501, + "loss": 0.0018, + "step": 10539 + }, + { + "epoch": 0.4994077232883203, + "grad_norm": 0.734375, + "learning_rate": 0.00010026062009575803, + "loss": 0.8512, + "step": 10540 + }, + { + "epoch": 0.4994551054252547, + "grad_norm": 1.0, + "learning_rate": 0.00010024572754974078, + "loss": 0.8281, + "step": 10541 + }, + { + "epoch": 0.49950248756218907, + "grad_norm": 0.87109375, + "learning_rate": 0.00010023083499827355, + "loss": 1.0409, + "step": 10542 + }, + { + "epoch": 0.4995498696991234, + "grad_norm": 0.212890625, + "learning_rate": 0.00010021594244168664, + "loss": 0.1338, + "step": 10543 + }, + { + "epoch": 0.4995972518360578, + "grad_norm": 0.478515625, + "learning_rate": 0.00010020104988031038, + "loss": 0.6374, + "step": 10544 + }, + { + "epoch": 0.4996446339729922, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010018615731447502, + "loss": 0.1551, + "step": 10545 + }, + { + "epoch": 0.4996920161099266, + "grad_norm": 0.8359375, + "learning_rate": 0.0001001712647445109, + "loss": 1.5011, + "step": 10546 + }, + { + "epoch": 0.4997393982468609, + "grad_norm": 0.69921875, + "learning_rate": 0.00010015637217074832, + "loss": 1.4456, + "step": 10547 + }, + { + "epoch": 0.4997867803837953, + "grad_norm": 0.09375, + "learning_rate": 0.00010014147959351754, + "loss": 0.0076, + "step": 10548 + }, + { + "epoch": 0.4998341625207297, + "grad_norm": 0.474609375, + "learning_rate": 0.00010012658701314894, + "loss": 0.3427, + "step": 10549 + }, + { + "epoch": 0.49988154465766405, + "grad_norm": 0.6953125, + "learning_rate": 0.00010011169442997274, + "loss": 0.7347, + "step": 10550 + }, + { + "epoch": 0.49992892679459844, + "grad_norm": 0.48046875, + "learning_rate": 0.00010009680184431927, + "loss": 0.1643, + "step": 10551 + }, + { + "epoch": 0.49997630893153283, + "grad_norm": 0.177734375, + "learning_rate": 0.00010008190925651888, + "loss": 0.1251, + "step": 10552 + }, + { + "epoch": 0.5000236910684672, + "grad_norm": 0.65234375, + "learning_rate": 0.00010006701666690179, + "loss": 1.2011, + "step": 10553 + }, + { + "epoch": 0.5000236910684672, + "eval_loss": 0.6681665182113647, + "eval_runtime": 1301.5041, + "eval_samples_per_second": 1.803, + "eval_steps_per_second": 1.803, + "step": 10553 + } + ], + "logging_steps": 1, + "max_steps": 21105, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10553, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.436552471964549e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}