{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997, "eval_steps": 500, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008008008008008008, "grad_norm": 1.6445057392120361, "learning_rate": 2e-05, "loss": 2.3547, "step": 1 }, { "epoch": 0.016016016016016016, "grad_norm": 1.63363778591156, "learning_rate": 4e-05, "loss": 2.3812, "step": 2 }, { "epoch": 0.024024024024024024, "grad_norm": 1.6492197513580322, "learning_rate": 6e-05, "loss": 2.3399, "step": 3 }, { "epoch": 0.03203203203203203, "grad_norm": 1.6518611907958984, "learning_rate": 8e-05, "loss": 2.3172, "step": 4 }, { "epoch": 0.04004004004004004, "grad_norm": 1.7173571586608887, "learning_rate": 0.0001, "loss": 2.2563, "step": 5 }, { "epoch": 0.04804804804804805, "grad_norm": 1.563859224319458, "learning_rate": 0.00012, "loss": 2.0256, "step": 6 }, { "epoch": 0.056056056056056056, "grad_norm": 1.5590581893920898, "learning_rate": 0.00014, "loss": 1.8324, "step": 7 }, { "epoch": 0.06406406406406406, "grad_norm": 1.5127277374267578, "learning_rate": 0.00016, "loss": 1.5787, "step": 8 }, { "epoch": 0.07207207207207207, "grad_norm": 1.5447226762771606, "learning_rate": 0.00018, "loss": 1.3826, "step": 9 }, { "epoch": 0.08008008008008008, "grad_norm": 4.600811004638672, "learning_rate": 0.0002, "loss": 1.2388, "step": 10 }, { "epoch": 0.08808808808808809, "grad_norm": 1.6333264112472534, "learning_rate": 0.00019999629591162656, "loss": 1.0977, "step": 11 }, { "epoch": 0.0960960960960961, "grad_norm": 1.5325253009796143, "learning_rate": 0.00019998518392091164, "loss": 1.0178, "step": 12 }, { "epoch": 0.1041041041041041, "grad_norm": 1.866473913192749, "learning_rate": 0.00019996666485105113, "loss": 0.9454, "step": 13 }, { "epoch": 0.11211211211211211, "grad_norm": 1.450692892074585, "learning_rate": 0.0001999407400739705, "loss": 0.8514, "step": 14 }, { "epoch": 0.12012012012012012, "grad_norm": 5.149086952209473, "learning_rate": 0.00019990741151022301, "loss": 0.9136, "step": 15 }, { "epoch": 0.12812812812812813, "grad_norm": 0.7399551272392273, "learning_rate": 0.00019986668162884762, "loss": 0.8742, "step": 16 }, { "epoch": 0.13613613613613615, "grad_norm": 0.6142033934593201, "learning_rate": 0.00019981855344718588, "loss": 0.8082, "step": 17 }, { "epoch": 0.14414414414414414, "grad_norm": 0.47605425119400024, "learning_rate": 0.00019976303053065859, "loss": 0.8019, "step": 18 }, { "epoch": 0.15215215215215216, "grad_norm": 0.3993614614009857, "learning_rate": 0.00019970011699250152, "loss": 0.7625, "step": 19 }, { "epoch": 0.16016016016016016, "grad_norm": 0.4947199821472168, "learning_rate": 0.00019962981749346078, "loss": 0.7419, "step": 20 }, { "epoch": 0.16816816816816818, "grad_norm": 0.549526572227478, "learning_rate": 0.00019955213724144754, "loss": 0.7468, "step": 21 }, { "epoch": 0.17617617617617617, "grad_norm": 0.34314435720443726, "learning_rate": 0.00019946708199115211, "loss": 0.7482, "step": 22 }, { "epoch": 0.1841841841841842, "grad_norm": 0.38283613324165344, "learning_rate": 0.00019937465804361783, "loss": 0.7304, "step": 23 }, { "epoch": 0.1921921921921922, "grad_norm": 0.28871795535087585, "learning_rate": 0.00019927487224577402, "loss": 0.746, "step": 24 }, { "epoch": 0.2002002002002002, "grad_norm": 0.321494996547699, "learning_rate": 0.000199167731989929, "loss": 0.7461, "step": 25 }, { "epoch": 0.2082082082082082, "grad_norm": 0.315449982881546, "learning_rate": 0.0001990532452132223, "loss": 0.7286, "step": 26 }, { "epoch": 0.21621621621621623, "grad_norm": 0.2904318571090698, "learning_rate": 0.00019893142039703664, "loss": 0.7119, "step": 27 }, { "epoch": 0.22422422422422422, "grad_norm": 0.27874529361724854, "learning_rate": 0.00019880226656636977, "loss": 0.7105, "step": 28 }, { "epoch": 0.23223223223223224, "grad_norm": 0.2948579490184784, "learning_rate": 0.0001986657932891657, "loss": 0.6976, "step": 29 }, { "epoch": 0.24024024024024024, "grad_norm": 0.2542964220046997, "learning_rate": 0.00019852201067560606, "loss": 0.7351, "step": 30 }, { "epoch": 0.24824824824824826, "grad_norm": 0.2960706353187561, "learning_rate": 0.000198370929377361, "loss": 0.7179, "step": 31 }, { "epoch": 0.25625625625625625, "grad_norm": 0.24776384234428406, "learning_rate": 0.00019821256058680006, "loss": 0.7134, "step": 32 }, { "epoch": 0.26426426426426425, "grad_norm": 0.33054184913635254, "learning_rate": 0.00019804691603616324, "loss": 0.6995, "step": 33 }, { "epoch": 0.2722722722722723, "grad_norm": 0.2543237805366516, "learning_rate": 0.00019787400799669154, "loss": 0.7081, "step": 34 }, { "epoch": 0.2802802802802803, "grad_norm": 0.25240710377693176, "learning_rate": 0.0001976938492777182, "loss": 0.6928, "step": 35 }, { "epoch": 0.2882882882882883, "grad_norm": 0.35880276560783386, "learning_rate": 0.0001975064532257195, "loss": 0.7177, "step": 36 }, { "epoch": 0.2962962962962963, "grad_norm": 0.3675362467765808, "learning_rate": 0.0001973118337233262, "loss": 0.6865, "step": 37 }, { "epoch": 0.30430430430430433, "grad_norm": 0.3688451051712036, "learning_rate": 0.00019711000518829507, "loss": 0.6724, "step": 38 }, { "epoch": 0.3123123123123123, "grad_norm": 0.2982208728790283, "learning_rate": 0.00019690098257244064, "loss": 0.671, "step": 39 }, { "epoch": 0.3203203203203203, "grad_norm": 0.24197936058044434, "learning_rate": 0.00019668478136052774, "loss": 0.6777, "step": 40 }, { "epoch": 0.3283283283283283, "grad_norm": 0.748349130153656, "learning_rate": 0.00019646141756912434, "loss": 0.6641, "step": 41 }, { "epoch": 0.33633633633633636, "grad_norm": 0.5585939288139343, "learning_rate": 0.00019623090774541487, "loss": 0.6988, "step": 42 }, { "epoch": 0.34434434434434436, "grad_norm": 0.40285471081733704, "learning_rate": 0.00019599326896597448, "loss": 0.6811, "step": 43 }, { "epoch": 0.35235235235235235, "grad_norm": 0.25714346766471863, "learning_rate": 0.00019574851883550395, "loss": 0.6913, "step": 44 }, { "epoch": 0.36036036036036034, "grad_norm": 0.4926215708255768, "learning_rate": 0.00019549667548552556, "loss": 0.6707, "step": 45 }, { "epoch": 0.3683683683683684, "grad_norm": 0.3760850429534912, "learning_rate": 0.00019523775757303974, "loss": 0.6809, "step": 46 }, { "epoch": 0.3763763763763764, "grad_norm": 0.3734811842441559, "learning_rate": 0.0001949717842791432, "loss": 0.6386, "step": 47 }, { "epoch": 0.3843843843843844, "grad_norm": 0.3447561264038086, "learning_rate": 0.00019469877530760754, "loss": 0.6955, "step": 48 }, { "epoch": 0.3923923923923924, "grad_norm": 0.2680707573890686, "learning_rate": 0.00019441875088341997, "loss": 0.6625, "step": 49 }, { "epoch": 0.4004004004004004, "grad_norm": 0.2692941725254059, "learning_rate": 0.00019413173175128473, "loss": 0.66, "step": 50 }, { "epoch": 0.4084084084084084, "grad_norm": 0.32329630851745605, "learning_rate": 0.00019383773917408642, "loss": 0.6612, "step": 51 }, { "epoch": 0.4164164164164164, "grad_norm": 0.281435489654541, "learning_rate": 0.00019353679493131485, "loss": 0.6621, "step": 52 }, { "epoch": 0.4244244244244244, "grad_norm": 0.22186556458473206, "learning_rate": 0.00019322892131745135, "loss": 0.6465, "step": 53 }, { "epoch": 0.43243243243243246, "grad_norm": 0.2902645468711853, "learning_rate": 0.00019291414114031743, "loss": 0.6693, "step": 54 }, { "epoch": 0.44044044044044045, "grad_norm": 0.2899124324321747, "learning_rate": 0.000192592477719385, "loss": 0.6568, "step": 55 }, { "epoch": 0.44844844844844844, "grad_norm": 0.2124062478542328, "learning_rate": 0.00019226395488404876, "loss": 0.6724, "step": 56 }, { "epoch": 0.45645645645645644, "grad_norm": 0.23896393179893494, "learning_rate": 0.00019192859697186106, "loss": 0.6459, "step": 57 }, { "epoch": 0.4644644644644645, "grad_norm": 0.2762405574321747, "learning_rate": 0.00019158642882672873, "loss": 0.6498, "step": 58 }, { "epoch": 0.4724724724724725, "grad_norm": 0.2079935222864151, "learning_rate": 0.00019123747579707275, "loss": 0.6604, "step": 59 }, { "epoch": 0.4804804804804805, "grad_norm": 0.23864208161830902, "learning_rate": 0.0001908817637339503, "loss": 0.6378, "step": 60 }, { "epoch": 0.48848848848848847, "grad_norm": 0.21718506515026093, "learning_rate": 0.00019051931898913976, "loss": 0.6424, "step": 61 }, { "epoch": 0.4964964964964965, "grad_norm": 0.2773915231227875, "learning_rate": 0.0001901501684131884, "loss": 0.6474, "step": 62 }, { "epoch": 0.5045045045045045, "grad_norm": 0.23982493579387665, "learning_rate": 0.0001897743393534234, "loss": 0.6256, "step": 63 }, { "epoch": 0.5125125125125125, "grad_norm": 0.23621873557567596, "learning_rate": 0.0001893918596519257, "loss": 0.6403, "step": 64 }, { "epoch": 0.5205205205205206, "grad_norm": 0.22759953141212463, "learning_rate": 0.00018900275764346768, "loss": 0.6484, "step": 65 }, { "epoch": 0.5285285285285285, "grad_norm": 0.26695549488067627, "learning_rate": 0.00018860706215341382, "loss": 0.609, "step": 66 }, { "epoch": 0.5365365365365365, "grad_norm": 0.24594709277153015, "learning_rate": 0.00018820480249558537, "loss": 0.6338, "step": 67 }, { "epoch": 0.5445445445445446, "grad_norm": 0.22960062325000763, "learning_rate": 0.00018779600847008884, "loss": 0.6166, "step": 68 }, { "epoch": 0.5525525525525525, "grad_norm": 0.25302109122276306, "learning_rate": 0.00018738071036110808, "loss": 0.6422, "step": 69 }, { "epoch": 0.5605605605605606, "grad_norm": 0.3339892327785492, "learning_rate": 0.0001869589389346611, "loss": 0.6558, "step": 70 }, { "epoch": 0.5685685685685685, "grad_norm": 0.21397258341312408, "learning_rate": 0.00018653072543632062, "loss": 0.6323, "step": 71 }, { "epoch": 0.5765765765765766, "grad_norm": 0.2514493465423584, "learning_rate": 0.00018609610158889942, "loss": 0.657, "step": 72 }, { "epoch": 0.5845845845845846, "grad_norm": 0.25317835807800293, "learning_rate": 0.00018565509959010036, "loss": 0.641, "step": 73 }, { "epoch": 0.5925925925925926, "grad_norm": 0.22669494152069092, "learning_rate": 0.00018520775211013093, "loss": 0.6369, "step": 74 }, { "epoch": 0.6006006006006006, "grad_norm": 0.2214743047952652, "learning_rate": 0.00018475409228928312, "loss": 0.6307, "step": 75 }, { "epoch": 0.6086086086086087, "grad_norm": 0.24376747012138367, "learning_rate": 0.00018429415373547828, "loss": 0.6557, "step": 76 }, { "epoch": 0.6166166166166166, "grad_norm": 0.2158333659172058, "learning_rate": 0.00018382797052177746, "loss": 0.655, "step": 77 }, { "epoch": 0.6246246246246246, "grad_norm": 0.25565382838249207, "learning_rate": 0.000183355577183857, "loss": 0.6299, "step": 78 }, { "epoch": 0.6326326326326326, "grad_norm": 0.20636747777462006, "learning_rate": 0.00018287700871745036, "loss": 0.6283, "step": 79 }, { "epoch": 0.6406406406406406, "grad_norm": 0.21258121728897095, "learning_rate": 0.00018239230057575542, "loss": 0.6174, "step": 80 }, { "epoch": 0.6486486486486487, "grad_norm": 0.2861458957195282, "learning_rate": 0.00018190148866680802, "loss": 0.6547, "step": 81 }, { "epoch": 0.6566566566566566, "grad_norm": 0.23667441308498383, "learning_rate": 0.0001814046093508218, "loss": 0.6416, "step": 82 }, { "epoch": 0.6646646646646647, "grad_norm": 0.23191799223423004, "learning_rate": 0.00018090169943749476, "loss": 0.642, "step": 83 }, { "epoch": 0.6726726726726727, "grad_norm": 0.2622171938419342, "learning_rate": 0.00018039279618328212, "loss": 0.6241, "step": 84 }, { "epoch": 0.6806806806806807, "grad_norm": 0.2891266345977783, "learning_rate": 0.00017987793728863651, "loss": 0.6284, "step": 85 }, { "epoch": 0.6886886886886887, "grad_norm": 0.26767420768737793, "learning_rate": 0.00017935716089521474, "loss": 0.627, "step": 86 }, { "epoch": 0.6966966966966966, "grad_norm": 0.2828672230243683, "learning_rate": 0.00017883050558305255, "loss": 0.6418, "step": 87 }, { "epoch": 0.7047047047047047, "grad_norm": 0.32730573415756226, "learning_rate": 0.00017829801036770628, "loss": 0.6629, "step": 88 }, { "epoch": 0.7127127127127127, "grad_norm": 0.24029900133609772, "learning_rate": 0.0001777597146973627, "loss": 0.614, "step": 89 }, { "epoch": 0.7207207207207207, "grad_norm": 0.2929212152957916, "learning_rate": 0.00017721565844991643, "loss": 0.632, "step": 90 }, { "epoch": 0.7287287287287287, "grad_norm": 0.2860666513442993, "learning_rate": 0.00017666588193001595, "loss": 0.6289, "step": 91 }, { "epoch": 0.7367367367367368, "grad_norm": 0.23325330018997192, "learning_rate": 0.00017611042586607748, "loss": 0.6392, "step": 92 }, { "epoch": 0.7447447447447447, "grad_norm": 0.3126169443130493, "learning_rate": 0.00017554933140726802, "loss": 0.6422, "step": 93 }, { "epoch": 0.7527527527527528, "grad_norm": 0.26704883575439453, "learning_rate": 0.00017498264012045687, "loss": 0.6166, "step": 94 }, { "epoch": 0.7607607607607607, "grad_norm": 0.2184283286333084, "learning_rate": 0.00017441039398713608, "loss": 0.6235, "step": 95 }, { "epoch": 0.7687687687687688, "grad_norm": 0.23906390368938446, "learning_rate": 0.00017383263540031067, "loss": 0.6643, "step": 96 }, { "epoch": 0.7767767767767768, "grad_norm": 0.26839691400527954, "learning_rate": 0.0001732494071613579, "loss": 0.6514, "step": 97 }, { "epoch": 0.7847847847847848, "grad_norm": 0.2805701494216919, "learning_rate": 0.00017266075247685656, "loss": 0.6168, "step": 98 }, { "epoch": 0.7927927927927928, "grad_norm": 0.21650992333889008, "learning_rate": 0.00017206671495538612, "loss": 0.5983, "step": 99 }, { "epoch": 0.8008008008008008, "grad_norm": 0.2302800416946411, "learning_rate": 0.00017146733860429612, "loss": 0.6301, "step": 100 }, { "epoch": 0.8088088088088088, "grad_norm": 0.29078415036201477, "learning_rate": 0.000170862667826446, "loss": 0.616, "step": 101 }, { "epoch": 0.8168168168168168, "grad_norm": 0.24860034883022308, "learning_rate": 0.0001702527474169157, "loss": 0.6352, "step": 102 }, { "epoch": 0.8248248248248248, "grad_norm": 0.26281973719596863, "learning_rate": 0.00016963762255968722, "loss": 0.6218, "step": 103 }, { "epoch": 0.8328328328328328, "grad_norm": 0.29051998257637024, "learning_rate": 0.0001690173388242972, "loss": 0.6233, "step": 104 }, { "epoch": 0.8408408408408409, "grad_norm": 0.2471507042646408, "learning_rate": 0.00016839194216246108, "loss": 0.6147, "step": 105 }, { "epoch": 0.8488488488488488, "grad_norm": 0.2574704587459564, "learning_rate": 0.0001677614789046689, "loss": 0.6174, "step": 106 }, { "epoch": 0.8568568568568569, "grad_norm": 0.2551233172416687, "learning_rate": 0.00016712599575675316, "loss": 0.5989, "step": 107 }, { "epoch": 0.8648648648648649, "grad_norm": 0.2901318371295929, "learning_rate": 0.00016648553979642868, "loss": 0.6241, "step": 108 }, { "epoch": 0.8728728728728729, "grad_norm": 0.23769080638885498, "learning_rate": 0.0001658401584698049, "loss": 0.6044, "step": 109 }, { "epoch": 0.8808808808808809, "grad_norm": 0.2580976188182831, "learning_rate": 0.00016518989958787126, "loss": 0.622, "step": 110 }, { "epoch": 0.8888888888888888, "grad_norm": 0.24077744781970978, "learning_rate": 0.00016453481132295506, "loss": 0.6047, "step": 111 }, { "epoch": 0.8968968968968969, "grad_norm": 0.228902667760849, "learning_rate": 0.00016387494220515274, "loss": 0.6138, "step": 112 }, { "epoch": 0.9049049049049049, "grad_norm": 0.2607581317424774, "learning_rate": 0.00016321034111873488, "loss": 0.6307, "step": 113 }, { "epoch": 0.9129129129129129, "grad_norm": 0.2575569450855255, "learning_rate": 0.00016254105729852464, "loss": 0.6008, "step": 114 }, { "epoch": 0.9209209209209209, "grad_norm": 0.231553852558136, "learning_rate": 0.00016186714032625035, "loss": 0.617, "step": 115 }, { "epoch": 0.928928928928929, "grad_norm": 0.24820354580879211, "learning_rate": 0.00016118864012687245, "loss": 0.5991, "step": 116 }, { "epoch": 0.9369369369369369, "grad_norm": 0.2364109754562378, "learning_rate": 0.00016050560696488492, "loss": 0.6094, "step": 117 }, { "epoch": 0.944944944944945, "grad_norm": 0.2492029368877411, "learning_rate": 0.00015981809144059166, "loss": 0.6143, "step": 118 }, { "epoch": 0.9529529529529529, "grad_norm": 0.27745717763900757, "learning_rate": 0.00015912614448635782, "loss": 0.6203, "step": 119 }, { "epoch": 0.960960960960961, "grad_norm": 0.2555610239505768, "learning_rate": 0.00015842981736283686, "loss": 0.6314, "step": 120 }, { "epoch": 0.968968968968969, "grad_norm": 0.2268420308828354, "learning_rate": 0.00015772916165517273, "loss": 0.6155, "step": 121 }, { "epoch": 0.9769769769769769, "grad_norm": 0.250041127204895, "learning_rate": 0.00015702422926917872, "loss": 0.6226, "step": 122 }, { "epoch": 0.984984984984985, "grad_norm": 0.2596072554588318, "learning_rate": 0.00015631507242749187, "loss": 0.6086, "step": 123 }, { "epoch": 0.992992992992993, "grad_norm": 0.2280743271112442, "learning_rate": 0.00015560174366570446, "loss": 0.5994, "step": 124 }, { "epoch": 1.001001001001001, "grad_norm": 0.23362237215042114, "learning_rate": 0.00015488429582847192, "loss": 0.616, "step": 125 }, { "epoch": 1.006, "grad_norm": 0.2956937849521637, "learning_rate": 0.00015416278206559816, "loss": 0.6038, "step": 126 }, { "epoch": 1.014, "grad_norm": 0.250629723072052, "learning_rate": 0.0001534372558280979, "loss": 0.5991, "step": 127 }, { "epoch": 1.022, "grad_norm": 0.231906458735466, "learning_rate": 0.00015270777086423722, "loss": 0.6088, "step": 128 }, { "epoch": 1.03, "grad_norm": 0.2888093590736389, "learning_rate": 0.0001519743812155516, "loss": 0.5892, "step": 129 }, { "epoch": 1.038, "grad_norm": 0.24940524995326996, "learning_rate": 0.0001512371412128424, "loss": 0.5982, "step": 130 }, { "epoch": 1.046, "grad_norm": 0.24017778038978577, "learning_rate": 0.00015049610547215205, "loss": 0.5608, "step": 131 }, { "epoch": 1.054, "grad_norm": 0.2334035485982895, "learning_rate": 0.00014975132889071807, "loss": 0.6034, "step": 132 }, { "epoch": 1.062, "grad_norm": 0.2773897349834442, "learning_rate": 0.00014900286664290592, "loss": 0.6387, "step": 133 }, { "epoch": 1.07, "grad_norm": 0.24266445636749268, "learning_rate": 0.00014825077417612186, "loss": 0.5612, "step": 134 }, { "epoch": 1.078, "grad_norm": 0.22919470071792603, "learning_rate": 0.00014749510720670506, "loss": 0.599, "step": 135 }, { "epoch": 1.086, "grad_norm": 0.23829148709774017, "learning_rate": 0.00014673592171580025, "loss": 0.6066, "step": 136 }, { "epoch": 1.094, "grad_norm": 0.31981223821640015, "learning_rate": 0.00014597327394521044, "loss": 0.5692, "step": 137 }, { "epoch": 1.102, "grad_norm": 0.2747564911842346, "learning_rate": 0.00014520722039323045, "loss": 0.62, "step": 138 }, { "epoch": 1.11, "grad_norm": 0.2592499852180481, "learning_rate": 0.00014443781781046136, "loss": 0.5937, "step": 139 }, { "epoch": 1.1179999999999999, "grad_norm": 0.31891530752182007, "learning_rate": 0.0001436651231956064, "loss": 0.5973, "step": 140 }, { "epoch": 1.126, "grad_norm": 0.2743702232837677, "learning_rate": 0.00014288919379124837, "loss": 0.6045, "step": 141 }, { "epoch": 1.134, "grad_norm": 0.2665708661079407, "learning_rate": 0.00014211008707960897, "loss": 0.5898, "step": 142 }, { "epoch": 1.142, "grad_norm": 0.33267003297805786, "learning_rate": 0.00014132786077829043, "loss": 0.5945, "step": 143 }, { "epoch": 1.15, "grad_norm": 0.28636589646339417, "learning_rate": 0.00014054257283599973, "loss": 0.5914, "step": 144 }, { "epoch": 1.158, "grad_norm": 0.27305400371551514, "learning_rate": 0.0001397542814282556, "loss": 0.6093, "step": 145 }, { "epoch": 1.166, "grad_norm": 0.2839919924736023, "learning_rate": 0.0001389630449530788, "loss": 0.6074, "step": 146 }, { "epoch": 1.174, "grad_norm": 0.25652188062667847, "learning_rate": 0.0001381689220266659, "loss": 0.6059, "step": 147 }, { "epoch": 1.182, "grad_norm": 0.2549704909324646, "learning_rate": 0.0001373719714790469, "loss": 0.5568, "step": 148 }, { "epoch": 1.19, "grad_norm": 0.2509196400642395, "learning_rate": 0.00013657225234972695, "loss": 0.5968, "step": 149 }, { "epoch": 1.198, "grad_norm": 0.25909289717674255, "learning_rate": 0.0001357698238833126, "loss": 0.5902, "step": 150 }, { "epoch": 1.206, "grad_norm": 0.26162394881248474, "learning_rate": 0.00013496474552512287, "loss": 0.5763, "step": 151 }, { "epoch": 1.214, "grad_norm": 0.2721655070781708, "learning_rate": 0.00013415707691678556, "loss": 0.6037, "step": 152 }, { "epoch": 1.222, "grad_norm": 0.28691592812538147, "learning_rate": 0.0001333468778918187, "loss": 0.625, "step": 153 }, { "epoch": 1.23, "grad_norm": 0.25801119208335876, "learning_rate": 0.00013253420847119803, "loss": 0.5994, "step": 154 }, { "epoch": 1.238, "grad_norm": 0.2609824538230896, "learning_rate": 0.00013171912885891063, "loss": 0.5999, "step": 155 }, { "epoch": 1.246, "grad_norm": 0.2631840407848358, "learning_rate": 0.00013090169943749476, "loss": 0.5844, "step": 156 }, { "epoch": 1.254, "grad_norm": 0.2862647473812103, "learning_rate": 0.00013008198076356676, "loss": 0.5653, "step": 157 }, { "epoch": 1.262, "grad_norm": 0.27327024936676025, "learning_rate": 0.00012926003356333488, "loss": 0.5933, "step": 158 }, { "epoch": 1.27, "grad_norm": 0.2684379518032074, "learning_rate": 0.0001284359187281004, "loss": 0.5842, "step": 159 }, { "epoch": 1.278, "grad_norm": 0.2620231807231903, "learning_rate": 0.00012760969730974694, "loss": 0.6079, "step": 160 }, { "epoch": 1.286, "grad_norm": 0.2584543526172638, "learning_rate": 0.00012678143051621742, "loss": 0.5941, "step": 161 }, { "epoch": 1.294, "grad_norm": 0.26279789209365845, "learning_rate": 0.00012595117970697997, "loss": 0.6086, "step": 162 }, { "epoch": 1.302, "grad_norm": 0.2541520893573761, "learning_rate": 0.00012511900638848195, "loss": 0.5907, "step": 163 }, { "epoch": 1.31, "grad_norm": 0.27384746074676514, "learning_rate": 0.0001242849722095936, "loss": 0.6055, "step": 164 }, { "epoch": 1.318, "grad_norm": 0.25682300329208374, "learning_rate": 0.00012344913895704097, "loss": 0.609, "step": 165 }, { "epoch": 1.326, "grad_norm": 0.25404492020606995, "learning_rate": 0.00012261156855082882, "loss": 0.6121, "step": 166 }, { "epoch": 1.334, "grad_norm": 0.2649850845336914, "learning_rate": 0.0001217723230396532, "loss": 0.5695, "step": 167 }, { "epoch": 1.342, "grad_norm": 0.28349268436431885, "learning_rate": 0.00012093146459630487, "loss": 0.5883, "step": 168 }, { "epoch": 1.35, "grad_norm": 0.27068182826042175, "learning_rate": 0.00012008905551306356, "loss": 0.6147, "step": 169 }, { "epoch": 1.358, "grad_norm": 0.29566338658332825, "learning_rate": 0.000119245158197083, "loss": 0.5901, "step": 170 }, { "epoch": 1.366, "grad_norm": 0.27946797013282776, "learning_rate": 0.00011839983516576802, "loss": 0.5831, "step": 171 }, { "epoch": 1.374, "grad_norm": 0.25005409121513367, "learning_rate": 0.00011755314904214284, "loss": 0.5656, "step": 172 }, { "epoch": 1.3820000000000001, "grad_norm": 0.2621053159236908, "learning_rate": 0.00011670516255021193, "loss": 0.5769, "step": 173 }, { "epoch": 1.3900000000000001, "grad_norm": 0.2531629502773285, "learning_rate": 0.00011585593851031347, "loss": 0.5641, "step": 174 }, { "epoch": 1.3980000000000001, "grad_norm": 0.3026553690433502, "learning_rate": 0.00011500553983446527, "loss": 0.6038, "step": 175 }, { "epoch": 1.4060000000000001, "grad_norm": 0.2748616337776184, "learning_rate": 0.00011415402952170433, "loss": 0.5808, "step": 176 }, { "epoch": 1.414, "grad_norm": 0.27345311641693115, "learning_rate": 0.0001133014706534196, "loss": 0.5848, "step": 177 }, { "epoch": 1.422, "grad_norm": 0.273357629776001, "learning_rate": 0.00011244792638867893, "loss": 0.5906, "step": 178 }, { "epoch": 1.43, "grad_norm": 0.26230183243751526, "learning_rate": 0.00011159345995955006, "loss": 0.5886, "step": 179 }, { "epoch": 1.438, "grad_norm": 0.2515832483768463, "learning_rate": 0.00011073813466641632, "loss": 0.5876, "step": 180 }, { "epoch": 1.446, "grad_norm": 0.27451092004776, "learning_rate": 0.00010988201387328717, "loss": 0.5841, "step": 181 }, { "epoch": 1.454, "grad_norm": 0.2588571012020111, "learning_rate": 0.00010902516100310411, "loss": 0.5674, "step": 182 }, { "epoch": 1.462, "grad_norm": 0.2440604865550995, "learning_rate": 0.00010816763953304227, "loss": 0.5646, "step": 183 }, { "epoch": 1.47, "grad_norm": 0.2681560516357422, "learning_rate": 0.00010730951298980776, "loss": 0.5671, "step": 184 }, { "epoch": 1.478, "grad_norm": 0.297048419713974, "learning_rate": 0.00010645084494493165, "loss": 0.583, "step": 185 }, { "epoch": 1.486, "grad_norm": 0.29275089502334595, "learning_rate": 0.00010559169901006034, "loss": 0.6007, "step": 186 }, { "epoch": 1.494, "grad_norm": 0.2599998414516449, "learning_rate": 0.0001047321388322432, "loss": 0.5692, "step": 187 }, { "epoch": 1.502, "grad_norm": 0.2714841663837433, "learning_rate": 0.00010387222808921746, "loss": 0.5996, "step": 188 }, { "epoch": 1.51, "grad_norm": 0.2618089020252228, "learning_rate": 0.00010301203048469083, "loss": 0.5861, "step": 189 }, { "epoch": 1.518, "grad_norm": 0.27270275354385376, "learning_rate": 0.00010215160974362223, "loss": 0.5771, "step": 190 }, { "epoch": 1.526, "grad_norm": 0.2583703398704529, "learning_rate": 0.00010129102960750092, "loss": 0.5897, "step": 191 }, { "epoch": 1.534, "grad_norm": 0.25854265689849854, "learning_rate": 0.00010043035382962443, "loss": 0.5778, "step": 192 }, { "epoch": 1.542, "grad_norm": 0.27234703302383423, "learning_rate": 9.956964617037558e-05, "loss": 0.5659, "step": 193 }, { "epoch": 1.55, "grad_norm": 0.2672337293624878, "learning_rate": 9.870897039249911e-05, "loss": 0.5792, "step": 194 }, { "epoch": 1.558, "grad_norm": 0.2644350528717041, "learning_rate": 9.784839025637778e-05, "loss": 0.5798, "step": 195 }, { "epoch": 1.5659999999999998, "grad_norm": 0.2572389543056488, "learning_rate": 9.698796951530919e-05, "loss": 0.6008, "step": 196 }, { "epoch": 1.5739999999999998, "grad_norm": 0.2693156898021698, "learning_rate": 9.612777191078258e-05, "loss": 0.5903, "step": 197 }, { "epoch": 1.5819999999999999, "grad_norm": 0.26244068145751953, "learning_rate": 9.526786116775682e-05, "loss": 0.569, "step": 198 }, { "epoch": 1.5899999999999999, "grad_norm": 0.25875815749168396, "learning_rate": 9.440830098993969e-05, "loss": 0.6042, "step": 199 }, { "epoch": 1.5979999999999999, "grad_norm": 0.27209436893463135, "learning_rate": 9.354915505506839e-05, "loss": 0.5846, "step": 200 }, { "epoch": 1.6059999999999999, "grad_norm": 0.270780473947525, "learning_rate": 9.269048701019226e-05, "loss": 0.5957, "step": 201 }, { "epoch": 1.6139999999999999, "grad_norm": 0.25013670325279236, "learning_rate": 9.183236046695777e-05, "loss": 0.5845, "step": 202 }, { "epoch": 1.6219999999999999, "grad_norm": 0.2716957628726959, "learning_rate": 9.09748389968959e-05, "loss": 0.5584, "step": 203 }, { "epoch": 1.63, "grad_norm": 0.2737436890602112, "learning_rate": 9.011798612671286e-05, "loss": 0.5836, "step": 204 }, { "epoch": 1.638, "grad_norm": 0.2748481333255768, "learning_rate": 8.92618653335837e-05, "loss": 0.5927, "step": 205 }, { "epoch": 1.646, "grad_norm": 0.2642996609210968, "learning_rate": 8.840654004044996e-05, "loss": 0.6088, "step": 206 }, { "epoch": 1.654, "grad_norm": 0.26341068744659424, "learning_rate": 8.755207361132108e-05, "loss": 0.5841, "step": 207 }, { "epoch": 1.662, "grad_norm": 0.30515289306640625, "learning_rate": 8.669852934658042e-05, "loss": 0.5525, "step": 208 }, { "epoch": 1.67, "grad_norm": 0.2801468074321747, "learning_rate": 8.58459704782957e-05, "loss": 0.5547, "step": 209 }, { "epoch": 1.678, "grad_norm": 0.27618443965911865, "learning_rate": 8.499446016553474e-05, "loss": 0.5613, "step": 210 }, { "epoch": 1.686, "grad_norm": 0.26961931586265564, "learning_rate": 8.414406148968657e-05, "loss": 0.5639, "step": 211 }, { "epoch": 1.694, "grad_norm": 0.29023805260658264, "learning_rate": 8.32948374497881e-05, "loss": 0.5878, "step": 212 }, { "epoch": 1.702, "grad_norm": 0.2671249508857727, "learning_rate": 8.244685095785719e-05, "loss": 0.5743, "step": 213 }, { "epoch": 1.71, "grad_norm": 0.26124948263168335, "learning_rate": 8.160016483423199e-05, "loss": 0.5801, "step": 214 }, { "epoch": 1.718, "grad_norm": 0.2721916437149048, "learning_rate": 8.075484180291701e-05, "loss": 0.5975, "step": 215 }, { "epoch": 1.726, "grad_norm": 0.2630290687084198, "learning_rate": 7.991094448693648e-05, "loss": 0.5714, "step": 216 }, { "epoch": 1.734, "grad_norm": 0.25962400436401367, "learning_rate": 7.906853540369514e-05, "loss": 0.5912, "step": 217 }, { "epoch": 1.742, "grad_norm": 0.26142629981040955, "learning_rate": 7.822767696034682e-05, "loss": 0.577, "step": 218 }, { "epoch": 1.75, "grad_norm": 0.26839005947113037, "learning_rate": 7.738843144917119e-05, "loss": 0.5747, "step": 219 }, { "epoch": 1.758, "grad_norm": 0.28053659200668335, "learning_rate": 7.655086104295904e-05, "loss": 0.612, "step": 220 }, { "epoch": 1.766, "grad_norm": 0.2728564143180847, "learning_rate": 7.571502779040645e-05, "loss": 0.5895, "step": 221 }, { "epoch": 1.774, "grad_norm": 0.26337721943855286, "learning_rate": 7.48809936115181e-05, "loss": 0.5666, "step": 222 }, { "epoch": 1.782, "grad_norm": 0.2544887661933899, "learning_rate": 7.404882029302003e-05, "loss": 0.5552, "step": 223 }, { "epoch": 1.79, "grad_norm": 0.2690827250480652, "learning_rate": 7.321856948378259e-05, "loss": 0.5559, "step": 224 }, { "epoch": 1.798, "grad_norm": 0.27959364652633667, "learning_rate": 7.239030269025311e-05, "loss": 0.5738, "step": 225 }, { "epoch": 1.806, "grad_norm": 0.29118430614471436, "learning_rate": 7.156408127189965e-05, "loss": 0.5753, "step": 226 }, { "epoch": 1.814, "grad_norm": 0.2585116922855377, "learning_rate": 7.073996643666517e-05, "loss": 0.5499, "step": 227 }, { "epoch": 1.822, "grad_norm": 0.24918395280838013, "learning_rate": 6.991801923643324e-05, "loss": 0.5792, "step": 228 }, { "epoch": 1.83, "grad_norm": 0.2664697766304016, "learning_rate": 6.909830056250527e-05, "loss": 0.582, "step": 229 }, { "epoch": 1.838, "grad_norm": 0.2679463326931, "learning_rate": 6.82808711410894e-05, "loss": 0.5919, "step": 230 }, { "epoch": 1.846, "grad_norm": 0.26800620555877686, "learning_rate": 6.746579152880201e-05, "loss": 0.5774, "step": 231 }, { "epoch": 1.854, "grad_norm": 0.25649094581604004, "learning_rate": 6.665312210818131e-05, "loss": 0.5569, "step": 232 }, { "epoch": 1.862, "grad_norm": 0.2661426067352295, "learning_rate": 6.584292308321445e-05, "loss": 0.5759, "step": 233 }, { "epoch": 1.87, "grad_norm": 0.27011772990226746, "learning_rate": 6.503525447487715e-05, "loss": 0.5752, "step": 234 }, { "epoch": 1.8780000000000001, "grad_norm": 0.28006768226623535, "learning_rate": 6.423017611668745e-05, "loss": 0.5873, "step": 235 }, { "epoch": 1.8860000000000001, "grad_norm": 0.281974196434021, "learning_rate": 6.342774765027309e-05, "loss": 0.5868, "step": 236 }, { "epoch": 1.8940000000000001, "grad_norm": 0.28022536635398865, "learning_rate": 6.262802852095311e-05, "loss": 0.5505, "step": 237 }, { "epoch": 1.9020000000000001, "grad_norm": 0.2971389889717102, "learning_rate": 6.18310779733341e-05, "loss": 0.5879, "step": 238 }, { "epoch": 1.9100000000000001, "grad_norm": 0.2779372036457062, "learning_rate": 6.103695504692122e-05, "loss": 0.5648, "step": 239 }, { "epoch": 1.9180000000000001, "grad_norm": 0.3032248616218567, "learning_rate": 6.024571857174443e-05, "loss": 0.5884, "step": 240 }, { "epoch": 1.9260000000000002, "grad_norm": 0.26035642623901367, "learning_rate": 5.94574271640003e-05, "loss": 0.5837, "step": 241 }, { "epoch": 1.9340000000000002, "grad_norm": 0.2789236009120941, "learning_rate": 5.8672139221709577e-05, "loss": 0.5745, "step": 242 }, { "epoch": 1.942, "grad_norm": 0.27459922432899475, "learning_rate": 5.788991292039103e-05, "loss": 0.568, "step": 243 }, { "epoch": 1.95, "grad_norm": 0.2857699394226074, "learning_rate": 5.7110806208751655e-05, "loss": 0.5619, "step": 244 }, { "epoch": 1.958, "grad_norm": 0.2715800106525421, "learning_rate": 5.633487680439361e-05, "loss": 0.5763, "step": 245 }, { "epoch": 1.966, "grad_norm": 0.27073079347610474, "learning_rate": 5.556218218953868e-05, "loss": 0.5815, "step": 246 }, { "epoch": 1.974, "grad_norm": 0.2630924880504608, "learning_rate": 5.479277960676958e-05, "loss": 0.5735, "step": 247 }, { "epoch": 1.982, "grad_norm": 0.2807694971561432, "learning_rate": 5.40267260547896e-05, "loss": 0.5611, "step": 248 }, { "epoch": 1.99, "grad_norm": 0.28481224179267883, "learning_rate": 5.326407828419979e-05, "loss": 0.5671, "step": 249 }, { "epoch": 1.998, "grad_norm": 0.27675607800483704, "learning_rate": 5.2504892793295e-05, "loss": 0.5825, "step": 250 }, { "epoch": 2.005, "grad_norm": 0.27290546894073486, "learning_rate": 5.174922582387819e-05, "loss": 0.5671, "step": 251 }, { "epoch": 2.013, "grad_norm": 0.264244019985199, "learning_rate": 5.0997133357094085e-05, "loss": 0.5697, "step": 252 }, { "epoch": 2.021, "grad_norm": 0.28058966994285583, "learning_rate": 5.0248671109281934e-05, "loss": 0.548, "step": 253 }, { "epoch": 2.029, "grad_norm": 0.28881585597991943, "learning_rate": 4.9503894527847964e-05, "loss": 0.5448, "step": 254 }, { "epoch": 2.037, "grad_norm": 0.2921096980571747, "learning_rate": 4.876285878715764e-05, "loss": 0.5538, "step": 255 }, { "epoch": 2.045, "grad_norm": 0.28465935587882996, "learning_rate": 4.802561878444845e-05, "loss": 0.5271, "step": 256 }, { "epoch": 2.053, "grad_norm": 0.2935147285461426, "learning_rate": 4.729222913576279e-05, "loss": 0.5654, "step": 257 }, { "epoch": 2.061, "grad_norm": 0.28425726294517517, "learning_rate": 4.656274417190214e-05, "loss": 0.5303, "step": 258 }, { "epoch": 2.069, "grad_norm": 0.3496411144733429, "learning_rate": 4.583721793440188e-05, "loss": 0.5462, "step": 259 }, { "epoch": 2.077, "grad_norm": 0.29614442586898804, "learning_rate": 4.5115704171528105e-05, "loss": 0.5528, "step": 260 }, { "epoch": 2.085, "grad_norm": 0.3171357810497284, "learning_rate": 4.439825633429557e-05, "loss": 0.5296, "step": 261 }, { "epoch": 2.093, "grad_norm": 0.3029578626155853, "learning_rate": 4.368492757250814e-05, "loss": 0.5499, "step": 262 }, { "epoch": 2.101, "grad_norm": 0.30544060468673706, "learning_rate": 4.297577073082129e-05, "loss": 0.565, "step": 263 }, { "epoch": 2.109, "grad_norm": 0.3022664785385132, "learning_rate": 4.227083834482728e-05, "loss": 0.5317, "step": 264 }, { "epoch": 2.117, "grad_norm": 0.29141902923583984, "learning_rate": 4.1570182637163155e-05, "loss": 0.5365, "step": 265 }, { "epoch": 2.125, "grad_norm": 0.2986874282360077, "learning_rate": 4.087385551364219e-05, "loss": 0.5617, "step": 266 }, { "epoch": 2.133, "grad_norm": 0.2957478165626526, "learning_rate": 4.0181908559408366e-05, "loss": 0.5285, "step": 267 }, { "epoch": 2.141, "grad_norm": 0.3051709830760956, "learning_rate": 3.949439303511512e-05, "loss": 0.5388, "step": 268 }, { "epoch": 2.149, "grad_norm": 0.30644309520721436, "learning_rate": 3.881135987312757e-05, "loss": 0.556, "step": 269 }, { "epoch": 2.157, "grad_norm": 0.3261376619338989, "learning_rate": 3.813285967374969e-05, "loss": 0.5568, "step": 270 }, { "epoch": 2.165, "grad_norm": 0.29734110832214355, "learning_rate": 3.745894270147539e-05, "loss": 0.5369, "step": 271 }, { "epoch": 2.173, "grad_norm": 0.28361955285072327, "learning_rate": 3.678965888126513e-05, "loss": 0.5395, "step": 272 }, { "epoch": 2.181, "grad_norm": 0.31067171692848206, "learning_rate": 3.612505779484728e-05, "loss": 0.5634, "step": 273 }, { "epoch": 2.189, "grad_norm": 0.31847238540649414, "learning_rate": 3.546518867704499e-05, "loss": 0.5435, "step": 274 }, { "epoch": 2.197, "grad_norm": 0.3154855966567993, "learning_rate": 3.4810100412128747e-05, "loss": 0.545, "step": 275 }, { "epoch": 2.205, "grad_norm": 0.2924310564994812, "learning_rate": 3.415984153019513e-05, "loss": 0.5512, "step": 276 }, { "epoch": 2.213, "grad_norm": 0.301763117313385, "learning_rate": 3.351446020357136e-05, "loss": 0.5619, "step": 277 }, { "epoch": 2.221, "grad_norm": 0.2811594307422638, "learning_rate": 3.287400424324687e-05, "loss": 0.5421, "step": 278 }, { "epoch": 2.229, "grad_norm": 0.2934137284755707, "learning_rate": 3.223852109533112e-05, "loss": 0.5517, "step": 279 }, { "epoch": 2.237, "grad_norm": 0.3074370324611664, "learning_rate": 3.160805783753897e-05, "loss": 0.5655, "step": 280 }, { "epoch": 2.245, "grad_norm": 0.2928076386451721, "learning_rate": 3.098266117570282e-05, "loss": 0.5519, "step": 281 }, { "epoch": 2.253, "grad_norm": 0.3051488995552063, "learning_rate": 3.0362377440312784e-05, "loss": 0.5493, "step": 282 }, { "epoch": 2.261, "grad_norm": 0.30227115750312805, "learning_rate": 2.9747252583084295e-05, "loss": 0.5501, "step": 283 }, { "epoch": 2.269, "grad_norm": 0.29634636640548706, "learning_rate": 2.9137332173554043e-05, "loss": 0.5264, "step": 284 }, { "epoch": 2.277, "grad_norm": 0.2818604111671448, "learning_rate": 2.853266139570391e-05, "loss": 0.5434, "step": 285 }, { "epoch": 2.285, "grad_norm": 0.2941757142543793, "learning_rate": 2.793328504461391e-05, "loss": 0.5477, "step": 286 }, { "epoch": 2.293, "grad_norm": 0.2820718288421631, "learning_rate": 2.733924752314345e-05, "loss": 0.545, "step": 287 }, { "epoch": 2.301, "grad_norm": 0.29931187629699707, "learning_rate": 2.675059283864214e-05, "loss": 0.5372, "step": 288 }, { "epoch": 2.309, "grad_norm": 0.3119406998157501, "learning_rate": 2.616736459968936e-05, "loss": 0.562, "step": 289 }, { "epoch": 2.317, "grad_norm": 0.3057345747947693, "learning_rate": 2.5589606012863963e-05, "loss": 0.5648, "step": 290 }, { "epoch": 2.325, "grad_norm": 0.29526305198669434, "learning_rate": 2.5017359879543166e-05, "loss": 0.5448, "step": 291 }, { "epoch": 2.333, "grad_norm": 0.32076817750930786, "learning_rate": 2.4450668592731974e-05, "loss": 0.5537, "step": 292 }, { "epoch": 2.341, "grad_norm": 0.3103630542755127, "learning_rate": 2.388957413392253e-05, "loss": 0.5746, "step": 293 }, { "epoch": 2.349, "grad_norm": 0.2959766089916229, "learning_rate": 2.33341180699841e-05, "loss": 0.5532, "step": 294 }, { "epoch": 2.357, "grad_norm": 0.28677570819854736, "learning_rate": 2.2784341550083576e-05, "loss": 0.5439, "step": 295 }, { "epoch": 2.365, "grad_norm": 0.29818570613861084, "learning_rate": 2.224028530263733e-05, "loss": 0.5453, "step": 296 }, { "epoch": 2.373, "grad_norm": 0.2917187809944153, "learning_rate": 2.1701989632293717e-05, "loss": 0.5335, "step": 297 }, { "epoch": 2.3810000000000002, "grad_norm": 0.31331866979599, "learning_rate": 2.1169494416947477e-05, "loss": 0.5663, "step": 298 }, { "epoch": 2.3890000000000002, "grad_norm": 0.2955493927001953, "learning_rate": 2.0642839104785272e-05, "loss": 0.5509, "step": 299 }, { "epoch": 2.3970000000000002, "grad_norm": 0.3018077313899994, "learning_rate": 2.0122062711363532e-05, "loss": 0.5435, "step": 300 }, { "epoch": 2.4050000000000002, "grad_norm": 0.3038371801376343, "learning_rate": 1.9607203816717888e-05, "loss": 0.5544, "step": 301 }, { "epoch": 2.413, "grad_norm": 0.29873090982437134, "learning_rate": 1.9098300562505266e-05, "loss": 0.5606, "step": 302 }, { "epoch": 2.421, "grad_norm": 0.29808226227760315, "learning_rate": 1.859539064917821e-05, "loss": 0.5479, "step": 303 }, { "epoch": 2.429, "grad_norm": 0.3080662190914154, "learning_rate": 1.8098511333192024e-05, "loss": 0.5587, "step": 304 }, { "epoch": 2.437, "grad_norm": 0.29491138458251953, "learning_rate": 1.7607699424244585e-05, "loss": 0.5482, "step": 305 }, { "epoch": 2.445, "grad_norm": 0.3016619384288788, "learning_rate": 1.712299128254965e-05, "loss": 0.5539, "step": 306 }, { "epoch": 2.453, "grad_norm": 0.31340616941452026, "learning_rate": 1.6644422816143024e-05, "loss": 0.5559, "step": 307 }, { "epoch": 2.461, "grad_norm": 0.3533984422683716, "learning_rate": 1.6172029478222594e-05, "loss": 0.5288, "step": 308 }, { "epoch": 2.469, "grad_norm": 0.2987024188041687, "learning_rate": 1.570584626452173e-05, "loss": 0.5405, "step": 309 }, { "epoch": 2.477, "grad_norm": 0.312209814786911, "learning_rate": 1.5245907710716911e-05, "loss": 0.544, "step": 310 }, { "epoch": 2.485, "grad_norm": 0.30217549204826355, "learning_rate": 1.4792247889869071e-05, "loss": 0.5483, "step": 311 }, { "epoch": 2.493, "grad_norm": 0.2916695475578308, "learning_rate": 1.4344900409899642e-05, "loss": 0.5461, "step": 312 }, { "epoch": 2.501, "grad_norm": 0.3004852831363678, "learning_rate": 1.3903898411100568e-05, "loss": 0.5427, "step": 313 }, { "epoch": 2.509, "grad_norm": 0.3007405400276184, "learning_rate": 1.3469274563679402e-05, "loss": 0.5666, "step": 314 }, { "epoch": 2.517, "grad_norm": 0.2975088655948639, "learning_rate": 1.30410610653389e-05, "loss": 0.5528, "step": 315 }, { "epoch": 2.525, "grad_norm": 0.31289756298065186, "learning_rate": 1.261928963889194e-05, "loss": 0.5757, "step": 316 }, { "epoch": 2.533, "grad_norm": 0.32117587327957153, "learning_rate": 1.2203991529911197e-05, "loss": 0.5182, "step": 317 }, { "epoch": 2.541, "grad_norm": 0.31478121876716614, "learning_rate": 1.1795197504414656e-05, "loss": 0.542, "step": 318 }, { "epoch": 2.549, "grad_norm": 0.302287220954895, "learning_rate": 1.1392937846586215e-05, "loss": 0.5275, "step": 319 }, { "epoch": 2.557, "grad_norm": 0.3095117509365082, "learning_rate": 1.0997242356532334e-05, "loss": 0.5534, "step": 320 }, { "epoch": 2.565, "grad_norm": 0.3028477728366852, "learning_rate": 1.0608140348074292e-05, "loss": 0.5463, "step": 321 }, { "epoch": 2.573, "grad_norm": 0.30029499530792236, "learning_rate": 1.0225660646576629e-05, "loss": 0.5449, "step": 322 }, { "epoch": 2.581, "grad_norm": 0.30782079696655273, "learning_rate": 9.849831586811598e-06, "loss": 0.5508, "step": 323 }, { "epoch": 2.589, "grad_norm": 0.2926454544067383, "learning_rate": 9.48068101086026e-06, "loss": 0.5467, "step": 324 }, { "epoch": 2.597, "grad_norm": 0.3094619810581207, "learning_rate": 9.118236266049707e-06, "loss": 0.5496, "step": 325 }, { "epoch": 2.605, "grad_norm": 0.30689892172813416, "learning_rate": 8.76252420292728e-06, "loss": 0.565, "step": 326 }, { "epoch": 2.613, "grad_norm": 0.29666024446487427, "learning_rate": 8.413571173271295e-06, "loss": 0.5264, "step": 327 }, { "epoch": 2.621, "grad_norm": 0.2976740598678589, "learning_rate": 8.071403028138968e-06, "loss": 0.5544, "step": 328 }, { "epoch": 2.629, "grad_norm": 0.2889856994152069, "learning_rate": 7.736045115951251e-06, "loss": 0.5363, "step": 329 }, { "epoch": 2.637, "grad_norm": 0.3127500116825104, "learning_rate": 7.40752228061502e-06, "loss": 0.5636, "step": 330 }, { "epoch": 2.645, "grad_norm": 0.31198227405548096, "learning_rate": 7.085858859682571e-06, "loss": 0.5518, "step": 331 }, { "epoch": 2.653, "grad_norm": 0.318718284368515, "learning_rate": 6.7710786825486705e-06, "loss": 0.5456, "step": 332 }, { "epoch": 2.661, "grad_norm": 0.3112807869911194, "learning_rate": 6.463205068685174e-06, "loss": 0.5297, "step": 333 }, { "epoch": 2.669, "grad_norm": 0.30573782324790955, "learning_rate": 6.16226082591359e-06, "loss": 0.5353, "step": 334 }, { "epoch": 2.677, "grad_norm": 0.3077561855316162, "learning_rate": 5.868268248715292e-06, "loss": 0.5449, "step": 335 }, { "epoch": 2.685, "grad_norm": 0.3065972626209259, "learning_rate": 5.5812491165800675e-06, "loss": 0.5467, "step": 336 }, { "epoch": 2.693, "grad_norm": 0.3004700541496277, "learning_rate": 5.3012246923924816e-06, "loss": 0.5394, "step": 337 }, { "epoch": 2.701, "grad_norm": 0.30005961656570435, "learning_rate": 5.028215720856821e-06, "loss": 0.5605, "step": 338 }, { "epoch": 2.709, "grad_norm": 0.3056250810623169, "learning_rate": 4.762242426960262e-06, "loss": 0.5388, "step": 339 }, { "epoch": 2.717, "grad_norm": 0.3118292987346649, "learning_rate": 4.503324514474483e-06, "loss": 0.5206, "step": 340 }, { "epoch": 2.725, "grad_norm": 0.29597511887550354, "learning_rate": 4.251481164496074e-06, "loss": 0.5595, "step": 341 }, { "epoch": 2.733, "grad_norm": 0.29819923639297485, "learning_rate": 4.006731034025546e-06, "loss": 0.5391, "step": 342 }, { "epoch": 2.741, "grad_norm": 0.3268795907497406, "learning_rate": 3.769092254585138e-06, "loss": 0.5552, "step": 343 }, { "epoch": 2.749, "grad_norm": 0.3218269646167755, "learning_rate": 3.5385824308756587e-06, "loss": 0.5511, "step": 344 }, { "epoch": 2.757, "grad_norm": 0.29198867082595825, "learning_rate": 3.3152186394722505e-06, "loss": 0.5419, "step": 345 }, { "epoch": 2.765, "grad_norm": 0.30343398451805115, "learning_rate": 3.099017427559392e-06, "loss": 0.5283, "step": 346 }, { "epoch": 2.773, "grad_norm": 0.30696964263916016, "learning_rate": 2.889994811704966e-06, "loss": 0.5488, "step": 347 }, { "epoch": 2.781, "grad_norm": 0.30184078216552734, "learning_rate": 2.688166276673809e-06, "loss": 0.5294, "step": 348 }, { "epoch": 2.789, "grad_norm": 0.29285117983818054, "learning_rate": 2.493546774280531e-06, "loss": 0.5586, "step": 349 }, { "epoch": 2.797, "grad_norm": 0.3032819926738739, "learning_rate": 2.30615072228183e-06, "loss": 0.5252, "step": 350 }, { "epoch": 2.805, "grad_norm": 0.2997422516345978, "learning_rate": 2.1259920033084745e-06, "loss": 0.5515, "step": 351 }, { "epoch": 2.8129999999999997, "grad_norm": 0.29675498604774475, "learning_rate": 1.9530839638367995e-06, "loss": 0.5636, "step": 352 }, { "epoch": 2.8209999999999997, "grad_norm": 0.30580922961235046, "learning_rate": 1.7874394131999427e-06, "loss": 0.55, "step": 353 }, { "epoch": 2.8289999999999997, "grad_norm": 0.298286497592926, "learning_rate": 1.6290706226390285e-06, "loss": 0.5306, "step": 354 }, { "epoch": 2.8369999999999997, "grad_norm": 0.30674898624420166, "learning_rate": 1.4779893243939359e-06, "loss": 0.565, "step": 355 }, { "epoch": 2.8449999999999998, "grad_norm": 0.30078810453414917, "learning_rate": 1.334206710834296e-06, "loss": 0.5152, "step": 356 }, { "epoch": 2.8529999999999998, "grad_norm": 0.2936237156391144, "learning_rate": 1.1977334336302438e-06, "loss": 0.5393, "step": 357 }, { "epoch": 2.8609999999999998, "grad_norm": 0.30025407671928406, "learning_rate": 1.068579602963371e-06, "loss": 0.5522, "step": 358 }, { "epoch": 2.8689999999999998, "grad_norm": 0.31208300590515137, "learning_rate": 9.46754786777726e-07, "loss": 0.5616, "step": 359 }, { "epoch": 2.877, "grad_norm": 0.29706162214279175, "learning_rate": 8.322680100710023e-07, "loss": 0.5437, "step": 360 }, { "epoch": 2.885, "grad_norm": 0.30245542526245117, "learning_rate": 7.251277542259849e-07, "loss": 0.5608, "step": 361 }, { "epoch": 2.893, "grad_norm": 0.2977691888809204, "learning_rate": 6.253419563821972e-07, "loss": 0.554, "step": 362 }, { "epoch": 2.901, "grad_norm": 0.32129979133605957, "learning_rate": 5.329180088478935e-07, "loss": 0.5557, "step": 363 }, { "epoch": 2.909, "grad_norm": 0.3130888044834137, "learning_rate": 4.4786275855247527e-07, "loss": 0.5433, "step": 364 }, { "epoch": 2.917, "grad_norm": 0.2962971329689026, "learning_rate": 3.701825065392184e-07, "loss": 0.5362, "step": 365 }, { "epoch": 2.925, "grad_norm": 0.29684919118881226, "learning_rate": 2.998830074984915e-07, "loss": 0.5295, "step": 366 }, { "epoch": 2.933, "grad_norm": 0.3092200756072998, "learning_rate": 2.369694693414304e-07, "loss": 0.5645, "step": 367 }, { "epoch": 2.941, "grad_norm": 0.2915026843547821, "learning_rate": 1.8144655281413513e-07, "loss": 0.5491, "step": 368 }, { "epoch": 2.949, "grad_norm": 0.310531347990036, "learning_rate": 1.333183711524133e-07, "loss": 0.5447, "step": 369 }, { "epoch": 2.957, "grad_norm": 0.28913000226020813, "learning_rate": 9.258848977700129e-08, "loss": 0.5251, "step": 370 }, { "epoch": 2.965, "grad_norm": 0.28818029165267944, "learning_rate": 5.925992602952013e-08, "loss": 0.5324, "step": 371 }, { "epoch": 2.973, "grad_norm": 0.31184566020965576, "learning_rate": 3.333514894887646e-08, "loss": 0.5557, "step": 372 }, { "epoch": 2.981, "grad_norm": 0.29842570424079895, "learning_rate": 1.4816079088375567e-08, "loss": 0.5384, "step": 373 }, { "epoch": 2.989, "grad_norm": 0.3020179867744446, "learning_rate": 3.7040883734462683e-09, "loss": 0.5352, "step": 374 }, { "epoch": 2.997, "grad_norm": 0.28850436210632324, "learning_rate": 0.0, "loss": 0.5472, "step": 375 } ], "logging_steps": 1, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.219477097775104e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }