{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008008008008008008, "grad_norm": 1.6445057392120361, "learning_rate": 2e-05, "loss": 2.3547, "step": 1 }, { "epoch": 0.016016016016016016, "grad_norm": 1.63363778591156, "learning_rate": 4e-05, "loss": 2.3812, "step": 2 }, { "epoch": 0.024024024024024024, "grad_norm": 1.6492197513580322, "learning_rate": 6e-05, "loss": 2.3399, "step": 3 }, { "epoch": 0.03203203203203203, "grad_norm": 1.6518611907958984, "learning_rate": 8e-05, "loss": 2.3172, "step": 4 }, { "epoch": 0.04004004004004004, "grad_norm": 1.7173571586608887, "learning_rate": 0.0001, "loss": 2.2563, "step": 5 }, { "epoch": 0.04804804804804805, "grad_norm": 1.563859224319458, "learning_rate": 0.00012, "loss": 2.0256, "step": 6 }, { "epoch": 0.056056056056056056, "grad_norm": 1.5590581893920898, "learning_rate": 0.00014, "loss": 1.8324, "step": 7 }, { "epoch": 0.06406406406406406, "grad_norm": 1.5127277374267578, "learning_rate": 0.00016, "loss": 1.5787, "step": 8 }, { "epoch": 0.07207207207207207, "grad_norm": 1.5447226762771606, "learning_rate": 0.00018, "loss": 1.3826, "step": 9 }, { "epoch": 0.08008008008008008, "grad_norm": 4.600811004638672, "learning_rate": 0.0002, "loss": 1.2388, "step": 10 }, { "epoch": 0.08808808808808809, "grad_norm": 1.6333264112472534, "learning_rate": 0.00019999629591162656, "loss": 1.0977, "step": 11 }, { "epoch": 0.0960960960960961, "grad_norm": 1.5325253009796143, "learning_rate": 0.00019998518392091164, "loss": 1.0178, "step": 12 }, { "epoch": 0.1041041041041041, "grad_norm": 1.866473913192749, "learning_rate": 0.00019996666485105113, "loss": 0.9454, "step": 13 }, { "epoch": 0.11211211211211211, "grad_norm": 1.450692892074585, "learning_rate": 0.0001999407400739705, "loss": 0.8514, "step": 14 }, { "epoch": 0.12012012012012012, "grad_norm": 5.149086952209473, "learning_rate": 0.00019990741151022301, "loss": 0.9136, "step": 15 }, { "epoch": 0.12812812812812813, "grad_norm": 0.7399551272392273, "learning_rate": 0.00019986668162884762, "loss": 0.8742, "step": 16 }, { "epoch": 0.13613613613613615, "grad_norm": 0.6142033934593201, "learning_rate": 0.00019981855344718588, "loss": 0.8082, "step": 17 }, { "epoch": 0.14414414414414414, "grad_norm": 0.47605425119400024, "learning_rate": 0.00019976303053065859, "loss": 0.8019, "step": 18 }, { "epoch": 0.15215215215215216, "grad_norm": 0.3993614614009857, "learning_rate": 0.00019970011699250152, "loss": 0.7625, "step": 19 }, { "epoch": 0.16016016016016016, "grad_norm": 0.4947199821472168, "learning_rate": 0.00019962981749346078, "loss": 0.7419, "step": 20 }, { "epoch": 0.16816816816816818, "grad_norm": 0.549526572227478, "learning_rate": 0.00019955213724144754, "loss": 0.7468, "step": 21 }, { "epoch": 0.17617617617617617, "grad_norm": 0.34314435720443726, "learning_rate": 0.00019946708199115211, "loss": 0.7482, "step": 22 }, { "epoch": 0.1841841841841842, "grad_norm": 0.38283613324165344, "learning_rate": 0.00019937465804361783, "loss": 0.7304, "step": 23 }, { "epoch": 0.1921921921921922, "grad_norm": 0.28871795535087585, "learning_rate": 0.00019927487224577402, "loss": 0.746, "step": 24 }, { "epoch": 0.2002002002002002, "grad_norm": 0.321494996547699, "learning_rate": 0.000199167731989929, "loss": 0.7461, "step": 25 }, { "epoch": 0.2082082082082082, "grad_norm": 0.315449982881546, "learning_rate": 0.0001990532452132223, "loss": 0.7286, "step": 26 }, { "epoch": 0.21621621621621623, "grad_norm": 0.2904318571090698, "learning_rate": 0.00019893142039703664, "loss": 0.7119, "step": 27 }, { "epoch": 0.22422422422422422, "grad_norm": 0.27874529361724854, "learning_rate": 0.00019880226656636977, "loss": 0.7105, "step": 28 }, { "epoch": 0.23223223223223224, "grad_norm": 0.2948579490184784, "learning_rate": 0.0001986657932891657, "loss": 0.6976, "step": 29 }, { "epoch": 0.24024024024024024, "grad_norm": 0.2542964220046997, "learning_rate": 0.00019852201067560606, "loss": 0.7351, "step": 30 }, { "epoch": 0.24824824824824826, "grad_norm": 0.2960706353187561, "learning_rate": 0.000198370929377361, "loss": 0.7179, "step": 31 }, { "epoch": 0.25625625625625625, "grad_norm": 0.24776384234428406, "learning_rate": 0.00019821256058680006, "loss": 0.7134, "step": 32 }, { "epoch": 0.26426426426426425, "grad_norm": 0.33054184913635254, "learning_rate": 0.00019804691603616324, "loss": 0.6995, "step": 33 }, { "epoch": 0.2722722722722723, "grad_norm": 0.2543237805366516, "learning_rate": 0.00019787400799669154, "loss": 0.7081, "step": 34 }, { "epoch": 0.2802802802802803, "grad_norm": 0.25240710377693176, "learning_rate": 0.0001976938492777182, "loss": 0.6928, "step": 35 }, { "epoch": 0.2882882882882883, "grad_norm": 0.35880276560783386, "learning_rate": 0.0001975064532257195, "loss": 0.7177, "step": 36 }, { "epoch": 0.2962962962962963, "grad_norm": 0.3675362467765808, "learning_rate": 0.0001973118337233262, "loss": 0.6865, "step": 37 }, { "epoch": 0.30430430430430433, "grad_norm": 0.3688451051712036, "learning_rate": 0.00019711000518829507, "loss": 0.6724, "step": 38 }, { "epoch": 0.3123123123123123, "grad_norm": 0.2982208728790283, "learning_rate": 0.00019690098257244064, "loss": 0.671, "step": 39 }, { "epoch": 0.3203203203203203, "grad_norm": 0.24197936058044434, "learning_rate": 0.00019668478136052774, "loss": 0.6777, "step": 40 }, { "epoch": 0.3283283283283283, "grad_norm": 0.748349130153656, "learning_rate": 0.00019646141756912434, "loss": 0.6641, "step": 41 }, { "epoch": 0.33633633633633636, "grad_norm": 0.5585939288139343, "learning_rate": 0.00019623090774541487, "loss": 0.6988, "step": 42 }, { "epoch": 0.34434434434434436, "grad_norm": 0.40285471081733704, "learning_rate": 0.00019599326896597448, "loss": 0.6811, "step": 43 }, { "epoch": 0.35235235235235235, "grad_norm": 0.25714346766471863, "learning_rate": 0.00019574851883550395, "loss": 0.6913, "step": 44 }, { "epoch": 0.36036036036036034, "grad_norm": 0.4926215708255768, "learning_rate": 0.00019549667548552556, "loss": 0.6707, "step": 45 }, { "epoch": 0.3683683683683684, "grad_norm": 0.3760850429534912, "learning_rate": 0.00019523775757303974, "loss": 0.6809, "step": 46 }, { "epoch": 0.3763763763763764, "grad_norm": 0.3734811842441559, "learning_rate": 0.0001949717842791432, "loss": 0.6386, "step": 47 }, { "epoch": 0.3843843843843844, "grad_norm": 0.3447561264038086, "learning_rate": 0.00019469877530760754, "loss": 0.6955, "step": 48 }, { "epoch": 0.3923923923923924, "grad_norm": 0.2680707573890686, "learning_rate": 0.00019441875088341997, "loss": 0.6625, "step": 49 }, { "epoch": 0.4004004004004004, "grad_norm": 0.2692941725254059, "learning_rate": 0.00019413173175128473, "loss": 0.66, "step": 50 }, { "epoch": 0.4084084084084084, "grad_norm": 0.32329630851745605, "learning_rate": 0.00019383773917408642, "loss": 0.6612, "step": 51 }, { "epoch": 0.4164164164164164, "grad_norm": 0.281435489654541, "learning_rate": 0.00019353679493131485, "loss": 0.6621, "step": 52 }, { "epoch": 0.4244244244244244, "grad_norm": 0.22186556458473206, "learning_rate": 0.00019322892131745135, "loss": 0.6465, "step": 53 }, { "epoch": 0.43243243243243246, "grad_norm": 0.2902645468711853, "learning_rate": 0.00019291414114031743, "loss": 0.6693, "step": 54 }, { "epoch": 0.44044044044044045, "grad_norm": 0.2899124324321747, "learning_rate": 0.000192592477719385, "loss": 0.6568, "step": 55 }, { "epoch": 0.44844844844844844, "grad_norm": 0.2124062478542328, "learning_rate": 0.00019226395488404876, "loss": 0.6724, "step": 56 }, { "epoch": 0.45645645645645644, "grad_norm": 0.23896393179893494, "learning_rate": 0.00019192859697186106, "loss": 0.6459, "step": 57 }, { "epoch": 0.4644644644644645, "grad_norm": 0.2762405574321747, "learning_rate": 0.00019158642882672873, "loss": 0.6498, "step": 58 }, { "epoch": 0.4724724724724725, "grad_norm": 0.2079935222864151, "learning_rate": 0.00019123747579707275, "loss": 0.6604, "step": 59 }, { "epoch": 0.4804804804804805, "grad_norm": 0.23864208161830902, "learning_rate": 0.0001908817637339503, "loss": 0.6378, "step": 60 }, { "epoch": 0.48848848848848847, "grad_norm": 0.21718506515026093, "learning_rate": 0.00019051931898913976, "loss": 0.6424, "step": 61 }, { "epoch": 0.4964964964964965, "grad_norm": 0.2773915231227875, "learning_rate": 0.0001901501684131884, "loss": 0.6474, "step": 62 }, { "epoch": 0.5045045045045045, "grad_norm": 0.23982493579387665, "learning_rate": 0.0001897743393534234, "loss": 0.6256, "step": 63 }, { "epoch": 0.5125125125125125, "grad_norm": 0.23621873557567596, "learning_rate": 0.0001893918596519257, "loss": 0.6403, "step": 64 }, { "epoch": 0.5205205205205206, "grad_norm": 0.22759953141212463, "learning_rate": 0.00018900275764346768, "loss": 0.6484, "step": 65 }, { "epoch": 0.5285285285285285, "grad_norm": 0.26695549488067627, "learning_rate": 0.00018860706215341382, "loss": 0.609, "step": 66 }, { "epoch": 0.5365365365365365, "grad_norm": 0.24594709277153015, "learning_rate": 0.00018820480249558537, "loss": 0.6338, "step": 67 }, { "epoch": 0.5445445445445446, "grad_norm": 0.22960062325000763, "learning_rate": 0.00018779600847008884, "loss": 0.6166, "step": 68 }, { "epoch": 0.5525525525525525, "grad_norm": 0.25302109122276306, "learning_rate": 0.00018738071036110808, "loss": 0.6422, "step": 69 }, { "epoch": 0.5605605605605606, "grad_norm": 0.3339892327785492, "learning_rate": 0.0001869589389346611, "loss": 0.6558, "step": 70 }, { "epoch": 0.5685685685685685, "grad_norm": 0.21397258341312408, "learning_rate": 0.00018653072543632062, "loss": 0.6323, "step": 71 }, { "epoch": 0.5765765765765766, "grad_norm": 0.2514493465423584, "learning_rate": 0.00018609610158889942, "loss": 0.657, "step": 72 }, { "epoch": 0.5845845845845846, "grad_norm": 0.25317835807800293, "learning_rate": 0.00018565509959010036, "loss": 0.641, "step": 73 }, { "epoch": 0.5925925925925926, "grad_norm": 0.22669494152069092, "learning_rate": 0.00018520775211013093, "loss": 0.6369, "step": 74 }, { "epoch": 0.6006006006006006, "grad_norm": 0.2214743047952652, "learning_rate": 0.00018475409228928312, "loss": 0.6307, "step": 75 }, { "epoch": 0.6086086086086087, "grad_norm": 0.24376747012138367, "learning_rate": 0.00018429415373547828, "loss": 0.6557, "step": 76 }, { "epoch": 0.6166166166166166, "grad_norm": 0.2158333659172058, "learning_rate": 0.00018382797052177746, "loss": 0.655, "step": 77 }, { "epoch": 0.6246246246246246, "grad_norm": 0.25565382838249207, "learning_rate": 0.000183355577183857, "loss": 0.6299, "step": 78 }, { "epoch": 0.6326326326326326, "grad_norm": 0.20636747777462006, "learning_rate": 0.00018287700871745036, "loss": 0.6283, "step": 79 }, { "epoch": 0.6406406406406406, "grad_norm": 0.21258121728897095, "learning_rate": 0.00018239230057575542, "loss": 0.6174, "step": 80 }, { "epoch": 0.6486486486486487, "grad_norm": 0.2861458957195282, "learning_rate": 0.00018190148866680802, "loss": 0.6547, "step": 81 }, { "epoch": 0.6566566566566566, "grad_norm": 0.23667441308498383, "learning_rate": 0.0001814046093508218, "loss": 0.6416, "step": 82 }, { "epoch": 0.6646646646646647, "grad_norm": 0.23191799223423004, "learning_rate": 0.00018090169943749476, "loss": 0.642, "step": 83 }, { "epoch": 0.6726726726726727, "grad_norm": 0.2622171938419342, "learning_rate": 0.00018039279618328212, "loss": 0.6241, "step": 84 }, { "epoch": 0.6806806806806807, "grad_norm": 0.2891266345977783, "learning_rate": 0.00017987793728863651, "loss": 0.6284, "step": 85 }, { "epoch": 0.6886886886886887, "grad_norm": 0.26767420768737793, "learning_rate": 0.00017935716089521474, "loss": 0.627, "step": 86 }, { "epoch": 0.6966966966966966, "grad_norm": 0.2828672230243683, "learning_rate": 0.00017883050558305255, "loss": 0.6418, "step": 87 }, { "epoch": 0.7047047047047047, "grad_norm": 0.32730573415756226, "learning_rate": 0.00017829801036770628, "loss": 0.6629, "step": 88 }, { "epoch": 0.7127127127127127, "grad_norm": 0.24029900133609772, "learning_rate": 0.0001777597146973627, "loss": 0.614, "step": 89 }, { "epoch": 0.7207207207207207, "grad_norm": 0.2929212152957916, "learning_rate": 0.00017721565844991643, "loss": 0.632, "step": 90 }, { "epoch": 0.7287287287287287, "grad_norm": 0.2860666513442993, "learning_rate": 0.00017666588193001595, "loss": 0.6289, "step": 91 }, { "epoch": 0.7367367367367368, "grad_norm": 0.23325330018997192, "learning_rate": 0.00017611042586607748, "loss": 0.6392, "step": 92 }, { "epoch": 0.7447447447447447, "grad_norm": 0.3126169443130493, "learning_rate": 0.00017554933140726802, "loss": 0.6422, "step": 93 }, { "epoch": 0.7527527527527528, "grad_norm": 0.26704883575439453, "learning_rate": 0.00017498264012045687, "loss": 0.6166, "step": 94 }, { "epoch": 0.7607607607607607, "grad_norm": 0.2184283286333084, "learning_rate": 0.00017441039398713608, "loss": 0.6235, "step": 95 }, { "epoch": 0.7687687687687688, "grad_norm": 0.23906390368938446, "learning_rate": 0.00017383263540031067, "loss": 0.6643, "step": 96 }, { "epoch": 0.7767767767767768, "grad_norm": 0.26839691400527954, "learning_rate": 0.0001732494071613579, "loss": 0.6514, "step": 97 }, { "epoch": 0.7847847847847848, "grad_norm": 0.2805701494216919, "learning_rate": 0.00017266075247685656, "loss": 0.6168, "step": 98 }, { "epoch": 0.7927927927927928, "grad_norm": 0.21650992333889008, "learning_rate": 0.00017206671495538612, "loss": 0.5983, "step": 99 }, { "epoch": 0.8008008008008008, "grad_norm": 0.2302800416946411, "learning_rate": 0.00017146733860429612, "loss": 0.6301, "step": 100 }, { "epoch": 0.8088088088088088, "grad_norm": 0.29078415036201477, "learning_rate": 0.000170862667826446, "loss": 0.616, "step": 101 }, { "epoch": 0.8168168168168168, "grad_norm": 0.24860034883022308, "learning_rate": 0.0001702527474169157, "loss": 0.6352, "step": 102 }, { "epoch": 0.8248248248248248, "grad_norm": 0.26281973719596863, "learning_rate": 0.00016963762255968722, "loss": 0.6218, "step": 103 }, { "epoch": 0.8328328328328328, "grad_norm": 0.29051998257637024, "learning_rate": 0.0001690173388242972, "loss": 0.6233, "step": 104 }, { "epoch": 0.8408408408408409, "grad_norm": 0.2471507042646408, "learning_rate": 0.00016839194216246108, "loss": 0.6147, "step": 105 }, { "epoch": 0.8488488488488488, "grad_norm": 0.2574704587459564, "learning_rate": 0.0001677614789046689, "loss": 0.6174, "step": 106 }, { "epoch": 0.8568568568568569, "grad_norm": 0.2551233172416687, "learning_rate": 0.00016712599575675316, "loss": 0.5989, "step": 107 }, { "epoch": 0.8648648648648649, "grad_norm": 0.2901318371295929, "learning_rate": 0.00016648553979642868, "loss": 0.6241, "step": 108 }, { "epoch": 0.8728728728728729, "grad_norm": 0.23769080638885498, "learning_rate": 0.0001658401584698049, "loss": 0.6044, "step": 109 }, { "epoch": 0.8808808808808809, "grad_norm": 0.2580976188182831, "learning_rate": 0.00016518989958787126, "loss": 0.622, "step": 110 }, { "epoch": 0.8888888888888888, "grad_norm": 0.24077744781970978, "learning_rate": 0.00016453481132295506, "loss": 0.6047, "step": 111 }, { "epoch": 0.8968968968968969, "grad_norm": 0.228902667760849, "learning_rate": 0.00016387494220515274, "loss": 0.6138, "step": 112 }, { "epoch": 0.9049049049049049, "grad_norm": 0.2607581317424774, "learning_rate": 0.00016321034111873488, "loss": 0.6307, "step": 113 }, { "epoch": 0.9129129129129129, "grad_norm": 0.2575569450855255, "learning_rate": 0.00016254105729852464, "loss": 0.6008, "step": 114 }, { "epoch": 0.9209209209209209, "grad_norm": 0.231553852558136, "learning_rate": 0.00016186714032625035, "loss": 0.617, "step": 115 }, { "epoch": 0.928928928928929, "grad_norm": 0.24820354580879211, "learning_rate": 0.00016118864012687245, "loss": 0.5991, "step": 116 }, { "epoch": 0.9369369369369369, "grad_norm": 0.2364109754562378, "learning_rate": 0.00016050560696488492, "loss": 0.6094, "step": 117 }, { "epoch": 0.944944944944945, "grad_norm": 0.2492029368877411, "learning_rate": 0.00015981809144059166, "loss": 0.6143, "step": 118 }, { "epoch": 0.9529529529529529, "grad_norm": 0.27745717763900757, "learning_rate": 0.00015912614448635782, "loss": 0.6203, "step": 119 }, { "epoch": 0.960960960960961, "grad_norm": 0.2555610239505768, "learning_rate": 0.00015842981736283686, "loss": 0.6314, "step": 120 }, { "epoch": 0.968968968968969, "grad_norm": 0.2268420308828354, "learning_rate": 0.00015772916165517273, "loss": 0.6155, "step": 121 }, { "epoch": 0.9769769769769769, "grad_norm": 0.250041127204895, "learning_rate": 0.00015702422926917872, "loss": 0.6226, "step": 122 }, { "epoch": 0.984984984984985, "grad_norm": 0.2596072554588318, "learning_rate": 0.00015631507242749187, "loss": 0.6086, "step": 123 }, { "epoch": 0.992992992992993, "grad_norm": 0.2280743271112442, "learning_rate": 0.00015560174366570446, "loss": 0.5994, "step": 124 }, { "epoch": 1.001001001001001, "grad_norm": 0.23362237215042114, "learning_rate": 0.00015488429582847192, "loss": 0.616, "step": 125 }, { "epoch": 1.006, "grad_norm": 0.2956937849521637, "learning_rate": 0.00015416278206559816, "loss": 0.6038, "step": 126 }, { "epoch": 1.014, "grad_norm": 0.250629723072052, "learning_rate": 0.0001534372558280979, "loss": 0.5991, "step": 127 }, { "epoch": 1.022, "grad_norm": 0.231906458735466, "learning_rate": 0.00015270777086423722, "loss": 0.6088, "step": 128 }, { "epoch": 1.03, "grad_norm": 0.2888093590736389, "learning_rate": 0.0001519743812155516, "loss": 0.5892, "step": 129 }, { "epoch": 1.038, "grad_norm": 0.24940524995326996, "learning_rate": 0.0001512371412128424, "loss": 0.5982, "step": 130 }, { "epoch": 1.046, "grad_norm": 0.24017778038978577, "learning_rate": 0.00015049610547215205, "loss": 0.5608, "step": 131 }, { "epoch": 1.054, "grad_norm": 0.2334035485982895, "learning_rate": 0.00014975132889071807, "loss": 0.6034, "step": 132 }, { "epoch": 1.062, "grad_norm": 0.2773897349834442, "learning_rate": 0.00014900286664290592, "loss": 0.6387, "step": 133 }, { "epoch": 1.07, "grad_norm": 0.24266445636749268, "learning_rate": 0.00014825077417612186, "loss": 0.5612, "step": 134 }, { "epoch": 1.078, "grad_norm": 0.22919470071792603, "learning_rate": 0.00014749510720670506, "loss": 0.599, "step": 135 }, { "epoch": 1.086, "grad_norm": 0.23829148709774017, "learning_rate": 0.00014673592171580025, "loss": 0.6066, "step": 136 }, { "epoch": 1.094, "grad_norm": 0.31981223821640015, "learning_rate": 0.00014597327394521044, "loss": 0.5692, "step": 137 }, { "epoch": 1.102, "grad_norm": 0.2747564911842346, "learning_rate": 0.00014520722039323045, "loss": 0.62, "step": 138 }, { "epoch": 1.11, "grad_norm": 0.2592499852180481, "learning_rate": 0.00014443781781046136, "loss": 0.5937, "step": 139 }, { "epoch": 1.1179999999999999, "grad_norm": 0.31891530752182007, "learning_rate": 0.0001436651231956064, "loss": 0.5973, "step": 140 }, { "epoch": 1.126, "grad_norm": 0.2743702232837677, "learning_rate": 0.00014288919379124837, "loss": 0.6045, "step": 141 }, { "epoch": 1.134, "grad_norm": 0.2665708661079407, "learning_rate": 0.00014211008707960897, "loss": 0.5898, "step": 142 }, { "epoch": 1.142, "grad_norm": 0.33267003297805786, "learning_rate": 0.00014132786077829043, "loss": 0.5945, "step": 143 }, { "epoch": 1.15, "grad_norm": 0.28636589646339417, "learning_rate": 0.00014054257283599973, "loss": 0.5914, "step": 144 }, { "epoch": 1.158, "grad_norm": 0.27305400371551514, "learning_rate": 0.0001397542814282556, "loss": 0.6093, "step": 145 }, { "epoch": 1.166, "grad_norm": 0.2839919924736023, "learning_rate": 0.0001389630449530788, "loss": 0.6074, "step": 146 }, { "epoch": 1.174, "grad_norm": 0.25652188062667847, "learning_rate": 0.0001381689220266659, "loss": 0.6059, "step": 147 }, { "epoch": 1.182, "grad_norm": 0.2549704909324646, "learning_rate": 0.0001373719714790469, "loss": 0.5568, "step": 148 }, { "epoch": 1.19, "grad_norm": 0.2509196400642395, "learning_rate": 0.00013657225234972695, "loss": 0.5968, "step": 149 }, { "epoch": 1.198, "grad_norm": 0.25909289717674255, "learning_rate": 0.0001357698238833126, "loss": 0.5902, "step": 150 }, { "epoch": 1.206, "grad_norm": 0.26162394881248474, "learning_rate": 0.00013496474552512287, "loss": 0.5763, "step": 151 }, { "epoch": 1.214, "grad_norm": 0.2721655070781708, "learning_rate": 0.00013415707691678556, "loss": 0.6037, "step": 152 }, { "epoch": 1.222, "grad_norm": 0.28691592812538147, "learning_rate": 0.0001333468778918187, "loss": 0.625, "step": 153 }, { "epoch": 1.23, "grad_norm": 0.25801119208335876, "learning_rate": 0.00013253420847119803, "loss": 0.5994, "step": 154 }, { "epoch": 1.238, "grad_norm": 0.2609824538230896, "learning_rate": 0.00013171912885891063, "loss": 0.5999, "step": 155 }, { "epoch": 1.246, "grad_norm": 0.2631840407848358, "learning_rate": 0.00013090169943749476, "loss": 0.5844, "step": 156 }, { "epoch": 1.254, "grad_norm": 0.2862647473812103, "learning_rate": 0.00013008198076356676, "loss": 0.5653, "step": 157 }, { "epoch": 1.262, "grad_norm": 0.27327024936676025, "learning_rate": 0.00012926003356333488, "loss": 0.5933, "step": 158 }, { "epoch": 1.27, "grad_norm": 0.2684379518032074, "learning_rate": 0.0001284359187281004, "loss": 0.5842, "step": 159 }, { "epoch": 1.278, "grad_norm": 0.2620231807231903, "learning_rate": 0.00012760969730974694, "loss": 0.6079, "step": 160 }, { "epoch": 1.286, "grad_norm": 0.2584543526172638, "learning_rate": 0.00012678143051621742, "loss": 0.5941, "step": 161 }, { "epoch": 1.294, "grad_norm": 0.26279789209365845, "learning_rate": 0.00012595117970697997, "loss": 0.6086, "step": 162 }, { "epoch": 1.302, "grad_norm": 0.2541520893573761, "learning_rate": 0.00012511900638848195, "loss": 0.5907, "step": 163 }, { "epoch": 1.31, "grad_norm": 0.27384746074676514, "learning_rate": 0.0001242849722095936, "loss": 0.6055, "step": 164 }, { "epoch": 1.318, "grad_norm": 0.25682300329208374, "learning_rate": 0.00012344913895704097, "loss": 0.609, "step": 165 }, { "epoch": 1.326, "grad_norm": 0.25404492020606995, "learning_rate": 0.00012261156855082882, "loss": 0.6121, "step": 166 }, { "epoch": 1.334, "grad_norm": 0.2649850845336914, "learning_rate": 0.0001217723230396532, "loss": 0.5695, "step": 167 }, { "epoch": 1.342, "grad_norm": 0.28349268436431885, "learning_rate": 0.00012093146459630487, "loss": 0.5883, "step": 168 }, { "epoch": 1.35, "grad_norm": 0.27068182826042175, "learning_rate": 0.00012008905551306356, "loss": 0.6147, "step": 169 }, { "epoch": 1.358, "grad_norm": 0.29566338658332825, "learning_rate": 0.000119245158197083, "loss": 0.5901, "step": 170 }, { "epoch": 1.366, "grad_norm": 0.27946797013282776, "learning_rate": 0.00011839983516576802, "loss": 0.5831, "step": 171 }, { "epoch": 1.374, "grad_norm": 0.25005409121513367, "learning_rate": 0.00011755314904214284, "loss": 0.5656, "step": 172 }, { "epoch": 1.3820000000000001, "grad_norm": 0.2621053159236908, "learning_rate": 0.00011670516255021193, "loss": 0.5769, "step": 173 }, { "epoch": 1.3900000000000001, "grad_norm": 0.2531629502773285, "learning_rate": 0.00011585593851031347, "loss": 0.5641, "step": 174 }, { "epoch": 1.3980000000000001, "grad_norm": 0.3026553690433502, "learning_rate": 0.00011500553983446527, "loss": 0.6038, "step": 175 }, { "epoch": 1.4060000000000001, "grad_norm": 0.2748616337776184, "learning_rate": 0.00011415402952170433, "loss": 0.5808, "step": 176 }, { "epoch": 1.414, "grad_norm": 0.27345311641693115, "learning_rate": 0.0001133014706534196, "loss": 0.5848, "step": 177 }, { "epoch": 1.422, "grad_norm": 0.273357629776001, "learning_rate": 0.00011244792638867893, "loss": 0.5906, "step": 178 }, { "epoch": 1.43, "grad_norm": 0.26230183243751526, "learning_rate": 0.00011159345995955006, "loss": 0.5886, "step": 179 }, { "epoch": 1.438, "grad_norm": 0.2515832483768463, "learning_rate": 0.00011073813466641632, "loss": 0.5876, "step": 180 }, { "epoch": 1.446, "grad_norm": 0.27451092004776, "learning_rate": 0.00010988201387328717, "loss": 0.5841, "step": 181 }, { "epoch": 1.454, "grad_norm": 0.2588571012020111, "learning_rate": 0.00010902516100310411, "loss": 0.5674, "step": 182 }, { "epoch": 1.462, "grad_norm": 0.2440604865550995, "learning_rate": 0.00010816763953304227, "loss": 0.5646, "step": 183 }, { "epoch": 1.47, "grad_norm": 0.2681560516357422, "learning_rate": 0.00010730951298980776, "loss": 0.5671, "step": 184 }, { "epoch": 1.478, "grad_norm": 0.297048419713974, "learning_rate": 0.00010645084494493165, "loss": 0.583, "step": 185 }, { "epoch": 1.486, "grad_norm": 0.29275089502334595, "learning_rate": 0.00010559169901006034, "loss": 0.6007, "step": 186 }, { "epoch": 1.494, "grad_norm": 0.2599998414516449, "learning_rate": 0.0001047321388322432, "loss": 0.5692, "step": 187 }, { "epoch": 1.502, "grad_norm": 0.2714841663837433, "learning_rate": 0.00010387222808921746, "loss": 0.5996, "step": 188 }, { "epoch": 1.51, "grad_norm": 0.2618089020252228, "learning_rate": 0.00010301203048469083, "loss": 0.5861, "step": 189 }, { "epoch": 1.518, "grad_norm": 0.27270275354385376, "learning_rate": 0.00010215160974362223, "loss": 0.5771, "step": 190 }, { "epoch": 1.526, "grad_norm": 0.2583703398704529, "learning_rate": 0.00010129102960750092, "loss": 0.5897, "step": 191 }, { "epoch": 1.534, "grad_norm": 0.25854265689849854, "learning_rate": 0.00010043035382962443, "loss": 0.5778, "step": 192 }, { "epoch": 1.542, "grad_norm": 0.27234703302383423, "learning_rate": 9.956964617037558e-05, "loss": 0.5659, "step": 193 }, { "epoch": 1.55, "grad_norm": 0.2672337293624878, "learning_rate": 9.870897039249911e-05, "loss": 0.5792, "step": 194 }, { "epoch": 1.558, "grad_norm": 0.2644350528717041, "learning_rate": 9.784839025637778e-05, "loss": 0.5798, "step": 195 }, { "epoch": 1.5659999999999998, "grad_norm": 0.2572389543056488, "learning_rate": 9.698796951530919e-05, "loss": 0.6008, "step": 196 }, { "epoch": 1.5739999999999998, "grad_norm": 0.2693156898021698, "learning_rate": 9.612777191078258e-05, "loss": 0.5903, "step": 197 }, { "epoch": 1.5819999999999999, "grad_norm": 0.26244068145751953, "learning_rate": 9.526786116775682e-05, "loss": 0.569, "step": 198 }, { "epoch": 1.5899999999999999, "grad_norm": 0.25875815749168396, "learning_rate": 9.440830098993969e-05, "loss": 0.6042, "step": 199 }, { "epoch": 1.5979999999999999, "grad_norm": 0.27209436893463135, "learning_rate": 9.354915505506839e-05, "loss": 0.5846, "step": 200 }, { "epoch": 1.6059999999999999, "grad_norm": 0.270780473947525, "learning_rate": 9.269048701019226e-05, "loss": 0.5957, "step": 201 }, { "epoch": 1.6139999999999999, "grad_norm": 0.25013670325279236, "learning_rate": 9.183236046695777e-05, "loss": 0.5845, "step": 202 }, { "epoch": 1.6219999999999999, "grad_norm": 0.2716957628726959, "learning_rate": 9.09748389968959e-05, "loss": 0.5584, "step": 203 }, { "epoch": 1.63, "grad_norm": 0.2737436890602112, "learning_rate": 9.011798612671286e-05, "loss": 0.5836, "step": 204 }, { "epoch": 1.638, "grad_norm": 0.2748481333255768, "learning_rate": 8.92618653335837e-05, "loss": 0.5927, "step": 205 }, { "epoch": 1.646, "grad_norm": 0.2642996609210968, "learning_rate": 8.840654004044996e-05, "loss": 0.6088, "step": 206 }, { "epoch": 1.654, "grad_norm": 0.26341068744659424, "learning_rate": 8.755207361132108e-05, "loss": 0.5841, "step": 207 }, { "epoch": 1.662, "grad_norm": 0.30515289306640625, "learning_rate": 8.669852934658042e-05, "loss": 0.5525, "step": 208 }, { "epoch": 1.67, "grad_norm": 0.2801468074321747, "learning_rate": 8.58459704782957e-05, "loss": 0.5547, "step": 209 }, { "epoch": 1.678, "grad_norm": 0.27618443965911865, "learning_rate": 8.499446016553474e-05, "loss": 0.5613, "step": 210 }, { "epoch": 1.686, "grad_norm": 0.26961931586265564, "learning_rate": 8.414406148968657e-05, "loss": 0.5639, "step": 211 }, { "epoch": 1.694, "grad_norm": 0.29023805260658264, "learning_rate": 8.32948374497881e-05, "loss": 0.5878, "step": 212 }, { "epoch": 1.702, "grad_norm": 0.2671249508857727, "learning_rate": 8.244685095785719e-05, "loss": 0.5743, "step": 213 }, { "epoch": 1.71, "grad_norm": 0.26124948263168335, "learning_rate": 8.160016483423199e-05, "loss": 0.5801, "step": 214 }, { "epoch": 1.718, "grad_norm": 0.2721916437149048, "learning_rate": 8.075484180291701e-05, "loss": 0.5975, "step": 215 }, { "epoch": 1.726, "grad_norm": 0.2630290687084198, "learning_rate": 7.991094448693648e-05, "loss": 0.5714, "step": 216 }, { "epoch": 1.734, "grad_norm": 0.25962400436401367, "learning_rate": 7.906853540369514e-05, "loss": 0.5912, "step": 217 }, { "epoch": 1.742, "grad_norm": 0.26142629981040955, "learning_rate": 7.822767696034682e-05, "loss": 0.577, "step": 218 }, { "epoch": 1.75, "grad_norm": 0.26839005947113037, "learning_rate": 7.738843144917119e-05, "loss": 0.5747, "step": 219 }, { "epoch": 1.758, "grad_norm": 0.28053659200668335, "learning_rate": 7.655086104295904e-05, "loss": 0.612, "step": 220 }, { "epoch": 1.766, "grad_norm": 0.2728564143180847, "learning_rate": 7.571502779040645e-05, "loss": 0.5895, "step": 221 }, { "epoch": 1.774, "grad_norm": 0.26337721943855286, "learning_rate": 7.48809936115181e-05, "loss": 0.5666, "step": 222 }, { "epoch": 1.782, "grad_norm": 0.2544887661933899, "learning_rate": 7.404882029302003e-05, "loss": 0.5552, "step": 223 }, { "epoch": 1.79, "grad_norm": 0.2690827250480652, "learning_rate": 7.321856948378259e-05, "loss": 0.5559, "step": 224 }, { "epoch": 1.798, "grad_norm": 0.27959364652633667, "learning_rate": 7.239030269025311e-05, "loss": 0.5738, "step": 225 }, { "epoch": 1.806, "grad_norm": 0.29118430614471436, "learning_rate": 7.156408127189965e-05, "loss": 0.5753, "step": 226 }, { "epoch": 1.814, "grad_norm": 0.2585116922855377, "learning_rate": 7.073996643666517e-05, "loss": 0.5499, "step": 227 }, { "epoch": 1.822, "grad_norm": 0.24918395280838013, "learning_rate": 6.991801923643324e-05, "loss": 0.5792, "step": 228 }, { "epoch": 1.83, "grad_norm": 0.2664697766304016, "learning_rate": 6.909830056250527e-05, "loss": 0.582, "step": 229 }, { "epoch": 1.838, "grad_norm": 0.2679463326931, "learning_rate": 6.82808711410894e-05, "loss": 0.5919, "step": 230 }, { "epoch": 1.846, "grad_norm": 0.26800620555877686, "learning_rate": 6.746579152880201e-05, "loss": 0.5774, "step": 231 }, { "epoch": 1.854, "grad_norm": 0.25649094581604004, "learning_rate": 6.665312210818131e-05, "loss": 0.5569, "step": 232 }, { "epoch": 1.862, "grad_norm": 0.2661426067352295, "learning_rate": 6.584292308321445e-05, "loss": 0.5759, "step": 233 }, { "epoch": 1.87, "grad_norm": 0.27011772990226746, "learning_rate": 6.503525447487715e-05, "loss": 0.5752, "step": 234 }, { "epoch": 1.8780000000000001, "grad_norm": 0.28006768226623535, "learning_rate": 6.423017611668745e-05, "loss": 0.5873, "step": 235 }, { "epoch": 1.8860000000000001, "grad_norm": 0.281974196434021, "learning_rate": 6.342774765027309e-05, "loss": 0.5868, "step": 236 }, { "epoch": 1.8940000000000001, "grad_norm": 0.28022536635398865, "learning_rate": 6.262802852095311e-05, "loss": 0.5505, "step": 237 }, { "epoch": 1.9020000000000001, "grad_norm": 0.2971389889717102, "learning_rate": 6.18310779733341e-05, "loss": 0.5879, "step": 238 }, { "epoch": 1.9100000000000001, "grad_norm": 0.2779372036457062, "learning_rate": 6.103695504692122e-05, "loss": 0.5648, "step": 239 }, { "epoch": 1.9180000000000001, "grad_norm": 0.3032248616218567, "learning_rate": 6.024571857174443e-05, "loss": 0.5884, "step": 240 }, { "epoch": 1.9260000000000002, "grad_norm": 0.26035642623901367, "learning_rate": 5.94574271640003e-05, "loss": 0.5837, "step": 241 }, { "epoch": 1.9340000000000002, "grad_norm": 0.2789236009120941, "learning_rate": 5.8672139221709577e-05, "loss": 0.5745, "step": 242 }, { "epoch": 1.942, "grad_norm": 0.27459922432899475, "learning_rate": 5.788991292039103e-05, "loss": 0.568, "step": 243 }, { "epoch": 1.95, "grad_norm": 0.2857699394226074, "learning_rate": 5.7110806208751655e-05, "loss": 0.5619, "step": 244 }, { "epoch": 1.958, "grad_norm": 0.2715800106525421, "learning_rate": 5.633487680439361e-05, "loss": 0.5763, "step": 245 }, { "epoch": 1.966, "grad_norm": 0.27073079347610474, "learning_rate": 5.556218218953868e-05, "loss": 0.5815, "step": 246 }, { "epoch": 1.974, "grad_norm": 0.2630924880504608, "learning_rate": 5.479277960676958e-05, "loss": 0.5735, "step": 247 }, { "epoch": 1.982, "grad_norm": 0.2807694971561432, "learning_rate": 5.40267260547896e-05, "loss": 0.5611, "step": 248 }, { "epoch": 1.99, "grad_norm": 0.28481224179267883, "learning_rate": 5.326407828419979e-05, "loss": 0.5671, "step": 249 }, { "epoch": 1.998, "grad_norm": 0.27675607800483704, "learning_rate": 5.2504892793295e-05, "loss": 0.5825, "step": 250 } ], "logging_steps": 1, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.479651398516736e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }