|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.997, |
|
"eval_steps": 500, |
|
"global_step": 375, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008008008008008008, |
|
"grad_norm": 1.6445057392120361, |
|
"learning_rate": 2e-05, |
|
"loss": 2.3547, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016016016016016016, |
|
"grad_norm": 1.63363778591156, |
|
"learning_rate": 4e-05, |
|
"loss": 2.3812, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024024024024024024, |
|
"grad_norm": 1.6492197513580322, |
|
"learning_rate": 6e-05, |
|
"loss": 2.3399, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03203203203203203, |
|
"grad_norm": 1.6518611907958984, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3172, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04004004004004004, |
|
"grad_norm": 1.7173571586608887, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2563, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04804804804804805, |
|
"grad_norm": 1.563859224319458, |
|
"learning_rate": 0.00012, |
|
"loss": 2.0256, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.056056056056056056, |
|
"grad_norm": 1.5590581893920898, |
|
"learning_rate": 0.00014, |
|
"loss": 1.8324, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.06406406406406406, |
|
"grad_norm": 1.5127277374267578, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5787, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.07207207207207207, |
|
"grad_norm": 1.5447226762771606, |
|
"learning_rate": 0.00018, |
|
"loss": 1.3826, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08008008008008008, |
|
"grad_norm": 4.600811004638672, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2388, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08808808808808809, |
|
"grad_norm": 1.6333264112472534, |
|
"learning_rate": 0.00019999629591162656, |
|
"loss": 1.0977, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0960960960960961, |
|
"grad_norm": 1.5325253009796143, |
|
"learning_rate": 0.00019998518392091164, |
|
"loss": 1.0178, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1041041041041041, |
|
"grad_norm": 1.866473913192749, |
|
"learning_rate": 0.00019996666485105113, |
|
"loss": 0.9454, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.11211211211211211, |
|
"grad_norm": 1.450692892074585, |
|
"learning_rate": 0.0001999407400739705, |
|
"loss": 0.8514, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.12012012012012012, |
|
"grad_norm": 5.149086952209473, |
|
"learning_rate": 0.00019990741151022301, |
|
"loss": 0.9136, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12812812812812813, |
|
"grad_norm": 0.7399551272392273, |
|
"learning_rate": 0.00019986668162884762, |
|
"loss": 0.8742, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.13613613613613615, |
|
"grad_norm": 0.6142033934593201, |
|
"learning_rate": 0.00019981855344718588, |
|
"loss": 0.8082, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.14414414414414414, |
|
"grad_norm": 0.47605425119400024, |
|
"learning_rate": 0.00019976303053065859, |
|
"loss": 0.8019, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.15215215215215216, |
|
"grad_norm": 0.3993614614009857, |
|
"learning_rate": 0.00019970011699250152, |
|
"loss": 0.7625, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.16016016016016016, |
|
"grad_norm": 0.4947199821472168, |
|
"learning_rate": 0.00019962981749346078, |
|
"loss": 0.7419, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16816816816816818, |
|
"grad_norm": 0.549526572227478, |
|
"learning_rate": 0.00019955213724144754, |
|
"loss": 0.7468, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.17617617617617617, |
|
"grad_norm": 0.34314435720443726, |
|
"learning_rate": 0.00019946708199115211, |
|
"loss": 0.7482, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1841841841841842, |
|
"grad_norm": 0.38283613324165344, |
|
"learning_rate": 0.00019937465804361783, |
|
"loss": 0.7304, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1921921921921922, |
|
"grad_norm": 0.28871795535087585, |
|
"learning_rate": 0.00019927487224577402, |
|
"loss": 0.746, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2002002002002002, |
|
"grad_norm": 0.321494996547699, |
|
"learning_rate": 0.000199167731989929, |
|
"loss": 0.7461, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2082082082082082, |
|
"grad_norm": 0.315449982881546, |
|
"learning_rate": 0.0001990532452132223, |
|
"loss": 0.7286, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 0.2904318571090698, |
|
"learning_rate": 0.00019893142039703664, |
|
"loss": 0.7119, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.22422422422422422, |
|
"grad_norm": 0.27874529361724854, |
|
"learning_rate": 0.00019880226656636977, |
|
"loss": 0.7105, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.23223223223223224, |
|
"grad_norm": 0.2948579490184784, |
|
"learning_rate": 0.0001986657932891657, |
|
"loss": 0.6976, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.24024024024024024, |
|
"grad_norm": 0.2542964220046997, |
|
"learning_rate": 0.00019852201067560606, |
|
"loss": 0.7351, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.24824824824824826, |
|
"grad_norm": 0.2960706353187561, |
|
"learning_rate": 0.000198370929377361, |
|
"loss": 0.7179, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.25625625625625625, |
|
"grad_norm": 0.24776384234428406, |
|
"learning_rate": 0.00019821256058680006, |
|
"loss": 0.7134, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.26426426426426425, |
|
"grad_norm": 0.33054184913635254, |
|
"learning_rate": 0.00019804691603616324, |
|
"loss": 0.6995, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2722722722722723, |
|
"grad_norm": 0.2543237805366516, |
|
"learning_rate": 0.00019787400799669154, |
|
"loss": 0.7081, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2802802802802803, |
|
"grad_norm": 0.25240710377693176, |
|
"learning_rate": 0.0001976938492777182, |
|
"loss": 0.6928, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2882882882882883, |
|
"grad_norm": 0.35880276560783386, |
|
"learning_rate": 0.0001975064532257195, |
|
"loss": 0.7177, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.3675362467765808, |
|
"learning_rate": 0.0001973118337233262, |
|
"loss": 0.6865, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.30430430430430433, |
|
"grad_norm": 0.3688451051712036, |
|
"learning_rate": 0.00019711000518829507, |
|
"loss": 0.6724, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3123123123123123, |
|
"grad_norm": 0.2982208728790283, |
|
"learning_rate": 0.00019690098257244064, |
|
"loss": 0.671, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3203203203203203, |
|
"grad_norm": 0.24197936058044434, |
|
"learning_rate": 0.00019668478136052774, |
|
"loss": 0.6777, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3283283283283283, |
|
"grad_norm": 0.748349130153656, |
|
"learning_rate": 0.00019646141756912434, |
|
"loss": 0.6641, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.33633633633633636, |
|
"grad_norm": 0.5585939288139343, |
|
"learning_rate": 0.00019623090774541487, |
|
"loss": 0.6988, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.34434434434434436, |
|
"grad_norm": 0.40285471081733704, |
|
"learning_rate": 0.00019599326896597448, |
|
"loss": 0.6811, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.35235235235235235, |
|
"grad_norm": 0.25714346766471863, |
|
"learning_rate": 0.00019574851883550395, |
|
"loss": 0.6913, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.36036036036036034, |
|
"grad_norm": 0.4926215708255768, |
|
"learning_rate": 0.00019549667548552556, |
|
"loss": 0.6707, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3683683683683684, |
|
"grad_norm": 0.3760850429534912, |
|
"learning_rate": 0.00019523775757303974, |
|
"loss": 0.6809, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3763763763763764, |
|
"grad_norm": 0.3734811842441559, |
|
"learning_rate": 0.0001949717842791432, |
|
"loss": 0.6386, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3843843843843844, |
|
"grad_norm": 0.3447561264038086, |
|
"learning_rate": 0.00019469877530760754, |
|
"loss": 0.6955, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3923923923923924, |
|
"grad_norm": 0.2680707573890686, |
|
"learning_rate": 0.00019441875088341997, |
|
"loss": 0.6625, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4004004004004004, |
|
"grad_norm": 0.2692941725254059, |
|
"learning_rate": 0.00019413173175128473, |
|
"loss": 0.66, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4084084084084084, |
|
"grad_norm": 0.32329630851745605, |
|
"learning_rate": 0.00019383773917408642, |
|
"loss": 0.6612, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4164164164164164, |
|
"grad_norm": 0.281435489654541, |
|
"learning_rate": 0.00019353679493131485, |
|
"loss": 0.6621, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4244244244244244, |
|
"grad_norm": 0.22186556458473206, |
|
"learning_rate": 0.00019322892131745135, |
|
"loss": 0.6465, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 0.2902645468711853, |
|
"learning_rate": 0.00019291414114031743, |
|
"loss": 0.6693, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.44044044044044045, |
|
"grad_norm": 0.2899124324321747, |
|
"learning_rate": 0.000192592477719385, |
|
"loss": 0.6568, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.44844844844844844, |
|
"grad_norm": 0.2124062478542328, |
|
"learning_rate": 0.00019226395488404876, |
|
"loss": 0.6724, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.45645645645645644, |
|
"grad_norm": 0.23896393179893494, |
|
"learning_rate": 0.00019192859697186106, |
|
"loss": 0.6459, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.4644644644644645, |
|
"grad_norm": 0.2762405574321747, |
|
"learning_rate": 0.00019158642882672873, |
|
"loss": 0.6498, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4724724724724725, |
|
"grad_norm": 0.2079935222864151, |
|
"learning_rate": 0.00019123747579707275, |
|
"loss": 0.6604, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.4804804804804805, |
|
"grad_norm": 0.23864208161830902, |
|
"learning_rate": 0.0001908817637339503, |
|
"loss": 0.6378, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.48848848848848847, |
|
"grad_norm": 0.21718506515026093, |
|
"learning_rate": 0.00019051931898913976, |
|
"loss": 0.6424, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4964964964964965, |
|
"grad_norm": 0.2773915231227875, |
|
"learning_rate": 0.0001901501684131884, |
|
"loss": 0.6474, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5045045045045045, |
|
"grad_norm": 0.23982493579387665, |
|
"learning_rate": 0.0001897743393534234, |
|
"loss": 0.6256, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5125125125125125, |
|
"grad_norm": 0.23621873557567596, |
|
"learning_rate": 0.0001893918596519257, |
|
"loss": 0.6403, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5205205205205206, |
|
"grad_norm": 0.22759953141212463, |
|
"learning_rate": 0.00018900275764346768, |
|
"loss": 0.6484, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5285285285285285, |
|
"grad_norm": 0.26695549488067627, |
|
"learning_rate": 0.00018860706215341382, |
|
"loss": 0.609, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5365365365365365, |
|
"grad_norm": 0.24594709277153015, |
|
"learning_rate": 0.00018820480249558537, |
|
"loss": 0.6338, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5445445445445446, |
|
"grad_norm": 0.22960062325000763, |
|
"learning_rate": 0.00018779600847008884, |
|
"loss": 0.6166, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5525525525525525, |
|
"grad_norm": 0.25302109122276306, |
|
"learning_rate": 0.00018738071036110808, |
|
"loss": 0.6422, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5605605605605606, |
|
"grad_norm": 0.3339892327785492, |
|
"learning_rate": 0.0001869589389346611, |
|
"loss": 0.6558, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5685685685685685, |
|
"grad_norm": 0.21397258341312408, |
|
"learning_rate": 0.00018653072543632062, |
|
"loss": 0.6323, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5765765765765766, |
|
"grad_norm": 0.2514493465423584, |
|
"learning_rate": 0.00018609610158889942, |
|
"loss": 0.657, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5845845845845846, |
|
"grad_norm": 0.25317835807800293, |
|
"learning_rate": 0.00018565509959010036, |
|
"loss": 0.641, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.22669494152069092, |
|
"learning_rate": 0.00018520775211013093, |
|
"loss": 0.6369, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6006006006006006, |
|
"grad_norm": 0.2214743047952652, |
|
"learning_rate": 0.00018475409228928312, |
|
"loss": 0.6307, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6086086086086087, |
|
"grad_norm": 0.24376747012138367, |
|
"learning_rate": 0.00018429415373547828, |
|
"loss": 0.6557, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6166166166166166, |
|
"grad_norm": 0.2158333659172058, |
|
"learning_rate": 0.00018382797052177746, |
|
"loss": 0.655, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6246246246246246, |
|
"grad_norm": 0.25565382838249207, |
|
"learning_rate": 0.000183355577183857, |
|
"loss": 0.6299, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.6326326326326326, |
|
"grad_norm": 0.20636747777462006, |
|
"learning_rate": 0.00018287700871745036, |
|
"loss": 0.6283, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6406406406406406, |
|
"grad_norm": 0.21258121728897095, |
|
"learning_rate": 0.00018239230057575542, |
|
"loss": 0.6174, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"grad_norm": 0.2861458957195282, |
|
"learning_rate": 0.00018190148866680802, |
|
"loss": 0.6547, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6566566566566566, |
|
"grad_norm": 0.23667441308498383, |
|
"learning_rate": 0.0001814046093508218, |
|
"loss": 0.6416, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6646646646646647, |
|
"grad_norm": 0.23191799223423004, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.642, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6726726726726727, |
|
"grad_norm": 0.2622171938419342, |
|
"learning_rate": 0.00018039279618328212, |
|
"loss": 0.6241, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6806806806806807, |
|
"grad_norm": 0.2891266345977783, |
|
"learning_rate": 0.00017987793728863651, |
|
"loss": 0.6284, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6886886886886887, |
|
"grad_norm": 0.26767420768737793, |
|
"learning_rate": 0.00017935716089521474, |
|
"loss": 0.627, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6966966966966966, |
|
"grad_norm": 0.2828672230243683, |
|
"learning_rate": 0.00017883050558305255, |
|
"loss": 0.6418, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7047047047047047, |
|
"grad_norm": 0.32730573415756226, |
|
"learning_rate": 0.00017829801036770628, |
|
"loss": 0.6629, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7127127127127127, |
|
"grad_norm": 0.24029900133609772, |
|
"learning_rate": 0.0001777597146973627, |
|
"loss": 0.614, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.7207207207207207, |
|
"grad_norm": 0.2929212152957916, |
|
"learning_rate": 0.00017721565844991643, |
|
"loss": 0.632, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7287287287287287, |
|
"grad_norm": 0.2860666513442993, |
|
"learning_rate": 0.00017666588193001595, |
|
"loss": 0.6289, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.7367367367367368, |
|
"grad_norm": 0.23325330018997192, |
|
"learning_rate": 0.00017611042586607748, |
|
"loss": 0.6392, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7447447447447447, |
|
"grad_norm": 0.3126169443130493, |
|
"learning_rate": 0.00017554933140726802, |
|
"loss": 0.6422, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7527527527527528, |
|
"grad_norm": 0.26704883575439453, |
|
"learning_rate": 0.00017498264012045687, |
|
"loss": 0.6166, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7607607607607607, |
|
"grad_norm": 0.2184283286333084, |
|
"learning_rate": 0.00017441039398713608, |
|
"loss": 0.6235, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7687687687687688, |
|
"grad_norm": 0.23906390368938446, |
|
"learning_rate": 0.00017383263540031067, |
|
"loss": 0.6643, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7767767767767768, |
|
"grad_norm": 0.26839691400527954, |
|
"learning_rate": 0.0001732494071613579, |
|
"loss": 0.6514, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7847847847847848, |
|
"grad_norm": 0.2805701494216919, |
|
"learning_rate": 0.00017266075247685656, |
|
"loss": 0.6168, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7927927927927928, |
|
"grad_norm": 0.21650992333889008, |
|
"learning_rate": 0.00017206671495538612, |
|
"loss": 0.5983, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8008008008008008, |
|
"grad_norm": 0.2302800416946411, |
|
"learning_rate": 0.00017146733860429612, |
|
"loss": 0.6301, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8088088088088088, |
|
"grad_norm": 0.29078415036201477, |
|
"learning_rate": 0.000170862667826446, |
|
"loss": 0.616, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.8168168168168168, |
|
"grad_norm": 0.24860034883022308, |
|
"learning_rate": 0.0001702527474169157, |
|
"loss": 0.6352, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.8248248248248248, |
|
"grad_norm": 0.26281973719596863, |
|
"learning_rate": 0.00016963762255968722, |
|
"loss": 0.6218, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8328328328328328, |
|
"grad_norm": 0.29051998257637024, |
|
"learning_rate": 0.0001690173388242972, |
|
"loss": 0.6233, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.8408408408408409, |
|
"grad_norm": 0.2471507042646408, |
|
"learning_rate": 0.00016839194216246108, |
|
"loss": 0.6147, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8488488488488488, |
|
"grad_norm": 0.2574704587459564, |
|
"learning_rate": 0.0001677614789046689, |
|
"loss": 0.6174, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8568568568568569, |
|
"grad_norm": 0.2551233172416687, |
|
"learning_rate": 0.00016712599575675316, |
|
"loss": 0.5989, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 0.2901318371295929, |
|
"learning_rate": 0.00016648553979642868, |
|
"loss": 0.6241, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8728728728728729, |
|
"grad_norm": 0.23769080638885498, |
|
"learning_rate": 0.0001658401584698049, |
|
"loss": 0.6044, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8808808808808809, |
|
"grad_norm": 0.2580976188182831, |
|
"learning_rate": 0.00016518989958787126, |
|
"loss": 0.622, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.24077744781970978, |
|
"learning_rate": 0.00016453481132295506, |
|
"loss": 0.6047, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8968968968968969, |
|
"grad_norm": 0.228902667760849, |
|
"learning_rate": 0.00016387494220515274, |
|
"loss": 0.6138, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.9049049049049049, |
|
"grad_norm": 0.2607581317424774, |
|
"learning_rate": 0.00016321034111873488, |
|
"loss": 0.6307, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.9129129129129129, |
|
"grad_norm": 0.2575569450855255, |
|
"learning_rate": 0.00016254105729852464, |
|
"loss": 0.6008, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.9209209209209209, |
|
"grad_norm": 0.231553852558136, |
|
"learning_rate": 0.00016186714032625035, |
|
"loss": 0.617, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.928928928928929, |
|
"grad_norm": 0.24820354580879211, |
|
"learning_rate": 0.00016118864012687245, |
|
"loss": 0.5991, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9369369369369369, |
|
"grad_norm": 0.2364109754562378, |
|
"learning_rate": 0.00016050560696488492, |
|
"loss": 0.6094, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.944944944944945, |
|
"grad_norm": 0.2492029368877411, |
|
"learning_rate": 0.00015981809144059166, |
|
"loss": 0.6143, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.9529529529529529, |
|
"grad_norm": 0.27745717763900757, |
|
"learning_rate": 0.00015912614448635782, |
|
"loss": 0.6203, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.960960960960961, |
|
"grad_norm": 0.2555610239505768, |
|
"learning_rate": 0.00015842981736283686, |
|
"loss": 0.6314, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.968968968968969, |
|
"grad_norm": 0.2268420308828354, |
|
"learning_rate": 0.00015772916165517273, |
|
"loss": 0.6155, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9769769769769769, |
|
"grad_norm": 0.250041127204895, |
|
"learning_rate": 0.00015702422926917872, |
|
"loss": 0.6226, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.984984984984985, |
|
"grad_norm": 0.2596072554588318, |
|
"learning_rate": 0.00015631507242749187, |
|
"loss": 0.6086, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.992992992992993, |
|
"grad_norm": 0.2280743271112442, |
|
"learning_rate": 0.00015560174366570446, |
|
"loss": 0.5994, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.001001001001001, |
|
"grad_norm": 0.23362237215042114, |
|
"learning_rate": 0.00015488429582847192, |
|
"loss": 0.616, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.006, |
|
"grad_norm": 0.2956937849521637, |
|
"learning_rate": 0.00015416278206559816, |
|
"loss": 0.6038, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.014, |
|
"grad_norm": 0.250629723072052, |
|
"learning_rate": 0.0001534372558280979, |
|
"loss": 0.5991, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.022, |
|
"grad_norm": 0.231906458735466, |
|
"learning_rate": 0.00015270777086423722, |
|
"loss": 0.6088, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.2888093590736389, |
|
"learning_rate": 0.0001519743812155516, |
|
"loss": 0.5892, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.038, |
|
"grad_norm": 0.24940524995326996, |
|
"learning_rate": 0.0001512371412128424, |
|
"loss": 0.5982, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.046, |
|
"grad_norm": 0.24017778038978577, |
|
"learning_rate": 0.00015049610547215205, |
|
"loss": 0.5608, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.054, |
|
"grad_norm": 0.2334035485982895, |
|
"learning_rate": 0.00014975132889071807, |
|
"loss": 0.6034, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.062, |
|
"grad_norm": 0.2773897349834442, |
|
"learning_rate": 0.00014900286664290592, |
|
"loss": 0.6387, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.24266445636749268, |
|
"learning_rate": 0.00014825077417612186, |
|
"loss": 0.5612, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.078, |
|
"grad_norm": 0.22919470071792603, |
|
"learning_rate": 0.00014749510720670506, |
|
"loss": 0.599, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.086, |
|
"grad_norm": 0.23829148709774017, |
|
"learning_rate": 0.00014673592171580025, |
|
"loss": 0.6066, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.094, |
|
"grad_norm": 0.31981223821640015, |
|
"learning_rate": 0.00014597327394521044, |
|
"loss": 0.5692, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.102, |
|
"grad_norm": 0.2747564911842346, |
|
"learning_rate": 0.00014520722039323045, |
|
"loss": 0.62, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.2592499852180481, |
|
"learning_rate": 0.00014443781781046136, |
|
"loss": 0.5937, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.1179999999999999, |
|
"grad_norm": 0.31891530752182007, |
|
"learning_rate": 0.0001436651231956064, |
|
"loss": 0.5973, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.126, |
|
"grad_norm": 0.2743702232837677, |
|
"learning_rate": 0.00014288919379124837, |
|
"loss": 0.6045, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.134, |
|
"grad_norm": 0.2665708661079407, |
|
"learning_rate": 0.00014211008707960897, |
|
"loss": 0.5898, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.142, |
|
"grad_norm": 0.33267003297805786, |
|
"learning_rate": 0.00014132786077829043, |
|
"loss": 0.5945, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.28636589646339417, |
|
"learning_rate": 0.00014054257283599973, |
|
"loss": 0.5914, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.158, |
|
"grad_norm": 0.27305400371551514, |
|
"learning_rate": 0.0001397542814282556, |
|
"loss": 0.6093, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.166, |
|
"grad_norm": 0.2839919924736023, |
|
"learning_rate": 0.0001389630449530788, |
|
"loss": 0.6074, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.174, |
|
"grad_norm": 0.25652188062667847, |
|
"learning_rate": 0.0001381689220266659, |
|
"loss": 0.6059, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.182, |
|
"grad_norm": 0.2549704909324646, |
|
"learning_rate": 0.0001373719714790469, |
|
"loss": 0.5568, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.2509196400642395, |
|
"learning_rate": 0.00013657225234972695, |
|
"loss": 0.5968, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.198, |
|
"grad_norm": 0.25909289717674255, |
|
"learning_rate": 0.0001357698238833126, |
|
"loss": 0.5902, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.206, |
|
"grad_norm": 0.26162394881248474, |
|
"learning_rate": 0.00013496474552512287, |
|
"loss": 0.5763, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.214, |
|
"grad_norm": 0.2721655070781708, |
|
"learning_rate": 0.00013415707691678556, |
|
"loss": 0.6037, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.222, |
|
"grad_norm": 0.28691592812538147, |
|
"learning_rate": 0.0001333468778918187, |
|
"loss": 0.625, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.25801119208335876, |
|
"learning_rate": 0.00013253420847119803, |
|
"loss": 0.5994, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.238, |
|
"grad_norm": 0.2609824538230896, |
|
"learning_rate": 0.00013171912885891063, |
|
"loss": 0.5999, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.246, |
|
"grad_norm": 0.2631840407848358, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.5844, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.254, |
|
"grad_norm": 0.2862647473812103, |
|
"learning_rate": 0.00013008198076356676, |
|
"loss": 0.5653, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.262, |
|
"grad_norm": 0.27327024936676025, |
|
"learning_rate": 0.00012926003356333488, |
|
"loss": 0.5933, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.2684379518032074, |
|
"learning_rate": 0.0001284359187281004, |
|
"loss": 0.5842, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.278, |
|
"grad_norm": 0.2620231807231903, |
|
"learning_rate": 0.00012760969730974694, |
|
"loss": 0.6079, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.286, |
|
"grad_norm": 0.2584543526172638, |
|
"learning_rate": 0.00012678143051621742, |
|
"loss": 0.5941, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.294, |
|
"grad_norm": 0.26279789209365845, |
|
"learning_rate": 0.00012595117970697997, |
|
"loss": 0.6086, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.302, |
|
"grad_norm": 0.2541520893573761, |
|
"learning_rate": 0.00012511900638848195, |
|
"loss": 0.5907, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.27384746074676514, |
|
"learning_rate": 0.0001242849722095936, |
|
"loss": 0.6055, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.318, |
|
"grad_norm": 0.25682300329208374, |
|
"learning_rate": 0.00012344913895704097, |
|
"loss": 0.609, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.326, |
|
"grad_norm": 0.25404492020606995, |
|
"learning_rate": 0.00012261156855082882, |
|
"loss": 0.6121, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.334, |
|
"grad_norm": 0.2649850845336914, |
|
"learning_rate": 0.0001217723230396532, |
|
"loss": 0.5695, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.342, |
|
"grad_norm": 0.28349268436431885, |
|
"learning_rate": 0.00012093146459630487, |
|
"loss": 0.5883, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.27068182826042175, |
|
"learning_rate": 0.00012008905551306356, |
|
"loss": 0.6147, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.358, |
|
"grad_norm": 0.29566338658332825, |
|
"learning_rate": 0.000119245158197083, |
|
"loss": 0.5901, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.366, |
|
"grad_norm": 0.27946797013282776, |
|
"learning_rate": 0.00011839983516576802, |
|
"loss": 0.5831, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.374, |
|
"grad_norm": 0.25005409121513367, |
|
"learning_rate": 0.00011755314904214284, |
|
"loss": 0.5656, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.3820000000000001, |
|
"grad_norm": 0.2621053159236908, |
|
"learning_rate": 0.00011670516255021193, |
|
"loss": 0.5769, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.3900000000000001, |
|
"grad_norm": 0.2531629502773285, |
|
"learning_rate": 0.00011585593851031347, |
|
"loss": 0.5641, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3980000000000001, |
|
"grad_norm": 0.3026553690433502, |
|
"learning_rate": 0.00011500553983446527, |
|
"loss": 0.6038, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.4060000000000001, |
|
"grad_norm": 0.2748616337776184, |
|
"learning_rate": 0.00011415402952170433, |
|
"loss": 0.5808, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.414, |
|
"grad_norm": 0.27345311641693115, |
|
"learning_rate": 0.0001133014706534196, |
|
"loss": 0.5848, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.422, |
|
"grad_norm": 0.273357629776001, |
|
"learning_rate": 0.00011244792638867893, |
|
"loss": 0.5906, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.26230183243751526, |
|
"learning_rate": 0.00011159345995955006, |
|
"loss": 0.5886, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.438, |
|
"grad_norm": 0.2515832483768463, |
|
"learning_rate": 0.00011073813466641632, |
|
"loss": 0.5876, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.446, |
|
"grad_norm": 0.27451092004776, |
|
"learning_rate": 0.00010988201387328717, |
|
"loss": 0.5841, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.454, |
|
"grad_norm": 0.2588571012020111, |
|
"learning_rate": 0.00010902516100310411, |
|
"loss": 0.5674, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.462, |
|
"grad_norm": 0.2440604865550995, |
|
"learning_rate": 0.00010816763953304227, |
|
"loss": 0.5646, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.2681560516357422, |
|
"learning_rate": 0.00010730951298980776, |
|
"loss": 0.5671, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.478, |
|
"grad_norm": 0.297048419713974, |
|
"learning_rate": 0.00010645084494493165, |
|
"loss": 0.583, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.486, |
|
"grad_norm": 0.29275089502334595, |
|
"learning_rate": 0.00010559169901006034, |
|
"loss": 0.6007, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.494, |
|
"grad_norm": 0.2599998414516449, |
|
"learning_rate": 0.0001047321388322432, |
|
"loss": 0.5692, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.502, |
|
"grad_norm": 0.2714841663837433, |
|
"learning_rate": 0.00010387222808921746, |
|
"loss": 0.5996, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.2618089020252228, |
|
"learning_rate": 0.00010301203048469083, |
|
"loss": 0.5861, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.518, |
|
"grad_norm": 0.27270275354385376, |
|
"learning_rate": 0.00010215160974362223, |
|
"loss": 0.5771, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.526, |
|
"grad_norm": 0.2583703398704529, |
|
"learning_rate": 0.00010129102960750092, |
|
"loss": 0.5897, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.534, |
|
"grad_norm": 0.25854265689849854, |
|
"learning_rate": 0.00010043035382962443, |
|
"loss": 0.5778, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.542, |
|
"grad_norm": 0.27234703302383423, |
|
"learning_rate": 9.956964617037558e-05, |
|
"loss": 0.5659, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.2672337293624878, |
|
"learning_rate": 9.870897039249911e-05, |
|
"loss": 0.5792, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.558, |
|
"grad_norm": 0.2644350528717041, |
|
"learning_rate": 9.784839025637778e-05, |
|
"loss": 0.5798, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.5659999999999998, |
|
"grad_norm": 0.2572389543056488, |
|
"learning_rate": 9.698796951530919e-05, |
|
"loss": 0.6008, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.5739999999999998, |
|
"grad_norm": 0.2693156898021698, |
|
"learning_rate": 9.612777191078258e-05, |
|
"loss": 0.5903, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.5819999999999999, |
|
"grad_norm": 0.26244068145751953, |
|
"learning_rate": 9.526786116775682e-05, |
|
"loss": 0.569, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.5899999999999999, |
|
"grad_norm": 0.25875815749168396, |
|
"learning_rate": 9.440830098993969e-05, |
|
"loss": 0.6042, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.5979999999999999, |
|
"grad_norm": 0.27209436893463135, |
|
"learning_rate": 9.354915505506839e-05, |
|
"loss": 0.5846, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6059999999999999, |
|
"grad_norm": 0.270780473947525, |
|
"learning_rate": 9.269048701019226e-05, |
|
"loss": 0.5957, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.6139999999999999, |
|
"grad_norm": 0.25013670325279236, |
|
"learning_rate": 9.183236046695777e-05, |
|
"loss": 0.5845, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.6219999999999999, |
|
"grad_norm": 0.2716957628726959, |
|
"learning_rate": 9.09748389968959e-05, |
|
"loss": 0.5584, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.2737436890602112, |
|
"learning_rate": 9.011798612671286e-05, |
|
"loss": 0.5836, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.638, |
|
"grad_norm": 0.2748481333255768, |
|
"learning_rate": 8.92618653335837e-05, |
|
"loss": 0.5927, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.646, |
|
"grad_norm": 0.2642996609210968, |
|
"learning_rate": 8.840654004044996e-05, |
|
"loss": 0.6088, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.654, |
|
"grad_norm": 0.26341068744659424, |
|
"learning_rate": 8.755207361132108e-05, |
|
"loss": 0.5841, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.662, |
|
"grad_norm": 0.30515289306640625, |
|
"learning_rate": 8.669852934658042e-05, |
|
"loss": 0.5525, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2801468074321747, |
|
"learning_rate": 8.58459704782957e-05, |
|
"loss": 0.5547, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.678, |
|
"grad_norm": 0.27618443965911865, |
|
"learning_rate": 8.499446016553474e-05, |
|
"loss": 0.5613, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.686, |
|
"grad_norm": 0.26961931586265564, |
|
"learning_rate": 8.414406148968657e-05, |
|
"loss": 0.5639, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.694, |
|
"grad_norm": 0.29023805260658264, |
|
"learning_rate": 8.32948374497881e-05, |
|
"loss": 0.5878, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.702, |
|
"grad_norm": 0.2671249508857727, |
|
"learning_rate": 8.244685095785719e-05, |
|
"loss": 0.5743, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.26124948263168335, |
|
"learning_rate": 8.160016483423199e-05, |
|
"loss": 0.5801, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.718, |
|
"grad_norm": 0.2721916437149048, |
|
"learning_rate": 8.075484180291701e-05, |
|
"loss": 0.5975, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.726, |
|
"grad_norm": 0.2630290687084198, |
|
"learning_rate": 7.991094448693648e-05, |
|
"loss": 0.5714, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.734, |
|
"grad_norm": 0.25962400436401367, |
|
"learning_rate": 7.906853540369514e-05, |
|
"loss": 0.5912, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.742, |
|
"grad_norm": 0.26142629981040955, |
|
"learning_rate": 7.822767696034682e-05, |
|
"loss": 0.577, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.26839005947113037, |
|
"learning_rate": 7.738843144917119e-05, |
|
"loss": 0.5747, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.758, |
|
"grad_norm": 0.28053659200668335, |
|
"learning_rate": 7.655086104295904e-05, |
|
"loss": 0.612, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.766, |
|
"grad_norm": 0.2728564143180847, |
|
"learning_rate": 7.571502779040645e-05, |
|
"loss": 0.5895, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.774, |
|
"grad_norm": 0.26337721943855286, |
|
"learning_rate": 7.48809936115181e-05, |
|
"loss": 0.5666, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.782, |
|
"grad_norm": 0.2544887661933899, |
|
"learning_rate": 7.404882029302003e-05, |
|
"loss": 0.5552, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2690827250480652, |
|
"learning_rate": 7.321856948378259e-05, |
|
"loss": 0.5559, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.798, |
|
"grad_norm": 0.27959364652633667, |
|
"learning_rate": 7.239030269025311e-05, |
|
"loss": 0.5738, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.806, |
|
"grad_norm": 0.29118430614471436, |
|
"learning_rate": 7.156408127189965e-05, |
|
"loss": 0.5753, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.814, |
|
"grad_norm": 0.2585116922855377, |
|
"learning_rate": 7.073996643666517e-05, |
|
"loss": 0.5499, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.822, |
|
"grad_norm": 0.24918395280838013, |
|
"learning_rate": 6.991801923643324e-05, |
|
"loss": 0.5792, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.2664697766304016, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.582, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.838, |
|
"grad_norm": 0.2679463326931, |
|
"learning_rate": 6.82808711410894e-05, |
|
"loss": 0.5919, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.846, |
|
"grad_norm": 0.26800620555877686, |
|
"learning_rate": 6.746579152880201e-05, |
|
"loss": 0.5774, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.854, |
|
"grad_norm": 0.25649094581604004, |
|
"learning_rate": 6.665312210818131e-05, |
|
"loss": 0.5569, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.862, |
|
"grad_norm": 0.2661426067352295, |
|
"learning_rate": 6.584292308321445e-05, |
|
"loss": 0.5759, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.27011772990226746, |
|
"learning_rate": 6.503525447487715e-05, |
|
"loss": 0.5752, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.8780000000000001, |
|
"grad_norm": 0.28006768226623535, |
|
"learning_rate": 6.423017611668745e-05, |
|
"loss": 0.5873, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.8860000000000001, |
|
"grad_norm": 0.281974196434021, |
|
"learning_rate": 6.342774765027309e-05, |
|
"loss": 0.5868, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.8940000000000001, |
|
"grad_norm": 0.28022536635398865, |
|
"learning_rate": 6.262802852095311e-05, |
|
"loss": 0.5505, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.9020000000000001, |
|
"grad_norm": 0.2971389889717102, |
|
"learning_rate": 6.18310779733341e-05, |
|
"loss": 0.5879, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.9100000000000001, |
|
"grad_norm": 0.2779372036457062, |
|
"learning_rate": 6.103695504692122e-05, |
|
"loss": 0.5648, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.9180000000000001, |
|
"grad_norm": 0.3032248616218567, |
|
"learning_rate": 6.024571857174443e-05, |
|
"loss": 0.5884, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9260000000000002, |
|
"grad_norm": 0.26035642623901367, |
|
"learning_rate": 5.94574271640003e-05, |
|
"loss": 0.5837, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.9340000000000002, |
|
"grad_norm": 0.2789236009120941, |
|
"learning_rate": 5.8672139221709577e-05, |
|
"loss": 0.5745, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.942, |
|
"grad_norm": 0.27459922432899475, |
|
"learning_rate": 5.788991292039103e-05, |
|
"loss": 0.568, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.2857699394226074, |
|
"learning_rate": 5.7110806208751655e-05, |
|
"loss": 0.5619, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.958, |
|
"grad_norm": 0.2715800106525421, |
|
"learning_rate": 5.633487680439361e-05, |
|
"loss": 0.5763, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.966, |
|
"grad_norm": 0.27073079347610474, |
|
"learning_rate": 5.556218218953868e-05, |
|
"loss": 0.5815, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.974, |
|
"grad_norm": 0.2630924880504608, |
|
"learning_rate": 5.479277960676958e-05, |
|
"loss": 0.5735, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.982, |
|
"grad_norm": 0.2807694971561432, |
|
"learning_rate": 5.40267260547896e-05, |
|
"loss": 0.5611, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.28481224179267883, |
|
"learning_rate": 5.326407828419979e-05, |
|
"loss": 0.5671, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.998, |
|
"grad_norm": 0.27675607800483704, |
|
"learning_rate": 5.2504892793295e-05, |
|
"loss": 0.5825, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.005, |
|
"grad_norm": 0.27290546894073486, |
|
"learning_rate": 5.174922582387819e-05, |
|
"loss": 0.5671, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.013, |
|
"grad_norm": 0.264244019985199, |
|
"learning_rate": 5.0997133357094085e-05, |
|
"loss": 0.5697, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.021, |
|
"grad_norm": 0.28058966994285583, |
|
"learning_rate": 5.0248671109281934e-05, |
|
"loss": 0.548, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.029, |
|
"grad_norm": 0.28881585597991943, |
|
"learning_rate": 4.9503894527847964e-05, |
|
"loss": 0.5448, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.037, |
|
"grad_norm": 0.2921096980571747, |
|
"learning_rate": 4.876285878715764e-05, |
|
"loss": 0.5538, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.045, |
|
"grad_norm": 0.28465935587882996, |
|
"learning_rate": 4.802561878444845e-05, |
|
"loss": 0.5271, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.053, |
|
"grad_norm": 0.2935147285461426, |
|
"learning_rate": 4.729222913576279e-05, |
|
"loss": 0.5654, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.061, |
|
"grad_norm": 0.28425726294517517, |
|
"learning_rate": 4.656274417190214e-05, |
|
"loss": 0.5303, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.069, |
|
"grad_norm": 0.3496411144733429, |
|
"learning_rate": 4.583721793440188e-05, |
|
"loss": 0.5462, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.077, |
|
"grad_norm": 0.29614442586898804, |
|
"learning_rate": 4.5115704171528105e-05, |
|
"loss": 0.5528, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.085, |
|
"grad_norm": 0.3171357810497284, |
|
"learning_rate": 4.439825633429557e-05, |
|
"loss": 0.5296, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.093, |
|
"grad_norm": 0.3029578626155853, |
|
"learning_rate": 4.368492757250814e-05, |
|
"loss": 0.5499, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.101, |
|
"grad_norm": 0.30544060468673706, |
|
"learning_rate": 4.297577073082129e-05, |
|
"loss": 0.565, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.109, |
|
"grad_norm": 0.3022664785385132, |
|
"learning_rate": 4.227083834482728e-05, |
|
"loss": 0.5317, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.117, |
|
"grad_norm": 0.29141902923583984, |
|
"learning_rate": 4.1570182637163155e-05, |
|
"loss": 0.5365, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.2986874282360077, |
|
"learning_rate": 4.087385551364219e-05, |
|
"loss": 0.5617, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.133, |
|
"grad_norm": 0.2957478165626526, |
|
"learning_rate": 4.0181908559408366e-05, |
|
"loss": 0.5285, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.141, |
|
"grad_norm": 0.3051709830760956, |
|
"learning_rate": 3.949439303511512e-05, |
|
"loss": 0.5388, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.149, |
|
"grad_norm": 0.30644309520721436, |
|
"learning_rate": 3.881135987312757e-05, |
|
"loss": 0.556, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.157, |
|
"grad_norm": 0.3261376619338989, |
|
"learning_rate": 3.813285967374969e-05, |
|
"loss": 0.5568, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.165, |
|
"grad_norm": 0.29734110832214355, |
|
"learning_rate": 3.745894270147539e-05, |
|
"loss": 0.5369, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.173, |
|
"grad_norm": 0.28361955285072327, |
|
"learning_rate": 3.678965888126513e-05, |
|
"loss": 0.5395, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.181, |
|
"grad_norm": 0.31067171692848206, |
|
"learning_rate": 3.612505779484728e-05, |
|
"loss": 0.5634, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.189, |
|
"grad_norm": 0.31847238540649414, |
|
"learning_rate": 3.546518867704499e-05, |
|
"loss": 0.5435, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.197, |
|
"grad_norm": 0.3154855966567993, |
|
"learning_rate": 3.4810100412128747e-05, |
|
"loss": 0.545, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.205, |
|
"grad_norm": 0.2924310564994812, |
|
"learning_rate": 3.415984153019513e-05, |
|
"loss": 0.5512, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.213, |
|
"grad_norm": 0.301763117313385, |
|
"learning_rate": 3.351446020357136e-05, |
|
"loss": 0.5619, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.221, |
|
"grad_norm": 0.2811594307422638, |
|
"learning_rate": 3.287400424324687e-05, |
|
"loss": 0.5421, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.229, |
|
"grad_norm": 0.2934137284755707, |
|
"learning_rate": 3.223852109533112e-05, |
|
"loss": 0.5517, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.237, |
|
"grad_norm": 0.3074370324611664, |
|
"learning_rate": 3.160805783753897e-05, |
|
"loss": 0.5655, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.245, |
|
"grad_norm": 0.2928076386451721, |
|
"learning_rate": 3.098266117570282e-05, |
|
"loss": 0.5519, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.253, |
|
"grad_norm": 0.3051488995552063, |
|
"learning_rate": 3.0362377440312784e-05, |
|
"loss": 0.5493, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.261, |
|
"grad_norm": 0.30227115750312805, |
|
"learning_rate": 2.9747252583084295e-05, |
|
"loss": 0.5501, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.269, |
|
"grad_norm": 0.29634636640548706, |
|
"learning_rate": 2.9137332173554043e-05, |
|
"loss": 0.5264, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.277, |
|
"grad_norm": 0.2818604111671448, |
|
"learning_rate": 2.853266139570391e-05, |
|
"loss": 0.5434, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.285, |
|
"grad_norm": 0.2941757142543793, |
|
"learning_rate": 2.793328504461391e-05, |
|
"loss": 0.5477, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.293, |
|
"grad_norm": 0.2820718288421631, |
|
"learning_rate": 2.733924752314345e-05, |
|
"loss": 0.545, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.301, |
|
"grad_norm": 0.29931187629699707, |
|
"learning_rate": 2.675059283864214e-05, |
|
"loss": 0.5372, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.309, |
|
"grad_norm": 0.3119406998157501, |
|
"learning_rate": 2.616736459968936e-05, |
|
"loss": 0.562, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.317, |
|
"grad_norm": 0.3057345747947693, |
|
"learning_rate": 2.5589606012863963e-05, |
|
"loss": 0.5648, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.325, |
|
"grad_norm": 0.29526305198669434, |
|
"learning_rate": 2.5017359879543166e-05, |
|
"loss": 0.5448, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.333, |
|
"grad_norm": 0.32076817750930786, |
|
"learning_rate": 2.4450668592731974e-05, |
|
"loss": 0.5537, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.341, |
|
"grad_norm": 0.3103630542755127, |
|
"learning_rate": 2.388957413392253e-05, |
|
"loss": 0.5746, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.349, |
|
"grad_norm": 0.2959766089916229, |
|
"learning_rate": 2.33341180699841e-05, |
|
"loss": 0.5532, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.357, |
|
"grad_norm": 0.28677570819854736, |
|
"learning_rate": 2.2784341550083576e-05, |
|
"loss": 0.5439, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.365, |
|
"grad_norm": 0.29818570613861084, |
|
"learning_rate": 2.224028530263733e-05, |
|
"loss": 0.5453, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.373, |
|
"grad_norm": 0.2917187809944153, |
|
"learning_rate": 2.1701989632293717e-05, |
|
"loss": 0.5335, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.3810000000000002, |
|
"grad_norm": 0.31331866979599, |
|
"learning_rate": 2.1169494416947477e-05, |
|
"loss": 0.5663, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.3890000000000002, |
|
"grad_norm": 0.2955493927001953, |
|
"learning_rate": 2.0642839104785272e-05, |
|
"loss": 0.5509, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.3970000000000002, |
|
"grad_norm": 0.3018077313899994, |
|
"learning_rate": 2.0122062711363532e-05, |
|
"loss": 0.5435, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.4050000000000002, |
|
"grad_norm": 0.3038371801376343, |
|
"learning_rate": 1.9607203816717888e-05, |
|
"loss": 0.5544, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.413, |
|
"grad_norm": 0.29873090982437134, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.5606, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.421, |
|
"grad_norm": 0.29808226227760315, |
|
"learning_rate": 1.859539064917821e-05, |
|
"loss": 0.5479, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.429, |
|
"grad_norm": 0.3080662190914154, |
|
"learning_rate": 1.8098511333192024e-05, |
|
"loss": 0.5587, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.437, |
|
"grad_norm": 0.29491138458251953, |
|
"learning_rate": 1.7607699424244585e-05, |
|
"loss": 0.5482, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.445, |
|
"grad_norm": 0.3016619384288788, |
|
"learning_rate": 1.712299128254965e-05, |
|
"loss": 0.5539, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.453, |
|
"grad_norm": 0.31340616941452026, |
|
"learning_rate": 1.6644422816143024e-05, |
|
"loss": 0.5559, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.461, |
|
"grad_norm": 0.3533984422683716, |
|
"learning_rate": 1.6172029478222594e-05, |
|
"loss": 0.5288, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.469, |
|
"grad_norm": 0.2987024188041687, |
|
"learning_rate": 1.570584626452173e-05, |
|
"loss": 0.5405, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.477, |
|
"grad_norm": 0.312209814786911, |
|
"learning_rate": 1.5245907710716911e-05, |
|
"loss": 0.544, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.485, |
|
"grad_norm": 0.30217549204826355, |
|
"learning_rate": 1.4792247889869071e-05, |
|
"loss": 0.5483, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.493, |
|
"grad_norm": 0.2916695475578308, |
|
"learning_rate": 1.4344900409899642e-05, |
|
"loss": 0.5461, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.501, |
|
"grad_norm": 0.3004852831363678, |
|
"learning_rate": 1.3903898411100568e-05, |
|
"loss": 0.5427, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.509, |
|
"grad_norm": 0.3007405400276184, |
|
"learning_rate": 1.3469274563679402e-05, |
|
"loss": 0.5666, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.517, |
|
"grad_norm": 0.2975088655948639, |
|
"learning_rate": 1.30410610653389e-05, |
|
"loss": 0.5528, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.525, |
|
"grad_norm": 0.31289756298065186, |
|
"learning_rate": 1.261928963889194e-05, |
|
"loss": 0.5757, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.533, |
|
"grad_norm": 0.32117587327957153, |
|
"learning_rate": 1.2203991529911197e-05, |
|
"loss": 0.5182, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.541, |
|
"grad_norm": 0.31478121876716614, |
|
"learning_rate": 1.1795197504414656e-05, |
|
"loss": 0.542, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.549, |
|
"grad_norm": 0.302287220954895, |
|
"learning_rate": 1.1392937846586215e-05, |
|
"loss": 0.5275, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.557, |
|
"grad_norm": 0.3095117509365082, |
|
"learning_rate": 1.0997242356532334e-05, |
|
"loss": 0.5534, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.565, |
|
"grad_norm": 0.3028477728366852, |
|
"learning_rate": 1.0608140348074292e-05, |
|
"loss": 0.5463, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.573, |
|
"grad_norm": 0.30029499530792236, |
|
"learning_rate": 1.0225660646576629e-05, |
|
"loss": 0.5449, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.581, |
|
"grad_norm": 0.30782079696655273, |
|
"learning_rate": 9.849831586811598e-06, |
|
"loss": 0.5508, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.589, |
|
"grad_norm": 0.2926454544067383, |
|
"learning_rate": 9.48068101086026e-06, |
|
"loss": 0.5467, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.597, |
|
"grad_norm": 0.3094619810581207, |
|
"learning_rate": 9.118236266049707e-06, |
|
"loss": 0.5496, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.605, |
|
"grad_norm": 0.30689892172813416, |
|
"learning_rate": 8.76252420292728e-06, |
|
"loss": 0.565, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.613, |
|
"grad_norm": 0.29666024446487427, |
|
"learning_rate": 8.413571173271295e-06, |
|
"loss": 0.5264, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.621, |
|
"grad_norm": 0.2976740598678589, |
|
"learning_rate": 8.071403028138968e-06, |
|
"loss": 0.5544, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.629, |
|
"grad_norm": 0.2889856994152069, |
|
"learning_rate": 7.736045115951251e-06, |
|
"loss": 0.5363, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.637, |
|
"grad_norm": 0.3127500116825104, |
|
"learning_rate": 7.40752228061502e-06, |
|
"loss": 0.5636, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.645, |
|
"grad_norm": 0.31198227405548096, |
|
"learning_rate": 7.085858859682571e-06, |
|
"loss": 0.5518, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.653, |
|
"grad_norm": 0.318718284368515, |
|
"learning_rate": 6.7710786825486705e-06, |
|
"loss": 0.5456, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.661, |
|
"grad_norm": 0.3112807869911194, |
|
"learning_rate": 6.463205068685174e-06, |
|
"loss": 0.5297, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.669, |
|
"grad_norm": 0.30573782324790955, |
|
"learning_rate": 6.16226082591359e-06, |
|
"loss": 0.5353, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.677, |
|
"grad_norm": 0.3077561855316162, |
|
"learning_rate": 5.868268248715292e-06, |
|
"loss": 0.5449, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.685, |
|
"grad_norm": 0.3065972626209259, |
|
"learning_rate": 5.5812491165800675e-06, |
|
"loss": 0.5467, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.693, |
|
"grad_norm": 0.3004700541496277, |
|
"learning_rate": 5.3012246923924816e-06, |
|
"loss": 0.5394, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.701, |
|
"grad_norm": 0.30005961656570435, |
|
"learning_rate": 5.028215720856821e-06, |
|
"loss": 0.5605, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.709, |
|
"grad_norm": 0.3056250810623169, |
|
"learning_rate": 4.762242426960262e-06, |
|
"loss": 0.5388, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.717, |
|
"grad_norm": 0.3118292987346649, |
|
"learning_rate": 4.503324514474483e-06, |
|
"loss": 0.5206, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.725, |
|
"grad_norm": 0.29597511887550354, |
|
"learning_rate": 4.251481164496074e-06, |
|
"loss": 0.5595, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.733, |
|
"grad_norm": 0.29819923639297485, |
|
"learning_rate": 4.006731034025546e-06, |
|
"loss": 0.5391, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.741, |
|
"grad_norm": 0.3268795907497406, |
|
"learning_rate": 3.769092254585138e-06, |
|
"loss": 0.5552, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.749, |
|
"grad_norm": 0.3218269646167755, |
|
"learning_rate": 3.5385824308756587e-06, |
|
"loss": 0.5511, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.757, |
|
"grad_norm": 0.29198867082595825, |
|
"learning_rate": 3.3152186394722505e-06, |
|
"loss": 0.5419, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.765, |
|
"grad_norm": 0.30343398451805115, |
|
"learning_rate": 3.099017427559392e-06, |
|
"loss": 0.5283, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.773, |
|
"grad_norm": 0.30696964263916016, |
|
"learning_rate": 2.889994811704966e-06, |
|
"loss": 0.5488, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.781, |
|
"grad_norm": 0.30184078216552734, |
|
"learning_rate": 2.688166276673809e-06, |
|
"loss": 0.5294, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.789, |
|
"grad_norm": 0.29285117983818054, |
|
"learning_rate": 2.493546774280531e-06, |
|
"loss": 0.5586, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.797, |
|
"grad_norm": 0.3032819926738739, |
|
"learning_rate": 2.30615072228183e-06, |
|
"loss": 0.5252, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.805, |
|
"grad_norm": 0.2997422516345978, |
|
"learning_rate": 2.1259920033084745e-06, |
|
"loss": 0.5515, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.8129999999999997, |
|
"grad_norm": 0.29675498604774475, |
|
"learning_rate": 1.9530839638367995e-06, |
|
"loss": 0.5636, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.8209999999999997, |
|
"grad_norm": 0.30580922961235046, |
|
"learning_rate": 1.7874394131999427e-06, |
|
"loss": 0.55, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.8289999999999997, |
|
"grad_norm": 0.298286497592926, |
|
"learning_rate": 1.6290706226390285e-06, |
|
"loss": 0.5306, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.8369999999999997, |
|
"grad_norm": 0.30674898624420166, |
|
"learning_rate": 1.4779893243939359e-06, |
|
"loss": 0.565, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.8449999999999998, |
|
"grad_norm": 0.30078810453414917, |
|
"learning_rate": 1.334206710834296e-06, |
|
"loss": 0.5152, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.8529999999999998, |
|
"grad_norm": 0.2936237156391144, |
|
"learning_rate": 1.1977334336302438e-06, |
|
"loss": 0.5393, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.8609999999999998, |
|
"grad_norm": 0.30025407671928406, |
|
"learning_rate": 1.068579602963371e-06, |
|
"loss": 0.5522, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.8689999999999998, |
|
"grad_norm": 0.31208300590515137, |
|
"learning_rate": 9.46754786777726e-07, |
|
"loss": 0.5616, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.877, |
|
"grad_norm": 0.29706162214279175, |
|
"learning_rate": 8.322680100710023e-07, |
|
"loss": 0.5437, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.885, |
|
"grad_norm": 0.30245542526245117, |
|
"learning_rate": 7.251277542259849e-07, |
|
"loss": 0.5608, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.893, |
|
"grad_norm": 0.2977691888809204, |
|
"learning_rate": 6.253419563821972e-07, |
|
"loss": 0.554, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.901, |
|
"grad_norm": 0.32129979133605957, |
|
"learning_rate": 5.329180088478935e-07, |
|
"loss": 0.5557, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.909, |
|
"grad_norm": 0.3130888044834137, |
|
"learning_rate": 4.4786275855247527e-07, |
|
"loss": 0.5433, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.917, |
|
"grad_norm": 0.2962971329689026, |
|
"learning_rate": 3.701825065392184e-07, |
|
"loss": 0.5362, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.925, |
|
"grad_norm": 0.29684919118881226, |
|
"learning_rate": 2.998830074984915e-07, |
|
"loss": 0.5295, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.933, |
|
"grad_norm": 0.3092200756072998, |
|
"learning_rate": 2.369694693414304e-07, |
|
"loss": 0.5645, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.941, |
|
"grad_norm": 0.2915026843547821, |
|
"learning_rate": 1.8144655281413513e-07, |
|
"loss": 0.5491, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.949, |
|
"grad_norm": 0.310531347990036, |
|
"learning_rate": 1.333183711524133e-07, |
|
"loss": 0.5447, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.957, |
|
"grad_norm": 0.28913000226020813, |
|
"learning_rate": 9.258848977700129e-08, |
|
"loss": 0.5251, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.965, |
|
"grad_norm": 0.28818029165267944, |
|
"learning_rate": 5.925992602952013e-08, |
|
"loss": 0.5324, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.973, |
|
"grad_norm": 0.31184566020965576, |
|
"learning_rate": 3.333514894887646e-08, |
|
"loss": 0.5557, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.981, |
|
"grad_norm": 0.29842570424079895, |
|
"learning_rate": 1.4816079088375567e-08, |
|
"loss": 0.5384, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.989, |
|
"grad_norm": 0.3020179867744446, |
|
"learning_rate": 3.7040883734462683e-09, |
|
"loss": 0.5352, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.997, |
|
"grad_norm": 0.28850436210632324, |
|
"learning_rate": 0.0, |
|
"loss": 0.5472, |
|
"step": 375 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 375, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 125, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.219477097775104e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|