{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.998,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008008008008008008,
      "grad_norm": 1.6445057392120361,
      "learning_rate": 2e-05,
      "loss": 2.3547,
      "step": 1
    },
    {
      "epoch": 0.016016016016016016,
      "grad_norm": 1.63363778591156,
      "learning_rate": 4e-05,
      "loss": 2.3812,
      "step": 2
    },
    {
      "epoch": 0.024024024024024024,
      "grad_norm": 1.6492197513580322,
      "learning_rate": 6e-05,
      "loss": 2.3399,
      "step": 3
    },
    {
      "epoch": 0.03203203203203203,
      "grad_norm": 1.6518611907958984,
      "learning_rate": 8e-05,
      "loss": 2.3172,
      "step": 4
    },
    {
      "epoch": 0.04004004004004004,
      "grad_norm": 1.7173571586608887,
      "learning_rate": 0.0001,
      "loss": 2.2563,
      "step": 5
    },
    {
      "epoch": 0.04804804804804805,
      "grad_norm": 1.563859224319458,
      "learning_rate": 0.00012,
      "loss": 2.0256,
      "step": 6
    },
    {
      "epoch": 0.056056056056056056,
      "grad_norm": 1.5590581893920898,
      "learning_rate": 0.00014,
      "loss": 1.8324,
      "step": 7
    },
    {
      "epoch": 0.06406406406406406,
      "grad_norm": 1.5127277374267578,
      "learning_rate": 0.00016,
      "loss": 1.5787,
      "step": 8
    },
    {
      "epoch": 0.07207207207207207,
      "grad_norm": 1.5447226762771606,
      "learning_rate": 0.00018,
      "loss": 1.3826,
      "step": 9
    },
    {
      "epoch": 0.08008008008008008,
      "grad_norm": 4.600811004638672,
      "learning_rate": 0.0002,
      "loss": 1.2388,
      "step": 10
    },
    {
      "epoch": 0.08808808808808809,
      "grad_norm": 1.6333264112472534,
      "learning_rate": 0.00019999629591162656,
      "loss": 1.0977,
      "step": 11
    },
    {
      "epoch": 0.0960960960960961,
      "grad_norm": 1.5325253009796143,
      "learning_rate": 0.00019998518392091164,
      "loss": 1.0178,
      "step": 12
    },
    {
      "epoch": 0.1041041041041041,
      "grad_norm": 1.866473913192749,
      "learning_rate": 0.00019996666485105113,
      "loss": 0.9454,
      "step": 13
    },
    {
      "epoch": 0.11211211211211211,
      "grad_norm": 1.450692892074585,
      "learning_rate": 0.0001999407400739705,
      "loss": 0.8514,
      "step": 14
    },
    {
      "epoch": 0.12012012012012012,
      "grad_norm": 5.149086952209473,
      "learning_rate": 0.00019990741151022301,
      "loss": 0.9136,
      "step": 15
    },
    {
      "epoch": 0.12812812812812813,
      "grad_norm": 0.7399551272392273,
      "learning_rate": 0.00019986668162884762,
      "loss": 0.8742,
      "step": 16
    },
    {
      "epoch": 0.13613613613613615,
      "grad_norm": 0.6142033934593201,
      "learning_rate": 0.00019981855344718588,
      "loss": 0.8082,
      "step": 17
    },
    {
      "epoch": 0.14414414414414414,
      "grad_norm": 0.47605425119400024,
      "learning_rate": 0.00019976303053065859,
      "loss": 0.8019,
      "step": 18
    },
    {
      "epoch": 0.15215215215215216,
      "grad_norm": 0.3993614614009857,
      "learning_rate": 0.00019970011699250152,
      "loss": 0.7625,
      "step": 19
    },
    {
      "epoch": 0.16016016016016016,
      "grad_norm": 0.4947199821472168,
      "learning_rate": 0.00019962981749346078,
      "loss": 0.7419,
      "step": 20
    },
    {
      "epoch": 0.16816816816816818,
      "grad_norm": 0.549526572227478,
      "learning_rate": 0.00019955213724144754,
      "loss": 0.7468,
      "step": 21
    },
    {
      "epoch": 0.17617617617617617,
      "grad_norm": 0.34314435720443726,
      "learning_rate": 0.00019946708199115211,
      "loss": 0.7482,
      "step": 22
    },
    {
      "epoch": 0.1841841841841842,
      "grad_norm": 0.38283613324165344,
      "learning_rate": 0.00019937465804361783,
      "loss": 0.7304,
      "step": 23
    },
    {
      "epoch": 0.1921921921921922,
      "grad_norm": 0.28871795535087585,
      "learning_rate": 0.00019927487224577402,
      "loss": 0.746,
      "step": 24
    },
    {
      "epoch": 0.2002002002002002,
      "grad_norm": 0.321494996547699,
      "learning_rate": 0.000199167731989929,
      "loss": 0.7461,
      "step": 25
    },
    {
      "epoch": 0.2082082082082082,
      "grad_norm": 0.315449982881546,
      "learning_rate": 0.0001990532452132223,
      "loss": 0.7286,
      "step": 26
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.2904318571090698,
      "learning_rate": 0.00019893142039703664,
      "loss": 0.7119,
      "step": 27
    },
    {
      "epoch": 0.22422422422422422,
      "grad_norm": 0.27874529361724854,
      "learning_rate": 0.00019880226656636977,
      "loss": 0.7105,
      "step": 28
    },
    {
      "epoch": 0.23223223223223224,
      "grad_norm": 0.2948579490184784,
      "learning_rate": 0.0001986657932891657,
      "loss": 0.6976,
      "step": 29
    },
    {
      "epoch": 0.24024024024024024,
      "grad_norm": 0.2542964220046997,
      "learning_rate": 0.00019852201067560606,
      "loss": 0.7351,
      "step": 30
    },
    {
      "epoch": 0.24824824824824826,
      "grad_norm": 0.2960706353187561,
      "learning_rate": 0.000198370929377361,
      "loss": 0.7179,
      "step": 31
    },
    {
      "epoch": 0.25625625625625625,
      "grad_norm": 0.24776384234428406,
      "learning_rate": 0.00019821256058680006,
      "loss": 0.7134,
      "step": 32
    },
    {
      "epoch": 0.26426426426426425,
      "grad_norm": 0.33054184913635254,
      "learning_rate": 0.00019804691603616324,
      "loss": 0.6995,
      "step": 33
    },
    {
      "epoch": 0.2722722722722723,
      "grad_norm": 0.2543237805366516,
      "learning_rate": 0.00019787400799669154,
      "loss": 0.7081,
      "step": 34
    },
    {
      "epoch": 0.2802802802802803,
      "grad_norm": 0.25240710377693176,
      "learning_rate": 0.0001976938492777182,
      "loss": 0.6928,
      "step": 35
    },
    {
      "epoch": 0.2882882882882883,
      "grad_norm": 0.35880276560783386,
      "learning_rate": 0.0001975064532257195,
      "loss": 0.7177,
      "step": 36
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.3675362467765808,
      "learning_rate": 0.0001973118337233262,
      "loss": 0.6865,
      "step": 37
    },
    {
      "epoch": 0.30430430430430433,
      "grad_norm": 0.3688451051712036,
      "learning_rate": 0.00019711000518829507,
      "loss": 0.6724,
      "step": 38
    },
    {
      "epoch": 0.3123123123123123,
      "grad_norm": 0.2982208728790283,
      "learning_rate": 0.00019690098257244064,
      "loss": 0.671,
      "step": 39
    },
    {
      "epoch": 0.3203203203203203,
      "grad_norm": 0.24197936058044434,
      "learning_rate": 0.00019668478136052774,
      "loss": 0.6777,
      "step": 40
    },
    {
      "epoch": 0.3283283283283283,
      "grad_norm": 0.748349130153656,
      "learning_rate": 0.00019646141756912434,
      "loss": 0.6641,
      "step": 41
    },
    {
      "epoch": 0.33633633633633636,
      "grad_norm": 0.5585939288139343,
      "learning_rate": 0.00019623090774541487,
      "loss": 0.6988,
      "step": 42
    },
    {
      "epoch": 0.34434434434434436,
      "grad_norm": 0.40285471081733704,
      "learning_rate": 0.00019599326896597448,
      "loss": 0.6811,
      "step": 43
    },
    {
      "epoch": 0.35235235235235235,
      "grad_norm": 0.25714346766471863,
      "learning_rate": 0.00019574851883550395,
      "loss": 0.6913,
      "step": 44
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 0.4926215708255768,
      "learning_rate": 0.00019549667548552556,
      "loss": 0.6707,
      "step": 45
    },
    {
      "epoch": 0.3683683683683684,
      "grad_norm": 0.3760850429534912,
      "learning_rate": 0.00019523775757303974,
      "loss": 0.6809,
      "step": 46
    },
    {
      "epoch": 0.3763763763763764,
      "grad_norm": 0.3734811842441559,
      "learning_rate": 0.0001949717842791432,
      "loss": 0.6386,
      "step": 47
    },
    {
      "epoch": 0.3843843843843844,
      "grad_norm": 0.3447561264038086,
      "learning_rate": 0.00019469877530760754,
      "loss": 0.6955,
      "step": 48
    },
    {
      "epoch": 0.3923923923923924,
      "grad_norm": 0.2680707573890686,
      "learning_rate": 0.00019441875088341997,
      "loss": 0.6625,
      "step": 49
    },
    {
      "epoch": 0.4004004004004004,
      "grad_norm": 0.2692941725254059,
      "learning_rate": 0.00019413173175128473,
      "loss": 0.66,
      "step": 50
    },
    {
      "epoch": 0.4084084084084084,
      "grad_norm": 0.32329630851745605,
      "learning_rate": 0.00019383773917408642,
      "loss": 0.6612,
      "step": 51
    },
    {
      "epoch": 0.4164164164164164,
      "grad_norm": 0.281435489654541,
      "learning_rate": 0.00019353679493131485,
      "loss": 0.6621,
      "step": 52
    },
    {
      "epoch": 0.4244244244244244,
      "grad_norm": 0.22186556458473206,
      "learning_rate": 0.00019322892131745135,
      "loss": 0.6465,
      "step": 53
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.2902645468711853,
      "learning_rate": 0.00019291414114031743,
      "loss": 0.6693,
      "step": 54
    },
    {
      "epoch": 0.44044044044044045,
      "grad_norm": 0.2899124324321747,
      "learning_rate": 0.000192592477719385,
      "loss": 0.6568,
      "step": 55
    },
    {
      "epoch": 0.44844844844844844,
      "grad_norm": 0.2124062478542328,
      "learning_rate": 0.00019226395488404876,
      "loss": 0.6724,
      "step": 56
    },
    {
      "epoch": 0.45645645645645644,
      "grad_norm": 0.23896393179893494,
      "learning_rate": 0.00019192859697186106,
      "loss": 0.6459,
      "step": 57
    },
    {
      "epoch": 0.4644644644644645,
      "grad_norm": 0.2762405574321747,
      "learning_rate": 0.00019158642882672873,
      "loss": 0.6498,
      "step": 58
    },
    {
      "epoch": 0.4724724724724725,
      "grad_norm": 0.2079935222864151,
      "learning_rate": 0.00019123747579707275,
      "loss": 0.6604,
      "step": 59
    },
    {
      "epoch": 0.4804804804804805,
      "grad_norm": 0.23864208161830902,
      "learning_rate": 0.0001908817637339503,
      "loss": 0.6378,
      "step": 60
    },
    {
      "epoch": 0.48848848848848847,
      "grad_norm": 0.21718506515026093,
      "learning_rate": 0.00019051931898913976,
      "loss": 0.6424,
      "step": 61
    },
    {
      "epoch": 0.4964964964964965,
      "grad_norm": 0.2773915231227875,
      "learning_rate": 0.0001901501684131884,
      "loss": 0.6474,
      "step": 62
    },
    {
      "epoch": 0.5045045045045045,
      "grad_norm": 0.23982493579387665,
      "learning_rate": 0.0001897743393534234,
      "loss": 0.6256,
      "step": 63
    },
    {
      "epoch": 0.5125125125125125,
      "grad_norm": 0.23621873557567596,
      "learning_rate": 0.0001893918596519257,
      "loss": 0.6403,
      "step": 64
    },
    {
      "epoch": 0.5205205205205206,
      "grad_norm": 0.22759953141212463,
      "learning_rate": 0.00018900275764346768,
      "loss": 0.6484,
      "step": 65
    },
    {
      "epoch": 0.5285285285285285,
      "grad_norm": 0.26695549488067627,
      "learning_rate": 0.00018860706215341382,
      "loss": 0.609,
      "step": 66
    },
    {
      "epoch": 0.5365365365365365,
      "grad_norm": 0.24594709277153015,
      "learning_rate": 0.00018820480249558537,
      "loss": 0.6338,
      "step": 67
    },
    {
      "epoch": 0.5445445445445446,
      "grad_norm": 0.22960062325000763,
      "learning_rate": 0.00018779600847008884,
      "loss": 0.6166,
      "step": 68
    },
    {
      "epoch": 0.5525525525525525,
      "grad_norm": 0.25302109122276306,
      "learning_rate": 0.00018738071036110808,
      "loss": 0.6422,
      "step": 69
    },
    {
      "epoch": 0.5605605605605606,
      "grad_norm": 0.3339892327785492,
      "learning_rate": 0.0001869589389346611,
      "loss": 0.6558,
      "step": 70
    },
    {
      "epoch": 0.5685685685685685,
      "grad_norm": 0.21397258341312408,
      "learning_rate": 0.00018653072543632062,
      "loss": 0.6323,
      "step": 71
    },
    {
      "epoch": 0.5765765765765766,
      "grad_norm": 0.2514493465423584,
      "learning_rate": 0.00018609610158889942,
      "loss": 0.657,
      "step": 72
    },
    {
      "epoch": 0.5845845845845846,
      "grad_norm": 0.25317835807800293,
      "learning_rate": 0.00018565509959010036,
      "loss": 0.641,
      "step": 73
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.22669494152069092,
      "learning_rate": 0.00018520775211013093,
      "loss": 0.6369,
      "step": 74
    },
    {
      "epoch": 0.6006006006006006,
      "grad_norm": 0.2214743047952652,
      "learning_rate": 0.00018475409228928312,
      "loss": 0.6307,
      "step": 75
    },
    {
      "epoch": 0.6086086086086087,
      "grad_norm": 0.24376747012138367,
      "learning_rate": 0.00018429415373547828,
      "loss": 0.6557,
      "step": 76
    },
    {
      "epoch": 0.6166166166166166,
      "grad_norm": 0.2158333659172058,
      "learning_rate": 0.00018382797052177746,
      "loss": 0.655,
      "step": 77
    },
    {
      "epoch": 0.6246246246246246,
      "grad_norm": 0.25565382838249207,
      "learning_rate": 0.000183355577183857,
      "loss": 0.6299,
      "step": 78
    },
    {
      "epoch": 0.6326326326326326,
      "grad_norm": 0.20636747777462006,
      "learning_rate": 0.00018287700871745036,
      "loss": 0.6283,
      "step": 79
    },
    {
      "epoch": 0.6406406406406406,
      "grad_norm": 0.21258121728897095,
      "learning_rate": 0.00018239230057575542,
      "loss": 0.6174,
      "step": 80
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.2861458957195282,
      "learning_rate": 0.00018190148866680802,
      "loss": 0.6547,
      "step": 81
    },
    {
      "epoch": 0.6566566566566566,
      "grad_norm": 0.23667441308498383,
      "learning_rate": 0.0001814046093508218,
      "loss": 0.6416,
      "step": 82
    },
    {
      "epoch": 0.6646646646646647,
      "grad_norm": 0.23191799223423004,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.642,
      "step": 83
    },
    {
      "epoch": 0.6726726726726727,
      "grad_norm": 0.2622171938419342,
      "learning_rate": 0.00018039279618328212,
      "loss": 0.6241,
      "step": 84
    },
    {
      "epoch": 0.6806806806806807,
      "grad_norm": 0.2891266345977783,
      "learning_rate": 0.00017987793728863651,
      "loss": 0.6284,
      "step": 85
    },
    {
      "epoch": 0.6886886886886887,
      "grad_norm": 0.26767420768737793,
      "learning_rate": 0.00017935716089521474,
      "loss": 0.627,
      "step": 86
    },
    {
      "epoch": 0.6966966966966966,
      "grad_norm": 0.2828672230243683,
      "learning_rate": 0.00017883050558305255,
      "loss": 0.6418,
      "step": 87
    },
    {
      "epoch": 0.7047047047047047,
      "grad_norm": 0.32730573415756226,
      "learning_rate": 0.00017829801036770628,
      "loss": 0.6629,
      "step": 88
    },
    {
      "epoch": 0.7127127127127127,
      "grad_norm": 0.24029900133609772,
      "learning_rate": 0.0001777597146973627,
      "loss": 0.614,
      "step": 89
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 0.2929212152957916,
      "learning_rate": 0.00017721565844991643,
      "loss": 0.632,
      "step": 90
    },
    {
      "epoch": 0.7287287287287287,
      "grad_norm": 0.2860666513442993,
      "learning_rate": 0.00017666588193001595,
      "loss": 0.6289,
      "step": 91
    },
    {
      "epoch": 0.7367367367367368,
      "grad_norm": 0.23325330018997192,
      "learning_rate": 0.00017611042586607748,
      "loss": 0.6392,
      "step": 92
    },
    {
      "epoch": 0.7447447447447447,
      "grad_norm": 0.3126169443130493,
      "learning_rate": 0.00017554933140726802,
      "loss": 0.6422,
      "step": 93
    },
    {
      "epoch": 0.7527527527527528,
      "grad_norm": 0.26704883575439453,
      "learning_rate": 0.00017498264012045687,
      "loss": 0.6166,
      "step": 94
    },
    {
      "epoch": 0.7607607607607607,
      "grad_norm": 0.2184283286333084,
      "learning_rate": 0.00017441039398713608,
      "loss": 0.6235,
      "step": 95
    },
    {
      "epoch": 0.7687687687687688,
      "grad_norm": 0.23906390368938446,
      "learning_rate": 0.00017383263540031067,
      "loss": 0.6643,
      "step": 96
    },
    {
      "epoch": 0.7767767767767768,
      "grad_norm": 0.26839691400527954,
      "learning_rate": 0.0001732494071613579,
      "loss": 0.6514,
      "step": 97
    },
    {
      "epoch": 0.7847847847847848,
      "grad_norm": 0.2805701494216919,
      "learning_rate": 0.00017266075247685656,
      "loss": 0.6168,
      "step": 98
    },
    {
      "epoch": 0.7927927927927928,
      "grad_norm": 0.21650992333889008,
      "learning_rate": 0.00017206671495538612,
      "loss": 0.5983,
      "step": 99
    },
    {
      "epoch": 0.8008008008008008,
      "grad_norm": 0.2302800416946411,
      "learning_rate": 0.00017146733860429612,
      "loss": 0.6301,
      "step": 100
    },
    {
      "epoch": 0.8088088088088088,
      "grad_norm": 0.29078415036201477,
      "learning_rate": 0.000170862667826446,
      "loss": 0.616,
      "step": 101
    },
    {
      "epoch": 0.8168168168168168,
      "grad_norm": 0.24860034883022308,
      "learning_rate": 0.0001702527474169157,
      "loss": 0.6352,
      "step": 102
    },
    {
      "epoch": 0.8248248248248248,
      "grad_norm": 0.26281973719596863,
      "learning_rate": 0.00016963762255968722,
      "loss": 0.6218,
      "step": 103
    },
    {
      "epoch": 0.8328328328328328,
      "grad_norm": 0.29051998257637024,
      "learning_rate": 0.0001690173388242972,
      "loss": 0.6233,
      "step": 104
    },
    {
      "epoch": 0.8408408408408409,
      "grad_norm": 0.2471507042646408,
      "learning_rate": 0.00016839194216246108,
      "loss": 0.6147,
      "step": 105
    },
    {
      "epoch": 0.8488488488488488,
      "grad_norm": 0.2574704587459564,
      "learning_rate": 0.0001677614789046689,
      "loss": 0.6174,
      "step": 106
    },
    {
      "epoch": 0.8568568568568569,
      "grad_norm": 0.2551233172416687,
      "learning_rate": 0.00016712599575675316,
      "loss": 0.5989,
      "step": 107
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.2901318371295929,
      "learning_rate": 0.00016648553979642868,
      "loss": 0.6241,
      "step": 108
    },
    {
      "epoch": 0.8728728728728729,
      "grad_norm": 0.23769080638885498,
      "learning_rate": 0.0001658401584698049,
      "loss": 0.6044,
      "step": 109
    },
    {
      "epoch": 0.8808808808808809,
      "grad_norm": 0.2580976188182831,
      "learning_rate": 0.00016518989958787126,
      "loss": 0.622,
      "step": 110
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.24077744781970978,
      "learning_rate": 0.00016453481132295506,
      "loss": 0.6047,
      "step": 111
    },
    {
      "epoch": 0.8968968968968969,
      "grad_norm": 0.228902667760849,
      "learning_rate": 0.00016387494220515274,
      "loss": 0.6138,
      "step": 112
    },
    {
      "epoch": 0.9049049049049049,
      "grad_norm": 0.2607581317424774,
      "learning_rate": 0.00016321034111873488,
      "loss": 0.6307,
      "step": 113
    },
    {
      "epoch": 0.9129129129129129,
      "grad_norm": 0.2575569450855255,
      "learning_rate": 0.00016254105729852464,
      "loss": 0.6008,
      "step": 114
    },
    {
      "epoch": 0.9209209209209209,
      "grad_norm": 0.231553852558136,
      "learning_rate": 0.00016186714032625035,
      "loss": 0.617,
      "step": 115
    },
    {
      "epoch": 0.928928928928929,
      "grad_norm": 0.24820354580879211,
      "learning_rate": 0.00016118864012687245,
      "loss": 0.5991,
      "step": 116
    },
    {
      "epoch": 0.9369369369369369,
      "grad_norm": 0.2364109754562378,
      "learning_rate": 0.00016050560696488492,
      "loss": 0.6094,
      "step": 117
    },
    {
      "epoch": 0.944944944944945,
      "grad_norm": 0.2492029368877411,
      "learning_rate": 0.00015981809144059166,
      "loss": 0.6143,
      "step": 118
    },
    {
      "epoch": 0.9529529529529529,
      "grad_norm": 0.27745717763900757,
      "learning_rate": 0.00015912614448635782,
      "loss": 0.6203,
      "step": 119
    },
    {
      "epoch": 0.960960960960961,
      "grad_norm": 0.2555610239505768,
      "learning_rate": 0.00015842981736283686,
      "loss": 0.6314,
      "step": 120
    },
    {
      "epoch": 0.968968968968969,
      "grad_norm": 0.2268420308828354,
      "learning_rate": 0.00015772916165517273,
      "loss": 0.6155,
      "step": 121
    },
    {
      "epoch": 0.9769769769769769,
      "grad_norm": 0.250041127204895,
      "learning_rate": 0.00015702422926917872,
      "loss": 0.6226,
      "step": 122
    },
    {
      "epoch": 0.984984984984985,
      "grad_norm": 0.2596072554588318,
      "learning_rate": 0.00015631507242749187,
      "loss": 0.6086,
      "step": 123
    },
    {
      "epoch": 0.992992992992993,
      "grad_norm": 0.2280743271112442,
      "learning_rate": 0.00015560174366570446,
      "loss": 0.5994,
      "step": 124
    },
    {
      "epoch": 1.001001001001001,
      "grad_norm": 0.23362237215042114,
      "learning_rate": 0.00015488429582847192,
      "loss": 0.616,
      "step": 125
    },
    {
      "epoch": 1.006,
      "grad_norm": 0.2956937849521637,
      "learning_rate": 0.00015416278206559816,
      "loss": 0.6038,
      "step": 126
    },
    {
      "epoch": 1.014,
      "grad_norm": 0.250629723072052,
      "learning_rate": 0.0001534372558280979,
      "loss": 0.5991,
      "step": 127
    },
    {
      "epoch": 1.022,
      "grad_norm": 0.231906458735466,
      "learning_rate": 0.00015270777086423722,
      "loss": 0.6088,
      "step": 128
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.2888093590736389,
      "learning_rate": 0.0001519743812155516,
      "loss": 0.5892,
      "step": 129
    },
    {
      "epoch": 1.038,
      "grad_norm": 0.24940524995326996,
      "learning_rate": 0.0001512371412128424,
      "loss": 0.5982,
      "step": 130
    },
    {
      "epoch": 1.046,
      "grad_norm": 0.24017778038978577,
      "learning_rate": 0.00015049610547215205,
      "loss": 0.5608,
      "step": 131
    },
    {
      "epoch": 1.054,
      "grad_norm": 0.2334035485982895,
      "learning_rate": 0.00014975132889071807,
      "loss": 0.6034,
      "step": 132
    },
    {
      "epoch": 1.062,
      "grad_norm": 0.2773897349834442,
      "learning_rate": 0.00014900286664290592,
      "loss": 0.6387,
      "step": 133
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.24266445636749268,
      "learning_rate": 0.00014825077417612186,
      "loss": 0.5612,
      "step": 134
    },
    {
      "epoch": 1.078,
      "grad_norm": 0.22919470071792603,
      "learning_rate": 0.00014749510720670506,
      "loss": 0.599,
      "step": 135
    },
    {
      "epoch": 1.086,
      "grad_norm": 0.23829148709774017,
      "learning_rate": 0.00014673592171580025,
      "loss": 0.6066,
      "step": 136
    },
    {
      "epoch": 1.094,
      "grad_norm": 0.31981223821640015,
      "learning_rate": 0.00014597327394521044,
      "loss": 0.5692,
      "step": 137
    },
    {
      "epoch": 1.102,
      "grad_norm": 0.2747564911842346,
      "learning_rate": 0.00014520722039323045,
      "loss": 0.62,
      "step": 138
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.2592499852180481,
      "learning_rate": 0.00014443781781046136,
      "loss": 0.5937,
      "step": 139
    },
    {
      "epoch": 1.1179999999999999,
      "grad_norm": 0.31891530752182007,
      "learning_rate": 0.0001436651231956064,
      "loss": 0.5973,
      "step": 140
    },
    {
      "epoch": 1.126,
      "grad_norm": 0.2743702232837677,
      "learning_rate": 0.00014288919379124837,
      "loss": 0.6045,
      "step": 141
    },
    {
      "epoch": 1.134,
      "grad_norm": 0.2665708661079407,
      "learning_rate": 0.00014211008707960897,
      "loss": 0.5898,
      "step": 142
    },
    {
      "epoch": 1.142,
      "grad_norm": 0.33267003297805786,
      "learning_rate": 0.00014132786077829043,
      "loss": 0.5945,
      "step": 143
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.28636589646339417,
      "learning_rate": 0.00014054257283599973,
      "loss": 0.5914,
      "step": 144
    },
    {
      "epoch": 1.158,
      "grad_norm": 0.27305400371551514,
      "learning_rate": 0.0001397542814282556,
      "loss": 0.6093,
      "step": 145
    },
    {
      "epoch": 1.166,
      "grad_norm": 0.2839919924736023,
      "learning_rate": 0.0001389630449530788,
      "loss": 0.6074,
      "step": 146
    },
    {
      "epoch": 1.174,
      "grad_norm": 0.25652188062667847,
      "learning_rate": 0.0001381689220266659,
      "loss": 0.6059,
      "step": 147
    },
    {
      "epoch": 1.182,
      "grad_norm": 0.2549704909324646,
      "learning_rate": 0.0001373719714790469,
      "loss": 0.5568,
      "step": 148
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.2509196400642395,
      "learning_rate": 0.00013657225234972695,
      "loss": 0.5968,
      "step": 149
    },
    {
      "epoch": 1.198,
      "grad_norm": 0.25909289717674255,
      "learning_rate": 0.0001357698238833126,
      "loss": 0.5902,
      "step": 150
    },
    {
      "epoch": 1.206,
      "grad_norm": 0.26162394881248474,
      "learning_rate": 0.00013496474552512287,
      "loss": 0.5763,
      "step": 151
    },
    {
      "epoch": 1.214,
      "grad_norm": 0.2721655070781708,
      "learning_rate": 0.00013415707691678556,
      "loss": 0.6037,
      "step": 152
    },
    {
      "epoch": 1.222,
      "grad_norm": 0.28691592812538147,
      "learning_rate": 0.0001333468778918187,
      "loss": 0.625,
      "step": 153
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.25801119208335876,
      "learning_rate": 0.00013253420847119803,
      "loss": 0.5994,
      "step": 154
    },
    {
      "epoch": 1.238,
      "grad_norm": 0.2609824538230896,
      "learning_rate": 0.00013171912885891063,
      "loss": 0.5999,
      "step": 155
    },
    {
      "epoch": 1.246,
      "grad_norm": 0.2631840407848358,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.5844,
      "step": 156
    },
    {
      "epoch": 1.254,
      "grad_norm": 0.2862647473812103,
      "learning_rate": 0.00013008198076356676,
      "loss": 0.5653,
      "step": 157
    },
    {
      "epoch": 1.262,
      "grad_norm": 0.27327024936676025,
      "learning_rate": 0.00012926003356333488,
      "loss": 0.5933,
      "step": 158
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.2684379518032074,
      "learning_rate": 0.0001284359187281004,
      "loss": 0.5842,
      "step": 159
    },
    {
      "epoch": 1.278,
      "grad_norm": 0.2620231807231903,
      "learning_rate": 0.00012760969730974694,
      "loss": 0.6079,
      "step": 160
    },
    {
      "epoch": 1.286,
      "grad_norm": 0.2584543526172638,
      "learning_rate": 0.00012678143051621742,
      "loss": 0.5941,
      "step": 161
    },
    {
      "epoch": 1.294,
      "grad_norm": 0.26279789209365845,
      "learning_rate": 0.00012595117970697997,
      "loss": 0.6086,
      "step": 162
    },
    {
      "epoch": 1.302,
      "grad_norm": 0.2541520893573761,
      "learning_rate": 0.00012511900638848195,
      "loss": 0.5907,
      "step": 163
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.27384746074676514,
      "learning_rate": 0.0001242849722095936,
      "loss": 0.6055,
      "step": 164
    },
    {
      "epoch": 1.318,
      "grad_norm": 0.25682300329208374,
      "learning_rate": 0.00012344913895704097,
      "loss": 0.609,
      "step": 165
    },
    {
      "epoch": 1.326,
      "grad_norm": 0.25404492020606995,
      "learning_rate": 0.00012261156855082882,
      "loss": 0.6121,
      "step": 166
    },
    {
      "epoch": 1.334,
      "grad_norm": 0.2649850845336914,
      "learning_rate": 0.0001217723230396532,
      "loss": 0.5695,
      "step": 167
    },
    {
      "epoch": 1.342,
      "grad_norm": 0.28349268436431885,
      "learning_rate": 0.00012093146459630487,
      "loss": 0.5883,
      "step": 168
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.27068182826042175,
      "learning_rate": 0.00012008905551306356,
      "loss": 0.6147,
      "step": 169
    },
    {
      "epoch": 1.358,
      "grad_norm": 0.29566338658332825,
      "learning_rate": 0.000119245158197083,
      "loss": 0.5901,
      "step": 170
    },
    {
      "epoch": 1.366,
      "grad_norm": 0.27946797013282776,
      "learning_rate": 0.00011839983516576802,
      "loss": 0.5831,
      "step": 171
    },
    {
      "epoch": 1.374,
      "grad_norm": 0.25005409121513367,
      "learning_rate": 0.00011755314904214284,
      "loss": 0.5656,
      "step": 172
    },
    {
      "epoch": 1.3820000000000001,
      "grad_norm": 0.2621053159236908,
      "learning_rate": 0.00011670516255021193,
      "loss": 0.5769,
      "step": 173
    },
    {
      "epoch": 1.3900000000000001,
      "grad_norm": 0.2531629502773285,
      "learning_rate": 0.00011585593851031347,
      "loss": 0.5641,
      "step": 174
    },
    {
      "epoch": 1.3980000000000001,
      "grad_norm": 0.3026553690433502,
      "learning_rate": 0.00011500553983446527,
      "loss": 0.6038,
      "step": 175
    },
    {
      "epoch": 1.4060000000000001,
      "grad_norm": 0.2748616337776184,
      "learning_rate": 0.00011415402952170433,
      "loss": 0.5808,
      "step": 176
    },
    {
      "epoch": 1.414,
      "grad_norm": 0.27345311641693115,
      "learning_rate": 0.0001133014706534196,
      "loss": 0.5848,
      "step": 177
    },
    {
      "epoch": 1.422,
      "grad_norm": 0.273357629776001,
      "learning_rate": 0.00011244792638867893,
      "loss": 0.5906,
      "step": 178
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.26230183243751526,
      "learning_rate": 0.00011159345995955006,
      "loss": 0.5886,
      "step": 179
    },
    {
      "epoch": 1.438,
      "grad_norm": 0.2515832483768463,
      "learning_rate": 0.00011073813466641632,
      "loss": 0.5876,
      "step": 180
    },
    {
      "epoch": 1.446,
      "grad_norm": 0.27451092004776,
      "learning_rate": 0.00010988201387328717,
      "loss": 0.5841,
      "step": 181
    },
    {
      "epoch": 1.454,
      "grad_norm": 0.2588571012020111,
      "learning_rate": 0.00010902516100310411,
      "loss": 0.5674,
      "step": 182
    },
    {
      "epoch": 1.462,
      "grad_norm": 0.2440604865550995,
      "learning_rate": 0.00010816763953304227,
      "loss": 0.5646,
      "step": 183
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.2681560516357422,
      "learning_rate": 0.00010730951298980776,
      "loss": 0.5671,
      "step": 184
    },
    {
      "epoch": 1.478,
      "grad_norm": 0.297048419713974,
      "learning_rate": 0.00010645084494493165,
      "loss": 0.583,
      "step": 185
    },
    {
      "epoch": 1.486,
      "grad_norm": 0.29275089502334595,
      "learning_rate": 0.00010559169901006034,
      "loss": 0.6007,
      "step": 186
    },
    {
      "epoch": 1.494,
      "grad_norm": 0.2599998414516449,
      "learning_rate": 0.0001047321388322432,
      "loss": 0.5692,
      "step": 187
    },
    {
      "epoch": 1.502,
      "grad_norm": 0.2714841663837433,
      "learning_rate": 0.00010387222808921746,
      "loss": 0.5996,
      "step": 188
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.2618089020252228,
      "learning_rate": 0.00010301203048469083,
      "loss": 0.5861,
      "step": 189
    },
    {
      "epoch": 1.518,
      "grad_norm": 0.27270275354385376,
      "learning_rate": 0.00010215160974362223,
      "loss": 0.5771,
      "step": 190
    },
    {
      "epoch": 1.526,
      "grad_norm": 0.2583703398704529,
      "learning_rate": 0.00010129102960750092,
      "loss": 0.5897,
      "step": 191
    },
    {
      "epoch": 1.534,
      "grad_norm": 0.25854265689849854,
      "learning_rate": 0.00010043035382962443,
      "loss": 0.5778,
      "step": 192
    },
    {
      "epoch": 1.542,
      "grad_norm": 0.27234703302383423,
      "learning_rate": 9.956964617037558e-05,
      "loss": 0.5659,
      "step": 193
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.2672337293624878,
      "learning_rate": 9.870897039249911e-05,
      "loss": 0.5792,
      "step": 194
    },
    {
      "epoch": 1.558,
      "grad_norm": 0.2644350528717041,
      "learning_rate": 9.784839025637778e-05,
      "loss": 0.5798,
      "step": 195
    },
    {
      "epoch": 1.5659999999999998,
      "grad_norm": 0.2572389543056488,
      "learning_rate": 9.698796951530919e-05,
      "loss": 0.6008,
      "step": 196
    },
    {
      "epoch": 1.5739999999999998,
      "grad_norm": 0.2693156898021698,
      "learning_rate": 9.612777191078258e-05,
      "loss": 0.5903,
      "step": 197
    },
    {
      "epoch": 1.5819999999999999,
      "grad_norm": 0.26244068145751953,
      "learning_rate": 9.526786116775682e-05,
      "loss": 0.569,
      "step": 198
    },
    {
      "epoch": 1.5899999999999999,
      "grad_norm": 0.25875815749168396,
      "learning_rate": 9.440830098993969e-05,
      "loss": 0.6042,
      "step": 199
    },
    {
      "epoch": 1.5979999999999999,
      "grad_norm": 0.27209436893463135,
      "learning_rate": 9.354915505506839e-05,
      "loss": 0.5846,
      "step": 200
    },
    {
      "epoch": 1.6059999999999999,
      "grad_norm": 0.270780473947525,
      "learning_rate": 9.269048701019226e-05,
      "loss": 0.5957,
      "step": 201
    },
    {
      "epoch": 1.6139999999999999,
      "grad_norm": 0.25013670325279236,
      "learning_rate": 9.183236046695777e-05,
      "loss": 0.5845,
      "step": 202
    },
    {
      "epoch": 1.6219999999999999,
      "grad_norm": 0.2716957628726959,
      "learning_rate": 9.09748389968959e-05,
      "loss": 0.5584,
      "step": 203
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.2737436890602112,
      "learning_rate": 9.011798612671286e-05,
      "loss": 0.5836,
      "step": 204
    },
    {
      "epoch": 1.638,
      "grad_norm": 0.2748481333255768,
      "learning_rate": 8.92618653335837e-05,
      "loss": 0.5927,
      "step": 205
    },
    {
      "epoch": 1.646,
      "grad_norm": 0.2642996609210968,
      "learning_rate": 8.840654004044996e-05,
      "loss": 0.6088,
      "step": 206
    },
    {
      "epoch": 1.654,
      "grad_norm": 0.26341068744659424,
      "learning_rate": 8.755207361132108e-05,
      "loss": 0.5841,
      "step": 207
    },
    {
      "epoch": 1.662,
      "grad_norm": 0.30515289306640625,
      "learning_rate": 8.669852934658042e-05,
      "loss": 0.5525,
      "step": 208
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.2801468074321747,
      "learning_rate": 8.58459704782957e-05,
      "loss": 0.5547,
      "step": 209
    },
    {
      "epoch": 1.678,
      "grad_norm": 0.27618443965911865,
      "learning_rate": 8.499446016553474e-05,
      "loss": 0.5613,
      "step": 210
    },
    {
      "epoch": 1.686,
      "grad_norm": 0.26961931586265564,
      "learning_rate": 8.414406148968657e-05,
      "loss": 0.5639,
      "step": 211
    },
    {
      "epoch": 1.694,
      "grad_norm": 0.29023805260658264,
      "learning_rate": 8.32948374497881e-05,
      "loss": 0.5878,
      "step": 212
    },
    {
      "epoch": 1.702,
      "grad_norm": 0.2671249508857727,
      "learning_rate": 8.244685095785719e-05,
      "loss": 0.5743,
      "step": 213
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.26124948263168335,
      "learning_rate": 8.160016483423199e-05,
      "loss": 0.5801,
      "step": 214
    },
    {
      "epoch": 1.718,
      "grad_norm": 0.2721916437149048,
      "learning_rate": 8.075484180291701e-05,
      "loss": 0.5975,
      "step": 215
    },
    {
      "epoch": 1.726,
      "grad_norm": 0.2630290687084198,
      "learning_rate": 7.991094448693648e-05,
      "loss": 0.5714,
      "step": 216
    },
    {
      "epoch": 1.734,
      "grad_norm": 0.25962400436401367,
      "learning_rate": 7.906853540369514e-05,
      "loss": 0.5912,
      "step": 217
    },
    {
      "epoch": 1.742,
      "grad_norm": 0.26142629981040955,
      "learning_rate": 7.822767696034682e-05,
      "loss": 0.577,
      "step": 218
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.26839005947113037,
      "learning_rate": 7.738843144917119e-05,
      "loss": 0.5747,
      "step": 219
    },
    {
      "epoch": 1.758,
      "grad_norm": 0.28053659200668335,
      "learning_rate": 7.655086104295904e-05,
      "loss": 0.612,
      "step": 220
    },
    {
      "epoch": 1.766,
      "grad_norm": 0.2728564143180847,
      "learning_rate": 7.571502779040645e-05,
      "loss": 0.5895,
      "step": 221
    },
    {
      "epoch": 1.774,
      "grad_norm": 0.26337721943855286,
      "learning_rate": 7.48809936115181e-05,
      "loss": 0.5666,
      "step": 222
    },
    {
      "epoch": 1.782,
      "grad_norm": 0.2544887661933899,
      "learning_rate": 7.404882029302003e-05,
      "loss": 0.5552,
      "step": 223
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.2690827250480652,
      "learning_rate": 7.321856948378259e-05,
      "loss": 0.5559,
      "step": 224
    },
    {
      "epoch": 1.798,
      "grad_norm": 0.27959364652633667,
      "learning_rate": 7.239030269025311e-05,
      "loss": 0.5738,
      "step": 225
    },
    {
      "epoch": 1.806,
      "grad_norm": 0.29118430614471436,
      "learning_rate": 7.156408127189965e-05,
      "loss": 0.5753,
      "step": 226
    },
    {
      "epoch": 1.814,
      "grad_norm": 0.2585116922855377,
      "learning_rate": 7.073996643666517e-05,
      "loss": 0.5499,
      "step": 227
    },
    {
      "epoch": 1.822,
      "grad_norm": 0.24918395280838013,
      "learning_rate": 6.991801923643324e-05,
      "loss": 0.5792,
      "step": 228
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.2664697766304016,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.582,
      "step": 229
    },
    {
      "epoch": 1.838,
      "grad_norm": 0.2679463326931,
      "learning_rate": 6.82808711410894e-05,
      "loss": 0.5919,
      "step": 230
    },
    {
      "epoch": 1.846,
      "grad_norm": 0.26800620555877686,
      "learning_rate": 6.746579152880201e-05,
      "loss": 0.5774,
      "step": 231
    },
    {
      "epoch": 1.854,
      "grad_norm": 0.25649094581604004,
      "learning_rate": 6.665312210818131e-05,
      "loss": 0.5569,
      "step": 232
    },
    {
      "epoch": 1.862,
      "grad_norm": 0.2661426067352295,
      "learning_rate": 6.584292308321445e-05,
      "loss": 0.5759,
      "step": 233
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.27011772990226746,
      "learning_rate": 6.503525447487715e-05,
      "loss": 0.5752,
      "step": 234
    },
    {
      "epoch": 1.8780000000000001,
      "grad_norm": 0.28006768226623535,
      "learning_rate": 6.423017611668745e-05,
      "loss": 0.5873,
      "step": 235
    },
    {
      "epoch": 1.8860000000000001,
      "grad_norm": 0.281974196434021,
      "learning_rate": 6.342774765027309e-05,
      "loss": 0.5868,
      "step": 236
    },
    {
      "epoch": 1.8940000000000001,
      "grad_norm": 0.28022536635398865,
      "learning_rate": 6.262802852095311e-05,
      "loss": 0.5505,
      "step": 237
    },
    {
      "epoch": 1.9020000000000001,
      "grad_norm": 0.2971389889717102,
      "learning_rate": 6.18310779733341e-05,
      "loss": 0.5879,
      "step": 238
    },
    {
      "epoch": 1.9100000000000001,
      "grad_norm": 0.2779372036457062,
      "learning_rate": 6.103695504692122e-05,
      "loss": 0.5648,
      "step": 239
    },
    {
      "epoch": 1.9180000000000001,
      "grad_norm": 0.3032248616218567,
      "learning_rate": 6.024571857174443e-05,
      "loss": 0.5884,
      "step": 240
    },
    {
      "epoch": 1.9260000000000002,
      "grad_norm": 0.26035642623901367,
      "learning_rate": 5.94574271640003e-05,
      "loss": 0.5837,
      "step": 241
    },
    {
      "epoch": 1.9340000000000002,
      "grad_norm": 0.2789236009120941,
      "learning_rate": 5.8672139221709577e-05,
      "loss": 0.5745,
      "step": 242
    },
    {
      "epoch": 1.942,
      "grad_norm": 0.27459922432899475,
      "learning_rate": 5.788991292039103e-05,
      "loss": 0.568,
      "step": 243
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.2857699394226074,
      "learning_rate": 5.7110806208751655e-05,
      "loss": 0.5619,
      "step": 244
    },
    {
      "epoch": 1.958,
      "grad_norm": 0.2715800106525421,
      "learning_rate": 5.633487680439361e-05,
      "loss": 0.5763,
      "step": 245
    },
    {
      "epoch": 1.966,
      "grad_norm": 0.27073079347610474,
      "learning_rate": 5.556218218953868e-05,
      "loss": 0.5815,
      "step": 246
    },
    {
      "epoch": 1.974,
      "grad_norm": 0.2630924880504608,
      "learning_rate": 5.479277960676958e-05,
      "loss": 0.5735,
      "step": 247
    },
    {
      "epoch": 1.982,
      "grad_norm": 0.2807694971561432,
      "learning_rate": 5.40267260547896e-05,
      "loss": 0.5611,
      "step": 248
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.28481224179267883,
      "learning_rate": 5.326407828419979e-05,
      "loss": 0.5671,
      "step": 249
    },
    {
      "epoch": 1.998,
      "grad_norm": 0.27675607800483704,
      "learning_rate": 5.2504892793295e-05,
      "loss": 0.5825,
      "step": 250
    }
  ],
  "logging_steps": 1,
  "max_steps": 375,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 125,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.479651398516736e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}