{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.001001001001001,
  "eval_steps": 500,
  "global_step": 125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008008008008008008,
      "grad_norm": 1.6445057392120361,
      "learning_rate": 2e-05,
      "loss": 2.3547,
      "step": 1
    },
    {
      "epoch": 0.016016016016016016,
      "grad_norm": 1.63363778591156,
      "learning_rate": 4e-05,
      "loss": 2.3812,
      "step": 2
    },
    {
      "epoch": 0.024024024024024024,
      "grad_norm": 1.6492197513580322,
      "learning_rate": 6e-05,
      "loss": 2.3399,
      "step": 3
    },
    {
      "epoch": 0.03203203203203203,
      "grad_norm": 1.6518611907958984,
      "learning_rate": 8e-05,
      "loss": 2.3172,
      "step": 4
    },
    {
      "epoch": 0.04004004004004004,
      "grad_norm": 1.7173571586608887,
      "learning_rate": 0.0001,
      "loss": 2.2563,
      "step": 5
    },
    {
      "epoch": 0.04804804804804805,
      "grad_norm": 1.563859224319458,
      "learning_rate": 0.00012,
      "loss": 2.0256,
      "step": 6
    },
    {
      "epoch": 0.056056056056056056,
      "grad_norm": 1.5590581893920898,
      "learning_rate": 0.00014,
      "loss": 1.8324,
      "step": 7
    },
    {
      "epoch": 0.06406406406406406,
      "grad_norm": 1.5127277374267578,
      "learning_rate": 0.00016,
      "loss": 1.5787,
      "step": 8
    },
    {
      "epoch": 0.07207207207207207,
      "grad_norm": 1.5447226762771606,
      "learning_rate": 0.00018,
      "loss": 1.3826,
      "step": 9
    },
    {
      "epoch": 0.08008008008008008,
      "grad_norm": 4.600811004638672,
      "learning_rate": 0.0002,
      "loss": 1.2388,
      "step": 10
    },
    {
      "epoch": 0.08808808808808809,
      "grad_norm": 1.6333264112472534,
      "learning_rate": 0.00019999629591162656,
      "loss": 1.0977,
      "step": 11
    },
    {
      "epoch": 0.0960960960960961,
      "grad_norm": 1.5325253009796143,
      "learning_rate": 0.00019998518392091164,
      "loss": 1.0178,
      "step": 12
    },
    {
      "epoch": 0.1041041041041041,
      "grad_norm": 1.866473913192749,
      "learning_rate": 0.00019996666485105113,
      "loss": 0.9454,
      "step": 13
    },
    {
      "epoch": 0.11211211211211211,
      "grad_norm": 1.450692892074585,
      "learning_rate": 0.0001999407400739705,
      "loss": 0.8514,
      "step": 14
    },
    {
      "epoch": 0.12012012012012012,
      "grad_norm": 5.149086952209473,
      "learning_rate": 0.00019990741151022301,
      "loss": 0.9136,
      "step": 15
    },
    {
      "epoch": 0.12812812812812813,
      "grad_norm": 0.7399551272392273,
      "learning_rate": 0.00019986668162884762,
      "loss": 0.8742,
      "step": 16
    },
    {
      "epoch": 0.13613613613613615,
      "grad_norm": 0.6142033934593201,
      "learning_rate": 0.00019981855344718588,
      "loss": 0.8082,
      "step": 17
    },
    {
      "epoch": 0.14414414414414414,
      "grad_norm": 0.47605425119400024,
      "learning_rate": 0.00019976303053065859,
      "loss": 0.8019,
      "step": 18
    },
    {
      "epoch": 0.15215215215215216,
      "grad_norm": 0.3993614614009857,
      "learning_rate": 0.00019970011699250152,
      "loss": 0.7625,
      "step": 19
    },
    {
      "epoch": 0.16016016016016016,
      "grad_norm": 0.4947199821472168,
      "learning_rate": 0.00019962981749346078,
      "loss": 0.7419,
      "step": 20
    },
    {
      "epoch": 0.16816816816816818,
      "grad_norm": 0.549526572227478,
      "learning_rate": 0.00019955213724144754,
      "loss": 0.7468,
      "step": 21
    },
    {
      "epoch": 0.17617617617617617,
      "grad_norm": 0.34314435720443726,
      "learning_rate": 0.00019946708199115211,
      "loss": 0.7482,
      "step": 22
    },
    {
      "epoch": 0.1841841841841842,
      "grad_norm": 0.38283613324165344,
      "learning_rate": 0.00019937465804361783,
      "loss": 0.7304,
      "step": 23
    },
    {
      "epoch": 0.1921921921921922,
      "grad_norm": 0.28871795535087585,
      "learning_rate": 0.00019927487224577402,
      "loss": 0.746,
      "step": 24
    },
    {
      "epoch": 0.2002002002002002,
      "grad_norm": 0.321494996547699,
      "learning_rate": 0.000199167731989929,
      "loss": 0.7461,
      "step": 25
    },
    {
      "epoch": 0.2082082082082082,
      "grad_norm": 0.315449982881546,
      "learning_rate": 0.0001990532452132223,
      "loss": 0.7286,
      "step": 26
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.2904318571090698,
      "learning_rate": 0.00019893142039703664,
      "loss": 0.7119,
      "step": 27
    },
    {
      "epoch": 0.22422422422422422,
      "grad_norm": 0.27874529361724854,
      "learning_rate": 0.00019880226656636977,
      "loss": 0.7105,
      "step": 28
    },
    {
      "epoch": 0.23223223223223224,
      "grad_norm": 0.2948579490184784,
      "learning_rate": 0.0001986657932891657,
      "loss": 0.6976,
      "step": 29
    },
    {
      "epoch": 0.24024024024024024,
      "grad_norm": 0.2542964220046997,
      "learning_rate": 0.00019852201067560606,
      "loss": 0.7351,
      "step": 30
    },
    {
      "epoch": 0.24824824824824826,
      "grad_norm": 0.2960706353187561,
      "learning_rate": 0.000198370929377361,
      "loss": 0.7179,
      "step": 31
    },
    {
      "epoch": 0.25625625625625625,
      "grad_norm": 0.24776384234428406,
      "learning_rate": 0.00019821256058680006,
      "loss": 0.7134,
      "step": 32
    },
    {
      "epoch": 0.26426426426426425,
      "grad_norm": 0.33054184913635254,
      "learning_rate": 0.00019804691603616324,
      "loss": 0.6995,
      "step": 33
    },
    {
      "epoch": 0.2722722722722723,
      "grad_norm": 0.2543237805366516,
      "learning_rate": 0.00019787400799669154,
      "loss": 0.7081,
      "step": 34
    },
    {
      "epoch": 0.2802802802802803,
      "grad_norm": 0.25240710377693176,
      "learning_rate": 0.0001976938492777182,
      "loss": 0.6928,
      "step": 35
    },
    {
      "epoch": 0.2882882882882883,
      "grad_norm": 0.35880276560783386,
      "learning_rate": 0.0001975064532257195,
      "loss": 0.7177,
      "step": 36
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.3675362467765808,
      "learning_rate": 0.0001973118337233262,
      "loss": 0.6865,
      "step": 37
    },
    {
      "epoch": 0.30430430430430433,
      "grad_norm": 0.3688451051712036,
      "learning_rate": 0.00019711000518829507,
      "loss": 0.6724,
      "step": 38
    },
    {
      "epoch": 0.3123123123123123,
      "grad_norm": 0.2982208728790283,
      "learning_rate": 0.00019690098257244064,
      "loss": 0.671,
      "step": 39
    },
    {
      "epoch": 0.3203203203203203,
      "grad_norm": 0.24197936058044434,
      "learning_rate": 0.00019668478136052774,
      "loss": 0.6777,
      "step": 40
    },
    {
      "epoch": 0.3283283283283283,
      "grad_norm": 0.748349130153656,
      "learning_rate": 0.00019646141756912434,
      "loss": 0.6641,
      "step": 41
    },
    {
      "epoch": 0.33633633633633636,
      "grad_norm": 0.5585939288139343,
      "learning_rate": 0.00019623090774541487,
      "loss": 0.6988,
      "step": 42
    },
    {
      "epoch": 0.34434434434434436,
      "grad_norm": 0.40285471081733704,
      "learning_rate": 0.00019599326896597448,
      "loss": 0.6811,
      "step": 43
    },
    {
      "epoch": 0.35235235235235235,
      "grad_norm": 0.25714346766471863,
      "learning_rate": 0.00019574851883550395,
      "loss": 0.6913,
      "step": 44
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 0.4926215708255768,
      "learning_rate": 0.00019549667548552556,
      "loss": 0.6707,
      "step": 45
    },
    {
      "epoch": 0.3683683683683684,
      "grad_norm": 0.3760850429534912,
      "learning_rate": 0.00019523775757303974,
      "loss": 0.6809,
      "step": 46
    },
    {
      "epoch": 0.3763763763763764,
      "grad_norm": 0.3734811842441559,
      "learning_rate": 0.0001949717842791432,
      "loss": 0.6386,
      "step": 47
    },
    {
      "epoch": 0.3843843843843844,
      "grad_norm": 0.3447561264038086,
      "learning_rate": 0.00019469877530760754,
      "loss": 0.6955,
      "step": 48
    },
    {
      "epoch": 0.3923923923923924,
      "grad_norm": 0.2680707573890686,
      "learning_rate": 0.00019441875088341997,
      "loss": 0.6625,
      "step": 49
    },
    {
      "epoch": 0.4004004004004004,
      "grad_norm": 0.2692941725254059,
      "learning_rate": 0.00019413173175128473,
      "loss": 0.66,
      "step": 50
    },
    {
      "epoch": 0.4084084084084084,
      "grad_norm": 0.32329630851745605,
      "learning_rate": 0.00019383773917408642,
      "loss": 0.6612,
      "step": 51
    },
    {
      "epoch": 0.4164164164164164,
      "grad_norm": 0.281435489654541,
      "learning_rate": 0.00019353679493131485,
      "loss": 0.6621,
      "step": 52
    },
    {
      "epoch": 0.4244244244244244,
      "grad_norm": 0.22186556458473206,
      "learning_rate": 0.00019322892131745135,
      "loss": 0.6465,
      "step": 53
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.2902645468711853,
      "learning_rate": 0.00019291414114031743,
      "loss": 0.6693,
      "step": 54
    },
    {
      "epoch": 0.44044044044044045,
      "grad_norm": 0.2899124324321747,
      "learning_rate": 0.000192592477719385,
      "loss": 0.6568,
      "step": 55
    },
    {
      "epoch": 0.44844844844844844,
      "grad_norm": 0.2124062478542328,
      "learning_rate": 0.00019226395488404876,
      "loss": 0.6724,
      "step": 56
    },
    {
      "epoch": 0.45645645645645644,
      "grad_norm": 0.23896393179893494,
      "learning_rate": 0.00019192859697186106,
      "loss": 0.6459,
      "step": 57
    },
    {
      "epoch": 0.4644644644644645,
      "grad_norm": 0.2762405574321747,
      "learning_rate": 0.00019158642882672873,
      "loss": 0.6498,
      "step": 58
    },
    {
      "epoch": 0.4724724724724725,
      "grad_norm": 0.2079935222864151,
      "learning_rate": 0.00019123747579707275,
      "loss": 0.6604,
      "step": 59
    },
    {
      "epoch": 0.4804804804804805,
      "grad_norm": 0.23864208161830902,
      "learning_rate": 0.0001908817637339503,
      "loss": 0.6378,
      "step": 60
    },
    {
      "epoch": 0.48848848848848847,
      "grad_norm": 0.21718506515026093,
      "learning_rate": 0.00019051931898913976,
      "loss": 0.6424,
      "step": 61
    },
    {
      "epoch": 0.4964964964964965,
      "grad_norm": 0.2773915231227875,
      "learning_rate": 0.0001901501684131884,
      "loss": 0.6474,
      "step": 62
    },
    {
      "epoch": 0.5045045045045045,
      "grad_norm": 0.23982493579387665,
      "learning_rate": 0.0001897743393534234,
      "loss": 0.6256,
      "step": 63
    },
    {
      "epoch": 0.5125125125125125,
      "grad_norm": 0.23621873557567596,
      "learning_rate": 0.0001893918596519257,
      "loss": 0.6403,
      "step": 64
    },
    {
      "epoch": 0.5205205205205206,
      "grad_norm": 0.22759953141212463,
      "learning_rate": 0.00018900275764346768,
      "loss": 0.6484,
      "step": 65
    },
    {
      "epoch": 0.5285285285285285,
      "grad_norm": 0.26695549488067627,
      "learning_rate": 0.00018860706215341382,
      "loss": 0.609,
      "step": 66
    },
    {
      "epoch": 0.5365365365365365,
      "grad_norm": 0.24594709277153015,
      "learning_rate": 0.00018820480249558537,
      "loss": 0.6338,
      "step": 67
    },
    {
      "epoch": 0.5445445445445446,
      "grad_norm": 0.22960062325000763,
      "learning_rate": 0.00018779600847008884,
      "loss": 0.6166,
      "step": 68
    },
    {
      "epoch": 0.5525525525525525,
      "grad_norm": 0.25302109122276306,
      "learning_rate": 0.00018738071036110808,
      "loss": 0.6422,
      "step": 69
    },
    {
      "epoch": 0.5605605605605606,
      "grad_norm": 0.3339892327785492,
      "learning_rate": 0.0001869589389346611,
      "loss": 0.6558,
      "step": 70
    },
    {
      "epoch": 0.5685685685685685,
      "grad_norm": 0.21397258341312408,
      "learning_rate": 0.00018653072543632062,
      "loss": 0.6323,
      "step": 71
    },
    {
      "epoch": 0.5765765765765766,
      "grad_norm": 0.2514493465423584,
      "learning_rate": 0.00018609610158889942,
      "loss": 0.657,
      "step": 72
    },
    {
      "epoch": 0.5845845845845846,
      "grad_norm": 0.25317835807800293,
      "learning_rate": 0.00018565509959010036,
      "loss": 0.641,
      "step": 73
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.22669494152069092,
      "learning_rate": 0.00018520775211013093,
      "loss": 0.6369,
      "step": 74
    },
    {
      "epoch": 0.6006006006006006,
      "grad_norm": 0.2214743047952652,
      "learning_rate": 0.00018475409228928312,
      "loss": 0.6307,
      "step": 75
    },
    {
      "epoch": 0.6086086086086087,
      "grad_norm": 0.24376747012138367,
      "learning_rate": 0.00018429415373547828,
      "loss": 0.6557,
      "step": 76
    },
    {
      "epoch": 0.6166166166166166,
      "grad_norm": 0.2158333659172058,
      "learning_rate": 0.00018382797052177746,
      "loss": 0.655,
      "step": 77
    },
    {
      "epoch": 0.6246246246246246,
      "grad_norm": 0.25565382838249207,
      "learning_rate": 0.000183355577183857,
      "loss": 0.6299,
      "step": 78
    },
    {
      "epoch": 0.6326326326326326,
      "grad_norm": 0.20636747777462006,
      "learning_rate": 0.00018287700871745036,
      "loss": 0.6283,
      "step": 79
    },
    {
      "epoch": 0.6406406406406406,
      "grad_norm": 0.21258121728897095,
      "learning_rate": 0.00018239230057575542,
      "loss": 0.6174,
      "step": 80
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.2861458957195282,
      "learning_rate": 0.00018190148866680802,
      "loss": 0.6547,
      "step": 81
    },
    {
      "epoch": 0.6566566566566566,
      "grad_norm": 0.23667441308498383,
      "learning_rate": 0.0001814046093508218,
      "loss": 0.6416,
      "step": 82
    },
    {
      "epoch": 0.6646646646646647,
      "grad_norm": 0.23191799223423004,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.642,
      "step": 83
    },
    {
      "epoch": 0.6726726726726727,
      "grad_norm": 0.2622171938419342,
      "learning_rate": 0.00018039279618328212,
      "loss": 0.6241,
      "step": 84
    },
    {
      "epoch": 0.6806806806806807,
      "grad_norm": 0.2891266345977783,
      "learning_rate": 0.00017987793728863651,
      "loss": 0.6284,
      "step": 85
    },
    {
      "epoch": 0.6886886886886887,
      "grad_norm": 0.26767420768737793,
      "learning_rate": 0.00017935716089521474,
      "loss": 0.627,
      "step": 86
    },
    {
      "epoch": 0.6966966966966966,
      "grad_norm": 0.2828672230243683,
      "learning_rate": 0.00017883050558305255,
      "loss": 0.6418,
      "step": 87
    },
    {
      "epoch": 0.7047047047047047,
      "grad_norm": 0.32730573415756226,
      "learning_rate": 0.00017829801036770628,
      "loss": 0.6629,
      "step": 88
    },
    {
      "epoch": 0.7127127127127127,
      "grad_norm": 0.24029900133609772,
      "learning_rate": 0.0001777597146973627,
      "loss": 0.614,
      "step": 89
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 0.2929212152957916,
      "learning_rate": 0.00017721565844991643,
      "loss": 0.632,
      "step": 90
    },
    {
      "epoch": 0.7287287287287287,
      "grad_norm": 0.2860666513442993,
      "learning_rate": 0.00017666588193001595,
      "loss": 0.6289,
      "step": 91
    },
    {
      "epoch": 0.7367367367367368,
      "grad_norm": 0.23325330018997192,
      "learning_rate": 0.00017611042586607748,
      "loss": 0.6392,
      "step": 92
    },
    {
      "epoch": 0.7447447447447447,
      "grad_norm": 0.3126169443130493,
      "learning_rate": 0.00017554933140726802,
      "loss": 0.6422,
      "step": 93
    },
    {
      "epoch": 0.7527527527527528,
      "grad_norm": 0.26704883575439453,
      "learning_rate": 0.00017498264012045687,
      "loss": 0.6166,
      "step": 94
    },
    {
      "epoch": 0.7607607607607607,
      "grad_norm": 0.2184283286333084,
      "learning_rate": 0.00017441039398713608,
      "loss": 0.6235,
      "step": 95
    },
    {
      "epoch": 0.7687687687687688,
      "grad_norm": 0.23906390368938446,
      "learning_rate": 0.00017383263540031067,
      "loss": 0.6643,
      "step": 96
    },
    {
      "epoch": 0.7767767767767768,
      "grad_norm": 0.26839691400527954,
      "learning_rate": 0.0001732494071613579,
      "loss": 0.6514,
      "step": 97
    },
    {
      "epoch": 0.7847847847847848,
      "grad_norm": 0.2805701494216919,
      "learning_rate": 0.00017266075247685656,
      "loss": 0.6168,
      "step": 98
    },
    {
      "epoch": 0.7927927927927928,
      "grad_norm": 0.21650992333889008,
      "learning_rate": 0.00017206671495538612,
      "loss": 0.5983,
      "step": 99
    },
    {
      "epoch": 0.8008008008008008,
      "grad_norm": 0.2302800416946411,
      "learning_rate": 0.00017146733860429612,
      "loss": 0.6301,
      "step": 100
    },
    {
      "epoch": 0.8088088088088088,
      "grad_norm": 0.29078415036201477,
      "learning_rate": 0.000170862667826446,
      "loss": 0.616,
      "step": 101
    },
    {
      "epoch": 0.8168168168168168,
      "grad_norm": 0.24860034883022308,
      "learning_rate": 0.0001702527474169157,
      "loss": 0.6352,
      "step": 102
    },
    {
      "epoch": 0.8248248248248248,
      "grad_norm": 0.26281973719596863,
      "learning_rate": 0.00016963762255968722,
      "loss": 0.6218,
      "step": 103
    },
    {
      "epoch": 0.8328328328328328,
      "grad_norm": 0.29051998257637024,
      "learning_rate": 0.0001690173388242972,
      "loss": 0.6233,
      "step": 104
    },
    {
      "epoch": 0.8408408408408409,
      "grad_norm": 0.2471507042646408,
      "learning_rate": 0.00016839194216246108,
      "loss": 0.6147,
      "step": 105
    },
    {
      "epoch": 0.8488488488488488,
      "grad_norm": 0.2574704587459564,
      "learning_rate": 0.0001677614789046689,
      "loss": 0.6174,
      "step": 106
    },
    {
      "epoch": 0.8568568568568569,
      "grad_norm": 0.2551233172416687,
      "learning_rate": 0.00016712599575675316,
      "loss": 0.5989,
      "step": 107
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.2901318371295929,
      "learning_rate": 0.00016648553979642868,
      "loss": 0.6241,
      "step": 108
    },
    {
      "epoch": 0.8728728728728729,
      "grad_norm": 0.23769080638885498,
      "learning_rate": 0.0001658401584698049,
      "loss": 0.6044,
      "step": 109
    },
    {
      "epoch": 0.8808808808808809,
      "grad_norm": 0.2580976188182831,
      "learning_rate": 0.00016518989958787126,
      "loss": 0.622,
      "step": 110
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.24077744781970978,
      "learning_rate": 0.00016453481132295506,
      "loss": 0.6047,
      "step": 111
    },
    {
      "epoch": 0.8968968968968969,
      "grad_norm": 0.228902667760849,
      "learning_rate": 0.00016387494220515274,
      "loss": 0.6138,
      "step": 112
    },
    {
      "epoch": 0.9049049049049049,
      "grad_norm": 0.2607581317424774,
      "learning_rate": 0.00016321034111873488,
      "loss": 0.6307,
      "step": 113
    },
    {
      "epoch": 0.9129129129129129,
      "grad_norm": 0.2575569450855255,
      "learning_rate": 0.00016254105729852464,
      "loss": 0.6008,
      "step": 114
    },
    {
      "epoch": 0.9209209209209209,
      "grad_norm": 0.231553852558136,
      "learning_rate": 0.00016186714032625035,
      "loss": 0.617,
      "step": 115
    },
    {
      "epoch": 0.928928928928929,
      "grad_norm": 0.24820354580879211,
      "learning_rate": 0.00016118864012687245,
      "loss": 0.5991,
      "step": 116
    },
    {
      "epoch": 0.9369369369369369,
      "grad_norm": 0.2364109754562378,
      "learning_rate": 0.00016050560696488492,
      "loss": 0.6094,
      "step": 117
    },
    {
      "epoch": 0.944944944944945,
      "grad_norm": 0.2492029368877411,
      "learning_rate": 0.00015981809144059166,
      "loss": 0.6143,
      "step": 118
    },
    {
      "epoch": 0.9529529529529529,
      "grad_norm": 0.27745717763900757,
      "learning_rate": 0.00015912614448635782,
      "loss": 0.6203,
      "step": 119
    },
    {
      "epoch": 0.960960960960961,
      "grad_norm": 0.2555610239505768,
      "learning_rate": 0.00015842981736283686,
      "loss": 0.6314,
      "step": 120
    },
    {
      "epoch": 0.968968968968969,
      "grad_norm": 0.2268420308828354,
      "learning_rate": 0.00015772916165517273,
      "loss": 0.6155,
      "step": 121
    },
    {
      "epoch": 0.9769769769769769,
      "grad_norm": 0.250041127204895,
      "learning_rate": 0.00015702422926917872,
      "loss": 0.6226,
      "step": 122
    },
    {
      "epoch": 0.984984984984985,
      "grad_norm": 0.2596072554588318,
      "learning_rate": 0.00015631507242749187,
      "loss": 0.6086,
      "step": 123
    },
    {
      "epoch": 0.992992992992993,
      "grad_norm": 0.2280743271112442,
      "learning_rate": 0.00015560174366570446,
      "loss": 0.5994,
      "step": 124
    },
    {
      "epoch": 1.001001001001001,
      "grad_norm": 0.23362237215042114,
      "learning_rate": 0.00015488429582847192,
      "loss": 0.616,
      "step": 125
    }
  ],
  "logging_steps": 1,
  "max_steps": 375,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 125,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.39825699258368e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}