{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.001001001001001, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008008008008008008, "grad_norm": 1.6445057392120361, "learning_rate": 2e-05, "loss": 2.3547, "step": 1 }, { "epoch": 0.016016016016016016, "grad_norm": 1.63363778591156, "learning_rate": 4e-05, "loss": 2.3812, "step": 2 }, { "epoch": 0.024024024024024024, "grad_norm": 1.6492197513580322, "learning_rate": 6e-05, "loss": 2.3399, "step": 3 }, { "epoch": 0.03203203203203203, "grad_norm": 1.6518611907958984, "learning_rate": 8e-05, "loss": 2.3172, "step": 4 }, { "epoch": 0.04004004004004004, "grad_norm": 1.7173571586608887, "learning_rate": 0.0001, "loss": 2.2563, "step": 5 }, { "epoch": 0.04804804804804805, "grad_norm": 1.563859224319458, "learning_rate": 0.00012, "loss": 2.0256, "step": 6 }, { "epoch": 0.056056056056056056, "grad_norm": 1.5590581893920898, "learning_rate": 0.00014, "loss": 1.8324, "step": 7 }, { "epoch": 0.06406406406406406, "grad_norm": 1.5127277374267578, "learning_rate": 0.00016, "loss": 1.5787, "step": 8 }, { "epoch": 0.07207207207207207, "grad_norm": 1.5447226762771606, "learning_rate": 0.00018, "loss": 1.3826, "step": 9 }, { "epoch": 0.08008008008008008, "grad_norm": 4.600811004638672, "learning_rate": 0.0002, "loss": 1.2388, "step": 10 }, { "epoch": 0.08808808808808809, "grad_norm": 1.6333264112472534, "learning_rate": 0.00019999629591162656, "loss": 1.0977, "step": 11 }, { "epoch": 0.0960960960960961, "grad_norm": 1.5325253009796143, "learning_rate": 0.00019998518392091164, "loss": 1.0178, "step": 12 }, { "epoch": 0.1041041041041041, "grad_norm": 1.866473913192749, "learning_rate": 0.00019996666485105113, "loss": 0.9454, "step": 13 }, { "epoch": 0.11211211211211211, "grad_norm": 1.450692892074585, "learning_rate": 0.0001999407400739705, "loss": 0.8514, "step": 14 }, { "epoch": 0.12012012012012012, "grad_norm": 5.149086952209473, "learning_rate": 0.00019990741151022301, "loss": 0.9136, "step": 15 }, { "epoch": 0.12812812812812813, "grad_norm": 0.7399551272392273, "learning_rate": 0.00019986668162884762, "loss": 0.8742, "step": 16 }, { "epoch": 0.13613613613613615, "grad_norm": 0.6142033934593201, "learning_rate": 0.00019981855344718588, "loss": 0.8082, "step": 17 }, { "epoch": 0.14414414414414414, "grad_norm": 0.47605425119400024, "learning_rate": 0.00019976303053065859, "loss": 0.8019, "step": 18 }, { "epoch": 0.15215215215215216, "grad_norm": 0.3993614614009857, "learning_rate": 0.00019970011699250152, "loss": 0.7625, "step": 19 }, { "epoch": 0.16016016016016016, "grad_norm": 0.4947199821472168, "learning_rate": 0.00019962981749346078, "loss": 0.7419, "step": 20 }, { "epoch": 0.16816816816816818, "grad_norm": 0.549526572227478, "learning_rate": 0.00019955213724144754, "loss": 0.7468, "step": 21 }, { "epoch": 0.17617617617617617, "grad_norm": 0.34314435720443726, "learning_rate": 0.00019946708199115211, "loss": 0.7482, "step": 22 }, { "epoch": 0.1841841841841842, "grad_norm": 0.38283613324165344, "learning_rate": 0.00019937465804361783, "loss": 0.7304, "step": 23 }, { "epoch": 0.1921921921921922, "grad_norm": 0.28871795535087585, "learning_rate": 0.00019927487224577402, "loss": 0.746, "step": 24 }, { "epoch": 0.2002002002002002, "grad_norm": 0.321494996547699, "learning_rate": 0.000199167731989929, "loss": 0.7461, "step": 25 }, { "epoch": 0.2082082082082082, "grad_norm": 0.315449982881546, "learning_rate": 0.0001990532452132223, "loss": 0.7286, "step": 26 }, { "epoch": 0.21621621621621623, "grad_norm": 0.2904318571090698, "learning_rate": 0.00019893142039703664, "loss": 0.7119, "step": 27 }, { "epoch": 0.22422422422422422, "grad_norm": 0.27874529361724854, "learning_rate": 0.00019880226656636977, "loss": 0.7105, "step": 28 }, { "epoch": 0.23223223223223224, "grad_norm": 0.2948579490184784, "learning_rate": 0.0001986657932891657, "loss": 0.6976, "step": 29 }, { "epoch": 0.24024024024024024, "grad_norm": 0.2542964220046997, "learning_rate": 0.00019852201067560606, "loss": 0.7351, "step": 30 }, { "epoch": 0.24824824824824826, "grad_norm": 0.2960706353187561, "learning_rate": 0.000198370929377361, "loss": 0.7179, "step": 31 }, { "epoch": 0.25625625625625625, "grad_norm": 0.24776384234428406, "learning_rate": 0.00019821256058680006, "loss": 0.7134, "step": 32 }, { "epoch": 0.26426426426426425, "grad_norm": 0.33054184913635254, "learning_rate": 0.00019804691603616324, "loss": 0.6995, "step": 33 }, { "epoch": 0.2722722722722723, "grad_norm": 0.2543237805366516, "learning_rate": 0.00019787400799669154, "loss": 0.7081, "step": 34 }, { "epoch": 0.2802802802802803, "grad_norm": 0.25240710377693176, "learning_rate": 0.0001976938492777182, "loss": 0.6928, "step": 35 }, { "epoch": 0.2882882882882883, "grad_norm": 0.35880276560783386, "learning_rate": 0.0001975064532257195, "loss": 0.7177, "step": 36 }, { "epoch": 0.2962962962962963, "grad_norm": 0.3675362467765808, "learning_rate": 0.0001973118337233262, "loss": 0.6865, "step": 37 }, { "epoch": 0.30430430430430433, "grad_norm": 0.3688451051712036, "learning_rate": 0.00019711000518829507, "loss": 0.6724, "step": 38 }, { "epoch": 0.3123123123123123, "grad_norm": 0.2982208728790283, "learning_rate": 0.00019690098257244064, "loss": 0.671, "step": 39 }, { "epoch": 0.3203203203203203, "grad_norm": 0.24197936058044434, "learning_rate": 0.00019668478136052774, "loss": 0.6777, "step": 40 }, { "epoch": 0.3283283283283283, "grad_norm": 0.748349130153656, "learning_rate": 0.00019646141756912434, "loss": 0.6641, "step": 41 }, { "epoch": 0.33633633633633636, "grad_norm": 0.5585939288139343, "learning_rate": 0.00019623090774541487, "loss": 0.6988, "step": 42 }, { "epoch": 0.34434434434434436, "grad_norm": 0.40285471081733704, "learning_rate": 0.00019599326896597448, "loss": 0.6811, "step": 43 }, { "epoch": 0.35235235235235235, "grad_norm": 0.25714346766471863, "learning_rate": 0.00019574851883550395, "loss": 0.6913, "step": 44 }, { "epoch": 0.36036036036036034, "grad_norm": 0.4926215708255768, "learning_rate": 0.00019549667548552556, "loss": 0.6707, "step": 45 }, { "epoch": 0.3683683683683684, "grad_norm": 0.3760850429534912, "learning_rate": 0.00019523775757303974, "loss": 0.6809, "step": 46 }, { "epoch": 0.3763763763763764, "grad_norm": 0.3734811842441559, "learning_rate": 0.0001949717842791432, "loss": 0.6386, "step": 47 }, { "epoch": 0.3843843843843844, "grad_norm": 0.3447561264038086, "learning_rate": 0.00019469877530760754, "loss": 0.6955, "step": 48 }, { "epoch": 0.3923923923923924, "grad_norm": 0.2680707573890686, "learning_rate": 0.00019441875088341997, "loss": 0.6625, "step": 49 }, { "epoch": 0.4004004004004004, "grad_norm": 0.2692941725254059, "learning_rate": 0.00019413173175128473, "loss": 0.66, "step": 50 }, { "epoch": 0.4084084084084084, "grad_norm": 0.32329630851745605, "learning_rate": 0.00019383773917408642, "loss": 0.6612, "step": 51 }, { "epoch": 0.4164164164164164, "grad_norm": 0.281435489654541, "learning_rate": 0.00019353679493131485, "loss": 0.6621, "step": 52 }, { "epoch": 0.4244244244244244, "grad_norm": 0.22186556458473206, "learning_rate": 0.00019322892131745135, "loss": 0.6465, "step": 53 }, { "epoch": 0.43243243243243246, "grad_norm": 0.2902645468711853, "learning_rate": 0.00019291414114031743, "loss": 0.6693, "step": 54 }, { "epoch": 0.44044044044044045, "grad_norm": 0.2899124324321747, "learning_rate": 0.000192592477719385, "loss": 0.6568, "step": 55 }, { "epoch": 0.44844844844844844, "grad_norm": 0.2124062478542328, "learning_rate": 0.00019226395488404876, "loss": 0.6724, "step": 56 }, { "epoch": 0.45645645645645644, "grad_norm": 0.23896393179893494, "learning_rate": 0.00019192859697186106, "loss": 0.6459, "step": 57 }, { "epoch": 0.4644644644644645, "grad_norm": 0.2762405574321747, "learning_rate": 0.00019158642882672873, "loss": 0.6498, "step": 58 }, { "epoch": 0.4724724724724725, "grad_norm": 0.2079935222864151, "learning_rate": 0.00019123747579707275, "loss": 0.6604, "step": 59 }, { "epoch": 0.4804804804804805, "grad_norm": 0.23864208161830902, "learning_rate": 0.0001908817637339503, "loss": 0.6378, "step": 60 }, { "epoch": 0.48848848848848847, "grad_norm": 0.21718506515026093, "learning_rate": 0.00019051931898913976, "loss": 0.6424, "step": 61 }, { "epoch": 0.4964964964964965, "grad_norm": 0.2773915231227875, "learning_rate": 0.0001901501684131884, "loss": 0.6474, "step": 62 }, { "epoch": 0.5045045045045045, "grad_norm": 0.23982493579387665, "learning_rate": 0.0001897743393534234, "loss": 0.6256, "step": 63 }, { "epoch": 0.5125125125125125, "grad_norm": 0.23621873557567596, "learning_rate": 0.0001893918596519257, "loss": 0.6403, "step": 64 }, { "epoch": 0.5205205205205206, "grad_norm": 0.22759953141212463, "learning_rate": 0.00018900275764346768, "loss": 0.6484, "step": 65 }, { "epoch": 0.5285285285285285, "grad_norm": 0.26695549488067627, "learning_rate": 0.00018860706215341382, "loss": 0.609, "step": 66 }, { "epoch": 0.5365365365365365, "grad_norm": 0.24594709277153015, "learning_rate": 0.00018820480249558537, "loss": 0.6338, "step": 67 }, { "epoch": 0.5445445445445446, "grad_norm": 0.22960062325000763, "learning_rate": 0.00018779600847008884, "loss": 0.6166, "step": 68 }, { "epoch": 0.5525525525525525, "grad_norm": 0.25302109122276306, "learning_rate": 0.00018738071036110808, "loss": 0.6422, "step": 69 }, { "epoch": 0.5605605605605606, "grad_norm": 0.3339892327785492, "learning_rate": 0.0001869589389346611, "loss": 0.6558, "step": 70 }, { "epoch": 0.5685685685685685, "grad_norm": 0.21397258341312408, "learning_rate": 0.00018653072543632062, "loss": 0.6323, "step": 71 }, { "epoch": 0.5765765765765766, "grad_norm": 0.2514493465423584, "learning_rate": 0.00018609610158889942, "loss": 0.657, "step": 72 }, { "epoch": 0.5845845845845846, "grad_norm": 0.25317835807800293, "learning_rate": 0.00018565509959010036, "loss": 0.641, "step": 73 }, { "epoch": 0.5925925925925926, "grad_norm": 0.22669494152069092, "learning_rate": 0.00018520775211013093, "loss": 0.6369, "step": 74 }, { "epoch": 0.6006006006006006, "grad_norm": 0.2214743047952652, "learning_rate": 0.00018475409228928312, "loss": 0.6307, "step": 75 }, { "epoch": 0.6086086086086087, "grad_norm": 0.24376747012138367, "learning_rate": 0.00018429415373547828, "loss": 0.6557, "step": 76 }, { "epoch": 0.6166166166166166, "grad_norm": 0.2158333659172058, "learning_rate": 0.00018382797052177746, "loss": 0.655, "step": 77 }, { "epoch": 0.6246246246246246, "grad_norm": 0.25565382838249207, "learning_rate": 0.000183355577183857, "loss": 0.6299, "step": 78 }, { "epoch": 0.6326326326326326, "grad_norm": 0.20636747777462006, "learning_rate": 0.00018287700871745036, "loss": 0.6283, "step": 79 }, { "epoch": 0.6406406406406406, "grad_norm": 0.21258121728897095, "learning_rate": 0.00018239230057575542, "loss": 0.6174, "step": 80 }, { "epoch": 0.6486486486486487, "grad_norm": 0.2861458957195282, "learning_rate": 0.00018190148866680802, "loss": 0.6547, "step": 81 }, { "epoch": 0.6566566566566566, "grad_norm": 0.23667441308498383, "learning_rate": 0.0001814046093508218, "loss": 0.6416, "step": 82 }, { "epoch": 0.6646646646646647, "grad_norm": 0.23191799223423004, "learning_rate": 0.00018090169943749476, "loss": 0.642, "step": 83 }, { "epoch": 0.6726726726726727, "grad_norm": 0.2622171938419342, "learning_rate": 0.00018039279618328212, "loss": 0.6241, "step": 84 }, { "epoch": 0.6806806806806807, "grad_norm": 0.2891266345977783, "learning_rate": 0.00017987793728863651, "loss": 0.6284, "step": 85 }, { "epoch": 0.6886886886886887, "grad_norm": 0.26767420768737793, "learning_rate": 0.00017935716089521474, "loss": 0.627, "step": 86 }, { "epoch": 0.6966966966966966, "grad_norm": 0.2828672230243683, "learning_rate": 0.00017883050558305255, "loss": 0.6418, "step": 87 }, { "epoch": 0.7047047047047047, "grad_norm": 0.32730573415756226, "learning_rate": 0.00017829801036770628, "loss": 0.6629, "step": 88 }, { "epoch": 0.7127127127127127, "grad_norm": 0.24029900133609772, "learning_rate": 0.0001777597146973627, "loss": 0.614, "step": 89 }, { "epoch": 0.7207207207207207, "grad_norm": 0.2929212152957916, "learning_rate": 0.00017721565844991643, "loss": 0.632, "step": 90 }, { "epoch": 0.7287287287287287, "grad_norm": 0.2860666513442993, "learning_rate": 0.00017666588193001595, "loss": 0.6289, "step": 91 }, { "epoch": 0.7367367367367368, "grad_norm": 0.23325330018997192, "learning_rate": 0.00017611042586607748, "loss": 0.6392, "step": 92 }, { "epoch": 0.7447447447447447, "grad_norm": 0.3126169443130493, "learning_rate": 0.00017554933140726802, "loss": 0.6422, "step": 93 }, { "epoch": 0.7527527527527528, "grad_norm": 0.26704883575439453, "learning_rate": 0.00017498264012045687, "loss": 0.6166, "step": 94 }, { "epoch": 0.7607607607607607, "grad_norm": 0.2184283286333084, "learning_rate": 0.00017441039398713608, "loss": 0.6235, "step": 95 }, { "epoch": 0.7687687687687688, "grad_norm": 0.23906390368938446, "learning_rate": 0.00017383263540031067, "loss": 0.6643, "step": 96 }, { "epoch": 0.7767767767767768, "grad_norm": 0.26839691400527954, "learning_rate": 0.0001732494071613579, "loss": 0.6514, "step": 97 }, { "epoch": 0.7847847847847848, "grad_norm": 0.2805701494216919, "learning_rate": 0.00017266075247685656, "loss": 0.6168, "step": 98 }, { "epoch": 0.7927927927927928, "grad_norm": 0.21650992333889008, "learning_rate": 0.00017206671495538612, "loss": 0.5983, "step": 99 }, { "epoch": 0.8008008008008008, "grad_norm": 0.2302800416946411, "learning_rate": 0.00017146733860429612, "loss": 0.6301, "step": 100 }, { "epoch": 0.8088088088088088, "grad_norm": 0.29078415036201477, "learning_rate": 0.000170862667826446, "loss": 0.616, "step": 101 }, { "epoch": 0.8168168168168168, "grad_norm": 0.24860034883022308, "learning_rate": 0.0001702527474169157, "loss": 0.6352, "step": 102 }, { "epoch": 0.8248248248248248, "grad_norm": 0.26281973719596863, "learning_rate": 0.00016963762255968722, "loss": 0.6218, "step": 103 }, { "epoch": 0.8328328328328328, "grad_norm": 0.29051998257637024, "learning_rate": 0.0001690173388242972, "loss": 0.6233, "step": 104 }, { "epoch": 0.8408408408408409, "grad_norm": 0.2471507042646408, "learning_rate": 0.00016839194216246108, "loss": 0.6147, "step": 105 }, { "epoch": 0.8488488488488488, "grad_norm": 0.2574704587459564, "learning_rate": 0.0001677614789046689, "loss": 0.6174, "step": 106 }, { "epoch": 0.8568568568568569, "grad_norm": 0.2551233172416687, "learning_rate": 0.00016712599575675316, "loss": 0.5989, "step": 107 }, { "epoch": 0.8648648648648649, "grad_norm": 0.2901318371295929, "learning_rate": 0.00016648553979642868, "loss": 0.6241, "step": 108 }, { "epoch": 0.8728728728728729, "grad_norm": 0.23769080638885498, "learning_rate": 0.0001658401584698049, "loss": 0.6044, "step": 109 }, { "epoch": 0.8808808808808809, "grad_norm": 0.2580976188182831, "learning_rate": 0.00016518989958787126, "loss": 0.622, "step": 110 }, { "epoch": 0.8888888888888888, "grad_norm": 0.24077744781970978, "learning_rate": 0.00016453481132295506, "loss": 0.6047, "step": 111 }, { "epoch": 0.8968968968968969, "grad_norm": 0.228902667760849, "learning_rate": 0.00016387494220515274, "loss": 0.6138, "step": 112 }, { "epoch": 0.9049049049049049, "grad_norm": 0.2607581317424774, "learning_rate": 0.00016321034111873488, "loss": 0.6307, "step": 113 }, { "epoch": 0.9129129129129129, "grad_norm": 0.2575569450855255, "learning_rate": 0.00016254105729852464, "loss": 0.6008, "step": 114 }, { "epoch": 0.9209209209209209, "grad_norm": 0.231553852558136, "learning_rate": 0.00016186714032625035, "loss": 0.617, "step": 115 }, { "epoch": 0.928928928928929, "grad_norm": 0.24820354580879211, "learning_rate": 0.00016118864012687245, "loss": 0.5991, "step": 116 }, { "epoch": 0.9369369369369369, "grad_norm": 0.2364109754562378, "learning_rate": 0.00016050560696488492, "loss": 0.6094, "step": 117 }, { "epoch": 0.944944944944945, "grad_norm": 0.2492029368877411, "learning_rate": 0.00015981809144059166, "loss": 0.6143, "step": 118 }, { "epoch": 0.9529529529529529, "grad_norm": 0.27745717763900757, "learning_rate": 0.00015912614448635782, "loss": 0.6203, "step": 119 }, { "epoch": 0.960960960960961, "grad_norm": 0.2555610239505768, "learning_rate": 0.00015842981736283686, "loss": 0.6314, "step": 120 }, { "epoch": 0.968968968968969, "grad_norm": 0.2268420308828354, "learning_rate": 0.00015772916165517273, "loss": 0.6155, "step": 121 }, { "epoch": 0.9769769769769769, "grad_norm": 0.250041127204895, "learning_rate": 0.00015702422926917872, "loss": 0.6226, "step": 122 }, { "epoch": 0.984984984984985, "grad_norm": 0.2596072554588318, "learning_rate": 0.00015631507242749187, "loss": 0.6086, "step": 123 }, { "epoch": 0.992992992992993, "grad_norm": 0.2280743271112442, "learning_rate": 0.00015560174366570446, "loss": 0.5994, "step": 124 }, { "epoch": 1.001001001001001, "grad_norm": 0.23362237215042114, "learning_rate": 0.00015488429582847192, "loss": 0.616, "step": 125 } ], "logging_steps": 1, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.39825699258368e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }