{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.889192723814054,
  "eval_steps": 500,
  "global_step": 100000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.059445963619070265,
      "grad_norm": 2.0723960399627686,
      "learning_rate": 4.985138509095233e-05,
      "loss": 3.6788,
      "step": 500
    },
    {
      "epoch": 0.11889192723814053,
      "grad_norm": 1.9278995990753174,
      "learning_rate": 4.970277018190465e-05,
      "loss": 3.4742,
      "step": 1000
    },
    {
      "epoch": 0.1783378908572108,
      "grad_norm": 1.4848977327346802,
      "learning_rate": 4.955415527285698e-05,
      "loss": 3.3942,
      "step": 1500
    },
    {
      "epoch": 0.23778385447628106,
      "grad_norm": 1.3492341041564941,
      "learning_rate": 4.94055403638093e-05,
      "loss": 3.3358,
      "step": 2000
    },
    {
      "epoch": 0.2972298180953513,
      "grad_norm": 1.212128758430481,
      "learning_rate": 4.925692545476163e-05,
      "loss": 3.2851,
      "step": 2500
    },
    {
      "epoch": 0.3566757817144216,
      "grad_norm": 1.1597293615341187,
      "learning_rate": 4.9108310545713945e-05,
      "loss": 3.2331,
      "step": 3000
    },
    {
      "epoch": 0.41612174533349183,
      "grad_norm": 0.9653922319412231,
      "learning_rate": 4.8959695636666275e-05,
      "loss": 3.2339,
      "step": 3500
    },
    {
      "epoch": 0.4755677089525621,
      "grad_norm": 1.0085793733596802,
      "learning_rate": 4.88110807276186e-05,
      "loss": 3.1856,
      "step": 4000
    },
    {
      "epoch": 0.5350136725716323,
      "grad_norm": 1.0556505918502808,
      "learning_rate": 4.866246581857092e-05,
      "loss": 3.1748,
      "step": 4500
    },
    {
      "epoch": 0.5944596361907026,
      "grad_norm": 0.9526228904724121,
      "learning_rate": 4.851385090952324e-05,
      "loss": 3.1529,
      "step": 5000
    },
    {
      "epoch": 0.6539055998097729,
      "grad_norm": 0.984980046749115,
      "learning_rate": 4.836523600047557e-05,
      "loss": 3.1378,
      "step": 5500
    },
    {
      "epoch": 0.7133515634288432,
      "grad_norm": 1.0135027170181274,
      "learning_rate": 4.8216621091427895e-05,
      "loss": 3.0848,
      "step": 6000
    },
    {
      "epoch": 0.7727975270479135,
      "grad_norm": 0.9454924464225769,
      "learning_rate": 4.806800618238022e-05,
      "loss": 3.0916,
      "step": 6500
    },
    {
      "epoch": 0.8322434906669837,
      "grad_norm": 0.9793129563331604,
      "learning_rate": 4.791939127333254e-05,
      "loss": 3.0642,
      "step": 7000
    },
    {
      "epoch": 0.891689454286054,
      "grad_norm": 0.9016062617301941,
      "learning_rate": 4.777077636428487e-05,
      "loss": 3.0657,
      "step": 7500
    },
    {
      "epoch": 0.9511354179051242,
      "grad_norm": 0.8690605163574219,
      "learning_rate": 4.762216145523719e-05,
      "loss": 3.0281,
      "step": 8000
    },
    {
      "epoch": 1.0105813815241944,
      "grad_norm": 0.891808271408081,
      "learning_rate": 4.7473546546189516e-05,
      "loss": 3.0155,
      "step": 8500
    },
    {
      "epoch": 1.0700273451432647,
      "grad_norm": 0.9521974325180054,
      "learning_rate": 4.732493163714184e-05,
      "loss": 2.9713,
      "step": 9000
    },
    {
      "epoch": 1.129473308762335,
      "grad_norm": 0.9132643938064575,
      "learning_rate": 4.717631672809417e-05,
      "loss": 2.9663,
      "step": 9500
    },
    {
      "epoch": 1.1889192723814053,
      "grad_norm": 0.909182608127594,
      "learning_rate": 4.702770181904649e-05,
      "loss": 2.9616,
      "step": 10000
    },
    {
      "epoch": 1.2483652360004756,
      "grad_norm": 0.912726104259491,
      "learning_rate": 4.687908690999881e-05,
      "loss": 2.9653,
      "step": 10500
    },
    {
      "epoch": 1.3078111996195458,
      "grad_norm": 0.8568936586380005,
      "learning_rate": 4.6730472000951136e-05,
      "loss": 2.9486,
      "step": 11000
    },
    {
      "epoch": 1.3672571632386161,
      "grad_norm": 0.9120291471481323,
      "learning_rate": 4.6581857091903465e-05,
      "loss": 2.932,
      "step": 11500
    },
    {
      "epoch": 1.4267031268576864,
      "grad_norm": 0.981961190700531,
      "learning_rate": 4.643324218285579e-05,
      "loss": 2.9345,
      "step": 12000
    },
    {
      "epoch": 1.4861490904767567,
      "grad_norm": 0.9763424396514893,
      "learning_rate": 4.628462727380811e-05,
      "loss": 2.9193,
      "step": 12500
    },
    {
      "epoch": 1.545595054095827,
      "grad_norm": 0.8868328332901001,
      "learning_rate": 4.6136012364760434e-05,
      "loss": 2.9164,
      "step": 13000
    },
    {
      "epoch": 1.605041017714897,
      "grad_norm": 0.9175488352775574,
      "learning_rate": 4.598739745571276e-05,
      "loss": 2.8932,
      "step": 13500
    },
    {
      "epoch": 1.6644869813339676,
      "grad_norm": 0.890186607837677,
      "learning_rate": 4.583878254666508e-05,
      "loss": 2.8933,
      "step": 14000
    },
    {
      "epoch": 1.7239329449530376,
      "grad_norm": 0.9198343753814697,
      "learning_rate": 4.569016763761741e-05,
      "loss": 2.881,
      "step": 14500
    },
    {
      "epoch": 1.783378908572108,
      "grad_norm": 0.9706104397773743,
      "learning_rate": 4.554155272856973e-05,
      "loss": 2.8705,
      "step": 15000
    },
    {
      "epoch": 1.8428248721911782,
      "grad_norm": 0.9355807304382324,
      "learning_rate": 4.539293781952206e-05,
      "loss": 2.8601,
      "step": 15500
    },
    {
      "epoch": 1.9022708358102485,
      "grad_norm": 0.8972137570381165,
      "learning_rate": 4.524432291047438e-05,
      "loss": 2.8632,
      "step": 16000
    },
    {
      "epoch": 1.9617167994293188,
      "grad_norm": 0.8553013801574707,
      "learning_rate": 4.5095708001426706e-05,
      "loss": 2.8696,
      "step": 16500
    },
    {
      "epoch": 2.021162763048389,
      "grad_norm": 0.8952363133430481,
      "learning_rate": 4.494709309237903e-05,
      "loss": 2.8541,
      "step": 17000
    },
    {
      "epoch": 2.0806087266674593,
      "grad_norm": 0.8947279453277588,
      "learning_rate": 4.479847818333135e-05,
      "loss": 2.8203,
      "step": 17500
    },
    {
      "epoch": 2.1400546902865294,
      "grad_norm": 0.8680304884910583,
      "learning_rate": 4.4649863274283674e-05,
      "loss": 2.8088,
      "step": 18000
    },
    {
      "epoch": 2.1995006539056,
      "grad_norm": 0.8425644040107727,
      "learning_rate": 4.4501248365236004e-05,
      "loss": 2.8064,
      "step": 18500
    },
    {
      "epoch": 2.25894661752467,
      "grad_norm": 0.9474213719367981,
      "learning_rate": 4.4352633456188327e-05,
      "loss": 2.7851,
      "step": 19000
    },
    {
      "epoch": 2.3183925811437405,
      "grad_norm": 0.9292487502098083,
      "learning_rate": 4.420401854714065e-05,
      "loss": 2.8062,
      "step": 19500
    },
    {
      "epoch": 2.3778385447628105,
      "grad_norm": 0.8527488708496094,
      "learning_rate": 4.405540363809297e-05,
      "loss": 2.7851,
      "step": 20000
    },
    {
      "epoch": 2.437284508381881,
      "grad_norm": 0.9439261555671692,
      "learning_rate": 4.39067887290453e-05,
      "loss": 2.7873,
      "step": 20500
    },
    {
      "epoch": 2.496730472000951,
      "grad_norm": 0.9343836903572083,
      "learning_rate": 4.3758173819997624e-05,
      "loss": 2.7611,
      "step": 21000
    },
    {
      "epoch": 2.5561764356200216,
      "grad_norm": 0.9050599932670593,
      "learning_rate": 4.360955891094995e-05,
      "loss": 2.767,
      "step": 21500
    },
    {
      "epoch": 2.6156223992390917,
      "grad_norm": 0.9053699374198914,
      "learning_rate": 4.346094400190227e-05,
      "loss": 2.7873,
      "step": 22000
    },
    {
      "epoch": 2.6750683628581617,
      "grad_norm": 0.9282116293907166,
      "learning_rate": 4.33123290928546e-05,
      "loss": 2.7607,
      "step": 22500
    },
    {
      "epoch": 2.7345143264772322,
      "grad_norm": 0.9617480635643005,
      "learning_rate": 4.316371418380692e-05,
      "loss": 2.7678,
      "step": 23000
    },
    {
      "epoch": 2.7939602900963023,
      "grad_norm": 0.9725137948989868,
      "learning_rate": 4.3015099274759244e-05,
      "loss": 2.7665,
      "step": 23500
    },
    {
      "epoch": 2.853406253715373,
      "grad_norm": 0.9514666199684143,
      "learning_rate": 4.286648436571157e-05,
      "loss": 2.7534,
      "step": 24000
    },
    {
      "epoch": 2.912852217334443,
      "grad_norm": 0.9485461115837097,
      "learning_rate": 4.27178694566639e-05,
      "loss": 2.7306,
      "step": 24500
    },
    {
      "epoch": 2.9722981809535134,
      "grad_norm": 1.014106035232544,
      "learning_rate": 4.256925454761622e-05,
      "loss": 2.736,
      "step": 25000
    },
    {
      "epoch": 3.0317441445725835,
      "grad_norm": 0.9117903113365173,
      "learning_rate": 4.242063963856854e-05,
      "loss": 2.7278,
      "step": 25500
    },
    {
      "epoch": 3.091190108191654,
      "grad_norm": 0.8904880881309509,
      "learning_rate": 4.2272024729520865e-05,
      "loss": 2.7156,
      "step": 26000
    },
    {
      "epoch": 3.150636071810724,
      "grad_norm": 0.8653568625450134,
      "learning_rate": 4.2123409820473194e-05,
      "loss": 2.7137,
      "step": 26500
    },
    {
      "epoch": 3.210082035429794,
      "grad_norm": 0.9386480450630188,
      "learning_rate": 4.197479491142551e-05,
      "loss": 2.7021,
      "step": 27000
    },
    {
      "epoch": 3.2695279990488646,
      "grad_norm": 1.0122427940368652,
      "learning_rate": 4.182618000237784e-05,
      "loss": 2.699,
      "step": 27500
    },
    {
      "epoch": 3.3289739626679347,
      "grad_norm": 0.9319558143615723,
      "learning_rate": 4.167756509333017e-05,
      "loss": 2.689,
      "step": 28000
    },
    {
      "epoch": 3.388419926287005,
      "grad_norm": 0.9281746745109558,
      "learning_rate": 4.152895018428249e-05,
      "loss": 2.7027,
      "step": 28500
    },
    {
      "epoch": 3.4478658899060752,
      "grad_norm": 0.9750462770462036,
      "learning_rate": 4.1380335275234815e-05,
      "loss": 2.6947,
      "step": 29000
    },
    {
      "epoch": 3.5073118535251457,
      "grad_norm": 0.8887720704078674,
      "learning_rate": 4.123172036618714e-05,
      "loss": 2.6864,
      "step": 29500
    },
    {
      "epoch": 3.566757817144216,
      "grad_norm": 0.9884176254272461,
      "learning_rate": 4.108310545713947e-05,
      "loss": 2.6893,
      "step": 30000
    },
    {
      "epoch": 3.6262037807632863,
      "grad_norm": 0.9995080828666687,
      "learning_rate": 4.093449054809178e-05,
      "loss": 2.6734,
      "step": 30500
    },
    {
      "epoch": 3.6856497443823564,
      "grad_norm": 1.0068608522415161,
      "learning_rate": 4.078587563904411e-05,
      "loss": 2.6766,
      "step": 31000
    },
    {
      "epoch": 3.7450957080014264,
      "grad_norm": 1.0225422382354736,
      "learning_rate": 4.0637260729996435e-05,
      "loss": 2.6757,
      "step": 31500
    },
    {
      "epoch": 3.804541671620497,
      "grad_norm": 0.9354658126831055,
      "learning_rate": 4.0488645820948765e-05,
      "loss": 2.6593,
      "step": 32000
    },
    {
      "epoch": 3.8639876352395675,
      "grad_norm": 0.9209592938423157,
      "learning_rate": 4.034003091190108e-05,
      "loss": 2.6547,
      "step": 32500
    },
    {
      "epoch": 3.9234335988586375,
      "grad_norm": 0.8945015668869019,
      "learning_rate": 4.019141600285341e-05,
      "loss": 2.6719,
      "step": 33000
    },
    {
      "epoch": 3.9828795624777076,
      "grad_norm": 0.9823748469352722,
      "learning_rate": 4.004280109380573e-05,
      "loss": 2.6781,
      "step": 33500
    },
    {
      "epoch": 4.042325526096778,
      "grad_norm": 1.0186822414398193,
      "learning_rate": 3.989418618475806e-05,
      "loss": 2.6469,
      "step": 34000
    },
    {
      "epoch": 4.101771489715849,
      "grad_norm": 0.9255732297897339,
      "learning_rate": 3.974557127571038e-05,
      "loss": 2.6296,
      "step": 34500
    },
    {
      "epoch": 4.161217453334919,
      "grad_norm": 1.0235294103622437,
      "learning_rate": 3.959695636666271e-05,
      "loss": 2.6358,
      "step": 35000
    },
    {
      "epoch": 4.220663416953989,
      "grad_norm": 0.911547064781189,
      "learning_rate": 3.944834145761503e-05,
      "loss": 2.6354,
      "step": 35500
    },
    {
      "epoch": 4.280109380573059,
      "grad_norm": 1.0124516487121582,
      "learning_rate": 3.929972654856735e-05,
      "loss": 2.6416,
      "step": 36000
    },
    {
      "epoch": 4.33955534419213,
      "grad_norm": 1.0222316980361938,
      "learning_rate": 3.9151111639519676e-05,
      "loss": 2.6188,
      "step": 36500
    },
    {
      "epoch": 4.3990013078112,
      "grad_norm": 0.9710135459899902,
      "learning_rate": 3.9002496730472005e-05,
      "loss": 2.6228,
      "step": 37000
    },
    {
      "epoch": 4.45844727143027,
      "grad_norm": 1.0287182331085205,
      "learning_rate": 3.885388182142433e-05,
      "loss": 2.6067,
      "step": 37500
    },
    {
      "epoch": 4.51789323504934,
      "grad_norm": 0.9699456095695496,
      "learning_rate": 3.870526691237665e-05,
      "loss": 2.6385,
      "step": 38000
    },
    {
      "epoch": 4.57733919866841,
      "grad_norm": 0.9066009521484375,
      "learning_rate": 3.855665200332897e-05,
      "loss": 2.6284,
      "step": 38500
    },
    {
      "epoch": 4.636785162287481,
      "grad_norm": 0.8537769317626953,
      "learning_rate": 3.84080370942813e-05,
      "loss": 2.6135,
      "step": 39000
    },
    {
      "epoch": 4.696231125906551,
      "grad_norm": 1.0666980743408203,
      "learning_rate": 3.8259422185233626e-05,
      "loss": 2.6312,
      "step": 39500
    },
    {
      "epoch": 4.755677089525621,
      "grad_norm": 1.0641474723815918,
      "learning_rate": 3.811080727618595e-05,
      "loss": 2.6127,
      "step": 40000
    },
    {
      "epoch": 4.815123053144691,
      "grad_norm": 1.076323390007019,
      "learning_rate": 3.796219236713827e-05,
      "loss": 2.6184,
      "step": 40500
    },
    {
      "epoch": 4.874569016763762,
      "grad_norm": 0.8963558077812195,
      "learning_rate": 3.78135774580906e-05,
      "loss": 2.6165,
      "step": 41000
    },
    {
      "epoch": 4.934014980382832,
      "grad_norm": 0.968908429145813,
      "learning_rate": 3.766496254904292e-05,
      "loss": 2.6009,
      "step": 41500
    },
    {
      "epoch": 4.993460944001902,
      "grad_norm": 0.9362033605575562,
      "learning_rate": 3.7516347639995246e-05,
      "loss": 2.5956,
      "step": 42000
    },
    {
      "epoch": 5.052906907620972,
      "grad_norm": 1.1101199388504028,
      "learning_rate": 3.736773273094757e-05,
      "loss": 2.5755,
      "step": 42500
    },
    {
      "epoch": 5.112352871240043,
      "grad_norm": 1.2178868055343628,
      "learning_rate": 3.72191178218999e-05,
      "loss": 2.5724,
      "step": 43000
    },
    {
      "epoch": 5.171798834859113,
      "grad_norm": 1.0143418312072754,
      "learning_rate": 3.707050291285222e-05,
      "loss": 2.5834,
      "step": 43500
    },
    {
      "epoch": 5.231244798478183,
      "grad_norm": 0.9720271825790405,
      "learning_rate": 3.6921888003804544e-05,
      "loss": 2.586,
      "step": 44000
    },
    {
      "epoch": 5.290690762097253,
      "grad_norm": 0.8847070932388306,
      "learning_rate": 3.6773273094756866e-05,
      "loss": 2.5953,
      "step": 44500
    },
    {
      "epoch": 5.3501367257163235,
      "grad_norm": 0.9654759764671326,
      "learning_rate": 3.6624658185709196e-05,
      "loss": 2.5777,
      "step": 45000
    },
    {
      "epoch": 5.409582689335394,
      "grad_norm": 0.9272730350494385,
      "learning_rate": 3.647604327666151e-05,
      "loss": 2.5774,
      "step": 45500
    },
    {
      "epoch": 5.4690286529544645,
      "grad_norm": 0.9674676656723022,
      "learning_rate": 3.632742836761384e-05,
      "loss": 2.5779,
      "step": 46000
    },
    {
      "epoch": 5.528474616573535,
      "grad_norm": 1.0238367319107056,
      "learning_rate": 3.6178813458566164e-05,
      "loss": 2.5683,
      "step": 46500
    },
    {
      "epoch": 5.587920580192605,
      "grad_norm": 1.1663753986358643,
      "learning_rate": 3.603019854951849e-05,
      "loss": 2.5802,
      "step": 47000
    },
    {
      "epoch": 5.647366543811675,
      "grad_norm": 0.8961432576179504,
      "learning_rate": 3.588158364047081e-05,
      "loss": 2.5726,
      "step": 47500
    },
    {
      "epoch": 5.706812507430746,
      "grad_norm": 1.1115467548370361,
      "learning_rate": 3.573296873142314e-05,
      "loss": 2.5719,
      "step": 48000
    },
    {
      "epoch": 5.766258471049816,
      "grad_norm": 1.00434148311615,
      "learning_rate": 3.558435382237546e-05,
      "loss": 2.556,
      "step": 48500
    },
    {
      "epoch": 5.825704434668886,
      "grad_norm": 1.1120518445968628,
      "learning_rate": 3.5435738913327784e-05,
      "loss": 2.5627,
      "step": 49000
    },
    {
      "epoch": 5.885150398287957,
      "grad_norm": 0.9611983299255371,
      "learning_rate": 3.528712400428011e-05,
      "loss": 2.5568,
      "step": 49500
    },
    {
      "epoch": 5.944596361907027,
      "grad_norm": 1.1176481246948242,
      "learning_rate": 3.5138509095232436e-05,
      "loss": 2.5634,
      "step": 50000
    },
    {
      "epoch": 6.004042325526097,
      "grad_norm": 0.8676426410675049,
      "learning_rate": 3.498989418618476e-05,
      "loss": 2.5551,
      "step": 50500
    },
    {
      "epoch": 6.063488289145167,
      "grad_norm": 0.8983253240585327,
      "learning_rate": 3.484127927713708e-05,
      "loss": 2.5442,
      "step": 51000
    },
    {
      "epoch": 6.122934252764237,
      "grad_norm": 0.9558296203613281,
      "learning_rate": 3.4692664368089405e-05,
      "loss": 2.5415,
      "step": 51500
    },
    {
      "epoch": 6.182380216383308,
      "grad_norm": 1.1759629249572754,
      "learning_rate": 3.4544049459041734e-05,
      "loss": 2.5186,
      "step": 52000
    },
    {
      "epoch": 6.241826180002378,
      "grad_norm": 1.186232089996338,
      "learning_rate": 3.439543454999406e-05,
      "loss": 2.5437,
      "step": 52500
    },
    {
      "epoch": 6.301272143621448,
      "grad_norm": 1.1072938442230225,
      "learning_rate": 3.424681964094638e-05,
      "loss": 2.5442,
      "step": 53000
    },
    {
      "epoch": 6.360718107240518,
      "grad_norm": 1.1854956150054932,
      "learning_rate": 3.40982047318987e-05,
      "loss": 2.5265,
      "step": 53500
    },
    {
      "epoch": 6.420164070859588,
      "grad_norm": 1.037420392036438,
      "learning_rate": 3.394958982285103e-05,
      "loss": 2.5101,
      "step": 54000
    },
    {
      "epoch": 6.479610034478659,
      "grad_norm": 1.0414271354675293,
      "learning_rate": 3.3800974913803354e-05,
      "loss": 2.5291,
      "step": 54500
    },
    {
      "epoch": 6.539055998097729,
      "grad_norm": 0.8827362656593323,
      "learning_rate": 3.365236000475568e-05,
      "loss": 2.5187,
      "step": 55000
    },
    {
      "epoch": 6.598501961716799,
      "grad_norm": 0.9146121144294739,
      "learning_rate": 3.3503745095708007e-05,
      "loss": 2.5234,
      "step": 55500
    },
    {
      "epoch": 6.657947925335869,
      "grad_norm": 1.0134857892990112,
      "learning_rate": 3.335513018666033e-05,
      "loss": 2.5199,
      "step": 56000
    },
    {
      "epoch": 6.71739388895494,
      "grad_norm": 1.1852586269378662,
      "learning_rate": 3.320651527761265e-05,
      "loss": 2.5347,
      "step": 56500
    },
    {
      "epoch": 6.77683985257401,
      "grad_norm": 1.0739943981170654,
      "learning_rate": 3.3057900368564975e-05,
      "loss": 2.5367,
      "step": 57000
    },
    {
      "epoch": 6.83628581619308,
      "grad_norm": 0.9880659580230713,
      "learning_rate": 3.2909285459517304e-05,
      "loss": 2.5181,
      "step": 57500
    },
    {
      "epoch": 6.8957317798121505,
      "grad_norm": 1.0519931316375732,
      "learning_rate": 3.276067055046963e-05,
      "loss": 2.5325,
      "step": 58000
    },
    {
      "epoch": 6.955177743431221,
      "grad_norm": 0.9463315010070801,
      "learning_rate": 3.261205564142195e-05,
      "loss": 2.5384,
      "step": 58500
    },
    {
      "epoch": 7.0146237070502915,
      "grad_norm": 0.9906750917434692,
      "learning_rate": 3.246344073237427e-05,
      "loss": 2.5374,
      "step": 59000
    },
    {
      "epoch": 7.0740696706693615,
      "grad_norm": 0.9740419983863831,
      "learning_rate": 3.23148258233266e-05,
      "loss": 2.4919,
      "step": 59500
    },
    {
      "epoch": 7.133515634288432,
      "grad_norm": 1.0209646224975586,
      "learning_rate": 3.2166210914278925e-05,
      "loss": 2.5065,
      "step": 60000
    },
    {
      "epoch": 7.192961597907502,
      "grad_norm": 1.1537789106369019,
      "learning_rate": 3.201759600523125e-05,
      "loss": 2.4888,
      "step": 60500
    },
    {
      "epoch": 7.252407561526573,
      "grad_norm": 1.0545387268066406,
      "learning_rate": 3.186898109618357e-05,
      "loss": 2.5042,
      "step": 61000
    },
    {
      "epoch": 7.311853525145643,
      "grad_norm": 0.8990502953529358,
      "learning_rate": 3.17203661871359e-05,
      "loss": 2.4956,
      "step": 61500
    },
    {
      "epoch": 7.371299488764713,
      "grad_norm": 1.0004386901855469,
      "learning_rate": 3.1571751278088215e-05,
      "loss": 2.5096,
      "step": 62000
    },
    {
      "epoch": 7.430745452383783,
      "grad_norm": 1.192317008972168,
      "learning_rate": 3.1423136369040545e-05,
      "loss": 2.5038,
      "step": 62500
    },
    {
      "epoch": 7.490191416002854,
      "grad_norm": 0.9577484726905823,
      "learning_rate": 3.127452145999287e-05,
      "loss": 2.5113,
      "step": 63000
    },
    {
      "epoch": 7.549637379621924,
      "grad_norm": 0.8835137486457825,
      "learning_rate": 3.11259065509452e-05,
      "loss": 2.4939,
      "step": 63500
    },
    {
      "epoch": 7.609083343240994,
      "grad_norm": 0.8289955258369446,
      "learning_rate": 3.097729164189751e-05,
      "loss": 2.4716,
      "step": 64000
    },
    {
      "epoch": 7.668529306860064,
      "grad_norm": 0.9576908349990845,
      "learning_rate": 3.082867673284984e-05,
      "loss": 2.5109,
      "step": 64500
    },
    {
      "epoch": 7.727975270479135,
      "grad_norm": 0.9045142531394958,
      "learning_rate": 3.0680061823802165e-05,
      "loss": 2.4811,
      "step": 65000
    },
    {
      "epoch": 7.787421234098205,
      "grad_norm": 1.3150789737701416,
      "learning_rate": 3.053144691475449e-05,
      "loss": 2.505,
      "step": 65500
    },
    {
      "epoch": 7.846867197717275,
      "grad_norm": 0.9815430641174316,
      "learning_rate": 3.0382832005706814e-05,
      "loss": 2.4923,
      "step": 66000
    },
    {
      "epoch": 7.906313161336345,
      "grad_norm": 1.0355448722839355,
      "learning_rate": 3.023421709665914e-05,
      "loss": 2.4867,
      "step": 66500
    },
    {
      "epoch": 7.965759124955415,
      "grad_norm": 1.0244001150131226,
      "learning_rate": 3.0085602187611463e-05,
      "loss": 2.4973,
      "step": 67000
    },
    {
      "epoch": 8.025205088574486,
      "grad_norm": 1.052660584449768,
      "learning_rate": 2.993698727856379e-05,
      "loss": 2.4976,
      "step": 67500
    },
    {
      "epoch": 8.084651052193555,
      "grad_norm": 1.1590783596038818,
      "learning_rate": 2.978837236951611e-05,
      "loss": 2.4631,
      "step": 68000
    },
    {
      "epoch": 8.144097015812626,
      "grad_norm": 0.9065755605697632,
      "learning_rate": 2.9639757460468438e-05,
      "loss": 2.4494,
      "step": 68500
    },
    {
      "epoch": 8.203542979431697,
      "grad_norm": 0.9562356472015381,
      "learning_rate": 2.9491142551420757e-05,
      "loss": 2.4728,
      "step": 69000
    },
    {
      "epoch": 8.262988943050766,
      "grad_norm": 0.9509665966033936,
      "learning_rate": 2.9342527642373087e-05,
      "loss": 2.4747,
      "step": 69500
    },
    {
      "epoch": 8.322434906669837,
      "grad_norm": 0.9384153485298157,
      "learning_rate": 2.9193912733325406e-05,
      "loss": 2.4745,
      "step": 70000
    },
    {
      "epoch": 8.381880870288906,
      "grad_norm": 0.9459151029586792,
      "learning_rate": 2.9045297824277735e-05,
      "loss": 2.476,
      "step": 70500
    },
    {
      "epoch": 8.441326833907977,
      "grad_norm": 0.9553677439689636,
      "learning_rate": 2.8896682915230055e-05,
      "loss": 2.4753,
      "step": 71000
    },
    {
      "epoch": 8.500772797527048,
      "grad_norm": 1.014932632446289,
      "learning_rate": 2.8748068006182384e-05,
      "loss": 2.4647,
      "step": 71500
    },
    {
      "epoch": 8.560218761146118,
      "grad_norm": 0.990463376045227,
      "learning_rate": 2.8599453097134704e-05,
      "loss": 2.4782,
      "step": 72000
    },
    {
      "epoch": 8.619664724765189,
      "grad_norm": 0.892906665802002,
      "learning_rate": 2.845083818808703e-05,
      "loss": 2.4736,
      "step": 72500
    },
    {
      "epoch": 8.67911068838426,
      "grad_norm": 0.9943811297416687,
      "learning_rate": 2.8302223279039352e-05,
      "loss": 2.4554,
      "step": 73000
    },
    {
      "epoch": 8.738556652003329,
      "grad_norm": 0.9325155019760132,
      "learning_rate": 2.815360836999168e-05,
      "loss": 2.4703,
      "step": 73500
    },
    {
      "epoch": 8.7980026156224,
      "grad_norm": 0.9389231204986572,
      "learning_rate": 2.8004993460944e-05,
      "loss": 2.4727,
      "step": 74000
    },
    {
      "epoch": 8.857448579241469,
      "grad_norm": 0.9121980667114258,
      "learning_rate": 2.7856378551896327e-05,
      "loss": 2.4533,
      "step": 74500
    },
    {
      "epoch": 8.91689454286054,
      "grad_norm": 1.046366572380066,
      "learning_rate": 2.770776364284865e-05,
      "loss": 2.4652,
      "step": 75000
    },
    {
      "epoch": 8.97634050647961,
      "grad_norm": 1.0157803297042847,
      "learning_rate": 2.7559148733800976e-05,
      "loss": 2.4701,
      "step": 75500
    },
    {
      "epoch": 9.03578647009868,
      "grad_norm": 1.1012301445007324,
      "learning_rate": 2.74105338247533e-05,
      "loss": 2.4491,
      "step": 76000
    },
    {
      "epoch": 9.09523243371775,
      "grad_norm": 1.000829815864563,
      "learning_rate": 2.7261918915705625e-05,
      "loss": 2.4434,
      "step": 76500
    },
    {
      "epoch": 9.15467839733682,
      "grad_norm": 1.028676986694336,
      "learning_rate": 2.7113304006657948e-05,
      "loss": 2.4392,
      "step": 77000
    },
    {
      "epoch": 9.214124360955891,
      "grad_norm": 1.0821462869644165,
      "learning_rate": 2.6964689097610274e-05,
      "loss": 2.4289,
      "step": 77500
    },
    {
      "epoch": 9.273570324574962,
      "grad_norm": 0.951738715171814,
      "learning_rate": 2.6816074188562596e-05,
      "loss": 2.4437,
      "step": 78000
    },
    {
      "epoch": 9.333016288194031,
      "grad_norm": 0.9170756936073303,
      "learning_rate": 2.6667459279514923e-05,
      "loss": 2.4507,
      "step": 78500
    },
    {
      "epoch": 9.392462251813102,
      "grad_norm": 0.9591684937477112,
      "learning_rate": 2.6518844370467245e-05,
      "loss": 2.4584,
      "step": 79000
    },
    {
      "epoch": 9.451908215432173,
      "grad_norm": 1.1289016008377075,
      "learning_rate": 2.637022946141957e-05,
      "loss": 2.4595,
      "step": 79500
    },
    {
      "epoch": 9.511354179051242,
      "grad_norm": 1.0114785432815552,
      "learning_rate": 2.6221614552371894e-05,
      "loss": 2.4404,
      "step": 80000
    },
    {
      "epoch": 9.570800142670313,
      "grad_norm": 1.1835304498672485,
      "learning_rate": 2.607299964332422e-05,
      "loss": 2.4308,
      "step": 80500
    },
    {
      "epoch": 9.630246106289382,
      "grad_norm": 0.9822309017181396,
      "learning_rate": 2.592438473427654e-05,
      "loss": 2.4387,
      "step": 81000
    },
    {
      "epoch": 9.689692069908453,
      "grad_norm": 1.114311695098877,
      "learning_rate": 2.577576982522887e-05,
      "loss": 2.4519,
      "step": 81500
    },
    {
      "epoch": 9.749138033527524,
      "grad_norm": 1.1047866344451904,
      "learning_rate": 2.5627154916181195e-05,
      "loss": 2.4497,
      "step": 82000
    },
    {
      "epoch": 9.808583997146593,
      "grad_norm": 0.9930892586708069,
      "learning_rate": 2.5478540007133518e-05,
      "loss": 2.4489,
      "step": 82500
    },
    {
      "epoch": 9.868029960765664,
      "grad_norm": 1.1107361316680908,
      "learning_rate": 2.5329925098085844e-05,
      "loss": 2.4399,
      "step": 83000
    },
    {
      "epoch": 9.927475924384733,
      "grad_norm": 1.0770343542099,
      "learning_rate": 2.5181310189038167e-05,
      "loss": 2.4362,
      "step": 83500
    },
    {
      "epoch": 9.986921888003804,
      "grad_norm": 0.9818819761276245,
      "learning_rate": 2.5032695279990493e-05,
      "loss": 2.4418,
      "step": 84000
    },
    {
      "epoch": 10.046367851622875,
      "grad_norm": 1.1135622262954712,
      "learning_rate": 2.4884080370942815e-05,
      "loss": 2.428,
      "step": 84500
    },
    {
      "epoch": 10.105813815241945,
      "grad_norm": 1.035888671875,
      "learning_rate": 2.4735465461895138e-05,
      "loss": 2.4193,
      "step": 85000
    },
    {
      "epoch": 10.165259778861016,
      "grad_norm": 0.9694905281066895,
      "learning_rate": 2.458685055284746e-05,
      "loss": 2.4165,
      "step": 85500
    },
    {
      "epoch": 10.224705742480086,
      "grad_norm": 1.116449236869812,
      "learning_rate": 2.4438235643799787e-05,
      "loss": 2.4122,
      "step": 86000
    },
    {
      "epoch": 10.284151706099156,
      "grad_norm": 0.9860423803329468,
      "learning_rate": 2.428962073475211e-05,
      "loss": 2.4173,
      "step": 86500
    },
    {
      "epoch": 10.343597669718227,
      "grad_norm": 1.1727473735809326,
      "learning_rate": 2.4141005825704436e-05,
      "loss": 2.4258,
      "step": 87000
    },
    {
      "epoch": 10.403043633337296,
      "grad_norm": 1.0731017589569092,
      "learning_rate": 2.399239091665676e-05,
      "loss": 2.4289,
      "step": 87500
    },
    {
      "epoch": 10.462489596956367,
      "grad_norm": 1.0740883350372314,
      "learning_rate": 2.3843776007609085e-05,
      "loss": 2.4142,
      "step": 88000
    },
    {
      "epoch": 10.521935560575438,
      "grad_norm": 1.1342713832855225,
      "learning_rate": 2.3695161098561407e-05,
      "loss": 2.4315,
      "step": 88500
    },
    {
      "epoch": 10.581381524194507,
      "grad_norm": 1.0230334997177124,
      "learning_rate": 2.3546546189513733e-05,
      "loss": 2.4352,
      "step": 89000
    },
    {
      "epoch": 10.640827487813578,
      "grad_norm": 1.0113749504089355,
      "learning_rate": 2.3397931280466056e-05,
      "loss": 2.4128,
      "step": 89500
    },
    {
      "epoch": 10.700273451432647,
      "grad_norm": 1.0363703966140747,
      "learning_rate": 2.3249316371418382e-05,
      "loss": 2.4343,
      "step": 90000
    },
    {
      "epoch": 10.759719415051718,
      "grad_norm": 1.0065736770629883,
      "learning_rate": 2.3100701462370705e-05,
      "loss": 2.4268,
      "step": 90500
    },
    {
      "epoch": 10.819165378670789,
      "grad_norm": 0.949798047542572,
      "learning_rate": 2.295208655332303e-05,
      "loss": 2.4114,
      "step": 91000
    },
    {
      "epoch": 10.878611342289858,
      "grad_norm": 0.9772433042526245,
      "learning_rate": 2.2803471644275354e-05,
      "loss": 2.4187,
      "step": 91500
    },
    {
      "epoch": 10.938057305908929,
      "grad_norm": 0.9436720609664917,
      "learning_rate": 2.2654856735227677e-05,
      "loss": 2.4151,
      "step": 92000
    },
    {
      "epoch": 10.997503269528,
      "grad_norm": 0.9903433918952942,
      "learning_rate": 2.2506241826180003e-05,
      "loss": 2.4332,
      "step": 92500
    },
    {
      "epoch": 11.05694923314707,
      "grad_norm": 0.9285963177680969,
      "learning_rate": 2.2357626917132325e-05,
      "loss": 2.3895,
      "step": 93000
    },
    {
      "epoch": 11.11639519676614,
      "grad_norm": 1.0996205806732178,
      "learning_rate": 2.220901200808465e-05,
      "loss": 2.3858,
      "step": 93500
    },
    {
      "epoch": 11.17584116038521,
      "grad_norm": 0.9550360441207886,
      "learning_rate": 2.2060397099036974e-05,
      "loss": 2.4016,
      "step": 94000
    },
    {
      "epoch": 11.23528712400428,
      "grad_norm": 1.3018606901168823,
      "learning_rate": 2.19117821899893e-05,
      "loss": 2.4031,
      "step": 94500
    },
    {
      "epoch": 11.294733087623351,
      "grad_norm": 0.9388914704322815,
      "learning_rate": 2.1763167280941626e-05,
      "loss": 2.4094,
      "step": 95000
    },
    {
      "epoch": 11.35417905124242,
      "grad_norm": 0.9850655794143677,
      "learning_rate": 2.161455237189395e-05,
      "loss": 2.4054,
      "step": 95500
    },
    {
      "epoch": 11.413625014861491,
      "grad_norm": 1.038522481918335,
      "learning_rate": 2.1465937462846275e-05,
      "loss": 2.3895,
      "step": 96000
    },
    {
      "epoch": 11.47307097848056,
      "grad_norm": 1.0989197492599487,
      "learning_rate": 2.1317322553798598e-05,
      "loss": 2.4019,
      "step": 96500
    },
    {
      "epoch": 11.532516942099631,
      "grad_norm": 1.0527700185775757,
      "learning_rate": 2.1168707644750924e-05,
      "loss": 2.399,
      "step": 97000
    },
    {
      "epoch": 11.591962905718702,
      "grad_norm": 1.273655652999878,
      "learning_rate": 2.1020092735703247e-05,
      "loss": 2.4259,
      "step": 97500
    },
    {
      "epoch": 11.651408869337772,
      "grad_norm": 1.002064824104309,
      "learning_rate": 2.0871477826655573e-05,
      "loss": 2.4073,
      "step": 98000
    },
    {
      "epoch": 11.710854832956842,
      "grad_norm": 0.9922045469284058,
      "learning_rate": 2.0722862917607896e-05,
      "loss": 2.4059,
      "step": 98500
    },
    {
      "epoch": 11.770300796575913,
      "grad_norm": 0.9962035417556763,
      "learning_rate": 2.057424800856022e-05,
      "loss": 2.4174,
      "step": 99000
    },
    {
      "epoch": 11.829746760194983,
      "grad_norm": 1.0998961925506592,
      "learning_rate": 2.0425633099512544e-05,
      "loss": 2.4133,
      "step": 99500
    },
    {
      "epoch": 11.889192723814054,
      "grad_norm": 1.0380686521530151,
      "learning_rate": 2.027701819046487e-05,
      "loss": 2.414,
      "step": 100000
    }
  ],
  "logging_steps": 500,
  "max_steps": 168220,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "total_flos": 4.1803850907648e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}