{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.889192723814054,
"eval_steps": 500,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.059445963619070265,
"grad_norm": 2.0723960399627686,
"learning_rate": 4.985138509095233e-05,
"loss": 3.6788,
"step": 500
},
{
"epoch": 0.11889192723814053,
"grad_norm": 1.9278995990753174,
"learning_rate": 4.970277018190465e-05,
"loss": 3.4742,
"step": 1000
},
{
"epoch": 0.1783378908572108,
"grad_norm": 1.4848977327346802,
"learning_rate": 4.955415527285698e-05,
"loss": 3.3942,
"step": 1500
},
{
"epoch": 0.23778385447628106,
"grad_norm": 1.3492341041564941,
"learning_rate": 4.94055403638093e-05,
"loss": 3.3358,
"step": 2000
},
{
"epoch": 0.2972298180953513,
"grad_norm": 1.212128758430481,
"learning_rate": 4.925692545476163e-05,
"loss": 3.2851,
"step": 2500
},
{
"epoch": 0.3566757817144216,
"grad_norm": 1.1597293615341187,
"learning_rate": 4.9108310545713945e-05,
"loss": 3.2331,
"step": 3000
},
{
"epoch": 0.41612174533349183,
"grad_norm": 0.9653922319412231,
"learning_rate": 4.8959695636666275e-05,
"loss": 3.2339,
"step": 3500
},
{
"epoch": 0.4755677089525621,
"grad_norm": 1.0085793733596802,
"learning_rate": 4.88110807276186e-05,
"loss": 3.1856,
"step": 4000
},
{
"epoch": 0.5350136725716323,
"grad_norm": 1.0556505918502808,
"learning_rate": 4.866246581857092e-05,
"loss": 3.1748,
"step": 4500
},
{
"epoch": 0.5944596361907026,
"grad_norm": 0.9526228904724121,
"learning_rate": 4.851385090952324e-05,
"loss": 3.1529,
"step": 5000
},
{
"epoch": 0.6539055998097729,
"grad_norm": 0.984980046749115,
"learning_rate": 4.836523600047557e-05,
"loss": 3.1378,
"step": 5500
},
{
"epoch": 0.7133515634288432,
"grad_norm": 1.0135027170181274,
"learning_rate": 4.8216621091427895e-05,
"loss": 3.0848,
"step": 6000
},
{
"epoch": 0.7727975270479135,
"grad_norm": 0.9454924464225769,
"learning_rate": 4.806800618238022e-05,
"loss": 3.0916,
"step": 6500
},
{
"epoch": 0.8322434906669837,
"grad_norm": 0.9793129563331604,
"learning_rate": 4.791939127333254e-05,
"loss": 3.0642,
"step": 7000
},
{
"epoch": 0.891689454286054,
"grad_norm": 0.9016062617301941,
"learning_rate": 4.777077636428487e-05,
"loss": 3.0657,
"step": 7500
},
{
"epoch": 0.9511354179051242,
"grad_norm": 0.8690605163574219,
"learning_rate": 4.762216145523719e-05,
"loss": 3.0281,
"step": 8000
},
{
"epoch": 1.0105813815241944,
"grad_norm": 0.891808271408081,
"learning_rate": 4.7473546546189516e-05,
"loss": 3.0155,
"step": 8500
},
{
"epoch": 1.0700273451432647,
"grad_norm": 0.9521974325180054,
"learning_rate": 4.732493163714184e-05,
"loss": 2.9713,
"step": 9000
},
{
"epoch": 1.129473308762335,
"grad_norm": 0.9132643938064575,
"learning_rate": 4.717631672809417e-05,
"loss": 2.9663,
"step": 9500
},
{
"epoch": 1.1889192723814053,
"grad_norm": 0.909182608127594,
"learning_rate": 4.702770181904649e-05,
"loss": 2.9616,
"step": 10000
},
{
"epoch": 1.2483652360004756,
"grad_norm": 0.912726104259491,
"learning_rate": 4.687908690999881e-05,
"loss": 2.9653,
"step": 10500
},
{
"epoch": 1.3078111996195458,
"grad_norm": 0.8568936586380005,
"learning_rate": 4.6730472000951136e-05,
"loss": 2.9486,
"step": 11000
},
{
"epoch": 1.3672571632386161,
"grad_norm": 0.9120291471481323,
"learning_rate": 4.6581857091903465e-05,
"loss": 2.932,
"step": 11500
},
{
"epoch": 1.4267031268576864,
"grad_norm": 0.981961190700531,
"learning_rate": 4.643324218285579e-05,
"loss": 2.9345,
"step": 12000
},
{
"epoch": 1.4861490904767567,
"grad_norm": 0.9763424396514893,
"learning_rate": 4.628462727380811e-05,
"loss": 2.9193,
"step": 12500
},
{
"epoch": 1.545595054095827,
"grad_norm": 0.8868328332901001,
"learning_rate": 4.6136012364760434e-05,
"loss": 2.9164,
"step": 13000
},
{
"epoch": 1.605041017714897,
"grad_norm": 0.9175488352775574,
"learning_rate": 4.598739745571276e-05,
"loss": 2.8932,
"step": 13500
},
{
"epoch": 1.6644869813339676,
"grad_norm": 0.890186607837677,
"learning_rate": 4.583878254666508e-05,
"loss": 2.8933,
"step": 14000
},
{
"epoch": 1.7239329449530376,
"grad_norm": 0.9198343753814697,
"learning_rate": 4.569016763761741e-05,
"loss": 2.881,
"step": 14500
},
{
"epoch": 1.783378908572108,
"grad_norm": 0.9706104397773743,
"learning_rate": 4.554155272856973e-05,
"loss": 2.8705,
"step": 15000
},
{
"epoch": 1.8428248721911782,
"grad_norm": 0.9355807304382324,
"learning_rate": 4.539293781952206e-05,
"loss": 2.8601,
"step": 15500
},
{
"epoch": 1.9022708358102485,
"grad_norm": 0.8972137570381165,
"learning_rate": 4.524432291047438e-05,
"loss": 2.8632,
"step": 16000
},
{
"epoch": 1.9617167994293188,
"grad_norm": 0.8553013801574707,
"learning_rate": 4.5095708001426706e-05,
"loss": 2.8696,
"step": 16500
},
{
"epoch": 2.021162763048389,
"grad_norm": 0.8952363133430481,
"learning_rate": 4.494709309237903e-05,
"loss": 2.8541,
"step": 17000
},
{
"epoch": 2.0806087266674593,
"grad_norm": 0.8947279453277588,
"learning_rate": 4.479847818333135e-05,
"loss": 2.8203,
"step": 17500
},
{
"epoch": 2.1400546902865294,
"grad_norm": 0.8680304884910583,
"learning_rate": 4.4649863274283674e-05,
"loss": 2.8088,
"step": 18000
},
{
"epoch": 2.1995006539056,
"grad_norm": 0.8425644040107727,
"learning_rate": 4.4501248365236004e-05,
"loss": 2.8064,
"step": 18500
},
{
"epoch": 2.25894661752467,
"grad_norm": 0.9474213719367981,
"learning_rate": 4.4352633456188327e-05,
"loss": 2.7851,
"step": 19000
},
{
"epoch": 2.3183925811437405,
"grad_norm": 0.9292487502098083,
"learning_rate": 4.420401854714065e-05,
"loss": 2.8062,
"step": 19500
},
{
"epoch": 2.3778385447628105,
"grad_norm": 0.8527488708496094,
"learning_rate": 4.405540363809297e-05,
"loss": 2.7851,
"step": 20000
},
{
"epoch": 2.437284508381881,
"grad_norm": 0.9439261555671692,
"learning_rate": 4.39067887290453e-05,
"loss": 2.7873,
"step": 20500
},
{
"epoch": 2.496730472000951,
"grad_norm": 0.9343836903572083,
"learning_rate": 4.3758173819997624e-05,
"loss": 2.7611,
"step": 21000
},
{
"epoch": 2.5561764356200216,
"grad_norm": 0.9050599932670593,
"learning_rate": 4.360955891094995e-05,
"loss": 2.767,
"step": 21500
},
{
"epoch": 2.6156223992390917,
"grad_norm": 0.9053699374198914,
"learning_rate": 4.346094400190227e-05,
"loss": 2.7873,
"step": 22000
},
{
"epoch": 2.6750683628581617,
"grad_norm": 0.9282116293907166,
"learning_rate": 4.33123290928546e-05,
"loss": 2.7607,
"step": 22500
},
{
"epoch": 2.7345143264772322,
"grad_norm": 0.9617480635643005,
"learning_rate": 4.316371418380692e-05,
"loss": 2.7678,
"step": 23000
},
{
"epoch": 2.7939602900963023,
"grad_norm": 0.9725137948989868,
"learning_rate": 4.3015099274759244e-05,
"loss": 2.7665,
"step": 23500
},
{
"epoch": 2.853406253715373,
"grad_norm": 0.9514666199684143,
"learning_rate": 4.286648436571157e-05,
"loss": 2.7534,
"step": 24000
},
{
"epoch": 2.912852217334443,
"grad_norm": 0.9485461115837097,
"learning_rate": 4.27178694566639e-05,
"loss": 2.7306,
"step": 24500
},
{
"epoch": 2.9722981809535134,
"grad_norm": 1.014106035232544,
"learning_rate": 4.256925454761622e-05,
"loss": 2.736,
"step": 25000
},
{
"epoch": 3.0317441445725835,
"grad_norm": 0.9117903113365173,
"learning_rate": 4.242063963856854e-05,
"loss": 2.7278,
"step": 25500
},
{
"epoch": 3.091190108191654,
"grad_norm": 0.8904880881309509,
"learning_rate": 4.2272024729520865e-05,
"loss": 2.7156,
"step": 26000
},
{
"epoch": 3.150636071810724,
"grad_norm": 0.8653568625450134,
"learning_rate": 4.2123409820473194e-05,
"loss": 2.7137,
"step": 26500
},
{
"epoch": 3.210082035429794,
"grad_norm": 0.9386480450630188,
"learning_rate": 4.197479491142551e-05,
"loss": 2.7021,
"step": 27000
},
{
"epoch": 3.2695279990488646,
"grad_norm": 1.0122427940368652,
"learning_rate": 4.182618000237784e-05,
"loss": 2.699,
"step": 27500
},
{
"epoch": 3.3289739626679347,
"grad_norm": 0.9319558143615723,
"learning_rate": 4.167756509333017e-05,
"loss": 2.689,
"step": 28000
},
{
"epoch": 3.388419926287005,
"grad_norm": 0.9281746745109558,
"learning_rate": 4.152895018428249e-05,
"loss": 2.7027,
"step": 28500
},
{
"epoch": 3.4478658899060752,
"grad_norm": 0.9750462770462036,
"learning_rate": 4.1380335275234815e-05,
"loss": 2.6947,
"step": 29000
},
{
"epoch": 3.5073118535251457,
"grad_norm": 0.8887720704078674,
"learning_rate": 4.123172036618714e-05,
"loss": 2.6864,
"step": 29500
},
{
"epoch": 3.566757817144216,
"grad_norm": 0.9884176254272461,
"learning_rate": 4.108310545713947e-05,
"loss": 2.6893,
"step": 30000
},
{
"epoch": 3.6262037807632863,
"grad_norm": 0.9995080828666687,
"learning_rate": 4.093449054809178e-05,
"loss": 2.6734,
"step": 30500
},
{
"epoch": 3.6856497443823564,
"grad_norm": 1.0068608522415161,
"learning_rate": 4.078587563904411e-05,
"loss": 2.6766,
"step": 31000
},
{
"epoch": 3.7450957080014264,
"grad_norm": 1.0225422382354736,
"learning_rate": 4.0637260729996435e-05,
"loss": 2.6757,
"step": 31500
},
{
"epoch": 3.804541671620497,
"grad_norm": 0.9354658126831055,
"learning_rate": 4.0488645820948765e-05,
"loss": 2.6593,
"step": 32000
},
{
"epoch": 3.8639876352395675,
"grad_norm": 0.9209592938423157,
"learning_rate": 4.034003091190108e-05,
"loss": 2.6547,
"step": 32500
},
{
"epoch": 3.9234335988586375,
"grad_norm": 0.8945015668869019,
"learning_rate": 4.019141600285341e-05,
"loss": 2.6719,
"step": 33000
},
{
"epoch": 3.9828795624777076,
"grad_norm": 0.9823748469352722,
"learning_rate": 4.004280109380573e-05,
"loss": 2.6781,
"step": 33500
},
{
"epoch": 4.042325526096778,
"grad_norm": 1.0186822414398193,
"learning_rate": 3.989418618475806e-05,
"loss": 2.6469,
"step": 34000
},
{
"epoch": 4.101771489715849,
"grad_norm": 0.9255732297897339,
"learning_rate": 3.974557127571038e-05,
"loss": 2.6296,
"step": 34500
},
{
"epoch": 4.161217453334919,
"grad_norm": 1.0235294103622437,
"learning_rate": 3.959695636666271e-05,
"loss": 2.6358,
"step": 35000
},
{
"epoch": 4.220663416953989,
"grad_norm": 0.911547064781189,
"learning_rate": 3.944834145761503e-05,
"loss": 2.6354,
"step": 35500
},
{
"epoch": 4.280109380573059,
"grad_norm": 1.0124516487121582,
"learning_rate": 3.929972654856735e-05,
"loss": 2.6416,
"step": 36000
},
{
"epoch": 4.33955534419213,
"grad_norm": 1.0222316980361938,
"learning_rate": 3.9151111639519676e-05,
"loss": 2.6188,
"step": 36500
},
{
"epoch": 4.3990013078112,
"grad_norm": 0.9710135459899902,
"learning_rate": 3.9002496730472005e-05,
"loss": 2.6228,
"step": 37000
},
{
"epoch": 4.45844727143027,
"grad_norm": 1.0287182331085205,
"learning_rate": 3.885388182142433e-05,
"loss": 2.6067,
"step": 37500
},
{
"epoch": 4.51789323504934,
"grad_norm": 0.9699456095695496,
"learning_rate": 3.870526691237665e-05,
"loss": 2.6385,
"step": 38000
},
{
"epoch": 4.57733919866841,
"grad_norm": 0.9066009521484375,
"learning_rate": 3.855665200332897e-05,
"loss": 2.6284,
"step": 38500
},
{
"epoch": 4.636785162287481,
"grad_norm": 0.8537769317626953,
"learning_rate": 3.84080370942813e-05,
"loss": 2.6135,
"step": 39000
},
{
"epoch": 4.696231125906551,
"grad_norm": 1.0666980743408203,
"learning_rate": 3.8259422185233626e-05,
"loss": 2.6312,
"step": 39500
},
{
"epoch": 4.755677089525621,
"grad_norm": 1.0641474723815918,
"learning_rate": 3.811080727618595e-05,
"loss": 2.6127,
"step": 40000
},
{
"epoch": 4.815123053144691,
"grad_norm": 1.076323390007019,
"learning_rate": 3.796219236713827e-05,
"loss": 2.6184,
"step": 40500
},
{
"epoch": 4.874569016763762,
"grad_norm": 0.8963558077812195,
"learning_rate": 3.78135774580906e-05,
"loss": 2.6165,
"step": 41000
},
{
"epoch": 4.934014980382832,
"grad_norm": 0.968908429145813,
"learning_rate": 3.766496254904292e-05,
"loss": 2.6009,
"step": 41500
},
{
"epoch": 4.993460944001902,
"grad_norm": 0.9362033605575562,
"learning_rate": 3.7516347639995246e-05,
"loss": 2.5956,
"step": 42000
},
{
"epoch": 5.052906907620972,
"grad_norm": 1.1101199388504028,
"learning_rate": 3.736773273094757e-05,
"loss": 2.5755,
"step": 42500
},
{
"epoch": 5.112352871240043,
"grad_norm": 1.2178868055343628,
"learning_rate": 3.72191178218999e-05,
"loss": 2.5724,
"step": 43000
},
{
"epoch": 5.171798834859113,
"grad_norm": 1.0143418312072754,
"learning_rate": 3.707050291285222e-05,
"loss": 2.5834,
"step": 43500
},
{
"epoch": 5.231244798478183,
"grad_norm": 0.9720271825790405,
"learning_rate": 3.6921888003804544e-05,
"loss": 2.586,
"step": 44000
},
{
"epoch": 5.290690762097253,
"grad_norm": 0.8847070932388306,
"learning_rate": 3.6773273094756866e-05,
"loss": 2.5953,
"step": 44500
},
{
"epoch": 5.3501367257163235,
"grad_norm": 0.9654759764671326,
"learning_rate": 3.6624658185709196e-05,
"loss": 2.5777,
"step": 45000
},
{
"epoch": 5.409582689335394,
"grad_norm": 0.9272730350494385,
"learning_rate": 3.647604327666151e-05,
"loss": 2.5774,
"step": 45500
},
{
"epoch": 5.4690286529544645,
"grad_norm": 0.9674676656723022,
"learning_rate": 3.632742836761384e-05,
"loss": 2.5779,
"step": 46000
},
{
"epoch": 5.528474616573535,
"grad_norm": 1.0238367319107056,
"learning_rate": 3.6178813458566164e-05,
"loss": 2.5683,
"step": 46500
},
{
"epoch": 5.587920580192605,
"grad_norm": 1.1663753986358643,
"learning_rate": 3.603019854951849e-05,
"loss": 2.5802,
"step": 47000
},
{
"epoch": 5.647366543811675,
"grad_norm": 0.8961432576179504,
"learning_rate": 3.588158364047081e-05,
"loss": 2.5726,
"step": 47500
},
{
"epoch": 5.706812507430746,
"grad_norm": 1.1115467548370361,
"learning_rate": 3.573296873142314e-05,
"loss": 2.5719,
"step": 48000
},
{
"epoch": 5.766258471049816,
"grad_norm": 1.00434148311615,
"learning_rate": 3.558435382237546e-05,
"loss": 2.556,
"step": 48500
},
{
"epoch": 5.825704434668886,
"grad_norm": 1.1120518445968628,
"learning_rate": 3.5435738913327784e-05,
"loss": 2.5627,
"step": 49000
},
{
"epoch": 5.885150398287957,
"grad_norm": 0.9611983299255371,
"learning_rate": 3.528712400428011e-05,
"loss": 2.5568,
"step": 49500
},
{
"epoch": 5.944596361907027,
"grad_norm": 1.1176481246948242,
"learning_rate": 3.5138509095232436e-05,
"loss": 2.5634,
"step": 50000
},
{
"epoch": 6.004042325526097,
"grad_norm": 0.8676426410675049,
"learning_rate": 3.498989418618476e-05,
"loss": 2.5551,
"step": 50500
},
{
"epoch": 6.063488289145167,
"grad_norm": 0.8983253240585327,
"learning_rate": 3.484127927713708e-05,
"loss": 2.5442,
"step": 51000
},
{
"epoch": 6.122934252764237,
"grad_norm": 0.9558296203613281,
"learning_rate": 3.4692664368089405e-05,
"loss": 2.5415,
"step": 51500
},
{
"epoch": 6.182380216383308,
"grad_norm": 1.1759629249572754,
"learning_rate": 3.4544049459041734e-05,
"loss": 2.5186,
"step": 52000
},
{
"epoch": 6.241826180002378,
"grad_norm": 1.186232089996338,
"learning_rate": 3.439543454999406e-05,
"loss": 2.5437,
"step": 52500
},
{
"epoch": 6.301272143621448,
"grad_norm": 1.1072938442230225,
"learning_rate": 3.424681964094638e-05,
"loss": 2.5442,
"step": 53000
},
{
"epoch": 6.360718107240518,
"grad_norm": 1.1854956150054932,
"learning_rate": 3.40982047318987e-05,
"loss": 2.5265,
"step": 53500
},
{
"epoch": 6.420164070859588,
"grad_norm": 1.037420392036438,
"learning_rate": 3.394958982285103e-05,
"loss": 2.5101,
"step": 54000
},
{
"epoch": 6.479610034478659,
"grad_norm": 1.0414271354675293,
"learning_rate": 3.3800974913803354e-05,
"loss": 2.5291,
"step": 54500
},
{
"epoch": 6.539055998097729,
"grad_norm": 0.8827362656593323,
"learning_rate": 3.365236000475568e-05,
"loss": 2.5187,
"step": 55000
},
{
"epoch": 6.598501961716799,
"grad_norm": 0.9146121144294739,
"learning_rate": 3.3503745095708007e-05,
"loss": 2.5234,
"step": 55500
},
{
"epoch": 6.657947925335869,
"grad_norm": 1.0134857892990112,
"learning_rate": 3.335513018666033e-05,
"loss": 2.5199,
"step": 56000
},
{
"epoch": 6.71739388895494,
"grad_norm": 1.1852586269378662,
"learning_rate": 3.320651527761265e-05,
"loss": 2.5347,
"step": 56500
},
{
"epoch": 6.77683985257401,
"grad_norm": 1.0739943981170654,
"learning_rate": 3.3057900368564975e-05,
"loss": 2.5367,
"step": 57000
},
{
"epoch": 6.83628581619308,
"grad_norm": 0.9880659580230713,
"learning_rate": 3.2909285459517304e-05,
"loss": 2.5181,
"step": 57500
},
{
"epoch": 6.8957317798121505,
"grad_norm": 1.0519931316375732,
"learning_rate": 3.276067055046963e-05,
"loss": 2.5325,
"step": 58000
},
{
"epoch": 6.955177743431221,
"grad_norm": 0.9463315010070801,
"learning_rate": 3.261205564142195e-05,
"loss": 2.5384,
"step": 58500
},
{
"epoch": 7.0146237070502915,
"grad_norm": 0.9906750917434692,
"learning_rate": 3.246344073237427e-05,
"loss": 2.5374,
"step": 59000
},
{
"epoch": 7.0740696706693615,
"grad_norm": 0.9740419983863831,
"learning_rate": 3.23148258233266e-05,
"loss": 2.4919,
"step": 59500
},
{
"epoch": 7.133515634288432,
"grad_norm": 1.0209646224975586,
"learning_rate": 3.2166210914278925e-05,
"loss": 2.5065,
"step": 60000
},
{
"epoch": 7.192961597907502,
"grad_norm": 1.1537789106369019,
"learning_rate": 3.201759600523125e-05,
"loss": 2.4888,
"step": 60500
},
{
"epoch": 7.252407561526573,
"grad_norm": 1.0545387268066406,
"learning_rate": 3.186898109618357e-05,
"loss": 2.5042,
"step": 61000
},
{
"epoch": 7.311853525145643,
"grad_norm": 0.8990502953529358,
"learning_rate": 3.17203661871359e-05,
"loss": 2.4956,
"step": 61500
},
{
"epoch": 7.371299488764713,
"grad_norm": 1.0004386901855469,
"learning_rate": 3.1571751278088215e-05,
"loss": 2.5096,
"step": 62000
},
{
"epoch": 7.430745452383783,
"grad_norm": 1.192317008972168,
"learning_rate": 3.1423136369040545e-05,
"loss": 2.5038,
"step": 62500
},
{
"epoch": 7.490191416002854,
"grad_norm": 0.9577484726905823,
"learning_rate": 3.127452145999287e-05,
"loss": 2.5113,
"step": 63000
},
{
"epoch": 7.549637379621924,
"grad_norm": 0.8835137486457825,
"learning_rate": 3.11259065509452e-05,
"loss": 2.4939,
"step": 63500
},
{
"epoch": 7.609083343240994,
"grad_norm": 0.8289955258369446,
"learning_rate": 3.097729164189751e-05,
"loss": 2.4716,
"step": 64000
},
{
"epoch": 7.668529306860064,
"grad_norm": 0.9576908349990845,
"learning_rate": 3.082867673284984e-05,
"loss": 2.5109,
"step": 64500
},
{
"epoch": 7.727975270479135,
"grad_norm": 0.9045142531394958,
"learning_rate": 3.0680061823802165e-05,
"loss": 2.4811,
"step": 65000
},
{
"epoch": 7.787421234098205,
"grad_norm": 1.3150789737701416,
"learning_rate": 3.053144691475449e-05,
"loss": 2.505,
"step": 65500
},
{
"epoch": 7.846867197717275,
"grad_norm": 0.9815430641174316,
"learning_rate": 3.0382832005706814e-05,
"loss": 2.4923,
"step": 66000
},
{
"epoch": 7.906313161336345,
"grad_norm": 1.0355448722839355,
"learning_rate": 3.023421709665914e-05,
"loss": 2.4867,
"step": 66500
},
{
"epoch": 7.965759124955415,
"grad_norm": 1.0244001150131226,
"learning_rate": 3.0085602187611463e-05,
"loss": 2.4973,
"step": 67000
},
{
"epoch": 8.025205088574486,
"grad_norm": 1.052660584449768,
"learning_rate": 2.993698727856379e-05,
"loss": 2.4976,
"step": 67500
},
{
"epoch": 8.084651052193555,
"grad_norm": 1.1590783596038818,
"learning_rate": 2.978837236951611e-05,
"loss": 2.4631,
"step": 68000
},
{
"epoch": 8.144097015812626,
"grad_norm": 0.9065755605697632,
"learning_rate": 2.9639757460468438e-05,
"loss": 2.4494,
"step": 68500
},
{
"epoch": 8.203542979431697,
"grad_norm": 0.9562356472015381,
"learning_rate": 2.9491142551420757e-05,
"loss": 2.4728,
"step": 69000
},
{
"epoch": 8.262988943050766,
"grad_norm": 0.9509665966033936,
"learning_rate": 2.9342527642373087e-05,
"loss": 2.4747,
"step": 69500
},
{
"epoch": 8.322434906669837,
"grad_norm": 0.9384153485298157,
"learning_rate": 2.9193912733325406e-05,
"loss": 2.4745,
"step": 70000
},
{
"epoch": 8.381880870288906,
"grad_norm": 0.9459151029586792,
"learning_rate": 2.9045297824277735e-05,
"loss": 2.476,
"step": 70500
},
{
"epoch": 8.441326833907977,
"grad_norm": 0.9553677439689636,
"learning_rate": 2.8896682915230055e-05,
"loss": 2.4753,
"step": 71000
},
{
"epoch": 8.500772797527048,
"grad_norm": 1.014932632446289,
"learning_rate": 2.8748068006182384e-05,
"loss": 2.4647,
"step": 71500
},
{
"epoch": 8.560218761146118,
"grad_norm": 0.990463376045227,
"learning_rate": 2.8599453097134704e-05,
"loss": 2.4782,
"step": 72000
},
{
"epoch": 8.619664724765189,
"grad_norm": 0.892906665802002,
"learning_rate": 2.845083818808703e-05,
"loss": 2.4736,
"step": 72500
},
{
"epoch": 8.67911068838426,
"grad_norm": 0.9943811297416687,
"learning_rate": 2.8302223279039352e-05,
"loss": 2.4554,
"step": 73000
},
{
"epoch": 8.738556652003329,
"grad_norm": 0.9325155019760132,
"learning_rate": 2.815360836999168e-05,
"loss": 2.4703,
"step": 73500
},
{
"epoch": 8.7980026156224,
"grad_norm": 0.9389231204986572,
"learning_rate": 2.8004993460944e-05,
"loss": 2.4727,
"step": 74000
},
{
"epoch": 8.857448579241469,
"grad_norm": 0.9121980667114258,
"learning_rate": 2.7856378551896327e-05,
"loss": 2.4533,
"step": 74500
},
{
"epoch": 8.91689454286054,
"grad_norm": 1.046366572380066,
"learning_rate": 2.770776364284865e-05,
"loss": 2.4652,
"step": 75000
},
{
"epoch": 8.97634050647961,
"grad_norm": 1.0157803297042847,
"learning_rate": 2.7559148733800976e-05,
"loss": 2.4701,
"step": 75500
},
{
"epoch": 9.03578647009868,
"grad_norm": 1.1012301445007324,
"learning_rate": 2.74105338247533e-05,
"loss": 2.4491,
"step": 76000
},
{
"epoch": 9.09523243371775,
"grad_norm": 1.000829815864563,
"learning_rate": 2.7261918915705625e-05,
"loss": 2.4434,
"step": 76500
},
{
"epoch": 9.15467839733682,
"grad_norm": 1.028676986694336,
"learning_rate": 2.7113304006657948e-05,
"loss": 2.4392,
"step": 77000
},
{
"epoch": 9.214124360955891,
"grad_norm": 1.0821462869644165,
"learning_rate": 2.6964689097610274e-05,
"loss": 2.4289,
"step": 77500
},
{
"epoch": 9.273570324574962,
"grad_norm": 0.951738715171814,
"learning_rate": 2.6816074188562596e-05,
"loss": 2.4437,
"step": 78000
},
{
"epoch": 9.333016288194031,
"grad_norm": 0.9170756936073303,
"learning_rate": 2.6667459279514923e-05,
"loss": 2.4507,
"step": 78500
},
{
"epoch": 9.392462251813102,
"grad_norm": 0.9591684937477112,
"learning_rate": 2.6518844370467245e-05,
"loss": 2.4584,
"step": 79000
},
{
"epoch": 9.451908215432173,
"grad_norm": 1.1289016008377075,
"learning_rate": 2.637022946141957e-05,
"loss": 2.4595,
"step": 79500
},
{
"epoch": 9.511354179051242,
"grad_norm": 1.0114785432815552,
"learning_rate": 2.6221614552371894e-05,
"loss": 2.4404,
"step": 80000
},
{
"epoch": 9.570800142670313,
"grad_norm": 1.1835304498672485,
"learning_rate": 2.607299964332422e-05,
"loss": 2.4308,
"step": 80500
},
{
"epoch": 9.630246106289382,
"grad_norm": 0.9822309017181396,
"learning_rate": 2.592438473427654e-05,
"loss": 2.4387,
"step": 81000
},
{
"epoch": 9.689692069908453,
"grad_norm": 1.114311695098877,
"learning_rate": 2.577576982522887e-05,
"loss": 2.4519,
"step": 81500
},
{
"epoch": 9.749138033527524,
"grad_norm": 1.1047866344451904,
"learning_rate": 2.5627154916181195e-05,
"loss": 2.4497,
"step": 82000
},
{
"epoch": 9.808583997146593,
"grad_norm": 0.9930892586708069,
"learning_rate": 2.5478540007133518e-05,
"loss": 2.4489,
"step": 82500
},
{
"epoch": 9.868029960765664,
"grad_norm": 1.1107361316680908,
"learning_rate": 2.5329925098085844e-05,
"loss": 2.4399,
"step": 83000
},
{
"epoch": 9.927475924384733,
"grad_norm": 1.0770343542099,
"learning_rate": 2.5181310189038167e-05,
"loss": 2.4362,
"step": 83500
},
{
"epoch": 9.986921888003804,
"grad_norm": 0.9818819761276245,
"learning_rate": 2.5032695279990493e-05,
"loss": 2.4418,
"step": 84000
},
{
"epoch": 10.046367851622875,
"grad_norm": 1.1135622262954712,
"learning_rate": 2.4884080370942815e-05,
"loss": 2.428,
"step": 84500
},
{
"epoch": 10.105813815241945,
"grad_norm": 1.035888671875,
"learning_rate": 2.4735465461895138e-05,
"loss": 2.4193,
"step": 85000
},
{
"epoch": 10.165259778861016,
"grad_norm": 0.9694905281066895,
"learning_rate": 2.458685055284746e-05,
"loss": 2.4165,
"step": 85500
},
{
"epoch": 10.224705742480086,
"grad_norm": 1.116449236869812,
"learning_rate": 2.4438235643799787e-05,
"loss": 2.4122,
"step": 86000
},
{
"epoch": 10.284151706099156,
"grad_norm": 0.9860423803329468,
"learning_rate": 2.428962073475211e-05,
"loss": 2.4173,
"step": 86500
},
{
"epoch": 10.343597669718227,
"grad_norm": 1.1727473735809326,
"learning_rate": 2.4141005825704436e-05,
"loss": 2.4258,
"step": 87000
},
{
"epoch": 10.403043633337296,
"grad_norm": 1.0731017589569092,
"learning_rate": 2.399239091665676e-05,
"loss": 2.4289,
"step": 87500
},
{
"epoch": 10.462489596956367,
"grad_norm": 1.0740883350372314,
"learning_rate": 2.3843776007609085e-05,
"loss": 2.4142,
"step": 88000
},
{
"epoch": 10.521935560575438,
"grad_norm": 1.1342713832855225,
"learning_rate": 2.3695161098561407e-05,
"loss": 2.4315,
"step": 88500
},
{
"epoch": 10.581381524194507,
"grad_norm": 1.0230334997177124,
"learning_rate": 2.3546546189513733e-05,
"loss": 2.4352,
"step": 89000
},
{
"epoch": 10.640827487813578,
"grad_norm": 1.0113749504089355,
"learning_rate": 2.3397931280466056e-05,
"loss": 2.4128,
"step": 89500
},
{
"epoch": 10.700273451432647,
"grad_norm": 1.0363703966140747,
"learning_rate": 2.3249316371418382e-05,
"loss": 2.4343,
"step": 90000
},
{
"epoch": 10.759719415051718,
"grad_norm": 1.0065736770629883,
"learning_rate": 2.3100701462370705e-05,
"loss": 2.4268,
"step": 90500
},
{
"epoch": 10.819165378670789,
"grad_norm": 0.949798047542572,
"learning_rate": 2.295208655332303e-05,
"loss": 2.4114,
"step": 91000
},
{
"epoch": 10.878611342289858,
"grad_norm": 0.9772433042526245,
"learning_rate": 2.2803471644275354e-05,
"loss": 2.4187,
"step": 91500
},
{
"epoch": 10.938057305908929,
"grad_norm": 0.9436720609664917,
"learning_rate": 2.2654856735227677e-05,
"loss": 2.4151,
"step": 92000
},
{
"epoch": 10.997503269528,
"grad_norm": 0.9903433918952942,
"learning_rate": 2.2506241826180003e-05,
"loss": 2.4332,
"step": 92500
},
{
"epoch": 11.05694923314707,
"grad_norm": 0.9285963177680969,
"learning_rate": 2.2357626917132325e-05,
"loss": 2.3895,
"step": 93000
},
{
"epoch": 11.11639519676614,
"grad_norm": 1.0996205806732178,
"learning_rate": 2.220901200808465e-05,
"loss": 2.3858,
"step": 93500
},
{
"epoch": 11.17584116038521,
"grad_norm": 0.9550360441207886,
"learning_rate": 2.2060397099036974e-05,
"loss": 2.4016,
"step": 94000
},
{
"epoch": 11.23528712400428,
"grad_norm": 1.3018606901168823,
"learning_rate": 2.19117821899893e-05,
"loss": 2.4031,
"step": 94500
},
{
"epoch": 11.294733087623351,
"grad_norm": 0.9388914704322815,
"learning_rate": 2.1763167280941626e-05,
"loss": 2.4094,
"step": 95000
},
{
"epoch": 11.35417905124242,
"grad_norm": 0.9850655794143677,
"learning_rate": 2.161455237189395e-05,
"loss": 2.4054,
"step": 95500
},
{
"epoch": 11.413625014861491,
"grad_norm": 1.038522481918335,
"learning_rate": 2.1465937462846275e-05,
"loss": 2.3895,
"step": 96000
},
{
"epoch": 11.47307097848056,
"grad_norm": 1.0989197492599487,
"learning_rate": 2.1317322553798598e-05,
"loss": 2.4019,
"step": 96500
},
{
"epoch": 11.532516942099631,
"grad_norm": 1.0527700185775757,
"learning_rate": 2.1168707644750924e-05,
"loss": 2.399,
"step": 97000
},
{
"epoch": 11.591962905718702,
"grad_norm": 1.273655652999878,
"learning_rate": 2.1020092735703247e-05,
"loss": 2.4259,
"step": 97500
},
{
"epoch": 11.651408869337772,
"grad_norm": 1.002064824104309,
"learning_rate": 2.0871477826655573e-05,
"loss": 2.4073,
"step": 98000
},
{
"epoch": 11.710854832956842,
"grad_norm": 0.9922045469284058,
"learning_rate": 2.0722862917607896e-05,
"loss": 2.4059,
"step": 98500
},
{
"epoch": 11.770300796575913,
"grad_norm": 0.9962035417556763,
"learning_rate": 2.057424800856022e-05,
"loss": 2.4174,
"step": 99000
},
{
"epoch": 11.829746760194983,
"grad_norm": 1.0998961925506592,
"learning_rate": 2.0425633099512544e-05,
"loss": 2.4133,
"step": 99500
},
{
"epoch": 11.889192723814054,
"grad_norm": 1.0380686521530151,
"learning_rate": 2.027701819046487e-05,
"loss": 2.414,
"step": 100000
}
],
"logging_steps": 500,
"max_steps": 168220,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 4.1803850907648e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}