{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9661016949152543,
  "eval_steps": 500,
  "global_step": 1400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00211864406779661,
      "grad_norm": 6772.94736433113,
      "learning_rate": 5.263157894736842e-06,
      "loss": 5.4426,
      "step": 1
    },
    {
      "epoch": 0.0211864406779661,
      "grad_norm": 43.76552598391156,
      "learning_rate": 5.263157894736842e-05,
      "loss": 6.5697,
      "step": 10
    },
    {
      "epoch": 0.0423728813559322,
      "grad_norm": 1.4214989301078385,
      "learning_rate": 0.00010526315789473683,
      "loss": 5.8015,
      "step": 20
    },
    {
      "epoch": 0.0635593220338983,
      "grad_norm": 1.8559172078320925,
      "learning_rate": 0.00015789473684210527,
      "loss": 4.6978,
      "step": 30
    },
    {
      "epoch": 0.0847457627118644,
      "grad_norm": 0.9523887604864526,
      "learning_rate": 0.00021052631578947367,
      "loss": 3.9616,
      "step": 40
    },
    {
      "epoch": 0.1059322033898305,
      "grad_norm": 0.5313592686196623,
      "learning_rate": 0.0002631578947368421,
      "loss": 3.2816,
      "step": 50
    },
    {
      "epoch": 0.1271186440677966,
      "grad_norm": 0.9882421975844679,
      "learning_rate": 0.00031578947368421053,
      "loss": 2.9453,
      "step": 60
    },
    {
      "epoch": 0.1483050847457627,
      "grad_norm": 1.3764980998221854,
      "learning_rate": 0.00036842105263157896,
      "loss": 2.7478,
      "step": 70
    },
    {
      "epoch": 0.1694915254237288,
      "grad_norm": 0.6188567319965684,
      "learning_rate": 0.00042105263157894734,
      "loss": 2.6778,
      "step": 80
    },
    {
      "epoch": 0.1906779661016949,
      "grad_norm": 0.45391127998861047,
      "learning_rate": 0.00047368421052631577,
      "loss": 2.6651,
      "step": 90
    },
    {
      "epoch": 0.211864406779661,
      "grad_norm": 0.3359651886818045,
      "learning_rate": 0.0004999904062938913,
      "loss": 2.6871,
      "step": 100
    },
    {
      "epoch": 0.2330508474576271,
      "grad_norm": 0.2538059916992196,
      "learning_rate": 0.0004999136610628463,
      "loss": 2.6047,
      "step": 110
    },
    {
      "epoch": 0.2542372881355932,
      "grad_norm": 0.7420239318499638,
      "learning_rate": 0.0004997601941609823,
      "loss": 2.6122,
      "step": 120
    },
    {
      "epoch": 0.2754237288135593,
      "grad_norm": 1.0130312150097984,
      "learning_rate": 0.0004995300527015189,
      "loss": 2.8314,
      "step": 130
    },
    {
      "epoch": 0.2966101694915254,
      "grad_norm": 0.45185267289061176,
      "learning_rate": 0.0004992233073362051,
      "loss": 2.7241,
      "step": 140
    },
    {
      "epoch": 0.3177966101694915,
      "grad_norm": 1.2647337895757225,
      "learning_rate": 0.0004988400522336304,
      "loss": 2.6269,
      "step": 150
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 1.8112639454685833,
      "learning_rate": 0.0004983804050503152,
      "loss": 2.5217,
      "step": 160
    },
    {
      "epoch": 0.3601694915254237,
      "grad_norm": 3.7153319991642295,
      "learning_rate": 0.0004978445068945918,
      "loss": 2.6363,
      "step": 170
    },
    {
      "epoch": 0.3813559322033898,
      "grad_norm": 1.791954534899048,
      "learning_rate": 0.0004972325222832848,
      "loss": 2.6156,
      "step": 180
    },
    {
      "epoch": 0.4025423728813559,
      "grad_norm": 0.30066185600399886,
      "learning_rate": 0.0004965446390912051,
      "loss": 2.5443,
      "step": 190
    },
    {
      "epoch": 0.423728813559322,
      "grad_norm": 0.155337614316216,
      "learning_rate": 0.0004957810684934746,
      "loss": 2.4791,
      "step": 200
    },
    {
      "epoch": 0.4449152542372881,
      "grad_norm": 0.10527493629733332,
      "learning_rate": 0.0004949420449006968,
      "loss": 2.4345,
      "step": 210
    },
    {
      "epoch": 0.4661016949152542,
      "grad_norm": 0.06207368975658736,
      "learning_rate": 0.0004940278258869937,
      "loss": 2.4284,
      "step": 220
    },
    {
      "epoch": 0.4872881355932203,
      "grad_norm": 0.06174468192832609,
      "learning_rate": 0.0004930386921109333,
      "loss": 2.3855,
      "step": 230
    },
    {
      "epoch": 0.5084745762711864,
      "grad_norm": 0.04921797570119065,
      "learning_rate": 0.0004919749472293693,
      "loss": 2.3648,
      "step": 240
    },
    {
      "epoch": 0.5296610169491526,
      "grad_norm": 0.06272467077360039,
      "learning_rate": 0.00049083691780422,
      "loss": 2.3618,
      "step": 250
    },
    {
      "epoch": 0.5508474576271186,
      "grad_norm": 0.06952127682493715,
      "learning_rate": 0.0004896249532022172,
      "loss": 2.3344,
      "step": 260
    },
    {
      "epoch": 0.5720338983050848,
      "grad_norm": 0.05291425745739597,
      "learning_rate": 0.0004883394254876522,
      "loss": 2.3401,
      "step": 270
    },
    {
      "epoch": 0.5932203389830508,
      "grad_norm": 0.09403878675057362,
      "learning_rate": 0.0004869807293081555,
      "loss": 2.3351,
      "step": 280
    },
    {
      "epoch": 0.614406779661017,
      "grad_norm": 0.07509078154330293,
      "learning_rate": 0.00048554928177354254,
      "loss": 2.3067,
      "step": 290
    },
    {
      "epoch": 0.635593220338983,
      "grad_norm": 0.1705698396439175,
      "learning_rate": 0.0004840455223277639,
      "loss": 2.3114,
      "step": 300
    },
    {
      "epoch": 0.6567796610169492,
      "grad_norm": 0.09664349408312269,
      "learning_rate": 0.0004824699126139995,
      "loss": 2.3059,
      "step": 310
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.08308196121963289,
      "learning_rate": 0.0004808229363329374,
      "loss": 2.2926,
      "step": 320
    },
    {
      "epoch": 0.6991525423728814,
      "grad_norm": 0.0669444065304248,
      "learning_rate": 0.0004791050990942811,
      "loss": 2.2862,
      "step": 330
    },
    {
      "epoch": 0.7203389830508474,
      "grad_norm": 0.05853715985139967,
      "learning_rate": 0.0004773169282615311,
      "loss": 2.2736,
      "step": 340
    },
    {
      "epoch": 0.7415254237288136,
      "grad_norm": 0.056477614254721926,
      "learning_rate": 0.00047545897279008845,
      "loss": 2.2715,
      "step": 350
    },
    {
      "epoch": 0.7627118644067796,
      "grad_norm": 0.0870456690332402,
      "learning_rate": 0.000473531803058729,
      "loss": 2.2543,
      "step": 360
    },
    {
      "epoch": 0.7838983050847458,
      "grad_norm": 0.0954901066459511,
      "learning_rate": 0.0004715360106945015,
      "loss": 2.2752,
      "step": 370
    },
    {
      "epoch": 0.8050847457627118,
      "grad_norm": 0.06902843103766237,
      "learning_rate": 0.00046947220839110225,
      "loss": 2.2698,
      "step": 380
    },
    {
      "epoch": 0.826271186440678,
      "grad_norm": 0.10925884550477084,
      "learning_rate": 0.00046734102972078326,
      "loss": 2.2433,
      "step": 390
    },
    {
      "epoch": 0.847457627118644,
      "grad_norm": 0.0659931072508813,
      "learning_rate": 0.0004651431289398489,
      "loss": 2.2415,
      "step": 400
    },
    {
      "epoch": 0.8686440677966102,
      "grad_norm": 0.06263383305001168,
      "learning_rate": 0.00046287918078780554,
      "loss": 2.2388,
      "step": 410
    },
    {
      "epoch": 0.8898305084745762,
      "grad_norm": 0.09782328625398208,
      "learning_rate": 0.00046054988028022007,
      "loss": 2.2408,
      "step": 420
    },
    {
      "epoch": 0.9110169491525424,
      "grad_norm": 0.10848225540509068,
      "learning_rate": 0.000458155942495356,
      "loss": 2.2337,
      "step": 430
    },
    {
      "epoch": 0.9322033898305084,
      "grad_norm": 0.07281471757622568,
      "learning_rate": 0.0004556981023546495,
      "loss": 2.2173,
      "step": 440
    },
    {
      "epoch": 0.9533898305084746,
      "grad_norm": 0.06260737865841007,
      "learning_rate": 0.00045317711439709377,
      "loss": 2.2175,
      "step": 450
    },
    {
      "epoch": 0.9745762711864406,
      "grad_norm": 0.05780118722032544,
      "learning_rate": 0.00045059375254760106,
      "loss": 2.2194,
      "step": 460
    },
    {
      "epoch": 0.9957627118644068,
      "grad_norm": 0.07389833905985468,
      "learning_rate": 0.0004479488098794134,
      "loss": 2.222,
      "step": 470
    },
    {
      "epoch": 1.0169491525423728,
      "grad_norm": 0.08596309367450944,
      "learning_rate": 0.0004452430983706351,
      "loss": 2.138,
      "step": 480
    },
    {
      "epoch": 1.0381355932203389,
      "grad_norm": 0.0716327488863897,
      "learning_rate": 0.0004424774486549611,
      "loss": 2.1209,
      "step": 490
    },
    {
      "epoch": 1.0593220338983051,
      "grad_norm": 0.1035432059054005,
      "learning_rate": 0.0004396527097666786,
      "loss": 2.129,
      "step": 500
    },
    {
      "epoch": 1.0805084745762712,
      "grad_norm": 0.09433176908109482,
      "learning_rate": 0.0004367697488800197,
      "loss": 2.1266,
      "step": 510
    },
    {
      "epoch": 1.1016949152542372,
      "grad_norm": 0.03811729650126105,
      "learning_rate": 0.00043382945104294534,
      "loss": 2.1261,
      "step": 520
    },
    {
      "epoch": 1.1228813559322033,
      "grad_norm": 0.04741201646286949,
      "learning_rate": 0.0004308327189054421,
      "loss": 2.1076,
      "step": 530
    },
    {
      "epoch": 1.1440677966101696,
      "grad_norm": 0.053484019027130486,
      "learning_rate": 0.0004277804724424146,
      "loss": 2.1056,
      "step": 540
    },
    {
      "epoch": 1.1652542372881356,
      "grad_norm": 0.07045317524776837,
      "learning_rate": 0.0004246736486712611,
      "loss": 2.1283,
      "step": 550
    },
    {
      "epoch": 1.1864406779661016,
      "grad_norm": 0.056991644556702734,
      "learning_rate": 0.0004215132013642149,
      "loss": 2.1073,
      "step": 560
    },
    {
      "epoch": 1.207627118644068,
      "grad_norm": 0.10932907349389169,
      "learning_rate": 0.0004183001007555436,
      "loss": 2.0923,
      "step": 570
    },
    {
      "epoch": 1.228813559322034,
      "grad_norm": 0.07565475345058699,
      "learning_rate": 0.0004150353332436945,
      "loss": 2.1304,
      "step": 580
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.04806730962026069,
      "learning_rate": 0.000411719901088477,
      "loss": 2.1019,
      "step": 590
    },
    {
      "epoch": 1.271186440677966,
      "grad_norm": 0.12046348461315967,
      "learning_rate": 0.00040835482210337684,
      "loss": 2.1134,
      "step": 600
    },
    {
      "epoch": 1.292372881355932,
      "grad_norm": 0.04785616035170302,
      "learning_rate": 0.00040494112934309393,
      "loss": 2.0899,
      "step": 610
    },
    {
      "epoch": 1.3135593220338984,
      "grad_norm": 0.06950120638475274,
      "learning_rate": 0.0004014798707864026,
      "loss": 2.1022,
      "step": 620
    },
    {
      "epoch": 1.3347457627118644,
      "grad_norm": 0.05764573408385284,
      "learning_rate": 0.00039797210901443004,
      "loss": 2.0954,
      "step": 630
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 0.05248977971434518,
      "learning_rate": 0.00039441892088445073,
      "loss": 2.1058,
      "step": 640
    },
    {
      "epoch": 1.3771186440677967,
      "grad_norm": 0.08905978345074242,
      "learning_rate": 0.0003908213971993002,
      "loss": 2.1041,
      "step": 650
    },
    {
      "epoch": 1.3983050847457628,
      "grad_norm": 0.06873869897878389,
      "learning_rate": 0.0003871806423725056,
      "loss": 2.1069,
      "step": 660
    },
    {
      "epoch": 1.4194915254237288,
      "grad_norm": 0.07051366599451443,
      "learning_rate": 0.0003834977740892396,
      "loss": 2.1065,
      "step": 670
    },
    {
      "epoch": 1.4406779661016949,
      "grad_norm": 0.05550804358265807,
      "learning_rate": 0.00037977392296319814,
      "loss": 2.0909,
      "step": 680
    },
    {
      "epoch": 1.461864406779661,
      "grad_norm": 0.05362391791941435,
      "learning_rate": 0.00037601023218951146,
      "loss": 2.0812,
      "step": 690
    },
    {
      "epoch": 1.4830508474576272,
      "grad_norm": 0.07183191275199183,
      "learning_rate": 0.000372207857193791,
      "loss": 2.1006,
      "step": 700
    },
    {
      "epoch": 1.5042372881355932,
      "grad_norm": 0.036702339330004795,
      "learning_rate": 0.0003683679652774219,
      "loss": 2.0801,
      "step": 710
    },
    {
      "epoch": 1.5254237288135593,
      "grad_norm": 0.04503677538534475,
      "learning_rate": 0.00036449173525921033,
      "loss": 2.0985,
      "step": 720
    },
    {
      "epoch": 1.5466101694915255,
      "grad_norm": 0.08571969684473545,
      "learning_rate": 0.0003605803571134947,
      "loss": 2.0963,
      "step": 730
    },
    {
      "epoch": 1.5677966101694916,
      "grad_norm": 0.07028832284848355,
      "learning_rate": 0.00035663503160483214,
      "loss": 2.0901,
      "step": 740
    },
    {
      "epoch": 1.5889830508474576,
      "grad_norm": 0.0460245424115982,
      "learning_rate": 0.0003526569699193726,
      "loss": 2.0747,
      "step": 750
    },
    {
      "epoch": 1.6101694915254239,
      "grad_norm": 0.05695582523407322,
      "learning_rate": 0.0003486473932930333,
      "loss": 2.0926,
      "step": 760
    },
    {
      "epoch": 1.6313559322033897,
      "grad_norm": 0.04729193051976345,
      "learning_rate": 0.0003446075326365887,
      "loss": 2.079,
      "step": 770
    },
    {
      "epoch": 1.652542372881356,
      "grad_norm": 0.0440876523157918,
      "learning_rate": 0.0003405386281577892,
      "loss": 2.0936,
      "step": 780
    },
    {
      "epoch": 1.673728813559322,
      "grad_norm": 0.03952017824612962,
      "learning_rate": 0.00033644192898062744,
      "loss": 2.0751,
      "step": 790
    },
    {
      "epoch": 1.694915254237288,
      "grad_norm": 0.043820899114424584,
      "learning_rate": 0.0003323186927618648,
      "loss": 2.1008,
      "step": 800
    },
    {
      "epoch": 1.7161016949152543,
      "grad_norm": 0.041923143213303454,
      "learning_rate": 0.00032817018530494166,
      "loss": 2.0951,
      "step": 810
    },
    {
      "epoch": 1.7372881355932204,
      "grad_norm": 0.051309887579675056,
      "learning_rate": 0.00032399768017138354,
      "loss": 2.0798,
      "step": 820
    },
    {
      "epoch": 1.7584745762711864,
      "grad_norm": 0.0432756541973623,
      "learning_rate": 0.000319802458289828,
      "loss": 2.0775,
      "step": 830
    },
    {
      "epoch": 1.7796610169491527,
      "grad_norm": 0.04478332673838983,
      "learning_rate": 0.0003155858075627886,
      "loss": 2.0739,
      "step": 840
    },
    {
      "epoch": 1.8008474576271185,
      "grad_norm": 0.06738395966368808,
      "learning_rate": 0.00031134902247127765,
      "loss": 2.0845,
      "step": 850
    },
    {
      "epoch": 1.8220338983050848,
      "grad_norm": 0.0493251207413769,
      "learning_rate": 0.00030709340367741103,
      "loss": 2.077,
      "step": 860
    },
    {
      "epoch": 1.8432203389830508,
      "grad_norm": 0.04126164753701976,
      "learning_rate": 0.0003028202576251134,
      "loss": 2.0702,
      "step": 870
    },
    {
      "epoch": 1.8644067796610169,
      "grad_norm": 0.040160774820697534,
      "learning_rate": 0.00029853089613905015,
      "loss": 2.0566,
      "step": 880
    },
    {
      "epoch": 1.8855932203389831,
      "grad_norm": 0.05463141520142657,
      "learning_rate": 0.00029422663602190655,
      "loss": 2.0564,
      "step": 890
    },
    {
      "epoch": 1.9067796610169492,
      "grad_norm": 0.05136378797370674,
      "learning_rate": 0.0002899087986501388,
      "loss": 2.0757,
      "step": 900
    },
    {
      "epoch": 1.9279661016949152,
      "grad_norm": 0.0438175755611988,
      "learning_rate": 0.00028557870956832135,
      "loss": 2.0636,
      "step": 910
    },
    {
      "epoch": 1.9491525423728815,
      "grad_norm": 0.04252018387450911,
      "learning_rate": 0.00028123769808221404,
      "loss": 2.0554,
      "step": 920
    },
    {
      "epoch": 1.9703389830508473,
      "grad_norm": 0.05122593433878704,
      "learning_rate": 0.0002768870968506755,
      "loss": 2.0597,
      "step": 930
    },
    {
      "epoch": 1.9915254237288136,
      "grad_norm": 0.07844445065272901,
      "learning_rate": 0.0002725282414765459,
      "loss": 2.0748,
      "step": 940
    },
    {
      "epoch": 2.01271186440678,
      "grad_norm": 0.08602891741332974,
      "learning_rate": 0.0002681624700966272,
      "loss": 1.9986,
      "step": 950
    },
    {
      "epoch": 2.0338983050847457,
      "grad_norm": 0.08403229732346643,
      "learning_rate": 0.00026379112297088454,
      "loss": 1.9663,
      "step": 960
    },
    {
      "epoch": 2.055084745762712,
      "grad_norm": 0.04805601486638949,
      "learning_rate": 0.0002594155420709964,
      "loss": 1.9602,
      "step": 970
    },
    {
      "epoch": 2.0762711864406778,
      "grad_norm": 0.04177748467643192,
      "learning_rate": 0.0002550370706683793,
      "loss": 1.9615,
      "step": 980
    },
    {
      "epoch": 2.097457627118644,
      "grad_norm": 0.04712018323150014,
      "learning_rate": 0.00025065705292181244,
      "loss": 1.9685,
      "step": 990
    },
    {
      "epoch": 2.1186440677966103,
      "grad_norm": 0.04036226172974584,
      "learning_rate": 0.0002462768334647919,
      "loss": 1.952,
      "step": 1000
    },
    {
      "epoch": 2.139830508474576,
      "grad_norm": 0.05659363087865909,
      "learning_rate": 0.00024189775699273692,
      "loss": 1.9628,
      "step": 1010
    },
    {
      "epoch": 2.1610169491525424,
      "grad_norm": 0.042136646622118364,
      "learning_rate": 0.0002375211678501792,
      "loss": 1.9385,
      "step": 1020
    },
    {
      "epoch": 2.1822033898305087,
      "grad_norm": 0.03941503529723361,
      "learning_rate": 0.00023314840961805803,
      "loss": 1.9554,
      "step": 1030
    },
    {
      "epoch": 2.2033898305084745,
      "grad_norm": 0.03700109577217643,
      "learning_rate": 0.0002287808247012513,
      "loss": 1.9642,
      "step": 1040
    },
    {
      "epoch": 2.2245762711864407,
      "grad_norm": 0.10375814997593234,
      "learning_rate": 0.00022441975391646662,
      "loss": 1.9369,
      "step": 1050
    },
    {
      "epoch": 2.2457627118644066,
      "grad_norm": 0.052752509354616346,
      "learning_rate": 0.0002200665360806211,
      "loss": 1.9559,
      "step": 1060
    },
    {
      "epoch": 2.266949152542373,
      "grad_norm": 0.03224571338131167,
      "learning_rate": 0.00021572250759983337,
      "loss": 1.9617,
      "step": 1070
    },
    {
      "epoch": 2.288135593220339,
      "grad_norm": 0.03288018024537444,
      "learning_rate": 0.00021138900205915702,
      "loss": 1.9699,
      "step": 1080
    },
    {
      "epoch": 2.309322033898305,
      "grad_norm": 0.037431680326990596,
      "learning_rate": 0.0002070673498131803,
      "loss": 1.9608,
      "step": 1090
    },
    {
      "epoch": 2.330508474576271,
      "grad_norm": 0.04394885552216967,
      "learning_rate": 0.00020275887757761603,
      "loss": 1.9412,
      "step": 1100
    },
    {
      "epoch": 2.3516949152542375,
      "grad_norm": 0.030864374934226058,
      "learning_rate": 0.00019846490802201004,
      "loss": 1.9565,
      "step": 1110
    },
    {
      "epoch": 2.3728813559322033,
      "grad_norm": 0.0376528236108464,
      "learning_rate": 0.00019418675936369118,
      "loss": 1.948,
      "step": 1120
    },
    {
      "epoch": 2.3940677966101696,
      "grad_norm": 0.03266857205638587,
      "learning_rate": 0.0001899257449630886,
      "loss": 1.9481,
      "step": 1130
    },
    {
      "epoch": 2.415254237288136,
      "grad_norm": 0.035295325605865485,
      "learning_rate": 0.00018568317292053894,
      "loss": 1.9402,
      "step": 1140
    },
    {
      "epoch": 2.4364406779661016,
      "grad_norm": 0.04817378053212362,
      "learning_rate": 0.0001814603456747094,
      "loss": 1.9535,
      "step": 1150
    },
    {
      "epoch": 2.457627118644068,
      "grad_norm": 0.04074945682830249,
      "learning_rate": 0.0001772585596027591,
      "loss": 1.9525,
      "step": 1160
    },
    {
      "epoch": 2.4788135593220337,
      "grad_norm": 0.035901200449676485,
      "learning_rate": 0.00017307910462235977,
      "loss": 1.9731,
      "step": 1170
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.035662558321239934,
      "learning_rate": 0.0001689232637957013,
      "loss": 1.9362,
      "step": 1180
    },
    {
      "epoch": 2.5211864406779663,
      "grad_norm": 0.03973557868554537,
      "learning_rate": 0.00016479231293560032,
      "loss": 1.9374,
      "step": 1190
    },
    {
      "epoch": 2.542372881355932,
      "grad_norm": 0.03299325997458768,
      "learning_rate": 0.0001606875202138356,
      "loss": 1.9481,
      "step": 1200
    },
    {
      "epoch": 2.5635593220338984,
      "grad_norm": 0.04375441334168935,
      "learning_rate": 0.0001566101457718281,
      "loss": 1.9383,
      "step": 1210
    },
    {
      "epoch": 2.584745762711864,
      "grad_norm": 0.03880059687569579,
      "learning_rate": 0.00015256144133378696,
      "loss": 1.9605,
      "step": 1220
    },
    {
      "epoch": 2.6059322033898304,
      "grad_norm": 0.031915977851280856,
      "learning_rate": 0.0001485426498224392,
      "loss": 1.9685,
      "step": 1230
    },
    {
      "epoch": 2.6271186440677967,
      "grad_norm": 0.038809992830277334,
      "learning_rate": 0.0001445550049774617,
      "loss": 1.9492,
      "step": 1240
    },
    {
      "epoch": 2.648305084745763,
      "grad_norm": 0.041440483327853134,
      "learning_rate": 0.00014059973097673188,
      "loss": 1.9644,
      "step": 1250
    },
    {
      "epoch": 2.669491525423729,
      "grad_norm": 0.03266791279733371,
      "learning_rate": 0.00013667804206051392,
      "loss": 1.9503,
      "step": 1260
    },
    {
      "epoch": 2.690677966101695,
      "grad_norm": 0.03449869131827401,
      "learning_rate": 0.00013279114215869654,
      "loss": 1.9579,
      "step": 1270
    },
    {
      "epoch": 2.711864406779661,
      "grad_norm": 0.032938848631871014,
      "learning_rate": 0.0001289402245211944,
      "loss": 1.9452,
      "step": 1280
    },
    {
      "epoch": 2.733050847457627,
      "grad_norm": 0.050239168482732646,
      "learning_rate": 0.00012512647135162964,
      "loss": 1.9561,
      "step": 1290
    },
    {
      "epoch": 2.7542372881355934,
      "grad_norm": 0.0306200091807657,
      "learning_rate": 0.00012135105344440394,
      "loss": 1.9449,
      "step": 1300
    },
    {
      "epoch": 2.7754237288135593,
      "grad_norm": 0.03225814110229855,
      "learning_rate": 0.00011761512982527305,
      "loss": 1.9476,
      "step": 1310
    },
    {
      "epoch": 2.7966101694915255,
      "grad_norm": 0.03350710910144213,
      "learning_rate": 0.00011391984739553427,
      "loss": 1.9337,
      "step": 1320
    },
    {
      "epoch": 2.8177966101694913,
      "grad_norm": 0.03729852355842271,
      "learning_rate": 0.00011026634057993648,
      "loss": 1.9421,
      "step": 1330
    },
    {
      "epoch": 2.8389830508474576,
      "grad_norm": 0.028562510049380468,
      "learning_rate": 0.00010665573097841957,
      "loss": 1.9553,
      "step": 1340
    },
    {
      "epoch": 2.860169491525424,
      "grad_norm": 0.03292192919214765,
      "learning_rate": 0.00010308912702179166,
      "loss": 1.9531,
      "step": 1350
    },
    {
      "epoch": 2.8813559322033897,
      "grad_norm": 0.03463226481188989,
      "learning_rate": 9.956762363144892e-05,
      "loss": 1.9456,
      "step": 1360
    },
    {
      "epoch": 2.902542372881356,
      "grad_norm": 0.028278857959505636,
      "learning_rate": 9.609230188324286e-05,
      "loss": 1.9359,
      "step": 1370
    },
    {
      "epoch": 2.923728813559322,
      "grad_norm": 0.028175093698422474,
      "learning_rate": 9.266422867559753e-05,
      "loss": 1.9543,
      "step": 1380
    },
    {
      "epoch": 2.944915254237288,
      "grad_norm": 0.03450981182902969,
      "learning_rate": 8.928445640197971e-05,
      "loss": 1.9378,
      "step": 1390
    },
    {
      "epoch": 2.9661016949152543,
      "grad_norm": 0.029970639983241495,
      "learning_rate": 8.5954022627822e-05,
      "loss": 1.9465,
      "step": 1400
    }
  ],
  "logging_steps": 10,
  "max_steps": 1888,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4703672906612736e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}