{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7160975609756097, "eval_steps": 500, "global_step": 880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01951219512195122, "grad_norm": 0.14488260447978973, "learning_rate": 0.00019934683213585893, "loss": 0.6356, "step": 10 }, { "epoch": 0.03902439024390244, "grad_norm": 0.1634250283241272, "learning_rate": 0.00019804049640757677, "loss": 0.4368, "step": 20 }, { "epoch": 0.05853658536585366, "grad_norm": 0.1504327356815338, "learning_rate": 0.0001967341606792946, "loss": 0.407, "step": 30 }, { "epoch": 0.07804878048780488, "grad_norm": 0.16282524168491364, "learning_rate": 0.00019542782495101242, "loss": 0.3788, "step": 40 }, { "epoch": 0.0975609756097561, "grad_norm": 0.17505913972854614, "learning_rate": 0.00019412148922273026, "loss": 0.3705, "step": 50 }, { "epoch": 0.11707317073170732, "grad_norm": 0.18271876871585846, "learning_rate": 0.00019281515349444807, "loss": 0.3654, "step": 60 }, { "epoch": 0.13658536585365855, "grad_norm": 0.1547907590866089, "learning_rate": 0.0001915088177661659, "loss": 0.3845, "step": 70 }, { "epoch": 0.15609756097560976, "grad_norm": 0.1806977242231369, "learning_rate": 0.00019020248203788375, "loss": 0.362, "step": 80 }, { "epoch": 0.17560975609756097, "grad_norm": 0.16502083837985992, "learning_rate": 0.00018889614630960156, "loss": 0.3436, "step": 90 }, { "epoch": 0.1951219512195122, "grad_norm": 0.15947739779949188, "learning_rate": 0.0001875898105813194, "loss": 0.3221, "step": 100 }, { "epoch": 0.2146341463414634, "grad_norm": 0.16028478741645813, "learning_rate": 0.00018628347485303724, "loss": 0.3291, "step": 110 }, { "epoch": 0.23414634146341465, "grad_norm": 0.16755621135234833, "learning_rate": 0.00018497713912475508, "loss": 0.3454, "step": 120 }, { "epoch": 0.25365853658536586, "grad_norm": 0.17687192559242249, "learning_rate": 0.00018367080339647292, "loss": 0.3265, "step": 130 }, { "epoch": 0.2731707317073171, "grad_norm": 0.12853658199310303, "learning_rate": 0.00018236446766819073, "loss": 0.3136, "step": 140 }, { "epoch": 0.2926829268292683, "grad_norm": 0.15641653537750244, "learning_rate": 0.00018105813193990857, "loss": 0.3124, "step": 150 }, { "epoch": 0.3121951219512195, "grad_norm": 0.1840222179889679, "learning_rate": 0.00017975179621162638, "loss": 0.3308, "step": 160 }, { "epoch": 0.33170731707317075, "grad_norm": 0.16983111202716827, "learning_rate": 0.00017844546048334422, "loss": 0.3116, "step": 170 }, { "epoch": 0.35121951219512193, "grad_norm": 0.18679502606391907, "learning_rate": 0.00017713912475506206, "loss": 0.3458, "step": 180 }, { "epoch": 0.37073170731707317, "grad_norm": 0.1655397266149521, "learning_rate": 0.0001758327890267799, "loss": 0.3041, "step": 190 }, { "epoch": 0.3902439024390244, "grad_norm": 0.16899655759334564, "learning_rate": 0.00017452645329849774, "loss": 0.3179, "step": 200 }, { "epoch": 0.4097560975609756, "grad_norm": 0.14320065081119537, "learning_rate": 0.00017322011757021555, "loss": 0.2896, "step": 210 }, { "epoch": 0.4292682926829268, "grad_norm": 0.18079160153865814, "learning_rate": 0.0001719137818419334, "loss": 0.3121, "step": 220 }, { "epoch": 0.44878048780487806, "grad_norm": 0.15252567827701569, "learning_rate": 0.00017060744611365123, "loss": 0.3054, "step": 230 }, { "epoch": 0.4682926829268293, "grad_norm": 0.1688212901353836, "learning_rate": 0.00016930111038536904, "loss": 0.316, "step": 240 }, { "epoch": 0.4878048780487805, "grad_norm": 0.1604667603969574, "learning_rate": 0.00016799477465708688, "loss": 0.3174, "step": 250 }, { "epoch": 0.5073170731707317, "grad_norm": 0.19493722915649414, "learning_rate": 0.0001666884389288047, "loss": 0.307, "step": 260 }, { "epoch": 0.526829268292683, "grad_norm": 0.13511119782924652, "learning_rate": 0.00016538210320052253, "loss": 0.2918, "step": 270 }, { "epoch": 0.5463414634146342, "grad_norm": 0.1683417409658432, "learning_rate": 0.00016407576747224037, "loss": 0.2704, "step": 280 }, { "epoch": 0.5658536585365853, "grad_norm": 0.16002771258354187, "learning_rate": 0.0001627694317439582, "loss": 0.3401, "step": 290 }, { "epoch": 0.5853658536585366, "grad_norm": 0.1851789504289627, "learning_rate": 0.00016146309601567605, "loss": 0.3134, "step": 300 }, { "epoch": 0.6048780487804878, "grad_norm": 0.18187113106250763, "learning_rate": 0.00016015676028739386, "loss": 0.2732, "step": 310 }, { "epoch": 0.624390243902439, "grad_norm": 0.13298119604587555, "learning_rate": 0.0001588504245591117, "loss": 0.3317, "step": 320 }, { "epoch": 0.6439024390243903, "grad_norm": 0.17413659393787384, "learning_rate": 0.00015754408883082954, "loss": 0.3176, "step": 330 }, { "epoch": 0.6634146341463415, "grad_norm": 0.17591014504432678, "learning_rate": 0.00015623775310254735, "loss": 0.3074, "step": 340 }, { "epoch": 0.6829268292682927, "grad_norm": 0.17484629154205322, "learning_rate": 0.0001549314173742652, "loss": 0.3033, "step": 350 }, { "epoch": 0.7024390243902439, "grad_norm": 0.17957189679145813, "learning_rate": 0.000153625081645983, "loss": 0.2995, "step": 360 }, { "epoch": 0.7219512195121951, "grad_norm": 0.1616777628660202, "learning_rate": 0.00015231874591770084, "loss": 0.2955, "step": 370 }, { "epoch": 0.7414634146341463, "grad_norm": 0.165989488363266, "learning_rate": 0.00015101241018941868, "loss": 0.278, "step": 380 }, { "epoch": 0.7609756097560976, "grad_norm": 0.15299169719219208, "learning_rate": 0.00014970607446113652, "loss": 0.2877, "step": 390 }, { "epoch": 0.7804878048780488, "grad_norm": 0.17317935824394226, "learning_rate": 0.00014839973873285436, "loss": 0.2888, "step": 400 }, { "epoch": 0.8, "grad_norm": 0.17615154385566711, "learning_rate": 0.00014709340300457217, "loss": 0.298, "step": 410 }, { "epoch": 0.8195121951219512, "grad_norm": 0.15918245911598206, "learning_rate": 0.00014578706727629, "loss": 0.3118, "step": 420 }, { "epoch": 0.8390243902439024, "grad_norm": 0.1735045462846756, "learning_rate": 0.00014448073154800785, "loss": 0.2968, "step": 430 }, { "epoch": 0.8585365853658536, "grad_norm": 0.16569170355796814, "learning_rate": 0.00014317439581972566, "loss": 0.2902, "step": 440 }, { "epoch": 0.8780487804878049, "grad_norm": 0.14971968531608582, "learning_rate": 0.0001418680600914435, "loss": 0.2909, "step": 450 }, { "epoch": 0.8975609756097561, "grad_norm": 0.17511232197284698, "learning_rate": 0.00014056172436316132, "loss": 0.2913, "step": 460 }, { "epoch": 0.9170731707317074, "grad_norm": 0.1714879721403122, "learning_rate": 0.00013925538863487918, "loss": 0.3032, "step": 470 }, { "epoch": 0.9365853658536586, "grad_norm": 0.15486101806163788, "learning_rate": 0.00013794905290659702, "loss": 0.2714, "step": 480 }, { "epoch": 0.9560975609756097, "grad_norm": 0.1515657603740692, "learning_rate": 0.00013664271717831483, "loss": 0.2663, "step": 490 }, { "epoch": 0.975609756097561, "grad_norm": 0.1795472502708435, "learning_rate": 0.00013533638145003267, "loss": 0.2761, "step": 500 }, { "epoch": 0.9951219512195122, "grad_norm": 0.18285898864269257, "learning_rate": 0.00013403004572175048, "loss": 0.2709, "step": 510 }, { "epoch": 1.0136585365853659, "grad_norm": 0.16601520776748657, "learning_rate": 0.00013272370999346832, "loss": 0.2678, "step": 520 }, { "epoch": 1.033170731707317, "grad_norm": 0.16676375269889832, "learning_rate": 0.00013141737426518616, "loss": 0.2521, "step": 530 }, { "epoch": 1.0526829268292683, "grad_norm": 0.1423233151435852, "learning_rate": 0.00013011103853690398, "loss": 0.243, "step": 540 }, { "epoch": 1.0721951219512196, "grad_norm": 0.24877290427684784, "learning_rate": 0.00012880470280862181, "loss": 0.2348, "step": 550 }, { "epoch": 1.0917073170731708, "grad_norm": 0.18094764649868011, "learning_rate": 0.00012749836708033965, "loss": 0.2349, "step": 560 }, { "epoch": 1.111219512195122, "grad_norm": 0.1786268949508667, "learning_rate": 0.0001261920313520575, "loss": 0.2508, "step": 570 }, { "epoch": 1.1307317073170733, "grad_norm": 0.21345548331737518, "learning_rate": 0.00012488569562377533, "loss": 0.277, "step": 580 }, { "epoch": 1.1502439024390243, "grad_norm": 0.23655258119106293, "learning_rate": 0.00012357935989549314, "loss": 0.249, "step": 590 }, { "epoch": 1.1697560975609755, "grad_norm": 0.1962760090827942, "learning_rate": 0.00012227302416721098, "loss": 0.2649, "step": 600 }, { "epoch": 1.1892682926829268, "grad_norm": 0.18105126917362213, "learning_rate": 0.00012096668843892882, "loss": 0.2673, "step": 610 }, { "epoch": 1.208780487804878, "grad_norm": 0.16958513855934143, "learning_rate": 0.00011966035271064664, "loss": 0.2409, "step": 620 }, { "epoch": 1.2282926829268292, "grad_norm": 0.22808077931404114, "learning_rate": 0.00011835401698236447, "loss": 0.2564, "step": 630 }, { "epoch": 1.2478048780487805, "grad_norm": 0.162706196308136, "learning_rate": 0.0001170476812540823, "loss": 0.2497, "step": 640 }, { "epoch": 1.2673170731707317, "grad_norm": 0.16657714545726776, "learning_rate": 0.00011574134552580014, "loss": 0.2664, "step": 650 }, { "epoch": 1.286829268292683, "grad_norm": 0.20304907858371735, "learning_rate": 0.00011443500979751798, "loss": 0.251, "step": 660 }, { "epoch": 1.3063414634146342, "grad_norm": 0.18894435465335846, "learning_rate": 0.00011312867406923579, "loss": 0.223, "step": 670 }, { "epoch": 1.3258536585365854, "grad_norm": 0.17584313452243805, "learning_rate": 0.00011182233834095363, "loss": 0.265, "step": 680 }, { "epoch": 1.3453658536585364, "grad_norm": 0.23363268375396729, "learning_rate": 0.00011051600261267146, "loss": 0.2593, "step": 690 }, { "epoch": 1.3648780487804877, "grad_norm": 0.17345435917377472, "learning_rate": 0.0001092096668843893, "loss": 0.2745, "step": 700 }, { "epoch": 1.384390243902439, "grad_norm": 0.20228204131126404, "learning_rate": 0.00010790333115610713, "loss": 0.2329, "step": 710 }, { "epoch": 1.4039024390243902, "grad_norm": 0.16685593128204346, "learning_rate": 0.00010659699542782495, "loss": 0.2427, "step": 720 }, { "epoch": 1.4234146341463414, "grad_norm": 0.18634216487407684, "learning_rate": 0.0001052906596995428, "loss": 0.2323, "step": 730 }, { "epoch": 1.4429268292682926, "grad_norm": 0.2305642068386078, "learning_rate": 0.00010398432397126061, "loss": 0.2537, "step": 740 }, { "epoch": 1.4624390243902439, "grad_norm": 0.19238169491291046, "learning_rate": 0.00010267798824297845, "loss": 0.2549, "step": 750 }, { "epoch": 1.481951219512195, "grad_norm": 0.170588880777359, "learning_rate": 0.00010137165251469629, "loss": 0.2364, "step": 760 }, { "epoch": 1.5014634146341463, "grad_norm": 0.18197551369667053, "learning_rate": 0.00010006531678641412, "loss": 0.2499, "step": 770 }, { "epoch": 1.5209756097560976, "grad_norm": 0.18300370872020721, "learning_rate": 9.875898105813195e-05, "loss": 0.2495, "step": 780 }, { "epoch": 1.5404878048780488, "grad_norm": 0.23338189721107483, "learning_rate": 9.745264532984978e-05, "loss": 0.2596, "step": 790 }, { "epoch": 1.56, "grad_norm": 0.21071650087833405, "learning_rate": 9.61463096015676e-05, "loss": 0.2487, "step": 800 }, { "epoch": 1.5795121951219513, "grad_norm": 0.1906578093767166, "learning_rate": 9.483997387328543e-05, "loss": 0.2611, "step": 810 }, { "epoch": 1.5990243902439025, "grad_norm": 0.20316530764102936, "learning_rate": 9.353363814500327e-05, "loss": 0.2685, "step": 820 }, { "epoch": 1.6185365853658538, "grad_norm": 0.24779777228832245, "learning_rate": 9.222730241672111e-05, "loss": 0.2293, "step": 830 }, { "epoch": 1.638048780487805, "grad_norm": 0.1795492023229599, "learning_rate": 9.092096668843894e-05, "loss": 0.2755, "step": 840 }, { "epoch": 1.6575609756097562, "grad_norm": 0.22917696833610535, "learning_rate": 8.961463096015676e-05, "loss": 0.2726, "step": 850 }, { "epoch": 1.6770731707317075, "grad_norm": 0.17289987206459045, "learning_rate": 8.830829523187459e-05, "loss": 0.2512, "step": 860 }, { "epoch": 1.6965853658536585, "grad_norm": 0.189191073179245, "learning_rate": 8.700195950359243e-05, "loss": 0.2419, "step": 870 }, { "epoch": 1.7160975609756097, "grad_norm": 0.21344304084777832, "learning_rate": 8.569562377531027e-05, "loss": 0.2329, "step": 880 } ], "logging_steps": 10, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.906907157573632e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }