{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1951219512195122, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001951219512195122, "grad_norm": 0.5293354392051697, "learning_rate": 4e-05, "loss": 0.608, "step": 1 }, { "epoch": 0.003902439024390244, "grad_norm": 0.5153003334999084, "learning_rate": 8e-05, "loss": 0.608, "step": 2 }, { "epoch": 0.005853658536585366, "grad_norm": 0.43859341740608215, "learning_rate": 0.00012, "loss": 0.5807, "step": 3 }, { "epoch": 0.007804878048780488, "grad_norm": 0.4723743796348572, "learning_rate": 0.00016, "loss": 0.7427, "step": 4 }, { "epoch": 0.00975609756097561, "grad_norm": 0.5707408785820007, "learning_rate": 0.0002, "loss": 0.4719, "step": 5 }, { "epoch": 0.011707317073170732, "grad_norm": 0.6417415738105774, "learning_rate": 0.0001998693664271718, "loss": 0.4618, "step": 6 }, { "epoch": 0.013658536585365854, "grad_norm": 0.7731900811195374, "learning_rate": 0.00019973873285434357, "loss": 0.4405, "step": 7 }, { "epoch": 0.015609756097560976, "grad_norm": 0.7877213954925537, "learning_rate": 0.00019960809928151537, "loss": 0.5802, "step": 8 }, { "epoch": 0.0175609756097561, "grad_norm": 0.7449636459350586, "learning_rate": 0.00019947746570868715, "loss": 0.4344, "step": 9 }, { "epoch": 0.01951219512195122, "grad_norm": 0.6625833511352539, "learning_rate": 0.00019934683213585893, "loss": 0.3264, "step": 10 }, { "epoch": 0.021463414634146343, "grad_norm": 0.625950813293457, "learning_rate": 0.0001992161985630307, "loss": 0.3445, "step": 11 }, { "epoch": 0.023414634146341463, "grad_norm": 0.6708886027336121, "learning_rate": 0.0001990855649902025, "loss": 0.3887, "step": 12 }, { "epoch": 0.025365853658536587, "grad_norm": 0.6765628457069397, "learning_rate": 0.00019895493141737426, "loss": 0.347, "step": 13 }, { "epoch": 0.027317073170731707, "grad_norm": 0.6404440402984619, "learning_rate": 0.00019882429784454607, "loss": 0.2538, "step": 14 }, { "epoch": 0.02926829268292683, "grad_norm": 0.5839134454727173, "learning_rate": 0.00019869366427171785, "loss": 0.3337, "step": 15 }, { "epoch": 0.03121951219512195, "grad_norm": 0.6853411793708801, "learning_rate": 0.00019856303069888963, "loss": 0.3713, "step": 16 }, { "epoch": 0.033170731707317075, "grad_norm": 0.5517768263816833, "learning_rate": 0.0001984323971260614, "loss": 0.3044, "step": 17 }, { "epoch": 0.0351219512195122, "grad_norm": 0.38426998257637024, "learning_rate": 0.0001983017635532332, "loss": 0.3431, "step": 18 }, { "epoch": 0.037073170731707315, "grad_norm": 0.7189807891845703, "learning_rate": 0.00019817112998040496, "loss": 0.2947, "step": 19 }, { "epoch": 0.03902439024390244, "grad_norm": 0.7148247957229614, "learning_rate": 0.00019804049640757677, "loss": 0.3812, "step": 20 }, { "epoch": 0.04097560975609756, "grad_norm": 0.571735143661499, "learning_rate": 0.00019790986283474855, "loss": 0.3161, "step": 21 }, { "epoch": 0.042926829268292686, "grad_norm": 0.5547791719436646, "learning_rate": 0.00019777922926192033, "loss": 0.3786, "step": 22 }, { "epoch": 0.0448780487804878, "grad_norm": 0.34905633330345154, "learning_rate": 0.0001976485956890921, "loss": 0.3175, "step": 23 }, { "epoch": 0.04682926829268293, "grad_norm": 0.5606753826141357, "learning_rate": 0.0001975179621162639, "loss": 0.3633, "step": 24 }, { "epoch": 0.04878048780487805, "grad_norm": 0.5134351849555969, "learning_rate": 0.00019738732854343566, "loss": 0.3355, "step": 25 }, { "epoch": 0.050731707317073174, "grad_norm": 0.4639511704444885, "learning_rate": 0.00019725669497060747, "loss": 0.3493, "step": 26 }, { "epoch": 0.05268292682926829, "grad_norm": 0.5047277212142944, "learning_rate": 0.00019712606139777925, "loss": 0.2763, "step": 27 }, { "epoch": 0.054634146341463415, "grad_norm": 0.5306783318519592, "learning_rate": 0.00019699542782495102, "loss": 0.3646, "step": 28 }, { "epoch": 0.05658536585365854, "grad_norm": 0.5353797078132629, "learning_rate": 0.0001968647942521228, "loss": 0.3687, "step": 29 }, { "epoch": 0.05853658536585366, "grad_norm": 0.3748953640460968, "learning_rate": 0.0001967341606792946, "loss": 0.2783, "step": 30 }, { "epoch": 0.06048780487804878, "grad_norm": 0.559394359588623, "learning_rate": 0.00019660352710646636, "loss": 0.3585, "step": 31 }, { "epoch": 0.0624390243902439, "grad_norm": 0.5898977518081665, "learning_rate": 0.00019647289353363816, "loss": 0.318, "step": 32 }, { "epoch": 0.06439024390243903, "grad_norm": 0.5609666109085083, "learning_rate": 0.00019634225996080992, "loss": 0.2512, "step": 33 }, { "epoch": 0.06634146341463415, "grad_norm": 0.46998143196105957, "learning_rate": 0.00019621162638798172, "loss": 0.314, "step": 34 }, { "epoch": 0.06829268292682927, "grad_norm": 0.7506864666938782, "learning_rate": 0.0001960809928151535, "loss": 0.3126, "step": 35 }, { "epoch": 0.0702439024390244, "grad_norm": 0.32043078541755676, "learning_rate": 0.00019595035924232528, "loss": 0.291, "step": 36 }, { "epoch": 0.0721951219512195, "grad_norm": 0.5396966934204102, "learning_rate": 0.00019581972566949706, "loss": 0.3262, "step": 37 }, { "epoch": 0.07414634146341463, "grad_norm": 0.5300573110580444, "learning_rate": 0.00019568909209666886, "loss": 0.364, "step": 38 }, { "epoch": 0.07609756097560975, "grad_norm": 0.3784275949001312, "learning_rate": 0.00019555845852384061, "loss": 0.2737, "step": 39 }, { "epoch": 0.07804878048780488, "grad_norm": 0.3873477876186371, "learning_rate": 0.00019542782495101242, "loss": 0.2575, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.2994961440563202, "learning_rate": 0.0001952971913781842, "loss": 0.2661, "step": 41 }, { "epoch": 0.08195121951219513, "grad_norm": 0.5335369110107422, "learning_rate": 0.00019516655780535598, "loss": 0.3351, "step": 42 }, { "epoch": 0.08390243902439025, "grad_norm": 0.5061212182044983, "learning_rate": 0.00019503592423252776, "loss": 0.2944, "step": 43 }, { "epoch": 0.08585365853658537, "grad_norm": 0.3448638617992401, "learning_rate": 0.00019490529065969956, "loss": 0.3694, "step": 44 }, { "epoch": 0.08780487804878048, "grad_norm": 0.46935510635375977, "learning_rate": 0.00019477465708687134, "loss": 0.3206, "step": 45 }, { "epoch": 0.0897560975609756, "grad_norm": 0.48646026849746704, "learning_rate": 0.00019464402351404312, "loss": 0.306, "step": 46 }, { "epoch": 0.09170731707317073, "grad_norm": 0.36153629422187805, "learning_rate": 0.0001945133899412149, "loss": 0.3398, "step": 47 }, { "epoch": 0.09365853658536585, "grad_norm": 0.44567129015922546, "learning_rate": 0.00019438275636838667, "loss": 0.3129, "step": 48 }, { "epoch": 0.09560975609756098, "grad_norm": 0.2901296615600586, "learning_rate": 0.00019425212279555845, "loss": 0.2857, "step": 49 }, { "epoch": 0.0975609756097561, "grad_norm": 0.6848759055137634, "learning_rate": 0.00019412148922273026, "loss": 0.286, "step": 50 }, { "epoch": 0.09951219512195122, "grad_norm": 0.7011529803276062, "learning_rate": 0.00019399085564990204, "loss": 0.2718, "step": 51 }, { "epoch": 0.10146341463414635, "grad_norm": 0.39393487572669983, "learning_rate": 0.00019386022207707382, "loss": 0.4146, "step": 52 }, { "epoch": 0.10341463414634146, "grad_norm": 0.5121448636054993, "learning_rate": 0.0001937295885042456, "loss": 0.2602, "step": 53 }, { "epoch": 0.10536585365853658, "grad_norm": 0.5868481397628784, "learning_rate": 0.00019359895493141737, "loss": 0.3119, "step": 54 }, { "epoch": 0.1073170731707317, "grad_norm": 0.4504035413265228, "learning_rate": 0.00019346832135858918, "loss": 0.3148, "step": 55 }, { "epoch": 0.10926829268292683, "grad_norm": 0.5948014855384827, "learning_rate": 0.00019333768778576096, "loss": 0.2772, "step": 56 }, { "epoch": 0.11121951219512195, "grad_norm": 0.6200262904167175, "learning_rate": 0.00019320705421293274, "loss": 0.3077, "step": 57 }, { "epoch": 0.11317073170731708, "grad_norm": 0.6139187812805176, "learning_rate": 0.00019307642064010451, "loss": 0.2872, "step": 58 }, { "epoch": 0.1151219512195122, "grad_norm": 0.4784717857837677, "learning_rate": 0.00019294578706727632, "loss": 0.307, "step": 59 }, { "epoch": 0.11707317073170732, "grad_norm": 0.6120780110359192, "learning_rate": 0.00019281515349444807, "loss": 0.3368, "step": 60 }, { "epoch": 0.11902439024390243, "grad_norm": 0.5603389143943787, "learning_rate": 0.00019268451992161988, "loss": 0.2517, "step": 61 }, { "epoch": 0.12097560975609756, "grad_norm": 0.7887749671936035, "learning_rate": 0.00019255388634879166, "loss": 0.3768, "step": 62 }, { "epoch": 0.12292682926829268, "grad_norm": 0.9491720199584961, "learning_rate": 0.00019242325277596343, "loss": 0.3686, "step": 63 }, { "epoch": 0.1248780487804878, "grad_norm": 0.3941563069820404, "learning_rate": 0.0001922926192031352, "loss": 0.2809, "step": 64 }, { "epoch": 0.12682926829268293, "grad_norm": 0.7026630640029907, "learning_rate": 0.00019216198563030702, "loss": 0.2762, "step": 65 }, { "epoch": 0.12878048780487805, "grad_norm": 0.7327045202255249, "learning_rate": 0.00019203135205747877, "loss": 0.2883, "step": 66 }, { "epoch": 0.13073170731707318, "grad_norm": 0.5735700130462646, "learning_rate": 0.00019190071848465057, "loss": 0.3397, "step": 67 }, { "epoch": 0.1326829268292683, "grad_norm": 0.5114302635192871, "learning_rate": 0.00019177008491182235, "loss": 0.3291, "step": 68 }, { "epoch": 0.13463414634146342, "grad_norm": 0.6417020559310913, "learning_rate": 0.00019163945133899413, "loss": 0.2748, "step": 69 }, { "epoch": 0.13658536585365855, "grad_norm": 0.5008614659309387, "learning_rate": 0.0001915088177661659, "loss": 0.3346, "step": 70 }, { "epoch": 0.13853658536585367, "grad_norm": 0.44953638315200806, "learning_rate": 0.00019137818419333772, "loss": 0.2919, "step": 71 }, { "epoch": 0.1404878048780488, "grad_norm": 0.5406004190444946, "learning_rate": 0.00019124755062050947, "loss": 0.2589, "step": 72 }, { "epoch": 0.1424390243902439, "grad_norm": 0.38020092248916626, "learning_rate": 0.00019111691704768127, "loss": 0.2909, "step": 73 }, { "epoch": 0.144390243902439, "grad_norm": 0.45237961411476135, "learning_rate": 0.00019098628347485305, "loss": 0.2772, "step": 74 }, { "epoch": 0.14634146341463414, "grad_norm": 0.3894854187965393, "learning_rate": 0.00019085564990202483, "loss": 0.3513, "step": 75 }, { "epoch": 0.14829268292682926, "grad_norm": 0.3206632137298584, "learning_rate": 0.0001907250163291966, "loss": 0.326, "step": 76 }, { "epoch": 0.15024390243902438, "grad_norm": 0.30855950713157654, "learning_rate": 0.00019059438275636841, "loss": 0.2018, "step": 77 }, { "epoch": 0.1521951219512195, "grad_norm": 0.6048599481582642, "learning_rate": 0.00019046374918354017, "loss": 0.3463, "step": 78 }, { "epoch": 0.15414634146341463, "grad_norm": 0.25345364212989807, "learning_rate": 0.00019033311561071197, "loss": 0.3008, "step": 79 }, { "epoch": 0.15609756097560976, "grad_norm": 0.3038251996040344, "learning_rate": 0.00019020248203788375, "loss": 0.2636, "step": 80 }, { "epoch": 0.15804878048780488, "grad_norm": 0.3111798167228699, "learning_rate": 0.00019007184846505553, "loss": 0.3155, "step": 81 }, { "epoch": 0.16, "grad_norm": 0.2141313999891281, "learning_rate": 0.0001899412148922273, "loss": 0.1978, "step": 82 }, { "epoch": 0.16195121951219513, "grad_norm": 0.24422025680541992, "learning_rate": 0.0001898105813193991, "loss": 0.2462, "step": 83 }, { "epoch": 0.16390243902439025, "grad_norm": 0.3165203034877777, "learning_rate": 0.00018967994774657086, "loss": 0.3017, "step": 84 }, { "epoch": 0.16585365853658537, "grad_norm": 0.4088727533817291, "learning_rate": 0.00018954931417374267, "loss": 0.2991, "step": 85 }, { "epoch": 0.1678048780487805, "grad_norm": 0.294491171836853, "learning_rate": 0.00018941868060091442, "loss": 0.2971, "step": 86 }, { "epoch": 0.16975609756097562, "grad_norm": 0.4098644554615021, "learning_rate": 0.00018928804702808623, "loss": 0.284, "step": 87 }, { "epoch": 0.17170731707317075, "grad_norm": 0.5976632833480835, "learning_rate": 0.000189157413455258, "loss": 0.3176, "step": 88 }, { "epoch": 0.17365853658536584, "grad_norm": 0.3979010283946991, "learning_rate": 0.00018902677988242978, "loss": 0.2745, "step": 89 }, { "epoch": 0.17560975609756097, "grad_norm": 0.6348708868026733, "learning_rate": 0.00018889614630960156, "loss": 0.3255, "step": 90 }, { "epoch": 0.1775609756097561, "grad_norm": 0.5464184880256653, "learning_rate": 0.00018876551273677337, "loss": 0.2286, "step": 91 }, { "epoch": 0.1795121951219512, "grad_norm": 0.5729949474334717, "learning_rate": 0.00018863487916394515, "loss": 0.2798, "step": 92 }, { "epoch": 0.18146341463414634, "grad_norm": 0.6028706431388855, "learning_rate": 0.00018850424559111692, "loss": 0.2685, "step": 93 }, { "epoch": 0.18341463414634146, "grad_norm": 0.40392380952835083, "learning_rate": 0.0001883736120182887, "loss": 0.2729, "step": 94 }, { "epoch": 0.18536585365853658, "grad_norm": 0.578620195388794, "learning_rate": 0.00018824297844546048, "loss": 0.2794, "step": 95 }, { "epoch": 0.1873170731707317, "grad_norm": 0.5235592722892761, "learning_rate": 0.00018811234487263226, "loss": 0.275, "step": 96 }, { "epoch": 0.18926829268292683, "grad_norm": 0.6984370350837708, "learning_rate": 0.00018798171129980407, "loss": 0.2623, "step": 97 }, { "epoch": 0.19121951219512195, "grad_norm": 0.4455844461917877, "learning_rate": 0.00018785107772697584, "loss": 0.2321, "step": 98 }, { "epoch": 0.19317073170731708, "grad_norm": 0.4393198490142822, "learning_rate": 0.00018772044415414762, "loss": 0.2892, "step": 99 }, { "epoch": 0.1951219512195122, "grad_norm": 0.6566756963729858, "learning_rate": 0.0001875898105813194, "loss": 0.2559, "step": 100 } ], "logging_steps": 1, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.883908951621632e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }