{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 6, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 0.08957596868276596, "learning_rate": 2e-05, "loss": 1.0134, "step": 1 }, { "epoch": 0.03, "eval_loss": 1.0981205701828003, "eval_runtime": 2.5128, "eval_samples_per_second": 1.194, "eval_steps_per_second": 1.194, "step": 1 }, { "epoch": 0.05, "grad_norm": 0.07771213352680206, "learning_rate": 4e-05, "loss": 0.9545, "step": 2 }, { "epoch": 0.08, "grad_norm": 0.1224137470126152, "learning_rate": 6e-05, "loss": 1.1733, "step": 3 }, { "epoch": 0.1, "grad_norm": 0.09190034121274948, "learning_rate": 8e-05, "loss": 0.9954, "step": 4 }, { "epoch": 0.13, "grad_norm": 0.08263542503118515, "learning_rate": 0.0001, "loss": 0.9486, "step": 5 }, { "epoch": 0.15, "grad_norm": 0.09250061959028244, "learning_rate": 0.00012, "loss": 0.972, "step": 6 }, { "epoch": 0.15, "eval_loss": 1.0735869407653809, "eval_runtime": 2.5357, "eval_samples_per_second": 1.183, "eval_steps_per_second": 1.183, "step": 6 }, { "epoch": 0.18, "grad_norm": 0.1398034691810608, "learning_rate": 0.00014, "loss": 1.0445, "step": 7 }, { "epoch": 0.21, "grad_norm": 0.0993918851017952, "learning_rate": 0.00016, "loss": 0.9169, "step": 8 }, { "epoch": 0.23, "grad_norm": 0.07937725633382797, "learning_rate": 0.00018, "loss": 0.8462, "step": 9 }, { "epoch": 0.26, "grad_norm": 0.10001373291015625, "learning_rate": 0.0002, "loss": 0.8708, "step": 10 }, { "epoch": 0.28, "grad_norm": 0.1337287873029709, "learning_rate": 0.00019995690062269984, "loss": 0.86, "step": 11 }, { "epoch": 0.31, "grad_norm": 0.11684636771678925, "learning_rate": 0.00019982763964192585, "loss": 0.7982, "step": 12 }, { "epoch": 0.31, "eval_loss": 0.8548387885093689, "eval_runtime": 2.5536, "eval_samples_per_second": 1.175, "eval_steps_per_second": 1.175, "step": 12 }, { "epoch": 0.33, "grad_norm": 0.12103456258773804, "learning_rate": 0.0001996123284790336, "loss": 0.7906, "step": 13 }, { "epoch": 0.36, "grad_norm": 0.1426106095314026, "learning_rate": 0.00019931115272956405, "loss": 0.7825, "step": 14 }, { "epoch": 0.38, "grad_norm": 0.12367941439151764, "learning_rate": 0.0001989243720032624, "loss": 0.7341, "step": 15 }, { "epoch": 0.41, "grad_norm": 0.10154826194047928, "learning_rate": 0.00019845231970029773, "loss": 0.7064, "step": 16 }, { "epoch": 0.44, "grad_norm": 0.13628405332565308, "learning_rate": 0.0001978954027238763, "loss": 0.6988, "step": 17 }, { "epoch": 0.46, "grad_norm": 0.11276472359895706, "learning_rate": 0.0001972541011294959, "loss": 0.6944, "step": 18 }, { "epoch": 0.46, "eval_loss": 0.7151015400886536, "eval_runtime": 2.5734, "eval_samples_per_second": 1.166, "eval_steps_per_second": 1.166, "step": 18 }, { "epoch": 0.49, "grad_norm": 0.13381372392177582, "learning_rate": 0.00019652896771114414, "loss": 0.6956, "step": 19 }, { "epoch": 0.51, "grad_norm": 0.11248588562011719, "learning_rate": 0.00019572062752479683, "loss": 0.7155, "step": 20 }, { "epoch": 0.54, "grad_norm": 0.17762312293052673, "learning_rate": 0.00019482977734962753, "loss": 0.7357, "step": 21 }, { "epoch": 0.56, "grad_norm": 0.10546916723251343, "learning_rate": 0.00019385718508739262, "loss": 0.6691, "step": 22 }, { "epoch": 0.59, "grad_norm": 0.3150898516178131, "learning_rate": 0.00019280368910050942, "loss": 0.7167, "step": 23 }, { "epoch": 0.62, "grad_norm": 0.13151158392429352, "learning_rate": 0.00019167019748939846, "loss": 0.6808, "step": 24 }, { "epoch": 0.62, "eval_loss": 0.6942548751831055, "eval_runtime": 2.5831, "eval_samples_per_second": 1.161, "eval_steps_per_second": 1.161, "step": 24 }, { "epoch": 0.64, "grad_norm": 0.14906296133995056, "learning_rate": 0.00019045768730971196, "loss": 0.6762, "step": 25 }, { "epoch": 0.67, "grad_norm": 0.19484123587608337, "learning_rate": 0.00018916720373012426, "loss": 0.6854, "step": 26 }, { "epoch": 0.69, "grad_norm": 0.12819896638393402, "learning_rate": 0.00018779985913140924, "loss": 0.6873, "step": 27 }, { "epoch": 0.72, "grad_norm": 0.21385614573955536, "learning_rate": 0.00018635683214758214, "loss": 0.6874, "step": 28 }, { "epoch": 0.74, "grad_norm": 0.12286895513534546, "learning_rate": 0.0001848393666499315, "loss": 0.6843, "step": 29 }, { "epoch": 0.77, "grad_norm": 0.08534862101078033, "learning_rate": 0.00018324877067481783, "loss": 0.6763, "step": 30 }, { "epoch": 0.77, "eval_loss": 0.6821426749229431, "eval_runtime": 2.5911, "eval_samples_per_second": 1.158, "eval_steps_per_second": 1.158, "step": 30 }, { "epoch": 0.79, "grad_norm": 0.17990928888320923, "learning_rate": 0.0001815864152961624, "loss": 0.6789, "step": 31 }, { "epoch": 0.82, "grad_norm": 0.12137839943170547, "learning_rate": 0.0001798537334435986, "loss": 0.6877, "step": 32 }, { "epoch": 0.85, "grad_norm": 0.10240964591503143, "learning_rate": 0.00017805221866730458, "loss": 0.6725, "step": 33 }, { "epoch": 0.87, "grad_norm": 0.14333295822143555, "learning_rate": 0.00017618342385058145, "loss": 0.6745, "step": 34 }, { "epoch": 0.9, "grad_norm": 0.0904482752084732, "learning_rate": 0.00017424895987128722, "loss": 0.6894, "step": 35 }, { "epoch": 0.92, "grad_norm": 0.11753042787313461, "learning_rate": 0.00017225049421328023, "loss": 0.67, "step": 36 }, { "epoch": 0.92, "eval_loss": 0.6763580441474915, "eval_runtime": 2.5955, "eval_samples_per_second": 1.156, "eval_steps_per_second": 1.156, "step": 36 }, { "epoch": 0.95, "grad_norm": 0.13719823956489563, "learning_rate": 0.00017018974952906884, "loss": 0.6589, "step": 37 }, { "epoch": 0.97, "grad_norm": 0.1040361225605011, "learning_rate": 0.0001680685021549063, "loss": 0.666, "step": 38 }, { "epoch": 1.0, "grad_norm": 0.07594098895788193, "learning_rate": 0.00016588858057961113, "loss": 0.645, "step": 39 }, { "epoch": 1.03, "grad_norm": 0.08139798045158386, "learning_rate": 0.0001636518638684325, "loss": 0.6542, "step": 40 }, { "epoch": 1.05, "grad_norm": 0.07313457876443863, "learning_rate": 0.0001613602800433194, "loss": 0.6458, "step": 41 }, { "epoch": 1.08, "grad_norm": 0.07903215289115906, "learning_rate": 0.00015901580442098968, "loss": 0.6424, "step": 42 }, { "epoch": 1.08, "eval_loss": 0.6730008125305176, "eval_runtime": 2.5989, "eval_samples_per_second": 1.154, "eval_steps_per_second": 1.154, "step": 42 }, { "epoch": 1.1, "grad_norm": 0.09322352707386017, "learning_rate": 0.00015662045791023173, "loss": 0.6567, "step": 43 }, { "epoch": 1.13, "grad_norm": 0.07249985635280609, "learning_rate": 0.00015417630526990615, "loss": 0.6384, "step": 44 }, { "epoch": 1.15, "grad_norm": 0.07686451077461243, "learning_rate": 0.0001516854533291494, "loss": 0.665, "step": 45 }, { "epoch": 1.18, "grad_norm": 0.07324113696813583, "learning_rate": 0.00014915004917131344, "loss": 0.6297, "step": 46 }, { "epoch": 1.21, "grad_norm": 0.09203895926475525, "learning_rate": 0.00014657227828320635, "loss": 0.6539, "step": 47 }, { "epoch": 1.23, "grad_norm": 0.09338624030351639, "learning_rate": 0.00014395436267123016, "loss": 0.6552, "step": 48 }, { "epoch": 1.23, "eval_loss": 0.6780009269714355, "eval_runtime": 2.6045, "eval_samples_per_second": 1.152, "eval_steps_per_second": 1.152, "step": 48 }, { "epoch": 1.26, "grad_norm": 0.0812142863869667, "learning_rate": 0.00014129855894603886, "loss": 0.6319, "step": 49 }, { "epoch": 1.28, "grad_norm": 0.19316132366657257, "learning_rate": 0.00013860715637736818, "loss": 0.7, "step": 50 }, { "epoch": 1.31, "grad_norm": 0.10698059946298599, "learning_rate": 0.0001358824749207136, "loss": 0.6725, "step": 51 }, { "epoch": 1.33, "grad_norm": 0.14100198447704315, "learning_rate": 0.00013312686321755761, "loss": 0.6766, "step": 52 }, { "epoch": 1.36, "grad_norm": 0.09599179029464722, "learning_rate": 0.00013034269657086992, "loss": 0.645, "step": 53 }, { "epoch": 1.38, "grad_norm": 0.08999059349298477, "learning_rate": 0.000127532374897626, "loss": 0.6527, "step": 54 }, { "epoch": 1.38, "eval_loss": 0.6689873337745667, "eval_runtime": 2.6108, "eval_samples_per_second": 1.149, "eval_steps_per_second": 1.149, "step": 54 }, { "epoch": 1.41, "grad_norm": 0.13835830986499786, "learning_rate": 0.00012469832066010843, "loss": 0.6561, "step": 55 }, { "epoch": 1.44, "grad_norm": 0.10695886611938477, "learning_rate": 0.00012184297677777463, "loss": 0.6668, "step": 56 }, { "epoch": 1.46, "grad_norm": 0.0739368349313736, "learning_rate": 0.00011896880452149077, "loss": 0.643, "step": 57 }, { "epoch": 1.49, "grad_norm": 0.21791452169418335, "learning_rate": 0.00011607828139194683, "loss": 0.6768, "step": 58 }, { "epoch": 1.51, "grad_norm": 0.06241246312856674, "learning_rate": 0.00011317389898408189, "loss": 0.6252, "step": 59 }, { "epoch": 1.54, "grad_norm": 0.1302526593208313, "learning_rate": 0.00011025816083936036, "loss": 0.6624, "step": 60 }, { "epoch": 1.54, "eval_loss": 0.6632375121116638, "eval_runtime": 2.6043, "eval_samples_per_second": 1.152, "eval_steps_per_second": 1.152, "step": 60 }, { "epoch": 1.56, "grad_norm": 0.11702455580234528, "learning_rate": 0.0001073335802877504, "loss": 0.6522, "step": 61 }, { "epoch": 1.59, "grad_norm": 0.08904154598712921, "learning_rate": 0.00010440267828126478, "loss": 0.6472, "step": 62 }, { "epoch": 1.62, "grad_norm": 0.08021406084299088, "learning_rate": 0.00010146798122093166, "loss": 0.6279, "step": 63 }, { "epoch": 1.64, "grad_norm": 0.07384659349918365, "learning_rate": 9.853201877906836e-05, "loss": 0.6262, "step": 64 }, { "epoch": 1.67, "grad_norm": 0.06457240134477615, "learning_rate": 9.559732171873523e-05, "loss": 0.64, "step": 65 }, { "epoch": 1.69, "grad_norm": 0.07967618852853775, "learning_rate": 9.266641971224963e-05, "loss": 0.6228, "step": 66 }, { "epoch": 1.69, "eval_loss": 0.6625072360038757, "eval_runtime": 2.6047, "eval_samples_per_second": 1.152, "eval_steps_per_second": 1.152, "step": 66 }, { "epoch": 1.72, "grad_norm": 0.09555868804454803, "learning_rate": 8.974183916063968e-05, "loss": 0.635, "step": 67 }, { "epoch": 1.74, "grad_norm": 0.07187359035015106, "learning_rate": 8.682610101591814e-05, "loss": 0.6277, "step": 68 }, { "epoch": 1.77, "grad_norm": 0.091610848903656, "learning_rate": 8.392171860805319e-05, "loss": 0.6649, "step": 69 }, { "epoch": 1.79, "grad_norm": 0.065833680331707, "learning_rate": 8.103119547850924e-05, "loss": 0.6262, "step": 70 }, { "epoch": 1.82, "grad_norm": 0.09459354728460312, "learning_rate": 7.815702322222538e-05, "loss": 0.6359, "step": 71 }, { "epoch": 1.85, "grad_norm": 0.06780053675174713, "learning_rate": 7.530167933989161e-05, "loss": 0.6447, "step": 72 }, { "epoch": 1.85, "eval_loss": 0.6616933941841125, "eval_runtime": 2.607, "eval_samples_per_second": 1.151, "eval_steps_per_second": 1.151, "step": 72 }, { "epoch": 1.87, "grad_norm": 0.0954224094748497, "learning_rate": 7.246762510237403e-05, "loss": 0.6636, "step": 73 }, { "epoch": 1.9, "grad_norm": 0.0937703400850296, "learning_rate": 6.96573034291301e-05, "loss": 0.6381, "step": 74 }, { "epoch": 1.92, "grad_norm": 0.10935033112764359, "learning_rate": 6.687313678244242e-05, "loss": 0.628, "step": 75 }, { "epoch": 1.95, "grad_norm": 0.08154003322124481, "learning_rate": 6.411752507928642e-05, "loss": 0.6386, "step": 76 }, { "epoch": 1.97, "grad_norm": 0.12196218967437744, "learning_rate": 6.139284362263185e-05, "loss": 0.6317, "step": 77 }, { "epoch": 2.0, "grad_norm": 0.11538293212652206, "learning_rate": 5.870144105396118e-05, "loss": 0.6409, "step": 78 }, { "epoch": 2.0, "eval_loss": 0.6598871350288391, "eval_runtime": 2.6073, "eval_samples_per_second": 1.151, "eval_steps_per_second": 1.151, "step": 78 } ], "logging_steps": 1, "max_steps": 117, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.2819075742826496e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }