{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.071246819338422,
  "eval_steps": 50,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1272264631043257,
      "grad_norm": 10.307743072509766,
      "learning_rate": 2.5445292620865143e-06,
      "loss": 1.5044,
      "step": 50
    },
    {
      "epoch": 0.1272264631043257,
      "eval_loss": 0.7622461318969727,
      "eval_runtime": 1.1641,
      "eval_samples_per_second": 134.01,
      "eval_steps_per_second": 6.872,
      "step": 50
    },
    {
      "epoch": 0.2544529262086514,
      "grad_norm": 5.364140510559082,
      "learning_rate": 5.0890585241730285e-06,
      "loss": 0.7085,
      "step": 100
    },
    {
      "epoch": 0.2544529262086514,
      "eval_loss": 0.6879841089248657,
      "eval_runtime": 1.1592,
      "eval_samples_per_second": 134.58,
      "eval_steps_per_second": 6.902,
      "step": 100
    },
    {
      "epoch": 0.3816793893129771,
      "grad_norm": 5.230589866638184,
      "learning_rate": 7.633587786259543e-06,
      "loss": 0.682,
      "step": 150
    },
    {
      "epoch": 0.3816793893129771,
      "eval_loss": 0.687479555606842,
      "eval_runtime": 1.1577,
      "eval_samples_per_second": 134.75,
      "eval_steps_per_second": 6.91,
      "step": 150
    },
    {
      "epoch": 0.5089058524173028,
      "grad_norm": 4.248990535736084,
      "learning_rate": 1.0178117048346057e-05,
      "loss": 0.7131,
      "step": 200
    },
    {
      "epoch": 0.5089058524173028,
      "eval_loss": 0.6910625100135803,
      "eval_runtime": 1.157,
      "eval_samples_per_second": 134.834,
      "eval_steps_per_second": 6.915,
      "step": 200
    },
    {
      "epoch": 0.6361323155216285,
      "grad_norm": 2.6538403034210205,
      "learning_rate": 1.2722646310432571e-05,
      "loss": 0.6984,
      "step": 250
    },
    {
      "epoch": 0.6361323155216285,
      "eval_loss": 0.7002120614051819,
      "eval_runtime": 1.1587,
      "eval_samples_per_second": 134.632,
      "eval_steps_per_second": 6.904,
      "step": 250
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 2.6850225925445557,
      "learning_rate": 1.5267175572519086e-05,
      "loss": 0.707,
      "step": 300
    },
    {
      "epoch": 0.7633587786259542,
      "eval_loss": 0.7008151412010193,
      "eval_runtime": 1.1593,
      "eval_samples_per_second": 134.559,
      "eval_steps_per_second": 6.9,
      "step": 300
    },
    {
      "epoch": 0.8905852417302799,
      "grad_norm": 2.7403223514556885,
      "learning_rate": 1.78117048346056e-05,
      "loss": 0.7418,
      "step": 350
    },
    {
      "epoch": 0.8905852417302799,
      "eval_loss": 0.7069897651672363,
      "eval_runtime": 1.1576,
      "eval_samples_per_second": 134.759,
      "eval_steps_per_second": 6.911,
      "step": 350
    },
    {
      "epoch": 1.0178117048346056,
      "grad_norm": 2.4426021575927734,
      "learning_rate": 1.9999806716709255e-05,
      "loss": 0.6877,
      "step": 400
    },
    {
      "epoch": 1.0178117048346056,
      "eval_loss": 0.7735136151313782,
      "eval_runtime": 1.1581,
      "eval_samples_per_second": 134.708,
      "eval_steps_per_second": 6.908,
      "step": 400
    },
    {
      "epoch": 1.1450381679389312,
      "grad_norm": 3.299452781677246,
      "learning_rate": 1.99871868303953e-05,
      "loss": 0.4337,
      "step": 450
    },
    {
      "epoch": 1.1450381679389312,
      "eval_loss": 0.7848060727119446,
      "eval_runtime": 1.158,
      "eval_samples_per_second": 134.715,
      "eval_steps_per_second": 6.908,
      "step": 450
    },
    {
      "epoch": 1.272264631043257,
      "grad_norm": 2.149354934692383,
      "learning_rate": 1.9954872604219543e-05,
      "loss": 0.4442,
      "step": 500
    },
    {
      "epoch": 1.272264631043257,
      "eval_loss": 0.7646350860595703,
      "eval_runtime": 1.1748,
      "eval_samples_per_second": 132.791,
      "eval_steps_per_second": 6.81,
      "step": 500
    },
    {
      "epoch": 1.3994910941475827,
      "grad_norm": 2.396400213241577,
      "learning_rate": 1.9902927760565824e-05,
      "loss": 0.4572,
      "step": 550
    },
    {
      "epoch": 1.3994910941475827,
      "eval_loss": 0.7711533308029175,
      "eval_runtime": 1.1703,
      "eval_samples_per_second": 133.303,
      "eval_steps_per_second": 6.836,
      "step": 550
    },
    {
      "epoch": 1.5267175572519083,
      "grad_norm": 2.4799375534057617,
      "learning_rate": 1.9831454732624023e-05,
      "loss": 0.4566,
      "step": 600
    },
    {
      "epoch": 1.5267175572519083,
      "eval_loss": 0.7778152823448181,
      "eval_runtime": 1.1749,
      "eval_samples_per_second": 132.781,
      "eval_steps_per_second": 6.809,
      "step": 600
    },
    {
      "epoch": 1.6539440203562341,
      "grad_norm": 2.5714755058288574,
      "learning_rate": 1.9740594462395844e-05,
      "loss": 0.4761,
      "step": 650
    },
    {
      "epoch": 1.6539440203562341,
      "eval_loss": 0.7791449427604675,
      "eval_runtime": 1.173,
      "eval_samples_per_second": 132.988,
      "eval_steps_per_second": 6.82,
      "step": 650
    },
    {
      "epoch": 1.78117048346056,
      "grad_norm": 2.1057567596435547,
      "learning_rate": 1.963052612276272e-05,
      "loss": 0.4795,
      "step": 700
    },
    {
      "epoch": 1.78117048346056,
      "eval_loss": 0.7723409533500671,
      "eval_runtime": 1.1727,
      "eval_samples_per_second": 133.024,
      "eval_steps_per_second": 6.822,
      "step": 700
    },
    {
      "epoch": 1.9083969465648853,
      "grad_norm": 2.2185871601104736,
      "learning_rate": 1.950146676416393e-05,
      "loss": 0.4857,
      "step": 750
    },
    {
      "epoch": 1.9083969465648853,
      "eval_loss": 0.7690569758415222,
      "eval_runtime": 1.1699,
      "eval_samples_per_second": 133.35,
      "eval_steps_per_second": 6.838,
      "step": 750
    },
    {
      "epoch": 2.035623409669211,
      "grad_norm": 1.5796678066253662,
      "learning_rate": 1.9353670886581683e-05,
      "loss": 0.4129,
      "step": 800
    },
    {
      "epoch": 2.035623409669211,
      "eval_loss": 0.8792228698730469,
      "eval_runtime": 1.1762,
      "eval_samples_per_second": 132.631,
      "eval_steps_per_second": 6.802,
      "step": 800
    },
    {
      "epoch": 2.162849872773537,
      "grad_norm": 3.589590311050415,
      "learning_rate": 1.9187429937677136e-05,
      "loss": 0.2354,
      "step": 850
    },
    {
      "epoch": 2.162849872773537,
      "eval_loss": 0.8760839700698853,
      "eval_runtime": 1.1703,
      "eval_samples_per_second": 133.302,
      "eval_steps_per_second": 6.836,
      "step": 850
    },
    {
      "epoch": 2.2900763358778624,
      "grad_norm": 2.067382574081421,
      "learning_rate": 1.9003071738067073e-05,
      "loss": 0.245,
      "step": 900
    },
    {
      "epoch": 2.2900763358778624,
      "eval_loss": 0.8845335841178894,
      "eval_runtime": 1.1737,
      "eval_samples_per_second": 132.911,
      "eval_steps_per_second": 6.816,
      "step": 900
    },
    {
      "epoch": 2.4173027989821882,
      "grad_norm": 1.5841869115829468,
      "learning_rate": 1.8800959834874534e-05,
      "loss": 0.2539,
      "step": 950
    },
    {
      "epoch": 2.4173027989821882,
      "eval_loss": 0.8797417879104614,
      "eval_runtime": 1.1697,
      "eval_samples_per_second": 133.369,
      "eval_steps_per_second": 6.839,
      "step": 950
    },
    {
      "epoch": 2.544529262086514,
      "grad_norm": 1.7960104942321777,
      "learning_rate": 1.858149278482817e-05,
      "loss": 0.252,
      "step": 1000
    },
    {
      "epoch": 2.544529262086514,
      "eval_loss": 0.8770042061805725,
      "eval_runtime": 1.1707,
      "eval_samples_per_second": 133.255,
      "eval_steps_per_second": 6.834,
      "step": 1000
    },
    {
      "epoch": 2.67175572519084,
      "grad_norm": 1.3159406185150146,
      "learning_rate": 1.834510336832405e-05,
      "loss": 0.2506,
      "step": 1050
    },
    {
      "epoch": 2.67175572519084,
      "eval_loss": 0.8900387287139893,
      "eval_runtime": 1.1664,
      "eval_samples_per_second": 133.746,
      "eval_steps_per_second": 6.859,
      "step": 1050
    },
    {
      "epoch": 2.7989821882951653,
      "grad_norm": 1.6281907558441162,
      "learning_rate": 1.8092257735999734e-05,
      "loss": 0.251,
      "step": 1100
    },
    {
      "epoch": 2.7989821882951653,
      "eval_loss": 0.8732296824455261,
      "eval_runtime": 1.1759,
      "eval_samples_per_second": 132.663,
      "eval_steps_per_second": 6.803,
      "step": 1100
    },
    {
      "epoch": 2.926208651399491,
      "grad_norm": 1.672808051109314,
      "learning_rate": 1.7823454489503526e-05,
      "loss": 0.2586,
      "step": 1150
    },
    {
      "epoch": 2.926208651399491,
      "eval_loss": 0.8689729571342468,
      "eval_runtime": 1.1706,
      "eval_samples_per_second": 133.26,
      "eval_steps_per_second": 6.834,
      "step": 1150
    },
    {
      "epoch": 3.053435114503817,
      "grad_norm": 1.7687475681304932,
      "learning_rate": 1.753922369827162e-05,
      "loss": 0.2095,
      "step": 1200
    },
    {
      "epoch": 3.053435114503817,
      "eval_loss": 0.9642590284347534,
      "eval_runtime": 1.1707,
      "eval_samples_per_second": 133.258,
      "eval_steps_per_second": 6.834,
      "step": 1200
    },
    {
      "epoch": 3.1806615776081424,
      "grad_norm": 1.3203155994415283,
      "learning_rate": 1.7240125854252043e-05,
      "loss": 0.1574,
      "step": 1250
    },
    {
      "epoch": 3.1806615776081424,
      "eval_loss": 0.9428597688674927,
      "eval_runtime": 1.1695,
      "eval_samples_per_second": 133.393,
      "eval_steps_per_second": 6.841,
      "step": 1250
    },
    {
      "epoch": 3.3078880407124682,
      "grad_norm": 1.3456603288650513,
      "learning_rate": 1.692675076663651e-05,
      "loss": 0.1591,
      "step": 1300
    },
    {
      "epoch": 3.3078880407124682,
      "eval_loss": 0.9436941146850586,
      "eval_runtime": 1.1752,
      "eval_samples_per_second": 132.74,
      "eval_steps_per_second": 6.807,
      "step": 1300
    },
    {
      "epoch": 3.435114503816794,
      "grad_norm": 1.1123533248901367,
      "learning_rate": 1.659971639877992e-05,
      "loss": 0.1649,
      "step": 1350
    },
    {
      "epoch": 3.435114503816794,
      "eval_loss": 0.9538766145706177,
      "eval_runtime": 1.1721,
      "eval_samples_per_second": 133.097,
      "eval_steps_per_second": 6.825,
      "step": 1350
    },
    {
      "epoch": 3.5623409669211195,
      "grad_norm": 0.7998117804527283,
      "learning_rate": 1.6259667649600907e-05,
      "loss": 0.1664,
      "step": 1400
    },
    {
      "epoch": 3.5623409669211195,
      "eval_loss": 0.9487420320510864,
      "eval_runtime": 1.177,
      "eval_samples_per_second": 132.543,
      "eval_steps_per_second": 6.797,
      "step": 1400
    },
    {
      "epoch": 3.6895674300254453,
      "grad_norm": 1.3396856784820557,
      "learning_rate": 1.5907275081866504e-05,
      "loss": 0.1648,
      "step": 1450
    },
    {
      "epoch": 3.6895674300254453,
      "eval_loss": 0.9679978489875793,
      "eval_runtime": 1.1736,
      "eval_samples_per_second": 132.92,
      "eval_steps_per_second": 6.816,
      "step": 1450
    },
    {
      "epoch": 3.816793893129771,
      "grad_norm": 2.01399827003479,
      "learning_rate": 1.5543233599868744e-05,
      "loss": 0.1675,
      "step": 1500
    },
    {
      "epoch": 3.816793893129771,
      "eval_loss": 0.9473732113838196,
      "eval_runtime": 1.1771,
      "eval_samples_per_second": 132.529,
      "eval_steps_per_second": 6.796,
      "step": 1500
    },
    {
      "epoch": 3.9440203562340965,
      "grad_norm": 1.3724688291549683,
      "learning_rate": 1.5168261079100695e-05,
      "loss": 0.1637,
      "step": 1550
    },
    {
      "epoch": 3.9440203562340965,
      "eval_loss": 0.970781683921814,
      "eval_runtime": 1.1683,
      "eval_samples_per_second": 133.524,
      "eval_steps_per_second": 6.847,
      "step": 1550
    },
    {
      "epoch": 4.071246819338422,
      "grad_norm": 1.5431095361709595,
      "learning_rate": 1.4783096950634211e-05,
      "loss": 0.1357,
      "step": 1600
    },
    {
      "epoch": 4.071246819338422,
      "eval_loss": 1.0111091136932373,
      "eval_runtime": 1.1639,
      "eval_samples_per_second": 134.027,
      "eval_steps_per_second": 6.873,
      "step": 1600
    }
  ],
  "logging_steps": 50,
  "max_steps": 3930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 400,
  "total_flos": 7.79604032791511e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}