|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.928870292887029,
  "eval_steps": 100,
  "global_step": 5600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05230125523012552,
      "eval_accuracy": 0.10215735262383098,
      "eval_loss": 5.751355171203613,
      "eval_runtime": 184.0542,
      "eval_samples_per_second": 80.194,
      "eval_steps_per_second": 0.63,
      "step": 100
    },
    {
      "epoch": 0.10460251046025104,
      "eval_accuracy": 0.12205444695787844,
      "eval_loss": 5.327404975891113,
      "eval_runtime": 185.0066,
      "eval_samples_per_second": 79.781,
      "eval_steps_per_second": 0.627,
      "step": 200
    },
    {
      "epoch": 0.15690376569037656,
      "eval_accuracy": 0.13593891242440487,
      "eval_loss": 5.167089939117432,
      "eval_runtime": 184.2672,
      "eval_samples_per_second": 80.101,
      "eval_steps_per_second": 0.63,
      "step": 300
    },
    {
      "epoch": 0.20920502092050208,
      "eval_accuracy": 0.14769799376050313,
      "eval_loss": 5.03983736038208,
      "eval_runtime": 184.4381,
      "eval_samples_per_second": 80.027,
      "eval_steps_per_second": 0.629,
      "step": 400
    },
    {
      "epoch": 0.2615062761506276,
      "grad_norm": 1.4375,
      "learning_rate": 4.564156206415621e-05,
      "loss": 5.3792,
      "step": 500
    },
    {
      "epoch": 0.2615062761506276,
      "eval_accuracy": 0.15595240798761018,
      "eval_loss": 4.976524353027344,
      "eval_runtime": 184.12,
      "eval_samples_per_second": 80.165,
      "eval_steps_per_second": 0.63,
      "step": 500
    },
    {
      "epoch": 0.3138075313807531,
      "eval_accuracy": 0.16177388065723808,
      "eval_loss": 4.927117347717285,
      "eval_runtime": 184.3296,
      "eval_samples_per_second": 80.074,
      "eval_steps_per_second": 0.629,
      "step": 600
    },
    {
      "epoch": 0.36610878661087864,
      "eval_accuracy": 0.16636529077842815,
      "eval_loss": 4.898035049438477,
      "eval_runtime": 184.3142,
      "eval_samples_per_second": 80.081,
      "eval_steps_per_second": 0.629,
      "step": 700
    },
    {
      "epoch": 0.41841004184100417,
      "eval_accuracy": 0.1694151293866953,
      "eval_loss": 4.875,
      "eval_runtime": 185.3011,
      "eval_samples_per_second": 79.654,
      "eval_steps_per_second": 0.626,
      "step": 800
    },
    {
      "epoch": 0.4707112970711297,
      "eval_accuracy": 0.17219956637036357,
      "eval_loss": 4.854589939117432,
      "eval_runtime": 184.8498,
      "eval_samples_per_second": 79.849,
      "eval_steps_per_second": 0.628,
      "step": 900
    },
    {
      "epoch": 0.5230125523012552,
      "grad_norm": 1.2265625,
      "learning_rate": 4.128312412831242e-05,
      "loss": 4.8385,
      "step": 1000
    },
    {
      "epoch": 0.5230125523012552,
      "eval_accuracy": 0.17474116808932638,
      "eval_loss": 4.833265781402588,
      "eval_runtime": 184.5693,
      "eval_samples_per_second": 79.97,
      "eval_steps_per_second": 0.628,
      "step": 1000
    },
    {
      "epoch": 0.5753138075313807,
      "eval_accuracy": 0.17643840085242252,
      "eval_loss": 4.817920207977295,
      "eval_runtime": 184.1405,
      "eval_samples_per_second": 80.156,
      "eval_steps_per_second": 0.63,
      "step": 1100
    },
    {
      "epoch": 0.6276150627615062,
      "eval_accuracy": 0.1773740269532399,
      "eval_loss": 4.811416149139404,
      "eval_runtime": 184.0435,
      "eval_samples_per_second": 80.198,
      "eval_steps_per_second": 0.63,
      "step": 1200
    },
    {
      "epoch": 0.6799163179916318,
      "eval_accuracy": 0.17846844188652472,
      "eval_loss": 4.802201747894287,
      "eval_runtime": 185.0497,
      "eval_samples_per_second": 79.762,
      "eval_steps_per_second": 0.627,
      "step": 1300
    },
    {
      "epoch": 0.7322175732217573,
      "eval_accuracy": 0.1789691379773809,
      "eval_loss": 4.7901930809021,
      "eval_runtime": 184.9855,
      "eval_samples_per_second": 79.79,
      "eval_steps_per_second": 0.627,
      "step": 1400
    },
    {
      "epoch": 0.7845188284518828,
      "grad_norm": 1.5703125,
      "learning_rate": 3.6924686192468624e-05,
      "loss": 4.7486,
      "step": 1500
    },
    {
      "epoch": 0.7845188284518828,
      "eval_accuracy": 0.18001693513769368,
      "eval_loss": 4.785585880279541,
      "eval_runtime": 183.9922,
      "eval_samples_per_second": 80.221,
      "eval_steps_per_second": 0.63,
      "step": 1500
    },
    {
      "epoch": 0.8368200836820083,
      "eval_accuracy": 0.18058760065826654,
      "eval_loss": 4.779539108276367,
      "eval_runtime": 184.1103,
      "eval_samples_per_second": 80.169,
      "eval_steps_per_second": 0.63,
      "step": 1600
    },
    {
      "epoch": 0.8891213389121339,
      "eval_accuracy": 0.18107188047030814,
      "eval_loss": 4.776101112365723,
      "eval_runtime": 184.1674,
      "eval_samples_per_second": 80.144,
      "eval_steps_per_second": 0.63,
      "step": 1700
    },
    {
      "epoch": 0.9414225941422594,
      "eval_accuracy": 0.18138128706525738,
      "eval_loss": 4.7754740715026855,
      "eval_runtime": 184.197,
      "eval_samples_per_second": 80.132,
      "eval_steps_per_second": 0.63,
      "step": 1800
    },
    {
      "epoch": 0.9937238493723849,
      "eval_accuracy": 0.18191716908996425,
      "eval_loss": 4.767343997955322,
      "eval_runtime": 184.0497,
      "eval_samples_per_second": 80.196,
      "eval_steps_per_second": 0.63,
      "step": 1900
    },
    {
      "epoch": 1.0460251046025104,
      "grad_norm": 1.203125,
      "learning_rate": 3.2566248256624825e-05,
      "loss": 4.7159,
      "step": 2000
    },
    {
      "epoch": 1.0460251046025104,
      "eval_accuracy": 0.18160189904678178,
      "eval_loss": 4.769783020019531,
      "eval_runtime": 185.2615,
      "eval_samples_per_second": 79.671,
      "eval_steps_per_second": 0.626,
      "step": 2000
    },
    {
      "epoch": 1.098326359832636,
      "eval_accuracy": 0.18223365637995678,
      "eval_loss": 4.763906002044678,
      "eval_runtime": 184.3991,
      "eval_samples_per_second": 80.044,
      "eval_steps_per_second": 0.629,
      "step": 2100
    },
    {
      "epoch": 1.1506276150627615,
      "eval_accuracy": 0.18256457647135646,
      "eval_loss": 4.761280536651611,
      "eval_runtime": 185.0808,
      "eval_samples_per_second": 79.749,
      "eval_steps_per_second": 0.627,
      "step": 2200
    },
    {
      "epoch": 1.202928870292887,
      "eval_accuracy": 0.18278985816542134,
      "eval_loss": 4.7557759284973145,
      "eval_runtime": 184.6355,
      "eval_samples_per_second": 79.941,
      "eval_steps_per_second": 0.628,
      "step": 2300
    },
    {
      "epoch": 1.2552301255230125,
      "eval_accuracy": 0.18294602847055624,
      "eval_loss": 4.75867223739624,
      "eval_runtime": 184.9626,
      "eval_samples_per_second": 79.8,
      "eval_steps_per_second": 0.627,
      "step": 2400
    },
    {
      "epoch": 1.3075313807531381,
      "grad_norm": 1.75,
      "learning_rate": 2.8207810320781032e-05,
      "loss": 4.6997,
      "step": 2500
    },
    {
      "epoch": 1.3075313807531381,
      "eval_accuracy": 0.18342178332552747,
      "eval_loss": 4.754149913787842,
      "eval_runtime": 185.1377,
      "eval_samples_per_second": 79.724,
      "eval_steps_per_second": 0.627,
      "step": 2500
    },
    {
      "epoch": 1.3598326359832635,
      "eval_accuracy": 0.1834138543952502,
      "eval_loss": 4.75181245803833,
      "eval_runtime": 184.6159,
      "eval_samples_per_second": 79.95,
      "eval_steps_per_second": 0.628,
      "step": 2600
    },
    {
      "epoch": 1.4121338912133892,
      "eval_accuracy": 0.1836629457159783,
      "eval_loss": 4.746477127075195,
      "eval_runtime": 185.4196,
      "eval_samples_per_second": 79.603,
      "eval_steps_per_second": 0.626,
      "step": 2700
    },
    {
      "epoch": 1.4644351464435146,
      "eval_accuracy": 0.18383634667304455,
      "eval_loss": 4.750728130340576,
      "eval_runtime": 184.604,
      "eval_samples_per_second": 79.955,
      "eval_steps_per_second": 0.628,
      "step": 2800
    },
    {
      "epoch": 1.5167364016736402,
      "eval_accuracy": 0.1834985251891202,
      "eval_loss": 4.751083850860596,
      "eval_runtime": 183.9829,
      "eval_samples_per_second": 80.225,
      "eval_steps_per_second": 0.63,
      "step": 2900
    },
    {
      "epoch": 1.5690376569037658,
      "grad_norm": 1.1328125,
      "learning_rate": 2.3849372384937242e-05,
      "loss": 4.6905,
      "step": 3000
    },
    {
      "epoch": 1.5690376569037658,
      "eval_accuracy": 0.18385013375825832,
      "eval_loss": 4.750813007354736,
      "eval_runtime": 184.3904,
      "eval_samples_per_second": 80.048,
      "eval_steps_per_second": 0.629,
      "step": 3000
    },
    {
      "epoch": 1.6213389121338913,
      "eval_accuracy": 0.18415784707342428,
      "eval_loss": 4.746849536895752,
      "eval_runtime": 184.2103,
      "eval_samples_per_second": 80.126,
      "eval_steps_per_second": 0.63,
      "step": 3100
    },
    {
      "epoch": 1.6736401673640167,
      "eval_accuracy": 0.1842263748450887,
      "eval_loss": 4.746747970581055,
      "eval_runtime": 184.1204,
      "eval_samples_per_second": 80.165,
      "eval_steps_per_second": 0.63,
      "step": 3200
    },
    {
      "epoch": 1.7259414225941423,
      "eval_accuracy": 0.18430792521293346,
      "eval_loss": 4.745037078857422,
      "eval_runtime": 184.2802,
      "eval_samples_per_second": 80.095,
      "eval_steps_per_second": 0.629,
      "step": 3300
    },
    {
      "epoch": 1.778242677824268,
      "eval_accuracy": 0.184430838994751,
      "eval_loss": 4.746375560760498,
      "eval_runtime": 184.234,
      "eval_samples_per_second": 80.116,
      "eval_steps_per_second": 0.63,
      "step": 3400
    },
    {
      "epoch": 1.8305439330543933,
      "grad_norm": 2.46875,
      "learning_rate": 1.9490934449093446e-05,
      "loss": 4.687,
      "step": 3500
    },
    {
      "epoch": 1.8305439330543933,
      "eval_accuracy": 0.18449599939000436,
      "eval_loss": 4.7423272132873535,
      "eval_runtime": 184.1596,
      "eval_samples_per_second": 80.148,
      "eval_steps_per_second": 0.63,
      "step": 3500
    },
    {
      "epoch": 1.8828451882845187,
      "eval_accuracy": 0.18466320271445863,
      "eval_loss": 4.74322509765625,
      "eval_runtime": 183.9907,
      "eval_samples_per_second": 80.221,
      "eval_steps_per_second": 0.63,
      "step": 3600
    },
    {
      "epoch": 1.9351464435146444,
      "eval_accuracy": 0.18432753476168612,
      "eval_loss": 4.744410514831543,
      "eval_runtime": 183.9473,
      "eval_samples_per_second": 80.24,
      "eval_steps_per_second": 0.631,
      "step": 3700
    },
    {
      "epoch": 1.98744769874477,
      "eval_accuracy": 0.18470474137931034,
      "eval_loss": 4.74097204208374,
      "eval_runtime": 184.0059,
      "eval_samples_per_second": 80.215,
      "eval_steps_per_second": 0.63,
      "step": 3800
    },
    {
      "epoch": 2.0397489539748954,
      "eval_accuracy": 0.1845896528685144,
      "eval_loss": 4.744495391845703,
      "eval_runtime": 184.1792,
      "eval_samples_per_second": 80.139,
      "eval_steps_per_second": 0.63,
      "step": 3900
    },
    {
      "epoch": 2.092050209205021,
      "grad_norm": 1.015625,
      "learning_rate": 1.5132496513249652e-05,
      "loss": 4.6822,
      "step": 4000
    },
    {
      "epoch": 2.092050209205021,
      "eval_accuracy": 0.1840621610356637,
      "eval_loss": 4.743766784667969,
      "eval_runtime": 184.0079,
      "eval_samples_per_second": 80.214,
      "eval_steps_per_second": 0.63,
      "step": 4000
    },
    {
      "epoch": 2.1443514644351462,
      "eval_accuracy": 0.18438914008407487,
      "eval_loss": 4.742242336273193,
      "eval_runtime": 184.052,
      "eval_samples_per_second": 80.195,
      "eval_steps_per_second": 0.63,
      "step": 4100
    },
    {
      "epoch": 2.196652719665272,
      "eval_accuracy": 0.18475792298529636,
      "eval_loss": 4.741429328918457,
      "eval_runtime": 184.719,
      "eval_samples_per_second": 79.905,
      "eval_steps_per_second": 0.628,
      "step": 4200
    },
    {
      "epoch": 2.2489539748953975,
      "eval_accuracy": 0.1848885987251326,
      "eval_loss": 4.740514755249023,
      "eval_runtime": 184.0592,
      "eval_samples_per_second": 80.192,
      "eval_steps_per_second": 0.63,
      "step": 4300
    },
    {
      "epoch": 2.301255230125523,
      "eval_accuracy": 0.18497058180948975,
      "eval_loss": 4.738888740539551,
      "eval_runtime": 184.7818,
      "eval_samples_per_second": 79.878,
      "eval_steps_per_second": 0.628,
      "step": 4400
    },
    {
      "epoch": 2.3535564853556483,
      "grad_norm": 1.5546875,
      "learning_rate": 1.0774058577405859e-05,
      "loss": 4.6787,
      "step": 4500
    },
    {
      "epoch": 2.3535564853556483,
      "eval_accuracy": 0.18458716879639453,
      "eval_loss": 4.743495941162109,
      "eval_runtime": 185.012,
      "eval_samples_per_second": 79.779,
      "eval_steps_per_second": 0.627,
      "step": 4500
    },
    {
      "epoch": 2.405857740585774,
      "eval_accuracy": 0.18485314383095383,
      "eval_loss": 4.742412090301514,
      "eval_runtime": 184.018,
      "eval_samples_per_second": 80.21,
      "eval_steps_per_second": 0.63,
      "step": 4600
    },
    {
      "epoch": 2.4581589958158996,
      "eval_accuracy": 0.18451762653729054,
      "eval_loss": 4.744495391845703,
      "eval_runtime": 184.1989,
      "eval_samples_per_second": 80.131,
      "eval_steps_per_second": 0.63,
      "step": 4700
    },
    {
      "epoch": 2.510460251046025,
      "eval_accuracy": 0.18499375944599727,
      "eval_loss": 4.742056369781494,
      "eval_runtime": 184.9544,
      "eval_samples_per_second": 79.803,
      "eval_steps_per_second": 0.627,
      "step": 4800
    },
    {
      "epoch": 2.562761506276151,
      "eval_accuracy": 0.1845635125940072,
      "eval_loss": 4.74495267868042,
      "eval_runtime": 184.45,
      "eval_samples_per_second": 80.022,
      "eval_steps_per_second": 0.629,
      "step": 4900
    },
    {
      "epoch": 2.6150627615062763,
      "grad_norm": 1.25,
      "learning_rate": 6.415620641562065e-06,
      "loss": 4.6809,
      "step": 5000
    },
    {
      "epoch": 2.6150627615062763,
      "eval_accuracy": 0.18461182940869522,
      "eval_loss": 4.739973068237305,
      "eval_runtime": 185.0928,
      "eval_samples_per_second": 79.744,
      "eval_steps_per_second": 0.627,
      "step": 5000
    },
    {
      "epoch": 2.6673640167364017,
      "eval_accuracy": 0.18471778348337312,
      "eval_loss": 4.740243911743164,
      "eval_runtime": 184.3979,
      "eval_samples_per_second": 80.044,
      "eval_steps_per_second": 0.629,
      "step": 5100
    },
    {
      "epoch": 2.719665271966527,
      "eval_accuracy": 0.18489124970131737,
      "eval_loss": 4.738804340362549,
      "eval_runtime": 184.6067,
      "eval_samples_per_second": 79.954,
      "eval_steps_per_second": 0.628,
      "step": 5200
    },
    {
      "epoch": 2.7719665271966525,
      "eval_accuracy": 0.18462476994845917,
      "eval_loss": 4.741514205932617,
      "eval_runtime": 184.2661,
      "eval_samples_per_second": 80.102,
      "eval_steps_per_second": 0.63,
      "step": 5300
    },
    {
      "epoch": 2.8242677824267783,
      "eval_accuracy": 0.18472434773026122,
      "eval_loss": 4.739160060882568,
      "eval_runtime": 185.2276,
      "eval_samples_per_second": 79.686,
      "eval_steps_per_second": 0.626,
      "step": 5400
    },
    {
      "epoch": 2.8765690376569037,
      "grad_norm": 1.6796875,
      "learning_rate": 2.057182705718271e-06,
      "loss": 4.6819,
      "step": 5500
    },
    {
      "epoch": 2.8765690376569037,
      "eval_accuracy": 0.18477532080101736,
      "eval_loss": 4.742056369781494,
      "eval_runtime": 184.4871,
      "eval_samples_per_second": 80.006,
      "eval_steps_per_second": 0.629,
      "step": 5500
    },
    {
      "epoch": 2.928870292887029,
      "eval_accuracy": 0.18478108276943064,
      "eval_loss": 4.740701198577881,
      "eval_runtime": 184.224,
      "eval_samples_per_second": 80.12,
      "eval_steps_per_second": 0.63,
      "step": 5600
    }
  ],
  "logging_steps": 500,
  "max_steps": 5736,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "total_flos": 6.743272090868122e+17,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}