{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.928870292887029, "eval_steps": 100, "global_step": 5600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05230125523012552, "eval_accuracy": 0.10215735262383098, "eval_loss": 5.751355171203613, "eval_runtime": 184.0542, "eval_samples_per_second": 80.194, "eval_steps_per_second": 0.63, "step": 100 }, { "epoch": 0.10460251046025104, "eval_accuracy": 0.12205444695787844, "eval_loss": 5.327404975891113, "eval_runtime": 185.0066, "eval_samples_per_second": 79.781, "eval_steps_per_second": 0.627, "step": 200 }, { "epoch": 0.15690376569037656, "eval_accuracy": 0.13593891242440487, "eval_loss": 5.167089939117432, "eval_runtime": 184.2672, "eval_samples_per_second": 80.101, "eval_steps_per_second": 0.63, "step": 300 }, { "epoch": 0.20920502092050208, "eval_accuracy": 0.14769799376050313, "eval_loss": 5.03983736038208, "eval_runtime": 184.4381, "eval_samples_per_second": 80.027, "eval_steps_per_second": 0.629, "step": 400 }, { "epoch": 0.2615062761506276, "grad_norm": 1.4375, "learning_rate": 4.564156206415621e-05, "loss": 5.3792, "step": 500 }, { "epoch": 0.2615062761506276, "eval_accuracy": 0.15595240798761018, "eval_loss": 4.976524353027344, "eval_runtime": 184.12, "eval_samples_per_second": 80.165, "eval_steps_per_second": 0.63, "step": 500 }, { "epoch": 0.3138075313807531, "eval_accuracy": 0.16177388065723808, "eval_loss": 4.927117347717285, "eval_runtime": 184.3296, "eval_samples_per_second": 80.074, "eval_steps_per_second": 0.629, "step": 600 }, { "epoch": 0.36610878661087864, "eval_accuracy": 0.16636529077842815, "eval_loss": 4.898035049438477, "eval_runtime": 184.3142, "eval_samples_per_second": 80.081, "eval_steps_per_second": 0.629, "step": 700 }, { "epoch": 0.41841004184100417, "eval_accuracy": 0.1694151293866953, "eval_loss": 4.875, "eval_runtime": 185.3011, "eval_samples_per_second": 79.654, "eval_steps_per_second": 0.626, "step": 800 }, { "epoch": 0.4707112970711297, "eval_accuracy": 0.17219956637036357, "eval_loss": 4.854589939117432, "eval_runtime": 184.8498, "eval_samples_per_second": 79.849, "eval_steps_per_second": 0.628, "step": 900 }, { "epoch": 0.5230125523012552, "grad_norm": 1.2265625, "learning_rate": 4.128312412831242e-05, "loss": 4.8385, "step": 1000 }, { "epoch": 0.5230125523012552, "eval_accuracy": 0.17474116808932638, "eval_loss": 4.833265781402588, "eval_runtime": 184.5693, "eval_samples_per_second": 79.97, "eval_steps_per_second": 0.628, "step": 1000 }, { "epoch": 0.5753138075313807, "eval_accuracy": 0.17643840085242252, "eval_loss": 4.817920207977295, "eval_runtime": 184.1405, "eval_samples_per_second": 80.156, "eval_steps_per_second": 0.63, "step": 1100 }, { "epoch": 0.6276150627615062, "eval_accuracy": 0.1773740269532399, "eval_loss": 4.811416149139404, "eval_runtime": 184.0435, "eval_samples_per_second": 80.198, "eval_steps_per_second": 0.63, "step": 1200 }, { "epoch": 0.6799163179916318, "eval_accuracy": 0.17846844188652472, "eval_loss": 4.802201747894287, "eval_runtime": 185.0497, "eval_samples_per_second": 79.762, "eval_steps_per_second": 0.627, "step": 1300 }, { "epoch": 0.7322175732217573, "eval_accuracy": 0.1789691379773809, "eval_loss": 4.7901930809021, "eval_runtime": 184.9855, "eval_samples_per_second": 79.79, "eval_steps_per_second": 0.627, "step": 1400 }, { "epoch": 0.7845188284518828, "grad_norm": 1.5703125, "learning_rate": 3.6924686192468624e-05, "loss": 4.7486, "step": 1500 }, { "epoch": 0.7845188284518828, "eval_accuracy": 0.18001693513769368, "eval_loss": 4.785585880279541, "eval_runtime": 183.9922, "eval_samples_per_second": 80.221, "eval_steps_per_second": 0.63, "step": 1500 }, { "epoch": 0.8368200836820083, "eval_accuracy": 0.18058760065826654, "eval_loss": 4.779539108276367, "eval_runtime": 184.1103, "eval_samples_per_second": 80.169, "eval_steps_per_second": 0.63, "step": 1600 }, { "epoch": 0.8891213389121339, "eval_accuracy": 0.18107188047030814, "eval_loss": 4.776101112365723, "eval_runtime": 184.1674, "eval_samples_per_second": 80.144, "eval_steps_per_second": 0.63, "step": 1700 }, { "epoch": 0.9414225941422594, "eval_accuracy": 0.18138128706525738, "eval_loss": 4.7754740715026855, "eval_runtime": 184.197, "eval_samples_per_second": 80.132, "eval_steps_per_second": 0.63, "step": 1800 }, { "epoch": 0.9937238493723849, "eval_accuracy": 0.18191716908996425, "eval_loss": 4.767343997955322, "eval_runtime": 184.0497, "eval_samples_per_second": 80.196, "eval_steps_per_second": 0.63, "step": 1900 }, { "epoch": 1.0460251046025104, "grad_norm": 1.203125, "learning_rate": 3.2566248256624825e-05, "loss": 4.7159, "step": 2000 }, { "epoch": 1.0460251046025104, "eval_accuracy": 0.18160189904678178, "eval_loss": 4.769783020019531, "eval_runtime": 185.2615, "eval_samples_per_second": 79.671, "eval_steps_per_second": 0.626, "step": 2000 }, { "epoch": 1.098326359832636, "eval_accuracy": 0.18223365637995678, "eval_loss": 4.763906002044678, "eval_runtime": 184.3991, "eval_samples_per_second": 80.044, "eval_steps_per_second": 0.629, "step": 2100 }, { "epoch": 1.1506276150627615, "eval_accuracy": 0.18256457647135646, "eval_loss": 4.761280536651611, "eval_runtime": 185.0808, "eval_samples_per_second": 79.749, "eval_steps_per_second": 0.627, "step": 2200 }, { "epoch": 1.202928870292887, "eval_accuracy": 0.18278985816542134, "eval_loss": 4.7557759284973145, "eval_runtime": 184.6355, "eval_samples_per_second": 79.941, "eval_steps_per_second": 0.628, "step": 2300 }, { "epoch": 1.2552301255230125, "eval_accuracy": 0.18294602847055624, "eval_loss": 4.75867223739624, "eval_runtime": 184.9626, "eval_samples_per_second": 79.8, "eval_steps_per_second": 0.627, "step": 2400 }, { "epoch": 1.3075313807531381, "grad_norm": 1.75, "learning_rate": 2.8207810320781032e-05, "loss": 4.6997, "step": 2500 }, { "epoch": 1.3075313807531381, "eval_accuracy": 0.18342178332552747, "eval_loss": 4.754149913787842, "eval_runtime": 185.1377, "eval_samples_per_second": 79.724, "eval_steps_per_second": 0.627, "step": 2500 }, { "epoch": 1.3598326359832635, "eval_accuracy": 0.1834138543952502, "eval_loss": 4.75181245803833, "eval_runtime": 184.6159, "eval_samples_per_second": 79.95, "eval_steps_per_second": 0.628, "step": 2600 }, { "epoch": 1.4121338912133892, "eval_accuracy": 0.1836629457159783, "eval_loss": 4.746477127075195, "eval_runtime": 185.4196, "eval_samples_per_second": 79.603, "eval_steps_per_second": 0.626, "step": 2700 }, { "epoch": 1.4644351464435146, "eval_accuracy": 0.18383634667304455, "eval_loss": 4.750728130340576, "eval_runtime": 184.604, "eval_samples_per_second": 79.955, "eval_steps_per_second": 0.628, "step": 2800 }, { "epoch": 1.5167364016736402, "eval_accuracy": 0.1834985251891202, "eval_loss": 4.751083850860596, "eval_runtime": 183.9829, "eval_samples_per_second": 80.225, "eval_steps_per_second": 0.63, "step": 2900 }, { "epoch": 1.5690376569037658, "grad_norm": 1.1328125, "learning_rate": 2.3849372384937242e-05, "loss": 4.6905, "step": 3000 }, { "epoch": 1.5690376569037658, "eval_accuracy": 0.18385013375825832, "eval_loss": 4.750813007354736, "eval_runtime": 184.3904, "eval_samples_per_second": 80.048, "eval_steps_per_second": 0.629, "step": 3000 }, { "epoch": 1.6213389121338913, "eval_accuracy": 0.18415784707342428, "eval_loss": 4.746849536895752, "eval_runtime": 184.2103, "eval_samples_per_second": 80.126, "eval_steps_per_second": 0.63, "step": 3100 }, { "epoch": 1.6736401673640167, "eval_accuracy": 0.1842263748450887, "eval_loss": 4.746747970581055, "eval_runtime": 184.1204, "eval_samples_per_second": 80.165, "eval_steps_per_second": 0.63, "step": 3200 }, { "epoch": 1.7259414225941423, "eval_accuracy": 0.18430792521293346, "eval_loss": 4.745037078857422, "eval_runtime": 184.2802, "eval_samples_per_second": 80.095, "eval_steps_per_second": 0.629, "step": 3300 }, { "epoch": 1.778242677824268, "eval_accuracy": 0.184430838994751, "eval_loss": 4.746375560760498, "eval_runtime": 184.234, "eval_samples_per_second": 80.116, "eval_steps_per_second": 0.63, "step": 3400 }, { "epoch": 1.8305439330543933, "grad_norm": 2.46875, "learning_rate": 1.9490934449093446e-05, "loss": 4.687, "step": 3500 }, { "epoch": 1.8305439330543933, "eval_accuracy": 0.18449599939000436, "eval_loss": 4.7423272132873535, "eval_runtime": 184.1596, "eval_samples_per_second": 80.148, "eval_steps_per_second": 0.63, "step": 3500 }, { "epoch": 1.8828451882845187, "eval_accuracy": 0.18466320271445863, "eval_loss": 4.74322509765625, "eval_runtime": 183.9907, "eval_samples_per_second": 80.221, "eval_steps_per_second": 0.63, "step": 3600 }, { "epoch": 1.9351464435146444, "eval_accuracy": 0.18432753476168612, "eval_loss": 4.744410514831543, "eval_runtime": 183.9473, "eval_samples_per_second": 80.24, "eval_steps_per_second": 0.631, "step": 3700 }, { "epoch": 1.98744769874477, "eval_accuracy": 0.18470474137931034, "eval_loss": 4.74097204208374, "eval_runtime": 184.0059, "eval_samples_per_second": 80.215, "eval_steps_per_second": 0.63, "step": 3800 }, { "epoch": 2.0397489539748954, "eval_accuracy": 0.1845896528685144, "eval_loss": 4.744495391845703, "eval_runtime": 184.1792, "eval_samples_per_second": 80.139, "eval_steps_per_second": 0.63, "step": 3900 }, { "epoch": 2.092050209205021, "grad_norm": 1.015625, "learning_rate": 1.5132496513249652e-05, "loss": 4.6822, "step": 4000 }, { "epoch": 2.092050209205021, "eval_accuracy": 0.1840621610356637, "eval_loss": 4.743766784667969, "eval_runtime": 184.0079, "eval_samples_per_second": 80.214, "eval_steps_per_second": 0.63, "step": 4000 }, { "epoch": 2.1443514644351462, "eval_accuracy": 0.18438914008407487, "eval_loss": 4.742242336273193, "eval_runtime": 184.052, "eval_samples_per_second": 80.195, "eval_steps_per_second": 0.63, "step": 4100 }, { "epoch": 2.196652719665272, "eval_accuracy": 0.18475792298529636, "eval_loss": 4.741429328918457, "eval_runtime": 184.719, "eval_samples_per_second": 79.905, "eval_steps_per_second": 0.628, "step": 4200 }, { "epoch": 2.2489539748953975, "eval_accuracy": 0.1848885987251326, "eval_loss": 4.740514755249023, "eval_runtime": 184.0592, "eval_samples_per_second": 80.192, "eval_steps_per_second": 0.63, "step": 4300 }, { "epoch": 2.301255230125523, "eval_accuracy": 0.18497058180948975, "eval_loss": 4.738888740539551, "eval_runtime": 184.7818, "eval_samples_per_second": 79.878, "eval_steps_per_second": 0.628, "step": 4400 }, { "epoch": 2.3535564853556483, "grad_norm": 1.5546875, "learning_rate": 1.0774058577405859e-05, "loss": 4.6787, "step": 4500 }, { "epoch": 2.3535564853556483, "eval_accuracy": 0.18458716879639453, "eval_loss": 4.743495941162109, "eval_runtime": 185.012, "eval_samples_per_second": 79.779, "eval_steps_per_second": 0.627, "step": 4500 }, { "epoch": 2.405857740585774, "eval_accuracy": 0.18485314383095383, "eval_loss": 4.742412090301514, "eval_runtime": 184.018, "eval_samples_per_second": 80.21, "eval_steps_per_second": 0.63, "step": 4600 }, { "epoch": 2.4581589958158996, "eval_accuracy": 0.18451762653729054, "eval_loss": 4.744495391845703, "eval_runtime": 184.1989, "eval_samples_per_second": 80.131, "eval_steps_per_second": 0.63, "step": 4700 }, { "epoch": 2.510460251046025, "eval_accuracy": 0.18499375944599727, "eval_loss": 4.742056369781494, "eval_runtime": 184.9544, "eval_samples_per_second": 79.803, "eval_steps_per_second": 0.627, "step": 4800 }, { "epoch": 2.562761506276151, "eval_accuracy": 0.1845635125940072, "eval_loss": 4.74495267868042, "eval_runtime": 184.45, "eval_samples_per_second": 80.022, "eval_steps_per_second": 0.629, "step": 4900 }, { "epoch": 2.6150627615062763, "grad_norm": 1.25, "learning_rate": 6.415620641562065e-06, "loss": 4.6809, "step": 5000 }, { "epoch": 2.6150627615062763, "eval_accuracy": 0.18461182940869522, "eval_loss": 4.739973068237305, "eval_runtime": 185.0928, "eval_samples_per_second": 79.744, "eval_steps_per_second": 0.627, "step": 5000 }, { "epoch": 2.6673640167364017, "eval_accuracy": 0.18471778348337312, "eval_loss": 4.740243911743164, "eval_runtime": 184.3979, "eval_samples_per_second": 80.044, "eval_steps_per_second": 0.629, "step": 5100 }, { "epoch": 2.719665271966527, "eval_accuracy": 0.18489124970131737, "eval_loss": 4.738804340362549, "eval_runtime": 184.6067, "eval_samples_per_second": 79.954, "eval_steps_per_second": 0.628, "step": 5200 }, { "epoch": 2.7719665271966525, "eval_accuracy": 0.18462476994845917, "eval_loss": 4.741514205932617, "eval_runtime": 184.2661, "eval_samples_per_second": 80.102, "eval_steps_per_second": 0.63, "step": 5300 }, { "epoch": 2.8242677824267783, "eval_accuracy": 0.18472434773026122, "eval_loss": 4.739160060882568, "eval_runtime": 185.2276, "eval_samples_per_second": 79.686, "eval_steps_per_second": 0.626, "step": 5400 }, { "epoch": 2.8765690376569037, "grad_norm": 1.6796875, "learning_rate": 2.057182705718271e-06, "loss": 4.6819, "step": 5500 }, { "epoch": 2.8765690376569037, "eval_accuracy": 0.18477532080101736, "eval_loss": 4.742056369781494, "eval_runtime": 184.4871, "eval_samples_per_second": 80.006, "eval_steps_per_second": 0.629, "step": 5500 }, { "epoch": 2.928870292887029, "eval_accuracy": 0.18478108276943064, "eval_loss": 4.740701198577881, "eval_runtime": 184.224, "eval_samples_per_second": 80.12, "eval_steps_per_second": 0.63, "step": 5600 } ], "logging_steps": 500, "max_steps": 5736, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 6.743272090868122e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }