{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9980031062791213,
  "eval_steps": 500,
  "global_step": 3378,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.02, "grad_norm": 3.863227128982544, "learning_rate": 2.0414201183431953e-06, "loss": 10.8959, "step": 25},
    {"epoch": 0.04, "grad_norm": 3.9202582836151123, "learning_rate": 4.171597633136095e-06, "loss": 10.7254, "step": 50},
    {"epoch": 0.07, "grad_norm": 3.297222375869751, "learning_rate": 6.390532544378699e-06, "loss": 10.3548, "step": 75},
    {"epoch": 0.09, "grad_norm": 2.734739065170288, "learning_rate": 8.609467455621303e-06, "loss": 9.7686, "step": 100},
    {"epoch": 0.11, "grad_norm": 2.3171167373657227, "learning_rate": 1.0828402366863907e-05, "loss": 8.8642, "step": 125},
    {"epoch": 0.13, "grad_norm": 1.8683223724365234, "learning_rate": 1.304733727810651e-05, "loss": 7.7751, "step": 150},
    {"epoch": 0.16, "grad_norm": 1.3291046619415283, "learning_rate": 1.5266272189349113e-05, "loss": 7.0121, "step": 175},
    {"epoch": 0.18, "grad_norm": 1.4056954383850098, "learning_rate": 1.7485207100591714e-05, "loss": 6.3242, "step": 200},
    {"epoch": 0.2, "grad_norm": 1.1085917949676514, "learning_rate": 1.9704142011834322e-05, "loss": 5.7192, "step": 225},
    {"epoch": 0.22, "grad_norm": 1.3172917366027832, "learning_rate": 2.1923076923076924e-05, "loss": 5.0999, "step": 250},
    {"epoch": 0.24, "grad_norm": 1.0901626348495483, "learning_rate": 2.414201183431953e-05, "loss": 4.6548, "step": 275},
    {"epoch": 0.27, "grad_norm": 1.2347195148468018, "learning_rate": 2.6360946745562133e-05, "loss": 4.3473, "step": 300},
    {"epoch": 0.29, "grad_norm": 1.0249868631362915, "learning_rate": 2.8579881656804735e-05, "loss": 4.0633, "step": 325},
    {"epoch": 0.31, "grad_norm": 1.2612847089767456, "learning_rate": 2.9911184210526318e-05, "loss": 3.8203, "step": 350},
    {"epoch": 0.33, "grad_norm": 1.0956445932388306, "learning_rate": 2.9664473684210528e-05, "loss": 3.5213, "step": 375},
    {"epoch": 0.36, "grad_norm": 1.187477946281433, "learning_rate": 2.9417763157894738e-05, "loss": 3.4107, "step": 400},
    {"epoch": 0.38, "grad_norm": 1.2866570949554443, "learning_rate": 2.9171052631578948e-05, "loss": 3.2937, "step": 425},
    {"epoch": 0.4, "grad_norm": 1.3654049634933472, "learning_rate": 2.8924342105263158e-05, "loss": 3.148, "step": 450},
    {"epoch": 0.42, "grad_norm": 3.0226898193359375, "learning_rate": 2.8677631578947368e-05, "loss": 2.992, "step": 475},
    {"epoch": 0.44, "grad_norm": 1.858105182647705, "learning_rate": 2.843092105263158e-05, "loss": 2.8713, "step": 500},
    {"epoch": 0.47, "grad_norm": 2.1696932315826416, "learning_rate": 2.818421052631579e-05, "loss": 2.6683, "step": 525},
    {"epoch": 0.49, "grad_norm": 4.214972972869873, "learning_rate": 2.79375e-05, "loss": 2.5497, "step": 550},
    {"epoch": 0.51, "grad_norm": 2.712171792984009, "learning_rate": 2.769078947368421e-05, "loss": 2.5092, "step": 575},
    {"epoch": 0.53, "grad_norm": 3.073754072189331, "learning_rate": 2.744407894736842e-05, "loss": 2.3336, "step": 600},
    {"epoch": 0.55, "grad_norm": 3.3905112743377686, "learning_rate": 2.719736842105263e-05, "loss": 2.2231, "step": 625},
    {"epoch": 0.58, "grad_norm": 2.6293680667877197, "learning_rate": 2.695065789473684e-05, "loss": 2.149, "step": 650},
    {"epoch": 0.6, "grad_norm": 2.6342837810516357, "learning_rate": 2.6703947368421052e-05, "loss": 2.1733, "step": 675},
    {"epoch": 0.62, "grad_norm": 5.107319355010986, "learning_rate": 2.6457236842105262e-05, "loss": 2.0175, "step": 700},
    {"epoch": 0.64, "grad_norm": 3.6173806190490723, "learning_rate": 2.6210526315789475e-05, "loss": 1.9354, "step": 725},
    {"epoch": 0.67, "grad_norm": 2.709559440612793, "learning_rate": 2.5963815789473685e-05, "loss": 1.8339, "step": 750},
    {"epoch": 0.69, "grad_norm": 2.523397922515869, "learning_rate": 2.5717105263157895e-05, "loss": 1.9872, "step": 775},
    {"epoch": 0.71, "grad_norm": 2.3423144817352295, "learning_rate": 2.5470394736842105e-05, "loss": 1.8357, "step": 800},
    {"epoch": 0.73, "grad_norm": 2.349581241607666, "learning_rate": 2.5223684210526315e-05, "loss": 1.7938, "step": 825},
    {"epoch": 0.75, "grad_norm": 3.185107707977295, "learning_rate": 2.4976973684210526e-05, "loss": 1.7019, "step": 850},
    {"epoch": 0.78, "grad_norm": 1.9526580572128296, "learning_rate": 2.4730263157894736e-05, "loss": 1.7636, "step": 875},
    {"epoch": 0.8, "grad_norm": 2.120420455932617, "learning_rate": 2.4483552631578946e-05, "loss": 1.7237, "step": 900},
    {"epoch": 0.82, "grad_norm": 2.198755979537964, "learning_rate": 2.4236842105263156e-05, "loss": 1.6563, "step": 925},
    {"epoch": 0.84, "grad_norm": 2.3641481399536133, "learning_rate": 2.3990131578947366e-05, "loss": 1.7045, "step": 950},
    {"epoch": 0.87, "grad_norm": 2.742950916290283, "learning_rate": 2.3743421052631583e-05, "loss": 1.6134, "step": 975},
    {"epoch": 0.89, "grad_norm": 2.467517614364624, "learning_rate": 2.3496710526315793e-05, "loss": 1.5613, "step": 1000},
    {"epoch": 0.91, "grad_norm": 1.6223593950271606, "learning_rate": 2.3250000000000003e-05, "loss": 1.6858, "step": 1025},
    {"epoch": 0.93, "grad_norm": 2.8574585914611816, "learning_rate": 2.3003289473684213e-05, "loss": 1.5701, "step": 1050},
    {"epoch": 0.95, "grad_norm": 2.3951303958892822, "learning_rate": 2.2756578947368423e-05, "loss": 1.4515, "step": 1075},
    {"epoch": 0.98, "grad_norm": 2.0053939819335938, "learning_rate": 2.2509868421052633e-05, "loss": 1.4744, "step": 1100},
    {"epoch": 1.0, "grad_norm": 1.9645187854766846, "learning_rate": 2.2263157894736843e-05, "loss": 1.4719, "step": 1125},
    {"epoch": 1.02, "grad_norm": 1.9311598539352417, "learning_rate": 2.2016447368421053e-05, "loss": 1.5421, "step": 1150},
    {"epoch": 1.04, "grad_norm": 2.094773054122925, "learning_rate": 2.1769736842105263e-05, "loss": 1.4853, "step": 1175},
    {"epoch": 1.07, "grad_norm": 1.9455292224884033, "learning_rate": 2.1523026315789476e-05, "loss": 1.4912, "step": 1200},
    {"epoch": 1.09, "grad_norm": 2.0593693256378174, "learning_rate": 2.1276315789473687e-05, "loss": 1.3928, "step": 1225},
    {"epoch": 1.11, "grad_norm": 1.7141344547271729, "learning_rate": 2.1029605263157897e-05, "loss": 1.3613, "step": 1250},
    {"epoch": 1.13, "grad_norm": 2.213785409927368, "learning_rate": 2.0782894736842107e-05, "loss": 1.3813, "step": 1275},
    {"epoch": 1.15, "grad_norm": 2.477902412414551, "learning_rate": 2.0536184210526317e-05, "loss": 1.4228, "step": 1300},
    {"epoch": 1.18, "grad_norm": 1.996187686920166, "learning_rate": 2.0289473684210527e-05, "loss": 1.3857, "step": 1325},
    {"epoch": 1.2, "grad_norm": 1.7344149351119995, "learning_rate": 2.0042763157894737e-05, "loss": 1.341, "step": 1350},
    {"epoch": 1.22, "grad_norm": 2.0151705741882324, "learning_rate": 1.9796052631578947e-05, "loss": 1.3863, "step": 1375},
    {"epoch": 1.24, "grad_norm": 1.9713648557662964, "learning_rate": 1.9549342105263157e-05, "loss": 1.397, "step": 1400},
    {"epoch": 1.26, "grad_norm": 1.835036039352417, "learning_rate": 1.9302631578947367e-05, "loss": 1.4165, "step": 1425},
    {"epoch": 1.29, "grad_norm": 2.019484043121338, "learning_rate": 1.905592105263158e-05, "loss": 1.3714, "step": 1450},
    {"epoch": 1.31, "grad_norm": 1.829540491104126, "learning_rate": 1.880921052631579e-05, "loss": 1.3149, "step": 1475},
    {"epoch": 1.33, "grad_norm": 1.961879014968872, "learning_rate": 1.85625e-05, "loss": 1.3801, "step": 1500},
    {"epoch": 1.35, "grad_norm": 2.2035787105560303, "learning_rate": 1.831578947368421e-05, "loss": 1.3486, "step": 1525},
    {"epoch": 1.38, "grad_norm": 1.8642491102218628, "learning_rate": 1.806907894736842e-05, "loss": 1.3207, "step": 1550},
    {"epoch": 1.4, "grad_norm": 1.992825984954834, "learning_rate": 1.782236842105263e-05, "loss": 1.3175, "step": 1575},
    {"epoch": 1.42, "grad_norm": 2.385167121887207, "learning_rate": 1.757565789473684e-05, "loss": 1.286, "step": 1600},
    {"epoch": 1.44, "grad_norm": 1.328168272972107, "learning_rate": 1.732894736842105e-05, "loss": 1.2824, "step": 1625},
    {"epoch": 1.46, "grad_norm": 1.647040605545044, "learning_rate": 1.708223684210526e-05, "loss": 1.3084, "step": 1650},
    {"epoch": 1.49, "grad_norm": 1.4075373411178589, "learning_rate": 1.6835526315789474e-05, "loss": 1.3287, "step": 1675},
    {"epoch": 1.51, "grad_norm": 1.74918794631958, "learning_rate": 1.6588815789473684e-05, "loss": 1.3314, "step": 1700},
    {"epoch": 1.53, "grad_norm": 1.8052846193313599, "learning_rate": 1.6342105263157894e-05, "loss": 1.2832, "step": 1725},
    {"epoch": 1.55, "grad_norm": 1.4929707050323486, "learning_rate": 1.6095394736842105e-05, "loss": 1.2565, "step": 1750},
    {"epoch": 1.58, "grad_norm": 1.5688847303390503, "learning_rate": 1.5848684210526318e-05, "loss": 1.2811, "step": 1775},
    {"epoch": 1.6, "grad_norm": 1.8629792928695679, "learning_rate": 1.5601973684210528e-05, "loss": 1.3485, "step": 1800},
    {"epoch": 1.62, "grad_norm": 1.946643352508545, "learning_rate": 1.5355263157894738e-05, "loss": 1.3489, "step": 1825},
    {"epoch": 1.64, "grad_norm": 1.6834330558776855, "learning_rate": 1.5108552631578946e-05, "loss": 1.2513, "step": 1850},
    {"epoch": 1.66, "grad_norm": 2.3857834339141846, "learning_rate": 1.4861842105263158e-05, "loss": 1.263, "step": 1875},
    {"epoch": 1.69, "grad_norm": 1.413714051246643, "learning_rate": 1.4615131578947368e-05, "loss": 1.2398, "step": 1900},
    {"epoch": 1.71, "grad_norm": 1.75337815284729, "learning_rate": 1.4368421052631578e-05, "loss": 1.3227, "step": 1925},
    {"epoch": 1.73, "grad_norm": 2.1332099437713623, "learning_rate": 1.4121710526315788e-05, "loss": 1.3663, "step": 1950},
    {"epoch": 1.75, "grad_norm": 2.0604429244995117, "learning_rate": 1.3875000000000002e-05, "loss": 1.1767, "step": 1975},
    {"epoch": 1.78, "grad_norm": 1.452750325202942, "learning_rate": 1.3628289473684212e-05, "loss": 1.2854, "step": 2000},
    {"epoch": 1.8, "grad_norm": 1.8815256357192993, "learning_rate": 1.3381578947368422e-05, "loss": 1.2671, "step": 2025},
    {"epoch": 1.82, "grad_norm": 1.517378568649292, "learning_rate": 1.3134868421052632e-05, "loss": 1.2302, "step": 2050},
    {"epoch": 1.84, "grad_norm": 1.5040243864059448, "learning_rate": 1.2888157894736842e-05, "loss": 1.2802, "step": 2075},
    {"epoch": 1.86, "grad_norm": 1.7564135789871216, "learning_rate": 1.2641447368421054e-05, "loss": 1.2337, "step": 2100},
    {"epoch": 1.89, "grad_norm": 1.5423864126205444, "learning_rate": 1.2394736842105264e-05, "loss": 1.2374, "step": 2125},
    {"epoch": 1.91, "grad_norm": 1.9401406049728394, "learning_rate": 1.2148026315789474e-05, "loss": 1.2256, "step": 2150},
    {"epoch": 1.93, "grad_norm": 2.3875536918640137, "learning_rate": 1.1901315789473684e-05, "loss": 1.2301, "step": 2175},
    {"epoch": 1.95, "grad_norm": 1.7349203824996948, "learning_rate": 1.1654605263157894e-05, "loss": 1.2379, "step": 2200},
    {"epoch": 1.97, "grad_norm": 2.1179356575012207, "learning_rate": 1.1407894736842106e-05, "loss": 1.2672, "step": 2225},
    {"epoch": 2.0, "grad_norm": 1.8508802652359009, "learning_rate": 1.1161184210526316e-05, "loss": 1.2543, "step": 2250},
    {"epoch": 2.02, "grad_norm": 1.7846442461013794, "learning_rate": 1.0914473684210526e-05, "loss": 1.1996, "step": 2275},
    {"epoch": 2.04, "grad_norm": 1.691226601600647, "learning_rate": 1.0667763157894736e-05, "loss": 1.2433, "step": 2300},
    {"epoch": 2.06, "grad_norm": 1.7513465881347656, "learning_rate": 1.0421052631578948e-05, "loss": 1.2417, "step": 2325},
    {"epoch": 2.09, "grad_norm": 2.2930405139923096, "learning_rate": 1.0174342105263158e-05, "loss": 1.2202, "step": 2350},
    {"epoch": 2.11, "grad_norm": 1.8147985935211182, "learning_rate": 9.92763157894737e-06, "loss": 1.2027, "step": 2375},
    {"epoch": 2.13, "grad_norm": 2.118342161178589, "learning_rate": 9.68092105263158e-06, "loss": 1.2547, "step": 2400},
    {"epoch": 2.15, "grad_norm": 1.2660759687423706, "learning_rate": 9.43421052631579e-06, "loss": 1.185, "step": 2425},
    {"epoch": 2.17, "grad_norm": 1.54926598072052, "learning_rate": 9.187500000000001e-06, "loss": 1.2254, "step": 2450},
    {"epoch": 2.2, "grad_norm": 1.9687325954437256, "learning_rate": 8.940789473684211e-06, "loss": 1.217, "step": 2475},
    {"epoch": 2.22, "grad_norm": 1.6193515062332153, "learning_rate": 8.694078947368422e-06, "loss": 1.2075, "step": 2500},
    {"epoch": 2.24, "grad_norm": 1.7565534114837646, "learning_rate": 8.447368421052632e-06, "loss": 1.2187, "step": 2525},
    {"epoch": 2.26, "grad_norm": 1.8760586977005005, "learning_rate": 8.200657894736842e-06, "loss": 1.1957, "step": 2550},
    {"epoch": 2.29, "grad_norm": 5.671737194061279, "learning_rate": 7.963815789473685e-06, "loss": 1.156, "step": 2575},
    {"epoch": 2.31, "grad_norm": 2.1643118858337402, "learning_rate": 7.717105263157895e-06, "loss": 1.2163, "step": 2600},
    {"epoch": 2.33, "grad_norm": 1.6212981939315796, "learning_rate": 7.470394736842106e-06, "loss": 1.1774, "step": 2625},
    {"epoch": 2.35, "grad_norm": 1.421762466430664, "learning_rate": 7.223684210526316e-06, "loss": 1.2097, "step": 2650},
    {"epoch": 2.37, "grad_norm": 1.7923425436019897, "learning_rate": 6.976973684210526e-06, "loss": 1.2083, "step": 2675},
    {"epoch": 2.4, "grad_norm": 1.4467811584472656, "learning_rate": 6.730263157894737e-06, "loss": 1.1972, "step": 2700},
    {"epoch": 2.42, "grad_norm": 1.48775315284729, "learning_rate": 6.483552631578947e-06, "loss": 1.1967, "step": 2725},
    {"epoch": 2.44, "grad_norm": 1.6732630729675293, "learning_rate": 6.236842105263159e-06, "loss": 1.1978, "step": 2750},
    {"epoch": 2.46, "grad_norm": 1.5414642095565796, "learning_rate": 5.990131578947369e-06, "loss": 1.2124, "step": 2775},
    {"epoch": 2.49, "grad_norm": 1.6581816673278809, "learning_rate": 5.74342105263158e-06, "loss": 1.2193, "step": 2800},
    {"epoch": 2.51, "grad_norm": 1.688584327697754, "learning_rate": 5.49671052631579e-06, "loss": 1.1672, "step": 2825},
    {"epoch": 2.53, "grad_norm": 2.130878210067749, "learning_rate": 5.25e-06, "loss": 1.1778, "step": 2850},
    {"epoch": 2.55, "grad_norm": 2.325465679168701, "learning_rate": 5.003289473684211e-06, "loss": 1.2189, "step": 2875},
    {"epoch": 2.57, "grad_norm": 1.1922625303268433, "learning_rate": 4.756578947368421e-06, "loss": 1.1062, "step": 2900},
    {"epoch": 2.6, "grad_norm": 1.36138117313385, "learning_rate": 4.5098684210526316e-06, "loss": 1.2074, "step": 2925},
    {"epoch": 2.62, "grad_norm": 1.736745834350586, "learning_rate": 4.2631578947368425e-06, "loss": 1.1532, "step": 2950},
    {"epoch": 2.64, "grad_norm": 1.8273382186889648, "learning_rate": 4.016447368421053e-06, "loss": 1.1977, "step": 2975},
    {"epoch": 2.66, "grad_norm": 1.504334568977356, "learning_rate": 3.7697368421052634e-06, "loss": 1.1679, "step": 3000},
    {"epoch": 2.68, "grad_norm": 1.4739781618118286, "learning_rate": 3.523026315789474e-06, "loss": 1.1503, "step": 3025},
    {"epoch": 2.71, "grad_norm": 1.9392614364624023, "learning_rate": 3.2763157894736844e-06, "loss": 1.126, "step": 3050},
    {"epoch": 2.73, "grad_norm": 1.8378140926361084, "learning_rate": 3.0296052631578945e-06, "loss": 1.242, "step": 3075},
    {"epoch": 2.75, "grad_norm": 1.7033443450927734, "learning_rate": 2.7828947368421054e-06, "loss": 1.1892, "step": 3100},
    {"epoch": 2.77, "grad_norm": 1.5492761135101318, "learning_rate": 2.536184210526316e-06, "loss": 1.1594, "step": 3125},
    {"epoch": 2.8, "grad_norm": 1.5671361684799194, "learning_rate": 2.2894736842105263e-06, "loss": 1.179, "step": 3150},
    {"epoch": 2.82, "grad_norm": 1.7326922416687012, "learning_rate": 2.042763157894737e-06, "loss": 1.2256, "step": 3175},
    {"epoch": 2.84, "grad_norm": 1.4645403623580933, "learning_rate": 1.7960526315789473e-06, "loss": 1.1041, "step": 3200},
    {"epoch": 2.86, "grad_norm": 1.4518282413482666, "learning_rate": 1.549342105263158e-06, "loss": 1.1979, "step": 3225},
    {"epoch": 2.88, "grad_norm": 1.488720417022705, "learning_rate": 1.3026315789473685e-06, "loss": 1.1615, "step": 3250},
    {"epoch": 2.91, "grad_norm": 1.3193929195404053, "learning_rate": 1.055921052631579e-06, "loss": 1.1275, "step": 3275},
    {"epoch": 2.93, "grad_norm": 1.5246660709381104, "learning_rate": 8.092105263157895e-07, "loss": 1.1317, "step": 3300},
    {"epoch": 2.95, "grad_norm": 1.8375893831253052, "learning_rate": 5.625e-07, "loss": 1.1291, "step": 3325},
    {"epoch": 2.97, "grad_norm": 2.09460186958313, "learning_rate": 3.1578947368421055e-07, "loss": 1.2138, "step": 3350},
    {"epoch": 3.0, "grad_norm": 1.9802443981170654, "learning_rate": 6.907894736842104e-08, "loss": 1.1591, "step": 3375}
  ],
  "logging_steps": 25,
  "max_steps": 3378,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 5.978827851300864e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}