|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.05951200158698671, |
|
"eval_steps": 25, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0007934933544931561, |
|
"grad_norm": 48.7606315612793, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 29.6105, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0007934933544931561, |
|
"eval_loss": 3.7225465774536133, |
|
"eval_runtime": 212.1774, |
|
"eval_samples_per_second": 5.005, |
|
"eval_steps_per_second": 2.503, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0015869867089863122, |
|
"grad_norm": 46.967803955078125, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 29.9806, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0023804800634794686, |
|
"grad_norm": 50.5672492980957, |
|
"learning_rate": 0.0002, |
|
"loss": 26.9378, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0031739734179726245, |
|
"grad_norm": 46.023780822753906, |
|
"learning_rate": 0.0001999048221581858, |
|
"loss": 21.9639, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.003967466772465781, |
|
"grad_norm": 31.295907974243164, |
|
"learning_rate": 0.00019961946980917456, |
|
"loss": 10.6674, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004760960126958937, |
|
"grad_norm": 23.777172088623047, |
|
"learning_rate": 0.00019914448613738106, |
|
"loss": 6.2121, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.005554453481452093, |
|
"grad_norm": 23.954105377197266, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 4.3349, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006347946835945249, |
|
"grad_norm": 7.861544132232666, |
|
"learning_rate": 0.00019762960071199333, |
|
"loss": 3.1486, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007141440190438405, |
|
"grad_norm": 6.988774299621582, |
|
"learning_rate": 0.00019659258262890683, |
|
"loss": 2.6602, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.007934933544931562, |
|
"grad_norm": 32.20113754272461, |
|
"learning_rate": 0.0001953716950748227, |
|
"loss": 4.0596, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008728426899424718, |
|
"grad_norm": 14.095051765441895, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 2.995, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.009521920253917874, |
|
"grad_norm": 13.5059232711792, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 3.2571, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.010315413608411029, |
|
"grad_norm": 7.761572360992432, |
|
"learning_rate": 0.000190630778703665, |
|
"loss": 3.0002, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.011108906962904185, |
|
"grad_norm": 4.439308166503906, |
|
"learning_rate": 0.00018870108331782217, |
|
"loss": 2.6362, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.011902400317397342, |
|
"grad_norm": 6.415318489074707, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 3.1925, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.012695893671890498, |
|
"grad_norm": 6.409482955932617, |
|
"learning_rate": 0.0001843391445812886, |
|
"loss": 3.062, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.013489387026383654, |
|
"grad_norm": 3.2289886474609375, |
|
"learning_rate": 0.0001819152044288992, |
|
"loss": 2.8807, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.01428288038087681, |
|
"grad_norm": 7.170831680297852, |
|
"learning_rate": 0.00017933533402912354, |
|
"loss": 3.1952, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.015076373735369967, |
|
"grad_norm": 8.997602462768555, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 2.9534, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.015869867089863123, |
|
"grad_norm": 18.465681076049805, |
|
"learning_rate": 0.0001737277336810124, |
|
"loss": 6.0253, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01666336044435628, |
|
"grad_norm": 6.664717674255371, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 3.1157, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.017456853798849436, |
|
"grad_norm": 6.979747772216797, |
|
"learning_rate": 0.00016755902076156604, |
|
"loss": 3.0816, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.018250347153342592, |
|
"grad_norm": 4.373898506164551, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 2.921, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.01904384050783575, |
|
"grad_norm": 3.7028324604034424, |
|
"learning_rate": 0.00016087614290087208, |
|
"loss": 2.8931, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0198373338623289, |
|
"grad_norm": 2.8346545696258545, |
|
"learning_rate": 0.0001573576436351046, |
|
"loss": 2.8573, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0198373338623289, |
|
"eval_loss": 0.37545695900917053, |
|
"eval_runtime": 213.9659, |
|
"eval_samples_per_second": 4.963, |
|
"eval_steps_per_second": 2.482, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.020630827216822058, |
|
"grad_norm": 3.2634661197662354, |
|
"learning_rate": 0.0001537299608346824, |
|
"loss": 2.9078, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.021424320571315214, |
|
"grad_norm": 4.43280553817749, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 3.0779, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.02221781392580837, |
|
"grad_norm": 3.53800106048584, |
|
"learning_rate": 0.00014617486132350343, |
|
"loss": 2.9961, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.023011307280301527, |
|
"grad_norm": 2.2324697971343994, |
|
"learning_rate": 0.00014226182617406996, |
|
"loss": 2.7108, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.023804800634794683, |
|
"grad_norm": 3.370065212249756, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 2.9733, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02459829398928784, |
|
"grad_norm": 2.1509768962860107, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 2.7909, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.025391787343780996, |
|
"grad_norm": 3.518296957015991, |
|
"learning_rate": 0.00013007057995042732, |
|
"loss": 3.0504, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.026185280698274152, |
|
"grad_norm": 2.118720054626465, |
|
"learning_rate": 0.00012588190451025207, |
|
"loss": 2.8059, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.02697877405276731, |
|
"grad_norm": 2.4879894256591797, |
|
"learning_rate": 0.00012164396139381029, |
|
"loss": 2.8901, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.027772267407260465, |
|
"grad_norm": 2.6533515453338623, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 2.8234, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02856576076175362, |
|
"grad_norm": 8.765356063842773, |
|
"learning_rate": 0.00011305261922200519, |
|
"loss": 4.3363, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.029359254116246777, |
|
"grad_norm": 2.3454673290252686, |
|
"learning_rate": 0.00010871557427476583, |
|
"loss": 3.0552, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.030152747470739934, |
|
"grad_norm": 1.8576560020446777, |
|
"learning_rate": 0.00010436193873653361, |
|
"loss": 2.8869, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.03094624082523309, |
|
"grad_norm": 2.4082350730895996, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8681, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.031739734179726246, |
|
"grad_norm": 2.3808937072753906, |
|
"learning_rate": 9.563806126346642e-05, |
|
"loss": 2.8245, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0325332275342194, |
|
"grad_norm": 1.8626981973648071, |
|
"learning_rate": 9.128442572523417e-05, |
|
"loss": 2.8774, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03332672088871256, |
|
"grad_norm": 2.4850103855133057, |
|
"learning_rate": 8.694738077799488e-05, |
|
"loss": 2.9511, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.034120214243205715, |
|
"grad_norm": 2.5530030727386475, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 2.7849, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03491370759769887, |
|
"grad_norm": 3.354628562927246, |
|
"learning_rate": 7.835603860618972e-05, |
|
"loss": 2.8894, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03570720095219203, |
|
"grad_norm": 2.490819215774536, |
|
"learning_rate": 7.411809548974792e-05, |
|
"loss": 2.8177, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.036500694306685184, |
|
"grad_norm": 4.005395412445068, |
|
"learning_rate": 6.992942004957271e-05, |
|
"loss": 3.0671, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03729418766117834, |
|
"grad_norm": 2.593477725982666, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 2.9372, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0380876810156715, |
|
"grad_norm": 4.0660319328308105, |
|
"learning_rate": 6.173165676349103e-05, |
|
"loss": 2.9659, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.038881174370164646, |
|
"grad_norm": 3.1026699542999268, |
|
"learning_rate": 5.773817382593008e-05, |
|
"loss": 3.0151, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0396746677246578, |
|
"grad_norm": 6.865238189697266, |
|
"learning_rate": 5.382513867649663e-05, |
|
"loss": 2.9836, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0396746677246578, |
|
"eval_loss": 0.3641993999481201, |
|
"eval_runtime": 214.9129, |
|
"eval_samples_per_second": 4.942, |
|
"eval_steps_per_second": 2.471, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04046816107915096, |
|
"grad_norm": 3.206982374191284, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 2.8024, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.041261654433644115, |
|
"grad_norm": 2.6523642539978027, |
|
"learning_rate": 4.6270039165317605e-05, |
|
"loss": 2.9125, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.04205514778813727, |
|
"grad_norm": 2.145852565765381, |
|
"learning_rate": 4.264235636489542e-05, |
|
"loss": 2.7628, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.04284864114263043, |
|
"grad_norm": 1.956112027168274, |
|
"learning_rate": 3.9123857099127936e-05, |
|
"loss": 2.829, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.043642134497123584, |
|
"grad_norm": 3.5577127933502197, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 2.9393, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04443562785161674, |
|
"grad_norm": 2.462873697280884, |
|
"learning_rate": 3.244097923843398e-05, |
|
"loss": 2.9588, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0452291212061099, |
|
"grad_norm": 1.5341908931732178, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 2.7976, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.04602261456060305, |
|
"grad_norm": 3.1381309032440186, |
|
"learning_rate": 2.6272266318987603e-05, |
|
"loss": 2.8753, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.04681610791509621, |
|
"grad_norm": 3.1369926929473877, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 2.8046, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.047609601269589366, |
|
"grad_norm": 3.5989041328430176, |
|
"learning_rate": 2.0664665970876496e-05, |
|
"loss": 3.0486, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04840309462408252, |
|
"grad_norm": 3.038649559020996, |
|
"learning_rate": 1.808479557110081e-05, |
|
"loss": 2.8922, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.04919658797857568, |
|
"grad_norm": 3.415806293487549, |
|
"learning_rate": 1.566085541871145e-05, |
|
"loss": 2.7841, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.049990081333068835, |
|
"grad_norm": 1.7945884466171265, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 2.9487, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.05078357468756199, |
|
"grad_norm": 2.6868271827697754, |
|
"learning_rate": 1.129891668217783e-05, |
|
"loss": 2.8241, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.05157706804205515, |
|
"grad_norm": 2.454937219619751, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 2.7385, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.052370561396548304, |
|
"grad_norm": 2.422858953475952, |
|
"learning_rate": 7.612046748871327e-06, |
|
"loss": 2.9928, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.05316405475104146, |
|
"grad_norm": 2.9807822704315186, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 2.873, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.05395754810553462, |
|
"grad_norm": 2.892643451690674, |
|
"learning_rate": 4.628304925177318e-06, |
|
"loss": 2.8405, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.05475104146002777, |
|
"grad_norm": 2.1034739017486572, |
|
"learning_rate": 3.40741737109318e-06, |
|
"loss": 2.7657, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.05554453481452093, |
|
"grad_norm": 2.7781882286071777, |
|
"learning_rate": 2.3703992880066638e-06, |
|
"loss": 2.9382, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.056338028169014086, |
|
"grad_norm": 1.8283110857009888, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 2.809, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.05713152152350724, |
|
"grad_norm": 3.2760870456695557, |
|
"learning_rate": 8.555138626189618e-07, |
|
"loss": 2.9855, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0579250148780004, |
|
"grad_norm": 2.1711838245391846, |
|
"learning_rate": 3.805301908254455e-07, |
|
"loss": 2.914, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.058718508232493555, |
|
"grad_norm": 1.9169692993164062, |
|
"learning_rate": 9.517784181422019e-08, |
|
"loss": 2.868, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.05951200158698671, |
|
"grad_norm": 2.8653454780578613, |
|
"learning_rate": 0.0, |
|
"loss": 2.9482, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05951200158698671, |
|
"eval_loss": 0.36626261472702026, |
|
"eval_runtime": 214.2761, |
|
"eval_samples_per_second": 4.956, |
|
"eval_steps_per_second": 2.478, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.80187302854656e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|