{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02247191011235955,
      "grad_norm": 0.2744160592556,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.7619,
      "step": 1
    },
    {
      "epoch": 0.0449438202247191,
      "grad_norm": 0.32727673649787903,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.8704,
      "step": 2
    },
    {
      "epoch": 0.06741573033707865,
      "grad_norm": 0.22079522907733917,
      "learning_rate": 3e-06,
      "loss": 0.7241,
      "step": 3
    },
    {
      "epoch": 0.0898876404494382,
      "grad_norm": 0.3042285144329071,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.9591,
      "step": 4
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 0.28325748443603516,
      "learning_rate": 5e-06,
      "loss": 1.061,
      "step": 5
    },
    {
      "epoch": 0.1348314606741573,
      "grad_norm": 0.29770785570144653,
      "learning_rate": 6e-06,
      "loss": 1.0407,
      "step": 6
    },
    {
      "epoch": 0.15730337078651685,
      "grad_norm": 0.2635294198989868,
      "learning_rate": 7e-06,
      "loss": 0.9524,
      "step": 7
    },
    {
      "epoch": 0.1797752808988764,
      "grad_norm": 0.2629376947879791,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.0095,
      "step": 8
    },
    {
      "epoch": 0.20224719101123595,
      "grad_norm": 0.30229848623275757,
      "learning_rate": 9e-06,
      "loss": 0.7805,
      "step": 9
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 0.2629299759864807,
      "learning_rate": 1e-05,
      "loss": 0.8706,
      "step": 10
    },
    {
      "epoch": 0.24719101123595505,
      "grad_norm": 0.26694297790527344,
      "learning_rate": 9.999146252290264e-06,
      "loss": 0.9653,
      "step": 11
    },
    {
      "epoch": 0.2696629213483146,
      "grad_norm": 0.2794544994831085,
      "learning_rate": 9.996585300715117e-06,
      "loss": 0.7668,
      "step": 12
    },
    {
      "epoch": 0.29213483146067415,
      "grad_norm": 0.30685344338417053,
      "learning_rate": 9.992318019837171e-06,
      "loss": 0.8074,
      "step": 13
    },
    {
      "epoch": 0.3146067415730337,
      "grad_norm": 0.3292616605758667,
      "learning_rate": 9.98634586692894e-06,
      "loss": 0.8413,
      "step": 14
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 0.34031111001968384,
      "learning_rate": 9.978670881475173e-06,
      "loss": 0.6885,
      "step": 15
    },
    {
      "epoch": 0.3595505617977528,
      "grad_norm": 0.31020820140838623,
      "learning_rate": 9.96929568447637e-06,
      "loss": 1.0567,
      "step": 16
    },
    {
      "epoch": 0.38202247191011235,
      "grad_norm": 0.28236857056617737,
      "learning_rate": 9.958223477553715e-06,
      "loss": 0.7969,
      "step": 17
    },
    {
      "epoch": 0.4044943820224719,
      "grad_norm": 0.3202574551105499,
      "learning_rate": 9.945458041855732e-06,
      "loss": 0.8376,
      "step": 18
    },
    {
      "epoch": 0.42696629213483145,
      "grad_norm": 0.2797839045524597,
      "learning_rate": 9.931003736767013e-06,
      "loss": 0.8911,
      "step": 19
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 0.3114033639431,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.9042,
      "step": 20
    },
    {
      "epoch": 0.47191011235955055,
      "grad_norm": 0.288392037153244,
      "learning_rate": 9.89704883800683e-06,
      "loss": 0.9409,
      "step": 21
    },
    {
      "epoch": 0.4943820224719101,
      "grad_norm": 0.31594759225845337,
      "learning_rate": 9.877559839902185e-06,
      "loss": 0.7856,
      "step": 22
    },
    {
      "epoch": 0.5168539325842697,
      "grad_norm": 0.33034929633140564,
      "learning_rate": 9.85640515958057e-06,
      "loss": 0.8219,
      "step": 23
    },
    {
      "epoch": 0.5393258426966292,
      "grad_norm": 0.3292032778263092,
      "learning_rate": 9.833592021345938e-06,
      "loss": 0.7739,
      "step": 24
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 0.2713082432746887,
      "learning_rate": 9.809128215864096e-06,
      "loss": 0.6795,
      "step": 25
    },
    {
      "epoch": 0.5842696629213483,
      "grad_norm": 0.28911784291267395,
      "learning_rate": 9.783022097502204e-06,
      "loss": 0.8294,
      "step": 26
    },
    {
      "epoch": 0.6067415730337079,
      "grad_norm": 0.2532961666584015,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.8253,
      "step": 27
    },
    {
      "epoch": 0.6292134831460674,
      "grad_norm": 0.23403476178646088,
      "learning_rate": 9.7259191408041e-06,
      "loss": 0.8501,
      "step": 28
    },
    {
      "epoch": 0.651685393258427,
      "grad_norm": 0.2743186056613922,
      "learning_rate": 9.694941803075285e-06,
      "loss": 1.2514,
      "step": 29
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 0.22719059884548187,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.7012,
      "step": 30
    },
    {
      "epoch": 0.6966292134831461,
      "grad_norm": 0.22790080308914185,
      "learning_rate": 9.628188298907782e-06,
      "loss": 0.7681,
      "step": 31
    },
    {
      "epoch": 0.7191011235955056,
      "grad_norm": 0.18746985495090485,
      "learning_rate": 9.592434928729617e-06,
      "loss": 0.8587,
      "step": 32
    },
    {
      "epoch": 0.7415730337078652,
      "grad_norm": 0.19890236854553223,
      "learning_rate": 9.555113246230443e-06,
      "loss": 0.8129,
      "step": 33
    },
    {
      "epoch": 0.7640449438202247,
      "grad_norm": 0.16481083631515503,
      "learning_rate": 9.516235996730645e-06,
      "loss": 0.658,
      "step": 34
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 0.19077646732330322,
      "learning_rate": 9.475816456775313e-06,
      "loss": 0.6721,
      "step": 35
    },
    {
      "epoch": 0.8089887640449438,
      "grad_norm": 0.20115327835083008,
      "learning_rate": 9.43386842960031e-06,
      "loss": 0.7361,
      "step": 36
    },
    {
      "epoch": 0.8314606741573034,
      "grad_norm": 0.16515249013900757,
      "learning_rate": 9.39040624041849e-06,
      "loss": 0.6858,
      "step": 37
    },
    {
      "epoch": 0.8539325842696629,
      "grad_norm": 0.16454002261161804,
      "learning_rate": 9.345444731527642e-06,
      "loss": 0.902,
      "step": 38
    },
    {
      "epoch": 0.8764044943820225,
      "grad_norm": 0.12956053018569946,
      "learning_rate": 9.298999257241862e-06,
      "loss": 0.7653,
      "step": 39
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.1354527771472931,
      "learning_rate": 9.251085678648072e-06,
      "loss": 0.6624,
      "step": 40
    },
    {
      "epoch": 0.9213483146067416,
      "grad_norm": 0.14311733841896057,
      "learning_rate": 9.201720358189464e-06,
      "loss": 0.6757,
      "step": 41
    },
    {
      "epoch": 0.9438202247191011,
      "grad_norm": 0.14996573328971863,
      "learning_rate": 9.150920154077753e-06,
      "loss": 0.639,
      "step": 42
    },
    {
      "epoch": 0.9662921348314607,
      "grad_norm": 0.1524282693862915,
      "learning_rate": 9.098702414536107e-06,
      "loss": 0.8575,
      "step": 43
    },
    {
      "epoch": 0.9887640449438202,
      "grad_norm": 0.1349412202835083,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.6459,
      "step": 44
    },
    {
      "epoch": 1.0112359550561798,
      "grad_norm": 0.12348250299692154,
      "learning_rate": 8.990086136401199e-06,
      "loss": 0.6682,
      "step": 45
    },
    {
      "epoch": 1.0167597765363128,
      "grad_norm": 0.12682083249092102,
      "learning_rate": 8.933724690167417e-06,
      "loss": 0.6996,
      "step": 46
    },
    {
      "epoch": 1.0391061452513966,
      "grad_norm": 0.14481687545776367,
      "learning_rate": 8.87601988055565e-06,
      "loss": 0.9028,
      "step": 47
    },
    {
      "epoch": 1.0614525139664805,
      "grad_norm": 0.20267356932163239,
      "learning_rate": 8.816991413705515e-06,
      "loss": 0.7655,
      "step": 48
    },
    {
      "epoch": 1.0837988826815643,
      "grad_norm": 0.16279681026935577,
      "learning_rate": 8.756659447784367e-06,
      "loss": 0.6583,
      "step": 49
    },
    {
      "epoch": 1.106145251396648,
      "grad_norm": 0.1354082226753235,
      "learning_rate": 8.695044586103297e-06,
      "loss": 0.8319,
      "step": 50
    },
    {
      "epoch": 1.1284916201117319,
      "grad_norm": 0.14600947499275208,
      "learning_rate": 8.632167870081122e-06,
      "loss": 0.6148,
      "step": 51
    },
    {
      "epoch": 1.1508379888268156,
      "grad_norm": 0.2995755672454834,
      "learning_rate": 8.568050772058763e-06,
      "loss": 0.8722,
      "step": 52
    },
    {
      "epoch": 1.1731843575418994,
      "grad_norm": 0.1370067149400711,
      "learning_rate": 8.502715187966455e-06,
      "loss": 0.8219,
      "step": 53
    },
    {
      "epoch": 1.1955307262569832,
      "grad_norm": 0.14704233407974243,
      "learning_rate": 8.436183429846314e-06,
      "loss": 0.8698,
      "step": 54
    },
    {
      "epoch": 1.217877094972067,
      "grad_norm": 0.13113325834274292,
      "learning_rate": 8.368478218232787e-06,
      "loss": 0.6741,
      "step": 55
    },
    {
      "epoch": 1.2402234636871508,
      "grad_norm": 0.15853165090084076,
      "learning_rate": 8.299622674393615e-06,
      "loss": 0.6672,
      "step": 56
    },
    {
      "epoch": 1.2625698324022347,
      "grad_norm": 0.12545864284038544,
      "learning_rate": 8.229640312433938e-06,
      "loss": 0.6325,
      "step": 57
    },
    {
      "epoch": 1.2849162011173183,
      "grad_norm": 0.13042129576206207,
      "learning_rate": 8.158555031266255e-06,
      "loss": 0.8421,
      "step": 58
    },
    {
      "epoch": 1.3072625698324023,
      "grad_norm": 0.12107989937067032,
      "learning_rate": 8.086391106448965e-06,
      "loss": 0.849,
      "step": 59
    },
    {
      "epoch": 1.329608938547486,
      "grad_norm": 0.12185141444206238,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.6157,
      "step": 60
    },
    {
      "epoch": 1.3519553072625698,
      "grad_norm": 0.14786343276500702,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.8534,
      "step": 61
    },
    {
      "epoch": 1.3743016759776536,
      "grad_norm": 0.1170087456703186,
      "learning_rate": 7.863675700402527e-06,
      "loss": 0.6024,
      "step": 62
    },
    {
      "epoch": 1.3966480446927374,
      "grad_norm": 0.12368728965520859,
      "learning_rate": 7.787447196714428e-06,
      "loss": 0.621,
      "step": 63
    },
    {
      "epoch": 1.4189944134078212,
      "grad_norm": 0.13665403425693512,
      "learning_rate": 7.710266782362248e-06,
      "loss": 0.8647,
      "step": 64
    },
    {
      "epoch": 1.441340782122905,
      "grad_norm": 0.14922167360782623,
      "learning_rate": 7.63216081438678e-06,
      "loss": 0.6963,
      "step": 65
    },
    {
      "epoch": 1.463687150837989,
      "grad_norm": 0.20886604487895966,
      "learning_rate": 7.553155965904535e-06,
      "loss": 0.9604,
      "step": 66
    },
    {
      "epoch": 1.4860335195530725,
      "grad_norm": 0.12474123388528824,
      "learning_rate": 7.473279216998896e-06,
      "loss": 0.7405,
      "step": 67
    },
    {
      "epoch": 1.5083798882681565,
      "grad_norm": 0.12420104444026947,
      "learning_rate": 7.392557845506433e-06,
      "loss": 0.6579,
      "step": 68
    },
    {
      "epoch": 1.5307262569832403,
      "grad_norm": 0.13506121933460236,
      "learning_rate": 7.311019417701567e-06,
      "loss": 0.8146,
      "step": 69
    },
    {
      "epoch": 1.553072625698324,
      "grad_norm": 0.13394294679164886,
      "learning_rate": 7.2286917788826926e-06,
      "loss": 0.6438,
      "step": 70
    },
    {
      "epoch": 1.5754189944134078,
      "grad_norm": 0.12186373025178909,
      "learning_rate": 7.145603043863045e-06,
      "loss": 0.6964,
      "step": 71
    },
    {
      "epoch": 1.5977653631284916,
      "grad_norm": 0.12778373062610626,
      "learning_rate": 7.061781587369518e-06,
      "loss": 0.6403,
      "step": 72
    },
    {
      "epoch": 1.6201117318435754,
      "grad_norm": 0.12650710344314575,
      "learning_rate": 6.977256034352713e-06,
      "loss": 0.7991,
      "step": 73
    },
    {
      "epoch": 1.6424581005586592,
      "grad_norm": 0.12929368019104004,
      "learning_rate": 6.892055250211552e-06,
      "loss": 0.731,
      "step": 74
    },
    {
      "epoch": 1.6648044692737431,
      "grad_norm": 0.11519794166088104,
      "learning_rate": 6.806208330935766e-06,
      "loss": 0.787,
      "step": 75
    },
    {
      "epoch": 1.6871508379888267,
      "grad_norm": 0.11956265568733215,
      "learning_rate": 6.719744593169642e-06,
      "loss": 0.7078,
      "step": 76
    },
    {
      "epoch": 1.7094972067039107,
      "grad_norm": 0.11849120259284973,
      "learning_rate": 6.6326935642004165e-06,
      "loss": 0.6098,
      "step": 77
    },
    {
      "epoch": 1.7318435754189943,
      "grad_norm": 0.1370634287595749,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.775,
      "step": 78
    },
    {
      "epoch": 1.7541899441340782,
      "grad_norm": 0.1339302361011505,
      "learning_rate": 6.456948734446624e-06,
      "loss": 0.7778,
      "step": 79
    },
    {
      "epoch": 1.776536312849162,
      "grad_norm": 0.1872129589319229,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.7051,
      "step": 80
    },
    {
      "epoch": 1.7988826815642458,
      "grad_norm": 0.13236720860004425,
      "learning_rate": 6.279213887972179e-06,
      "loss": 0.6648,
      "step": 81
    },
    {
      "epoch": 1.8212290502793296,
      "grad_norm": 0.14064921438694,
      "learning_rate": 6.189675975213094e-06,
      "loss": 0.9904,
      "step": 82
    },
    {
      "epoch": 1.8435754189944134,
      "grad_norm": 0.14369916915893555,
      "learning_rate": 6.099731789198344e-06,
      "loss": 0.7428,
      "step": 83
    },
    {
      "epoch": 1.8659217877094973,
      "grad_norm": 0.12341982126235962,
      "learning_rate": 6.009412045785051e-06,
      "loss": 0.679,
      "step": 84
    },
    {
      "epoch": 1.888268156424581,
      "grad_norm": 0.12746700644493103,
      "learning_rate": 5.918747589082853e-06,
      "loss": 0.8121,
      "step": 85
    },
    {
      "epoch": 1.910614525139665,
      "grad_norm": 0.1257738620042801,
      "learning_rate": 5.82776938092065e-06,
      "loss": 0.75,
      "step": 86
    },
    {
      "epoch": 1.9329608938547485,
      "grad_norm": 0.1163194552063942,
      "learning_rate": 5.736508490273189e-06,
      "loss": 0.6534,
      "step": 87
    },
    {
      "epoch": 1.9553072625698324,
      "grad_norm": 0.14038220047950745,
      "learning_rate": 5.644996082651018e-06,
      "loss": 0.6898,
      "step": 88
    },
    {
      "epoch": 1.9776536312849162,
      "grad_norm": 0.11753737926483154,
      "learning_rate": 5.553263409457504e-06,
      "loss": 0.5927,
      "step": 89
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.1380361169576645,
      "learning_rate": 5.46134179731651e-06,
      "loss": 0.5046,
      "step": 90
    }
  ],
  "logging_steps": 1,
  "max_steps": 180,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 45,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.822342948814848e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}